summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder/__init__.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder/__init__.py')
-rw-r--r--bitbake/lib/bs4/builder/__init__.py382
1 files changed, 347 insertions, 35 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
index 6ccd4d23d6..ffb31fc25e 100644
--- a/bitbake/lib/bs4/builder/__init__.py
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -1,11 +1,21 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1from collections import defaultdict 4from collections import defaultdict
2import itertools 5import itertools
6import re
7import warnings
3import sys 8import sys
4from bs4.element import ( 9from bs4.element import (
5 CharsetMetaAttributeValue, 10 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue, 11 ContentMetaAttributeValue,
7 whitespace_re 12 RubyParenthesisString,
8 ) 13 RubyTextString,
14 Stylesheet,
15 Script,
16 TemplateString,
17 nonwhitespace_re
18)
9 19
10__all__ = [ 20__all__ = [
11 'HTMLTreeBuilder', 21 'HTMLTreeBuilder',
@@ -22,20 +32,41 @@ XML = 'xml'
22HTML = 'html' 32HTML = 'html'
23HTML_5 = 'html5' 33HTML_5 = 'html5'
24 34
class XMLParsedAsHTMLWarning(UserWarning):
    """Warning category used when an HTML parser is asked to parse
    XML that is not XHTML.
    """

    # Complete user-facing text of the warning. The pieces below are
    # adjacent string literals that concatenate into a single line.
    MESSAGE = (
        "It looks like you're parsing an XML document using an HTML parser. "
        "If this really is an HTML document (maybe it's XHTML?), you can "
        "ignore or filter this warning. "
        "If it's XML, you should know that using an XML parser will be more "
        "reliable. "
        "To parse this document as XML, make sure you have the lxml package "
        'installed, and pass the keyword argument `features="xml"` into the '
        "BeautifulSoup constructor."
    )
40
25 41
26class TreeBuilderRegistry(object): 42class TreeBuilderRegistry(object):
27 43 """A way of looking up TreeBuilder subclasses by their name or by desired
44 features.
45 """
46
28 def __init__(self): 47 def __init__(self):
29 self.builders_for_feature = defaultdict(list) 48 self.builders_for_feature = defaultdict(list)
30 self.builders = [] 49 self.builders = []
31 50
32 def register(self, treebuilder_class): 51 def register(self, treebuilder_class):
33 """Register a treebuilder based on its advertised features.""" 52 """Register a treebuilder based on its advertised features.
53
54 :param treebuilder_class: A subclass of TreeBuilder. Its .features
55 attribute should list its features.
56 """
34 for feature in treebuilder_class.features: 57 for feature in treebuilder_class.features:
35 self.builders_for_feature[feature].insert(0, treebuilder_class) 58 self.builders_for_feature[feature].insert(0, treebuilder_class)
36 self.builders.insert(0, treebuilder_class) 59 self.builders.insert(0, treebuilder_class)
37 60
38 def lookup(self, *features): 61 def lookup(self, *features):
62 """Look up a TreeBuilder subclass with the desired features.
63
64 :param features: A list of features to look for. If none are
65 provided, the most recently registered TreeBuilder subclass
66 will be used.
67 :return: A TreeBuilder subclass, or None if there's no
68 registered subclass with all the requested features.
69 """
39 if len(self.builders) == 0: 70 if len(self.builders) == 0:
40 # There are no builders at all. 71 # There are no builders at all.
41 return None 72 return None
@@ -78,7 +109,7 @@ class TreeBuilderRegistry(object):
78builder_registry = TreeBuilderRegistry() 109builder_registry = TreeBuilderRegistry()
79 110
80class TreeBuilder(object): 111class TreeBuilder(object):
81 """Turn a document into a Beautiful Soup object tree.""" 112 """Turn a textual document into a Beautiful Soup object tree."""
82 113
83 NAME = "[Unknown tree builder]" 114 NAME = "[Unknown tree builder]"
84 ALTERNATE_NAMES = [] 115 ALTERNATE_NAMES = []
@@ -86,19 +117,89 @@ class TreeBuilder(object):
86 117
87 is_xml = False 118 is_xml = False
88 picklable = False 119 picklable = False
89 preserve_whitespace_tags = set()
90 empty_element_tags = None # A tag will be considered an empty-element 120 empty_element_tags = None # A tag will be considered an empty-element
91 # tag when and only when it has no contents. 121 # tag when and only when it has no contents.
92 122
93 # A value for these tag/attribute combinations is a space- or 123 # A value for these tag/attribute combinations is a space- or
94 # comma-separated list of CDATA, rather than a single CDATA. 124 # comma-separated list of CDATA, rather than a single CDATA.
95 cdata_list_attributes = {} 125 DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
96 126
97 127 # Whitespace should be preserved inside these tags.
98 def __init__(self): 128 DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
129
130 # The textual contents of tags with these names should be
131 # instantiated with some class other than NavigableString.
132 DEFAULT_STRING_CONTAINERS = {}
133
134 USE_DEFAULT = object()
135
136 # Most parsers don't keep track of line numbers.
137 TRACKS_LINE_NUMBERS = False
138
139 def __init__(self, multi_valued_attributes=USE_DEFAULT,
140 preserve_whitespace_tags=USE_DEFAULT,
141 store_line_numbers=USE_DEFAULT,
142 string_containers=USE_DEFAULT,
143 ):
144 """Constructor.
145
146 :param multi_valued_attributes: If this is set to None, the
147 TreeBuilder will not turn any values for attributes like
148 'class' into lists. Setting this to a dictionary will
149 customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
150 for an example.
151
152 Internally, these are called "CDATA list attributes", but that
153 probably doesn't make sense to an end-user, so the argument name
154 is `multi_valued_attributes`.
155
156 :param preserve_whitespace_tags: A list of tags to treat
157 the way <pre> tags are treated in HTML. Tags in this list
158 are immune from pretty-printing; their contents will always be
159 output as-is.
160
161 :param string_containers: A dictionary mapping tag names to
162 the classes that should be instantiated to contain the textual
163 contents of those tags. The default is to use NavigableString
164 for every tag, no matter what the name. You can override the
165 default by changing DEFAULT_STRING_CONTAINERS.
166
167 :param store_line_numbers: If the parser keeps track of the
168 line numbers and positions of the original markup, that
169 information will, by default, be stored in each corresponding
170 `Tag` object. You can turn this off by passing
171 store_line_numbers=False. If the parser you're using doesn't
172 keep track of this information, then setting store_line_numbers=True
173 will do nothing.
174 """
99 self.soup = None 175 self.soup = None
100 176 if multi_valued_attributes is self.USE_DEFAULT:
177 multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
178 self.cdata_list_attributes = multi_valued_attributes
179 if preserve_whitespace_tags is self.USE_DEFAULT:
180 preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
181 self.preserve_whitespace_tags = preserve_whitespace_tags
182 if store_line_numbers == self.USE_DEFAULT:
183 store_line_numbers = self.TRACKS_LINE_NUMBERS
184 self.store_line_numbers = store_line_numbers
185 if string_containers == self.USE_DEFAULT:
186 string_containers = self.DEFAULT_STRING_CONTAINERS
187 self.string_containers = string_containers
188
189 def initialize_soup(self, soup):
190 """The BeautifulSoup object has been initialized and is now
191 being associated with the TreeBuilder.
192
193 :param soup: A BeautifulSoup object.
194 """
195 self.soup = soup
196
101 def reset(self): 197 def reset(self):
198 """Do any work necessary to reset the underlying parser
199 for a new document.
200
201 By default, this does nothing.
202 """
102 pass 203 pass
103 204
104 def can_be_empty_element(self, tag_name): 205 def can_be_empty_element(self, tag_name):
@@ -110,24 +211,58 @@ class TreeBuilder(object):
110 For instance: an HTMLBuilder does not consider a <p> tag to be 211 For instance: an HTMLBuilder does not consider a <p> tag to be
111 an empty-element tag (it's not in 212 an empty-element tag (it's not in
112 HTMLBuilder.empty_element_tags). This means an empty <p> tag 213 HTMLBuilder.empty_element_tags). This means an empty <p> tag
113 will be presented as "<p></p>", not "<p />". 214 will be presented as "<p></p>", not "<p/>" or "<p>".
114 215
115 The default implementation has no opinion about which tags are 216 The default implementation has no opinion about which tags are
116 empty-element tags, so a tag will be presented as an 217 empty-element tags, so a tag will be presented as an
117 empty-element tag if and only if it has no contents. 218 empty-element tag if and only if it has no children.
118 "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will 219 "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
119 be left alone. 220 be left alone.
221
222 :param tag_name: The name of a markup tag.
120 """ 223 """
121 if self.empty_element_tags is None: 224 if self.empty_element_tags is None:
122 return True 225 return True
123 return tag_name in self.empty_element_tags 226 return tag_name in self.empty_element_tags
124 227
125 def feed(self, markup): 228 def feed(self, markup):
229 """Run some incoming markup through some parsing process,
230 populating the `BeautifulSoup` object in self.soup.
231
232 This method is not implemented in TreeBuilder; it must be
233 implemented in subclasses.
234
235 :return: None.
236 """
126 raise NotImplementedError() 237 raise NotImplementedError()
127 238
def prepare_markup(self, markup, user_specified_encoding=None,
                   document_declared_encoding=None, exclude_encodings=None):
    """Generate the strategies to try when handing markup to the parser.

    :param markup: Some markup -- probably a bytestring.
    :param user_specified_encoding: The user asked to try this encoding.
    :param document_declared_encoding: The markup itself claims to be
        in this encoding. NOTE: This argument is not used by the
        calling code and can probably be removed.
    :param exclude_encodings: The user asked _not_ to try any of
        these encodings.

    :yield: A series of 4-tuples:
        (markup, encoding, declared encoding,
         has undergone character replacement)

        Each 4-tuple represents a strategy for converting the
        document to Unicode and parsing it. Each strategy will be tried
        in turn.

    This base implementation ignores every encoding argument and
    produces exactly one strategy: the markup as it was given, with
    no encoding information and no character replacement. See
    `LXMLTreeBuilderForXML` and `HTMLParserTreeBuilder` for
    implementations that take into account the quirks of particular
    parsers.
    """
    as_is_strategy = (markup, None, None, False)
    yield as_is_strategy
131 266
132 def test_fragment_to_document(self, fragment): 267 def test_fragment_to_document(self, fragment):
133 """Wrap an HTML fragment to make it look like a document. 268 """Wrap an HTML fragment to make it look like a document.
@@ -139,16 +274,36 @@ class TreeBuilder(object):
139 results against other HTML fragments. 274 results against other HTML fragments.
140 275
141 This method should not be used outside of tests. 276 This method should not be used outside of tests.
277
278 :param fragment: A string -- fragment of HTML.
279 :return: A string -- a full HTML document.
142 """ 280 """
143 return fragment 281 return fragment
144 282
145 def set_up_substitutions(self, tag): 283 def set_up_substitutions(self, tag):
284 """Set up any substitutions that will need to be performed on
285 a `Tag` when it's output as a string.
286
287 By default, this does nothing. See `HTMLTreeBuilder` for a
288 case where this is used.
289
290 :param tag: A `Tag`
291 :return: Whether or not a substitution was performed.
292 """
146 return False 293 return False
147 294
148 def _replace_cdata_list_attribute_values(self, tag_name, attrs): 295 def _replace_cdata_list_attribute_values(self, tag_name, attrs):
149 """Replaces class="foo bar" with class=["foo", "bar"] 296 """When an attribute value is associated with a tag that can
297 have multiple values for that attribute, convert the string
298 value to a list of strings.
150 299
151 Modifies its input in place. 300 Basically, replaces class="foo bar" with class=["foo", "bar"]
301
302 NOTE: This method modifies its input in place.
303
304 :param tag_name: The name of a tag.
305 :param attrs: A dictionary containing the tag's attributes.
306 Any appropriate attribute values will be modified in place.
152 """ 307 """
153 if not attrs: 308 if not attrs:
154 return attrs 309 return attrs
@@ -163,7 +318,7 @@ class TreeBuilder(object):
163 # values. Split it into a list. 318 # values. Split it into a list.
164 value = attrs[attr] 319 value = attrs[attr]
165 if isinstance(value, str): 320 if isinstance(value, str):
166 values = whitespace_re.split(value) 321 values = nonwhitespace_re.findall(value)
167 else: 322 else:
168 # html5lib sometimes calls setAttributes twice 323 # html5lib sometimes calls setAttributes twice
169 # for the same tag when rearranging the parse 324 # for the same tag when rearranging the parse
@@ -174,9 +329,13 @@ class TreeBuilder(object):
174 values = value 329 values = value
175 attrs[attr] = values 330 attrs[attr] = values
176 return attrs 331 return attrs
177 332
178class SAXTreeBuilder(TreeBuilder): 333class SAXTreeBuilder(TreeBuilder):
179 """A Beautiful Soup treebuilder that listens for SAX events.""" 334 """A Beautiful Soup treebuilder that listens for SAX events.
335
336 This is not currently used for anything, but it demonstrates
337 how a simple TreeBuilder would work.
338 """
180 339
181 def feed(self, markup): 340 def feed(self, markup):
182 raise NotImplementedError() 341 raise NotImplementedError()
@@ -186,11 +345,11 @@ class SAXTreeBuilder(TreeBuilder):
186 345
187 def startElement(self, name, attrs): 346 def startElement(self, name, attrs):
188 attrs = dict((key[1], value) for key, value in list(attrs.items())) 347 attrs = dict((key[1], value) for key, value in list(attrs.items()))
189 #print "Start %s, %r" % (name, attrs) 348 #print("Start %s, %r" % (name, attrs))
190 self.soup.handle_starttag(name, attrs) 349 self.soup.handle_starttag(name, attrs)
191 350
192 def endElement(self, name): 351 def endElement(self, name):
193 #print "End %s" % name 352 #print("End %s" % name)
194 self.soup.handle_endtag(name) 353 self.soup.handle_endtag(name)
195 354
196 def startElementNS(self, nsTuple, nodeName, attrs): 355 def startElementNS(self, nsTuple, nodeName, attrs):
@@ -227,10 +386,44 @@ class HTMLTreeBuilder(TreeBuilder):
227 Such as which tags are empty-element tags. 386 Such as which tags are empty-element tags.
228 """ 387 """
229 388
230 preserve_whitespace_tags = set(['pre', 'textarea']) 389 empty_element_tags = set([
231 empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 390 # These are from HTML5.
232 'spacer', 'link', 'frame', 'base']) 391 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
233 392
393 # These are from earlier versions of HTML and are removed in HTML5.
394 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
395 ])
396
397 # The HTML standard defines these as block-level elements. Beautiful
398 # Soup does not treat these elements differently from other elements,
399 # but it may do so eventually, and this information is available if
400 # you need to use it.
401 block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
402
403 # These HTML tags need special treatment so they can be
404 # represented by a string class other than NavigableString.
405 #
406 # For some of these tags, it's because the HTML standard defines
407 # an unusual content model for them. I made this list by going
408 # through the HTML spec
409 # (https://html.spec.whatwg.org/#metadata-content) and looking for
410 # "metadata content" elements that can contain strings.
411 #
412 # The Ruby tags (<rt> and <rp>) are here despite being normal
413 # "phrasing content" tags, because the content they contain is
414 # qualitatively different from other text in the document, and it
415 # can be useful to be able to distinguish it.
416 #
417 # TODO: Arguably <noscript> could go here but it seems
418 # qualitatively different from the other tags.
419 DEFAULT_STRING_CONTAINERS = {
420 'rt' : RubyTextString,
421 'rp' : RubyParenthesisString,
422 'style': Stylesheet,
423 'script': Script,
424 'template': TemplateString,
425 }
426
234 # The HTML standard defines these attributes as containing a 427 # The HTML standard defines these attributes as containing a
235 # space-separated list of values, not a single value. That is, 428 # space-separated list of values, not a single value. That is,
236 # class="foo bar" means that the 'class' attribute has two values, 429 # class="foo bar" means that the 'class' attribute has two values,
@@ -238,7 +431,7 @@ class HTMLTreeBuilder(TreeBuilder):
238 # encounter one of these attributes, we will parse its value into 431 # encounter one of these attributes, we will parse its value into
239 # a list of values if possible. Upon output, the list will be 432 # a list of values if possible. Upon output, the list will be
240 # converted back into a string. 433 # converted back into a string.
241 cdata_list_attributes = { 434 DEFAULT_CDATA_LIST_ATTRIBUTES = {
242 "*" : ['class', 'accesskey', 'dropzone'], 435 "*" : ['class', 'accesskey', 'dropzone'],
243 "a" : ['rel', 'rev'], 436 "a" : ['rel', 'rev'],
244 "link" : ['rel', 'rev'], 437 "link" : ['rel', 'rev'],
@@ -255,7 +448,19 @@ class HTMLTreeBuilder(TreeBuilder):
255 "output" : ["for"], 448 "output" : ["for"],
256 } 449 }
257 450
451 DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
452
258 def set_up_substitutions(self, tag): 453 def set_up_substitutions(self, tag):
454 """Replace the declared encoding in a <meta> tag with a placeholder,
455 to be substituted when the tag is output to a string.
456
457 An HTML document may come in to Beautiful Soup as one
458 encoding, but exit in a different encoding, and the <meta> tag
459 needs to be changed to reflect this.
460
461 :param tag: A `Tag`
462 :return: Whether or not a substitution was performed.
463 """
259 # We are only interested in <meta> tags 464 # We are only interested in <meta> tags
260 if tag.name != 'meta': 465 if tag.name != 'meta':
261 return False 466 return False
@@ -288,10 +493,107 @@ class HTMLTreeBuilder(TreeBuilder):
288 493
289 return (meta_encoding is not None) 494 return (meta_encoding is not None)
290 495
class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    TreeBuilder, there's a less reliable implementation based on
    examining the raw markup.
    """

    # Regular expressions for seeing if markup has an <html> tag.
    #
    # The single character allowed between '<' and 'html' is optional.
    # When it was mandatory, a plain "<html ...>" tag could never match,
    # so XHTML documents that begin with an XML declaration were
    # reported as non-XHTML XML.
    LOOKS_LIKE_HTML = re.compile(r"<[^ +]?html", re.I)
    LOOKS_LIKE_HTML_B = re.compile(rb"<[^ +]?html", re.I)

    # What the start of an XML declaration looks like, as str and bytes.
    XML_PREFIX = '<?xml'
    XML_PREFIX_B = b'<?xml'

    @classmethod
    def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param markup: The raw markup, as a string or bytestring. None
           is accepted and never triggers a warning.

        :param stacklevel: The stacklevel of the code calling this
           function.

        :return: True if the markup looks like non-XHTML XML, False
           otherwise.
        """
        # Pick the prefix and pattern whose type matches the markup, so
        # that str and bytes are never mixed in a single operation.
        if isinstance(markup, bytes):
            prefix = cls.XML_PREFIX_B
            looks_like_html = cls.LOOKS_LIKE_HTML_B
        else:
            prefix = cls.XML_PREFIX
            looks_like_html = cls.LOOKS_LIKE_HTML

        # Only the first 500 characters are searched for an <html> tag.
        if (markup is not None
            and markup.startswith(prefix)
            and not looks_like_html.search(markup[:500])
        ):
            cls._warn(stacklevel=stacklevel+2)
            return True
        return False

    @classmethod
    def _warn(cls, stacklevel=5):
        """Issue a warning about XML being parsed as HTML.

        :param stacklevel: Passed through to `warnings.warn`.
        """
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
            stacklevel=stacklevel
        )

    def _initialize_xml_detector(self):
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag = None

    def _document_might_be_xml(self, processing_instruction):
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        :param processing_instruction: The text of the processing
           instruction. Only the first one seen before the root tag
           is remembered.
        """
        if (self._first_processing_instruction is not None
            or self._root_tag is not None):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name):
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.

        :param name: The name of the root tag.
        """
        if self._root_tag is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag = name
        if (name != 'html' and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith('xml ')):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that an XML
            # document that isn't XHTML is being parsed as HTML.
            self._warn()
592
593
291def register_treebuilders_from(module): 594def register_treebuilders_from(module):
292 """Copy TreeBuilders from the given module into this module.""" 595 """Copy TreeBuilders from the given module into this module."""
293 # I'm fairly sure this is not the best way to do this. 596 this_module = sys.modules[__name__]
294 this_module = sys.modules['bs4.builder']
295 for name in module.__all__: 597 for name in module.__all__:
296 obj = getattr(module, name) 598 obj = getattr(module, name)
297 599
@@ -302,12 +604,22 @@ def register_treebuilders_from(module):
302 this_module.builder_registry.register(obj) 604 this_module.builder_registry.register(obj)
303 605
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser simply refuses to parse the
    markup it was given.
    """

    def __init__(self, message_or_exception):
        """Record why the parser rejected the markup.

        :param message_or_exception: Either a textual explanation, or
           another exception. An exception is rendered as
           "<ExceptionClassName>: <str(exception)>".
        """
        if isinstance(message_or_exception, Exception):
            cause = message_or_exception
            explanation = "%s: %s" % (cause.__class__.__name__, str(cause))
        else:
            explanation = message_or_exception
        super().__init__(explanation)
618
307# Builders are registered in reverse order of priority, so that custom 619# Builders are registered in reverse order of priority, so that custom
308# builder registrations will take precedence. In general, we want lxml 620# builder registrations will take precedence. In general, we want lxml
309# to take precedence over html5lib, because it's faster. And we only 621# to take precedence over html5lib, because it's faster. And we only
310# want to use HTMLParser as a last result. 622# want to use HTMLParser as a last resort.
311from . import _htmlparser 623from . import _htmlparser
312register_treebuilders_from(_htmlparser) 624register_treebuilders_from(_htmlparser)
313try: 625try: