summary refs log tree commit diff stats
path: root/bitbake/lib/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder')
-rw-r--r-- bitbake/lib/bs4/builder/__init__.py 382
-rw-r--r-- bitbake/lib/bs4/builder/_html5lib.py 251
-rw-r--r-- bitbake/lib/bs4/builder/_htmlparser.py 433
-rw-r--r-- bitbake/lib/bs4/builder/_lxml.py 212
4 files changed, 999 insertions, 279 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
index 6ccd4d23d6..ffb31fc25e 100644
--- a/bitbake/lib/bs4/builder/__init__.py
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -1,11 +1,21 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1from collections import defaultdict 4from collections import defaultdict
2import itertools 5import itertools
6import re
7import warnings
3import sys 8import sys
4from bs4.element import ( 9from bs4.element import (
5 CharsetMetaAttributeValue, 10 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue, 11 ContentMetaAttributeValue,
7 whitespace_re 12 RubyParenthesisString,
8 ) 13 RubyTextString,
14 Stylesheet,
15 Script,
16 TemplateString,
17 nonwhitespace_re
18)
9 19
10__all__ = [ 20__all__ = [
11 'HTMLTreeBuilder', 21 'HTMLTreeBuilder',
@@ -22,20 +32,41 @@ XML = 'xml'
22HTML = 'html' 32HTML = 'html'
23HTML_5 = 'html5' 33HTML_5 = 'html5'
24 34
35class XMLParsedAsHTMLWarning(UserWarning):
36 """The warning issued when an HTML parser is used to parse
37 XML that is not XHTML.
38 """
39 MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
40
25 41
26class TreeBuilderRegistry(object): 42class TreeBuilderRegistry(object):
27 43 """A way of looking up TreeBuilder subclasses by their name or by desired
44 features.
45 """
46
28 def __init__(self): 47 def __init__(self):
29 self.builders_for_feature = defaultdict(list) 48 self.builders_for_feature = defaultdict(list)
30 self.builders = [] 49 self.builders = []
31 50
32 def register(self, treebuilder_class): 51 def register(self, treebuilder_class):
33 """Register a treebuilder based on its advertised features.""" 52 """Register a treebuilder based on its advertised features.
53
54 :param treebuilder_class: A subclass of TreeBuilder. Its .features
55 attribute should list its features.
56 """
34 for feature in treebuilder_class.features: 57 for feature in treebuilder_class.features:
35 self.builders_for_feature[feature].insert(0, treebuilder_class) 58 self.builders_for_feature[feature].insert(0, treebuilder_class)
36 self.builders.insert(0, treebuilder_class) 59 self.builders.insert(0, treebuilder_class)
37 60
38 def lookup(self, *features): 61 def lookup(self, *features):
62 """Look up a TreeBuilder subclass with the desired features.
63
64 :param features: A list of features to look for. If none are
65 provided, the most recently registered TreeBuilder subclass
66 will be used.
67 :return: A TreeBuilder subclass, or None if there's no
68 registered subclass with all the requested features.
69 """
39 if len(self.builders) == 0: 70 if len(self.builders) == 0:
40 # There are no builders at all. 71 # There are no builders at all.
41 return None 72 return None
@@ -78,7 +109,7 @@ class TreeBuilderRegistry(object):
78builder_registry = TreeBuilderRegistry() 109builder_registry = TreeBuilderRegistry()
79 110
80class TreeBuilder(object): 111class TreeBuilder(object):
81 """Turn a document into a Beautiful Soup object tree.""" 112 """Turn a textual document into a Beautiful Soup object tree."""
82 113
83 NAME = "[Unknown tree builder]" 114 NAME = "[Unknown tree builder]"
84 ALTERNATE_NAMES = [] 115 ALTERNATE_NAMES = []
@@ -86,19 +117,89 @@ class TreeBuilder(object):
86 117
87 is_xml = False 118 is_xml = False
88 picklable = False 119 picklable = False
89 preserve_whitespace_tags = set()
90 empty_element_tags = None # A tag will be considered an empty-element 120 empty_element_tags = None # A tag will be considered an empty-element
91 # tag when and only when it has no contents. 121 # tag when and only when it has no contents.
92 122
93 # A value for these tag/attribute combinations is a space- or 123 # A value for these tag/attribute combinations is a space- or
94 # comma-separated list of CDATA, rather than a single CDATA. 124 # comma-separated list of CDATA, rather than a single CDATA.
95 cdata_list_attributes = {} 125 DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
96 126
97 127 # Whitespace should be preserved inside these tags.
98 def __init__(self): 128 DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
129
130 # The textual contents of tags with these names should be
131 # instantiated with some class other than NavigableString.
132 DEFAULT_STRING_CONTAINERS = {}
133
134 USE_DEFAULT = object()
135
136 # Most parsers don't keep track of line numbers.
137 TRACKS_LINE_NUMBERS = False
138
139 def __init__(self, multi_valued_attributes=USE_DEFAULT,
140 preserve_whitespace_tags=USE_DEFAULT,
141 store_line_numbers=USE_DEFAULT,
142 string_containers=USE_DEFAULT,
143 ):
144 """Constructor.
145
146 :param multi_valued_attributes: If this is set to None, the
147 TreeBuilder will not turn any values for attributes like
148 'class' into lists. Setting this to a dictionary will
149 customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
150 for an example.
151
152 Internally, these are called "CDATA list attributes", but that
153 probably doesn't make sense to an end-user, so the argument name
154 is `multi_valued_attributes`.
155
156 :param preserve_whitespace_tags: A list of tags to treat
157 the way <pre> tags are treated in HTML. Tags in this list
158 are immune from pretty-printing; their contents will always be
159 output as-is.
160
161 :param string_containers: A dictionary mapping tag names to
162 the classes that should be instantiated to contain the textual
163 contents of those tags. The default is to use NavigableString
164 for every tag, no matter what the name. You can override the
165 default by changing DEFAULT_STRING_CONTAINERS.
166
167 :param store_line_numbers: If the parser keeps track of the
168 line numbers and positions of the original markup, that
169 information will, by default, be stored in each corresponding
170 `Tag` object. You can turn this off by passing
171 store_line_numbers=False. If the parser you're using doesn't
172 keep track of this information, then setting store_line_numbers=True
173 will do nothing.
174 """
99 self.soup = None 175 self.soup = None
100 176 if multi_valued_attributes is self.USE_DEFAULT:
177 multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
178 self.cdata_list_attributes = multi_valued_attributes
179 if preserve_whitespace_tags is self.USE_DEFAULT:
180 preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
181 self.preserve_whitespace_tags = preserve_whitespace_tags
182 if store_line_numbers == self.USE_DEFAULT:
183 store_line_numbers = self.TRACKS_LINE_NUMBERS
184 self.store_line_numbers = store_line_numbers
185 if string_containers == self.USE_DEFAULT:
186 string_containers = self.DEFAULT_STRING_CONTAINERS
187 self.string_containers = string_containers
188
189 def initialize_soup(self, soup):
190 """The BeautifulSoup object has been initialized and is now
191 being associated with the TreeBuilder.
192
193 :param soup: A BeautifulSoup object.
194 """
195 self.soup = soup
196
101 def reset(self): 197 def reset(self):
198 """Do any work necessary to reset the underlying parser
199 for a new document.
200
201 By default, this does nothing.
202 """
102 pass 203 pass
103 204
104 def can_be_empty_element(self, tag_name): 205 def can_be_empty_element(self, tag_name):
@@ -110,24 +211,58 @@ class TreeBuilder(object):
110 For instance: an HTMLBuilder does not consider a <p> tag to be 211 For instance: an HTMLBuilder does not consider a <p> tag to be
111 an empty-element tag (it's not in 212 an empty-element tag (it's not in
112 HTMLBuilder.empty_element_tags). This means an empty <p> tag 213 HTMLBuilder.empty_element_tags). This means an empty <p> tag
113 will be presented as "<p></p>", not "<p />". 214 will be presented as "<p></p>", not "<p/>" or "<p>".
114 215
115 The default implementation has no opinion about which tags are 216 The default implementation has no opinion about which tags are
116 empty-element tags, so a tag will be presented as an 217 empty-element tags, so a tag will be presented as an
117 empty-element tag if and only if it has no contents. 218 empty-element tag if and only if it has no children.
118 "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will 219 "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
119 be left alone. 220 be left alone.
221
222 :param tag_name: The name of a markup tag.
120 """ 223 """
121 if self.empty_element_tags is None: 224 if self.empty_element_tags is None:
122 return True 225 return True
123 return tag_name in self.empty_element_tags 226 return tag_name in self.empty_element_tags
124 227
125 def feed(self, markup): 228 def feed(self, markup):
229 """Run some incoming markup through some parsing process,
230 populating the `BeautifulSoup` object in self.soup.
231
232 This method is not implemented in TreeBuilder; it must be
233 implemented in subclasses.
234
235 :return: None.
236 """
126 raise NotImplementedError() 237 raise NotImplementedError()
127 238
128 def prepare_markup(self, markup, user_specified_encoding=None, 239 def prepare_markup(self, markup, user_specified_encoding=None,
129 document_declared_encoding=None): 240 document_declared_encoding=None, exclude_encodings=None):
130 return markup, None, None, False 241 """Run any preliminary steps necessary to make incoming markup
242 acceptable to the parser.
243
244 :param markup: Some markup -- probably a bytestring.
245 :param user_specified_encoding: The user asked to try this encoding.
246 :param document_declared_encoding: The markup itself claims to be
247 in this encoding. NOTE: This argument is not used by the
248 calling code and can probably be removed.
249 :param exclude_encodings: The user asked _not_ to try any of
250 these encodings.
251
252 :yield: A series of 4-tuples:
253 (markup, encoding, declared encoding,
254 has undergone character replacement)
255
256 Each 4-tuple represents a strategy for converting the
257 document to Unicode and parsing it. Each strategy will be tried
258 in turn.
259
260 By default, the only strategy is to parse the markup
261 as-is. See `LXMLTreeBuilderForXML` and
262 `HTMLParserTreeBuilder` for implementations that take into
263 account the quirks of particular parsers.
264 """
265 yield markup, None, None, False
131 266
132 def test_fragment_to_document(self, fragment): 267 def test_fragment_to_document(self, fragment):
133 """Wrap an HTML fragment to make it look like a document. 268 """Wrap an HTML fragment to make it look like a document.
@@ -139,16 +274,36 @@ class TreeBuilder(object):
139 results against other HTML fragments. 274 results against other HTML fragments.
140 275
141 This method should not be used outside of tests. 276 This method should not be used outside of tests.
277
278 :param fragment: A string -- fragment of HTML.
279 :return: A string -- a full HTML document.
142 """ 280 """
143 return fragment 281 return fragment
144 282
145 def set_up_substitutions(self, tag): 283 def set_up_substitutions(self, tag):
284 """Set up any substitutions that will need to be performed on
285 a `Tag` when it's output as a string.
286
287 By default, this does nothing. See `HTMLTreeBuilder` for a
288 case where this is used.
289
290 :param tag: A `Tag`
291 :return: Whether or not a substitution was performed.
292 """
146 return False 293 return False
147 294
148 def _replace_cdata_list_attribute_values(self, tag_name, attrs): 295 def _replace_cdata_list_attribute_values(self, tag_name, attrs):
149 """Replaces class="foo bar" with class=["foo", "bar"] 296 """When an attribute value is associated with a tag that can
297 have multiple values for that attribute, convert the string
298 value to a list of strings.
150 299
151 Modifies its input in place. 300 Basically, replaces class="foo bar" with class=["foo", "bar"]
301
302 NOTE: This method modifies its input in place.
303
304 :param tag_name: The name of a tag.
305 :param attrs: A dictionary containing the tag's attributes.
306 Any appropriate attribute values will be modified in place.
152 """ 307 """
153 if not attrs: 308 if not attrs:
154 return attrs 309 return attrs
@@ -163,7 +318,7 @@ class TreeBuilder(object):
163 # values. Split it into a list. 318 # values. Split it into a list.
164 value = attrs[attr] 319 value = attrs[attr]
165 if isinstance(value, str): 320 if isinstance(value, str):
166 values = whitespace_re.split(value) 321 values = nonwhitespace_re.findall(value)
167 else: 322 else:
168 # html5lib sometimes calls setAttributes twice 323 # html5lib sometimes calls setAttributes twice
169 # for the same tag when rearranging the parse 324 # for the same tag when rearranging the parse
@@ -174,9 +329,13 @@ class TreeBuilder(object):
174 values = value 329 values = value
175 attrs[attr] = values 330 attrs[attr] = values
176 return attrs 331 return attrs
177 332
178class SAXTreeBuilder(TreeBuilder): 333class SAXTreeBuilder(TreeBuilder):
179 """A Beautiful Soup treebuilder that listens for SAX events.""" 334 """A Beautiful Soup treebuilder that listens for SAX events.
335
336 This is not currently used for anything, but it demonstrates
337 how a simple TreeBuilder would work.
338 """
180 339
181 def feed(self, markup): 340 def feed(self, markup):
182 raise NotImplementedError() 341 raise NotImplementedError()
@@ -186,11 +345,11 @@ class SAXTreeBuilder(TreeBuilder):
186 345
187 def startElement(self, name, attrs): 346 def startElement(self, name, attrs):
188 attrs = dict((key[1], value) for key, value in list(attrs.items())) 347 attrs = dict((key[1], value) for key, value in list(attrs.items()))
189 #print "Start %s, %r" % (name, attrs) 348 #print("Start %s, %r" % (name, attrs))
190 self.soup.handle_starttag(name, attrs) 349 self.soup.handle_starttag(name, attrs)
191 350
192 def endElement(self, name): 351 def endElement(self, name):
193 #print "End %s" % name 352 #print("End %s" % name)
194 self.soup.handle_endtag(name) 353 self.soup.handle_endtag(name)
195 354
196 def startElementNS(self, nsTuple, nodeName, attrs): 355 def startElementNS(self, nsTuple, nodeName, attrs):
@@ -227,10 +386,44 @@ class HTMLTreeBuilder(TreeBuilder):
227 Such as which tags are empty-element tags. 386 Such as which tags are empty-element tags.
228 """ 387 """
229 388
230 preserve_whitespace_tags = set(['pre', 'textarea']) 389 empty_element_tags = set([
231 empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 390 # These are from HTML5.
232 'spacer', 'link', 'frame', 'base']) 391 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
233 392
393 # These are from earlier versions of HTML and are removed in HTML5.
394 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
395 ])
396
397 # The HTML standard defines these as block-level elements. Beautiful
398 # Soup does not treat these elements differently from other elements,
399 # but it may do so eventually, and this information is available if
400 # you need to use it.
401 block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
402
403 # These HTML tags need special treatment so they can be
404 # represented by a string class other than NavigableString.
405 #
406 # For some of these tags, it's because the HTML standard defines
407 # an unusual content model for them. I made this list by going
408 # through the HTML spec
409 # (https://html.spec.whatwg.org/#metadata-content) and looking for
410 # "metadata content" elements that can contain strings.
411 #
412 # The Ruby tags (<rt> and <rp>) are here despite being normal
413 # "phrasing content" tags, because the content they contain is
414 # qualitatively different from other text in the document, and it
415 # can be useful to be able to distinguish it.
416 #
417 # TODO: Arguably <noscript> could go here but it seems
418 # qualitatively different from the other tags.
419 DEFAULT_STRING_CONTAINERS = {
420 'rt' : RubyTextString,
421 'rp' : RubyParenthesisString,
422 'style': Stylesheet,
423 'script': Script,
424 'template': TemplateString,
425 }
426
234 # The HTML standard defines these attributes as containing a 427 # The HTML standard defines these attributes as containing a
235 # space-separated list of values, not a single value. That is, 428 # space-separated list of values, not a single value. That is,
236 # class="foo bar" means that the 'class' attribute has two values, 429 # class="foo bar" means that the 'class' attribute has two values,
@@ -238,7 +431,7 @@ class HTMLTreeBuilder(TreeBuilder):
238 # encounter one of these attributes, we will parse its value into 431 # encounter one of these attributes, we will parse its value into
239 # a list of values if possible. Upon output, the list will be 432 # a list of values if possible. Upon output, the list will be
240 # converted back into a string. 433 # converted back into a string.
241 cdata_list_attributes = { 434 DEFAULT_CDATA_LIST_ATTRIBUTES = {
242 "*" : ['class', 'accesskey', 'dropzone'], 435 "*" : ['class', 'accesskey', 'dropzone'],
243 "a" : ['rel', 'rev'], 436 "a" : ['rel', 'rev'],
244 "link" : ['rel', 'rev'], 437 "link" : ['rel', 'rev'],
@@ -255,7 +448,19 @@ class HTMLTreeBuilder(TreeBuilder):
255 "output" : ["for"], 448 "output" : ["for"],
256 } 449 }
257 450
451 DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
452
258 def set_up_substitutions(self, tag): 453 def set_up_substitutions(self, tag):
454 """Replace the declared encoding in a <meta> tag with a placeholder,
455 to be substituted when the tag is output to a string.
456
457 An HTML document may come in to Beautiful Soup as one
458 encoding, but exit in a different encoding, and the <meta> tag
459 needs to be changed to reflect this.
460
461 :param tag: A `Tag`
462 :return: Whether or not a substitution was performed.
463 """
259 # We are only interested in <meta> tags 464 # We are only interested in <meta> tags
260 if tag.name != 'meta': 465 if tag.name != 'meta':
261 return False 466 return False
@@ -288,10 +493,107 @@ class HTMLTreeBuilder(TreeBuilder):
288 493
289 return (meta_encoding is not None) 494 return (meta_encoding is not None)
290 495
496class DetectsXMLParsedAsHTML(object):
497 """A mixin class for any class (a TreeBuilder, or some class used by a
498 TreeBuilder) that's in a position to detect whether an XML
499 document is being incorrectly parsed as HTML, and issue an
500 appropriate warning.
501
502 This requires being able to observe an incoming processing
503 instruction that might be an XML declaration, and also able to
504 observe tags as they're opened. If you can't do that for a given
505 TreeBuilder, there's a less reliable implementation based on
506 examining the raw markup.
507 """
508
509 # Regular expression for seeing if markup has an <html> tag.
510 LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
511 LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
512
513 XML_PREFIX = '<?xml'
514 XML_PREFIX_B = b'<?xml'
515
516 @classmethod
517 def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
518 """Perform a check on some markup to see if it looks like XML
519 that's not XHTML. If so, issue a warning.
520
521 This is much less reliable than doing the check while parsing,
522 but some of the tree builders can't do that.
523
524 :param stacklevel: The stacklevel of the code calling this
525 function.
526
527 :return: True if the markup looks like non-XHTML XML, False
528 otherwise.
529
530 """
531 if isinstance(markup, bytes):
532 prefix = cls.XML_PREFIX_B
533 looks_like_html = cls.LOOKS_LIKE_HTML_B
534 else:
535 prefix = cls.XML_PREFIX
536 looks_like_html = cls.LOOKS_LIKE_HTML
537
538 if (markup is not None
539 and markup.startswith(prefix)
540 and not looks_like_html.search(markup[:500])
541 ):
542 cls._warn(stacklevel=stacklevel+2)
543 return True
544 return False
545
546 @classmethod
547 def _warn(cls, stacklevel=5):
548 """Issue a warning about XML being parsed as HTML."""
549 warnings.warn(
550 XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
551 stacklevel=stacklevel
552 )
553
554 def _initialize_xml_detector(self):
555 """Call this method before parsing a document."""
556 self._first_processing_instruction = None
557 self._root_tag = None
558
559 def _document_might_be_xml(self, processing_instruction):
560 """Call this method when encountering an XML declaration, or a
561 "processing instruction" that might be an XML declaration.
562 """
563 if (self._first_processing_instruction is not None
564 or self._root_tag is not None):
565 # The document has already started. Don't bother checking
566 # anymore.
567 return
568
569 self._first_processing_instruction = processing_instruction
570
571 # We won't know until we encounter the first tag whether or
572 # not this is actually a problem.
573
574 def _root_tag_encountered(self, name):
575 """Call this when you encounter the document's root tag.
576
577 This is where we actually check whether an XML document is
578 being incorrectly parsed as HTML, and issue the warning.
579 """
580 if self._root_tag is not None:
581 # This method was incorrectly called multiple times. Do
582 # nothing.
583 return
584
585 self._root_tag = name
586 if (name != 'html' and self._first_processing_instruction is not None
587 and self._first_processing_instruction.lower().startswith('xml ')):
588 # We encountered an XML declaration and then a tag other
589 # than 'html'. This is a reliable indicator that a
590 # non-XHTML XML document is being parsed as HTML.
591 self._warn()
592
593
291def register_treebuilders_from(module): 594def register_treebuilders_from(module):
292 """Copy TreeBuilders from the given module into this module.""" 595 """Copy TreeBuilders from the given module into this module."""
293 # I'm fairly sure this is not the best way to do this. 596 this_module = sys.modules[__name__]
294 this_module = sys.modules['bs4.builder']
295 for name in module.__all__: 597 for name in module.__all__:
296 obj = getattr(module, name) 598 obj = getattr(module, name)
297 599
@@ -302,12 +604,22 @@ def register_treebuilders_from(module):
302 this_module.builder_registry.register(obj) 604 this_module.builder_registry.register(obj)
303 605
304class ParserRejectedMarkup(Exception): 606class ParserRejectedMarkup(Exception):
305 pass 607 """An Exception to be raised when the underlying parser simply
306 608 refuses to parse the given markup.
609 """
610 def __init__(self, message_or_exception):
611 """Explain why the parser rejected the given markup, either
612 with a textual explanation or another exception.
613 """
614 if isinstance(message_or_exception, Exception):
615 e = message_or_exception
616 message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
617 super(ParserRejectedMarkup, self).__init__(message_or_exception)
618
307# Builders are registered in reverse order of priority, so that custom 619# Builders are registered in reverse order of priority, so that custom
308# builder registrations will take precedence. In general, we want lxml 620# builder registrations will take precedence. In general, we want lxml
309# to take precedence over html5lib, because it's faster. And we only 621# to take precedence over html5lib, because it's faster. And we only
310# want to use HTMLParser as a last result. 622# want to use HTMLParser as a last resort.
311from . import _htmlparser 623from . import _htmlparser
312register_treebuilders_from(_htmlparser) 624register_treebuilders_from(_htmlparser)
313try: 625try:
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 9e9216ef9c..7c46a85118 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -1,9 +1,14 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1__all__ = [ 4__all__ = [
2 'HTML5TreeBuilder', 5 'HTML5TreeBuilder',
3 ] 6 ]
4 7
5import warnings 8import warnings
9import re
6from bs4.builder import ( 10from bs4.builder import (
11 DetectsXMLParsedAsHTML,
7 PERMISSIVE, 12 PERMISSIVE,
8 HTML, 13 HTML,
9 HTML_5, 14 HTML_5,
@@ -11,17 +16,13 @@ from bs4.builder import (
11 ) 16 )
12from bs4.element import ( 17from bs4.element import (
13 NamespacedAttribute, 18 NamespacedAttribute,
14 whitespace_re, 19 nonwhitespace_re,
15) 20)
16import html5lib 21import html5lib
17try: 22from html5lib.constants import (
18 # html5lib >= 0.99999999/1.0b9 23 namespaces,
19 from html5lib.treebuilders import base as treebuildersbase 24 prefixes,
20except ImportError: 25 )
21 # html5lib <= 0.9999999/1.0b8
22 from html5lib.treebuilders import _base as treebuildersbase
23from html5lib.constants import namespaces
24
25from bs4.element import ( 26from bs4.element import (
26 Comment, 27 Comment,
27 Doctype, 28 Doctype,
@@ -29,13 +30,37 @@ from bs4.element import (
29 Tag, 30 Tag,
30 ) 31 )
31 32
33try:
34 # Pre-0.99999999
35 from html5lib.treebuilders import _base as treebuilder_base
36 new_html5lib = False
37except ImportError as e:
38 # 0.99999999 and up
39 from html5lib.treebuilders import base as treebuilder_base
40 new_html5lib = True
41
32class HTML5TreeBuilder(HTMLTreeBuilder): 42class HTML5TreeBuilder(HTMLTreeBuilder):
33 """Use html5lib to build a tree.""" 43 """Use html5lib to build a tree.
44
45 Note that this TreeBuilder does not support some features common
46 to HTML TreeBuilders. Some of these features could theoretically
47 be implemented, but at the very least it's quite difficult,
48 because html5lib moves the parse tree around as it's being built.
49
50 * This TreeBuilder doesn't use different subclasses of NavigableString
51 based on the name of the tag in which the string was found.
52
53 * You can't use a SoupStrainer to parse only part of a document.
54 """
34 55
35 NAME = "html5lib" 56 NAME = "html5lib"
36 57
37 features = [NAME, PERMISSIVE, HTML_5, HTML] 58 features = [NAME, PERMISSIVE, HTML_5, HTML]
38 59
60 # html5lib can tell us which line number and position in the
61 # original file is the source of an element.
62 TRACKS_LINE_NUMBERS = True
63
39 def prepare_markup(self, markup, user_specified_encoding, 64 def prepare_markup(self, markup, user_specified_encoding,
40 document_declared_encoding=None, exclude_encodings=None): 65 document_declared_encoding=None, exclude_encodings=None):
41 # Store the user-specified encoding for use later on. 66 # Store the user-specified encoding for use later on.
@@ -45,27 +70,56 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
45 # ATM because the html5lib TreeBuilder doesn't use 70 # ATM because the html5lib TreeBuilder doesn't use
46 # UnicodeDammit. 71 # UnicodeDammit.
47 if exclude_encodings: 72 if exclude_encodings:
48 warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 73 warnings.warn(
74 "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
75 stacklevel=3
76 )
77
78 # html5lib only parses HTML, so if it's given XML that's worth
79 # noting.
80 DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
81 markup, stacklevel=3
82 )
83
49 yield (markup, None, None, False) 84 yield (markup, None, None, False)
50 85
51 # These methods are defined by Beautiful Soup. 86 # These methods are defined by Beautiful Soup.
52 def feed(self, markup): 87 def feed(self, markup):
53 if self.soup.parse_only is not None: 88 if self.soup.parse_only is not None:
54 warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 89 warnings.warn(
90 "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
91 stacklevel=4
92 )
55 parser = html5lib.HTMLParser(tree=self.create_treebuilder) 93 parser = html5lib.HTMLParser(tree=self.create_treebuilder)
56 doc = parser.parse(markup, encoding=self.user_specified_encoding) 94 self.underlying_builder.parser = parser
57 95 extra_kwargs = dict()
96 if not isinstance(markup, str):
97 if new_html5lib:
98 extra_kwargs['override_encoding'] = self.user_specified_encoding
99 else:
100 extra_kwargs['encoding'] = self.user_specified_encoding
101 doc = parser.parse(markup, **extra_kwargs)
102
58 # Set the character encoding detected by the tokenizer. 103 # Set the character encoding detected by the tokenizer.
59 if isinstance(markup, str): 104 if isinstance(markup, str):
60 # We need to special-case this because html5lib sets 105 # We need to special-case this because html5lib sets
61 # charEncoding to UTF-8 if it gets Unicode input. 106 # charEncoding to UTF-8 if it gets Unicode input.
62 doc.original_encoding = None 107 doc.original_encoding = None
63 else: 108 else:
64 doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 109 original_encoding = parser.tokenizer.stream.charEncoding[0]
65 110 if not isinstance(original_encoding, str):
111 # In 0.99999999 and up, the encoding is an html5lib
112 # Encoding object. We want to use a string for compatibility
113 # with other tree builders.
114 original_encoding = original_encoding.name
115 doc.original_encoding = original_encoding
116 self.underlying_builder.parser = None
117
66 def create_treebuilder(self, namespaceHTMLElements): 118 def create_treebuilder(self, namespaceHTMLElements):
67 self.underlying_builder = TreeBuilderForHtml5lib( 119 self.underlying_builder = TreeBuilderForHtml5lib(
68 self.soup, namespaceHTMLElements) 120 namespaceHTMLElements, self.soup,
121 store_line_numbers=self.store_line_numbers
122 )
69 return self.underlying_builder 123 return self.underlying_builder
70 124
71 def test_fragment_to_document(self, fragment): 125 def test_fragment_to_document(self, fragment):
@@ -73,12 +127,30 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
73 return '<html><head></head><body>%s</body></html>' % fragment 127 return '<html><head></head><body>%s</body></html>' % fragment
74 128
75 129
76class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder): 130class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
77 131
78 def __init__(self, soup, namespaceHTMLElements): 132 def __init__(self, namespaceHTMLElements, soup=None,
79 self.soup = soup 133 store_line_numbers=True, **kwargs):
134 if soup:
135 self.soup = soup
136 else:
137 from bs4 import BeautifulSoup
138 # TODO: Why is the parser 'html.parser' here? To avoid an
139 # infinite loop?
140 self.soup = BeautifulSoup(
141 "", "html.parser", store_line_numbers=store_line_numbers,
142 **kwargs
143 )
144 # TODO: What are **kwargs exactly? Should they be passed in
145 # here in addition to/instead of being passed to the BeautifulSoup
146 # constructor?
80 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 147 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
81 148
149 # This will be set later to an html5lib.html5parser.HTMLParser
150 # object, which we can use to track the current line number.
151 self.parser = None
152 self.store_line_numbers = store_line_numbers
153
82 def documentClass(self): 154 def documentClass(self):
83 self.soup.reset() 155 self.soup.reset()
84 return Element(self.soup, self.soup, None) 156 return Element(self.soup, self.soup, None)
@@ -92,14 +164,26 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
92 self.soup.object_was_parsed(doctype) 164 self.soup.object_was_parsed(doctype)
93 165
94 def elementClass(self, name, namespace): 166 def elementClass(self, name, namespace):
95 tag = self.soup.new_tag(name, namespace) 167 kwargs = {}
168 if self.parser and self.store_line_numbers:
169 # This represents the point immediately after the end of the
170 # tag. We don't know when the tag started, but we do know
171 # where it ended -- the character just before this one.
172 sourceline, sourcepos = self.parser.tokenizer.stream.position()
173 kwargs['sourceline'] = sourceline
174 kwargs['sourcepos'] = sourcepos-1
175 tag = self.soup.new_tag(name, namespace, **kwargs)
176
96 return Element(tag, self.soup, namespace) 177 return Element(tag, self.soup, namespace)
97 178
98 def commentClass(self, data): 179 def commentClass(self, data):
99 return TextNode(Comment(data), self.soup) 180 return TextNode(Comment(data), self.soup)
100 181
101 def fragmentClass(self): 182 def fragmentClass(self):
102 self.soup = BeautifulSoup("") 183 from bs4 import BeautifulSoup
184 # TODO: Why is the parser 'html.parser' here? To avoid an
185 # infinite loop?
186 self.soup = BeautifulSoup("", "html.parser")
103 self.soup.name = "[document_fragment]" 187 self.soup.name = "[document_fragment]"
104 return Element(self.soup, self.soup, None) 188 return Element(self.soup, self.soup, None)
105 189
@@ -111,7 +195,57 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
111 return self.soup 195 return self.soup
112 196
113 def getFragment(self): 197 def getFragment(self):
114 return treebuildersbase.TreeBuilder.getFragment(self).element 198 return treebuilder_base.TreeBuilder.getFragment(self).element
199
200 def testSerializer(self, element):
201 from bs4 import BeautifulSoup
202 rv = []
203 doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
204
205 def serializeElement(element, indent=0):
206 if isinstance(element, BeautifulSoup):
207 pass
208 if isinstance(element, Doctype):
209 m = doctype_re.match(element)
210 if m:
211 name = m.group(1)
212 if m.lastindex > 1:
213 publicId = m.group(2) or ""
214 systemId = m.group(3) or m.group(4) or ""
215 rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
216 (' ' * indent, name, publicId, systemId))
217 else:
218 rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
219 else:
220 rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
221 elif isinstance(element, Comment):
222 rv.append("|%s<!-- %s -->" % (' ' * indent, element))
223 elif isinstance(element, NavigableString):
224 rv.append("|%s\"%s\"" % (' ' * indent, element))
225 else:
226 if element.namespace:
227 name = "%s %s" % (prefixes[element.namespace],
228 element.name)
229 else:
230 name = element.name
231 rv.append("|%s<%s>" % (' ' * indent, name))
232 if element.attrs:
233 attributes = []
234 for name, value in list(element.attrs.items()):
235 if isinstance(name, NamespacedAttribute):
236 name = "%s %s" % (prefixes[name.namespace], name.name)
237 if isinstance(value, list):
238 value = " ".join(value)
239 attributes.append((name, value))
240
241 for name, value in sorted(attributes):
242 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
243 indent += 2
244 for child in element.children:
245 serializeElement(child, indent)
246 serializeElement(element, 0)
247
248 return "\n".join(rv)
115 249
116class AttrList(object): 250class AttrList(object):
117 def __init__(self, element): 251 def __init__(self, element):
@@ -122,14 +256,14 @@ class AttrList(object):
122 def __setitem__(self, name, value): 256 def __setitem__(self, name, value):
123 # If this attribute is a multi-valued attribute for this element, 257 # If this attribute is a multi-valued attribute for this element,
124 # turn its value into a list. 258 # turn its value into a list.
125 list_attr = HTML5TreeBuilder.cdata_list_attributes 259 list_attr = self.element.cdata_list_attributes or {}
126 if (name in list_attr['*'] 260 if (name in list_attr.get('*', [])
127 or (self.element.name in list_attr 261 or (self.element.name in list_attr
128 and name in list_attr[self.element.name])): 262 and name in list_attr.get(self.element.name, []))):
129 # A node that is being cloned may have already undergone 263 # A node that is being cloned may have already undergone
130 # this procedure. 264 # this procedure.
131 if not isinstance(value, list): 265 if not isinstance(value, list):
132 value = whitespace_re.split(value) 266 value = nonwhitespace_re.findall(value)
133 self.element[name] = value 267 self.element[name] = value
134 def items(self): 268 def items(self):
135 return list(self.attrs.items()) 269 return list(self.attrs.items())
@@ -143,9 +277,9 @@ class AttrList(object):
143 return name in list(self.attrs.keys()) 277 return name in list(self.attrs.keys())
144 278
145 279
146class Element(treebuildersbase.Node): 280class Element(treebuilder_base.Node):
147 def __init__(self, element, soup, namespace): 281 def __init__(self, element, soup, namespace):
148 treebuildersbase.Node.__init__(self, element.name) 282 treebuilder_base.Node.__init__(self, element.name)
149 self.element = element 283 self.element = element
150 self.soup = soup 284 self.soup = soup
151 self.namespace = namespace 285 self.namespace = namespace
@@ -164,13 +298,15 @@ class Element(treebuildersbase.Node):
164 child = node 298 child = node
165 elif node.element.__class__ == NavigableString: 299 elif node.element.__class__ == NavigableString:
166 string_child = child = node.element 300 string_child = child = node.element
301 node.parent = self
167 else: 302 else:
168 child = node.element 303 child = node.element
304 node.parent = self
169 305
170 if not isinstance(child, str) and child.parent is not None: 306 if not isinstance(child, str) and child.parent is not None:
171 node.element.extract() 307 node.element.extract()
172 308
173 if (string_child and self.element.contents 309 if (string_child is not None and self.element.contents
174 and self.element.contents[-1].__class__ == NavigableString): 310 and self.element.contents[-1].__class__ == NavigableString):
175 # We are appending a string onto another string. 311 # We are appending a string onto another string.
176 # TODO This has O(n^2) performance, for input like 312 # TODO This has O(n^2) performance, for input like
@@ -203,12 +339,12 @@ class Element(treebuildersbase.Node):
203 most_recent_element=most_recent_element) 339 most_recent_element=most_recent_element)
204 340
205 def getAttributes(self): 341 def getAttributes(self):
342 if isinstance(self.element, Comment):
343 return {}
206 return AttrList(self.element) 344 return AttrList(self.element)
207 345
208 def setAttributes(self, attributes): 346 def setAttributes(self, attributes):
209
210 if attributes is not None and len(attributes) > 0: 347 if attributes is not None and len(attributes) > 0:
211
212 converted_attributes = [] 348 converted_attributes = []
213 for name, value in list(attributes.items()): 349 for name, value in list(attributes.items()):
214 if isinstance(name, tuple): 350 if isinstance(name, tuple):
@@ -230,11 +366,11 @@ class Element(treebuildersbase.Node):
230 attributes = property(getAttributes, setAttributes) 366 attributes = property(getAttributes, setAttributes)
231 367
232 def insertText(self, data, insertBefore=None): 368 def insertText(self, data, insertBefore=None):
369 text = TextNode(self.soup.new_string(data), self.soup)
233 if insertBefore: 370 if insertBefore:
234 text = TextNode(self.soup.new_string(data), self.soup) 371 self.insertBefore(text, insertBefore)
235 self.insertBefore(data, insertBefore)
236 else: 372 else:
237 self.appendChild(data) 373 self.appendChild(text)
238 374
239 def insertBefore(self, node, refNode): 375 def insertBefore(self, node, refNode):
240 index = self.element.index(refNode.element) 376 index = self.element.index(refNode.element)
@@ -253,9 +389,10 @@ class Element(treebuildersbase.Node):
253 389
254 def reparentChildren(self, new_parent): 390 def reparentChildren(self, new_parent):
255 """Move all of this tag's children into another tag.""" 391 """Move all of this tag's children into another tag."""
256 # print "MOVE", self.element.contents 392 # print("MOVE", self.element.contents)
257 # print "FROM", self.element 393 # print("FROM", self.element)
258 # print "TO", new_parent.element 394 # print("TO", new_parent.element)
395
259 element = self.element 396 element = self.element
260 new_parent_element = new_parent.element 397 new_parent_element = new_parent.element
261 # Determine what this tag's next_element will be once all the children 398 # Determine what this tag's next_element will be once all the children
@@ -274,29 +411,35 @@ class Element(treebuildersbase.Node):
274 new_parents_last_descendant_next_element = new_parent_element.next_element 411 new_parents_last_descendant_next_element = new_parent_element.next_element
275 412
276 to_append = element.contents 413 to_append = element.contents
277 append_after = new_parent_element.contents
278 if len(to_append) > 0: 414 if len(to_append) > 0:
279 # Set the first child's previous_element and previous_sibling 415 # Set the first child's previous_element and previous_sibling
280 # to elements within the new parent 416 # to elements within the new parent
281 first_child = to_append[0] 417 first_child = to_append[0]
282 if new_parents_last_descendant: 418 if new_parents_last_descendant is not None:
283 first_child.previous_element = new_parents_last_descendant 419 first_child.previous_element = new_parents_last_descendant
284 else: 420 else:
285 first_child.previous_element = new_parent_element 421 first_child.previous_element = new_parent_element
286 first_child.previous_sibling = new_parents_last_child 422 first_child.previous_sibling = new_parents_last_child
287 if new_parents_last_descendant: 423 if new_parents_last_descendant is not None:
288 new_parents_last_descendant.next_element = first_child 424 new_parents_last_descendant.next_element = first_child
289 else: 425 else:
290 new_parent_element.next_element = first_child 426 new_parent_element.next_element = first_child
291 if new_parents_last_child: 427 if new_parents_last_child is not None:
292 new_parents_last_child.next_sibling = first_child 428 new_parents_last_child.next_sibling = first_child
293 429
294 # Fix the last child's next_element and next_sibling 430 # Find the very last element being moved. It is now the
295 last_child = to_append[-1] 431 # parent's last descendant. It has no .next_sibling and
296 last_child.next_element = new_parents_last_descendant_next_element 432 # its .next_element is whatever the previous last
297 if new_parents_last_descendant_next_element: 433 # descendant had.
298 new_parents_last_descendant_next_element.previous_element = last_child 434 last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
299 last_child.next_sibling = None 435
436 last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
437 if new_parents_last_descendant_next_element is not None:
438 # TODO: This code has no test coverage and I'm not sure
439 # how to get html5lib to go through this path, but it's
440 # just the other side of the previous line.
441 new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
442 last_childs_last_descendant.next_sibling = None
300 443
301 for child in to_append: 444 for child in to_append:
302 child.parent = new_parent_element 445 child.parent = new_parent_element
@@ -306,9 +449,9 @@ class Element(treebuildersbase.Node):
306 element.contents = [] 449 element.contents = []
307 element.next_element = final_next_element 450 element.next_element = final_next_element
308 451
309 # print "DONE WITH MOVE" 452 # print("DONE WITH MOVE")
310 # print "FROM", self.element 453 # print("FROM", self.element)
311 # print "TO", new_parent_element 454 # print("TO", new_parent_element)
312 455
313 def cloneNode(self): 456 def cloneNode(self):
314 tag = self.soup.new_tag(self.element.name, self.namespace) 457 tag = self.soup.new_tag(self.element.name, self.namespace)
@@ -321,7 +464,7 @@ class Element(treebuildersbase.Node):
321 return self.element.contents 464 return self.element.contents
322 465
323 def getNameTuple(self): 466 def getNameTuple(self):
324 if self.namespace is None: 467 if self.namespace == None:
325 return namespaces["html"], self.name 468 return namespaces["html"], self.name
326 else: 469 else:
327 return self.namespace, self.name 470 return self.namespace, self.name
@@ -330,7 +473,7 @@ class Element(treebuildersbase.Node):
330 473
331class TextNode(Element): 474class TextNode(Element):
332 def __init__(self, element, soup): 475 def __init__(self, element, soup):
333 treebuildersbase.Node.__init__(self, None) 476 treebuilder_base.Node.__init__(self, None)
334 self.element = element 477 self.element = element
335 self.soup = soup 478 self.soup = soup
336 479
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index bb0a63f2f3..3cc187f892 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -1,35 +1,18 @@
1# encoding: utf-8
1"""Use the HTMLParser library to parse HTML files that aren't too bad.""" 2"""Use the HTMLParser library to parse HTML files that aren't too bad."""
2 3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
3__all__ = [ 7__all__ = [
4 'HTMLParserTreeBuilder', 8 'HTMLParserTreeBuilder',
5 ] 9 ]
6 10
7from html.parser import HTMLParser 11from html.parser import HTMLParser
8 12
9try:
10 from html.parser import HTMLParseError
11except ImportError as e:
12 # HTMLParseError is removed in Python 3.5. Since it can never be
13 # thrown in 3.5, we can just define our own class as a placeholder.
14 class HTMLParseError(Exception):
15 pass
16
17import sys 13import sys
18import warnings 14import warnings
19 15
20# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
21# argument, which we'd like to set to False. Unfortunately,
22# http://bugs.python.org/issue13273 makes strict=True a better bet
23# before Python 3.2.3.
24#
25# At the end of this file, we monkeypatch HTMLParser so that
26# strict=True works well on Python 3.2.2.
27major, minor, release = sys.version_info[:3]
28CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
29CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
30CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
31
32
33from bs4.element import ( 16from bs4.element import (
34 CData, 17 CData,
35 Comment, 18 Comment,
@@ -40,6 +23,8 @@ from bs4.element import (
40from bs4.dammit import EntitySubstitution, UnicodeDammit 23from bs4.dammit import EntitySubstitution, UnicodeDammit
41 24
42from bs4.builder import ( 25from bs4.builder import (
26 DetectsXMLParsedAsHTML,
27 ParserRejectedMarkup,
43 HTML, 28 HTML,
44 HTMLTreeBuilder, 29 HTMLTreeBuilder,
45 STRICT, 30 STRICT,
@@ -48,8 +33,84 @@ from bs4.builder import (
48 33
49HTMLPARSER = 'html.parser' 34HTMLPARSER = 'html.parser'
50 35
51class BeautifulSoupHTMLParser(HTMLParser): 36class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
52 def handle_starttag(self, name, attrs): 37 """A subclass of the Python standard library's HTMLParser class, which
38 listens for HTMLParser events and translates them into calls
39 to Beautiful Soup's tree construction API.
40 """
41
42 # Strategies for handling duplicate attributes
43 IGNORE = 'ignore'
44 REPLACE = 'replace'
45
46 def __init__(self, *args, **kwargs):
47 """Constructor.
48
49 :param on_duplicate_attribute: A strategy for what to do if a
50 tag includes the same attribute more than once. Accepted
51 values are: REPLACE (replace earlier values with later
52 ones, the default), IGNORE (keep the earliest value
53 encountered), or a callable. A callable must take three
54 arguments: the dictionary of attributes already processed,
55 the name of the duplicate attribute, and the most recent value
56 encountered.
57 """
58 self.on_duplicate_attribute = kwargs.pop(
59 'on_duplicate_attribute', self.REPLACE
60 )
61 HTMLParser.__init__(self, *args, **kwargs)
62
63 # Keep a list of empty-element tags that were encountered
64 # without an explicit closing tag. If we encounter a closing tag
65 # of this type, we'll associate it with one of those entries.
66 #
67 # This isn't a stack because we don't care about the
68 # order. It's a list of closing tags we've already handled and
69 # will ignore, assuming they ever show up.
70 self.already_closed_empty_element = []
71
72 self._initialize_xml_detector()
73
74 def error(self, message):
75 # NOTE: This method is required so long as Python 3.9 is
76 # supported. The corresponding code is removed from HTMLParser
77 # in 3.5, but not removed from ParserBase until 3.10.
78 # https://github.com/python/cpython/issues/76025
79 #
80 # The original implementation turned the error into a warning,
81 # but in every case I discovered, this made HTMLParser
82 # immediately crash with an error message that was less
83 # helpful than the warning. The new implementation makes it
84 # more clear that html.parser just can't parse this
85 # markup. The 3.10 implementation does the same, though it
86 # raises AssertionError rather than calling a method. (We
87 # catch this error and wrap it in a ParserRejectedMarkup.)
88 raise ParserRejectedMarkup(message)
89
90 def handle_startendtag(self, name, attrs):
91 """Handle an incoming empty-element tag.
92
93 This is only called when the markup looks like <tag/>.
94
95 :param name: Name of the tag.
95 :param attrs: List of (name, value) pairs for the tag's attributes.
97 """
98 # handle_empty_element=False tells handle_starttag not to close the tag
99 # just because its name matches a known empty-element tag. We
100 # know that this is an empty-element tag and we want to call
101 # handle_endtag ourselves.
102 tag = self.handle_starttag(name, attrs, handle_empty_element=False)
103 self.handle_endtag(name)
104
105 def handle_starttag(self, name, attrs, handle_empty_element=True):
106 """Handle an opening tag, e.g. '<tag>'
107
108 :param name: Name of the tag.
109 :param attrs: List of (name, value) pairs for the tag's attributes.
110 :param handle_empty_element: True if this method should itself
111 call handle_endtag() when the tag turns out to be an
112 empty-element tag; handle_startendtag() passes False.
113 """
53 # XXX namespace 114 # XXX namespace
54 attr_dict = {} 115 attr_dict = {}
55 for key, value in attrs: 116 for key, value in attrs:
@@ -57,20 +118,78 @@ class BeautifulSoupHTMLParser(HTMLParser):
57 # for consistency with the other tree builders. 118 # for consistency with the other tree builders.
58 if value is None: 119 if value is None:
59 value = '' 120 value = ''
60 attr_dict[key] = value 121 if key in attr_dict:
122 # A single attribute shows up multiple times in this
123 # tag. How to handle it depends on the
124 # on_duplicate_attribute setting.
125 on_dupe = self.on_duplicate_attribute
126 if on_dupe == self.IGNORE:
127 pass
128 elif on_dupe in (None, self.REPLACE):
129 attr_dict[key] = value
130 else:
131 on_dupe(attr_dict, key, value)
132 else:
133 attr_dict[key] = value
61 attrvalue = '""' 134 attrvalue = '""'
62 self.soup.handle_starttag(name, None, None, attr_dict) 135 #print("START", name)
63 136 sourceline, sourcepos = self.getpos()
64 def handle_endtag(self, name): 137 tag = self.soup.handle_starttag(
65 self.soup.handle_endtag(name) 138 name, None, None, attr_dict, sourceline=sourceline,
66 139 sourcepos=sourcepos
140 )
141 if tag and tag.is_empty_element and handle_empty_element:
142 # Unlike other parsers, html.parser doesn't send separate end tag
143 # events for empty-element tags. (It's handled in
144 # handle_startendtag, but only if the original markup looked like
145 # <tag/>.)
146 #
147 # So we need to call handle_endtag() ourselves. Since we
148 # know the start event is identical to the end event, we
149 # don't want handle_endtag() to cross off any previous end
150 # events for tags of this name.
151 self.handle_endtag(name, check_already_closed=False)
152
153 # But we might encounter an explicit closing tag for this tag
154 # later on. If so, we want to ignore it.
155 self.already_closed_empty_element.append(name)
156
157 if self._root_tag is None:
158 self._root_tag_encountered(name)
159
160 def handle_endtag(self, name, check_already_closed=True):
161 """Handle a closing tag, e.g. '</tag>'
162
163 :param name: A tag name.
164 :param check_already_closed: True if this tag is expected to
165 be the closing portion of an empty-element tag,
166 e.g. '<tag></tag>'.
167 """
168 #print("END", name)
169 if check_already_closed and name in self.already_closed_empty_element:
170 # This is a redundant end tag for an empty-element tag.
171 # We've already called handle_endtag() for it, so just
172 # check it off the list.
173 #print("ALREADY CLOSED", name)
174 self.already_closed_empty_element.remove(name)
175 else:
176 self.soup.handle_endtag(name)
177
67 def handle_data(self, data): 178 def handle_data(self, data):
179 """Handle some textual data that shows up between tags."""
68 self.soup.handle_data(data) 180 self.soup.handle_data(data)
69 181
70 def handle_charref(self, name): 182 def handle_charref(self, name):
71 # XXX workaround for a bug in HTMLParser. Remove this once 183 """Handle a numeric character reference by converting it to the
72 # it's fixed in all supported versions. 184 corresponding Unicode character and treating it as textual
73 # http://bugs.python.org/issue13633 185 data.
186
187 :param name: Character number, possibly in hexadecimal.
188 """
189 # TODO: This was originally a workaround for a bug in
190 # HTMLParser. (http://bugs.python.org/issue13633) The bug has
191 # been fixed, but removing this code still makes some
192 # Beautiful Soup tests fail. This needs investigation.
74 if name.startswith('x'): 193 if name.startswith('x'):
75 real_name = int(name.lstrip('x'), 16) 194 real_name = int(name.lstrip('x'), 16)
76 elif name.startswith('X'): 195 elif name.startswith('X'):
@@ -78,37 +197,71 @@ class BeautifulSoupHTMLParser(HTMLParser):
78 else: 197 else:
79 real_name = int(name) 198 real_name = int(name)
80 199
81 try: 200 data = None
82 data = chr(real_name) 201 if real_name < 256:
83 except (ValueError, OverflowError) as e: 202 # HTML numeric entities are supposed to reference Unicode
84 data = "\N{REPLACEMENT CHARACTER}" 203 # code points, but sometimes they reference code points in
85 204 # some other encoding (ahem, Windows-1252). E.g. &#147;
205 # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
206 # code tries to detect this situation and compensate.
207 for encoding in (self.soup.original_encoding, 'windows-1252'):
208 if not encoding:
209 continue
210 try:
211 data = bytearray([real_name]).decode(encoding)
212 except UnicodeDecodeError as e:
213 pass
214 if not data:
215 try:
216 data = chr(real_name)
217 except (ValueError, OverflowError) as e:
218 pass
219 data = data or "\N{REPLACEMENT CHARACTER}"
86 self.handle_data(data) 220 self.handle_data(data)
87 221
88 def handle_entityref(self, name): 222 def handle_entityref(self, name):
223 """Handle a named entity reference by converting it to the
224 corresponding Unicode character(s) and treating it as textual
225 data.
226
227 :param name: Name of the entity reference.
228 """
89 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 229 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
90 if character is not None: 230 if character is not None:
91 data = character 231 data = character
92 else: 232 else:
93 data = "&%s;" % name 233 # If this were XML, it would be ambiguous whether "&foo"
234 # was a character entity reference with a missing
235 # semicolon or the literal string "&foo". Since this is
236 # HTML, we have a complete list of all character entity references,
237 # and this one wasn't found, so assume it's the literal string "&foo".
238 data = "&%s" % name
94 self.handle_data(data) 239 self.handle_data(data)
95 240
96 def handle_comment(self, data): 241 def handle_comment(self, data):
242 """Handle an HTML comment.
243
244 :param data: The text of the comment.
245 """
97 self.soup.endData() 246 self.soup.endData()
98 self.soup.handle_data(data) 247 self.soup.handle_data(data)
99 self.soup.endData(Comment) 248 self.soup.endData(Comment)
100 249
101 def handle_decl(self, data): 250 def handle_decl(self, data):
251 """Handle a DOCTYPE declaration.
252
253 :param data: The text of the declaration.
254 """
102 self.soup.endData() 255 self.soup.endData()
103 if data.startswith("DOCTYPE "): 256 data = data[len("DOCTYPE "):]
104 data = data[len("DOCTYPE "):]
105 elif data == 'DOCTYPE':
106 # i.e. "<!DOCTYPE>"
107 data = ''
108 self.soup.handle_data(data) 257 self.soup.handle_data(data)
109 self.soup.endData(Doctype) 258 self.soup.endData(Doctype)
110 259
111 def unknown_decl(self, data): 260 def unknown_decl(self, data):
261 """Handle a declaration of unknown type -- probably a CDATA block.
262
263 :param data: The text of the declaration.
264 """
112 if data.upper().startswith('CDATA['): 265 if data.upper().startswith('CDATA['):
113 cls = CData 266 cls = CData
114 data = data[len('CDATA['):] 267 data = data[len('CDATA['):]
@@ -119,144 +272,116 @@ class BeautifulSoupHTMLParser(HTMLParser):
119 self.soup.endData(cls) 272 self.soup.endData(cls)
120 273
121 def handle_pi(self, data): 274 def handle_pi(self, data):
275 """Handle a processing instruction.
276
277 :param data: The text of the instruction.
278 """
122 self.soup.endData() 279 self.soup.endData()
123 self.soup.handle_data(data) 280 self.soup.handle_data(data)
281 self._document_might_be_xml(data)
124 self.soup.endData(ProcessingInstruction) 282 self.soup.endData(ProcessingInstruction)
125 283
126 284
127class HTMLParserTreeBuilder(HTMLTreeBuilder): 285class HTMLParserTreeBuilder(HTMLTreeBuilder):
128 286 """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
287 found in the Python standard library.
288 """
129 is_xml = False 289 is_xml = False
130 picklable = True 290 picklable = True
131 NAME = HTMLPARSER 291 NAME = HTMLPARSER
132 features = [NAME, HTML, STRICT] 292 features = [NAME, HTML, STRICT]
133 293
134 def __init__(self, *args, **kwargs): 294 # The html.parser knows which line number and position in the
135 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 295 # original file is the source of an element.
136 kwargs['strict'] = False 296 TRACKS_LINE_NUMBERS = True
137 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 kwargs['convert_charrefs'] = False
139 self.parser_args = (args, kwargs)
140 297
298 def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
299 """Constructor.
300
301 :param parser_args: Positional arguments to pass into
302 the BeautifulSoupHTMLParser constructor, once it's
303 invoked.
304 :param parser_kwargs: Keyword arguments to pass into
305 the BeautifulSoupHTMLParser constructor, once it's
306 invoked.
307 :param kwargs: Keyword arguments for the superclass constructor.
308 """
309 # Some keyword arguments will be pulled out of kwargs and placed
310 # into parser_kwargs.
311 extra_parser_kwargs = dict()
312 for arg in ('on_duplicate_attribute',):
313 if arg in kwargs:
314 value = kwargs.pop(arg)
315 extra_parser_kwargs[arg] = value
316 super(HTMLParserTreeBuilder, self).__init__(**kwargs)
317 parser_args = parser_args or []
318 parser_kwargs = parser_kwargs or {}
319 parser_kwargs.update(extra_parser_kwargs)
320 parser_kwargs['convert_charrefs'] = False
321 self.parser_args = (parser_args, parser_kwargs)
322
141 def prepare_markup(self, markup, user_specified_encoding=None, 323 def prepare_markup(self, markup, user_specified_encoding=None,
142 document_declared_encoding=None, exclude_encodings=None): 324 document_declared_encoding=None, exclude_encodings=None):
143 """ 325
144 :return: A 4-tuple (markup, original encoding, encoding 326 """Run any preliminary steps necessary to make incoming markup
145 declared within markup, whether any characters had to be 327 acceptable to the parser.
146 replaced with REPLACEMENT CHARACTER). 328
329 :param markup: Some markup -- probably a bytestring.
330 :param user_specified_encoding: The user asked to try this encoding.
331 :param document_declared_encoding: The markup itself claims to be
332 in this encoding.
333 :param exclude_encodings: The user asked _not_ to try any of
334 these encodings.
335
336 :yield: A series of 4-tuples:
337 (markup, encoding, declared encoding,
338 has undergone character replacement)
339
340 Each 4-tuple represents a strategy for converting the
341 document to Unicode and parsing it. Each strategy will be tried
342 in turn.
147 """ 343 """
148 if isinstance(markup, str): 344 if isinstance(markup, str):
345 # Parse Unicode as-is.
149 yield (markup, None, None, False) 346 yield (markup, None, None, False)
150 return 347 return
151 348
349 # Ask UnicodeDammit to sniff the most likely encoding.
350
351 # This was provided by the end-user; treat it as a known
352 # definite encoding per the algorithm laid out in the HTML5
353 # spec. (See the EncodingDetector class for details.)
354 known_definite_encodings = [user_specified_encoding]
355
356 # This was found in the document; treat it as a slightly lower-priority
357 # user encoding.
358 user_encodings = [document_declared_encoding]
359
152 try_encodings = [user_specified_encoding, document_declared_encoding] 360 try_encodings = [user_specified_encoding, document_declared_encoding]
153 dammit = UnicodeDammit(markup, try_encodings, is_html=True, 361 dammit = UnicodeDammit(
154 exclude_encodings=exclude_encodings) 362 markup,
363 known_definite_encodings=known_definite_encodings,
364 user_encodings=user_encodings,
365 is_html=True,
366 exclude_encodings=exclude_encodings
367 )
155 yield (dammit.markup, dammit.original_encoding, 368 yield (dammit.markup, dammit.original_encoding,
156 dammit.declared_html_encoding, 369 dammit.declared_html_encoding,
157 dammit.contains_replacement_characters) 370 dammit.contains_replacement_characters)
158 371
159 def feed(self, markup): 372 def feed(self, markup):
373 """Run some incoming markup through some parsing process,
374 populating the `BeautifulSoup` object in self.soup.
375 """
160 args, kwargs = self.parser_args 376 args, kwargs = self.parser_args
161 parser = BeautifulSoupHTMLParser(*args, **kwargs) 377 parser = BeautifulSoupHTMLParser(*args, **kwargs)
162 parser.soup = self.soup 378 parser.soup = self.soup
163 try: 379 try:
164 parser.feed(markup) 380 parser.feed(markup)
165 except HTMLParseError as e: 381 parser.close()
166 warnings.warn(RuntimeWarning( 382 except AssertionError as e:
167 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 383 # html.parser raises AssertionError in rare cases to
168 raise e 384 # indicate a fatal problem with the markup, especially
169 385 # when there's an error in the doctype declaration.
170# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 386 raise ParserRejectedMarkup(e)
171# 3.2.3 code. This ensures they don't treat markup like <p></p> as a 387 parser.already_closed_empty_element = []
172# string.
173#
174# XXX This code can be removed once most Python 3 users are on 3.2.3.
175if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
176 import re
177 attrfind_tolerant = re.compile(
178 r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
179 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
180 HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
181
182 locatestarttagend = re.compile(r"""
183 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
184 (?:\s+ # whitespace before attribute name
185 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
186 (?:\s*=\s* # value indicator
187 (?:'[^']*' # LITA-enclosed value
188 |\"[^\"]*\" # LIT-enclosed value
189 |[^'\">\s]+ # bare value
190 )
191 )?
192 )
193 )*
194 \s* # trailing whitespace
195""", re.VERBOSE)
196 BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
197
198 from html.parser import tagfind, attrfind
199
200 def parse_starttag(self, i):
201 self.__starttag_text = None
202 endpos = self.check_for_whole_start_tag(i)
203 if endpos < 0:
204 return endpos
205 rawdata = self.rawdata
206 self.__starttag_text = rawdata[i:endpos]
207
208 # Now parse the data between i+1 and j into a tag and attrs
209 attrs = []
210 match = tagfind.match(rawdata, i+1)
211 assert match, 'unexpected call to parse_starttag()'
212 k = match.end()
213 self.lasttag = tag = rawdata[i+1:k].lower()
214 while k < endpos:
215 if self.strict:
216 m = attrfind.match(rawdata, k)
217 else:
218 m = attrfind_tolerant.match(rawdata, k)
219 if not m:
220 break
221 attrname, rest, attrvalue = m.group(1, 2, 3)
222 if not rest:
223 attrvalue = None
224 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
225 attrvalue[:1] == '"' == attrvalue[-1:]:
226 attrvalue = attrvalue[1:-1]
227 if attrvalue:
228 attrvalue = self.unescape(attrvalue)
229 attrs.append((attrname.lower(), attrvalue))
230 k = m.end()
231
232 end = rawdata[k:endpos].strip()
233 if end not in (">", "/>"):
234 lineno, offset = self.getpos()
235 if "\n" in self.__starttag_text:
236 lineno = lineno + self.__starttag_text.count("\n")
237 offset = len(self.__starttag_text) \
238 - self.__starttag_text.rfind("\n")
239 else:
240 offset = offset + len(self.__starttag_text)
241 if self.strict:
242 self.error("junk characters in start tag: %r"
243 % (rawdata[k:endpos][:20],))
244 self.handle_data(rawdata[i:endpos])
245 return endpos
246 if end.endswith('/>'):
247 # XHTML-style empty tag: <span attr="value" />
248 self.handle_startendtag(tag, attrs)
249 else:
250 self.handle_starttag(tag, attrs)
251 if tag in self.CDATA_CONTENT_ELEMENTS:
252 self.set_cdata_mode(tag)
253 return endpos
254
255 def set_cdata_mode(self, elem):
256 self.cdata_elem = elem.lower()
257 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
258
259 BeautifulSoupHTMLParser.parse_starttag = parse_starttag
260 BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
261
262 CONSTRUCTOR_TAKES_STRICT = True
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index 9c6c14ee65..4f7cf74681 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -1,19 +1,28 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1__all__ = [ 4__all__ = [
2 'LXMLTreeBuilderForXML', 5 'LXMLTreeBuilderForXML',
3 'LXMLTreeBuilder', 6 'LXMLTreeBuilder',
4 ] 7 ]
5 8
9try:
10 from collections.abc import Callable # Python 3.6
11except ImportError as e:
12 from collections import Callable
13
6from io import BytesIO 14from io import BytesIO
7from io import StringIO 15from io import StringIO
8import collections
9from lxml import etree 16from lxml import etree
10from bs4.element import ( 17from bs4.element import (
11 Comment, 18 Comment,
12 Doctype, 19 Doctype,
13 NamespacedAttribute, 20 NamespacedAttribute,
14 ProcessingInstruction, 21 ProcessingInstruction,
22 XMLProcessingInstruction,
15) 23)
16from bs4.builder import ( 24from bs4.builder import (
25 DetectsXMLParsedAsHTML,
17 FAST, 26 FAST,
18 HTML, 27 HTML,
19 HTMLTreeBuilder, 28 HTMLTreeBuilder,
@@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector
25 34
26LXML = 'lxml' 35LXML = 'lxml'
27 36
37def _invert(d):
38 "Invert a dictionary."
39 return dict((v,k) for k, v in list(d.items()))
40
28class LXMLTreeBuilderForXML(TreeBuilder): 41class LXMLTreeBuilderForXML(TreeBuilder):
29 DEFAULT_PARSER_CLASS = etree.XMLParser 42 DEFAULT_PARSER_CLASS = etree.XMLParser
30 43
31 is_xml = True 44 is_xml = True
45 processing_instruction_class = XMLProcessingInstruction
32 46
33 NAME = "lxml-xml" 47 NAME = "lxml-xml"
34 ALTERNATE_NAMES = ["xml"] 48 ALTERNATE_NAMES = ["xml"]
@@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder):
40 54
41 # This namespace mapping is specified in the XML Namespace 55 # This namespace mapping is specified in the XML Namespace
42 # standard. 56 # standard.
43 DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 57 DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
58
59 DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
60
61 # NOTE: If we parsed Element objects and looked at .sourceline,
62 # we'd be able to see the line numbers from the original document.
63 # But instead we build an XMLParser or HTMLParser object to serve
64 # as the target of parse messages, and those messages don't include
65 # line numbers.
66 # See: https://bugs.launchpad.net/lxml/+bug/1846906
67
68 def initialize_soup(self, soup):
69 """Let the BeautifulSoup object know about the standard namespace
70 mapping.
71
72 :param soup: A `BeautifulSoup`.
73 """
74 super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
75 self._register_namespaces(self.DEFAULT_NSMAPS)
76
77 def _register_namespaces(self, mapping):
78 """Let the BeautifulSoup object know about namespaces encountered
79 while parsing the document.
80
81 This might be useful later on when creating CSS selectors.
82
83 This will track (almost) all namespaces, even ones that were
84 only in scope for part of the document. If two namespaces have
85 the same prefix, only the first one encountered will be
86 tracked. Un-prefixed namespaces are not tracked.
44 87
88 :param mapping: A dictionary mapping namespace prefixes to URIs.
89 """
90 for key, value in list(mapping.items()):
91 # This is 'if key' and not 'if key is not None' because we
92 # don't track un-prefixed namespaces. Soupselect will
93 # treat an un-prefixed namespace as the default, which
94 # causes confusion in some cases.
95 if key and key not in self.soup._namespaces:
96 # Let the BeautifulSoup object know about a new namespace.
97 # If there are multiple namespaces defined with the same
98 # prefix, the first one in the document takes precedence.
99 self.soup._namespaces[key] = value
100
45 def default_parser(self, encoding): 101 def default_parser(self, encoding):
46 # This can either return a parser object or a class, which 102 """Find the default parser for the given encoding.
47 # will be instantiated with default arguments. 103
104 :param encoding: A string.
105 :return: Either a parser object or a class, which
106 will be instantiated with default arguments.
107 """
48 if self._default_parser is not None: 108 if self._default_parser is not None:
49 return self._default_parser 109 return self._default_parser
50 return etree.XMLParser( 110 return etree.XMLParser(
51 target=self, strip_cdata=False, recover=True, encoding=encoding) 111 target=self, strip_cdata=False, recover=True, encoding=encoding)
52 112
53 def parser_for(self, encoding): 113 def parser_for(self, encoding):
114 """Instantiate an appropriate parser for the given encoding.
115
116 :param encoding: A string.
117 :return: A parser object such as an `etree.XMLParser`.
118 """
54 # Use the default parser. 119 # Use the default parser.
55 parser = self.default_parser(encoding) 120 parser = self.default_parser(encoding)
56 121
57 if isinstance(parser, collections.Callable): 122 if isinstance(parser, Callable):
58 # Instantiate the parser with default arguments 123 # Instantiate the parser with default arguments
59 parser = parser(target=self, strip_cdata=False, encoding=encoding) 124 parser = parser(
125 target=self, strip_cdata=False, recover=True, encoding=encoding
126 )
60 return parser 127 return parser
61 128
62 def __init__(self, parser=None, empty_element_tags=None): 129 def __init__(self, parser=None, empty_element_tags=None, **kwargs):
63 # TODO: Issue a warning if parser is present but not a 130 # TODO: Issue a warning if parser is present but not a
64 # callable, since that means there's no way to create new 131 # callable, since that means there's no way to create new
65 # parsers for different encodings. 132 # parsers for different encodings.
@@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
67 if empty_element_tags is not None: 134 if empty_element_tags is not None:
68 self.empty_element_tags = set(empty_element_tags) 135 self.empty_element_tags = set(empty_element_tags)
69 self.soup = None 136 self.soup = None
70 self.nsmaps = [self.DEFAULT_NSMAPS] 137 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
71 138 self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
139 super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
140
72 def _getNsTag(self, tag): 141 def _getNsTag(self, tag):
73 # Split the namespace URL out of a fully-qualified lxml tag 142 # Split the namespace URL out of a fully-qualified lxml tag
74 # name. Copied from lxml's src/lxml/sax.py. 143 # name. Copied from lxml's src/lxml/sax.py.
@@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder):
80 def prepare_markup(self, markup, user_specified_encoding=None, 149 def prepare_markup(self, markup, user_specified_encoding=None,
81 exclude_encodings=None, 150 exclude_encodings=None,
82 document_declared_encoding=None): 151 document_declared_encoding=None):
83 """ 152 """Run any preliminary steps necessary to make incoming markup
84 :yield: A series of 4-tuples. 153 acceptable to the parser.
154
155 lxml really wants to get a bytestring and convert it to
156 Unicode itself. So instead of using UnicodeDammit to convert
157 the bytestring to Unicode using different encodings, this
158 implementation uses EncodingDetector to iterate over the
159 encodings, and tell lxml to try to parse the document as each
160 one in turn.
161
162 :param markup: Some markup -- hopefully a bytestring.
163 :param user_specified_encoding: The user asked to try this encoding.
164 :param document_declared_encoding: The markup itself claims to be
165 in this encoding.
166 :param exclude_encodings: The user asked _not_ to try any of
167 these encodings.
168
169 :yield: A series of 4-tuples:
85 (markup, encoding, declared encoding, 170 (markup, encoding, declared encoding,
86 has undergone character replacement) 171 has undergone character replacement)
87 172
88 Each 4-tuple represents a strategy for parsing the document. 173 Each 4-tuple represents a strategy for converting the
174 document to Unicode and parsing it. Each strategy will be tried
175 in turn.
89 """ 176 """
177 is_html = not self.is_xml
178 if is_html:
179 self.processing_instruction_class = ProcessingInstruction
180 # We're in HTML mode, so if we're given XML, that's worth
181 # noting.
182 DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
183 markup, stacklevel=3
184 )
185 else:
186 self.processing_instruction_class = XMLProcessingInstruction
187
90 if isinstance(markup, str): 188 if isinstance(markup, str):
91 # We were given Unicode. Maybe lxml can parse Unicode on 189 # We were given Unicode. Maybe lxml can parse Unicode on
92 # this system? 190 # this system?
191
192 # TODO: This is a workaround for
193 # https://bugs.launchpad.net/lxml/+bug/1948551.
194 # We can remove it once the upstream issue is fixed.
195 if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
196 markup = markup[1:]
93 yield markup, None, document_declared_encoding, False 197 yield markup, None, document_declared_encoding, False
94 198
95 if isinstance(markup, str): 199 if isinstance(markup, str):
@@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
98 yield (markup.encode("utf8"), "utf8", 202 yield (markup.encode("utf8"), "utf8",
99 document_declared_encoding, False) 203 document_declared_encoding, False)
100 204
101 # Instead of using UnicodeDammit to convert the bytestring to 205 # This was provided by the end-user; treat it as a known
102 # Unicode using different encodings, use EncodingDetector to 206 # definite encoding per the algorithm laid out in the HTML5
103 # iterate over the encodings, and tell lxml to try to parse 207 # spec. (See the EncodingDetector class for details.)
104 # the document as each one in turn. 208 known_definite_encodings = [user_specified_encoding]
105 is_html = not self.is_xml 209
106 try_encodings = [user_specified_encoding, document_declared_encoding] 210 # This was found in the document; treat it as a slightly lower-priority
211 # user encoding.
212 user_encodings = [document_declared_encoding]
107 detector = EncodingDetector( 213 detector = EncodingDetector(
108 markup, try_encodings, is_html, exclude_encodings) 214 markup, known_definite_encodings=known_definite_encodings,
215 user_encodings=user_encodings, is_html=is_html,
216 exclude_encodings=exclude_encodings
217 )
109 for encoding in detector.encodings: 218 for encoding in detector.encodings:
110 yield (detector.markup, encoding, document_declared_encoding, False) 219 yield (detector.markup, encoding, document_declared_encoding, False)
111 220
@@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder):
128 self.parser.feed(data) 237 self.parser.feed(data)
129 self.parser.close() 238 self.parser.close()
130 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 239 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
131 raise ParserRejectedMarkup(str(e)) 240 raise ParserRejectedMarkup(e)
132 241
133 def close(self): 242 def close(self):
134 self.nsmaps = [self.DEFAULT_NSMAPS] 243 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
135 244
136 def start(self, name, attrs, nsmap={}): 245 def start(self, name, attrs, nsmap={}):
137 # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 246 # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
138 attrs = dict(attrs) 247 attrs = dict(attrs)
139 nsprefix = None 248 nsprefix = None
140 # Invert each namespace map as it comes in. 249 # Invert each namespace map as it comes in.
141 if len(self.nsmaps) > 1: 250 if len(nsmap) == 0 and len(self.nsmaps) > 1:
142 # There are no new namespaces for this tag, but 251 # There are no new namespaces for this tag, but
143 # non-default namespaces are in play, so we need a 252 # non-default namespaces are in play, so we need a
144 # separate tag stack to know when they end. 253 # separate tag stack to know when they end.
145 self.nsmaps.append(None) 254 self.nsmaps.append(None)
146 elif len(nsmap) > 0: 255 elif len(nsmap) > 0:
147 # A new namespace mapping has come into play. 256 # A new namespace mapping has come into play.
148 inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 257
149 self.nsmaps.append(inverted_nsmap) 258 # First, Let the BeautifulSoup object know about it.
259 self._register_namespaces(nsmap)
260
261 # Then, add it to our running list of inverted namespace
262 # mappings.
263 self.nsmaps.append(_invert(nsmap))
264
265 # The currently active namespace prefixes have
266 # changed. Calculate the new mapping so it can be stored
267 # with all Tag objects created while these prefixes are in
268 # scope.
269 current_mapping = dict(self.active_namespace_prefixes[-1])
270 current_mapping.update(nsmap)
271
272 # We should not track un-prefixed namespaces as we can only hold one
273 # and it will be recognized as the default namespace by soupsieve,
274 # which may be confusing in some situations.
275 if '' in current_mapping:
276 del current_mapping['']
277 self.active_namespace_prefixes.append(current_mapping)
278
150 # Also treat the namespace mapping as a set of attributes on the 279 # Also treat the namespace mapping as a set of attributes on the
151 # tag, so we can recreate it later. 280 # tag, so we can recreate it later.
152 attrs = attrs.copy() 281 attrs = attrs.copy()
@@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
171 300
172 namespace, name = self._getNsTag(name) 301 namespace, name = self._getNsTag(name)
173 nsprefix = self._prefix_for_namespace(namespace) 302 nsprefix = self._prefix_for_namespace(namespace)
174 self.soup.handle_starttag(name, namespace, nsprefix, attrs) 303 self.soup.handle_starttag(
175 304 name, namespace, nsprefix, attrs,
305 namespaces=self.active_namespace_prefixes[-1]
306 )
307
176 def _prefix_for_namespace(self, namespace): 308 def _prefix_for_namespace(self, namespace):
177 """Find the currently active prefix for the given namespace.""" 309 """Find the currently active prefix for the given namespace."""
178 if namespace is None: 310 if namespace is None:
@@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
196 if len(self.nsmaps) > 1: 328 if len(self.nsmaps) > 1:
197 # This tag, or one of its parents, introduced a namespace 329 # This tag, or one of its parents, introduced a namespace
198 # mapping, so pop it off the stack. 330 # mapping, so pop it off the stack.
199 self.nsmaps.pop() 331 out_of_scope_nsmap = self.nsmaps.pop()
200 332
333 if out_of_scope_nsmap is not None:
334 # This tag introduced a namespace mapping which is no
335 # longer in scope. Recalculate the currently active
336 # namespace prefixes.
337 self.active_namespace_prefixes.pop()
338
201 def pi(self, target, data): 339 def pi(self, target, data):
202 self.soup.endData() 340 self.soup.endData()
203 self.soup.handle_data(target + ' ' + data) 341 data = target + ' ' + data
204 self.soup.endData(ProcessingInstruction) 342 self.soup.handle_data(data)
205 343 self.soup.endData(self.processing_instruction_class)
344
206 def data(self, content): 345 def data(self, content):
207 self.soup.handle_data(content) 346 self.soup.handle_data(content)
208 347
@@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
229 368
230 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 369 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
231 is_xml = False 370 is_xml = False
371 processing_instruction_class = ProcessingInstruction
232 372
233 def default_parser(self, encoding): 373 def default_parser(self, encoding):
234 return etree.HTMLParser 374 return etree.HTMLParser
@@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
240 self.parser.feed(markup) 380 self.parser.feed(markup)
241 self.parser.close() 381 self.parser.close()
242 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 382 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
243 raise ParserRejectedMarkup(str(e)) 383 raise ParserRejectedMarkup(e)
244 384
245 385
246 def test_fragment_to_document(self, fragment): 386 def test_fragment_to_document(self, fragment):