diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/__init__.py')
-rw-r--r-- | bitbake/lib/bs4/builder/__init__.py | 382 |
1 files changed, 347 insertions, 35 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py index 6ccd4d23d6..ffb31fc25e 100644 --- a/bitbake/lib/bs4/builder/__init__.py +++ b/bitbake/lib/bs4/builder/__init__.py | |||
@@ -1,11 +1,21 @@ | |||
1 | # Use of this source code is governed by the MIT license. | ||
2 | __license__ = "MIT" | ||
3 | |||
1 | from collections import defaultdict | 4 | from collections import defaultdict |
2 | import itertools | 5 | import itertools |
6 | import re | ||
7 | import warnings | ||
3 | import sys | 8 | import sys |
4 | from bs4.element import ( | 9 | from bs4.element import ( |
5 | CharsetMetaAttributeValue, | 10 | CharsetMetaAttributeValue, |
6 | ContentMetaAttributeValue, | 11 | ContentMetaAttributeValue, |
7 | whitespace_re | 12 | RubyParenthesisString, |
8 | ) | 13 | RubyTextString, |
14 | Stylesheet, | ||
15 | Script, | ||
16 | TemplateString, | ||
17 | nonwhitespace_re | ||
18 | ) | ||
9 | 19 | ||
10 | __all__ = [ | 20 | __all__ = [ |
11 | 'HTMLTreeBuilder', | 21 | 'HTMLTreeBuilder', |
@@ -22,20 +32,41 @@ XML = 'xml' | |||
22 | HTML = 'html' | 32 | HTML = 'html' |
23 | HTML_5 = 'html5' | 33 | HTML_5 = 'html5' |
24 | 34 | ||
class XMLParsedAsHTMLWarning(UserWarning):
    """The warning issued when an HTML parser is used to parse
    XML that is not XHTML.
    """
    # Human-readable text of the warning. It explains that the document
    # looks like XML and tells the user how to request an XML parser
    # (features="xml") instead.
    MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
40 | |||
25 | 41 | ||
26 | class TreeBuilderRegistry(object): | 42 | class TreeBuilderRegistry(object): |
27 | 43 | """A way of looking up TreeBuilder subclasses by their name or by desired | |
44 | features. | ||
45 | """ | ||
46 | |||
    def __init__(self):
        """Create a registry with no tree builders registered."""
        # Maps a feature name to the list of builder classes that
        # advertise that feature.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, regardless of its features.
        self.builders = []
31 | 50 | ||
    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: A subclass of TreeBuilder. Its .features
           attribute should list its features.
        """
        for feature in treebuilder_class.features:
            # Insert at the front so the most recently registered
            # builder comes first in each per-feature list.
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        # Same ordering rule for the feature-independent list.
        self.builders.insert(0, treebuilder_class)
37 | 60 | ||
38 | def lookup(self, *features): | 61 | def lookup(self, *features): |
62 | """Look up a TreeBuilder subclass with the desired features. | ||
63 | |||
64 | :param features: A list of features to look for. If none are | ||
65 | provided, the most recently registered TreeBuilder subclass | ||
66 | will be used. | ||
67 | :return: A TreeBuilder subclass, or None if there's no | ||
68 | registered subclass with all the requested features. | ||
69 | """ | ||
39 | if len(self.builders) == 0: | 70 | if len(self.builders) == 0: |
40 | # There are no builders at all. | 71 | # There are no builders at all. |
41 | return None | 72 | return None |
@@ -78,7 +109,7 @@ class TreeBuilderRegistry(object): | |||
78 | builder_registry = TreeBuilderRegistry() | 109 | builder_registry = TreeBuilderRegistry() |
79 | 110 | ||
80 | class TreeBuilder(object): | 111 | class TreeBuilder(object): |
81 | """Turn a document into a Beautiful Soup object tree.""" | 112 | """Turn a textual document into a Beautiful Soup object tree.""" |
82 | 113 | ||
83 | NAME = "[Unknown tree builder]" | 114 | NAME = "[Unknown tree builder]" |
84 | ALTERNATE_NAMES = [] | 115 | ALTERNATE_NAMES = [] |
@@ -86,19 +117,89 @@ class TreeBuilder(object): | |||
86 | 117 | ||
87 | is_xml = False | 118 | is_xml = False |
88 | picklable = False | 119 | picklable = False |
89 | preserve_whitespace_tags = set() | ||
90 | empty_element_tags = None # A tag will be considered an empty-element | 120 | empty_element_tags = None # A tag will be considered an empty-element |
91 | # tag when and only when it has no contents. | 121 | # tag when and only when it has no contents. |
92 | 122 | ||
93 | # A value for these tag/attribute combinations is a space- or | 123 | # A value for these tag/attribute combinations is a space- or |
94 | # comma-separated list of CDATA, rather than a single CDATA. | 124 | # comma-separated list of CDATA, rather than a single CDATA. |
95 | cdata_list_attributes = {} | 125 | DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list) |
96 | 126 | ||
97 | 127 | # Whitespace should be preserved inside these tags. | |
98 | def __init__(self): | 128 | DEFAULT_PRESERVE_WHITESPACE_TAGS = set() |
129 | |||
130 | # The textual contents of tags with these names should be | ||
131 | # instantiated with some class other than NavigableString. | ||
132 | DEFAULT_STRING_CONTAINERS = {} | ||
133 | |||
134 | USE_DEFAULT = object() | ||
135 | |||
136 | # Most parsers don't keep track of line numbers. | ||
137 | TRACKS_LINE_NUMBERS = False | ||
138 | |||
139 | def __init__(self, multi_valued_attributes=USE_DEFAULT, | ||
140 | preserve_whitespace_tags=USE_DEFAULT, | ||
141 | store_line_numbers=USE_DEFAULT, | ||
142 | string_containers=USE_DEFAULT, | ||
143 | ): | ||
144 | """Constructor. | ||
145 | |||
146 | :param multi_valued_attributes: If this is set to None, the | ||
147 | TreeBuilder will not turn any values for attributes like | ||
148 | 'class' into lists. Setting this to a dictionary will | ||
149 | customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES | ||
150 | for an example. | ||
151 | |||
152 | Internally, these are called "CDATA list attributes", but that | ||
153 | probably doesn't make sense to an end-user, so the argument name | ||
154 | is `multi_valued_attributes`. | ||
155 | |||
156 | :param preserve_whitespace_tags: A list of tags to treat | ||
157 | the way <pre> tags are treated in HTML. Tags in this list | ||
158 | are immune from pretty-printing; their contents will always be | ||
159 | output as-is. | ||
160 | |||
161 | :param string_containers: A dictionary mapping tag names to | ||
162 | the classes that should be instantiated to contain the textual | ||
163 | contents of those tags. The default is to use NavigableString | ||
164 | for every tag, no matter what the name. You can override the | ||
165 | default by changing DEFAULT_STRING_CONTAINERS. | ||
166 | |||
167 | :param store_line_numbers: If the parser keeps track of the | ||
168 | line numbers and positions of the original markup, that | ||
169 | information will, by default, be stored in each corresponding | ||
170 | `Tag` object. You can turn this off by passing | ||
171 | store_line_numbers=False. If the parser you're using doesn't | ||
172 | keep track of this information, then setting store_line_numbers=True | ||
173 | will do nothing. | ||
174 | """ | ||
99 | self.soup = None | 175 | self.soup = None |
100 | 176 | if multi_valued_attributes is self.USE_DEFAULT: | |
177 | multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES | ||
178 | self.cdata_list_attributes = multi_valued_attributes | ||
179 | if preserve_whitespace_tags is self.USE_DEFAULT: | ||
180 | preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS | ||
181 | self.preserve_whitespace_tags = preserve_whitespace_tags | ||
182 | if store_line_numbers == self.USE_DEFAULT: | ||
183 | store_line_numbers = self.TRACKS_LINE_NUMBERS | ||
184 | self.store_line_numbers = store_line_numbers | ||
185 | if string_containers == self.USE_DEFAULT: | ||
186 | string_containers = self.DEFAULT_STRING_CONTAINERS | ||
187 | self.string_containers = string_containers | ||
188 | |||
    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        # Keep a reference to the soup on this builder.
        self.soup = soup
196 | |||
    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        # Intentionally a no-op in the base class.
        pass
103 | 204 | ||
104 | def can_be_empty_element(self, tag_name): | 205 | def can_be_empty_element(self, tag_name): |
@@ -110,24 +211,58 @@ class TreeBuilder(object): | |||
110 | For instance: an HTMLBuilder does not consider a <p> tag to be | 211 | For instance: an HTMLBuilder does not consider a <p> tag to be |
111 | an empty-element tag (it's not in | 212 | an empty-element tag (it's not in |
112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag | 213 | HTMLBuilder.empty_element_tags). This means an empty <p> tag |
113 | will be presented as "<p></p>", not "<p />". | 214 | will be presented as "<p></p>", not "<p/>" or "<p>". |
114 | 215 | ||
115 | The default implementation has no opinion about which tags are | 216 | The default implementation has no opinion about which tags are |
116 | empty-element tags, so a tag will be presented as an | 217 | empty-element tags, so a tag will be presented as an |
117 | empty-element tag if and only if it has no contents. | 218 | empty-element tag if and only if it has no children. |
118 | "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will | 219 | "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will |
119 | be left alone. | 220 | be left alone. |
221 | |||
222 | :param tag_name: The name of a markup tag. | ||
120 | """ | 223 | """ |
121 | if self.empty_element_tags is None: | 224 | if self.empty_element_tags is None: |
122 | return True | 225 | return True |
123 | return tag_name in self.empty_element_tags | 226 | return tag_name in self.empty_element_tags |
124 | 227 | ||
    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :param markup: The markup to parse.
        :return: None.
        :raise NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
127 | 238 | ||
    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        # This is a generator: it yields exactly one strategy, the
        # unmodified markup with no encoding information. None of the
        # encoding-related arguments are consulted here.
        yield markup, None, None, False
131 | 266 | ||
132 | def test_fragment_to_document(self, fragment): | 267 | def test_fragment_to_document(self, fragment): |
133 | """Wrap an HTML fragment to make it look like a document. | 268 | """Wrap an HTML fragment to make it look like a document. |
@@ -139,16 +274,36 @@ class TreeBuilder(object): | |||
139 | results against other HTML fragments. | 274 | results against other HTML fragments. |
140 | 275 | ||
141 | This method should not be used outside of tests. | 276 | This method should not be used outside of tests. |
277 | |||
278 | :param fragment: A string -- fragment of HTML. | ||
279 | :return: A string -- a full HTML document. | ||
142 | """ | 280 | """ |
143 | return fragment | 281 | return fragment |
144 | 282 | ||
    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        # The base implementation never performs a substitution.
        return False
147 | 294 | ||
148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): | 295 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): |
149 | """Replaces class="foo bar" with class=["foo", "bar"] | 296 | """When an attribute value is associated with a tag that can |
297 | have multiple values for that attribute, convert the string | ||
298 | value to a list of strings. | ||
150 | 299 | ||
151 | Modifies its input in place. | 300 | Basically, replaces class="foo bar" with class=["foo", "bar"] |
301 | |||
302 | NOTE: This method modifies its input in place. | ||
303 | |||
304 | :param tag_name: The name of a tag. | ||
305 | :param attrs: A dictionary containing the tag's attributes. | ||
306 | Any appropriate attribute values will be modified in place. | ||
152 | """ | 307 | """ |
153 | if not attrs: | 308 | if not attrs: |
154 | return attrs | 309 | return attrs |
@@ -163,7 +318,7 @@ class TreeBuilder(object): | |||
163 | # values. Split it into a list. | 318 | # values. Split it into a list. |
164 | value = attrs[attr] | 319 | value = attrs[attr] |
165 | if isinstance(value, str): | 320 | if isinstance(value, str): |
166 | values = whitespace_re.split(value) | 321 | values = nonwhitespace_re.findall(value) |
167 | else: | 322 | else: |
168 | # html5lib sometimes calls setAttributes twice | 323 | # html5lib sometimes calls setAttributes twice |
169 | # for the same tag when rearranging the parse | 324 | # for the same tag when rearranging the parse |
@@ -174,9 +329,13 @@ class TreeBuilder(object): | |||
174 | values = value | 329 | values = value |
175 | attrs[attr] = values | 330 | attrs[attr] = values |
176 | return attrs | 331 | return attrs |
177 | 332 | ||
178 | class SAXTreeBuilder(TreeBuilder): | 333 | class SAXTreeBuilder(TreeBuilder): |
179 | """A Beautiful Soup treebuilder that listens for SAX events.""" | 334 | """A Beautiful Soup treebuilder that listens for SAX events. |
335 | |||
336 | This is not currently used for anything, but it demonstrates | ||
337 | how a simple TreeBuilder would work. | ||
338 | """ | ||
180 | 339 | ||
181 | def feed(self, markup): | 340 | def feed(self, markup): |
182 | raise NotImplementedError() | 341 | raise NotImplementedError() |
@@ -186,11 +345,11 @@ class SAXTreeBuilder(TreeBuilder): | |||
186 | 345 | ||
187 | def startElement(self, name, attrs): | 346 | def startElement(self, name, attrs): |
188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) | 347 | attrs = dict((key[1], value) for key, value in list(attrs.items())) |
189 | #print "Start %s, %r" % (name, attrs) | 348 | #print("Start %s, %r" % (name, attrs)) |
190 | self.soup.handle_starttag(name, attrs) | 349 | self.soup.handle_starttag(name, attrs) |
191 | 350 | ||
192 | def endElement(self, name): | 351 | def endElement(self, name): |
193 | #print "End %s" % name | 352 | #print("End %s" % name) |
194 | self.soup.handle_endtag(name) | 353 | self.soup.handle_endtag(name) |
195 | 354 | ||
196 | def startElementNS(self, nsTuple, nodeName, attrs): | 355 | def startElementNS(self, nsTuple, nodeName, attrs): |
@@ -227,10 +386,44 @@ class HTMLTreeBuilder(TreeBuilder): | |||
227 | Such as which tags are empty-element tags. | 386 | Such as which tags are empty-element tags. |
228 | """ | 387 | """ |
229 | 388 | ||
230 | preserve_whitespace_tags = set(['pre', 'textarea']) | 389 | empty_element_tags = set([ |
231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', | 390 | # These are from HTML5. |
232 | 'spacer', 'link', 'frame', 'base']) | 391 | 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', |
233 | 392 | ||
393 | # These are from earlier versions of HTML and are removed in HTML5. | ||
394 | 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' | ||
395 | ]) | ||
396 | |||
397 | # The HTML standard defines these as block-level elements. Beautiful | ||
398 | # Soup does not treat these elements differently from other elements, | ||
399 | # but it may do so eventually, and this information is available if | ||
400 | # you need to use it. | ||
401 | block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) | ||
402 | |||
403 | # These HTML tags need special treatment so they can be | ||
404 | # represented by a string class other than NavigableString. | ||
405 | # | ||
406 | # For some of these tags, it's because the HTML standard defines | ||
407 | # an unusual content model for them. I made this list by going | ||
408 | # through the HTML spec | ||
409 | # (https://html.spec.whatwg.org/#metadata-content) and looking for | ||
410 | # "metadata content" elements that can contain strings. | ||
411 | # | ||
412 | # The Ruby tags (<rt> and <rp>) are here despite being normal | ||
413 | # "phrasing content" tags, because the content they contain is | ||
414 | # qualitatively different from other text in the document, and it | ||
415 | # can be useful to be able to distinguish it. | ||
416 | # | ||
417 | # TODO: Arguably <noscript> could go here but it seems | ||
418 | # qualitatively different from the other tags. | ||
419 | DEFAULT_STRING_CONTAINERS = { | ||
420 | 'rt' : RubyTextString, | ||
421 | 'rp' : RubyParenthesisString, | ||
422 | 'style': Stylesheet, | ||
423 | 'script': Script, | ||
424 | 'template': TemplateString, | ||
425 | } | ||
426 | |||
234 | # The HTML standard defines these attributes as containing a | 427 | # The HTML standard defines these attributes as containing a |
235 | # space-separated list of values, not a single value. That is, | 428 | # space-separated list of values, not a single value. That is, |
236 | # class="foo bar" means that the 'class' attribute has two values, | 429 | # class="foo bar" means that the 'class' attribute has two values, |
@@ -238,7 +431,7 @@ class HTMLTreeBuilder(TreeBuilder): | |||
238 | # encounter one of these attributes, we will parse its value into | 431 | # encounter one of these attributes, we will parse its value into |
239 | # a list of values if possible. Upon output, the list will be | 432 | # a list of values if possible. Upon output, the list will be |
240 | # converted back into a string. | 433 | # converted back into a string. |
241 | cdata_list_attributes = { | 434 | DEFAULT_CDATA_LIST_ATTRIBUTES = { |
242 | "*" : ['class', 'accesskey', 'dropzone'], | 435 | "*" : ['class', 'accesskey', 'dropzone'], |
243 | "a" : ['rel', 'rev'], | 436 | "a" : ['rel', 'rev'], |
244 | "link" : ['rel', 'rev'], | 437 | "link" : ['rel', 'rev'], |
@@ -255,7 +448,19 @@ class HTMLTreeBuilder(TreeBuilder): | |||
255 | "output" : ["for"], | 448 | "output" : ["for"], |
256 | } | 449 | } |
257 | 450 | ||
451 | DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) | ||
452 | |||
258 | def set_up_substitutions(self, tag): | 453 | def set_up_substitutions(self, tag): |
454 | """Replace the declared encoding in a <meta> tag with a placeholder, | ||
455 | to be substituted when the tag is output to a string. | ||
456 | |||
457 | An HTML document may come in to Beautiful Soup as one | ||
458 | encoding, but exit in a different encoding, and the <meta> tag | ||
459 | needs to be changed to reflect this. | ||
460 | |||
461 | :param tag: A `Tag` | ||
462 | :return: Whether or not a substitution was performed. | ||
463 | """ | ||
259 | # We are only interested in <meta> tags | 464 | # We are only interested in <meta> tags |
260 | if tag.name != 'meta': | 465 | if tag.name != 'meta': |
261 | return False | 466 | return False |
@@ -288,10 +493,107 @@ class HTMLTreeBuilder(TreeBuilder): | |||
288 | 493 | ||
289 | return (meta_encoding is not None) | 494 | return (meta_encoding is not None) |
290 | 495 | ||
class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    TreeBuilder, there's a less reliable implementation based on
    examining the raw markup.
    """

    # Regular expression for seeing if markup has an <html> tag.
    #
    # The character between "<" and "html" is optional, so that a plain
    # "<html" tag matches as well as one-character-prefixed forms. If it
    # were mandatory, "<html" itself would never match and XHTML
    # documents that start with an XML declaration would be reported
    # as non-XHTML XML.
    LOOKS_LIKE_HTML = re.compile("<[^ +]?html", re.I)
    LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]?html", re.I)

    # What an XML declaration looks like at the very start of the markup.
    XML_PREFIX = '<?xml'
    XML_PREFIX_B = b'<?xml'

    @classmethod
    def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param markup: The raw markup, as a string or a bytestring.
            None is accepted and never triggers the warning.

        :param stacklevel: The stacklevel of the code calling this
            function.

        :return: True if the markup looks like non-XHTML XML, False
            otherwise.

        """
        # Pick the str or bytes flavor of the constants so they can be
        # compared against the markup without any decoding.
        if isinstance(markup, bytes):
            prefix = cls.XML_PREFIX_B
            looks_like_html = cls.LOOKS_LIKE_HTML_B
        else:
            prefix = cls.XML_PREFIX
            looks_like_html = cls.LOOKS_LIKE_HTML

        # Only the first 500 characters are searched for an <html> tag.
        if (markup is not None
            and markup.startswith(prefix)
            and not looks_like_html.search(markup[:500])
        ):
            # Two more frames (this method and _warn) sit between the
            # caller and warnings.warn.
            cls._warn(stacklevel=stacklevel+2)
            return True
        return False

    @classmethod
    def _warn(cls, stacklevel=5):
        """Issue a warning about XML being parsed as HTML.

        :param stacklevel: Passed through to warnings.warn.
        """
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
            stacklevel=stacklevel
        )

    def _initialize_xml_detector(self):
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag = None

    def _document_might_be_xml(self, processing_instruction):
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        :param processing_instruction: The text of the processing
            instruction. Only the first one seen before the root tag
            is remembered.
        """
        if (self._first_processing_instruction is not None
            or self._root_tag is not None):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name):
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.

        :param name: The name of the root tag.
        """
        if self._root_tag is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag = name
        if (name != 'html' and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith('xml ')):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that a
            # non-XHTML XML document is being parsed as HTML.
            self._warn()
593 | |||
291 | def register_treebuilders_from(module): | 594 | def register_treebuilders_from(module): |
292 | """Copy TreeBuilders from the given module into this module.""" | 595 | """Copy TreeBuilders from the given module into this module.""" |
293 | # I'm fairly sure this is not the best way to do this. | 596 | this_module = sys.modules[__name__] |
294 | this_module = sys.modules['bs4.builder'] | ||
295 | for name in module.__all__: | 597 | for name in module.__all__: |
296 | obj = getattr(module, name) | 598 | obj = getattr(module, name) |
297 | 599 | ||
@@ -302,12 +604,22 @@ def register_treebuilders_from(module): | |||
302 | this_module.builder_registry.register(obj) | 604 | this_module.builder_registry.register(obj) |
303 | 605 | ||
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser flatly declines to handle
    the markup it was given.
    """
    def __init__(self, message_or_exception):
        """Record the reason for the rejection.

        :param message_or_exception: Either a textual explanation or
            another exception. An exception is turned into the text
            "<ExceptionClassName>: <str(exception)>".
        """
        reason = (
            "%s: %s" % (message_or_exception.__class__.__name__,
                        str(message_or_exception))
            if isinstance(message_or_exception, Exception)
            else message_or_exception
        )
        super(ParserRejectedMarkup, self).__init__(reason)
618 | |||
307 | # Builders are registered in reverse order of priority, so that custom | 619 | # Builders are registered in reverse order of priority, so that custom |
308 | # builder registrations will take precedence. In general, we want lxml | 620 | # builder registrations will take precedence. In general, we want lxml |
309 | # to take precedence over html5lib, because it's faster. And we only | 621 | # to take precedence over html5lib, because it's faster. And we only |
310 | # want to use HTMLParser as a last result. | 622 | # want to use HTMLParser as a last resort. |
311 | from . import _htmlparser | 623 | from . import _htmlparser |
312 | register_treebuilders_from(_htmlparser) | 624 | register_treebuilders_from(_htmlparser) |
313 | try: | 625 | try: |