diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/_lxml.py')
-rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 212 |
1 files changed, 176 insertions, 36 deletions
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index 9c6c14ee65..4f7cf74681 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
@@ -1,19 +1,28 @@ | |||
1 | # Use of this source code is governed by the MIT license. | ||
2 | __license__ = "MIT" | ||
3 | |||
1 | __all__ = [ | 4 | __all__ = [ |
2 | 'LXMLTreeBuilderForXML', | 5 | 'LXMLTreeBuilderForXML', |
3 | 'LXMLTreeBuilder', | 6 | 'LXMLTreeBuilder', |
4 | ] | 7 | ] |
5 | 8 | ||
9 | try: | ||
10 | from collections.abc import Callable # Python 3.3+ | ||
11 | except ImportError as e: | ||
12 | from collections import Callable | ||
13 | |||
6 | from io import BytesIO | 14 | from io import BytesIO |
7 | from io import StringIO | 15 | from io import StringIO |
8 | import collections | ||
9 | from lxml import etree | 16 | from lxml import etree |
10 | from bs4.element import ( | 17 | from bs4.element import ( |
11 | Comment, | 18 | Comment, |
12 | Doctype, | 19 | Doctype, |
13 | NamespacedAttribute, | 20 | NamespacedAttribute, |
14 | ProcessingInstruction, | 21 | ProcessingInstruction, |
22 | XMLProcessingInstruction, | ||
15 | ) | 23 | ) |
16 | from bs4.builder import ( | 24 | from bs4.builder import ( |
25 | DetectsXMLParsedAsHTML, | ||
17 | FAST, | 26 | FAST, |
18 | HTML, | 27 | HTML, |
19 | HTMLTreeBuilder, | 28 | HTMLTreeBuilder, |
@@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector | |||
25 | 34 | ||
26 | LXML = 'lxml' | 35 | LXML = 'lxml' |
27 | 36 | ||
37 | def _invert(d): | ||
38 | "Invert a dictionary." | ||
39 | return dict((v,k) for k, v in list(d.items())) | ||
40 | |||
28 | class LXMLTreeBuilderForXML(TreeBuilder): | 41 | class LXMLTreeBuilderForXML(TreeBuilder): |
29 | DEFAULT_PARSER_CLASS = etree.XMLParser | 42 | DEFAULT_PARSER_CLASS = etree.XMLParser |
30 | 43 | ||
31 | is_xml = True | 44 | is_xml = True |
45 | processing_instruction_class = XMLProcessingInstruction | ||
32 | 46 | ||
33 | NAME = "lxml-xml" | 47 | NAME = "lxml-xml" |
34 | ALTERNATE_NAMES = ["xml"] | 48 | ALTERNATE_NAMES = ["xml"] |
@@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
40 | 54 | ||
41 | # This namespace mapping is specified in the XML Namespace | 55 | # This namespace mapping is specified in the XML Namespace |
42 | # standard. | 56 | # standard. |
43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} | 57 | DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') |
58 | |||
59 | DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) | ||
60 | |||
61 | # NOTE: If we parsed Element objects and looked at .sourceline, | ||
62 | # we'd be able to see the line numbers from the original document. | ||
63 | # But instead we build an XMLParser or HTMLParser object to serve | ||
64 | # as the target of parse messages, and those messages don't include | ||
65 | # line numbers. | ||
66 | # See: https://bugs.launchpad.net/lxml/+bug/1846906 | ||
67 | |||
68 | def initialize_soup(self, soup): | ||
69 | """Let the BeautifulSoup object know about the standard namespace | ||
70 | mapping. | ||
71 | |||
72 | :param soup: A `BeautifulSoup`. | ||
73 | """ | ||
74 | super(LXMLTreeBuilderForXML, self).initialize_soup(soup) | ||
75 | self._register_namespaces(self.DEFAULT_NSMAPS) | ||
76 | |||
77 | def _register_namespaces(self, mapping): | ||
78 | """Let the BeautifulSoup object know about namespaces encountered | ||
79 | while parsing the document. | ||
80 | |||
81 | This might be useful later on when creating CSS selectors. | ||
82 | |||
83 | This will track (almost) all namespaces, even ones that were | ||
84 | only in scope for part of the document. If two namespaces have | ||
85 | the same prefix, only the first one encountered will be | ||
86 | tracked. Un-prefixed namespaces are not tracked. | ||
44 | 87 | ||
88 | :param mapping: A dictionary mapping namespace prefixes to URIs. | ||
89 | """ | ||
90 | for key, value in list(mapping.items()): | ||
91 | # This is 'if key' and not 'if key is not None' because we | ||
92 | # don't track un-prefixed namespaces. Soupselect will | ||
93 | # treat an un-prefixed namespace as the default, which | ||
94 | # causes confusion in some cases. | ||
95 | if key and key not in self.soup._namespaces: | ||
96 | # Let the BeautifulSoup object know about a new namespace. | ||
97 | # If there are multiple namespaces defined with the same | ||
98 | # prefix, the first one in the document takes precedence. | ||
99 | self.soup._namespaces[key] = value | ||
100 | |||
45 | def default_parser(self, encoding): | 101 | def default_parser(self, encoding): |
46 | # This can either return a parser object or a class, which | 102 | """Find the default parser for the given encoding. |
47 | # will be instantiated with default arguments. | 103 | |
104 | :param encoding: A string. | ||
105 | :return: Either a parser object or a class, which | ||
106 | will be instantiated with default arguments. | ||
107 | """ | ||
48 | if self._default_parser is not None: | 108 | if self._default_parser is not None: |
49 | return self._default_parser | 109 | return self._default_parser |
50 | return etree.XMLParser( | 110 | return etree.XMLParser( |
51 | target=self, strip_cdata=False, recover=True, encoding=encoding) | 111 | target=self, strip_cdata=False, recover=True, encoding=encoding) |
52 | 112 | ||
53 | def parser_for(self, encoding): | 113 | def parser_for(self, encoding): |
114 | """Instantiate an appropriate parser for the given encoding. | ||
115 | |||
116 | :param encoding: A string. | ||
117 | :return: A parser object such as an `etree.XMLParser`. | ||
118 | """ | ||
54 | # Use the default parser. | 119 | # Use the default parser. |
55 | parser = self.default_parser(encoding) | 120 | parser = self.default_parser(encoding) |
56 | 121 | ||
57 | if isinstance(parser, collections.Callable): | 122 | if isinstance(parser, Callable): |
58 | # Instantiate the parser with default arguments | 123 | # Instantiate the parser with default arguments |
59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) | 124 | parser = parser( |
125 | target=self, strip_cdata=False, recover=True, encoding=encoding | ||
126 | ) | ||
60 | return parser | 127 | return parser |
61 | 128 | ||
62 | def __init__(self, parser=None, empty_element_tags=None): | 129 | def __init__(self, parser=None, empty_element_tags=None, **kwargs): |
63 | # TODO: Issue a warning if parser is present but not a | 130 | # TODO: Issue a warning if parser is present but not a |
64 | # callable, since that means there's no way to create new | 131 | # callable, since that means there's no way to create new |
65 | # parsers for different encodings. | 132 | # parsers for different encodings. |
@@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
67 | if empty_element_tags is not None: | 134 | if empty_element_tags is not None: |
68 | self.empty_element_tags = set(empty_element_tags) | 135 | self.empty_element_tags = set(empty_element_tags) |
69 | self.soup = None | 136 | self.soup = None |
70 | self.nsmaps = [self.DEFAULT_NSMAPS] | 137 | self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] |
71 | 138 | self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)] | |
139 | super(LXMLTreeBuilderForXML, self).__init__(**kwargs) | ||
140 | |||
72 | def _getNsTag(self, tag): | 141 | def _getNsTag(self, tag): |
73 | # Split the namespace URL out of a fully-qualified lxml tag | 142 | # Split the namespace URL out of a fully-qualified lxml tag |
74 | # name. Copied from lxml's src/lxml/sax.py. | 143 | # name. Copied from lxml's src/lxml/sax.py. |
@@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
80 | def prepare_markup(self, markup, user_specified_encoding=None, | 149 | def prepare_markup(self, markup, user_specified_encoding=None, |
81 | exclude_encodings=None, | 150 | exclude_encodings=None, |
82 | document_declared_encoding=None): | 151 | document_declared_encoding=None): |
83 | """ | 152 | """Run any preliminary steps necessary to make incoming markup |
84 | :yield: A series of 4-tuples. | 153 | acceptable to the parser. |
154 | |||
155 | lxml really wants to get a bytestring and convert it to | ||
156 | Unicode itself. So instead of using UnicodeDammit to convert | ||
157 | the bytestring to Unicode using different encodings, this | ||
158 | implementation uses EncodingDetector to iterate over the | ||
159 | encodings, and tell lxml to try to parse the document as each | ||
160 | one in turn. | ||
161 | |||
162 | :param markup: Some markup -- hopefully a bytestring. | ||
163 | :param user_specified_encoding: The user asked to try this encoding. | ||
164 | :param document_declared_encoding: The markup itself claims to be | ||
165 | in this encoding. | ||
166 | :param exclude_encodings: The user asked _not_ to try any of | ||
167 | these encodings. | ||
168 | |||
169 | :yield: A series of 4-tuples: | ||
85 | (markup, encoding, declared encoding, | 170 | (markup, encoding, declared encoding, |
86 | has undergone character replacement) | 171 | has undergone character replacement) |
87 | 172 | ||
88 | Each 4-tuple represents a strategy for parsing the document. | 173 | Each 4-tuple represents a strategy for converting the |
174 | document to Unicode and parsing it. Each strategy will be tried | ||
175 | in turn. | ||
89 | """ | 176 | """ |
177 | is_html = not self.is_xml | ||
178 | if is_html: | ||
179 | self.processing_instruction_class = ProcessingInstruction | ||
180 | # We're in HTML mode, so if we're given XML, that's worth | ||
181 | # noting. | ||
182 | DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml( | ||
183 | markup, stacklevel=3 | ||
184 | ) | ||
185 | else: | ||
186 | self.processing_instruction_class = XMLProcessingInstruction | ||
187 | |||
90 | if isinstance(markup, str): | 188 | if isinstance(markup, str): |
91 | # We were given Unicode. Maybe lxml can parse Unicode on | 189 | # We were given Unicode. Maybe lxml can parse Unicode on |
92 | # this system? | 190 | # this system? |
191 | |||
192 | # TODO: This is a workaround for | ||
193 | # https://bugs.launchpad.net/lxml/+bug/1948551. | ||
194 | # We can remove it once the upstream issue is fixed. | ||
195 | if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}': | ||
196 | markup = markup[1:] | ||
93 | yield markup, None, document_declared_encoding, False | 197 | yield markup, None, document_declared_encoding, False |
94 | 198 | ||
95 | if isinstance(markup, str): | 199 | if isinstance(markup, str): |
@@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
98 | yield (markup.encode("utf8"), "utf8", | 202 | yield (markup.encode("utf8"), "utf8", |
99 | document_declared_encoding, False) | 203 | document_declared_encoding, False) |
100 | 204 | ||
101 | # Instead of using UnicodeDammit to convert the bytestring to | 205 | # This was provided by the end-user; treat it as a known |
102 | # Unicode using different encodings, use EncodingDetector to | 206 | # definite encoding per the algorithm laid out in the HTML5 |
103 | # iterate over the encodings, and tell lxml to try to parse | 207 | # spec. (See the EncodingDetector class for details.) |
104 | # the document as each one in turn. | 208 | known_definite_encodings = [user_specified_encoding] |
105 | is_html = not self.is_xml | 209 | |
106 | try_encodings = [user_specified_encoding, document_declared_encoding] | 210 | # This was found in the document; treat it as a slightly lower-priority |
211 | # user encoding. | ||
212 | user_encodings = [document_declared_encoding] | ||
107 | detector = EncodingDetector( | 213 | detector = EncodingDetector( |
108 | markup, try_encodings, is_html, exclude_encodings) | 214 | markup, known_definite_encodings=known_definite_encodings, |
215 | user_encodings=user_encodings, is_html=is_html, | ||
216 | exclude_encodings=exclude_encodings | ||
217 | ) | ||
109 | for encoding in detector.encodings: | 218 | for encoding in detector.encodings: |
110 | yield (detector.markup, encoding, document_declared_encoding, False) | 219 | yield (detector.markup, encoding, document_declared_encoding, False) |
111 | 220 | ||
@@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
128 | self.parser.feed(data) | 237 | self.parser.feed(data) |
129 | self.parser.close() | 238 | self.parser.close() |
130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: | 239 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
131 | raise ParserRejectedMarkup(str(e)) | 240 | raise ParserRejectedMarkup(e) |
132 | 241 | ||
133 | def close(self): | 242 | def close(self): |
134 | self.nsmaps = [self.DEFAULT_NSMAPS] | 243 | self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] |
135 | 244 | ||
136 | def start(self, name, attrs, nsmap={}): | 245 | def start(self, name, attrs, nsmap={}): |
137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. | 246 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. |
138 | attrs = dict(attrs) | 247 | attrs = dict(attrs) |
139 | nsprefix = None | 248 | nsprefix = None |
140 | # Invert each namespace map as it comes in. | 249 | # Invert each namespace map as it comes in. |
141 | if len(self.nsmaps) > 1: | 250 | if len(nsmap) == 0 and len(self.nsmaps) > 1: |
142 | # There are no new namespaces for this tag, but | 251 | # There are no new namespaces for this tag, but |
143 | # non-default namespaces are in play, so we need a | 252 | # non-default namespaces are in play, so we need a |
144 | # separate tag stack to know when they end. | 253 | # separate tag stack to know when they end. |
145 | self.nsmaps.append(None) | 254 | self.nsmaps.append(None) |
146 | elif len(nsmap) > 0: | 255 | elif len(nsmap) > 0: |
147 | # A new namespace mapping has come into play. | 256 | # A new namespace mapping has come into play. |
148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) | 257 | |
149 | self.nsmaps.append(inverted_nsmap) | 258 | # First, Let the BeautifulSoup object know about it. |
259 | self._register_namespaces(nsmap) | ||
260 | |||
261 | # Then, add it to our running list of inverted namespace | ||
262 | # mappings. | ||
263 | self.nsmaps.append(_invert(nsmap)) | ||
264 | |||
265 | # The currently active namespace prefixes have | ||
266 | # changed. Calculate the new mapping so it can be stored | ||
267 | # with all Tag objects created while these prefixes are in | ||
268 | # scope. | ||
269 | current_mapping = dict(self.active_namespace_prefixes[-1]) | ||
270 | current_mapping.update(nsmap) | ||
271 | |||
272 | # We should not track un-prefixed namespaces as we can only hold one | ||
273 | # and it will be recognized as the default namespace by soupsieve, | ||
274 | # which may be confusing in some situations. | ||
275 | if '' in current_mapping: | ||
276 | del current_mapping[''] | ||
277 | self.active_namespace_prefixes.append(current_mapping) | ||
278 | |||
150 | # Also treat the namespace mapping as a set of attributes on the | 279 | # Also treat the namespace mapping as a set of attributes on the |
151 | # tag, so we can recreate it later. | 280 | # tag, so we can recreate it later. |
152 | attrs = attrs.copy() | 281 | attrs = attrs.copy() |
@@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
171 | 300 | ||
172 | namespace, name = self._getNsTag(name) | 301 | namespace, name = self._getNsTag(name) |
173 | nsprefix = self._prefix_for_namespace(namespace) | 302 | nsprefix = self._prefix_for_namespace(namespace) |
174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) | 303 | self.soup.handle_starttag( |
175 | 304 | name, namespace, nsprefix, attrs, | |
305 | namespaces=self.active_namespace_prefixes[-1] | ||
306 | ) | ||
307 | |||
176 | def _prefix_for_namespace(self, namespace): | 308 | def _prefix_for_namespace(self, namespace): |
177 | """Find the currently active prefix for the given namespace.""" | 309 | """Find the currently active prefix for the given namespace.""" |
178 | if namespace is None: | 310 | if namespace is None: |
@@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
196 | if len(self.nsmaps) > 1: | 328 | if len(self.nsmaps) > 1: |
197 | # This tag, or one of its parents, introduced a namespace | 329 | # This tag, or one of its parents, introduced a namespace |
198 | # mapping, so pop it off the stack. | 330 | # mapping, so pop it off the stack. |
199 | self.nsmaps.pop() | 331 | out_of_scope_nsmap = self.nsmaps.pop() |
200 | 332 | ||
333 | if out_of_scope_nsmap is not None: | ||
334 | # This tag introduced a namespace mapping which is no | ||
335 | # longer in scope. Recalculate the currently active | ||
336 | # namespace prefixes. | ||
337 | self.active_namespace_prefixes.pop() | ||
338 | |||
201 | def pi(self, target, data): | 339 | def pi(self, target, data): |
202 | self.soup.endData() | 340 | self.soup.endData() |
203 | self.soup.handle_data(target + ' ' + data) | 341 | data = target + ' ' + data |
204 | self.soup.endData(ProcessingInstruction) | 342 | self.soup.handle_data(data) |
205 | 343 | self.soup.endData(self.processing_instruction_class) | |
344 | |||
206 | def data(self, content): | 345 | def data(self, content): |
207 | self.soup.handle_data(content) | 346 | self.soup.handle_data(content) |
208 | 347 | ||
@@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
229 | 368 | ||
230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] | 369 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] |
231 | is_xml = False | 370 | is_xml = False |
371 | processing_instruction_class = ProcessingInstruction | ||
232 | 372 | ||
233 | def default_parser(self, encoding): | 373 | def default_parser(self, encoding): |
234 | return etree.HTMLParser | 374 | return etree.HTMLParser |
@@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
240 | self.parser.feed(markup) | 380 | self.parser.feed(markup) |
241 | self.parser.close() | 381 | self.parser.close() |
242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: | 382 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
243 | raise ParserRejectedMarkup(str(e)) | 383 | raise ParserRejectedMarkup(e) |
244 | 384 | ||
245 | 385 | ||
246 | def test_fragment_to_document(self, fragment): | 386 | def test_fragment_to_document(self, fragment): |