summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder/_lxml.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder/_lxml.py')
-rw-r--r--bitbake/lib/bs4/builder/_lxml.py212
1 files changed, 176 insertions, 36 deletions
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index 9c6c14ee65..4f7cf74681 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -1,19 +1,28 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1__all__ = [ 4__all__ = [
2 'LXMLTreeBuilderForXML', 5 'LXMLTreeBuilderForXML',
3 'LXMLTreeBuilder', 6 'LXMLTreeBuilder',
4 ] 7 ]
5 8
9try:
10 from collections.abc import Callable # Python 3.6
11except ImportError as e:
12 from collections import Callable
13
6from io import BytesIO 14from io import BytesIO
7from io import StringIO 15from io import StringIO
8import collections
9from lxml import etree 16from lxml import etree
10from bs4.element import ( 17from bs4.element import (
11 Comment, 18 Comment,
12 Doctype, 19 Doctype,
13 NamespacedAttribute, 20 NamespacedAttribute,
14 ProcessingInstruction, 21 ProcessingInstruction,
22 XMLProcessingInstruction,
15) 23)
16from bs4.builder import ( 24from bs4.builder import (
25 DetectsXMLParsedAsHTML,
17 FAST, 26 FAST,
18 HTML, 27 HTML,
19 HTMLTreeBuilder, 28 HTMLTreeBuilder,
@@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector
25 34
26LXML = 'lxml' 35LXML = 'lxml'
27 36
def _invert(d):
    """Return a new dictionary with the keys and values of ``d`` swapped.

    :param d: A dictionary whose values are hashable.
    :return: A new dictionary mapping each value of ``d`` to its key.
        If several keys share a value, the key that comes last in
        iteration order wins.
    """
    # A dict comprehension avoids building an intermediate list of items.
    return {value: key for key, value in d.items()}
40
28class LXMLTreeBuilderForXML(TreeBuilder): 41class LXMLTreeBuilderForXML(TreeBuilder):
29 DEFAULT_PARSER_CLASS = etree.XMLParser 42 DEFAULT_PARSER_CLASS = etree.XMLParser
30 43
31 is_xml = True 44 is_xml = True
45 processing_instruction_class = XMLProcessingInstruction
32 46
33 NAME = "lxml-xml" 47 NAME = "lxml-xml"
34 ALTERNATE_NAMES = ["xml"] 48 ALTERNATE_NAMES = ["xml"]
@@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder):
40 54
41 # This namespace mapping is specified in the XML Namespace 55 # This namespace mapping is specified in the XML Namespace
42 # standard. 56 # standard.
43 DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 57 DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
58
59 DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
60
61 # NOTE: If we parsed Element objects and looked at .sourceline,
62 # we'd be able to see the line numbers from the original document.
63 # But instead we build an XMLParser or HTMLParser object to serve
64 # as the target of parse messages, and those messages don't include
65 # line numbers.
66 # See: https://bugs.launchpad.net/lxml/+bug/1846906
67
def initialize_soup(self, soup):
    """Run the parent class's soup initialization, then register the
    standard namespace mapping.

    The mapping handed to ``_register_namespaces`` is the class-level
    ``DEFAULT_NSMAPS`` dictionary.

    :param soup: A `BeautifulSoup`.
    """
    parent = super(LXMLTreeBuilderForXML, self)
    parent.initialize_soup(soup)

    standard_namespaces = self.DEFAULT_NSMAPS
    self._register_namespaces(standard_namespaces)
76
def _register_namespaces(self, mapping):
    """Let the BeautifulSoup object know about namespaces encountered
    while parsing the document.

    This might be useful later on when creating CSS selectors.

    This will track (almost) all namespaces, even ones that were
    only in scope for part of the document. If two namespaces have
    the same prefix, only the first one encountered will be
    tracked. Un-prefixed namespaces are not tracked.

    :param mapping: A dictionary mapping namespace prefixes to URIs.
    """
    # The loop never mutates `mapping`, so it can be iterated directly
    # without first copying its items into a list.
    for prefix, uri in mapping.items():
        # This is a truthiness test and not 'is not None' because we
        # don't track un-prefixed namespaces (prefix None or '').
        # Soupselect will treat an un-prefixed namespace as the
        # default, which causes confusion in some cases.
        if prefix and prefix not in self.soup._namespaces:
            # Let the BeautifulSoup object know about a new namespace.
            # If there are multiple namespaces defined with the same
            # prefix, the first one in the document takes precedence.
            self.soup._namespaces[prefix] = uri
100
def default_parser(self, encoding):
    """Choose the parser to use for a document in the given encoding.

    A parser stored in ``self._default_parser`` always takes precedence.
    Otherwise a new ``etree.XMLParser`` is created with this builder as
    its target, ``strip_cdata=False`` and ``recover=True``.

    :param encoding: A string.
    :return: Either a parser object or a class, which
        will be instantiated with default arguments.
    """
    configured = self._default_parser
    if configured is None:
        return etree.XMLParser(
            target=self,
            strip_cdata=False,
            recover=True,
            encoding=encoding,
        )
    return configured
52 112
def parser_for(self, encoding):
    """Produce the parser that should handle the given encoding.

    Whatever ``default_parser`` returns is used directly unless it is
    callable, in which case it is called to create the parser.

    :param encoding: A string.
    :return: A parser object such as an `etree.XMLParser`.
    """
    candidate = self.default_parser(encoding)
    if not isinstance(candidate, Callable):
        # Not callable, so there is nothing to instantiate.
        return candidate

    # default_parser() handed back a class (or other factory):
    # instantiate it with the default arguments.
    return candidate(
        target=self, strip_cdata=False, recover=True, encoding=encoding
    )
61 128
62 def __init__(self, parser=None, empty_element_tags=None): 129 def __init__(self, parser=None, empty_element_tags=None, **kwargs):
63 # TODO: Issue a warning if parser is present but not a 130 # TODO: Issue a warning if parser is present but not a
64 # callable, since that means there's no way to create new 131 # callable, since that means there's no way to create new
65 # parsers for different encodings. 132 # parsers for different encodings.
@@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
67 if empty_element_tags is not None: 134 if empty_element_tags is not None:
68 self.empty_element_tags = set(empty_element_tags) 135 self.empty_element_tags = set(empty_element_tags)
69 self.soup = None 136 self.soup = None
70 self.nsmaps = [self.DEFAULT_NSMAPS] 137 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
71 138 self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
139 super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
140
72 def _getNsTag(self, tag): 141 def _getNsTag(self, tag):
73 # Split the namespace URL out of a fully-qualified lxml tag 142 # Split the namespace URL out of a fully-qualified lxml tag
74 # name. Copied from lxml's src/lxml/sax.py. 143 # name. Copied from lxml's src/lxml/sax.py.
@@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder):
80 def prepare_markup(self, markup, user_specified_encoding=None, 149 def prepare_markup(self, markup, user_specified_encoding=None,
81 exclude_encodings=None, 150 exclude_encodings=None,
82 document_declared_encoding=None): 151 document_declared_encoding=None):
83 """ 152 """Run any preliminary steps necessary to make incoming markup
84 :yield: A series of 4-tuples. 153 acceptable to the parser.
154
155 lxml really wants to get a bytestring and convert it to
156 Unicode itself. So instead of using UnicodeDammit to convert
157 the bytestring to Unicode using different encodings, this
158 implementation uses EncodingDetector to iterate over the
159 encodings, and tell lxml to try to parse the document as each
160 one in turn.
161
162 :param markup: Some markup -- hopefully a bytestring.
163 :param user_specified_encoding: The user asked to try this encoding.
164 :param document_declared_encoding: The markup itself claims to be
165 in this encoding.
166 :param exclude_encodings: The user asked _not_ to try any of
167 these encodings.
168
169 :yield: A series of 4-tuples:
85 (markup, encoding, declared encoding, 170 (markup, encoding, declared encoding,
86 has undergone character replacement) 171 has undergone character replacement)
87 172
88 Each 4-tuple represents a strategy for parsing the document. 173 Each 4-tuple represents a strategy for converting the
174 document to Unicode and parsing it. Each strategy will be tried
175 in turn.
89 """ 176 """
177 is_html = not self.is_xml
178 if is_html:
179 self.processing_instruction_class = ProcessingInstruction
180 # We're in HTML mode, so if we're given XML, that's worth
181 # noting.
182 DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
183 markup, stacklevel=3
184 )
185 else:
186 self.processing_instruction_class = XMLProcessingInstruction
187
90 if isinstance(markup, str): 188 if isinstance(markup, str):
91 # We were given Unicode. Maybe lxml can parse Unicode on 189 # We were given Unicode. Maybe lxml can parse Unicode on
92 # this system? 190 # this system?
191
192 # TODO: This is a workaround for
193 # https://bugs.launchpad.net/lxml/+bug/1948551.
194 # We can remove it once the upstream issue is fixed.
195 if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
196 markup = markup[1:]
93 yield markup, None, document_declared_encoding, False 197 yield markup, None, document_declared_encoding, False
94 198
95 if isinstance(markup, str): 199 if isinstance(markup, str):
@@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
98 yield (markup.encode("utf8"), "utf8", 202 yield (markup.encode("utf8"), "utf8",
99 document_declared_encoding, False) 203 document_declared_encoding, False)
100 204
101 # Instead of using UnicodeDammit to convert the bytestring to 205 # This was provided by the end-user; treat it as a known
102 # Unicode using different encodings, use EncodingDetector to 206 # definite encoding per the algorithm laid out in the HTML5
103 # iterate over the encodings, and tell lxml to try to parse 207 # spec. (See the EncodingDetector class for details.)
104 # the document as each one in turn. 208 known_definite_encodings = [user_specified_encoding]
105 is_html = not self.is_xml 209
106 try_encodings = [user_specified_encoding, document_declared_encoding] 210 # This was found in the document; treat it as a slightly lower-priority
211 # user encoding.
212 user_encodings = [document_declared_encoding]
107 detector = EncodingDetector( 213 detector = EncodingDetector(
108 markup, try_encodings, is_html, exclude_encodings) 214 markup, known_definite_encodings=known_definite_encodings,
215 user_encodings=user_encodings, is_html=is_html,
216 exclude_encodings=exclude_encodings
217 )
109 for encoding in detector.encodings: 218 for encoding in detector.encodings:
110 yield (detector.markup, encoding, document_declared_encoding, False) 219 yield (detector.markup, encoding, document_declared_encoding, False)
111 220
@@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder):
128 self.parser.feed(data) 237 self.parser.feed(data)
129 self.parser.close() 238 self.parser.close()
130 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 239 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
131 raise ParserRejectedMarkup(str(e)) 240 raise ParserRejectedMarkup(e)
132 241
def close(self):
    """Reset the stack of inverted namespace maps so that it holds only
    the class-level ``DEFAULT_NSMAPS_INVERTED`` mapping.
    """
    base_inverted_map = self.DEFAULT_NSMAPS_INVERTED
    self.nsmaps = [base_inverted_map]
135 244
136 def start(self, name, attrs, nsmap={}): 245 def start(self, name, attrs, nsmap={}):
137 # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 246 # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
138 attrs = dict(attrs) 247 attrs = dict(attrs)
139 nsprefix = None 248 nsprefix = None
140 # Invert each namespace map as it comes in. 249 # Invert each namespace map as it comes in.
141 if len(self.nsmaps) > 1: 250 if len(nsmap) == 0 and len(self.nsmaps) > 1:
142 # There are no new namespaces for this tag, but 251 # There are no new namespaces for this tag, but
143 # non-default namespaces are in play, so we need a 252 # non-default namespaces are in play, so we need a
144 # separate tag stack to know when they end. 253 # separate tag stack to know when they end.
145 self.nsmaps.append(None) 254 self.nsmaps.append(None)
146 elif len(nsmap) > 0: 255 elif len(nsmap) > 0:
147 # A new namespace mapping has come into play. 256 # A new namespace mapping has come into play.
148 inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 257
149 self.nsmaps.append(inverted_nsmap) 258 # First, let the BeautifulSoup object know about it.
259 self._register_namespaces(nsmap)
260
261 # Then, add it to our running list of inverted namespace
262 # mappings.
263 self.nsmaps.append(_invert(nsmap))
264
265 # The currently active namespace prefixes have
266 # changed. Calculate the new mapping so it can be stored
267 # with all Tag objects created while these prefixes are in
268 # scope.
269 current_mapping = dict(self.active_namespace_prefixes[-1])
270 current_mapping.update(nsmap)
271
272 # We should not track un-prefixed namespaces as we can only hold one
273 # and it will be recognized as the default namespace by soupsieve,
274 # which may be confusing in some situations.
275 if '' in current_mapping:
276 del current_mapping['']
277 self.active_namespace_prefixes.append(current_mapping)
278
150 # Also treat the namespace mapping as a set of attributes on the 279 # Also treat the namespace mapping as a set of attributes on the
151 # tag, so we can recreate it later. 280 # tag, so we can recreate it later.
152 attrs = attrs.copy() 281 attrs = attrs.copy()
@@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
171 300
172 namespace, name = self._getNsTag(name) 301 namespace, name = self._getNsTag(name)
173 nsprefix = self._prefix_for_namespace(namespace) 302 nsprefix = self._prefix_for_namespace(namespace)
174 self.soup.handle_starttag(name, namespace, nsprefix, attrs) 303 self.soup.handle_starttag(
175 304 name, namespace, nsprefix, attrs,
305 namespaces=self.active_namespace_prefixes[-1]
306 )
307
176 def _prefix_for_namespace(self, namespace): 308 def _prefix_for_namespace(self, namespace):
177 """Find the currently active prefix for the given namespace.""" 309 """Find the currently active prefix for the given namespace."""
178 if namespace is None: 310 if namespace is None:
@@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
196 if len(self.nsmaps) > 1: 328 if len(self.nsmaps) > 1:
197 # This tag, or one of its parents, introduced a namespace 329 # This tag, or one of its parents, introduced a namespace
198 # mapping, so pop it off the stack. 330 # mapping, so pop it off the stack.
199 self.nsmaps.pop() 331 out_of_scope_nsmap = self.nsmaps.pop()
200 332
333 if out_of_scope_nsmap is not None:
334 # This tag introduced a namespace mapping which is no
335 # longer in scope. Recalculate the currently active
336 # namespace prefixes.
337 self.active_namespace_prefixes.pop()
338
def pi(self, target, data):
    """Handle a processing instruction reported by the parser.

    Any pending text on the soup is flushed first. The instruction's
    target and data, joined by a single space, are then handed to the
    soup and closed off as ``self.processing_instruction_class``.

    :param target: The target of the processing instruction.
    :param data: The content that follows the target.
    """
    soup = self.soup
    soup.endData()

    instruction_text = target + ' ' + data
    soup.handle_data(instruction_text)
    soup.endData(self.processing_instruction_class)
344
def data(self, content):
    """Forward character data from the parser to the soup object.

    :param content: The text passed on to ``self.soup.handle_data``.
    """
    forward = self.soup.handle_data
    forward(content)
208 347
@@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
229 368
230 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 369 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
231 is_xml = False 370 is_xml = False
371 processing_instruction_class = ProcessingInstruction
232 372
def default_parser(self, encoding):
    """Return the parser class used by this builder.

    :param encoding: Not used by this implementation.
    :return: The ``etree.HTMLParser`` class itself, not an instance.
    """
    html_parser_class = etree.HTMLParser
    return html_parser_class
@@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
240 self.parser.feed(markup) 380 self.parser.feed(markup)
241 self.parser.close() 381 self.parser.close()
242 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 382 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
243 raise ParserRejectedMarkup(str(e)) 383 raise ParserRejectedMarkup(e)
244 384
245 385
246 def test_fragment_to_document(self, fragment): 386 def test_fragment_to_document(self, fragment):