diff options
| author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2024-05-31 12:04:03 +0100 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2024-05-31 12:43:18 +0100 |
| commit | 12fa81e8d67f0d9755decde5c5b766f56b2af8db (patch) | |
| tree | de58af9a17e4760de36091d525d7eba8bc6f1578 /bitbake/lib/bs4/builder/_lxml.py | |
| parent | 99ff46cc9bb12619af55c892452cee3b90a545f0 (diff) | |
| download | poky-12fa81e8d67f0d9755decde5c5b766f56b2af8db.tar.gz | |
bs4: Update to 4.12.3 from 4.4.1
It makes sense to switch to a more recent version and keep up to date
with upstream changes and things like new Python version support.
(Bitbake rev: f5462156036e71911c66d07dbf3303cde862785b)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/builder/_lxml.py')
| -rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 212 |
1 file changed, 176 insertions, 36 deletions
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index 9c6c14ee65..4f7cf74681 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
| @@ -1,19 +1,28 @@ | |||
| 1 | # Use of this source code is governed by the MIT license. | ||
| 2 | __license__ = "MIT" | ||
| 3 | |||
| 1 | __all__ = [ | 4 | __all__ = [ |
| 2 | 'LXMLTreeBuilderForXML', | 5 | 'LXMLTreeBuilderForXML', |
| 3 | 'LXMLTreeBuilder', | 6 | 'LXMLTreeBuilder', |
| 4 | ] | 7 | ] |
| 5 | 8 | ||
| 9 | try: | ||
| 10 | from collections.abc import Callable # Python 3.6 | ||
| 11 | except ImportError as e: | ||
| 12 | from collections import Callable | ||
| 13 | |||
| 6 | from io import BytesIO | 14 | from io import BytesIO |
| 7 | from io import StringIO | 15 | from io import StringIO |
| 8 | import collections | ||
| 9 | from lxml import etree | 16 | from lxml import etree |
| 10 | from bs4.element import ( | 17 | from bs4.element import ( |
| 11 | Comment, | 18 | Comment, |
| 12 | Doctype, | 19 | Doctype, |
| 13 | NamespacedAttribute, | 20 | NamespacedAttribute, |
| 14 | ProcessingInstruction, | 21 | ProcessingInstruction, |
| 22 | XMLProcessingInstruction, | ||
| 15 | ) | 23 | ) |
| 16 | from bs4.builder import ( | 24 | from bs4.builder import ( |
| 25 | DetectsXMLParsedAsHTML, | ||
| 17 | FAST, | 26 | FAST, |
| 18 | HTML, | 27 | HTML, |
| 19 | HTMLTreeBuilder, | 28 | HTMLTreeBuilder, |
| @@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector | |||
| 25 | 34 | ||
| 26 | LXML = 'lxml' | 35 | LXML = 'lxml' |
| 27 | 36 | ||
| 37 | def _invert(d): | ||
| 38 | "Invert a dictionary." | ||
| 39 | return dict((v,k) for k, v in list(d.items())) | ||
| 40 | |||
| 28 | class LXMLTreeBuilderForXML(TreeBuilder): | 41 | class LXMLTreeBuilderForXML(TreeBuilder): |
| 29 | DEFAULT_PARSER_CLASS = etree.XMLParser | 42 | DEFAULT_PARSER_CLASS = etree.XMLParser |
| 30 | 43 | ||
| 31 | is_xml = True | 44 | is_xml = True |
| 45 | processing_instruction_class = XMLProcessingInstruction | ||
| 32 | 46 | ||
| 33 | NAME = "lxml-xml" | 47 | NAME = "lxml-xml" |
| 34 | ALTERNATE_NAMES = ["xml"] | 48 | ALTERNATE_NAMES = ["xml"] |
| @@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 40 | 54 | ||
| 41 | # This namespace mapping is specified in the XML Namespace | 55 | # This namespace mapping is specified in the XML Namespace |
| 42 | # standard. | 56 | # standard. |
| 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} | 57 | DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') |
| 58 | |||
| 59 | DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) | ||
| 60 | |||
| 61 | # NOTE: If we parsed Element objects and looked at .sourceline, | ||
| 62 | # we'd be able to see the line numbers from the original document. | ||
| 63 | # But instead we build an XMLParser or HTMLParser object to serve | ||
| 64 | # as the target of parse messages, and those messages don't include | ||
| 65 | # line numbers. | ||
| 66 | # See: https://bugs.launchpad.net/lxml/+bug/1846906 | ||
| 67 | |||
| 68 | def initialize_soup(self, soup): | ||
| 69 | """Let the BeautifulSoup object know about the standard namespace | ||
| 70 | mapping. | ||
| 71 | |||
| 72 | :param soup: A `BeautifulSoup`. | ||
| 73 | """ | ||
| 74 | super(LXMLTreeBuilderForXML, self).initialize_soup(soup) | ||
| 75 | self._register_namespaces(self.DEFAULT_NSMAPS) | ||
| 76 | |||
| 77 | def _register_namespaces(self, mapping): | ||
| 78 | """Let the BeautifulSoup object know about namespaces encountered | ||
| 79 | while parsing the document. | ||
| 80 | |||
| 81 | This might be useful later on when creating CSS selectors. | ||
| 82 | |||
| 83 | This will track (almost) all namespaces, even ones that were | ||
| 84 | only in scope for part of the document. If two namespaces have | ||
| 85 | the same prefix, only the first one encountered will be | ||
| 86 | tracked. Un-prefixed namespaces are not tracked. | ||
| 44 | 87 | ||
| 88 | :param mapping: A dictionary mapping namespace prefixes to URIs. | ||
| 89 | """ | ||
| 90 | for key, value in list(mapping.items()): | ||
| 91 | # This is 'if key' and not 'if key is not None' because we | ||
| 92 | # don't track un-prefixed namespaces. Soupselect will | ||
| 93 | # treat an un-prefixed namespace as the default, which | ||
| 94 | # causes confusion in some cases. | ||
| 95 | if key and key not in self.soup._namespaces: | ||
| 96 | # Let the BeautifulSoup object know about a new namespace. | ||
| 97 | # If there are multiple namespaces defined with the same | ||
| 98 | # prefix, the first one in the document takes precedence. | ||
| 99 | self.soup._namespaces[key] = value | ||
| 100 | |||
| 45 | def default_parser(self, encoding): | 101 | def default_parser(self, encoding): |
| 46 | # This can either return a parser object or a class, which | 102 | """Find the default parser for the given encoding. |
| 47 | # will be instantiated with default arguments. | 103 | |
| 104 | :param encoding: A string. | ||
| 105 | :return: Either a parser object or a class, which | ||
| 106 | will be instantiated with default arguments. | ||
| 107 | """ | ||
| 48 | if self._default_parser is not None: | 108 | if self._default_parser is not None: |
| 49 | return self._default_parser | 109 | return self._default_parser |
| 50 | return etree.XMLParser( | 110 | return etree.XMLParser( |
| 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) | 111 | target=self, strip_cdata=False, recover=True, encoding=encoding) |
| 52 | 112 | ||
| 53 | def parser_for(self, encoding): | 113 | def parser_for(self, encoding): |
| 114 | """Instantiate an appropriate parser for the given encoding. | ||
| 115 | |||
| 116 | :param encoding: A string. | ||
| 117 | :return: A parser object such as an `etree.XMLParser`. | ||
| 118 | """ | ||
| 54 | # Use the default parser. | 119 | # Use the default parser. |
| 55 | parser = self.default_parser(encoding) | 120 | parser = self.default_parser(encoding) |
| 56 | 121 | ||
| 57 | if isinstance(parser, collections.Callable): | 122 | if isinstance(parser, Callable): |
| 58 | # Instantiate the parser with default arguments | 123 | # Instantiate the parser with default arguments |
| 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) | 124 | parser = parser( |
| 125 | target=self, strip_cdata=False, recover=True, encoding=encoding | ||
| 126 | ) | ||
| 60 | return parser | 127 | return parser |
| 61 | 128 | ||
| 62 | def __init__(self, parser=None, empty_element_tags=None): | 129 | def __init__(self, parser=None, empty_element_tags=None, **kwargs): |
| 63 | # TODO: Issue a warning if parser is present but not a | 130 | # TODO: Issue a warning if parser is present but not a |
| 64 | # callable, since that means there's no way to create new | 131 | # callable, since that means there's no way to create new |
| 65 | # parsers for different encodings. | 132 | # parsers for different encodings. |
| @@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 67 | if empty_element_tags is not None: | 134 | if empty_element_tags is not None: |
| 68 | self.empty_element_tags = set(empty_element_tags) | 135 | self.empty_element_tags = set(empty_element_tags) |
| 69 | self.soup = None | 136 | self.soup = None |
| 70 | self.nsmaps = [self.DEFAULT_NSMAPS] | 137 | self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] |
| 71 | 138 | self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)] | |
| 139 | super(LXMLTreeBuilderForXML, self).__init__(**kwargs) | ||
| 140 | |||
| 72 | def _getNsTag(self, tag): | 141 | def _getNsTag(self, tag): |
| 73 | # Split the namespace URL out of a fully-qualified lxml tag | 142 | # Split the namespace URL out of a fully-qualified lxml tag |
| 74 | # name. Copied from lxml's src/lxml/sax.py. | 143 | # name. Copied from lxml's src/lxml/sax.py. |
| @@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 80 | def prepare_markup(self, markup, user_specified_encoding=None, | 149 | def prepare_markup(self, markup, user_specified_encoding=None, |
| 81 | exclude_encodings=None, | 150 | exclude_encodings=None, |
| 82 | document_declared_encoding=None): | 151 | document_declared_encoding=None): |
| 83 | """ | 152 | """Run any preliminary steps necessary to make incoming markup |
| 84 | :yield: A series of 4-tuples. | 153 | acceptable to the parser. |
| 154 | |||
| 155 | lxml really wants to get a bytestring and convert it to | ||
| 156 | Unicode itself. So instead of using UnicodeDammit to convert | ||
| 157 | the bytestring to Unicode using different encodings, this | ||
| 158 | implementation uses EncodingDetector to iterate over the | ||
| 159 | encodings, and tell lxml to try to parse the document as each | ||
| 160 | one in turn. | ||
| 161 | |||
| 162 | :param markup: Some markup -- hopefully a bytestring. | ||
| 163 | :param user_specified_encoding: The user asked to try this encoding. | ||
| 164 | :param document_declared_encoding: The markup itself claims to be | ||
| 165 | in this encoding. | ||
| 166 | :param exclude_encodings: The user asked _not_ to try any of | ||
| 167 | these encodings. | ||
| 168 | |||
| 169 | :yield: A series of 4-tuples: | ||
| 85 | (markup, encoding, declared encoding, | 170 | (markup, encoding, declared encoding, |
| 86 | has undergone character replacement) | 171 | has undergone character replacement) |
| 87 | 172 | ||
| 88 | Each 4-tuple represents a strategy for parsing the document. | 173 | Each 4-tuple represents a strategy for converting the |
| 174 | document to Unicode and parsing it. Each strategy will be tried | ||
| 175 | in turn. | ||
| 89 | """ | 176 | """ |
| 177 | is_html = not self.is_xml | ||
| 178 | if is_html: | ||
| 179 | self.processing_instruction_class = ProcessingInstruction | ||
| 180 | # We're in HTML mode, so if we're given XML, that's worth | ||
| 181 | # noting. | ||
| 182 | DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml( | ||
| 183 | markup, stacklevel=3 | ||
| 184 | ) | ||
| 185 | else: | ||
| 186 | self.processing_instruction_class = XMLProcessingInstruction | ||
| 187 | |||
| 90 | if isinstance(markup, str): | 188 | if isinstance(markup, str): |
| 91 | # We were given Unicode. Maybe lxml can parse Unicode on | 189 | # We were given Unicode. Maybe lxml can parse Unicode on |
| 92 | # this system? | 190 | # this system? |
| 191 | |||
| 192 | # TODO: This is a workaround for | ||
| 193 | # https://bugs.launchpad.net/lxml/+bug/1948551. | ||
| 194 | # We can remove it once the upstream issue is fixed. | ||
| 195 | if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}': | ||
| 196 | markup = markup[1:] | ||
| 93 | yield markup, None, document_declared_encoding, False | 197 | yield markup, None, document_declared_encoding, False |
| 94 | 198 | ||
| 95 | if isinstance(markup, str): | 199 | if isinstance(markup, str): |
| @@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 98 | yield (markup.encode("utf8"), "utf8", | 202 | yield (markup.encode("utf8"), "utf8", |
| 99 | document_declared_encoding, False) | 203 | document_declared_encoding, False) |
| 100 | 204 | ||
| 101 | # Instead of using UnicodeDammit to convert the bytestring to | 205 | # This was provided by the end-user; treat it as a known |
| 102 | # Unicode using different encodings, use EncodingDetector to | 206 | # definite encoding per the algorithm laid out in the HTML5 |
| 103 | # iterate over the encodings, and tell lxml to try to parse | 207 | # spec. (See the EncodingDetector class for details.) |
| 104 | # the document as each one in turn. | 208 | known_definite_encodings = [user_specified_encoding] |
| 105 | is_html = not self.is_xml | 209 | |
| 106 | try_encodings = [user_specified_encoding, document_declared_encoding] | 210 | # This was found in the document; treat it as a slightly lower-priority |
| 211 | # user encoding. | ||
| 212 | user_encodings = [document_declared_encoding] | ||
| 107 | detector = EncodingDetector( | 213 | detector = EncodingDetector( |
| 108 | markup, try_encodings, is_html, exclude_encodings) | 214 | markup, known_definite_encodings=known_definite_encodings, |
| 215 | user_encodings=user_encodings, is_html=is_html, | ||
| 216 | exclude_encodings=exclude_encodings | ||
| 217 | ) | ||
| 109 | for encoding in detector.encodings: | 218 | for encoding in detector.encodings: |
| 110 | yield (detector.markup, encoding, document_declared_encoding, False) | 219 | yield (detector.markup, encoding, document_declared_encoding, False) |
| 111 | 220 | ||
| @@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 128 | self.parser.feed(data) | 237 | self.parser.feed(data) |
| 129 | self.parser.close() | 238 | self.parser.close() |
| 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: | 239 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
| 131 | raise ParserRejectedMarkup(str(e)) | 240 | raise ParserRejectedMarkup(e) |
| 132 | 241 | ||
| 133 | def close(self): | 242 | def close(self): |
| 134 | self.nsmaps = [self.DEFAULT_NSMAPS] | 243 | self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] |
| 135 | 244 | ||
| 136 | def start(self, name, attrs, nsmap={}): | 245 | def start(self, name, attrs, nsmap={}): |
| 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. | 246 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. |
| 138 | attrs = dict(attrs) | 247 | attrs = dict(attrs) |
| 139 | nsprefix = None | 248 | nsprefix = None |
| 140 | # Invert each namespace map as it comes in. | 249 | # Invert each namespace map as it comes in. |
| 141 | if len(self.nsmaps) > 1: | 250 | if len(nsmap) == 0 and len(self.nsmaps) > 1: |
| 142 | # There are no new namespaces for this tag, but | 251 | # There are no new namespaces for this tag, but |
| 143 | # non-default namespaces are in play, so we need a | 252 | # non-default namespaces are in play, so we need a |
| 144 | # separate tag stack to know when they end. | 253 | # separate tag stack to know when they end. |
| 145 | self.nsmaps.append(None) | 254 | self.nsmaps.append(None) |
| 146 | elif len(nsmap) > 0: | 255 | elif len(nsmap) > 0: |
| 147 | # A new namespace mapping has come into play. | 256 | # A new namespace mapping has come into play. |
| 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) | 257 | |
| 149 | self.nsmaps.append(inverted_nsmap) | 258 | # First, let the BeautifulSoup object know about it. |
| 259 | self._register_namespaces(nsmap) | ||
| 260 | |||
| 261 | # Then, add it to our running list of inverted namespace | ||
| 262 | # mappings. | ||
| 263 | self.nsmaps.append(_invert(nsmap)) | ||
| 264 | |||
| 265 | # The currently active namespace prefixes have | ||
| 266 | # changed. Calculate the new mapping so it can be stored | ||
| 267 | # with all Tag objects created while these prefixes are in | ||
| 268 | # scope. | ||
| 269 | current_mapping = dict(self.active_namespace_prefixes[-1]) | ||
| 270 | current_mapping.update(nsmap) | ||
| 271 | |||
| 272 | # We should not track un-prefixed namespaces as we can only hold one | ||
| 273 | # and it will be recognized as the default namespace by soupsieve, | ||
| 274 | # which may be confusing in some situations. | ||
| 275 | if '' in current_mapping: | ||
| 276 | del current_mapping[''] | ||
| 277 | self.active_namespace_prefixes.append(current_mapping) | ||
| 278 | |||
| 150 | # Also treat the namespace mapping as a set of attributes on the | 279 | # Also treat the namespace mapping as a set of attributes on the |
| 151 | # tag, so we can recreate it later. | 280 | # tag, so we can recreate it later. |
| 152 | attrs = attrs.copy() | 281 | attrs = attrs.copy() |
| @@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 171 | 300 | ||
| 172 | namespace, name = self._getNsTag(name) | 301 | namespace, name = self._getNsTag(name) |
| 173 | nsprefix = self._prefix_for_namespace(namespace) | 302 | nsprefix = self._prefix_for_namespace(namespace) |
| 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) | 303 | self.soup.handle_starttag( |
| 175 | 304 | name, namespace, nsprefix, attrs, | |
| 305 | namespaces=self.active_namespace_prefixes[-1] | ||
| 306 | ) | ||
| 307 | |||
| 176 | def _prefix_for_namespace(self, namespace): | 308 | def _prefix_for_namespace(self, namespace): |
| 177 | """Find the currently active prefix for the given namespace.""" | 309 | """Find the currently active prefix for the given namespace.""" |
| 178 | if namespace is None: | 310 | if namespace is None: |
| @@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 196 | if len(self.nsmaps) > 1: | 328 | if len(self.nsmaps) > 1: |
| 197 | # This tag, or one of its parents, introduced a namespace | 329 | # This tag, or one of its parents, introduced a namespace |
| 198 | # mapping, so pop it off the stack. | 330 | # mapping, so pop it off the stack. |
| 199 | self.nsmaps.pop() | 331 | out_of_scope_nsmap = self.nsmaps.pop() |
| 200 | 332 | ||
| 333 | if out_of_scope_nsmap is not None: | ||
| 334 | # This tag introduced a namespace mapping which is no | ||
| 335 | # longer in scope. Recalculate the currently active | ||
| 336 | # namespace prefixes. | ||
| 337 | self.active_namespace_prefixes.pop() | ||
| 338 | |||
| 201 | def pi(self, target, data): | 339 | def pi(self, target, data): |
| 202 | self.soup.endData() | 340 | self.soup.endData() |
| 203 | self.soup.handle_data(target + ' ' + data) | 341 | data = target + ' ' + data |
| 204 | self.soup.endData(ProcessingInstruction) | 342 | self.soup.handle_data(data) |
| 205 | 343 | self.soup.endData(self.processing_instruction_class) | |
| 344 | |||
| 206 | def data(self, content): | 345 | def data(self, content): |
| 207 | self.soup.handle_data(content) | 346 | self.soup.handle_data(content) |
| 208 | 347 | ||
| @@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
| 229 | 368 | ||
| 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] | 369 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] |
| 231 | is_xml = False | 370 | is_xml = False |
| 371 | processing_instruction_class = ProcessingInstruction | ||
| 232 | 372 | ||
| 233 | def default_parser(self, encoding): | 373 | def default_parser(self, encoding): |
| 234 | return etree.HTMLParser | 374 | return etree.HTMLParser |
| @@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
| 240 | self.parser.feed(markup) | 380 | self.parser.feed(markup) |
| 241 | self.parser.close() | 381 | self.parser.close() |
| 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: | 382 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
| 243 | raise ParserRejectedMarkup(str(e)) | 383 | raise ParserRejectedMarkup(e) |
| 244 | 384 | ||
| 245 | 385 | ||
| 246 | def test_fragment_to_document(self, fragment): | 386 | def test_fragment_to_document(self, fragment): |
