bs4: Update to 4.12.3 from 4.4.1

It makes sense to switch to a more recent version and keep up to date with upstream changes and things like new Python version support. (Bitbake rev: f5462156036e71911c66d07dbf3303cde862785b) Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
author: Richard Purdie <richard.purdie@linuxfoundation.org> 2024-05-31 12:04:03 +0100
committer: Richard Purdie <richard.purdie@linuxfoundation.org> 2024-05-31 12:43:18 +0100
commit: 12fa81e8d67f0d9755decde5c5b766f56b2af8db (patch)
tree: de58af9a17e4760de36091d525d7eba8bc6f1578 /bitbake/lib/bs4/builder/_lxml.py
parent: 99ff46cc9bb12619af55c892452cee3b90a545f0 (diff)
download: poky-12fa81e8d67f0d9755decde5c5b766f56b2af8db.tar.gz
1 files changed, 176 insertions, 36 deletions
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index 9c6c14ee65..4f7cf74681 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -1,19 +1,28 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 __all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError as e:
+    from collections import Callable
 from io import BytesIO
 from io import StringIO
-import collections
 from lxml import etree
 from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
+    XMLProcessingInstruction,
 )
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
    FAST,
    HTML,
    HTMLTreeBuilder,
@@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector
 LXML = 'lxml'
+def _invert(d):
+    "Invert a dictionary."
+    return dict((v,k) for k, v in list(d.items()))
 class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser
    is_xml = True
+    processing_instruction_class = XMLProcessingInstruction
    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]
@@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    # This namespace mapping is specified in the XML Namespace
    # standard.
-    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+    # NOTE: If we parsed Element objects and looked at .sourceline,
+    # we'd be able to see the line numbers from the original document.
+    # But instead we build an XMLParser or HTMLParser object to serve
+    # as the target of parse messages, and those messages don't include
+    # line numbers.
+    # See: https://bugs.launchpad.net/lxml/+bug/1846906
+    
+    def initialize_soup(self, soup):
+        """Let the BeautifulSoup object know about the standard namespace
+        mapping.
+        :param soup: A `BeautifulSoup`.
+        """
+        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+        self._register_namespaces(self.DEFAULT_NSMAPS)
+    def _register_namespaces(self, mapping):
+        """Let the BeautifulSoup object know about namespaces encountered
+        while parsing the document.
+        This might be useful later on when creating CSS selectors.
+        This will track (almost) all namespaces, even ones that were
+        only in scope for part of the document. If two namespaces have
+        the same prefix, only the first one encountered will be
+        tracked. Un-prefixed namespaces are not tracked.
+        :param mapping: A dictionary mapping namespace prefixes to URIs.
+        """
+        for key, value in list(mapping.items()):
+            # This is 'if key' and not 'if key is not None' because we
+            # don't track un-prefixed namespaces. Soupselect will
+            # treat an un-prefixed namespace as the default, which
+            # causes confusion in some cases.
+            if key and key not in self.soup._namespaces:
+                # Let the BeautifulSoup object know about a new namespace.
+                # If there are multiple namespaces defined with the same
+                # prefix, the first one in the document takes precedence.
+                self.soup._namespaces[key] = value
+                
    def default_parser(self, encoding):
-        # This can either return a parser object or a class, which
+        """Find the default parser for the given encoding.
-        # will be instantiated with default arguments.
+        :param encoding: A string.
+        :return: Either a parser object or a class, which
+          will be instantiated with default arguments.
+        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)
    def parser_for(self, encoding):
+        """Instantiate an appropriate parser for the given encoding.
+        :param encoding: A string.
+        :return: A parser object such as an `etree.XMLParser`.
+        """
        # Use the default parser.
        parser = self.default_parser(encoding)
-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
-            parser = parser(target=self, strip_cdata=False, encoding=encoding)
+            parser = parser(
+                target=self, strip_cdata=False, recover=True, encoding=encoding
+            )
        return parser
-    def __init__(self, parser=None, empty_element_tags=None):
+    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
@@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
+        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+        
    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
@@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
-        """
+        """Run any preliminary steps necessary to make incoming markup
-        :yield: A series of 4-tuples.
+        acceptable to the parser.
+        lxml really wants to get a bytestring and convert it to
+        Unicode itself. So instead of using UnicodeDammit to convert
+        the bytestring to Unicode using different encodings, this
+        implementation uses EncodingDetector to iterate over the
+        encodings, and tell lxml to try to parse the document as each
+        one in turn.
+        :param markup: Some markup -- hopefully a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)
-        Each 4-tuple represents a strategy for parsing the document.
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
        """
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+            # We're in HTML mode, so if we're given XML, that's worth
+            # noting.
+            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
+                markup, stacklevel=3
+            )
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+                markup = markup[1:]
            yield markup, None, document_declared_encoding, False
        if isinstance(markup, str):
@@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)
-        # Instead of using UnicodeDammit to convert the bytestring to
+        # This was provided by the end-user; treat it as a known
-        # Unicode using different encodings, use EncodingDetector to
+        # definite encoding per the algorithm laid out in the HTML5
-        # iterate over the encodings, and tell lxml to try to parse
+        # spec.  (See the EncodingDetector class for details.)
-        # the document as each one in turn.
+        known_definite_encodings = [user_specified_encoding]
-        is_html = not self.is_xml
-        try_encodings = [user_specified_encoding, document_declared_encoding]
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
-            markup, try_encodings, is_html, exclude_encodings)
+            markup, known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings, is_html=is_html,
+            exclude_encodings=exclude_encodings
+        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)
@@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
    def close(self):
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
    def start(self, name, attrs, nsmap={}):
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
-            # There are no new namespaces for this tag, but
+                # There are no new namespaces for this tag, but
-            # non-default namespaces are in play, so we need a
+                # non-default namespaces are in play, so we need a
-            # separate tag stack to know when they end.
+                # separate tag stack to know when they end.
-            self.nsmaps.append(None)
+                self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
-            self.nsmaps.append(inverted_nsmap)
+            # First, let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))
+            # The currently active namespace prefixes have
+            # changed. Calculate the new mapping so it can be stored
+            # with all Tag objects created while these prefixes are in
+            # scope.
+            current_mapping = dict(self.active_namespace_prefixes[-1])
+            current_mapping.update(nsmap)
+            # We should not track un-prefixed namespaces as we can only hold one
+            # and it will be recognized as the default namespace by soupsieve,
+            # which may be confusing in some situations.
+            if '' in current_mapping:
+                del current_mapping['']
+            self.active_namespace_prefixes.append(current_mapping)
+            
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
@@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
-        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
+        self.soup.handle_starttag(
+            name, namespace, nsprefix, attrs,
+            namespaces=self.active_namespace_prefixes[-1]
+        )
+        
    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
@@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
-            self.nsmaps.pop()
+            out_of_scope_nsmap = self.nsmaps.pop()
+            if out_of_scope_nsmap is not None:
+                # This tag introduced a namespace mapping which is no
+                # longer in scope. Recalculate the currently active
+                # namespace prefixes.
+                self.active_namespace_prefixes.pop()
+            
    def pi(self, target, data):
        self.soup.endData()
-        self.soup.handle_data(target + ' ' + data)
+        data = target + ' ' + data
-        self.soup.endData(ProcessingInstruction)
+        self.soup.handle_data(data)
+        self.soup.endData(self.processing_instruction_class)
+        
    def data(self, content):
        self.soup.handle_data(content)
@@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
+    processing_instruction_class = ProcessingInstruction
    def default_parser(self, encoding):
        return etree.HTMLParser
@@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
    def test_fragment_to_document(self, fragment):
author	Richard Purdie <richard.purdie@linuxfoundation.org>	2024-05-31 12:04:03 +0100
committer	Richard Purdie <richard.purdie@linuxfoundation.org>	2024-05-31 12:43:18 +0100
commit	12fa81e8d67f0d9755decde5c5b766f56b2af8db (patch)
tree	de58af9a17e4760de36091d525d7eba8bc6f1578 /bitbake/lib/bs4/builder/_lxml.py
parent	99ff46cc9bb12619af55c892452cee3b90a545f0 (diff)
download	poky-12fa81e8d67f0d9755decde5c5b766f56b2af8db.tar.gz

diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index 9c6c14ee65..4f7cf74681 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -1,19 +1,28 @@
		1	# Use of this source code is governed by the MIT license.
		2	__license__ = "MIT"
		3
1	__all__ = [	4	__all__ = [
2	'LXMLTreeBuilderForXML',	5	'LXMLTreeBuilderForXML',
3	'LXMLTreeBuilder',	6	'LXMLTreeBuilder',
4	]	7	]
5		8
		9	try:
		10	from collections.abc import Callable # Python 3.6
		11	except ImportError as e:
		12	from collections import Callable
		13
6	from io import BytesIO	14	from io import BytesIO
7	from io import StringIO	15	from io import StringIO
8	import collections
9	from lxml import etree	16	from lxml import etree
10	from bs4.element import (	17	from bs4.element import (
11	Comment,	18	Comment,
12	Doctype,	19	Doctype,
13	NamespacedAttribute,	20	NamespacedAttribute,
14	ProcessingInstruction,	21	ProcessingInstruction,
		22	XMLProcessingInstruction,
15	)	23	)
16	from bs4.builder import (	24	from bs4.builder import (
		25	DetectsXMLParsedAsHTML,
17	FAST,	26	FAST,
18	HTML,	27	HTML,
19	HTMLTreeBuilder,	28	HTMLTreeBuilder,
@@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector
25		34
26	LXML = 'lxml'	35	LXML = 'lxml'
27		36
		37	def _invert(d):
		38	"Invert a dictionary."
		39	return dict((v,k) for k, v in list(d.items()))
		40
28	class LXMLTreeBuilderForXML(TreeBuilder):	41	class LXMLTreeBuilderForXML(TreeBuilder):
29	DEFAULT_PARSER_CLASS = etree.XMLParser	42	DEFAULT_PARSER_CLASS = etree.XMLParser
30		43
31	is_xml = True	44	is_xml = True
		45	processing_instruction_class = XMLProcessingInstruction
32		46
33	NAME = "lxml-xml"	47	NAME = "lxml-xml"
34	ALTERNATE_NAMES = ["xml"]	48	ALTERNATE_NAMES = ["xml"]
@@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder):
40		54
41	# This namespace mapping is specified in the XML Namespace	55	# This namespace mapping is specified in the XML Namespace
42	# standard.	56	# standard.
43	DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}	57	DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
		58
		59	DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
		60
		61	# NOTE: If we parsed Element objects and looked at .sourceline,
		62	# we'd be able to see the line numbers from the original document.
		63	# But instead we build an XMLParser or HTMLParser object to serve
		64	# as the target of parse messages, and those messages don't include
		65	# line numbers.
		66	# See: https://bugs.launchpad.net/lxml/+bug/1846906
		67
		68	def initialize_soup(self, soup):
		69	"""Let the BeautifulSoup object know about the standard namespace
		70	mapping.
		71
		72	:param soup: A `BeautifulSoup`.
		73	"""
		74	super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
		75	self._register_namespaces(self.DEFAULT_NSMAPS)
		76
		77	def _register_namespaces(self, mapping):
		78	"""Let the BeautifulSoup object know about namespaces encountered
		79	while parsing the document.
		80
		81	This might be useful later on when creating CSS selectors.
		82
		83	This will track (almost) all namespaces, even ones that were
		84	only in scope for part of the document. If two namespaces have
		85	the same prefix, only the first one encountered will be
		86	tracked. Un-prefixed namespaces are not tracked.
44		87
		88	:param mapping: A dictionary mapping namespace prefixes to URIs.
		89	"""
		90	for key, value in list(mapping.items()):
		91	# This is 'if key' and not 'if key is not None' because we
		92	# don't track un-prefixed namespaces. Soupselect will
		93	# treat an un-prefixed namespace as the default, which
		94	# causes confusion in some cases.
		95	if key and key not in self.soup._namespaces:
		96	# Let the BeautifulSoup object know about a new namespace.
		97	# If there are multiple namespaces defined with the same
		98	# prefix, the first one in the document takes precedence.
		99	self.soup._namespaces[key] = value
		100
45	def default_parser(self, encoding):	101	def default_parser(self, encoding):
46	# This can either return a parser object or a class, which	102	"""Find the default parser for the given encoding.
47	# will be instantiated with default arguments.	103
		104	:param encoding: A string.
		105	:return: Either a parser object or a class, which
		106	will be instantiated with default arguments.
		107	"""
48	if self._default_parser is not None:	108	if self._default_parser is not None:
49	return self._default_parser	109	return self._default_parser
50	return etree.XMLParser(	110	return etree.XMLParser(
51	target=self, strip_cdata=False, recover=True, encoding=encoding)	111	target=self, strip_cdata=False, recover=True, encoding=encoding)
52		112
53	def parser_for(self, encoding):	113	def parser_for(self, encoding):
		114	"""Instantiate an appropriate parser for the given encoding.
		115
		116	:param encoding: A string.
		117	:return: A parser object such as an `etree.XMLParser`.
		118	"""
54	# Use the default parser.	119	# Use the default parser.
55	parser = self.default_parser(encoding)	120	parser = self.default_parser(encoding)
56		121
57	if isinstance(parser, collections.Callable):	122	if isinstance(parser, Callable):
58	# Instantiate the parser with default arguments	123	# Instantiate the parser with default arguments
59	parser = parser(target=self, strip_cdata=False, encoding=encoding)	124	parser = parser(
		125	target=self, strip_cdata=False, recover=True, encoding=encoding
		126	)
60	return parser	127	return parser
61		128
62	def __init__(self, parser=None, empty_element_tags=None):	129	def __init__(self, parser=None, empty_element_tags=None, **kwargs):
63	# TODO: Issue a warning if parser is present but not a	130	# TODO: Issue a warning if parser is present but not a
64	# callable, since that means there's no way to create new	131	# callable, since that means there's no way to create new
65	# parsers for different encodings.	132	# parsers for different encodings.
@@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
67	if empty_element_tags is not None:	134	if empty_element_tags is not None:
68	self.empty_element_tags = set(empty_element_tags)	135	self.empty_element_tags = set(empty_element_tags)
69	self.soup = None	136	self.soup = None
70	self.nsmaps = [self.DEFAULT_NSMAPS]	137	self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
71		138	self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
		139	super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
		140
72	def _getNsTag(self, tag):	141	def _getNsTag(self, tag):
73	# Split the namespace URL out of a fully-qualified lxml tag	142	# Split the namespace URL out of a fully-qualified lxml tag
74	# name. Copied from lxml's src/lxml/sax.py.	143	# name. Copied from lxml's src/lxml/sax.py.
@@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder):
80	def prepare_markup(self, markup, user_specified_encoding=None,	149	def prepare_markup(self, markup, user_specified_encoding=None,
81	exclude_encodings=None,	150	exclude_encodings=None,
82	document_declared_encoding=None):	151	document_declared_encoding=None):
83	"""	152	"""Run any preliminary steps necessary to make incoming markup
84	:yield: A series of 4-tuples.	153	acceptable to the parser.
		154
		155	lxml really wants to get a bytestring and convert it to
		156	Unicode itself. So instead of using UnicodeDammit to convert
		157	the bytestring to Unicode using different encodings, this
		158	implementation uses EncodingDetector to iterate over the
		159	encodings, and tell lxml to try to parse the document as each
		160	one in turn.
		161
		162	:param markup: Some markup -- hopefully a bytestring.
		163	:param user_specified_encoding: The user asked to try this encoding.
		164	:param document_declared_encoding: The markup itself claims to be
		165	in this encoding.
		166	:param exclude_encodings: The user asked _not_ to try any of
		167	these encodings.
		168
		169	:yield: A series of 4-tuples:
85	(markup, encoding, declared encoding,	170	(markup, encoding, declared encoding,
86	has undergone character replacement)	171	has undergone character replacement)
87		172
88	Each 4-tuple represents a strategy for parsing the document.	173	Each 4-tuple represents a strategy for converting the
		174	document to Unicode and parsing it. Each strategy will be tried
		175	in turn.
89	"""	176	"""
		177	is_html = not self.is_xml
		178	if is_html:
		179	self.processing_instruction_class = ProcessingInstruction
		180	# We're in HTML mode, so if we're given XML, that's worth
		181	# noting.
		182	DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
		183	markup, stacklevel=3
		184	)
		185	else:
		186	self.processing_instruction_class = XMLProcessingInstruction
		187
90	if isinstance(markup, str):	188	if isinstance(markup, str):
91	# We were given Unicode. Maybe lxml can parse Unicode on	189	# We were given Unicode. Maybe lxml can parse Unicode on
92	# this system?	190	# this system?
		191
		192	# TODO: This is a workaround for
		193	# https://bugs.launchpad.net/lxml/+bug/1948551.
		194	# We can remove it once the upstream issue is fixed.
		195	if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
		196	markup = markup[1:]
93	yield markup, None, document_declared_encoding, False	197	yield markup, None, document_declared_encoding, False
94		198
95	if isinstance(markup, str):	199	if isinstance(markup, str):
@@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
98	yield (markup.encode("utf8"), "utf8",	202	yield (markup.encode("utf8"), "utf8",
99	document_declared_encoding, False)	203	document_declared_encoding, False)
100		204
101	# Instead of using UnicodeDammit to convert the bytestring to	205	# This was provided by the end-user; treat it as a known
102	# Unicode using different encodings, use EncodingDetector to	206	# definite encoding per the algorithm laid out in the HTML5
103	# iterate over the encodings, and tell lxml to try to parse	207	# spec. (See the EncodingDetector class for details.)
104	# the document as each one in turn.	208	known_definite_encodings = [user_specified_encoding]
105	is_html = not self.is_xml	209
106	try_encodings = [user_specified_encoding, document_declared_encoding]	210	# This was found in the document; treat it as a slightly lower-priority
		211	# user encoding.
		212	user_encodings = [document_declared_encoding]
107	detector = EncodingDetector(	213	detector = EncodingDetector(
108	markup, try_encodings, is_html, exclude_encodings)	214	markup, known_definite_encodings=known_definite_encodings,
		215	user_encodings=user_encodings, is_html=is_html,
		216	exclude_encodings=exclude_encodings
		217	)
109	for encoding in detector.encodings:	218	for encoding in detector.encodings:
110	yield (detector.markup, encoding, document_declared_encoding, False)	219	yield (detector.markup, encoding, document_declared_encoding, False)
111		220
@@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder):
128	self.parser.feed(data)	237	self.parser.feed(data)
129	self.parser.close()	238	self.parser.close()
130	except (UnicodeDecodeError, LookupError, etree.ParserError) as e:	239	except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
131	raise ParserRejectedMarkup(str(e))	240	raise ParserRejectedMarkup(e)
132		241
133	def close(self):	242	def close(self):
134	self.nsmaps = [self.DEFAULT_NSMAPS]	243	self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
135		244
136	def start(self, name, attrs, nsmap={}):	245	def start(self, name, attrs, nsmap={}):
137	# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.	246	# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
138	attrs = dict(attrs)	247	attrs = dict(attrs)
139	nsprefix = None	248	nsprefix = None
140	# Invert each namespace map as it comes in.	249	# Invert each namespace map as it comes in.
141	if len(self.nsmaps) > 1:	250	if len(nsmap) == 0 and len(self.nsmaps) > 1:
142	# There are no new namespaces for this tag, but	251	# There are no new namespaces for this tag, but
143	# non-default namespaces are in play, so we need a	252	# non-default namespaces are in play, so we need a
144	# separate tag stack to know when they end.	253	# separate tag stack to know when they end.
145	self.nsmaps.append(None)	254	self.nsmaps.append(None)
146	elif len(nsmap) > 0:	255	elif len(nsmap) > 0:
147	# A new namespace mapping has come into play.	256	# A new namespace mapping has come into play.
148	inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))	257
149	self.nsmaps.append(inverted_nsmap)	258	# First, let the BeautifulSoup object know about it.
		259	self._register_namespaces(nsmap)
		260
		261	# Then, add it to our running list of inverted namespace
		262	# mappings.
		263	self.nsmaps.append(_invert(nsmap))
		264
		265	# The currently active namespace prefixes have
		266	# changed. Calculate the new mapping so it can be stored
		267	# with all Tag objects created while these prefixes are in
		268	# scope.
		269	current_mapping = dict(self.active_namespace_prefixes[-1])
		270	current_mapping.update(nsmap)
		271
		272	# We should not track un-prefixed namespaces as we can only hold one
		273	# and it will be recognized as the default namespace by soupsieve,
		274	# which may be confusing in some situations.
		275	if '' in current_mapping:
		276	del current_mapping['']
		277	self.active_namespace_prefixes.append(current_mapping)
		278
150	# Also treat the namespace mapping as a set of attributes on the	279	# Also treat the namespace mapping as a set of attributes on the
151	# tag, so we can recreate it later.	280	# tag, so we can recreate it later.
152	attrs = attrs.copy()	281	attrs = attrs.copy()
@@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
171		300
172	namespace, name = self._getNsTag(name)	301	namespace, name = self._getNsTag(name)
173	nsprefix = self._prefix_for_namespace(namespace)	302	nsprefix = self._prefix_for_namespace(namespace)
174	self.soup.handle_starttag(name, namespace, nsprefix, attrs)	303	self.soup.handle_starttag(
175		304	name, namespace, nsprefix, attrs,
		305	namespaces=self.active_namespace_prefixes[-1]
		306	)
		307
176	def _prefix_for_namespace(self, namespace):	308	def _prefix_for_namespace(self, namespace):
177	"""Find the currently active prefix for the given namespace."""	309	"""Find the currently active prefix for the given namespace."""
178	if namespace is None:	310	if namespace is None:
@@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
196	if len(self.nsmaps) > 1:	328	if len(self.nsmaps) > 1:
197	# This tag, or one of its parents, introduced a namespace	329	# This tag, or one of its parents, introduced a namespace
198	# mapping, so pop it off the stack.	330	# mapping, so pop it off the stack.
199	self.nsmaps.pop()	331	out_of_scope_nsmap = self.nsmaps.pop()
200		332
		333	if out_of_scope_nsmap is not None:
		334	# This tag introduced a namespace mapping which is no
		335	# longer in scope. Recalculate the currently active
		336	# namespace prefixes.
		337	self.active_namespace_prefixes.pop()
		338
201	def pi(self, target, data):	339	def pi(self, target, data):
202	self.soup.endData()	340	self.soup.endData()
203	self.soup.handle_data(target + ' ' + data)	341	data = target + ' ' + data
204	self.soup.endData(ProcessingInstruction)	342	self.soup.handle_data(data)
205		343	self.soup.endData(self.processing_instruction_class)
		344
206	def data(self, content):	345	def data(self, content):
207	self.soup.handle_data(content)	346	self.soup.handle_data(content)
208		347
@@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
229		368
230	features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]	369	features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
231	is_xml = False	370	is_xml = False
		371	processing_instruction_class = ProcessingInstruction
232		372
233	def default_parser(self, encoding):	373	def default_parser(self, encoding):
234	return etree.HTMLParser	374	return etree.HTMLParser
@@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
240	self.parser.feed(markup)	380	self.parser.feed(markup)
241	self.parser.close()	381	self.parser.close()
242	except (UnicodeDecodeError, LookupError, etree.ParserError) as e:	382	except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
243	raise ParserRejectedMarkup(str(e))	383	raise ParserRejectedMarkup(e)
244		384
245		385
246	def test_fragment_to_document(self, fragment):	386	def test_fragment_to_document(self, fragment):