1 files changed, 197 insertions, 54 deletions
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 9e9216ef9c..7c46a85118 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -1,9 +1,14 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 __all__ = [
    'HTML5TreeBuilder',
    ]
 import warnings
+import re
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
    PERMISSIVE,
    HTML,
    HTML_5,
@@ -11,17 +16,13 @@ from bs4.builder import (
    )
 from bs4.element import (
    NamespacedAttribute,
-    whitespace_re,
+    nonwhitespace_re,
 )
 import html5lib
-try:
+from html5lib.constants import (
-    # html5lib >= 0.99999999/1.0b9
+    namespaces,
-    from html5lib.treebuilders import base as treebuildersbase
+    prefixes,
-except ImportError:
+    )
-    # html5lib <= 0.9999999/1.0b8
-    from html5lib.treebuilders import _base as treebuildersbase
-from html5lib.constants import namespaces
 from bs4.element import (
    Comment,
    Doctype,
@@ -29,13 +30,37 @@ from bs4.element import (
    Tag,
    )
+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError as e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
 class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
+    """Use html5lib to build a tree.
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+      based on the name of the tag in which the string was found.
+    * You can't use a SoupStrainer to parse only part of a document.
+    """
    NAME = "html5lib"
    features = [NAME, PERMISSIVE, HTML_5, HTML]
+    # html5lib can tell us which line number and position in the
+    # original file is the source of an element.
+    TRACKS_LINE_NUMBERS = True
+    
    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
@@ -45,27 +70,56 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
-            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+            warnings.warn(
+                "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
+                stacklevel=3
+            )
+        # html5lib only parses HTML, so if it's given XML that's worth
+        # noting.
+        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
+            markup, stacklevel=3
+        )
        yield (markup, None, None, False)
    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
-            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+            warnings.warn(
+                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
+                stacklevel=4
+            )
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+        self.underlying_builder.parser = parser
+        extra_kwargs = dict()
+        if not isinstance(markup, str):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)
+        
        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, str):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
+        self.underlying_builder.parser = None
+            
    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
+            namespaceHTMLElements, self.soup,
+            store_line_numbers=self.store_line_numbers
+        )
        return self.underlying_builder
    def test_fragment_to_document(self, fragment):
@@ -73,12 +127,30 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        return '<html><head></head><body>%s</body></html>' % fragment
-class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+    
-    def __init__(self, soup, namespaceHTMLElements):
+    def __init__(self, namespaceHTMLElements, soup=None,
-        self.soup = soup
+                 store_line_numbers=True, **kwargs):
+        if soup:
+            self.soup = soup
+        else:
+            from bs4 import BeautifulSoup
+            # TODO: Why is the parser 'html.parser' here? To avoid an
+            # infinite loop?
+            self.soup = BeautifulSoup(
+                "", "html.parser", store_line_numbers=store_line_numbers,
+                **kwargs
+            )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+        # This will be set later to an html5lib.html5parser.HTMLParser
+        # object, which we can use to track the current line number.
+        self.parser = None
+        self.store_line_numbers = store_line_numbers
+        
    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)
@@ -92,14 +164,26 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
        self.soup.object_was_parsed(doctype)
    def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
+        kwargs = {}
+        if self.parser and self.store_line_numbers:
+            # This represents the point immediately after the end of the
+            # tag. We don't know when the tag started, but we do know
+            # where it ended -- the character just before this one.
+            sourceline, sourcepos = self.parser.tokenizer.stream.position()
+            kwargs['sourceline'] = sourceline
+            kwargs['sourcepos'] = sourcepos-1
+        tag = self.soup.new_tag(name, namespace, **kwargs)
        return Element(tag, self.soup, namespace)
    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)
    def fragmentClass(self):
-        self.soup = BeautifulSoup("")
+        from bs4 import BeautifulSoup
+        # TODO: Why is the parser 'html.parser' here? To avoid an
+        # infinite loop?
+        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)
@@ -111,7 +195,57 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
        return self.soup
    def getFragment(self):
-        return treebuildersbase.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
+    def testSerializer(self, element):
+        from bs4 import BeautifulSoup
+        rv = []
+        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+        def serializeElement(element, indent=0):
+            if isinstance(element, BeautifulSoup):
+                pass
+            if isinstance(element, Doctype):
+                m = doctype_re.match(element)
+                if m:
+                    name = m.group(1)
+                    if m.lastindex > 1:
+                        publicId = m.group(2) or ""
+                        systemId = m.group(3) or m.group(4) or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif isinstance(element, Comment):
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+            elif isinstance(element, NavigableString):
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                if element.namespace:
+                    name = "%s %s" % (prefixes[element.namespace],
+                                      element.name)
+                else:
+                    name = element.name
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.attrs:
+                    attributes = []
+                    for name, value in list(element.attrs.items()):
+                        if isinstance(name, NamespacedAttribute):
+                            name = "%s %s" % (prefixes[name.namespace], name.name)
+                        if isinstance(value, list):
+                            value = " ".join(value)
+                        attributes.append((name, value))
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                indent += 2
+                for child in element.children:
+                    serializeElement(child, indent)
+        serializeElement(element, 0)
+        return "\n".join(rv)
 class AttrList(object):
    def __init__(self, element):
@@ -122,14 +256,14 @@ class AttrList(object):
    def __setitem__(self, name, value):
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
-        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        list_attr = self.element.cdata_list_attributes or {}
-        if (name in list_attr['*']
+        if (name in list_attr.get('*', [])
            or (self.element.name in list_attr
-                and name in list_attr[self.element.name])):
+                and name in list_attr.get(self.element.name, []))):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
-                value = whitespace_re.split(value)
+                value = nonwhitespace_re.findall(value)
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
@@ -143,9 +277,9 @@ class AttrList(object):
        return name in list(self.attrs.keys())
-class Element(treebuildersbase.Node):
+class Element(treebuilder_base.Node):
    def __init__(self, element, soup, namespace):
-        treebuildersbase.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace
@@ -164,13 +298,15 @@ class Element(treebuildersbase.Node):
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
+            node.parent = self
        else:
            child = node.element
+            node.parent = self
        if not isinstance(child, str) and child.parent is not None:
            node.element.extract()
-        if (string_child and self.element.contents
+        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
@@ -203,12 +339,12 @@ class Element(treebuildersbase.Node):
                most_recent_element=most_recent_element)
    def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
        return AttrList(self.element)
    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:
            converted_attributes = []
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
@@ -230,11 +366,11 @@ class Element(treebuildersbase.Node):
    attributes = property(getAttributes, setAttributes)
    def insertText(self, data, insertBefore=None):
+        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
-            text = TextNode(self.soup.new_string(data), self.soup)
+            self.insertBefore(text, insertBefore)
-            self.insertBefore(data, insertBefore)
        else:
-            self.appendChild(data)
+            self.appendChild(text)
    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
@@ -253,9 +389,10 @@ class Element(treebuildersbase.Node):
    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
-        # print "MOVE", self.element.contents
+        # print("MOVE", self.element.contents)
-        # print "FROM", self.element
+        # print("FROM", self.element)
-        # print "TO", new_parent.element
+        # print("TO", new_parent.element)
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
@@ -274,29 +411,35 @@ class Element(treebuildersbase.Node):
            new_parents_last_descendant_next_element = new_parent_element.next_element
        to_append = element.contents
-        append_after = new_parent_element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
-            if new_parents_last_descendant:
+            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
-            if new_parents_last_descendant:
+            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
-            if new_parents_last_child:
+            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child
-            # Fix the last child's next_element and next_sibling
+            # Find the very last element being moved. It is now the
-            last_child = to_append[-1]
+            # parent's last descendant. It has no .next_sibling and
-            last_child.next_element = new_parents_last_descendant_next_element
+            # its .next_element is whatever the previous last
-            if new_parents_last_descendant_next_element:
+            # descendant had.
-                new_parents_last_descendant_next_element.previous_element = last_child
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
-            last_child.next_sibling = None
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
+            if new_parents_last_descendant_next_element is not None:
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None
        for child in to_append:
            child.parent = new_parent_element
@@ -306,9 +449,9 @@ class Element(treebuildersbase.Node):
        element.contents = []
        element.next_element = final_next_element
-        # print "DONE WITH MOVE"
+        # print("DONE WITH MOVE")
-        # print "FROM", self.element
+        # print("FROM", self.element)
-        # print "TO", new_parent_element
+        # print("TO", new_parent_element)
    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
@@ -321,7 +464,7 @@ class Element(treebuildersbase.Node):
        return self.element.contents
    def getNameTuple(self):
-        if self.namespace is None:
+        if self.namespace == None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name
@@ -330,7 +473,7 @@ class Element(treebuildersbase.Node):
 class TextNode(Element):
    def __init__(self, element, soup):
-        treebuildersbase.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 9e9216ef9c..7c46a85118 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -1,9 +1,14 @@
		1	# Use of this source code is governed by the MIT license.
		2	__license__ = "MIT"
		3
1	__all__ = [	4	__all__ = [
2	'HTML5TreeBuilder',	5	'HTML5TreeBuilder',
3	]	6	]
4		7
5	import warnings	8	import warnings
		9	import re
6	from bs4.builder import (	10	from bs4.builder import (
		11	DetectsXMLParsedAsHTML,
7	PERMISSIVE,	12	PERMISSIVE,
8	HTML,	13	HTML,
9	HTML_5,	14	HTML_5,
@@ -11,17 +16,13 @@ from bs4.builder import (
11	)	16	)
12	from bs4.element import (	17	from bs4.element import (
13	NamespacedAttribute,	18	NamespacedAttribute,
14	whitespace_re,	19	nonwhitespace_re,
15	)	20	)
16	import html5lib	21	import html5lib
17	try:	22	from html5lib.constants import (
18	# html5lib >= 0.99999999/1.0b9	23	namespaces,
19	from html5lib.treebuilders import base as treebuildersbase	24	prefixes,
20	except ImportError:	25	)
21	# html5lib <= 0.9999999/1.0b8
22	from html5lib.treebuilders import _base as treebuildersbase
23	from html5lib.constants import namespaces
24
25	from bs4.element import (	26	from bs4.element import (
26	Comment,	27	Comment,
27	Doctype,	28	Doctype,
@@ -29,13 +30,37 @@ from bs4.element import (
29	Tag,	30	Tag,
30	)	31	)
31		32
		33	try:
		34	# Pre-0.99999999
		35	from html5lib.treebuilders import _base as treebuilder_base
		36	new_html5lib = False
		37	except ImportError as e:
		38	# 0.99999999 and up
		39	from html5lib.treebuilders import base as treebuilder_base
		40	new_html5lib = True
		41
32	class HTML5TreeBuilder(HTMLTreeBuilder):	42	class HTML5TreeBuilder(HTMLTreeBuilder):
33	"""Use html5lib to build a tree."""	43	"""Use html5lib to build a tree.
		44
		45	Note that this TreeBuilder does not support some features common
		46	to HTML TreeBuilders. Some of these features could theoretically
		47	be implemented, but at the very least it's quite difficult,
		48	because html5lib moves the parse tree around as it's being built.
		49
		50	* This TreeBuilder doesn't use different subclasses of NavigableString
		51	based on the name of the tag in which the string was found.
		52
		53	* You can't use a SoupStrainer to parse only part of a document.
		54	"""
34		55
35	NAME = "html5lib"	56	NAME = "html5lib"
36		57
37	features = [NAME, PERMISSIVE, HTML_5, HTML]	58	features = [NAME, PERMISSIVE, HTML_5, HTML]
38		59
		60	# html5lib can tell us which line number and position in the
		61	# original file is the source of an element.
		62	TRACKS_LINE_NUMBERS = True
		63
39	def prepare_markup(self, markup, user_specified_encoding,	64	def prepare_markup(self, markup, user_specified_encoding,
40	document_declared_encoding=None, exclude_encodings=None):	65	document_declared_encoding=None, exclude_encodings=None):
41	# Store the user-specified encoding for use later on.	66	# Store the user-specified encoding for use later on.
@@ -45,27 +70,56 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
45	# ATM because the html5lib TreeBuilder doesn't use	70	# ATM because the html5lib TreeBuilder doesn't use
46	# UnicodeDammit.	71	# UnicodeDammit.
47	if exclude_encodings:	72	if exclude_encodings:
48	warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")	73	warnings.warn(
		74	"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
		75	stacklevel=3
		76	)
		77
		78	# html5lib only parses HTML, so if it's given XML that's worth
		79	# noting.
		80	DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
		81	markup, stacklevel=3
		82	)
		83
49	yield (markup, None, None, False)	84	yield (markup, None, None, False)
50		85
51	# These methods are defined by Beautiful Soup.	86	# These methods are defined by Beautiful Soup.
52	def feed(self, markup):	87	def feed(self, markup):
53	if self.soup.parse_only is not None:	88	if self.soup.parse_only is not None:
54	warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")	89	warnings.warn(
		90	"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
		91	stacklevel=4
		92	)
55	parser = html5lib.HTMLParser(tree=self.create_treebuilder)	93	parser = html5lib.HTMLParser(tree=self.create_treebuilder)
56	doc = parser.parse(markup, encoding=self.user_specified_encoding)	94	self.underlying_builder.parser = parser
57		95	extra_kwargs = dict()
		96	if not isinstance(markup, str):
		97	if new_html5lib:
		98	extra_kwargs['override_encoding'] = self.user_specified_encoding
		99	else:
		100	extra_kwargs['encoding'] = self.user_specified_encoding
		101	doc = parser.parse(markup, **extra_kwargs)
		102
58	# Set the character encoding detected by the tokenizer.	103	# Set the character encoding detected by the tokenizer.
59	if isinstance(markup, str):	104	if isinstance(markup, str):
60	# We need to special-case this because html5lib sets	105	# We need to special-case this because html5lib sets
61	# charEncoding to UTF-8 if it gets Unicode input.	106	# charEncoding to UTF-8 if it gets Unicode input.
62	doc.original_encoding = None	107	doc.original_encoding = None
63	else:	108	else:
64	doc.original_encoding = parser.tokenizer.stream.charEncoding[0]	109	original_encoding = parser.tokenizer.stream.charEncoding[0]
65		110	if not isinstance(original_encoding, str):
		111	# In 0.99999999 and up, the encoding is an html5lib
		112	# Encoding object. We want to use a string for compatibility
		113	# with other tree builders.
		114	original_encoding = original_encoding.name
		115	doc.original_encoding = original_encoding
		116	self.underlying_builder.parser = None
		117
66	def create_treebuilder(self, namespaceHTMLElements):	118	def create_treebuilder(self, namespaceHTMLElements):
67	self.underlying_builder = TreeBuilderForHtml5lib(	119	self.underlying_builder = TreeBuilderForHtml5lib(
68	self.soup, namespaceHTMLElements)	120	namespaceHTMLElements, self.soup,
		121	store_line_numbers=self.store_line_numbers
		122	)
69	return self.underlying_builder	123	return self.underlying_builder
70		124
71	def test_fragment_to_document(self, fragment):	125	def test_fragment_to_document(self, fragment):
@@ -73,12 +127,30 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
73	return '<html><head></head><body>%s</body></html>' % fragment	127	return '<html><head></head><body>%s</body></html>' % fragment
74		128
75		129
76	class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):	130	class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
77		131
78	def __init__(self, soup, namespaceHTMLElements):	132	def __init__(self, namespaceHTMLElements, soup=None,
79	self.soup = soup	133	store_line_numbers=True, **kwargs):
		134	if soup:
		135	self.soup = soup
		136	else:
		137	from bs4 import BeautifulSoup
		138	# TODO: Why is the parser 'html.parser' here? To avoid an
		139	# infinite loop?
		140	self.soup = BeautifulSoup(
		141	"", "html.parser", store_line_numbers=store_line_numbers,
		142	**kwargs
		143	)
		144	# TODO: What are **kwargs exactly? Should they be passed in
		145	# here in addition to/instead of being passed to the BeautifulSoup
		146	# constructor?
80	super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)	147	super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
81		148
		149	# This will be set later to an html5lib.html5parser.HTMLParser
		150	# object, which we can use to track the current line number.
		151	self.parser = None
		152	self.store_line_numbers = store_line_numbers
		153
82	def documentClass(self):	154	def documentClass(self):
83	self.soup.reset()	155	self.soup.reset()
84	return Element(self.soup, self.soup, None)	156	return Element(self.soup, self.soup, None)
@@ -92,14 +164,26 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
92	self.soup.object_was_parsed(doctype)	164	self.soup.object_was_parsed(doctype)
93		165
94	def elementClass(self, name, namespace):	166	def elementClass(self, name, namespace):
95	tag = self.soup.new_tag(name, namespace)	167	kwargs = {}
		168	if self.parser and self.store_line_numbers:
		169	# This represents the point immediately after the end of the
		170	# tag. We don't know when the tag started, but we do know
		171	# where it ended -- the character just before this one.
		172	sourceline, sourcepos = self.parser.tokenizer.stream.position()
		173	kwargs['sourceline'] = sourceline
		174	kwargs['sourcepos'] = sourcepos-1
		175	tag = self.soup.new_tag(name, namespace, **kwargs)
		176
96	return Element(tag, self.soup, namespace)	177	return Element(tag, self.soup, namespace)
97		178
98	def commentClass(self, data):	179	def commentClass(self, data):
99	return TextNode(Comment(data), self.soup)	180	return TextNode(Comment(data), self.soup)
100		181
101	def fragmentClass(self):	182	def fragmentClass(self):
102	self.soup = BeautifulSoup("")	183	from bs4 import BeautifulSoup
		184	# TODO: Why is the parser 'html.parser' here? To avoid an
		185	# infinite loop?
		186	self.soup = BeautifulSoup("", "html.parser")
103	self.soup.name = "[document_fragment]"	187	self.soup.name = "[document_fragment]"
104	return Element(self.soup, self.soup, None)	188	return Element(self.soup, self.soup, None)
105		189
@@ -111,7 +195,57 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
111	return self.soup	195	return self.soup
112		196
113	def getFragment(self):	197	def getFragment(self):
114	return treebuildersbase.TreeBuilder.getFragment(self).element	198	return treebuilder_base.TreeBuilder.getFragment(self).element
		199
		200	def testSerializer(self, element):
		201	from bs4 import BeautifulSoup
		202	rv = []
		203	doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
		204
		205	def serializeElement(element, indent=0):
		206	if isinstance(element, BeautifulSoup):
		207	pass
		208	if isinstance(element, Doctype):
		209	m = doctype_re.match(element)
		210	if m:
		211	name = m.group(1)
		212	if m.lastindex > 1:
		213	publicId = m.group(2) or ""
		214	systemId = m.group(3) or m.group(4) or ""
		215	rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
		216	(' ' * indent, name, publicId, systemId))
		217	else:
		218	rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
		219	else:
		220	rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
		221	elif isinstance(element, Comment):
		222	rv.append("|%s<!-- %s -->" % (' ' * indent, element))
		223	elif isinstance(element, NavigableString):
		224	rv.append("|%s\"%s\"" % (' ' * indent, element))
		225	else:
		226	if element.namespace:
		227	name = "%s %s" % (prefixes[element.namespace],
		228	element.name)
		229	else:
		230	name = element.name
		231	rv.append("|%s<%s>" % (' ' * indent, name))
		232	if element.attrs:
		233	attributes = []
		234	for name, value in list(element.attrs.items()):
		235	if isinstance(name, NamespacedAttribute):
		236	name = "%s %s" % (prefixes[name.namespace], name.name)
		237	if isinstance(value, list):
		238	value = " ".join(value)
		239	attributes.append((name, value))
		240
		241	for name, value in sorted(attributes):
		242	rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
		243	indent += 2
		244	for child in element.children:
		245	serializeElement(child, indent)
		246	serializeElement(element, 0)
		247
		248	return "\n".join(rv)
115		249
116	class AttrList(object):	250	class AttrList(object):
117	def __init__(self, element):	251	def __init__(self, element):
@@ -122,14 +256,14 @@ class AttrList(object):
122	def __setitem__(self, name, value):	256	def __setitem__(self, name, value):
123	# If this attribute is a multi-valued attribute for this element,	257	# If this attribute is a multi-valued attribute for this element,
124	# turn its value into a list.	258	# turn its value into a list.
125	list_attr = HTML5TreeBuilder.cdata_list_attributes	259	list_attr = self.element.cdata_list_attributes or {}
126	if (name in list_attr['*']	260	if (name in list_attr.get('*', [])
127	or (self.element.name in list_attr	261	or (self.element.name in list_attr
128	and name in list_attr[self.element.name])):	262	and name in list_attr.get(self.element.name, []))):
129	# A node that is being cloned may have already undergone	263	# A node that is being cloned may have already undergone
130	# this procedure.	264	# this procedure.
131	if not isinstance(value, list):	265	if not isinstance(value, list):
132	value = whitespace_re.split(value)	266	value = nonwhitespace_re.findall(value)
133	self.element[name] = value	267	self.element[name] = value
134	def items(self):	268	def items(self):
135	return list(self.attrs.items())	269	return list(self.attrs.items())
@@ -143,9 +277,9 @@ class AttrList(object):
143	return name in list(self.attrs.keys())	277	return name in list(self.attrs.keys())
144		278
145		279
146	class Element(treebuildersbase.Node):	280	class Element(treebuilder_base.Node):
147	def __init__(self, element, soup, namespace):	281	def __init__(self, element, soup, namespace):
148	treebuildersbase.Node.__init__(self, element.name)	282	treebuilder_base.Node.__init__(self, element.name)
149	self.element = element	283	self.element = element
150	self.soup = soup	284	self.soup = soup
151	self.namespace = namespace	285	self.namespace = namespace
@@ -164,13 +298,15 @@ class Element(treebuildersbase.Node):
164	child = node	298	child = node
165	elif node.element.__class__ == NavigableString:	299	elif node.element.__class__ == NavigableString:
166	string_child = child = node.element	300	string_child = child = node.element
		301	node.parent = self
167	else:	302	else:
168	child = node.element	303	child = node.element
		304	node.parent = self
169		305
170	if not isinstance(child, str) and child.parent is not None:	306	if not isinstance(child, str) and child.parent is not None:
171	node.element.extract()	307	node.element.extract()
172		308
173	if (string_child and self.element.contents	309	if (string_child is not None and self.element.contents
174	and self.element.contents[-1].__class__ == NavigableString):	310	and self.element.contents[-1].__class__ == NavigableString):
175	# We are appending a string onto another string.	311	# We are appending a string onto another string.
176	# TODO This has O(n^2) performance, for input like	312	# TODO This has O(n^2) performance, for input like
@@ -203,12 +339,12 @@ class Element(treebuildersbase.Node):
203	most_recent_element=most_recent_element)	339	most_recent_element=most_recent_element)
204		340
205	def getAttributes(self):	341	def getAttributes(self):
		342	if isinstance(self.element, Comment):
		343	return {}
206	return AttrList(self.element)	344	return AttrList(self.element)
207		345
208	def setAttributes(self, attributes):	346	def setAttributes(self, attributes):
209
210	if attributes is not None and len(attributes) > 0:	347	if attributes is not None and len(attributes) > 0:
211
212	converted_attributes = []	348	converted_attributes = []
213	for name, value in list(attributes.items()):	349	for name, value in list(attributes.items()):
214	if isinstance(name, tuple):	350	if isinstance(name, tuple):
@@ -230,11 +366,11 @@ class Element(treebuildersbase.Node):
230	attributes = property(getAttributes, setAttributes)	366	attributes = property(getAttributes, setAttributes)
231		367
232	def insertText(self, data, insertBefore=None):	368	def insertText(self, data, insertBefore=None):
		369	text = TextNode(self.soup.new_string(data), self.soup)
233	if insertBefore:	370	if insertBefore:
234	text = TextNode(self.soup.new_string(data), self.soup)	371	self.insertBefore(text, insertBefore)
235	self.insertBefore(data, insertBefore)
236	else:	372	else:
237	self.appendChild(data)	373	self.appendChild(text)
238		374
239	def insertBefore(self, node, refNode):	375	def insertBefore(self, node, refNode):
240	index = self.element.index(refNode.element)	376	index = self.element.index(refNode.element)
@@ -253,9 +389,10 @@ class Element(treebuildersbase.Node):
253		389
254	def reparentChildren(self, new_parent):	390	def reparentChildren(self, new_parent):
255	"""Move all of this tag's children into another tag."""	391	"""Move all of this tag's children into another tag."""
256	# print "MOVE", self.element.contents	392	# print("MOVE", self.element.contents)
257	# print "FROM", self.element	393	# print("FROM", self.element)
258	# print "TO", new_parent.element	394	# print("TO", new_parent.element)
		395
259	element = self.element	396	element = self.element
260	new_parent_element = new_parent.element	397	new_parent_element = new_parent.element
261	# Determine what this tag's next_element will be once all the children	398	# Determine what this tag's next_element will be once all the children
@@ -274,29 +411,35 @@ class Element(treebuildersbase.Node):
274	new_parents_last_descendant_next_element = new_parent_element.next_element	411	new_parents_last_descendant_next_element = new_parent_element.next_element
275		412
276	to_append = element.contents	413	to_append = element.contents
277	append_after = new_parent_element.contents
278	if len(to_append) > 0:	414	if len(to_append) > 0:
279	# Set the first child's previous_element and previous_sibling	415	# Set the first child's previous_element and previous_sibling
280	# to elements within the new parent	416	# to elements within the new parent
281	first_child = to_append[0]	417	first_child = to_append[0]
282	if new_parents_last_descendant:	418	if new_parents_last_descendant is not None:
283	first_child.previous_element = new_parents_last_descendant	419	first_child.previous_element = new_parents_last_descendant
284	else:	420	else:
285	first_child.previous_element = new_parent_element	421	first_child.previous_element = new_parent_element
286	first_child.previous_sibling = new_parents_last_child	422	first_child.previous_sibling = new_parents_last_child
287	if new_parents_last_descendant:	423	if new_parents_last_descendant is not None:
288	new_parents_last_descendant.next_element = first_child	424	new_parents_last_descendant.next_element = first_child
289	else:	425	else:
290	new_parent_element.next_element = first_child	426	new_parent_element.next_element = first_child
291	if new_parents_last_child:	427	if new_parents_last_child is not None:
292	new_parents_last_child.next_sibling = first_child	428	new_parents_last_child.next_sibling = first_child
293		429
294	# Fix the last child's next_element and next_sibling	430	# Find the very last element being moved. It is now the
295	last_child = to_append[-1]	431	# parent's last descendant. It has no .next_sibling and
296	last_child.next_element = new_parents_last_descendant_next_element	432	# its .next_element is whatever the previous last
297	if new_parents_last_descendant_next_element:	433	# descendant had.
298	new_parents_last_descendant_next_element.previous_element = last_child	434	last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
299	last_child.next_sibling = None	435
		436	last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
		437	if new_parents_last_descendant_next_element is not None:
		438	# TODO: This code has no test coverage and I'm not sure
		439	# how to get html5lib to go through this path, but it's
		440	# just the other side of the previous line.
		441	new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
		442	last_childs_last_descendant.next_sibling = None
300		443
301	for child in to_append:	444	for child in to_append:
302	child.parent = new_parent_element	445	child.parent = new_parent_element
@@ -306,9 +449,9 @@ class Element(treebuildersbase.Node):
306	element.contents = []	449	element.contents = []
307	element.next_element = final_next_element	450	element.next_element = final_next_element
308		451
309	# print "DONE WITH MOVE"	452	# print("DONE WITH MOVE")
310	# print "FROM", self.element	453	# print("FROM", self.element)
311	# print "TO", new_parent_element	454	# print("TO", new_parent_element)
312		455
313	def cloneNode(self):	456	def cloneNode(self):
314	tag = self.soup.new_tag(self.element.name, self.namespace)	457	tag = self.soup.new_tag(self.element.name, self.namespace)
@@ -321,7 +464,7 @@ class Element(treebuildersbase.Node):
321	return self.element.contents	464	return self.element.contents
322		465
323	def getNameTuple(self):	466	def getNameTuple(self):
324	if self.namespace is None:	467	if self.namespace == None:
325	return namespaces["html"], self.name	468	return namespaces["html"], self.name
326	else:	469	else:
327	return self.namespace, self.name	470	return self.namespace, self.name
@@ -330,7 +473,7 @@ class Element(treebuildersbase.Node):
330		473
331	class TextNode(Element):	474	class TextNode(Element):
332	def __init__(self, element, soup):	475	def __init__(self, element, soup):
333	treebuildersbase.Node.__init__(self, None)	476	treebuilder_base.Node.__init__(self, None)
334	self.element = element	477	self.element = element
335	self.soup = soup	478	self.soup = soup
336		479