summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder
diff options
context:
space:
mode:
author: Richard Purdie <richard.purdie@linuxfoundation.org> 2016-05-06 09:06:51 +0100
committer: Richard Purdie <richard.purdie@linuxfoundation.org> 2016-06-02 08:24:02 +0100
commit: 822eabf32dd69346071bd25fc3639db252d2f346 (patch)
tree: edac6d1d0d5114a4e3c72fea5589c069453b72d2 /bitbake/lib/bs4/builder
parent: 4f8959324df3b89487973bd4e8de21debb0a12ef (diff)
download: poky-822eabf32dd69346071bd25fc3639db252d2f346.tar.gz
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's recommendation for Python 3 use. (Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad) Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/builder')
-rw-r--r-- bitbake/lib/bs4/builder/__init__.py 7
-rw-r--r-- bitbake/lib/bs4/builder/_html5lib.py 71
-rw-r--r-- bitbake/lib/bs4/builder/_htmlparser.py 56
-rw-r--r-- bitbake/lib/bs4/builder/_lxml.py 47
4 files changed, 125 insertions, 56 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
index 740f5f29cd..6ccd4d23d6 100644
--- a/bitbake/lib/bs4/builder/__init__.py
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
80class TreeBuilder(object): 80class TreeBuilder(object):
81 """Turn a document into a Beautiful Soup object tree.""" 81 """Turn a document into a Beautiful Soup object tree."""
82 82
83 NAME = "[Unknown tree builder]"
84 ALTERNATE_NAMES = []
83 features = [] 85 features = []
84 86
85 is_xml = False 87 is_xml = False
88 picklable = False
86 preserve_whitespace_tags = set() 89 preserve_whitespace_tags = set()
87 empty_element_tags = None # A tag will be considered an empty-element 90 empty_element_tags = None # A tag will be considered an empty-element
88 # tag when and only when it has no contents. 91 # tag when and only when it has no contents.
@@ -153,13 +156,13 @@ class TreeBuilder(object):
153 universal = self.cdata_list_attributes.get('*', []) 156 universal = self.cdata_list_attributes.get('*', [])
154 tag_specific = self.cdata_list_attributes.get( 157 tag_specific = self.cdata_list_attributes.get(
155 tag_name.lower(), None) 158 tag_name.lower(), None)
156 for attr in attrs.keys(): 159 for attr in list(attrs.keys()):
157 if attr in universal or (tag_specific and attr in tag_specific): 160 if attr in universal or (tag_specific and attr in tag_specific):
158 # We have a "class"-type attribute whose string 161 # We have a "class"-type attribute whose string
159 # value is a whitespace-separated list of 162 # value is a whitespace-separated list of
160 # values. Split it into a list. 163 # values. Split it into a list.
161 value = attrs[attr] 164 value = attrs[attr]
162 if isinstance(value, basestring): 165 if isinstance(value, str):
163 values = whitespace_re.split(value) 166 values = whitespace_re.split(value)
164 else: 167 else:
165 # html5lib sometimes calls setAttributes twice 168 # html5lib sometimes calls setAttributes twice
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 7de36ae75e..f0e5924ebb 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -2,6 +2,7 @@ __all__ = [
2 'HTML5TreeBuilder', 2 'HTML5TreeBuilder',
3 ] 3 ]
4 4
5from pdb import set_trace
5import warnings 6import warnings
6from bs4.builder import ( 7from bs4.builder import (
7 PERMISSIVE, 8 PERMISSIVE,
@@ -9,7 +10,10 @@ from bs4.builder import (
9 HTML_5, 10 HTML_5,
10 HTMLTreeBuilder, 11 HTMLTreeBuilder,
11 ) 12 )
12from bs4.element import NamespacedAttribute 13from bs4.element import (
14 NamespacedAttribute,
15 whitespace_re,
16)
13import html5lib 17import html5lib
14from html5lib.constants import namespaces 18from html5lib.constants import namespaces
15from bs4.element import ( 19from bs4.element import (
@@ -22,11 +26,20 @@ from bs4.element import (
22class HTML5TreeBuilder(HTMLTreeBuilder): 26class HTML5TreeBuilder(HTMLTreeBuilder):
23 """Use html5lib to build a tree.""" 27 """Use html5lib to build a tree."""
24 28
25 features = ['html5lib', PERMISSIVE, HTML_5, HTML] 29 NAME = "html5lib"
30
31 features = [NAME, PERMISSIVE, HTML_5, HTML]
26 32
27 def prepare_markup(self, markup, user_specified_encoding): 33 def prepare_markup(self, markup, user_specified_encoding,
34 document_declared_encoding=None, exclude_encodings=None):
28 # Store the user-specified encoding for use later on. 35 # Store the user-specified encoding for use later on.
29 self.user_specified_encoding = user_specified_encoding 36 self.user_specified_encoding = user_specified_encoding
37
38 # document_declared_encoding and exclude_encodings aren't used
39 # ATM because the html5lib TreeBuilder doesn't use
40 # UnicodeDammit.
41 if exclude_encodings:
42 warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
30 yield (markup, None, None, False) 43 yield (markup, None, None, False)
31 44
32 # These methods are defined by Beautiful Soup. 45 # These methods are defined by Beautiful Soup.
@@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
37 doc = parser.parse(markup, encoding=self.user_specified_encoding) 50 doc = parser.parse(markup, encoding=self.user_specified_encoding)
38 51
39 # Set the character encoding detected by the tokenizer. 52 # Set the character encoding detected by the tokenizer.
40 if isinstance(markup, unicode): 53 if isinstance(markup, str):
41 # We need to special-case this because html5lib sets 54 # We need to special-case this because html5lib sets
42 # charEncoding to UTF-8 if it gets Unicode input. 55 # charEncoding to UTF-8 if it gets Unicode input.
43 doc.original_encoding = None 56 doc.original_encoding = None
@@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
51 64
52 def test_fragment_to_document(self, fragment): 65 def test_fragment_to_document(self, fragment):
53 """See `TreeBuilder`.""" 66 """See `TreeBuilder`."""
54 return u'<html><head></head><body>%s</body></html>' % fragment 67 return '<html><head></head><body>%s</body></html>' % fragment
55 68
56 69
57class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 70class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@@ -101,7 +114,16 @@ class AttrList(object):
101 def __iter__(self): 114 def __iter__(self):
102 return list(self.attrs.items()).__iter__() 115 return list(self.attrs.items()).__iter__()
103 def __setitem__(self, name, value): 116 def __setitem__(self, name, value):
104 "set attr", name, value 117 # If this attribute is a multi-valued attribute for this element,
118 # turn its value into a list.
119 list_attr = HTML5TreeBuilder.cdata_list_attributes
120 if (name in list_attr['*']
121 or (self.element.name in list_attr
122 and name in list_attr[self.element.name])):
123 # A node that is being cloned may have already undergone
124 # this procedure.
125 if not isinstance(value, list):
126 value = whitespace_re.split(value)
105 self.element[name] = value 127 self.element[name] = value
106 def items(self): 128 def items(self):
107 return list(self.attrs.items()) 129 return list(self.attrs.items())
@@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node):
124 146
125 def appendChild(self, node): 147 def appendChild(self, node):
126 string_child = child = None 148 string_child = child = None
127 if isinstance(node, basestring): 149 if isinstance(node, str):
128 # Some other piece of code decided to pass in a string 150 # Some other piece of code decided to pass in a string
129 # instead of creating a TextElement object to contain the 151 # instead of creating a TextElement object to contain the
130 # string. 152 # string.
@@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node):
139 else: 161 else:
140 child = node.element 162 child = node.element
141 163
142 if not isinstance(child, basestring) and child.parent is not None: 164 if not isinstance(child, str) and child.parent is not None:
143 node.element.extract() 165 node.element.extract()
144 166
145 if (string_child and self.element.contents 167 if (string_child and self.element.contents
@@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node):
152 old_element.replace_with(new_element) 174 old_element.replace_with(new_element)
153 self.soup._most_recent_element = new_element 175 self.soup._most_recent_element = new_element
154 else: 176 else:
155 if isinstance(node, basestring): 177 if isinstance(node, str):
156 # Create a brand new NavigableString from this string. 178 # Create a brand new NavigableString from this string.
157 child = self.soup.new_string(node) 179 child = self.soup.new_string(node)
158 180
@@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node):
161 # immediately after the parent, if it has no children.) 183 # immediately after the parent, if it has no children.)
162 if self.element.contents: 184 if self.element.contents:
163 most_recent_element = self.element._last_descendant(False) 185 most_recent_element = self.element._last_descendant(False)
186 elif self.element.next_element is not None:
187 # Something from further ahead in the parse tree is
188 # being inserted into this earlier element. This is
189 # very annoying because it means an expensive search
190 # for the last element in the tree.
191 most_recent_element = self.soup._last_descendant()
164 else: 192 else:
165 most_recent_element = self.element 193 most_recent_element = self.element
166 194
@@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node):
172 return AttrList(self.element) 200 return AttrList(self.element)
173 201
174 def setAttributes(self, attributes): 202 def setAttributes(self, attributes):
203
175 if attributes is not None and len(attributes) > 0: 204 if attributes is not None and len(attributes) > 0:
176 205
177 converted_attributes = [] 206 converted_attributes = []
@@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node):
183 212
184 self.soup.builder._replace_cdata_list_attribute_values( 213 self.soup.builder._replace_cdata_list_attribute_values(
185 self.name, attributes) 214 self.name, attributes)
186 for name, value in attributes.items(): 215 for name, value in list(attributes.items()):
187 self.element[name] = value 216 self.element[name] = value
188 217
189 # The attributes may contain variables that need substitution. 218 # The attributes may contain variables that need substitution.
@@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node):
218 247
219 def reparentChildren(self, new_parent): 248 def reparentChildren(self, new_parent):
220 """Move all of this tag's children into another tag.""" 249 """Move all of this tag's children into another tag."""
250 # print "MOVE", self.element.contents
251 # print "FROM", self.element
252 # print "TO", new_parent.element
221 element = self.element 253 element = self.element
222 new_parent_element = new_parent.element 254 new_parent_element = new_parent.element
223 # Determine what this tag's next_element will be once all the children 255 # Determine what this tag's next_element will be once all the children
@@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node):
236 new_parents_last_descendant_next_element = new_parent_element.next_element 268 new_parents_last_descendant_next_element = new_parent_element.next_element
237 269
238 to_append = element.contents 270 to_append = element.contents
239 append_after = new_parent.element.contents 271 append_after = new_parent_element.contents
240 if len(to_append) > 0: 272 if len(to_append) > 0:
241 # Set the first child's previous_element and previous_sibling 273 # Set the first child's previous_element and previous_sibling
242 # to elements within the new parent 274 # to elements within the new parent
243 first_child = to_append[0] 275 first_child = to_append[0]
244 first_child.previous_element = new_parents_last_descendant 276 if new_parents_last_descendant:
277 first_child.previous_element = new_parents_last_descendant
278 else:
279 first_child.previous_element = new_parent_element
245 first_child.previous_sibling = new_parents_last_child 280 first_child.previous_sibling = new_parents_last_child
281 if new_parents_last_descendant:
282 new_parents_last_descendant.next_element = first_child
283 else:
284 new_parent_element.next_element = first_child
285 if new_parents_last_child:
286 new_parents_last_child.next_sibling = first_child
246 287
247 # Fix the last child's next_element and next_sibling 288 # Fix the last child's next_element and next_sibling
248 last_child = to_append[-1] 289 last_child = to_append[-1]
249 last_child.next_element = new_parents_last_descendant_next_element 290 last_child.next_element = new_parents_last_descendant_next_element
291 if new_parents_last_descendant_next_element:
292 new_parents_last_descendant_next_element.previous_element = last_child
250 last_child.next_sibling = None 293 last_child.next_sibling = None
251 294
252 for child in to_append: 295 for child in to_append:
@@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node):
257 element.contents = [] 300 element.contents = []
258 element.next_element = final_next_element 301 element.next_element = final_next_element
259 302
303 # print "DONE WITH MOVE"
304 # print "FROM", self.element
305 # print "TO", new_parent_element
306
260 def cloneNode(self): 307 def cloneNode(self):
261 tag = self.soup.new_tag(self.element.name, self.namespace) 308 tag = self.soup.new_tag(self.element.name, self.namespace)
262 node = Element(tag, self.soup, self.namespace) 309 node = Element(tag, self.soup, self.namespace)
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index ca8d8b892b..bb0a63f2f3 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -4,10 +4,16 @@ __all__ = [
4 'HTMLParserTreeBuilder', 4 'HTMLParserTreeBuilder',
5 ] 5 ]
6 6
7from HTMLParser import ( 7from html.parser import HTMLParser
8 HTMLParser, 8
9 HTMLParseError, 9try:
10 ) 10 from html.parser import HTMLParseError
11except ImportError as e:
12 # HTMLParseError is removed in Python 3.5. Since it can never be
13 # thrown in 3.5, we can just define our own class as a placeholder.
14 class HTMLParseError(Exception):
15 pass
16
11import sys 17import sys
12import warnings 18import warnings
13 19
@@ -19,10 +25,10 @@ import warnings
19# At the end of this file, we monkeypatch HTMLParser so that 25# At the end of this file, we monkeypatch HTMLParser so that
20# strict=True works well on Python 3.2.2. 26# strict=True works well on Python 3.2.2.
21major, minor, release = sys.version_info[:3] 27major, minor, release = sys.version_info[:3]
22CONSTRUCTOR_TAKES_STRICT = ( 28CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
23 major > 3 29CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
24 or (major == 3 and minor > 2) 30CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
25 or (major == 3 and minor == 2 and release >= 3)) 31
26 32
27from bs4.element import ( 33from bs4.element import (
28 CData, 34 CData,
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
63 69
64 def handle_charref(self, name): 70 def handle_charref(self, name):
65 # XXX workaround for a bug in HTMLParser. Remove this once 71 # XXX workaround for a bug in HTMLParser. Remove this once
66 # it's fixed. 72 # it's fixed in all supported versions.
73 # http://bugs.python.org/issue13633
67 if name.startswith('x'): 74 if name.startswith('x'):
68 real_name = int(name.lstrip('x'), 16) 75 real_name = int(name.lstrip('x'), 16)
69 elif name.startswith('X'): 76 elif name.startswith('X'):
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
72 real_name = int(name) 79 real_name = int(name)
73 80
74 try: 81 try:
75 data = unichr(real_name) 82 data = chr(real_name)
76 except (ValueError, OverflowError), e: 83 except (ValueError, OverflowError) as e:
77 data = u"\N{REPLACEMENT CHARACTER}" 84 data = "\N{REPLACEMENT CHARACTER}"
78 85
79 self.handle_data(data) 86 self.handle_data(data)
80 87
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
113 120
114 def handle_pi(self, data): 121 def handle_pi(self, data):
115 self.soup.endData() 122 self.soup.endData()
116 if data.endswith("?") and data.lower().startswith("xml"):
117 # "An XHTML processing instruction using the trailing '?'
118 # will cause the '?' to be included in data." - HTMLParser
119 # docs.
120 #
121 # Strip the question mark so we don't end up with two
122 # question marks.
123 data = data[:-1]
124 self.soup.handle_data(data) 123 self.soup.handle_data(data)
125 self.soup.endData(ProcessingInstruction) 124 self.soup.endData(ProcessingInstruction)
126 125
@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser):
128class HTMLParserTreeBuilder(HTMLTreeBuilder): 127class HTMLParserTreeBuilder(HTMLTreeBuilder):
129 128
130 is_xml = False 129 is_xml = False
131 features = [HTML, STRICT, HTMLPARSER] 130 picklable = True
131 NAME = HTMLPARSER
132 features = [NAME, HTML, STRICT]
132 133
133 def __init__(self, *args, **kwargs): 134 def __init__(self, *args, **kwargs):
134 if CONSTRUCTOR_TAKES_STRICT: 135 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
135 kwargs['strict'] = False 136 kwargs['strict'] = False
137 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 kwargs['convert_charrefs'] = False
136 self.parser_args = (args, kwargs) 139 self.parser_args = (args, kwargs)
137 140
138 def prepare_markup(self, markup, user_specified_encoding=None, 141 def prepare_markup(self, markup, user_specified_encoding=None,
139 document_declared_encoding=None): 142 document_declared_encoding=None, exclude_encodings=None):
140 """ 143 """
141 :return: A 4-tuple (markup, original encoding, encoding 144 :return: A 4-tuple (markup, original encoding, encoding
142 declared within markup, whether any characters had to be 145 declared within markup, whether any characters had to be
143 replaced with REPLACEMENT CHARACTER). 146 replaced with REPLACEMENT CHARACTER).
144 """ 147 """
145 if isinstance(markup, unicode): 148 if isinstance(markup, str):
146 yield (markup, None, None, False) 149 yield (markup, None, None, False)
147 return 150 return
148 151
149 try_encodings = [user_specified_encoding, document_declared_encoding] 152 try_encodings = [user_specified_encoding, document_declared_encoding]
150 dammit = UnicodeDammit(markup, try_encodings, is_html=True) 153 dammit = UnicodeDammit(markup, try_encodings, is_html=True,
154 exclude_encodings=exclude_encodings)
151 yield (dammit.markup, dammit.original_encoding, 155 yield (dammit.markup, dammit.original_encoding,
152 dammit.declared_html_encoding, 156 dammit.declared_html_encoding,
153 dammit.contains_replacement_characters) 157 dammit.contains_replacement_characters)
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
158 parser.soup = self.soup 162 parser.soup = self.soup
159 try: 163 try:
160 parser.feed(markup) 164 parser.feed(markup)
161 except HTMLParseError, e: 165 except HTMLParseError as e:
162 warnings.warn(RuntimeWarning( 166 warnings.warn(RuntimeWarning(
163 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 167 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
164 raise e 168 raise e
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index fa5d49875e..9c6c14ee65 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -4,10 +4,15 @@ __all__ = [
4 ] 4 ]
5 5
6from io import BytesIO 6from io import BytesIO
7from StringIO import StringIO 7from io import StringIO
8import collections 8import collections
9from lxml import etree 9from lxml import etree
10from bs4.element import Comment, Doctype, NamespacedAttribute 10from bs4.element import (
11 Comment,
12 Doctype,
13 NamespacedAttribute,
14 ProcessingInstruction,
15)
11from bs4.builder import ( 16from bs4.builder import (
12 FAST, 17 FAST,
13 HTML, 18 HTML,
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
25 30
26 is_xml = True 31 is_xml = True
27 32
33 NAME = "lxml-xml"
34 ALTERNATE_NAMES = ["xml"]
35
28 # Well, it's permissive by XML parser standards. 36 # Well, it's permissive by XML parser standards.
29 features = [LXML, XML, FAST, PERMISSIVE] 37 features = [NAME, LXML, XML, FAST, PERMISSIVE]
30 38
31 CHUNK_SIZE = 512 39 CHUNK_SIZE = 512
32 40
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
70 return (None, tag) 78 return (None, tag)
71 79
72 def prepare_markup(self, markup, user_specified_encoding=None, 80 def prepare_markup(self, markup, user_specified_encoding=None,
81 exclude_encodings=None,
73 document_declared_encoding=None): 82 document_declared_encoding=None):
74 """ 83 """
75 :yield: A series of 4-tuples. 84 :yield: A series of 4-tuples.
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
78 87
79 Each 4-tuple represents a strategy for parsing the document. 88 Each 4-tuple represents a strategy for parsing the document.
80 """ 89 """
81 if isinstance(markup, unicode): 90 if isinstance(markup, str):
82 # We were given Unicode. Maybe lxml can parse Unicode on 91 # We were given Unicode. Maybe lxml can parse Unicode on
83 # this system? 92 # this system?
84 yield markup, None, document_declared_encoding, False 93 yield markup, None, document_declared_encoding, False
85 94
86 if isinstance(markup, unicode): 95 if isinstance(markup, str):
87 # No, apparently not. Convert the Unicode to UTF-8 and 96 # No, apparently not. Convert the Unicode to UTF-8 and
88 # tell lxml to parse it as UTF-8. 97 # tell lxml to parse it as UTF-8.
89 yield (markup.encode("utf8"), "utf8", 98 yield (markup.encode("utf8"), "utf8",
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
95 # the document as each one in turn. 104 # the document as each one in turn.
96 is_html = not self.is_xml 105 is_html = not self.is_xml
97 try_encodings = [user_specified_encoding, document_declared_encoding] 106 try_encodings = [user_specified_encoding, document_declared_encoding]
98 detector = EncodingDetector(markup, try_encodings, is_html) 107 detector = EncodingDetector(
108 markup, try_encodings, is_html, exclude_encodings)
99 for encoding in detector.encodings: 109 for encoding in detector.encodings:
100 yield (detector.markup, encoding, document_declared_encoding, False) 110 yield (detector.markup, encoding, document_declared_encoding, False)
101 111
102 def feed(self, markup): 112 def feed(self, markup):
103 if isinstance(markup, bytes): 113 if isinstance(markup, bytes):
104 markup = BytesIO(markup) 114 markup = BytesIO(markup)
105 elif isinstance(markup, unicode): 115 elif isinstance(markup, str):
106 markup = StringIO(markup) 116 markup = StringIO(markup)
107 117
108 # Call feed() at least once, even if the markup is empty, 118 # Call feed() at least once, even if the markup is empty,
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
117 if len(data) != 0: 127 if len(data) != 0:
118 self.parser.feed(data) 128 self.parser.feed(data)
119 self.parser.close() 129 self.parser.close()
120 except (UnicodeDecodeError, LookupError, etree.ParserError), e: 130 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
121 raise ParserRejectedMarkup(str(e)) 131 raise ParserRejectedMarkup(str(e))
122 132
123 def close(self): 133 def close(self):
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
135 self.nsmaps.append(None) 145 self.nsmaps.append(None)
136 elif len(nsmap) > 0: 146 elif len(nsmap) > 0:
137 # A new namespace mapping has come into play. 147 # A new namespace mapping has come into play.
138 inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 148 inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
139 self.nsmaps.append(inverted_nsmap) 149 self.nsmaps.append(inverted_nsmap)
140 # Also treat the namespace mapping as a set of attributes on the 150 # Also treat the namespace mapping as a set of attributes on the
141 # tag, so we can recreate it later. 151 # tag, so we can recreate it later.
142 attrs = attrs.copy() 152 attrs = attrs.copy()
143 for prefix, namespace in nsmap.items(): 153 for prefix, namespace in list(nsmap.items()):
144 attribute = NamespacedAttribute( 154 attribute = NamespacedAttribute(
145 "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 155 "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
146 attrs[attribute] = namespace 156 attrs[attribute] = namespace
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
149 # from lxml with namespaces attached to their names, and 159 # from lxml with namespaces attached to their names, and
150 # turn then into NamespacedAttribute objects. 160 # turn then into NamespacedAttribute objects.
151 new_attrs = {} 161 new_attrs = {}
152 for attr, value in attrs.items(): 162 for attr, value in list(attrs.items()):
153 namespace, attr = self._getNsTag(attr) 163 namespace, attr = self._getNsTag(attr)
154 if namespace is None: 164 if namespace is None:
155 new_attrs[attr] = value 165 new_attrs[attr] = value
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
189 self.nsmaps.pop() 199 self.nsmaps.pop()
190 200
191 def pi(self, target, data): 201 def pi(self, target, data):
192 pass 202 self.soup.endData()
203 self.soup.handle_data(target + ' ' + data)
204 self.soup.endData(ProcessingInstruction)
193 205
194 def data(self, content): 206 def data(self, content):
195 self.soup.handle_data(content) 207 self.soup.handle_data(content)
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
207 219
208 def test_fragment_to_document(self, fragment): 220 def test_fragment_to_document(self, fragment):
209 """See `TreeBuilder`.""" 221 """See `TreeBuilder`."""
210 return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment 222 return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
211 223
212 224
213class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 225class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
214 226
215 features = [LXML, HTML, FAST, PERMISSIVE] 227 NAME = LXML
228 ALTERNATE_NAMES = ["lxml-html"]
229
230 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
216 is_xml = False 231 is_xml = False
217 232
218 def default_parser(self, encoding): 233 def default_parser(self, encoding):
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
224 self.parser = self.parser_for(encoding) 239 self.parser = self.parser_for(encoding)
225 self.parser.feed(markup) 240 self.parser.feed(markup)
226 self.parser.close() 241 self.parser.close()
227 except (UnicodeDecodeError, LookupError, etree.ParserError), e: 242 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
228 raise ParserRejectedMarkup(str(e)) 243 raise ParserRejectedMarkup(str(e))
229 244
230 245
231 def test_fragment_to_document(self, fragment): 246 def test_fragment_to_document(self, fragment):
232 """See `TreeBuilder`.""" 247 """See `TreeBuilder`."""
233 return u'<html><body>%s</body></html>' % fragment 248 return '<html><body>%s</body></html>' % fragment