diff options
author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-05-06 09:06:51 +0100 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-06-02 08:24:02 +0100 |
commit | 822eabf32dd69346071bd25fc3639db252d2f346 (patch) | |
tree | edac6d1d0d5114a4e3c72fea5589c069453b72d2 /bitbake/lib/bs4/builder | |
parent | 4f8959324df3b89487973bd4e8de21debb0a12ef (diff) | |
download | poky-822eabf32dd69346071bd25fc3639db252d2f346.tar.gz |
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's
recommendation for Python 3 use.
(Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/builder')
-rw-r--r-- | bitbake/lib/bs4/builder/__init__.py | 7 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_html5lib.py | 71 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_htmlparser.py | 56 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 47 |
4 files changed, 125 insertions, 56 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py index 740f5f29cd..6ccd4d23d6 100644 --- a/bitbake/lib/bs4/builder/__init__.py +++ b/bitbake/lib/bs4/builder/__init__.py | |||
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry() | |||
80 | class TreeBuilder(object): | 80 | class TreeBuilder(object): |
81 | """Turn a document into a Beautiful Soup object tree.""" | 81 | """Turn a document into a Beautiful Soup object tree.""" |
82 | 82 | ||
83 | NAME = "[Unknown tree builder]" | ||
84 | ALTERNATE_NAMES = [] | ||
83 | features = [] | 85 | features = [] |
84 | 86 | ||
85 | is_xml = False | 87 | is_xml = False |
88 | picklable = False | ||
86 | preserve_whitespace_tags = set() | 89 | preserve_whitespace_tags = set() |
87 | empty_element_tags = None # A tag will be considered an empty-element | 90 | empty_element_tags = None # A tag will be considered an empty-element |
88 | # tag when and only when it has no contents. | 91 | # tag when and only when it has no contents. |
@@ -153,13 +156,13 @@ class TreeBuilder(object): | |||
153 | universal = self.cdata_list_attributes.get('*', []) | 156 | universal = self.cdata_list_attributes.get('*', []) |
154 | tag_specific = self.cdata_list_attributes.get( | 157 | tag_specific = self.cdata_list_attributes.get( |
155 | tag_name.lower(), None) | 158 | tag_name.lower(), None) |
156 | for attr in attrs.keys(): | 159 | for attr in list(attrs.keys()): |
157 | if attr in universal or (tag_specific and attr in tag_specific): | 160 | if attr in universal or (tag_specific and attr in tag_specific): |
158 | # We have a "class"-type attribute whose string | 161 | # We have a "class"-type attribute whose string |
159 | # value is a whitespace-separated list of | 162 | # value is a whitespace-separated list of |
160 | # values. Split it into a list. | 163 | # values. Split it into a list. |
161 | value = attrs[attr] | 164 | value = attrs[attr] |
162 | if isinstance(value, basestring): | 165 | if isinstance(value, str): |
163 | values = whitespace_re.split(value) | 166 | values = whitespace_re.split(value) |
164 | else: | 167 | else: |
165 | # html5lib sometimes calls setAttributes twice | 168 | # html5lib sometimes calls setAttributes twice |
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py index 7de36ae75e..f0e5924ebb 100644 --- a/bitbake/lib/bs4/builder/_html5lib.py +++ b/bitbake/lib/bs4/builder/_html5lib.py | |||
@@ -2,6 +2,7 @@ __all__ = [ | |||
2 | 'HTML5TreeBuilder', | 2 | 'HTML5TreeBuilder', |
3 | ] | 3 | ] |
4 | 4 | ||
5 | from pdb import set_trace | ||
5 | import warnings | 6 | import warnings |
6 | from bs4.builder import ( | 7 | from bs4.builder import ( |
7 | PERMISSIVE, | 8 | PERMISSIVE, |
@@ -9,7 +10,10 @@ from bs4.builder import ( | |||
9 | HTML_5, | 10 | HTML_5, |
10 | HTMLTreeBuilder, | 11 | HTMLTreeBuilder, |
11 | ) | 12 | ) |
12 | from bs4.element import NamespacedAttribute | 13 | from bs4.element import ( |
14 | NamespacedAttribute, | ||
15 | whitespace_re, | ||
16 | ) | ||
13 | import html5lib | 17 | import html5lib |
14 | from html5lib.constants import namespaces | 18 | from html5lib.constants import namespaces |
15 | from bs4.element import ( | 19 | from bs4.element import ( |
@@ -22,11 +26,20 @@ from bs4.element import ( | |||
22 | class HTML5TreeBuilder(HTMLTreeBuilder): | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): |
23 | """Use html5lib to build a tree.""" | 27 | """Use html5lib to build a tree.""" |
24 | 28 | ||
25 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] | 29 | NAME = "html5lib" |
30 | |||
31 | features = [NAME, PERMISSIVE, HTML_5, HTML] | ||
26 | 32 | ||
27 | def prepare_markup(self, markup, user_specified_encoding): | 33 | def prepare_markup(self, markup, user_specified_encoding, |
34 | document_declared_encoding=None, exclude_encodings=None): | ||
28 | # Store the user-specified encoding for use later on. | 35 | # Store the user-specified encoding for use later on. |
29 | self.user_specified_encoding = user_specified_encoding | 36 | self.user_specified_encoding = user_specified_encoding |
37 | |||
38 | # document_declared_encoding and exclude_encodings aren't used | ||
39 | # ATM because the html5lib TreeBuilder doesn't use | ||
40 | # UnicodeDammit. | ||
41 | if exclude_encodings: | ||
42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") | ||
30 | yield (markup, None, None, False) | 43 | yield (markup, None, None, False) |
31 | 44 | ||
32 | # These methods are defined by Beautiful Soup. | 45 | # These methods are defined by Beautiful Soup. |
@@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): | |||
37 | doc = parser.parse(markup, encoding=self.user_specified_encoding) | 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) |
38 | 51 | ||
39 | # Set the character encoding detected by the tokenizer. | 52 | # Set the character encoding detected by the tokenizer. |
40 | if isinstance(markup, unicode): | 53 | if isinstance(markup, str): |
41 | # We need to special-case this because html5lib sets | 54 | # We need to special-case this because html5lib sets |
42 | # charEncoding to UTF-8 if it gets Unicode input. | 55 | # charEncoding to UTF-8 if it gets Unicode input. |
43 | doc.original_encoding = None | 56 | doc.original_encoding = None |
@@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): | |||
51 | 64 | ||
52 | def test_fragment_to_document(self, fragment): | 65 | def test_fragment_to_document(self, fragment): |
53 | """See `TreeBuilder`.""" | 66 | """See `TreeBuilder`.""" |
54 | return u'<html><head></head><body>%s</body></html>' % fragment | 67 | return '<html><head></head><body>%s</body></html>' % fragment |
55 | 68 | ||
56 | 69 | ||
57 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): |
@@ -101,7 +114,16 @@ class AttrList(object): | |||
101 | def __iter__(self): | 114 | def __iter__(self): |
102 | return list(self.attrs.items()).__iter__() | 115 | return list(self.attrs.items()).__iter__() |
103 | def __setitem__(self, name, value): | 116 | def __setitem__(self, name, value): |
104 | "set attr", name, value | 117 | # If this attribute is a multi-valued attribute for this element, |
118 | # turn its value into a list. | ||
119 | list_attr = HTML5TreeBuilder.cdata_list_attributes | ||
120 | if (name in list_attr['*'] | ||
121 | or (self.element.name in list_attr | ||
122 | and name in list_attr[self.element.name])): | ||
123 | # A node that is being cloned may have already undergone | ||
124 | # this procedure. | ||
125 | if not isinstance(value, list): | ||
126 | value = whitespace_re.split(value) | ||
105 | self.element[name] = value | 127 | self.element[name] = value |
106 | def items(self): | 128 | def items(self): |
107 | return list(self.attrs.items()) | 129 | return list(self.attrs.items()) |
@@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
124 | 146 | ||
125 | def appendChild(self, node): | 147 | def appendChild(self, node): |
126 | string_child = child = None | 148 | string_child = child = None |
127 | if isinstance(node, basestring): | 149 | if isinstance(node, str): |
128 | # Some other piece of code decided to pass in a string | 150 | # Some other piece of code decided to pass in a string |
129 | # instead of creating a TextElement object to contain the | 151 | # instead of creating a TextElement object to contain the |
130 | # string. | 152 | # string. |
@@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
139 | else: | 161 | else: |
140 | child = node.element | 162 | child = node.element |
141 | 163 | ||
142 | if not isinstance(child, basestring) and child.parent is not None: | 164 | if not isinstance(child, str) and child.parent is not None: |
143 | node.element.extract() | 165 | node.element.extract() |
144 | 166 | ||
145 | if (string_child and self.element.contents | 167 | if (string_child and self.element.contents |
@@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
152 | old_element.replace_with(new_element) | 174 | old_element.replace_with(new_element) |
153 | self.soup._most_recent_element = new_element | 175 | self.soup._most_recent_element = new_element |
154 | else: | 176 | else: |
155 | if isinstance(node, basestring): | 177 | if isinstance(node, str): |
156 | # Create a brand new NavigableString from this string. | 178 | # Create a brand new NavigableString from this string. |
157 | child = self.soup.new_string(node) | 179 | child = self.soup.new_string(node) |
158 | 180 | ||
@@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node): | |||
161 | # immediately after the parent, if it has no children.) | 183 | # immediately after the parent, if it has no children.) |
162 | if self.element.contents: | 184 | if self.element.contents: |
163 | most_recent_element = self.element._last_descendant(False) | 185 | most_recent_element = self.element._last_descendant(False) |
186 | elif self.element.next_element is not None: | ||
187 | # Something from further ahead in the parse tree is | ||
188 | # being inserted into this earlier element. This is | ||
189 | # very annoying because it means an expensive search | ||
190 | # for the last element in the tree. | ||
191 | most_recent_element = self.soup._last_descendant() | ||
164 | else: | 192 | else: |
165 | most_recent_element = self.element | 193 | most_recent_element = self.element |
166 | 194 | ||
@@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
172 | return AttrList(self.element) | 200 | return AttrList(self.element) |
173 | 201 | ||
174 | def setAttributes(self, attributes): | 202 | def setAttributes(self, attributes): |
203 | |||
175 | if attributes is not None and len(attributes) > 0: | 204 | if attributes is not None and len(attributes) > 0: |
176 | 205 | ||
177 | converted_attributes = [] | 206 | converted_attributes = [] |
@@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
183 | 212 | ||
184 | self.soup.builder._replace_cdata_list_attribute_values( | 213 | self.soup.builder._replace_cdata_list_attribute_values( |
185 | self.name, attributes) | 214 | self.name, attributes) |
186 | for name, value in attributes.items(): | 215 | for name, value in list(attributes.items()): |
187 | self.element[name] = value | 216 | self.element[name] = value |
188 | 217 | ||
189 | # The attributes may contain variables that need substitution. | 218 | # The attributes may contain variables that need substitution. |
@@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node): | |||
218 | 247 | ||
219 | def reparentChildren(self, new_parent): | 248 | def reparentChildren(self, new_parent): |
220 | """Move all of this tag's children into another tag.""" | 249 | """Move all of this tag's children into another tag.""" |
250 | # print "MOVE", self.element.contents | ||
251 | # print "FROM", self.element | ||
252 | # print "TO", new_parent.element | ||
221 | element = self.element | 253 | element = self.element |
222 | new_parent_element = new_parent.element | 254 | new_parent_element = new_parent.element |
223 | # Determine what this tag's next_element will be once all the children | 255 | # Determine what this tag's next_element will be once all the children |
@@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node): | |||
236 | new_parents_last_descendant_next_element = new_parent_element.next_element | 268 | new_parents_last_descendant_next_element = new_parent_element.next_element |
237 | 269 | ||
238 | to_append = element.contents | 270 | to_append = element.contents |
239 | append_after = new_parent.element.contents | 271 | append_after = new_parent_element.contents |
240 | if len(to_append) > 0: | 272 | if len(to_append) > 0: |
241 | # Set the first child's previous_element and previous_sibling | 273 | # Set the first child's previous_element and previous_sibling |
242 | # to elements within the new parent | 274 | # to elements within the new parent |
243 | first_child = to_append[0] | 275 | first_child = to_append[0] |
244 | first_child.previous_element = new_parents_last_descendant | 276 | if new_parents_last_descendant: |
277 | first_child.previous_element = new_parents_last_descendant | ||
278 | else: | ||
279 | first_child.previous_element = new_parent_element | ||
245 | first_child.previous_sibling = new_parents_last_child | 280 | first_child.previous_sibling = new_parents_last_child |
281 | if new_parents_last_descendant: | ||
282 | new_parents_last_descendant.next_element = first_child | ||
283 | else: | ||
284 | new_parent_element.next_element = first_child | ||
285 | if new_parents_last_child: | ||
286 | new_parents_last_child.next_sibling = first_child | ||
246 | 287 | ||
247 | # Fix the last child's next_element and next_sibling | 288 | # Fix the last child's next_element and next_sibling |
248 | last_child = to_append[-1] | 289 | last_child = to_append[-1] |
249 | last_child.next_element = new_parents_last_descendant_next_element | 290 | last_child.next_element = new_parents_last_descendant_next_element |
291 | if new_parents_last_descendant_next_element: | ||
292 | new_parents_last_descendant_next_element.previous_element = last_child | ||
250 | last_child.next_sibling = None | 293 | last_child.next_sibling = None |
251 | 294 | ||
252 | for child in to_append: | 295 | for child in to_append: |
@@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node): | |||
257 | element.contents = [] | 300 | element.contents = [] |
258 | element.next_element = final_next_element | 301 | element.next_element = final_next_element |
259 | 302 | ||
303 | # print "DONE WITH MOVE" | ||
304 | # print "FROM", self.element | ||
305 | # print "TO", new_parent_element | ||
306 | |||
260 | def cloneNode(self): | 307 | def cloneNode(self): |
261 | tag = self.soup.new_tag(self.element.name, self.namespace) | 308 | tag = self.soup.new_tag(self.element.name, self.namespace) |
262 | node = Element(tag, self.soup, self.namespace) | 309 | node = Element(tag, self.soup, self.namespace) |
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py index ca8d8b892b..bb0a63f2f3 100644 --- a/bitbake/lib/bs4/builder/_htmlparser.py +++ b/bitbake/lib/bs4/builder/_htmlparser.py | |||
@@ -4,10 +4,16 @@ __all__ = [ | |||
4 | 'HTMLParserTreeBuilder', | 4 | 'HTMLParserTreeBuilder', |
5 | ] | 5 | ] |
6 | 6 | ||
7 | from HTMLParser import ( | 7 | from html.parser import HTMLParser |
8 | HTMLParser, | 8 | |
9 | HTMLParseError, | 9 | try: |
10 | ) | 10 | from html.parser import HTMLParseError |
11 | except ImportError as e: | ||
12 | # HTMLParseError is removed in Python 3.5. Since it can never be | ||
13 | # thrown in 3.5, we can just define our own class as a placeholder. | ||
14 | class HTMLParseError(Exception): | ||
15 | pass | ||
16 | |||
11 | import sys | 17 | import sys |
12 | import warnings | 18 | import warnings |
13 | 19 | ||
@@ -19,10 +25,10 @@ import warnings | |||
19 | # At the end of this file, we monkeypatch HTMLParser so that | 25 | # At the end of this file, we monkeypatch HTMLParser so that |
20 | # strict=True works well on Python 3.2.2. | 26 | # strict=True works well on Python 3.2.2. |
21 | major, minor, release = sys.version_info[:3] | 27 | major, minor, release = sys.version_info[:3] |
22 | CONSTRUCTOR_TAKES_STRICT = ( | 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 |
23 | major > 3 | 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 |
24 | or (major == 3 and minor > 2) | 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 |
25 | or (major == 3 and minor == 2 and release >= 3)) | 31 | |
26 | 32 | ||
27 | from bs4.element import ( | 33 | from bs4.element import ( |
28 | CData, | 34 | CData, |
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
63 | 69 | ||
64 | def handle_charref(self, name): | 70 | def handle_charref(self, name): |
65 | # XXX workaround for a bug in HTMLParser. Remove this once | 71 | # XXX workaround for a bug in HTMLParser. Remove this once |
66 | # it's fixed. | 72 | # it's fixed in all supported versions. |
73 | # http://bugs.python.org/issue13633 | ||
67 | if name.startswith('x'): | 74 | if name.startswith('x'): |
68 | real_name = int(name.lstrip('x'), 16) | 75 | real_name = int(name.lstrip('x'), 16) |
69 | elif name.startswith('X'): | 76 | elif name.startswith('X'): |
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
72 | real_name = int(name) | 79 | real_name = int(name) |
73 | 80 | ||
74 | try: | 81 | try: |
75 | data = unichr(real_name) | 82 | data = chr(real_name) |
76 | except (ValueError, OverflowError), e: | 83 | except (ValueError, OverflowError) as e: |
77 | data = u"\N{REPLACEMENT CHARACTER}" | 84 | data = "\N{REPLACEMENT CHARACTER}" |
78 | 85 | ||
79 | self.handle_data(data) | 86 | self.handle_data(data) |
80 | 87 | ||
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
113 | 120 | ||
114 | def handle_pi(self, data): | 121 | def handle_pi(self, data): |
115 | self.soup.endData() | 122 | self.soup.endData() |
116 | if data.endswith("?") and data.lower().startswith("xml"): | ||
117 | # "An XHTML processing instruction using the trailing '?' | ||
118 | # will cause the '?' to be included in data." - HTMLParser | ||
119 | # docs. | ||
120 | # | ||
121 | # Strip the question mark so we don't end up with two | ||
122 | # question marks. | ||
123 | data = data[:-1] | ||
124 | self.soup.handle_data(data) | 123 | self.soup.handle_data(data) |
125 | self.soup.endData(ProcessingInstruction) | 124 | self.soup.endData(ProcessingInstruction) |
126 | 125 | ||
@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
128 | class HTMLParserTreeBuilder(HTMLTreeBuilder): | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): |
129 | 128 | ||
130 | is_xml = False | 129 | is_xml = False |
131 | features = [HTML, STRICT, HTMLPARSER] | 130 | picklable = True |
131 | NAME = HTMLPARSER | ||
132 | features = [NAME, HTML, STRICT] | ||
132 | 133 | ||
133 | def __init__(self, *args, **kwargs): | 134 | def __init__(self, *args, **kwargs): |
134 | if CONSTRUCTOR_TAKES_STRICT: | 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: |
135 | kwargs['strict'] = False | 136 | kwargs['strict'] = False |
137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: | ||
138 | kwargs['convert_charrefs'] = False | ||
136 | self.parser_args = (args, kwargs) | 139 | self.parser_args = (args, kwargs) |
137 | 140 | ||
138 | def prepare_markup(self, markup, user_specified_encoding=None, | 141 | def prepare_markup(self, markup, user_specified_encoding=None, |
139 | document_declared_encoding=None): | 142 | document_declared_encoding=None, exclude_encodings=None): |
140 | """ | 143 | """ |
141 | :return: A 4-tuple (markup, original encoding, encoding | 144 | :return: A 4-tuple (markup, original encoding, encoding |
142 | declared within markup, whether any characters had to be | 145 | declared within markup, whether any characters had to be |
143 | replaced with REPLACEMENT CHARACTER). | 146 | replaced with REPLACEMENT CHARACTER). |
144 | """ | 147 | """ |
145 | if isinstance(markup, unicode): | 148 | if isinstance(markup, str): |
146 | yield (markup, None, None, False) | 149 | yield (markup, None, None, False) |
147 | return | 150 | return |
148 | 151 | ||
149 | try_encodings = [user_specified_encoding, document_declared_encoding] | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] |
150 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) | 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, |
154 | exclude_encodings=exclude_encodings) | ||
151 | yield (dammit.markup, dammit.original_encoding, | 155 | yield (dammit.markup, dammit.original_encoding, |
152 | dammit.declared_html_encoding, | 156 | dammit.declared_html_encoding, |
153 | dammit.contains_replacement_characters) | 157 | dammit.contains_replacement_characters) |
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): | |||
158 | parser.soup = self.soup | 162 | parser.soup = self.soup |
159 | try: | 163 | try: |
160 | parser.feed(markup) | 164 | parser.feed(markup) |
161 | except HTMLParseError, e: | 165 | except HTMLParseError as e: |
162 | warnings.warn(RuntimeWarning( | 166 | warnings.warn(RuntimeWarning( |
163 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) | 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) |
164 | raise e | 168 | raise e |
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index fa5d49875e..9c6c14ee65 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
@@ -4,10 +4,15 @@ __all__ = [ | |||
4 | ] | 4 | ] |
5 | 5 | ||
6 | from io import BytesIO | 6 | from io import BytesIO |
7 | from StringIO import StringIO | 7 | from io import StringIO |
8 | import collections | 8 | import collections |
9 | from lxml import etree | 9 | from lxml import etree |
10 | from bs4.element import Comment, Doctype, NamespacedAttribute | 10 | from bs4.element import ( |
11 | Comment, | ||
12 | Doctype, | ||
13 | NamespacedAttribute, | ||
14 | ProcessingInstruction, | ||
15 | ) | ||
11 | from bs4.builder import ( | 16 | from bs4.builder import ( |
12 | FAST, | 17 | FAST, |
13 | HTML, | 18 | HTML, |
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
25 | 30 | ||
26 | is_xml = True | 31 | is_xml = True |
27 | 32 | ||
33 | NAME = "lxml-xml" | ||
34 | ALTERNATE_NAMES = ["xml"] | ||
35 | |||
28 | # Well, it's permissive by XML parser standards. | 36 | # Well, it's permissive by XML parser standards. |
29 | features = [LXML, XML, FAST, PERMISSIVE] | 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] |
30 | 38 | ||
31 | CHUNK_SIZE = 512 | 39 | CHUNK_SIZE = 512 |
32 | 40 | ||
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
70 | return (None, tag) | 78 | return (None, tag) |
71 | 79 | ||
72 | def prepare_markup(self, markup, user_specified_encoding=None, | 80 | def prepare_markup(self, markup, user_specified_encoding=None, |
81 | exclude_encodings=None, | ||
73 | document_declared_encoding=None): | 82 | document_declared_encoding=None): |
74 | """ | 83 | """ |
75 | :yield: A series of 4-tuples. | 84 | :yield: A series of 4-tuples. |
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
78 | 87 | ||
79 | Each 4-tuple represents a strategy for parsing the document. | 88 | Each 4-tuple represents a strategy for parsing the document. |
80 | """ | 89 | """ |
81 | if isinstance(markup, unicode): | 90 | if isinstance(markup, str): |
82 | # We were given Unicode. Maybe lxml can parse Unicode on | 91 | # We were given Unicode. Maybe lxml can parse Unicode on |
83 | # this system? | 92 | # this system? |
84 | yield markup, None, document_declared_encoding, False | 93 | yield markup, None, document_declared_encoding, False |
85 | 94 | ||
86 | if isinstance(markup, unicode): | 95 | if isinstance(markup, str): |
87 | # No, apparently not. Convert the Unicode to UTF-8 and | 96 | # No, apparently not. Convert the Unicode to UTF-8 and |
88 | # tell lxml to parse it as UTF-8. | 97 | # tell lxml to parse it as UTF-8. |
89 | yield (markup.encode("utf8"), "utf8", | 98 | yield (markup.encode("utf8"), "utf8", |
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
95 | # the document as each one in turn. | 104 | # the document as each one in turn. |
96 | is_html = not self.is_xml | 105 | is_html = not self.is_xml |
97 | try_encodings = [user_specified_encoding, document_declared_encoding] | 106 | try_encodings = [user_specified_encoding, document_declared_encoding] |
98 | detector = EncodingDetector(markup, try_encodings, is_html) | 107 | detector = EncodingDetector( |
108 | markup, try_encodings, is_html, exclude_encodings) | ||
99 | for encoding in detector.encodings: | 109 | for encoding in detector.encodings: |
100 | yield (detector.markup, encoding, document_declared_encoding, False) | 110 | yield (detector.markup, encoding, document_declared_encoding, False) |
101 | 111 | ||
102 | def feed(self, markup): | 112 | def feed(self, markup): |
103 | if isinstance(markup, bytes): | 113 | if isinstance(markup, bytes): |
104 | markup = BytesIO(markup) | 114 | markup = BytesIO(markup) |
105 | elif isinstance(markup, unicode): | 115 | elif isinstance(markup, str): |
106 | markup = StringIO(markup) | 116 | markup = StringIO(markup) |
107 | 117 | ||
108 | # Call feed() at least once, even if the markup is empty, | 118 | # Call feed() at least once, even if the markup is empty, |
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
117 | if len(data) != 0: | 127 | if len(data) != 0: |
118 | self.parser.feed(data) | 128 | self.parser.feed(data) |
119 | self.parser.close() | 129 | self.parser.close() |
120 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
121 | raise ParserRejectedMarkup(str(e)) | 131 | raise ParserRejectedMarkup(str(e)) |
122 | 132 | ||
123 | def close(self): | 133 | def close(self): |
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
135 | self.nsmaps.append(None) | 145 | self.nsmaps.append(None) |
136 | elif len(nsmap) > 0: | 146 | elif len(nsmap) > 0: |
137 | # A new namespace mapping has come into play. | 147 | # A new namespace mapping has come into play. |
138 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) | 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) |
139 | self.nsmaps.append(inverted_nsmap) | 149 | self.nsmaps.append(inverted_nsmap) |
140 | # Also treat the namespace mapping as a set of attributes on the | 150 | # Also treat the namespace mapping as a set of attributes on the |
141 | # tag, so we can recreate it later. | 151 | # tag, so we can recreate it later. |
142 | attrs = attrs.copy() | 152 | attrs = attrs.copy() |
143 | for prefix, namespace in nsmap.items(): | 153 | for prefix, namespace in list(nsmap.items()): |
144 | attribute = NamespacedAttribute( | 154 | attribute = NamespacedAttribute( |
145 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") | 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") |
146 | attrs[attribute] = namespace | 156 | attrs[attribute] = namespace |
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
149 | # from lxml with namespaces attached to their names, and | 159 | # from lxml with namespaces attached to their names, and |
150 | # turn then into NamespacedAttribute objects. | 160 | # turn then into NamespacedAttribute objects. |
151 | new_attrs = {} | 161 | new_attrs = {} |
152 | for attr, value in attrs.items(): | 162 | for attr, value in list(attrs.items()): |
153 | namespace, attr = self._getNsTag(attr) | 163 | namespace, attr = self._getNsTag(attr) |
154 | if namespace is None: | 164 | if namespace is None: |
155 | new_attrs[attr] = value | 165 | new_attrs[attr] = value |
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
189 | self.nsmaps.pop() | 199 | self.nsmaps.pop() |
190 | 200 | ||
191 | def pi(self, target, data): | 201 | def pi(self, target, data): |
192 | pass | 202 | self.soup.endData() |
203 | self.soup.handle_data(target + ' ' + data) | ||
204 | self.soup.endData(ProcessingInstruction) | ||
193 | 205 | ||
194 | def data(self, content): | 206 | def data(self, content): |
195 | self.soup.handle_data(content) | 207 | self.soup.handle_data(content) |
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
207 | 219 | ||
208 | def test_fragment_to_document(self, fragment): | 220 | def test_fragment_to_document(self, fragment): |
209 | """See `TreeBuilder`.""" | 221 | """See `TreeBuilder`.""" |
210 | return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment | 222 | return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment |
211 | 223 | ||
212 | 224 | ||
213 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): |
214 | 226 | ||
215 | features = [LXML, HTML, FAST, PERMISSIVE] | 227 | NAME = LXML |
228 | ALTERNATE_NAMES = ["lxml-html"] | ||
229 | |||
230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] | ||
216 | is_xml = False | 231 | is_xml = False |
217 | 232 | ||
218 | def default_parser(self, encoding): | 233 | def default_parser(self, encoding): |
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
224 | self.parser = self.parser_for(encoding) | 239 | self.parser = self.parser_for(encoding) |
225 | self.parser.feed(markup) | 240 | self.parser.feed(markup) |
226 | self.parser.close() | 241 | self.parser.close() |
227 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
228 | raise ParserRejectedMarkup(str(e)) | 243 | raise ParserRejectedMarkup(str(e)) |
229 | 244 | ||
230 | 245 | ||
231 | def test_fragment_to_document(self, fragment): | 246 | def test_fragment_to_document(self, fragment): |
232 | """See `TreeBuilder`.""" | 247 | """See `TreeBuilder`.""" |
233 | return u'<html><body>%s</body></html>' % fragment | 248 | return '<html><body>%s</body></html>' % fragment |