diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/_lxml.py')
-rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 47 |
1 files changed, 31 insertions, 16 deletions
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index fa5d49875e..9c6c14ee65 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
@@ -4,10 +4,15 @@ __all__ = [ | |||
4 | ] | 4 | ] |
5 | 5 | ||
6 | from io import BytesIO | 6 | from io import BytesIO |
7 | from StringIO import StringIO | 7 | from io import StringIO |
8 | import collections | 8 | import collections |
9 | from lxml import etree | 9 | from lxml import etree |
10 | from bs4.element import Comment, Doctype, NamespacedAttribute | 10 | from bs4.element import ( |
11 | Comment, | ||
12 | Doctype, | ||
13 | NamespacedAttribute, | ||
14 | ProcessingInstruction, | ||
15 | ) | ||
11 | from bs4.builder import ( | 16 | from bs4.builder import ( |
12 | FAST, | 17 | FAST, |
13 | HTML, | 18 | HTML, |
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
25 | 30 | ||
26 | is_xml = True | 31 | is_xml = True |
27 | 32 | ||
33 | NAME = "lxml-xml" | ||
34 | ALTERNATE_NAMES = ["xml"] | ||
35 | |||
28 | # Well, it's permissive by XML parser standards. | 36 | # Well, it's permissive by XML parser standards. |
29 | features = [LXML, XML, FAST, PERMISSIVE] | 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] |
30 | 38 | ||
31 | CHUNK_SIZE = 512 | 39 | CHUNK_SIZE = 512 |
32 | 40 | ||
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
70 | return (None, tag) | 78 | return (None, tag) |
71 | 79 | ||
72 | def prepare_markup(self, markup, user_specified_encoding=None, | 80 | def prepare_markup(self, markup, user_specified_encoding=None, |
81 | exclude_encodings=None, | ||
73 | document_declared_encoding=None): | 82 | document_declared_encoding=None): |
74 | """ | 83 | """ |
75 | :yield: A series of 4-tuples. | 84 | :yield: A series of 4-tuples. |
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
78 | 87 | ||
79 | Each 4-tuple represents a strategy for parsing the document. | 88 | Each 4-tuple represents a strategy for parsing the document. |
80 | """ | 89 | """ |
81 | if isinstance(markup, unicode): | 90 | if isinstance(markup, str): |
82 | # We were given Unicode. Maybe lxml can parse Unicode on | 91 | # We were given Unicode. Maybe lxml can parse Unicode on |
83 | # this system? | 92 | # this system? |
84 | yield markup, None, document_declared_encoding, False | 93 | yield markup, None, document_declared_encoding, False |
85 | 94 | ||
86 | if isinstance(markup, unicode): | 95 | if isinstance(markup, str): |
87 | # No, apparently not. Convert the Unicode to UTF-8 and | 96 | # No, apparently not. Convert the Unicode to UTF-8 and |
88 | # tell lxml to parse it as UTF-8. | 97 | # tell lxml to parse it as UTF-8. |
89 | yield (markup.encode("utf8"), "utf8", | 98 | yield (markup.encode("utf8"), "utf8", |
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
95 | # the document as each one in turn. | 104 | # the document as each one in turn. |
96 | is_html = not self.is_xml | 105 | is_html = not self.is_xml |
97 | try_encodings = [user_specified_encoding, document_declared_encoding] | 106 | try_encodings = [user_specified_encoding, document_declared_encoding] |
98 | detector = EncodingDetector(markup, try_encodings, is_html) | 107 | detector = EncodingDetector( |
108 | markup, try_encodings, is_html, exclude_encodings) | ||
99 | for encoding in detector.encodings: | 109 | for encoding in detector.encodings: |
100 | yield (detector.markup, encoding, document_declared_encoding, False) | 110 | yield (detector.markup, encoding, document_declared_encoding, False) |
101 | 111 | ||
102 | def feed(self, markup): | 112 | def feed(self, markup): |
103 | if isinstance(markup, bytes): | 113 | if isinstance(markup, bytes): |
104 | markup = BytesIO(markup) | 114 | markup = BytesIO(markup) |
105 | elif isinstance(markup, unicode): | 115 | elif isinstance(markup, str): |
106 | markup = StringIO(markup) | 116 | markup = StringIO(markup) |
107 | 117 | ||
108 | # Call feed() at least once, even if the markup is empty, | 118 | # Call feed() at least once, even if the markup is empty, |
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
117 | if len(data) != 0: | 127 | if len(data) != 0: |
118 | self.parser.feed(data) | 128 | self.parser.feed(data) |
119 | self.parser.close() | 129 | self.parser.close() |
120 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
121 | raise ParserRejectedMarkup(str(e)) | 131 | raise ParserRejectedMarkup(str(e)) |
122 | 132 | ||
123 | def close(self): | 133 | def close(self): |
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
135 | self.nsmaps.append(None) | 145 | self.nsmaps.append(None) |
136 | elif len(nsmap) > 0: | 146 | elif len(nsmap) > 0: |
137 | # A new namespace mapping has come into play. | 147 | # A new namespace mapping has come into play. |
138 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) | 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) |
139 | self.nsmaps.append(inverted_nsmap) | 149 | self.nsmaps.append(inverted_nsmap) |
140 | # Also treat the namespace mapping as a set of attributes on the | 150 | # Also treat the namespace mapping as a set of attributes on the |
141 | # tag, so we can recreate it later. | 151 | # tag, so we can recreate it later. |
142 | attrs = attrs.copy() | 152 | attrs = attrs.copy() |
143 | for prefix, namespace in nsmap.items(): | 153 | for prefix, namespace in list(nsmap.items()): |
144 | attribute = NamespacedAttribute( | 154 | attribute = NamespacedAttribute( |
145 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") | 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") |
146 | attrs[attribute] = namespace | 156 | attrs[attribute] = namespace |
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
149 | # from lxml with namespaces attached to their names, and | 159 | # from lxml with namespaces attached to their names, and |
150 | # turn then into NamespacedAttribute objects. | 160 | # turn then into NamespacedAttribute objects. |
151 | new_attrs = {} | 161 | new_attrs = {} |
152 | for attr, value in attrs.items(): | 162 | for attr, value in list(attrs.items()): |
153 | namespace, attr = self._getNsTag(attr) | 163 | namespace, attr = self._getNsTag(attr) |
154 | if namespace is None: | 164 | if namespace is None: |
155 | new_attrs[attr] = value | 165 | new_attrs[attr] = value |
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
189 | self.nsmaps.pop() | 199 | self.nsmaps.pop() |
190 | 200 | ||
191 | def pi(self, target, data): | 201 | def pi(self, target, data): |
192 | pass | 202 | self.soup.endData() |
203 | self.soup.handle_data(target + ' ' + data) | ||
204 | self.soup.endData(ProcessingInstruction) | ||
193 | 205 | ||
194 | def data(self, content): | 206 | def data(self, content): |
195 | self.soup.handle_data(content) | 207 | self.soup.handle_data(content) |
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
207 | 219 | ||
208 | def test_fragment_to_document(self, fragment): | 220 | def test_fragment_to_document(self, fragment): |
209 | """See `TreeBuilder`.""" | 221 | """See `TreeBuilder`.""" |
210 | return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment | 222 | return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment |
211 | 223 | ||
212 | 224 | ||
213 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): |
214 | 226 | ||
215 | features = [LXML, HTML, FAST, PERMISSIVE] | 227 | NAME = LXML |
228 | ALTERNATE_NAMES = ["lxml-html"] | ||
229 | |||
230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] | ||
216 | is_xml = False | 231 | is_xml = False |
217 | 232 | ||
218 | def default_parser(self, encoding): | 233 | def default_parser(self, encoding): |
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
224 | self.parser = self.parser_for(encoding) | 239 | self.parser = self.parser_for(encoding) |
225 | self.parser.feed(markup) | 240 | self.parser.feed(markup) |
226 | self.parser.close() | 241 | self.parser.close() |
227 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
228 | raise ParserRejectedMarkup(str(e)) | 243 | raise ParserRejectedMarkup(str(e)) |
229 | 244 | ||
230 | 245 | ||
231 | def test_fragment_to_document(self, fragment): | 246 | def test_fragment_to_document(self, fragment): |
232 | """See `TreeBuilder`.""" | 247 | """See `TreeBuilder`.""" |
233 | return u'<html><body>%s</body></html>' % fragment | 248 | return '<html><body>%s</body></html>' % fragment |