summary | refs | log | tree | commit | diff | stats
path: root/bitbake/lib/bs4/builder/_lxml.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder/_lxml.py')
-rw-r--r-- bitbake/lib/bs4/builder/_lxml.py 47
1 files changed, 31 insertions, 16 deletions
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index fa5d49875e..9c6c14ee65 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -4,10 +4,15 @@ __all__ = [
4 ] 4 ]
5 5
6from io import BytesIO 6from io import BytesIO
7from StringIO import StringIO 7from io import StringIO
8import collections 8import collections
9from lxml import etree 9from lxml import etree
10from bs4.element import Comment, Doctype, NamespacedAttribute 10from bs4.element import (
11 Comment,
12 Doctype,
13 NamespacedAttribute,
14 ProcessingInstruction,
15)
11from bs4.builder import ( 16from bs4.builder import (
12 FAST, 17 FAST,
13 HTML, 18 HTML,
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
25 30
26 is_xml = True 31 is_xml = True
27 32
33 NAME = "lxml-xml"
34 ALTERNATE_NAMES = ["xml"]
35
28 # Well, it's permissive by XML parser standards. 36 # Well, it's permissive by XML parser standards.
29 features = [LXML, XML, FAST, PERMISSIVE] 37 features = [NAME, LXML, XML, FAST, PERMISSIVE]
30 38
31 CHUNK_SIZE = 512 39 CHUNK_SIZE = 512
32 40
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
70 return (None, tag) 78 return (None, tag)
71 79
72 def prepare_markup(self, markup, user_specified_encoding=None, 80 def prepare_markup(self, markup, user_specified_encoding=None,
81 exclude_encodings=None,
73 document_declared_encoding=None): 82 document_declared_encoding=None):
74 """ 83 """
75 :yield: A series of 4-tuples. 84 :yield: A series of 4-tuples.
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
78 87
79 Each 4-tuple represents a strategy for parsing the document. 88 Each 4-tuple represents a strategy for parsing the document.
80 """ 89 """
81 if isinstance(markup, unicode): 90 if isinstance(markup, str):
82 # We were given Unicode. Maybe lxml can parse Unicode on 91 # We were given Unicode. Maybe lxml can parse Unicode on
83 # this system? 92 # this system?
84 yield markup, None, document_declared_encoding, False 93 yield markup, None, document_declared_encoding, False
85 94
86 if isinstance(markup, unicode): 95 if isinstance(markup, str):
87 # No, apparently not. Convert the Unicode to UTF-8 and 96 # No, apparently not. Convert the Unicode to UTF-8 and
88 # tell lxml to parse it as UTF-8. 97 # tell lxml to parse it as UTF-8.
89 yield (markup.encode("utf8"), "utf8", 98 yield (markup.encode("utf8"), "utf8",
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
95 # the document as each one in turn. 104 # the document as each one in turn.
96 is_html = not self.is_xml 105 is_html = not self.is_xml
97 try_encodings = [user_specified_encoding, document_declared_encoding] 106 try_encodings = [user_specified_encoding, document_declared_encoding]
98 detector = EncodingDetector(markup, try_encodings, is_html) 107 detector = EncodingDetector(
108 markup, try_encodings, is_html, exclude_encodings)
99 for encoding in detector.encodings: 109 for encoding in detector.encodings:
100 yield (detector.markup, encoding, document_declared_encoding, False) 110 yield (detector.markup, encoding, document_declared_encoding, False)
101 111
102 def feed(self, markup): 112 def feed(self, markup):
103 if isinstance(markup, bytes): 113 if isinstance(markup, bytes):
104 markup = BytesIO(markup) 114 markup = BytesIO(markup)
105 elif isinstance(markup, unicode): 115 elif isinstance(markup, str):
106 markup = StringIO(markup) 116 markup = StringIO(markup)
107 117
108 # Call feed() at least once, even if the markup is empty, 118 # Call feed() at least once, even if the markup is empty,
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
117 if len(data) != 0: 127 if len(data) != 0:
118 self.parser.feed(data) 128 self.parser.feed(data)
119 self.parser.close() 129 self.parser.close()
120 except (UnicodeDecodeError, LookupError, etree.ParserError), e: 130 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
121 raise ParserRejectedMarkup(str(e)) 131 raise ParserRejectedMarkup(str(e))
122 132
123 def close(self): 133 def close(self):
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
135 self.nsmaps.append(None) 145 self.nsmaps.append(None)
136 elif len(nsmap) > 0: 146 elif len(nsmap) > 0:
137 # A new namespace mapping has come into play. 147 # A new namespace mapping has come into play.
138 inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 148 inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
139 self.nsmaps.append(inverted_nsmap) 149 self.nsmaps.append(inverted_nsmap)
140 # Also treat the namespace mapping as a set of attributes on the 150 # Also treat the namespace mapping as a set of attributes on the
141 # tag, so we can recreate it later. 151 # tag, so we can recreate it later.
142 attrs = attrs.copy() 152 attrs = attrs.copy()
143 for prefix, namespace in nsmap.items(): 153 for prefix, namespace in list(nsmap.items()):
144 attribute = NamespacedAttribute( 154 attribute = NamespacedAttribute(
145 "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 155 "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
146 attrs[attribute] = namespace 156 attrs[attribute] = namespace
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
149 # from lxml with namespaces attached to their names, and 159 # from lxml with namespaces attached to their names, and
150 # turn then into NamespacedAttribute objects. 160 # turn then into NamespacedAttribute objects.
151 new_attrs = {} 161 new_attrs = {}
152 for attr, value in attrs.items(): 162 for attr, value in list(attrs.items()):
153 namespace, attr = self._getNsTag(attr) 163 namespace, attr = self._getNsTag(attr)
154 if namespace is None: 164 if namespace is None:
155 new_attrs[attr] = value 165 new_attrs[attr] = value
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
189 self.nsmaps.pop() 199 self.nsmaps.pop()
190 200
191 def pi(self, target, data): 201 def pi(self, target, data):
192 pass 202 self.soup.endData()
203 self.soup.handle_data(target + ' ' + data)
204 self.soup.endData(ProcessingInstruction)
193 205
194 def data(self, content): 206 def data(self, content):
195 self.soup.handle_data(content) 207 self.soup.handle_data(content)
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
207 219
208 def test_fragment_to_document(self, fragment): 220 def test_fragment_to_document(self, fragment):
209 """See `TreeBuilder`.""" 221 """See `TreeBuilder`."""
210 return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment 222 return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
211 223
212 224
213class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 225class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
214 226
215 features = [LXML, HTML, FAST, PERMISSIVE] 227 NAME = LXML
228 ALTERNATE_NAMES = ["lxml-html"]
229
230 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
216 is_xml = False 231 is_xml = False
217 232
218 def default_parser(self, encoding): 233 def default_parser(self, encoding):
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
224 self.parser = self.parser_for(encoding) 239 self.parser = self.parser_for(encoding)
225 self.parser.feed(markup) 240 self.parser.feed(markup)
226 self.parser.close() 241 self.parser.close()
227 except (UnicodeDecodeError, LookupError, etree.ParserError), e: 242 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
228 raise ParserRejectedMarkup(str(e)) 243 raise ParserRejectedMarkup(str(e))
229 244
230 245
231 def test_fragment_to_document(self, fragment): 246 def test_fragment_to_document(self, fragment):
232 """See `TreeBuilder`.""" 247 """See `TreeBuilder`."""
233 return u'<html><body>%s</body></html>' % fragment 248 return '<html><body>%s</body></html>' % fragment