summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder')
-rw-r--r--bitbake/lib/bs4/builder/__init__.py321
-rw-r--r--bitbake/lib/bs4/builder/_html5lib.py285
-rw-r--r--bitbake/lib/bs4/builder/_htmlparser.py258
-rw-r--r--bitbake/lib/bs4/builder/_lxml.py233
4 files changed, 1097 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
new file mode 100644
index 0000000000..740f5f29cd
--- /dev/null
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -0,0 +1,321 @@
1from collections import defaultdict
2import itertools
3import sys
4from bs4.element import (
5 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue,
7 whitespace_re
8 )
9
10__all__ = [
11 'HTMLTreeBuilder',
12 'SAXTreeBuilder',
13 'TreeBuilder',
14 'TreeBuilderRegistry',
15 ]
16
# Feature labels a TreeBuilder can list in its `features` attribute.
FAST, PERMISSIVE, STRICT = 'fast', 'permissive', 'strict'
XML, HTML, HTML_5 = 'xml', 'html', 'html5'
24
25
class TreeBuilderRegistry(object):
    """Index of tree builder classes, keyed by the features they advertise.

    The most recently registered builder always sits at the front of
    both the overall list and each per-feature list, so later
    registrations win over earlier ones.
    """

    def __init__(self):
        # feature name -> builder classes offering it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the best builder class for the requested features.

        With no features, the most recently registered builder is
        returned. Otherwise features are processed in the order given:
        the first feature that any builder offers fixes the preference
        order, and each later feature that any builder offers narrows
        the set of acceptable builders. Features no builder offers are
        ignored. Returns None when nothing is registered or nothing
        matches.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # No particular feature requested: newest builder wins.
            return self.builders[0]

        ranked = None       # preference order, fixed by the first known feature
        acceptable = None   # builders that offer every known feature so far
        for feature in features:
            # .get() keeps the defaultdict from growing an empty entry.
            providers = self.builders_for_feature.get(feature, [])
            if not providers:
                continue
            if ranked is None:
                ranked = providers
                acceptable = set(providers)
            else:
                acceptable = acceptable.intersection(set(providers))

        if acceptable is None:
            return None
        # Walk the preference order and take the first survivor.
        for builder in ranked:
            if builder in acceptable:
                return builder
        return None


# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
79
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    # Feature names this builder advertises.
    features = []

    is_xml = False
    preserve_whitespace_tags = set()

    # None means "no opinion": can_be_empty_element() then answers True
    # for every tag name.
    empty_element_tags = None

    # Maps a tag name (or '*' for every tag) to the attribute names
    # whose string value should be split into a list of values.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Do nothing; present so callers can reset any builder."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        When `empty_element_tags` is None the builder has no opinion
        and every tag qualifies; whether such a tag is actually
        rendered as "<foo />" then depends on whether it has contents.
        Otherwise only names listed in `empty_element_tags` qualify,
        so for example an HTML builder renders an empty <p> tag as
        "<p></p>" rather than "<p />".
        """
        allowed = self.empty_element_tags
        return True if allowed is None else tag_name in allowed

    def feed(self, markup):
        """Parse `markup`; concrete builders must override this."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return `markup` untouched as (markup, None, None, False)."""
        return (markup, None, None, False)

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Parsers differ in what they add around a fragment (lxml
        introduces an empty <head> tag, html5lib doesn't), so tests
        call this to get comparable output. The default returns the
        fragment unchanged.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Report that no substitution was set up for `tag`."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and also returns it.
        """
        if not attrs or not self.cdata_list_attributes:
            return attrs

        for_every_tag = self.cdata_list_attributes.get('*', [])
        for_this_tag = self.cdata_list_attributes.get(tag_name.lower(), None)
        for attr in attrs.keys():
            listed = attr in for_every_tag or (
                for_this_tag and attr in for_this_tag)
            if not listed:
                continue
            current = attrs[attr]
            if isinstance(current, basestring):
                # A "class"-type attribute: split the string with
                # whitespace_re into a list of values.
                current = whitespace_re.split(current)
            # Otherwise the value is not a string (html5lib sometimes
            # calls setAttributes twice for the same tag, so it may
            # already be a list); store it back unchanged.
            attrs[attr] = current
        return attrs
174
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        """Parse `markup`; concrete subclasses must override this."""
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward a start tag to the soup.

        Each key of `attrs` is indexed with [1] to get the attribute
        name -- presumably keys are (namespace, name) pairs; verify
        against the caller.
        """
        plain_attrs = {}
        for key, value in list(attrs.items()):
            plain_attrs[key[1]] = value
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        """Forward an end tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
219
220
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags, which tags preserve
    whitespace, and which attributes hold a list of values.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    def set_up_substitutions(self, tag):
        """Swap a <meta> tag's encoding declaration for a stand-in object.

        HTML 5-style tags (<meta charset="utf8">) get their 'charset'
        value replaced with a CharsetMetaAttributeValue. HTML 4-style
        tags (<meta http-equiv="content-type" content="text/html;
        charset=utf8">) get their 'content' value replaced with a
        ContentMetaAttributeValue. Any other tag is left alone.

        Returns True only when a 'charset' attribute was replaced.
        NOTE(review): the HTML 4 branch installs a stand-in but still
        returns False because it never sets meta_encoding -- confirm
        that callers expect this.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
287
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`. Names
    that refer to anything else are skipped.
    """
    # Look this module up under whatever name it was imported as,
    # rather than assuming it is always 'bs4.builder'.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when handed a non-class, so
        # check that we have a class before asking.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
300
class ParserRejectedMarkup(Exception):
    """Exception type made available to the tree builder modules.

    It adds nothing to Exception.
    """
303
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)

# Only the import is guarded: an ImportError means the optional parser
# is missing, and registration runs only when the import succeeded.
try:
    from . import _html5lib
except ImportError:
    # They don't have html5lib installed.
    pass
else:
    register_treebuilders_from(_html5lib)

try:
    from . import _lxml
except ImportError:
    # They don't have lxml installed.
    pass
else:
    register_treebuilders_from(_lxml)
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
new file mode 100644
index 0000000000..7de36ae75e
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -0,0 +1,285 @@
1__all__ = [
2 'HTML5TreeBuilder',
3 ]
4
5import warnings
6from bs4.builder import (
7 PERMISSIVE,
8 HTML,
9 HTML_5,
10 HTMLTreeBuilder,
11 )
12from bs4.element import NamespacedAttribute
13import html5lib
14from html5lib.constants import namespaces
15from bs4.element import (
16 Comment,
17 Doctype,
18 NavigableString,
19 Tag,
20 )
21
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield one (markup, None, None, False) tuple for `markup`.

        The markup is passed through untouched. The user-specified
        encoding is remembered so feed() can hand it to html5lib.
        `document_declared_encoding` is accepted for signature
        compatibility with the other builders and is ignored.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib and record the original encoding."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Build, remember and return the html5lib-facing tree builder."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
55
56
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from the token and hand it to the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag in the soup and return it wrapped."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a TextNode wrapping a Comment built from `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with a fresh empty one and wrap it."""
        # BeautifulSoup is not imported at module level here, so bring
        # it into scope locally; without this the call below raises
        # NameError.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind html5lib's fragment."""
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
96
class AttrList(object):
    """Dict-like view of an element's attributes.

    Reads come from a copy of `element.attrs` taken at construction
    time. Writes go to the element itself via `element[name] = value`
    and do not update that copy.
    """

    def __init__(self, element):
        self.element = element
        # Snapshot, so later changes to the element don't show up here.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over (name, value) pairs of the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set the attribute on the underlying element."""
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
116
117
class Element(html5lib.treebuilders._base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    `element` is the wrapped Beautiful Soup object, `soup` is the soup
    used to create new strings and tags, and `namespace` is the
    namespace reported through `nameTuple`.
    """

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` to the wrapped element.

        `node` may be a plain string, a Tag, or a wrapper object with
        an `.element` attribute. A string appended right after an
        existing NavigableString is merged into it instead of being
        added as a second string.
        """
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, basestring) and child.parent is not None:
            # Detach the child from its current parent. Use `child`
            # rather than `node.element`: when `node` is a bare Tag it
            # has no `.element` wrapper attribute.
            child.extract()

        if (string_child and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return an AttrList over the wrapped element."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced in `attributes` with NamespacedAttribute
        objects and list-valued attributes are split, so the mapping
        passed in is modified in place. Nothing happens when
        `attributes` is None or empty.
        """
        if attributes is not None and len(attributes) > 0:
            # Iterate over a copy because keys are swapped as we go.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert the string `data`, before `insertBefore` if given."""
        if insertBefore:
            # insertBefore() reads `.element` from its first argument,
            # so hand it the wrapped text node, not the raw string.
            text = TextNode(self.soup.new_string(data), self.soup)
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the child `refNode`."""
        index = self.element.index(refNode.element)
        # Only merge when there really is a preceding sibling: at
        # index 0, contents[index-1] would wrap around to the last
        # child.
        if (node.element.__class__ == NavigableString and index > 0
                and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            first_child.previous_element = new_parents_last_descendant
            first_child.previous_sibling = new_parents_last_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents (truthy if non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return (namespace, name), defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
277
class TextNode(Element):
    """Element wrapper for a text-like object; it cannot be cloned."""

    def __init__(self, element, soup):
        # Initialise the html5lib Node directly with no name, skipping
        # Element.__init__.
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        raise NotImplementedError
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000000..ca8d8b892b
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -0,0 +1,258 @@
1"""Use the HTMLParser library to parse HTML files that aren't too bad."""
2
3__all__ = [
4 'HTMLParserTreeBuilder',
5 ]
6
7from HTMLParser import (
8 HTMLParser,
9 HTMLParseError,
10 )
11import sys
12import warnings
13
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
# True on Python 3.2.3 and anything newer.
CONSTRUCTOR_TAKES_STRICT = (major, minor, release) >= (3, 2, 3)
26
27from bs4.element import (
28 CData,
29 Comment,
30 Declaration,
31 Doctype,
32 ProcessingInstruction,
33 )
34from bs4.dammit import EntitySubstitution, UnicodeDammit
35
36from bs4.builder import (
37 HTML,
38 HTMLTreeBuilder,
39 STRICT,
40 )
41
42
HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to `self.soup`.

    The `soup` attribute is not set by this class; it must be assigned
    before markup is fed to the parser.
    """

    def handle_starttag(self, name, attrs):
        """Forward a start tag, passing its attributes as a dict."""
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into character data.

        `name` is decimal, or hexadecimal when it starts with 'x' or
        'X'. A code point that unichr() rejects becomes U+FFFD.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Turn a named entity into its character, if it is known.

        Unknown entity names are passed through as "&name;".
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Forward a declaration as a Doctype, minus the DOCTYPE keyword."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Forward a CDATA section as CData, anything else as Declaration."""
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
126
127
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Build a tree with BeautifulSoupHTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Remember the arguments to pass to each parser that is created."""
        # Start with no soup attached, as TreeBuilder.__init__ does.
        self.soup = None
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            # Already Unicode; nothing to detect or convert.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run `markup` through a new BeautifulSoupHTMLParser.

        If the parser raises HTMLParseError, a RuntimeWarning is
        issued and the same exception is re-raised.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback.
            raise
165
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at offset `i` of self.rawdata.

        Returns the offset just past the tag, or the negative value
        from check_for_whole_start_tag() when the tag is incomplete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        pos = match.end()
        self.lasttag = tag = rawdata[i+1:pos].lower()
        while pos < endpos:
            matcher = attrfind if self.strict else attrfind_tolerant
            found = matcher.match(rawdata, pos)
            if not found:
                break
            attrname, rest, attrvalue = found.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] in ('\'', '"')
                  and attrvalue[:1] == attrvalue[-1:]):
                # Strip a matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            pos = found.end()

        tail = rawdata[pos:endpos].strip()
        if tail not in (">", "/>"):
            # Junk after the attributes: report it in strict mode and
            # pass the whole tag through as character data.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                    - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[pos:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if tail.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Look only for the closing tag of `elem` from now on."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch in place the constructor can be given 'strict'.
    CONSTRUCTOR_TAKES_STRICT = True
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
new file mode 100644
index 0000000000..fa5d49875e
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -0,0 +1,233 @@
1__all__ = [
2 'LXMLTreeBuilderForXML',
3 'LXMLTreeBuilder',
4 ]
5
6from io import BytesIO
7from StringIO import StringIO
8import collections
9from lxml import etree
10from bs4.element import Comment, Doctype, NamespacedAttribute
11from bs4.builder import (
12 FAST,
13 HTML,
14 HTMLTreeBuilder,
15 PERMISSIVE,
16 ParserRejectedMarkup,
17 TreeBuilder,
18 XML)
19from bs4.dammit import EncodingDetector
20
# Feature name listed in the `features` of both lxml-based tree
# builders defined in this module.
LXML = 'lxml'
22
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that parses XML with lxml.

    The builder hands itself to the lxml parser as its ``target``; the
    ``start``/``end``/``data``/``comment``/``doctype``/``pi`` methods
    below receive the parse events and forward them to ``self.soup``.
    """
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Amount of markup read from the input and fed to the parser per
    # call in feed().
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    def default_parser(self, encoding):
        """Return the parser, or parser class, to use for `encoding`.

        This can either return a parser object or a class, which
        will be instantiated with default arguments by parser_for().
        A parser supplied to the constructor takes precedence.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Return a ready-to-use parser object for `encoding`."""
        # Use the default parser.
        parser = self.default_parser(encoding)

        # The builtin callable() works on Python 2 and 3;
        # collections.Callable no longer exists on Python 3.10+.
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

    def __init__(self, parser=None, empty_element_tags=None):
        """Set up the builder.

        :param parser: Optional parser object or parser class returned
            by default_parser() instead of a fresh etree.XMLParser.
        :param empty_element_tags: Optional iterable of tag names; when
            given it is stored as a set on ``self.empty_element_tags``.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        """Split an lxml ``{namespace}name`` tag into (namespace, name).

        A tag without a leading ``{`` yields ``(None, tag)``.
        """
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        # Slicing (rather than indexing) keeps an empty tag from raising.
        if tag[:1] == '{':
            return tuple(tag[1:].split('}', 1))
        return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)

        Each 4-tuple represents a strategy for parsing the document.
        """
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not (the caller asked for another
            # strategy). Convert the Unicode to UTF-8 and tell lxml to
            # parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(markup, try_encodings, is_html)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Parse `markup`, feeding it to the parser CHUNK_SIZE at a time.

        `markup` may be a byte string, a Unicode string, or any object
        with a ``read(size)`` method.

        :raise ParserRejectedMarkup: if the parser raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, unicode):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while data:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if data:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def close(self):
        """Reset the namespace stack to just the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap={}):
        """Handle an opening tag reported by the parser.

        :param name: Tag name, possibly in ``{namespace}name`` form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix to namespace URL newly declared
            on this tag. It is only read, never mutated, so the shared
            default dict is harmless.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            #
            # The len(nsmap) == 0 test matters: without it, namespaces
            # declared on a tag nested inside an already-namespaced
            # tag would never be recorded.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is not None:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
            new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search innermost mappings first; None entries are
        # placeholders pushed for tags that declared no namespaces.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag reported by the parser."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Ignore processing instructions."""
        pass

    def data(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Record a doctype declaration as a Doctype object."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
211
212
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Tree builder that parses HTML with lxml's HTMLParser."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        """Return the parser class to use; `encoding` is not consulted.

        The class itself is returned, not an instance.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Parse `markup` in a single call to the parser's feed().

        :raise ParserRejectedMarkup: if the parser raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment