summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder/_html5lib.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder/_html5lib.py')
-rw-r--r--bitbake/lib/bs4/builder/_html5lib.py251
1 files changed, 197 insertions, 54 deletions
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 9e9216ef9c..7c46a85118 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -1,9 +1,14 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1__all__ = [ 4__all__ = [
2 'HTML5TreeBuilder', 5 'HTML5TreeBuilder',
3 ] 6 ]
4 7
5import warnings 8import warnings
9import re
6from bs4.builder import ( 10from bs4.builder import (
11 DetectsXMLParsedAsHTML,
7 PERMISSIVE, 12 PERMISSIVE,
8 HTML, 13 HTML,
9 HTML_5, 14 HTML_5,
@@ -11,17 +16,13 @@ from bs4.builder import (
11 ) 16 )
12from bs4.element import ( 17from bs4.element import (
13 NamespacedAttribute, 18 NamespacedAttribute,
14 whitespace_re, 19 nonwhitespace_re,
15) 20)
16import html5lib 21import html5lib
17try: 22from html5lib.constants import (
18 # html5lib >= 0.99999999/1.0b9 23 namespaces,
19 from html5lib.treebuilders import base as treebuildersbase 24 prefixes,
20except ImportError: 25 )
21 # html5lib <= 0.9999999/1.0b8
22 from html5lib.treebuilders import _base as treebuildersbase
23from html5lib.constants import namespaces
24
25from bs4.element import ( 26from bs4.element import (
26 Comment, 27 Comment,
27 Doctype, 28 Doctype,
@@ -29,13 +30,37 @@ from bs4.element import (
29 Tag, 30 Tag,
30 ) 31 )
31 32
33try:
34 # Pre-0.99999999
35 from html5lib.treebuilders import _base as treebuilder_base
36 new_html5lib = False
37except ImportError as e:
38 # 0.99999999 and up
39 from html5lib.treebuilders import base as treebuilder_base
40 new_html5lib = True
41
32class HTML5TreeBuilder(HTMLTreeBuilder): 42class HTML5TreeBuilder(HTMLTreeBuilder):
33 """Use html5lib to build a tree.""" 43 """Use html5lib to build a tree.
44
45 Note that this TreeBuilder does not support some features common
46 to HTML TreeBuilders. Some of these features could theoretically
47 be implemented, but at the very least it's quite difficult,
48 because html5lib moves the parse tree around as it's being built.
49
50 * This TreeBuilder doesn't use different subclasses of NavigableString
51 based on the name of the tag in which the string was found.
52
53 * You can't use a SoupStrainer to parse only part of a document.
54 """
34 55
35 NAME = "html5lib" 56 NAME = "html5lib"
36 57
37 features = [NAME, PERMISSIVE, HTML_5, HTML] 58 features = [NAME, PERMISSIVE, HTML_5, HTML]
38 59
60 # html5lib can tell us which line number and position in the
61 # original file is the source of an element.
62 TRACKS_LINE_NUMBERS = True
63
39 def prepare_markup(self, markup, user_specified_encoding, 64 def prepare_markup(self, markup, user_specified_encoding,
40 document_declared_encoding=None, exclude_encodings=None): 65 document_declared_encoding=None, exclude_encodings=None):
41 # Store the user-specified encoding for use later on. 66 # Store the user-specified encoding for use later on.
@@ -45,27 +70,56 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
45 # ATM because the html5lib TreeBuilder doesn't use 70 # ATM because the html5lib TreeBuilder doesn't use
46 # UnicodeDammit. 71 # UnicodeDammit.
47 if exclude_encodings: 72 if exclude_encodings:
48 warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 73 warnings.warn(
74 "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
75 stacklevel=3
76 )
77
78 # html5lib only parses HTML, so if it's given XML that's worth
79 # noting.
80 DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
81 markup, stacklevel=3
82 )
83
49 yield (markup, None, None, False) 84 yield (markup, None, None, False)
50 85
51 # These methods are defined by Beautiful Soup. 86 # These methods are defined by Beautiful Soup.
52 def feed(self, markup): 87 def feed(self, markup):
53 if self.soup.parse_only is not None: 88 if self.soup.parse_only is not None:
54 warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 89 warnings.warn(
90 "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
91 stacklevel=4
92 )
55 parser = html5lib.HTMLParser(tree=self.create_treebuilder) 93 parser = html5lib.HTMLParser(tree=self.create_treebuilder)
56 doc = parser.parse(markup, encoding=self.user_specified_encoding) 94 self.underlying_builder.parser = parser
57 95 extra_kwargs = dict()
96 if not isinstance(markup, str):
97 if new_html5lib:
98 extra_kwargs['override_encoding'] = self.user_specified_encoding
99 else:
100 extra_kwargs['encoding'] = self.user_specified_encoding
101 doc = parser.parse(markup, **extra_kwargs)
102
58 # Set the character encoding detected by the tokenizer. 103 # Set the character encoding detected by the tokenizer.
59 if isinstance(markup, str): 104 if isinstance(markup, str):
60 # We need to special-case this because html5lib sets 105 # We need to special-case this because html5lib sets
61 # charEncoding to UTF-8 if it gets Unicode input. 106 # charEncoding to UTF-8 if it gets Unicode input.
62 doc.original_encoding = None 107 doc.original_encoding = None
63 else: 108 else:
64 doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 109 original_encoding = parser.tokenizer.stream.charEncoding[0]
65 110 if not isinstance(original_encoding, str):
111 # In 0.99999999 and up, the encoding is an html5lib
112 # Encoding object. We want to use a string for compatibility
113 # with other tree builders.
114 original_encoding = original_encoding.name
115 doc.original_encoding = original_encoding
116 self.underlying_builder.parser = None
117
66 def create_treebuilder(self, namespaceHTMLElements): 118 def create_treebuilder(self, namespaceHTMLElements):
67 self.underlying_builder = TreeBuilderForHtml5lib( 119 self.underlying_builder = TreeBuilderForHtml5lib(
68 self.soup, namespaceHTMLElements) 120 namespaceHTMLElements, self.soup,
121 store_line_numbers=self.store_line_numbers
122 )
69 return self.underlying_builder 123 return self.underlying_builder
70 124
71 def test_fragment_to_document(self, fragment): 125 def test_fragment_to_document(self, fragment):
@@ -73,12 +127,30 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
73 return '<html><head></head><body>%s</body></html>' % fragment 127 return '<html><head></head><body>%s</body></html>' % fragment
74 128
75 129
76class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder): 130class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
77 131
78 def __init__(self, soup, namespaceHTMLElements): 132 def __init__(self, namespaceHTMLElements, soup=None,
79 self.soup = soup 133 store_line_numbers=True, **kwargs):
134 if soup:
135 self.soup = soup
136 else:
137 from bs4 import BeautifulSoup
138 # TODO: Why is the parser 'html.parser' here? To avoid an
139 # infinite loop?
140 self.soup = BeautifulSoup(
141 "", "html.parser", store_line_numbers=store_line_numbers,
142 **kwargs
143 )
144 # TODO: What are **kwargs exactly? Should they be passed in
145 # here in addition to/instead of being passed to the BeautifulSoup
146 # constructor?
80 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 147 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
81 148
149 # This will be set later to an html5lib.html5parser.HTMLParser
150 # object, which we can use to track the current line number.
151 self.parser = None
152 self.store_line_numbers = store_line_numbers
153
82 def documentClass(self): 154 def documentClass(self):
83 self.soup.reset() 155 self.soup.reset()
84 return Element(self.soup, self.soup, None) 156 return Element(self.soup, self.soup, None)
@@ -92,14 +164,26 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
92 self.soup.object_was_parsed(doctype) 164 self.soup.object_was_parsed(doctype)
93 165
94 def elementClass(self, name, namespace): 166 def elementClass(self, name, namespace):
95 tag = self.soup.new_tag(name, namespace) 167 kwargs = {}
168 if self.parser and self.store_line_numbers:
169 # This represents the point immediately after the end of the
170 # tag. We don't know when the tag started, but we do know
171 # where it ended -- the character just before this one.
172 sourceline, sourcepos = self.parser.tokenizer.stream.position()
173 kwargs['sourceline'] = sourceline
174 kwargs['sourcepos'] = sourcepos-1
175 tag = self.soup.new_tag(name, namespace, **kwargs)
176
96 return Element(tag, self.soup, namespace) 177 return Element(tag, self.soup, namespace)
97 178
98 def commentClass(self, data): 179 def commentClass(self, data):
99 return TextNode(Comment(data), self.soup) 180 return TextNode(Comment(data), self.soup)
100 181
101 def fragmentClass(self): 182 def fragmentClass(self):
102 self.soup = BeautifulSoup("") 183 from bs4 import BeautifulSoup
184 # TODO: Why is the parser 'html.parser' here? To avoid an
185 # infinite loop?
186 self.soup = BeautifulSoup("", "html.parser")
103 self.soup.name = "[document_fragment]" 187 self.soup.name = "[document_fragment]"
104 return Element(self.soup, self.soup, None) 188 return Element(self.soup, self.soup, None)
105 189
@@ -111,7 +195,57 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
111 return self.soup 195 return self.soup
112 196
113 def getFragment(self): 197 def getFragment(self):
114 return treebuildersbase.TreeBuilder.getFragment(self).element 198 return treebuilder_base.TreeBuilder.getFragment(self).element
199
200 def testSerializer(self, element):
201 from bs4 import BeautifulSoup
202 rv = []
203 doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
204
205 def serializeElement(element, indent=0):
206 if isinstance(element, BeautifulSoup):
207 pass
208 if isinstance(element, Doctype):
209 m = doctype_re.match(element)
210 if m:
211 name = m.group(1)
212 if m.lastindex > 1:
213 publicId = m.group(2) or ""
214 systemId = m.group(3) or m.group(4) or ""
215 rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
216 (' ' * indent, name, publicId, systemId))
217 else:
218 rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
219 else:
220 rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
221 elif isinstance(element, Comment):
222 rv.append("|%s<!-- %s -->" % (' ' * indent, element))
223 elif isinstance(element, NavigableString):
224 rv.append("|%s\"%s\"" % (' ' * indent, element))
225 else:
226 if element.namespace:
227 name = "%s %s" % (prefixes[element.namespace],
228 element.name)
229 else:
230 name = element.name
231 rv.append("|%s<%s>" % (' ' * indent, name))
232 if element.attrs:
233 attributes = []
234 for name, value in list(element.attrs.items()):
235 if isinstance(name, NamespacedAttribute):
236 name = "%s %s" % (prefixes[name.namespace], name.name)
237 if isinstance(value, list):
238 value = " ".join(value)
239 attributes.append((name, value))
240
241 for name, value in sorted(attributes):
242 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
243 indent += 2
244 for child in element.children:
245 serializeElement(child, indent)
246 serializeElement(element, 0)
247
248 return "\n".join(rv)
115 249
116class AttrList(object): 250class AttrList(object):
117 def __init__(self, element): 251 def __init__(self, element):
@@ -122,14 +256,14 @@ class AttrList(object):
122 def __setitem__(self, name, value): 256 def __setitem__(self, name, value):
123 # If this attribute is a multi-valued attribute for this element, 257 # If this attribute is a multi-valued attribute for this element,
124 # turn its value into a list. 258 # turn its value into a list.
125 list_attr = HTML5TreeBuilder.cdata_list_attributes 259 list_attr = self.element.cdata_list_attributes or {}
126 if (name in list_attr['*'] 260 if (name in list_attr.get('*', [])
127 or (self.element.name in list_attr 261 or (self.element.name in list_attr
128 and name in list_attr[self.element.name])): 262 and name in list_attr.get(self.element.name, []))):
129 # A node that is being cloned may have already undergone 263 # A node that is being cloned may have already undergone
130 # this procedure. 264 # this procedure.
131 if not isinstance(value, list): 265 if not isinstance(value, list):
132 value = whitespace_re.split(value) 266 value = nonwhitespace_re.findall(value)
133 self.element[name] = value 267 self.element[name] = value
134 def items(self): 268 def items(self):
135 return list(self.attrs.items()) 269 return list(self.attrs.items())
@@ -143,9 +277,9 @@ class AttrList(object):
143 return name in list(self.attrs.keys()) 277 return name in list(self.attrs.keys())
144 278
145 279
146class Element(treebuildersbase.Node): 280class Element(treebuilder_base.Node):
147 def __init__(self, element, soup, namespace): 281 def __init__(self, element, soup, namespace):
148 treebuildersbase.Node.__init__(self, element.name) 282 treebuilder_base.Node.__init__(self, element.name)
149 self.element = element 283 self.element = element
150 self.soup = soup 284 self.soup = soup
151 self.namespace = namespace 285 self.namespace = namespace
@@ -164,13 +298,15 @@ class Element(treebuildersbase.Node):
164 child = node 298 child = node
165 elif node.element.__class__ == NavigableString: 299 elif node.element.__class__ == NavigableString:
166 string_child = child = node.element 300 string_child = child = node.element
301 node.parent = self
167 else: 302 else:
168 child = node.element 303 child = node.element
304 node.parent = self
169 305
170 if not isinstance(child, str) and child.parent is not None: 306 if not isinstance(child, str) and child.parent is not None:
171 node.element.extract() 307 node.element.extract()
172 308
173 if (string_child and self.element.contents 309 if (string_child is not None and self.element.contents
174 and self.element.contents[-1].__class__ == NavigableString): 310 and self.element.contents[-1].__class__ == NavigableString):
175 # We are appending a string onto another string. 311 # We are appending a string onto another string.
176 # TODO This has O(n^2) performance, for input like 312 # TODO This has O(n^2) performance, for input like
@@ -203,12 +339,12 @@ class Element(treebuildersbase.Node):
203 most_recent_element=most_recent_element) 339 most_recent_element=most_recent_element)
204 340
205 def getAttributes(self): 341 def getAttributes(self):
342 if isinstance(self.element, Comment):
343 return {}
206 return AttrList(self.element) 344 return AttrList(self.element)
207 345
208 def setAttributes(self, attributes): 346 def setAttributes(self, attributes):
209
210 if attributes is not None and len(attributes) > 0: 347 if attributes is not None and len(attributes) > 0:
211
212 converted_attributes = [] 348 converted_attributes = []
213 for name, value in list(attributes.items()): 349 for name, value in list(attributes.items()):
214 if isinstance(name, tuple): 350 if isinstance(name, tuple):
@@ -230,11 +366,11 @@ class Element(treebuildersbase.Node):
230 attributes = property(getAttributes, setAttributes) 366 attributes = property(getAttributes, setAttributes)
231 367
232 def insertText(self, data, insertBefore=None): 368 def insertText(self, data, insertBefore=None):
369 text = TextNode(self.soup.new_string(data), self.soup)
233 if insertBefore: 370 if insertBefore:
234 text = TextNode(self.soup.new_string(data), self.soup) 371 self.insertBefore(text, insertBefore)
235 self.insertBefore(data, insertBefore)
236 else: 372 else:
237 self.appendChild(data) 373 self.appendChild(text)
238 374
239 def insertBefore(self, node, refNode): 375 def insertBefore(self, node, refNode):
240 index = self.element.index(refNode.element) 376 index = self.element.index(refNode.element)
@@ -253,9 +389,10 @@ class Element(treebuildersbase.Node):
253 389
254 def reparentChildren(self, new_parent): 390 def reparentChildren(self, new_parent):
255 """Move all of this tag's children into another tag.""" 391 """Move all of this tag's children into another tag."""
256 # print "MOVE", self.element.contents 392 # print("MOVE", self.element.contents)
257 # print "FROM", self.element 393 # print("FROM", self.element)
258 # print "TO", new_parent.element 394 # print("TO", new_parent.element)
395
259 element = self.element 396 element = self.element
260 new_parent_element = new_parent.element 397 new_parent_element = new_parent.element
261 # Determine what this tag's next_element will be once all the children 398 # Determine what this tag's next_element will be once all the children
@@ -274,29 +411,35 @@ class Element(treebuildersbase.Node):
274 new_parents_last_descendant_next_element = new_parent_element.next_element 411 new_parents_last_descendant_next_element = new_parent_element.next_element
275 412
276 to_append = element.contents 413 to_append = element.contents
277 append_after = new_parent_element.contents
278 if len(to_append) > 0: 414 if len(to_append) > 0:
279 # Set the first child's previous_element and previous_sibling 415 # Set the first child's previous_element and previous_sibling
280 # to elements within the new parent 416 # to elements within the new parent
281 first_child = to_append[0] 417 first_child = to_append[0]
282 if new_parents_last_descendant: 418 if new_parents_last_descendant is not None:
283 first_child.previous_element = new_parents_last_descendant 419 first_child.previous_element = new_parents_last_descendant
284 else: 420 else:
285 first_child.previous_element = new_parent_element 421 first_child.previous_element = new_parent_element
286 first_child.previous_sibling = new_parents_last_child 422 first_child.previous_sibling = new_parents_last_child
287 if new_parents_last_descendant: 423 if new_parents_last_descendant is not None:
288 new_parents_last_descendant.next_element = first_child 424 new_parents_last_descendant.next_element = first_child
289 else: 425 else:
290 new_parent_element.next_element = first_child 426 new_parent_element.next_element = first_child
291 if new_parents_last_child: 427 if new_parents_last_child is not None:
292 new_parents_last_child.next_sibling = first_child 428 new_parents_last_child.next_sibling = first_child
293 429
294 # Fix the last child's next_element and next_sibling 430 # Find the very last element being moved. It is now the
295 last_child = to_append[-1] 431 # parent's last descendant. It has no .next_sibling and
296 last_child.next_element = new_parents_last_descendant_next_element 432 # its .next_element is whatever the previous last
297 if new_parents_last_descendant_next_element: 433 # descendant had.
298 new_parents_last_descendant_next_element.previous_element = last_child 434 last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
299 last_child.next_sibling = None 435
436 last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
437 if new_parents_last_descendant_next_element is not None:
438 # TODO: This code has no test coverage and I'm not sure
439 # how to get html5lib to go through this path, but it's
440 # just the other side of the previous line.
441 new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
442 last_childs_last_descendant.next_sibling = None
300 443
301 for child in to_append: 444 for child in to_append:
302 child.parent = new_parent_element 445 child.parent = new_parent_element
@@ -306,9 +449,9 @@ class Element(treebuildersbase.Node):
306 element.contents = [] 449 element.contents = []
307 element.next_element = final_next_element 450 element.next_element = final_next_element
308 451
309 # print "DONE WITH MOVE" 452 # print("DONE WITH MOVE")
310 # print "FROM", self.element 453 # print("FROM", self.element)
311 # print "TO", new_parent_element 454 # print("TO", new_parent_element)
312 455
313 def cloneNode(self): 456 def cloneNode(self):
314 tag = self.soup.new_tag(self.element.name, self.namespace) 457 tag = self.soup.new_tag(self.element.name, self.namespace)
@@ -321,7 +464,7 @@ class Element(treebuildersbase.Node):
321 return self.element.contents 464 return self.element.contents
322 465
323 def getNameTuple(self): 466 def getNameTuple(self):
324 if self.namespace is None: 467 if self.namespace == None:
325 return namespaces["html"], self.name 468 return namespaces["html"], self.name
326 else: 469 else:
327 return self.namespace, self.name 470 return self.namespace, self.name
@@ -330,7 +473,7 @@ class Element(treebuildersbase.Node):
330 473
331class TextNode(Element): 474class TextNode(Element):
332 def __init__(self, element, soup): 475 def __init__(self, element, soup):
333 treebuildersbase.Node.__init__(self, None) 476 treebuilder_base.Node.__init__(self, None)
334 self.element = element 477 self.element = element
335 self.soup = soup 478 self.soup = soup
336 479