diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/_html5lib.py')
-rw-r--r-- | bitbake/lib/bs4/builder/_html5lib.py | 285 |
1 files changed, 285 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py new file mode 100644 index 0000000000..7de36ae75e --- /dev/null +++ b/bitbake/lib/bs4/builder/_html5lib.py | |||
@@ -0,0 +1,285 @@ | |||
1 | __all__ = [ | ||
2 | 'HTML5TreeBuilder', | ||
3 | ] | ||
4 | |||
5 | import warnings | ||
6 | from bs4.builder import ( | ||
7 | PERMISSIVE, | ||
8 | HTML, | ||
9 | HTML_5, | ||
10 | HTMLTreeBuilder, | ||
11 | ) | ||
12 | from bs4.element import NamespacedAttribute | ||
13 | import html5lib | ||
14 | from html5lib.constants import namespaces | ||
15 | from bs4.element import ( | ||
16 | Comment, | ||
17 | Doctype, | ||
18 | NavigableString, | ||
19 | Tag, | ||
20 | ) | ||
21 | |||
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Tree builder that hands the actual parsing over to html5lib."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Remember the requested encoding and offer the markup untouched.

        Yields exactly one ``(markup, None, None, False)`` tuple; the
        encoding is only stashed on the builder so that `feed` can pass
        it to html5lib later.
        """
        self.user_specified_encoding = user_specified_encoding
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, building the tree into self.soup.

        After parsing, ``original_encoding`` is recorded on the object
        returned by html5lib's ``parse()``.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")

        html_parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        document = html_parser.parse(
            markup, encoding=self.user_specified_encoding)

        # Record the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # html5lib reports UTF-8 whenever it is given Unicode input,
            # which says nothing about the source, so record "unknown".
            detected_encoding = None
        else:
            detected_encoding = html_parser.tokenizer.stream.charEncoding[0]
        document.original_encoding = detected_encoding

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory passed to html5lib; builds and remembers the bridge object."""
        builder = TreeBuilderForHtml5lib(self.soup, namespaceHTMLElements)
        self.underlying_builder = builder
        return builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        template = u'<html><head></head><body>%s</body></html>'
        return template % fragment
55 | |||
56 | |||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that constructs Beautiful Soup objects.

    html5lib drives this class through its tree-builder hooks
    (``documentClass``, ``elementClass``, ...); each hook creates the
    matching Beautiful Soup object and wraps it in an `Element` or
    `TextNode` so html5lib can manipulate it.
    """

    def __init__(self, soup, namespaceHTMLElements):
        """Store the target soup and initialise the html5lib base class."""
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Create a Doctype from an html5lib doctype token and add it to the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag in the soup and return it wrapped in an Element."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap a new Comment holding `data` in a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace self.soup with a fresh, empty soup used as a fragment root."""
        # Imported here rather than at module level: this module never
        # imports BeautifulSoup, so without this line the call below
        # raises NameError. A function-scope import also avoids importing
        # the bs4 package while it may still be importing its builders.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the wrapped object of `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup that was built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind html5lib's fragment node."""
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
96 | |||
class AttrList(object):
    """Mapping-like view of an element's attributes.

    Reads (`__getitem__`, `__contains__`, `__len__`, `keys`, `items`,
    iteration) are answered from a copy of ``element.attrs`` taken when
    the AttrList is created. Writes go straight to the wrapped element
    via ``element[name] = value`` and do not refresh that copy.
    """

    def __init__(self, element):
        self.element = element
        # Snapshot of the attributes at construction time.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs of the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set an attribute on the wrapped element (not on the snapshot)."""
        self.element[name] = value

    def items(self):
        """Return the snapshot's ``(name, value)`` pairs as a list."""
        return list(self.attrs.items())

    def keys(self):
        """Return the snapshot's attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        """Return the snapshot value for `name`; raises KeyError if absent."""
        return self.attrs[name]

    def __contains__(self, name):
        # Direct dict membership; no need to build a list of keys first.
        return name in self.attrs
116 | |||
117 | |||
class Element(html5lib.treebuilders._base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    `element` is the wrapped Beautiful Soup object, `soup` is the soup
    used to create new strings, and `namespace` is the namespace that
    html5lib supplied for this node (None for the document node).
    """

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of the wrapped element.

        `node` may be a plain string, a bare Tag, or an Element/TextNode
        wrapper. A string appended directly after a NavigableString is
        merged into it instead of creating a second text node.
        """
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, basestring) and child.parent is not None:
            # Detach the object being appended. `child` is node.element
            # for wrappers and the Tag itself when a bare Tag was passed
            # in (a bare Tag has no wrapper `.element` to call this on).
            child.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return a fresh AttrList view of the wrapped element's attributes."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced, in the passed-in mapping itself, by
        NamespacedAttribute objects. None or an empty mapping is a no-op.
        """
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy: the mapping is modified inside the loop.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert the string `data`, before `insertBefore` if given, else last."""
        if insertBefore:
            # insertBefore() reads node.element, so the raw string must be
            # wrapped in a TextNode before it is handed over.
            text = TextNode(self.soup.new_string(data), self.soup)
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the existing child `refNode`.

        A NavigableString inserted right after another NavigableString is
        merged into that preceding string instead.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one: with
        # index == 0, contents[index-1] would wrap around to the *last*
        # child and merge the text into the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        # NOTE(review): this only rewires the moved children's own
        # pointers; the new parent's previous last child/descendant are
        # not pointed at the first moved child -- confirm whether callers
        # rely on those links.
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            first_child.previous_element = new_parents_last_descendant
            first_child.previous_sibling = new_parents_last_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents list (truthy if non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, using the HTML namespace when unset."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
277 | |||
class TextNode(Element):
    """Element variant for string-like objects; it has no name or namespace."""

    def __init__(self, element, soup):
        # Bypass Element.__init__ on purpose: the base Node is
        # initialised with a name of None and no namespace is stored.
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError