diff options
author | Aníbal Limón <anibal.limon@linux.intel.com> | 2014-11-05 12:10:27 -0600 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2014-11-06 16:45:23 +0000 |
commit | 25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch) | |
tree | 7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/builder/__init__.py | |
parent | bc6330cb7f288e76209410b0812aff1dbfa90950 (diff) | |
download | poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz |
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added Beautifulsoup module because fetch/wget latest_versionstring
method depends on it.
This allows the fetch/wget.py module to search upstream sites for new
package versions.
(Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a)
Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/builder/__init__.py')
-rw-r--r-- | bitbake/lib/bs4/builder/__init__.py | 321 |
1 files changed, 321 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py new file mode 100644 index 0000000000..740f5f29cd --- /dev/null +++ b/bitbake/lib/bs4/builder/__init__.py | |||
@@ -0,0 +1,321 @@ | |||
1 | from collections import defaultdict | ||
2 | import itertools | ||
3 | import sys | ||
4 | from bs4.element import ( | ||
5 | CharsetMetaAttributeValue, | ||
6 | ContentMetaAttributeValue, | ||
7 | whitespace_re | ||
8 | ) | ||
9 | |||
# Public names exported by this module.  register_treebuilders_from()
# (defined later in this module) appends the name of every builder class
# it copies in, so this list grows at import time.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]

# Some useful features for a TreeBuilder to have.
# These strings are the values a builder lists in its ``features``
# attribute and the values callers pass to TreeBuilderRegistry.lookup().
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
24 | |||
25 | |||
class TreeBuilderRegistry(object):
    """Index of tree builder classes, keyed by the features they advertise.

    Builders registered later take priority over builders registered
    earlier, both overall and within each feature.
    """

    def __init__(self):
        # feature name -> builder classes, most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for advertised in treebuilder_class.features:
            # Insert at the front so newer registrations win lookups.
            self.builders_for_feature[advertised].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the best registered builder for the requested features.

        With no features, the most recently registered builder is
        returned.  Otherwise the features are considered in the order
        given; a feature that no registered builder advertises is
        ignored, and every other feature narrows the set of acceptable
        builders.  The result is the highest-priority builder for the
        first recognised feature that also has all the other recognised
        features.

        Returns None when nothing is registered, when none of the
        requested features is recognised, or when no single builder
        satisfies all the recognised features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # No preference expressed: hand back the newest builder.
            return self.builders[0]

        ranked = None      # priority-ordered builders for the first known feature
        acceptable = None  # builders that have every known feature seen so far
        for wanted in features:
            having_it = self.builders_for_feature.get(wanted, [])
            if not having_it:
                # Nobody advertises this feature; it does not constrain us.
                continue
            if ranked is None:
                ranked = having_it
                acceptable = set(having_it)
            else:
                # Drop anything that lacks this feature.
                acceptable = acceptable.intersection(set(having_it))

        if acceptable is None:
            return None

        # Walk the priority-ordered list and take the first survivor.
        for builder in ranked:
            if builder in acceptable:
                return builder
        return None
75 | |||
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
# This is the module-level registry that register_treebuilders_from()
# (defined later in this module) adds builder classes to.
builder_registry = TreeBuilderRegistry()
79 | |||
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    # Feature strings this builder advertises to a TreeBuilderRegistry.
    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    # None means "no opinion": a tag will be considered an empty-element
    # tag when and only when it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    # Maps a lower-case tag name (or '*' for every tag) to a list of
    # attribute names.
    cdata_list_attributes = {}

    # Text types whose values may be split into a list.  ``basestring``
    # only exists on Python 2; fall back to ``str`` on Python 3 so this
    # module works unmodified on both.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self):
        # Set to the object whose handle_* methods receive parse events;
        # nothing in this class assigns it.
        self.soup = None

    def reset(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLTreeBuilder does not consider a <p> tag to
        be an empty-element tag (it's not in
        HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse ``markup``; must be implemented by subclasses."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return ``markup`` unchanged in a 4-tuple.

        The base implementation ignores both encoding arguments and
        returns ``(markup, None, None, False)``.
        NOTE(review): the last three slots are presumably the original
        encoding, the declared encoding and a "contains replacement
        characters" flag -- confirm against the callers.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for subclasses; the base implementation returns False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and also returns it.  ``tag_name``
        is lower-cased before it is looked up in
        ``cdata_list_attributes``; attribute names are matched as-is.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            # Snapshot the keys: values are reassigned inside the loop
            # and a live dict view should not be iterated meanwhile.
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, self._string_types):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
174 | |||
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Element and character events are forwarded to ``self.soup``; the
    remaining handler methods are no-ops.
    """

    def feed(self, markup):
        """Not implemented here; subclasses must supply the parser."""
        raise NotImplementedError()

    def close(self):
        """Nothing to clean up."""

    def startElement(self, name, attrs):
        """Forward a start tag to the soup, re-keying its attributes.

        Each attribute key is indexable and only its second item is
        kept as the new key.
        NOTE(review): presumably the keys are (namespace, local name)
        pairs -- confirm against the SAX source in use.
        """
        simplified = {}
        for full_key, attr_value in attrs.items():
            simplified[full_key[1]] = attr_value
        self.soup.handle_starttag(name, simplified)

    def endElement(self, name):
        """Forward an end tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Namespace-aware start tag; ``nsTuple`` is discarded for now."""
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Namespace-aware end tag; ``nsTuple`` is discarded for now."""
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Prefix mappings are ignored for now."""

    def endPrefixMapping(self, prefix):
        """Prefix mappings are ignored for now."""

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """No action is taken at the start of a document."""

    def endDocument(self):
        """No action is taken at the end of a document."""
219 | |||
220 | |||
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # Each tag appears exactly once: a repeated key in a dict literal
    # silently replaces the earlier entry.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    def set_up_substitutions(self, tag):
        """Replace a <meta> tag's encoding declaration with a stand-in.

        ``tag`` must expose ``name``, ``get()`` and item assignment.
        Tags other than <meta> are left untouched.

        Returns True only when the tag carried a ``charset`` attribute.
        The HTML 4 ``http-equiv``/``content`` form also has its
        attribute replaced, but the method still returns False in that
        case.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
287 | |||
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a TreeBuilder
    subclass is set as an attribute on the ``bs4.builder`` module,
    appended to that module's ``__all__`` and registered with its
    ``builder_registry``.  Other exported names are skipped.
    """
    # I'm fairly sure this is not the best way to do this.
    package = sys.modules['bs4.builder']
    for exported in module.__all__:
        candidate = getattr(module, exported)
        if not issubclass(candidate, TreeBuilder):
            continue
        setattr(package, exported, candidate)
        package.__all__.append(exported)
        # Register the builder while we're at it.
        package.builder_registry.register(candidate)
300 | |||
class ParserRejectedMarkup(Exception):
    """Exception type defined for use by tree builders.

    Nothing in this module raises it.
    NOTE(review): presumably raised by a builder when its underlying
    parser cannot handle the markup -- confirm in the builder modules.
    """
    pass
303 | |||
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The order matters because TreeBuilderRegistry.register() inserts at
# the front of its lists, so whatever is registered last is found first.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass