diff options
author | Aníbal Limón <anibal.limon@linux.intel.com> | 2014-11-05 12:10:27 -0600 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2014-11-06 16:45:23 +0000 |
commit | 25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch) | |
tree | 7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/__init__.py | |
parent | bc6330cb7f288e76209410b0812aff1dbfa90950 (diff) | |
download | poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz |
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added Beautifulsoup module because fetch/wget latest_versionstring
method depends on it.
This enables the fetch/wget.py module to search upstream sites for new
package versions.
(Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a)
Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/__init__.py')
-rw-r--r-- | bitbake/lib/bs4/__init__.py | 406 |
1 files changed, 406 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py new file mode 100644 index 0000000000..7ba34269af --- /dev/null +++ b/bitbake/lib/bs4/__init__.py | |||
@@ -0,0 +1,406 @@ | |||
1 | """Beautiful Soup | ||
2 | Elixir and Tonic | ||
3 | "The Screen-Scraper's Friend" | ||
4 | http://www.crummy.com/software/BeautifulSoup/ | ||
5 | |||
6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a | ||
7 | (possibly invalid) document into a tree representation. Beautiful Soup | ||
8 | provides methods and Pythonic idioms that make it easy to | |
9 | navigate, search, and modify the parse tree. | ||
10 | |||
11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml | ||
12 | and/or html5lib is installed. | ||
13 | |||
14 | For more than you ever wanted to know about Beautiful Soup, see the | ||
15 | documentation: | ||
16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ | ||
17 | """ | ||
18 | |||
19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" | ||
20 | __version__ = "4.3.2" | ||
21 | __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" | ||
22 | __license__ = "MIT" | ||
23 | |||
24 | __all__ = ['BeautifulSoup'] | ||
25 | |||
26 | import os | ||
27 | import re | ||
28 | import warnings | ||
29 | |||
30 | from .builder import builder_registry, ParserRejectedMarkup | ||
31 | from .dammit import UnicodeDammit | ||
32 | from .element import ( | ||
33 | CData, | ||
34 | Comment, | ||
35 | DEFAULT_OUTPUT_ENCODING, | ||
36 | Declaration, | ||
37 | Doctype, | ||
38 | NavigableString, | ||
39 | PageElement, | ||
40 | ProcessingInstruction, | ||
41 | ResultSet, | ||
42 | SoupStrainer, | ||
43 | Tag, | ||
44 | ) | ||
45 | |||
# Explanation for anyone who runs the unconverted Python 2 sources under
# Python 3. The message is only assigned here; nothing in this part of
# the file raises or prints it.
syntax_error = (
    u'You are trying to run the Python 2 version of Beautiful Soup under '
    u'Python 3. This will not work. You need to convert the code, either '
    u'by installing it (`python setup.py install`) or by running 2to3 '
    u'(`2to3 -w bs4`).'
)
49 | |||
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    # Keyword arguments that are accepted but ignored. Each one is
    # removed from kwargs and reported with its own warning, in this
    # order, so that it never reaches the "unexpected keyword argument"
    # check in __init__.
    _IGNORED_ARGUMENTS = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. You can pass in features='html' "
         "or features='xml' to get a builder capable of handling "
         "one or the other."),
    )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        markup -- a string, or an object with a read() method whose
            result is used as the markup.
        features -- a feature name or a sequence of feature names handed
            to builder_registry.lookup() when no builder is given. When
            empty or None, DEFAULT_BUILDER_FEATURES is used.
        builder -- a tree builder instance to use instead of looking
            one up by features.
        parse_only -- stored as self.parse_only; endData() and
            handle_starttag() consult it to decide whether objects at
            the top of the document are kept.
        from_encoding -- passed to the builder's prepare_markup().
        kwargs -- only the ignored names in _IGNORED_ARGUMENTS and the
            renamed "parseOnlyThese" / "fromEncoding" are accepted.

        Raises TypeError for any other keyword argument, and
        FeatureNotFound when no registered builder offers the requested
        features.
        """
        # Every ignored argument is dropped, including convertEntities,
        # which previously stayed in kwargs and triggered the TypeError
        # below right after its own warning.
        for ignored_name, message in self._IGNORED_ARGUMENTS:
            if ignored_name in kwargs:
                del kwargs[ignored_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            # Return (and consume) the value given under a renamed
            # keyword, warning about the rename; None when absent.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            # list() keeps this working when keys() is a view rather
            # than a list.
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if not features:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go: the check only feeds a
                # warning.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should '
                    'probably open this file and pass the filehandle into '
                    'Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and b' ' not in markup)
                        or (isinstance(markup, unicode)
                            and u' ' not in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an '
                        'HTTP client. You should probably use an HTTP '
                        'client to get the document behind the URL, and '
                        'feed that document to Beautiful Soup.' % markup)

        # Try each candidate the builder proposes until one parses
        # without the parser rejecting it.
        for (self.markup, self.original_encoding,
             self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the builder over self.markup and close whatever is left open."""
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Re-initialize this object as an empty root tag and clear all
        parsing state, leaving the soup itself as the only open tag."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # handle_starttag() and object_was_parsed() read this before
        # they ever assign it. Clearing it here also keeps a retry after
        # ParserRejectedMarkup from linking to an element left behind by
        # the rejected attempt.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Not supported on the soup object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the soup object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Remove the innermost open tag from the stack.

        Returns the tag that is current afterwards (the new top of the
        stack), not the tag that was removed.
        """
        tag = self.tagStack.pop()
        if (self.preserve_whitespace_tag_stack
                and tag == self.preserve_whitespace_tag_stack[-1]):
            self.preserve_whitespace_tag_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append tag to the current tag's contents (if there is a
        current tag) and make it the innermost open tag."""
        # Identity test, matching object_was_parsed(), so the result does
        # not depend on how the current tag evaluates as a boolean.
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the text collected by handle_data() into one
        containerClass object and add it to the tree.

        Does nothing when no text has been collected. The collected text
        is discarded instead of added when parse_only is set, at most
        one tag is open, and parse_only either has no text criterion or
        its search() rejects the text.
        """
        if not self.current_data:
            return

        current_data = u''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if not self.preserve_whitespace_tag_stack:
            if all(char in self.ASCII_SPACES for char in current_data):
                if '\n' in current_data:
                    current_data = '\n'
                else:
                    current_data = ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if (self.parse_only and len(self.tagStack) <= 1
                and (not self.parse_only.text
                     or not self.parse_only.search(current_data))):
            return

        self.object_was_parsed(containerClass(current_data))

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        parent defaults to the current tag and most_recent_element to
        the element most recently added by this soup.
        """
        parent = parent or self.currentTag
        most_recent_element = most_recent_element or self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made, or None when
        nothing was popped.
        """
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the innermost open tag outwards; index 0 (the soup
        # itself) is never visited.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # Identity test, matching object_was_parsed(): a previous
        # element that evaluates as false must still be linked to the
        # new tag.
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending text, then close open tags up to and including
        the most recent one matching name and nsprefix."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Collect a piece of text; endData() later joins the pieces."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For an XML document the output starts with an XML declaration,
        which names eventual_encoding unless that is None.
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
379 | |||
# Short aliases for the BeautifulSoup class, so that an import can be
# written as 'from bs4 import _soup' (or '_s').
_s = _soup = BeautifulSoup
383 | |||
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml' (overriding any value the caller passed),
        emit the deprecation warning, then defer to BeautifulSoup."""
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        kwargs.update(features='xml')
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
393 | |||
394 | |||
class StopParsing(Exception):
    """Plain Exception subclass that adds no behavior of its own.

    NOTE(review): nothing in this module raises or catches it;
    presumably tree builders use it to stop a parse early -- confirm
    against the builder modules.
    """
397 | |||
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when the builder registry
    has no tree builder offering the requested features."""
400 | |||
401 | |||
# By default, act as an HTML pretty-printer: read markup from standard
# input and write the prettified document to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # The parenthesized form prints the same text on Python 2 and is
    # also valid syntax on Python 3, unlike the print statement.
    print(soup.prettify())