summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4')
-rw-r--r--bitbake/lib/bs4/AUTHORS.txt43
-rw-r--r--bitbake/lib/bs4/COPYING.txt26
-rw-r--r--bitbake/lib/bs4/NEWS.txt1066
-rw-r--r--bitbake/lib/bs4/__init__.py406
-rw-r--r--bitbake/lib/bs4/builder/__init__.py321
-rw-r--r--bitbake/lib/bs4/builder/_html5lib.py285
-rw-r--r--bitbake/lib/bs4/builder/_htmlparser.py258
-rw-r--r--bitbake/lib/bs4/builder/_lxml.py233
-rw-r--r--bitbake/lib/bs4/dammit.py829
-rw-r--r--bitbake/lib/bs4/diagnose.py204
-rw-r--r--bitbake/lib/bs4/element.py1611
-rw-r--r--bitbake/lib/bs4/testing.py592
-rw-r--r--bitbake/lib/bs4/tests/__init__.py1
-rw-r--r--bitbake/lib/bs4/tests/test_builder_registry.py141
-rw-r--r--bitbake/lib/bs4/tests/test_docs.py36
-rw-r--r--bitbake/lib/bs4/tests/test_html5lib.py85
-rw-r--r--bitbake/lib/bs4/tests/test_htmlparser.py19
-rw-r--r--bitbake/lib/bs4/tests/test_lxml.py91
-rw-r--r--bitbake/lib/bs4/tests/test_soup.py434
-rw-r--r--bitbake/lib/bs4/tests/test_tree.py1829
20 files changed, 8510 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/AUTHORS.txt b/bitbake/lib/bs4/AUTHORS.txt
new file mode 100644
index 0000000000..2ac8fcc8cc
--- /dev/null
+++ b/bitbake/lib/bs4/AUTHORS.txt
@@ -0,0 +1,43 @@
1Behold, mortal, the origins of Beautiful Soup...
2================================================
3
4Leonard Richardson is the primary programmer.
5
6Aaron DeVore is awesome.
7
8Mark Pilgrim provided the encoding detection code that forms the base
9of UnicodeDammit.
10
11Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
12Soup 4 working under Python 3.
13
14Simon Willison wrote soupselect, which was used to make Beautiful Soup
15support CSS selectors.
16
17Sam Ruby helped with a lot of edge cases.
18
19Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
20work in solving the nestable tags conundrum.
21
22An incomplete list of people who have contributed patches to Beautiful
23Soup:
24
25 Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
26 Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
27 Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
28 Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
29 Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
30 Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
31 Webster, Paul Wright, Danny Yoo
32
33An incomplete list of people who made suggestions or found bugs or
34found ways to break Beautiful Soup:
35
36 Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
37 Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
38 Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
39 warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
40 Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
41 Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
42 Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
43 Sousa Rocha, Yichun Wei, Per Vognsen
diff --git a/bitbake/lib/bs4/COPYING.txt b/bitbake/lib/bs4/COPYING.txt
new file mode 100644
index 0000000000..d668d13f04
--- /dev/null
+++ b/bitbake/lib/bs4/COPYING.txt
@@ -0,0 +1,26 @@
1Beautiful Soup is made available under the MIT license:
2
3 Copyright (c) 2004-2012 Leonard Richardson
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE, DAMMIT.
24
25Beautiful Soup incorporates code from the html5lib library, which is
26also made available under the MIT license.
diff --git a/bitbake/lib/bs4/NEWS.txt b/bitbake/lib/bs4/NEWS.txt
new file mode 100644
index 0000000000..88a60a2458
--- /dev/null
+++ b/bitbake/lib/bs4/NEWS.txt
@@ -0,0 +1,1066 @@
1= 4.3.2 (20131002) =
2
3* Fixed a bug in which short Unicode input was improperly encoded to
4 ASCII when checking whether or not it was the name of a file on
5 disk. [bug=1227016]
6
7* Fixed a crash when a short input contains data not valid in
8 filenames. [bug=1232604]
9
10* Fixed a bug that caused Unicode data put into UnicodeDammit to
11 return None instead of the original data. [bug=1214983]
12
13* Combined two tests to stop a spurious test failure when tests are
14 run by nosetests. [bug=1212445]
15
16= 4.3.1 (20130815) =
17
18* Fixed yet another problem with the html5lib tree builder, caused by
19 html5lib's tendency to rearrange the tree during
20 parsing. [bug=1189267]
21
22* Fixed a bug that caused the optimized version of find_all() to
23 return nothing. [bug=1212655]
24
25= 4.3.0 (20130812) =
26
27* Instead of converting incoming data to Unicode and feeding it to the
28 lxml tree builder in chunks, Beautiful Soup now makes successive
29 guesses at the encoding of the incoming data, and tells lxml to
30 parse the data as that encoding. Giving lxml more control over the
31 parsing process improves performance and avoids a number of bugs and
32 issues with the lxml parser which had previously required elaborate
33 workarounds:
34
35 - An issue in which lxml refuses to parse Unicode strings on some
36 systems. [bug=1180527]
37
38 - A returning bug that truncated documents longer than a (very
39 small) size. [bug=963880]
40
41 - A returning bug in which extra spaces were added to a document if
42 the document defined a charset other than UTF-8. [bug=972466]
43
44 This required a major overhaul of the tree builder architecture. If
45 you wrote your own tree builder and didn't tell me, you'll need to
46 modify your prepare_markup() method.
47
48* The UnicodeDammit code that makes guesses at encodings has been
49 split into its own class, EncodingDetector. A lot of apparently
50 redundant code has been removed from Unicode, Dammit, and some
51 undocumented features have also been removed.
52
53* Beautiful Soup will issue a warning if instead of markup you pass it
54 a URL or the name of a file on disk (a common beginner's mistake).
55
56* A number of optimizations improve the performance of the lxml tree
57 builder by about 33%, the html.parser tree builder by about 20%, and
58 the html5lib tree builder by about 15%.
59
60* All find_all calls should now return a ResultSet object. Patch by
61 Aaron DeVore. [bug=1194034]
62
63= 4.2.1 (20130531) =
64
65* The default XML formatter will now replace ampersands even if they
66 appear to be part of entities. That is, "&lt;" will become
67 "&amp;lt;". The old code was left over from Beautiful Soup 3, which
68 didn't always turn entities into Unicode characters.
69
70 If you really want the old behavior (maybe because you add new
71 strings to the tree, those strings include entities, and you want
72 the formatter to leave them alone on output), it can be found in
73 EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183]
74
75* Gave new_string() the ability to create subclasses of
76 NavigableString. [bug=1181986]
77
78* Fixed another bug by which the html5lib tree builder could create a
79 disconnected tree. [bug=1182089]
80
81* The .previous_element of a BeautifulSoup object is now always None,
82 not the last element to be parsed. [bug=1182089]
83
84* Fixed test failures when lxml is not installed. [bug=1181589]
85
86* html5lib now supports Python 3. Fixed some Python 2-specific
87 code in the html5lib test suite. [bug=1181624]
88
89* The html.parser treebuilder can now handle numeric attributes in
90 text when the hexadecimal name of the attribute starts with a
91 capital X. Patch by Tim Shirley. [bug=1186242]
92
93= 4.2.0 (20130514) =
94
95* The Tag.select() method now supports a much wider variety of CSS
96 selectors.
97
98 - Added support for the adjacent sibling combinator (+) and the
99 general sibling combinator (~). Tests by "liquider". [bug=1082144]
100
101 - The combinators (>, +, and ~) can now combine with any supported
102 selector, not just one that selects based on tag name.
103
104 - Added limited support for the "nth-of-type" pseudo-class. Code
105 by Sven Slootweg. [bug=1109952]
106
107* The BeautifulSoup class is now aliased to "_s" and "_soup", making
108 it quicker to type the import statement in an interactive session:
109
110 from bs4 import _s
111 or
112 from bs4 import _soup
113
114 The alias may change in the future, so don't use this in code you're
115 going to run more than once.
116
117* Added the 'diagnose' submodule, which includes several useful
118 functions for reporting problems and doing tech support.
119
120 - diagnose(data) tries the given markup on every installed parser,
121 reporting exceptions and displaying successes. If a parser is not
122 installed, diagnose() mentions this fact.
123
124 - lxml_trace(data, html=True) runs the given markup through lxml's
125 XML parser or HTML parser, and prints out the parser events as
126 they happen. This helps you quickly determine whether a given
127 problem occurs in lxml code or Beautiful Soup code.
128
129 - htmlparser_trace(data) is the same thing, but for Python's
130 built-in HTMLParser class.
131
132* In an HTML document, the contents of a <script> or <style> tag will
133 no longer undergo entity substitution by default. XML documents work
134 the same way they did before. [bug=1085953]
135
136* Methods like get_text() and properties like .strings now only give
137 you strings that are visible in the document--no comments or
138 processing commands. [bug=1050164]
139
140* The prettify() method now leaves the contents of <pre> tags
141 alone. [bug=1095654]
142
143* Fix a bug in the html5lib treebuilder which sometimes created
144 disconnected trees. [bug=1039527]
145
146* Fix a bug in the lxml treebuilder which crashed when a tag included
147 an attribute from the predefined "xml:" namespace. [bug=1065617]
148
149* Fix a bug by which keyword arguments to find_parent() were not
150 being passed on. [bug=1126734]
151
152* Stop a crash when unwisely messing with a tag that's been
153 decomposed. [bug=1097699]
154
155* Now that lxml's segfault on invalid doctype has been fixed, fixed a
156 corresponding problem on the Beautiful Soup end that was previously
157 invisible. [bug=984936]
158
159* Fixed an exception when an overspecified CSS selector didn't match
160 anything. Code by Stefaan Lippens. [bug=1168167]
161
162= 4.1.3 (20120820) =
163
164* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious
165 test failure caused by the lousy HTMLParser in those
166 versions. [bug=1038503]
167
168* Raise a more specific error (FeatureNotFound) when a requested
169 parser or parser feature is not installed. Raise NotImplementedError
170 instead of ValueError when the user calls insert_before() or
171 insert_after() on the BeautifulSoup object itself. Patch by Aaron
172 DeVore. [bug=1038301]
173
174= 4.1.2 (20120817) =
175
176* As per PEP-8, allow searching by CSS class using the 'class_'
177 keyword argument. [bug=1037624]
178
179* Display namespace prefixes for namespaced attribute names, instead of
180 the fully-qualified names given by the lxml parser. [bug=1037597]
181
182* Fixed a crash on encoding when an attribute name contained
183 non-ASCII characters.
184
185* When sniffing encodings, if the cchardet library is installed,
186 Beautiful Soup uses it instead of chardet. cchardet is much
187 faster. [bug=1020748]
188
189* Use logging.warning() instead of warnings.warn() to notify the user
190 that characters were replaced with REPLACEMENT
191 CHARACTER. [bug=1013862]
192
193= 4.1.1 (20120703) =
194
195* Fixed an html5lib tree builder crash which happened when html5lib
196 moved a tag with a multivalued attribute from one part of the tree
197 to another. [bug=1019603]
198
199* Correctly display closing tags with an XML namespace declared. Patch
200 by Andreas Kostyrka. [bug=1019635]
201
202* Fixed a typo that made parsing significantly slower than it should
203 have been, and also waited too long to close tags with XML
204 namespaces. [bug=1020268]
205
206* get_text() now returns an empty Unicode string if there is no text,
207 rather than an empty bytestring. [bug=1020387]
208
209= 4.1.0 (20120529) =
210
211* Added experimental support for fixing Windows-1252 characters
212 embedded in UTF-8 documents. (UnicodeDammit.detwingle())
213
214* Fixed the handling of &quot; with the built-in parser. [bug=993871]
215
216* Comments, processing instructions, document type declarations, and
217 markup declarations are now treated as preformatted strings, the way
218 CData blocks are. [bug=1001025]
219
220* Fixed a bug with the lxml treebuilder that prevented the user from
221 adding attributes to a tag that didn't originally have
222 attributes. [bug=1002378] Thanks to Oliver Beattie for the patch.
223
224* Fixed some edge-case bugs having to do with inserting an element
225 into a tag it's already inside, and replacing one of a tag's
226 children with another. [bug=997529]
227
228* Added the ability to search for attribute values specified in UTF-8. [bug=1003974]
229
230 This caused a major refactoring of the search code. All the tests
231 pass, but it's possible that some searches will behave differently.
232
233= 4.0.5 (20120427) =
234
235* Added a new method, wrap(), which wraps an element in a tag.
236
237* Renamed replace_with_children() to unwrap(), which is easier to
238 understand and also the jQuery name of the function.
239
240* Made encoding substitution in <meta> tags completely transparent (no
241 more %SOUP-ENCODING%).
242
243* Fixed a bug in decoding data that contained a byte-order mark, such
244 as data encoded in UTF-16LE. [bug=988980]
245
246* Fixed a bug that made the HTMLParser treebuilder generate XML
247 definitions ending with two question marks instead of
248 one. [bug=984258]
249
250* Upon document generation, CData objects are no longer run through
251 the formatter. [bug=988905]
252
253* The test suite now passes when lxml is not installed, whether or not
254 html5lib is installed. [bug=987004]
255
256* Print a warning on HTMLParseErrors to let people know they should
257 install a better parser library.
258
259= 4.0.4 (20120416) =
260
261* Fixed a bug that sometimes created disconnected trees.
262
263* Fixed a bug with the string setter that moved a string around the
264 tree instead of copying it. [bug=983050]
265
266* Attribute values are now run through the provided output formatter.
267 Previously they were always run through the 'minimal' formatter. In
268 the future I may make it possible to specify different formatters
269 for attribute values and strings, but for now, consistent behavior
270 is better than inconsistent behavior. [bug=980237]
271
272* Added the missing renderContents method from Beautiful Soup 3. Also
273 added an encode_contents() method to go along with decode_contents().
274
275* Give a more useful error when the user tries to run the Python 2
276 version of BS under Python 3.
277
278* UnicodeDammit can now convert Microsoft smart quotes to ASCII with
279 UnicodeDammit(markup, smart_quotes_to="ascii").
280
281= 4.0.3 (20120403) =
282
283* Fixed a typo that caused some versions of Python 3 to convert the
284 Beautiful Soup codebase incorrectly.
285
286* Got rid of the 4.0.2 workaround for HTML documents--it was
287 unnecessary and the workaround was triggering a (possibly different,
288 but related) bug in lxml. [bug=972466]
289
290= 4.0.2 (20120326) =
291
292* Worked around a possible bug in lxml that prevents non-tiny XML
293 documents from being parsed. [bug=963880, bug=963936]
294
295* Fixed a bug where specifying `text` while also searching for a tag
296 only worked if `text` wanted an exact string match. [bug=955942]
297
298= 4.0.1 (20120314) =
299
300* This is the first official release of Beautiful Soup 4. There is no
301 4.0.0 release, to eliminate any possibility that packaging software
302 might treat "4.0.0" as being an earlier version than "4.0.0b10".
303
304* Brought BS up to date with the latest release of soupselect, adding
305 CSS selector support for direct descendant matches and multiple CSS
306 class matches.
307
308= 4.0.0b10 (20120302) =
309
310* Added support for simple CSS selectors, taken from the soupselect project.
311
312* Fixed a crash when using html5lib. [bug=943246]
313
314* In HTML5-style <meta charset="foo"> tags, the value of the "charset"
315 attribute is now replaced with the appropriate encoding on
316 output. [bug=942714]
317
318* Fixed a bug that caused calling a tag to sometimes call find_all()
319 with the wrong arguments. [bug=944426]
320
321* For backwards compatibility, brought back the BeautifulStoneSoup
322 class as a deprecated wrapper around BeautifulSoup.
323
324= 4.0.0b9 (20120228) =
325
326* Fixed the string representation of DOCTYPEs that have both a public
327 ID and a system ID.
328
329* Fixed the generated XML declaration.
330
331* Renamed Tag.nsprefix to Tag.prefix, for consistency with
332 NamespacedAttribute.
333
334* Fixed a test failure that occurred on Python 3.x when chardet was
335 installed.
336
337* Made prettify() return Unicode by default, so it will look nice on
338 Python 3 when passed into print().
339
340= 4.0.0b8 (20120224) =
341
342* All tree builders now preserve namespace information in the
343 documents they parse. If you use the html5lib parser or lxml's XML
344 parser, you can access the namespace URL for a tag as tag.namespace.
345
346 However, there is no special support for namespace-oriented
347 searching or tree manipulation. When you search the tree, you need
348 to use namespace prefixes exactly as they're used in the original
349 document.
350
351* The string representation of a DOCTYPE always ends in a newline.
352
353* Issue a warning if the user tries to use a SoupStrainer in
354 conjunction with the html5lib tree builder, which doesn't support
355 them.
356
357= 4.0.0b7 (20120223) =
358
359* Upon decoding to string, any characters that can't be represented in
360 your chosen encoding will be converted into numeric XML entity
361 references.
362
363* Issue a warning if characters were replaced with REPLACEMENT
364 CHARACTER during Unicode conversion.
365
366* Restored compatibility with Python 2.6.
367
368* The install process no longer installs docs or auxiliary text files.
369
370* It's now possible to deepcopy a BeautifulSoup object created with
371 Python's built-in HTML parser.
372
373* About 100 unit tests that "test" the behavior of various parsers on
374 invalid markup have been removed. Legitimate changes to those
375 parsers caused these tests to fail, indicating that perhaps
376 Beautiful Soup should not test the behavior of foreign
377 libraries.
378
379 The problematic unit tests have been reformulated as informational
380 comparisons generated by the script
381 scripts/demonstrate_parser_differences.py.
382
383 This makes Beautiful Soup compatible with html5lib version 0.95 and
384 future versions of HTMLParser.
385
386= 4.0.0b6 (20120216) =
387
388* Multi-valued attributes like "class" always have a list of values,
389 even if there's only one value in the list.
390
391* Added a number of multi-valued attributes defined in HTML5.
392
393* Stopped generating a space before the slash that closes an
394 empty-element tag. This may come back if I add a special XHTML mode
395 (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty
396 useless.
397
398* Passing text along with tag-specific arguments to a find* method:
399
400 find("a", text="Click here")
401
402 will find tags that contain the given text as their
403 .string. Previously, the tag-specific arguments were ignored and
404 only strings were searched.
405
406* Fixed a bug that caused the html5lib tree builder to build a
407 partially disconnected tree. Generally cleaned up the html5lib tree
408 builder.
409
410* If you restrict a multi-valued attribute like "class" to a string
411 that contains spaces, Beautiful Soup will only consider it a match
412 if the values correspond to that specific string.
413
414= 4.0.0b5 (20120209) =
415
416* Rationalized Beautiful Soup's treatment of CSS class. A tag
417 belonging to multiple CSS classes is treated as having a list of
418 values for the 'class' attribute. Searching for a CSS class will
419 match *any* of the CSS classes.
420
421 This actually affects all attributes that the HTML standard defines
422 as taking multiple values (class, rel, rev, archive, accept-charset,
423 and headers), but 'class' is by far the most common. [bug=41034]
424
425* If you pass anything other than a dictionary as the second argument
426 to one of the find* methods, it'll assume you want to use that
427 object to search against a tag's CSS classes. Previously this only
428 worked if you passed in a string.
429
430* Fixed a bug that caused a crash when you passed a dictionary as an
431 attribute value (possibly because you mistyped "attrs"). [bug=842419]
432
433* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
434 like <meta charset="utf-8" />. [bug=837268]
435
436* If Unicode, Dammit can't figure out a consistent encoding for a
437 page, it will try each of its guesses again, with errors="replace"
438 instead of errors="strict". This may mean that some data gets
439 replaced with REPLACEMENT CHARACTER, but at least most of it will
440 get turned into Unicode. [bug=754903]
441
442* Patched over a bug in html5lib (?) that was crashing Beautiful Soup
443 on certain kinds of markup. [bug=838800]
444
445* Fixed a bug that wrecked the tree if you replaced an element with an
446 empty string. [bug=728697]
447
448* Improved Unicode, Dammit's behavior when you give it Unicode to
449 begin with.
450
451= 4.0.0b4 (20120208) =
452
453* Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag()
454
455* BeautifulSoup.new_tag() will follow the rules of whatever
456 tree-builder was used to create the original BeautifulSoup object. A
457 new <p> tag will look like "<p />" if the soup object was created to
458 parse XML, but it will look like "<p></p>" if the soup object was
459 created to parse HTML.
460
461* We pass in strict=False to html.parser on Python 3, greatly
462 improving html.parser's ability to handle bad HTML.
463
464* We also monkeypatch a serious bug in html.parser that made
465 strict=False disastrous on Python 3.2.2.
466
467* Replaced the "substitute_html_entities" argument with the
468 more general "formatter" argument.
469
470* Bare ampersands and angle brackets are always converted to XML
471 entities unless the user prevents it.
472
473* Added PageElement.insert_before() and PageElement.insert_after(),
474 which let you put an element into the parse tree with respect to
475 some other element.
476
477* Raise an exception when the user tries to do something nonsensical
478 like insert a tag into itself.
479
480
481= 4.0.0b3 (20120203) =
482
483Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful
484Soup's custom HTML parser in favor of a system that lets you write a
485little glue code and plug in any HTML or XML parser you want.
486
487Beautiful Soup 4.0 comes with glue code for four parsers:
488
489 * Python's standard HTMLParser (html.parser in Python 3)
490 * lxml's HTML and XML parsers
491 * html5lib's HTML parser
492
493HTMLParser is the default, but I recommend you install lxml if you
494can.
495
496For complete documentation, see the Sphinx documentation in
497bs4/doc/source/. What follows is a summary of the changes from
498Beautiful Soup 3.
499
500=== The module name has changed ===
501
502Previously you imported the BeautifulSoup class from a module also
503called BeautifulSoup. To save keystrokes and make it clear which
504version of the API is in use, the module is now called 'bs4':
505
506 >>> from bs4 import BeautifulSoup
507
508=== It works with Python 3 ===
509
510Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was
511so bad that it barely worked at all. Beautiful Soup 4 works with
512Python 3, and since its parser is pluggable, you don't sacrifice
513quality.
514
515Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3
516support to the finish line. Ezio Melotti is also to thank for greatly
517improving the HTML parser that comes with Python 3.2.
518
519=== CDATA sections are normal text, if they're understood at all. ===
520
521Currently, the lxml and html5lib HTML parsers ignore CDATA sections in
522markup:
523
524 <p><![CDATA[foo]]></p> => <p></p>
525
526A future version of html5lib will turn CDATA sections into text nodes,
527but only within tags like <svg> and <math>:
528
529 <svg><![CDATA[foo]]></svg> => <svg>foo</svg>
530
531The default XML parser (which uses lxml behind the scenes) turns CDATA
532sections into ordinary text elements:
533
534 <p><![CDATA[foo]]></p> => <p>foo</p>
535
536In theory it's possible to preserve the CDATA sections when using the
537XML parser, but I don't see how to get it to work in practice.
538
539=== Miscellaneous other stuff ===
540
541If the BeautifulSoup instance has .is_xml set to True, an appropriate
542XML declaration will be emitted when the tree is transformed into a
543string:
544
545 <?xml version="1.0" encoding="utf-8"?>
546 <markup>
547 ...
548 </markup>
549
550The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree
551builders set it to False. If you want to parse XHTML with an HTML
552parser, you can set it manually.
553
554
555= 3.2.0 =
556
557The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2
558to make it obvious which one you should use.
559
560= 3.1.0 =
561
562A hybrid version that supports 2.4 and can be automatically converted
563to run under Python 3.0. There are three backwards-incompatible
564changes you should be aware of, but no new features or deliberate
565behavior changes.
566
5671. str() may no longer do what you want. This is because the meaning
568of str() inverts between Python 2 and 3; in Python 2 it gives you a
569byte string, in Python 3 it gives you a Unicode string.
570
571The effect of this is that you can't pass an encoding to .__str__
572anymore. Use encode() to get a string and decode() to get Unicode, and
573you'll be ready (well, readier) for Python 3.
574
5752. Beautiful Soup is now based on HTMLParser rather than SGMLParser,
576which is gone in Python 3. There's some bad HTML that SGMLParser
577handled but HTMLParser doesn't, usually to do with attribute values
578that aren't closed or have brackets inside them:
579
580 <a href="foo</a>, </a><a href="bar">baz</a>
581 <a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>
582
583A later version of Beautiful Soup will allow you to plug in different
584parsers to make tradeoffs between speed and the ability to handle bad
585HTML.
586
5873. In Python 3 (but not Python 2), HTMLParser converts entities within
588attributes to the corresponding Unicode characters. In Python 2 it's
589possible to parse this string and leave the &eacute; intact.
590
591 <a href="http://crummy.com?sacr&eacute;&bleu">
592
593In Python 3, the &eacute; is always converted to \xe9 during
594parsing.
595
596
597= 3.0.7a =
598
599Added an import that makes BS work in Python 2.3.
600
601
602= 3.0.7 =
603
604Fixed a UnicodeDecodeError when unpickling documents that contain
605non-ASCII characters.
606
607Fixed a TypeError that occurred in some circumstances when a tag
608contained no text.
609
610Jump through hoops to avoid the use of chardet, which can be extremely
611slow in some circumstances. UTF-8 documents should never trigger the
612use of chardet.
613
614Whitespace is preserved inside <pre> and <textarea> tags that contain
615nothing but whitespace.
616
617Beautiful Soup can now parse a doctype that's scoped to an XML namespace.
618
619
620= 3.0.6 =
621
622Got rid of a very old debug line that prevented chardet from working.
623
624Added a Tag.decompose() method that completely disconnects a tree or a
625subset of a tree, breaking it up into bite-sized pieces that are
626easy for the garbage collector to collect.
627
628Tag.extract() now returns the tag that was extracted.
629
630Tag.findNext() now does something with the keyword arguments you pass
631it instead of dropping them on the floor.
632
633Fixed a Unicode conversion bug.
634
635Fixed a bug that garbled some <meta> tags when rewriting them.
636
637
638= 3.0.5 =
639
640Soup objects can now be pickled, and copied with copy.deepcopy.
641
642Tag.append now works properly on existing BS objects. (It wasn't
643originally intended for outside use, but it can be now.) (Giles
644Radford)
645
646Passing in a nonexistent encoding will no longer crash the parser on
647Python 2.4 (John Nagle).
648
649Fixed an underlying bug in SGMLParser that thinks ASCII has 255
650characters instead of 127 (John Nagle).
651
652Entities are converted more consistently to Unicode characters.
653
654Entity references in attribute values are now converted to Unicode
655characters when appropriate. Numeric entities are always converted,
656because SGMLParser always converts them outside of attribute values.
657
658ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to
659XHTML_ENTITIES.
660
661The regular expression for bare ampersands was too loose. In some
662cases ampersands were not being escaped. (Sam Ruby?)
663
664Non-breaking spaces and other special Unicode space characters are no
665longer folded to ASCII spaces. (Robert Leftwich)
666
667Information inside a TEXTAREA tag is now parsed literally, not as HTML
668tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang)
669
670= 3.0.4 =
671
672Fixed a bug that crashed Unicode conversion in some cases.
673
674Fixed a bug that prevented UnicodeDammit from being used as a
675general-purpose data scrubber.
676
677Fixed some unit test failures when running against Python 2.5.
678
679When considering whether to convert smart quotes, UnicodeDammit now
680looks at the original encoding in a case-insensitive way.
681
682= 3.0.3 (20060606) =
683
684Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be
685sure to pass in an appropriate value for convertEntities, or XML/HTML
686entities might stick around that aren't valid in HTML/XML). The result
687may not validate, but it should be good enough to not choke a
688real-world XML parser. Specifically, the output of a properly
689constructed soup object should always be valid as part of an XML
690document, but parts may be missing if they were missing in the
691original. As always, if the input is valid XML, the output will also
692be valid.
693
694= 3.0.2 (20060602) =
695
696Previously, Beautiful Soup correctly handled attribute values that
697contained embedded quotes (sometimes by escaping), but not other kinds
698of XML character. Now, it correctly handles or escapes all special XML
699characters in attribute values.
700
701I aliased methods to the 2.x names (fetch, find, findText, etc.) for
702backwards compatibility purposes. Those names are deprecated and if I
703ever do a 4.0 I will remove them. I will, I tell you!
704
705Fixed a bug where the findAll method wasn't passing along any keyword
706arguments.
707
708When run from the command line, Beautiful Soup now acts as an HTML
709pretty-printer, not an XML pretty-printer.
710
711= 3.0.1 (20060530) =
712
713Reintroduced the "fetch by CSS class" shortcut. I thought keyword
714arguments would replace it, but they don't. You can't call soup('a',
715class='foo') because class is a Python keyword.
716
717If Beautiful Soup encounters a meta tag that declares the encoding,
718but a SoupStrainer tells it not to parse that tag, Beautiful Soup will
719no longer try to rewrite the meta tag to mention the new
720encoding. Basically, this makes SoupStrainers work in real-world
721applications instead of crashing the parser.
722
723= 3.0.0 "Who would not give all else for two p" (20060528) =
724
725This release is not backward-compatible with previous releases. If
726you've got code written with a previous version of the library, go
727ahead and keep using it, unless one of the features mentioned here
728really makes your life easier. Since the library is self-contained,
729you can include an old copy of the library in your old applications,
730and use the new version for everything else.
731
732The documentation has been rewritten and greatly expanded with many
733more examples.
734
735Beautiful Soup autodetects the encoding of a document (or uses the one
736you specify), and converts it from its native encoding to
737Unicode. Internally, it only deals with Unicode strings. When you
738print out the document, it converts to UTF-8 (or another encoding you
739specify). [Doc reference]
740
741It's now easy to make large-scale changes to the parse tree without
742screwing up the navigation members. The methods are extract,
743replaceWith, and insert. [Doc reference. See also Improving Memory
744Usage with extract]
745
746Passing True in as an attribute value gives you tags that have any
747value for that attribute. You don't have to create a regular
748expression. Passing None for an attribute value gives you tags that
749don't have that attribute at all.
750
751Tag objects now know whether or not they're self-closing. This avoids
752the problem where Beautiful Soup thought that tags like <BR /> were
753self-closing even in XML documents. You can customize the self-closing
754tags for a parser object by passing them in as a list of
755selfClosingTags: you don't have to subclass anymore.
756
757There's a new built-in parser, MinimalSoup, which has most of
758BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc
759reference]
760
761You can use a SoupStrainer to tell Beautiful Soup to parse only part
762of a document. This saves time and memory, often making Beautiful Soup
763about as fast as a custom-built SGMLParser subclass. [Doc reference,
764SoupStrainer reference]
765
766You can (usually) use keyword arguments instead of passing a
767dictionary of attributes to a search method. That is, you can replace
768soup(args={"id" : "5"}) with soup(id="5"). You can still use args if
769(for instance) you need to find an attribute whose name clashes with
770the name of an argument to findAll. [Doc reference: **kwargs attrs]
771
772The method names have changed to the better method names used in
773Rubyful Soup. Instead of find methods and fetch methods, there are
774only find methods. Instead of a scheme where you can't remember which
775method finds one element and which one finds them all, we have find
776and findAll. In general, if the method name mentions All or a plural
777noun (eg. findNextSiblings), then it finds many elements.
778Otherwise, it only finds one element. [Doc reference]
779
780Some of the argument names have been renamed for clarity. For instance
781avoidParserProblems is now parserMassage.
782
783Beautiful Soup no longer implements a feed method. You need to pass a
784string or a filehandle into the soup constructor, not with feed after
785the soup has been created. There is still a feed method, but it's the
786feed method implemented by SGMLParser and calling it will bypass
787Beautiful Soup and cause problems.
788
789The NavigableText class has been renamed to NavigableString. There is
790no NavigableUnicodeString anymore, because every string inside a
791Beautiful Soup parse tree is a Unicode string.
792
793findText and fetchText are gone. Just pass a text argument into find
794or findAll.
795
796Null was more trouble than it was worth, so I got rid of it. Anything
797that used to return Null now returns None.
798
799Special XML constructs like comments and CDATA now have their own
800NavigableString subclasses, instead of being treated as oddly-formed
801data. If you parse a document that contains CDATA and write it back
802out, the CDATA will still be there.
803
804When you're parsing a document, you can get Beautiful Soup to convert
805XML or HTML entities into the corresponding Unicode characters. [Doc
806reference]
807
808= 2.1.1 (20050918) =
809
810Fixed a serious performance bug in BeautifulStoneSoup which was
811causing parsing to be incredibly slow.
812
813Corrected several entities that were previously being incorrectly
814translated from Microsoft smart-quote-like characters.
815
816Fixed a bug that was breaking text fetch.
817
818Fixed a bug that crashed the parser when text chunks that look like
819HTML tag names showed up within a SCRIPT tag.
820
821THEAD, TBODY, and TFOOT tags are now nestable within TABLE
822tags. Nested tables should parse more sensibly now.
823
824BASE is now considered a self-closing tag.
825
826= 2.1.0 "Game, or any other dish?" (20050504) =
827
828Added a wide variety of new search methods which, given a starting
829point inside the tree, follow a particular navigation member (like
830nextSibling) over and over again, looking for Tag and NavigableText
831objects that match certain criteria. The new methods are findNext,
832fetchNext, findPrevious, fetchPrevious, findNextSibling,
833fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings,
834findParent, and fetchParents. All of these use the same basic code
835used by first and fetch, so you can pass your weird ways of matching
836things into these methods.
837
838The fetch method and its derivatives now accept a limit argument.
839
840You can now pass keyword arguments when calling a Tag object as though
841it were a method.
842
843Fixed a bug that caused all hand-created tags to share a single set of
844attributes.
845
846= 2.0.3 (20050501) =
847
848Fixed Python 2.2 support for iterators.
849
850Fixed a bug that gave the wrong representation to tags within quote
851tags like <script>.
852
853Took some code from Mark Pilgrim that treats CDATA declarations as
854data instead of ignoring them.
855
856Beautiful Soup's setup.py will now do an install even if the unit
857tests fail. It won't build a source distribution if the unit tests
858fail, so I can't release a new version unless they pass.
859
860= 2.0.2 (20050416) =
861
862Added the unit tests in a separate module, and packaged it with
863distutils.
864
865Fixed a bug that sometimes caused renderContents() to return a Unicode
866string even if there was no Unicode in the original string.
867
868Added the done() method, which closes all of the parser's open
869tags. It gets called automatically when you pass in some text to the
870constructor of a parser class; otherwise you must call it yourself.
871
872Reinstated some backwards compatibility with 1.x versions: referencing
873the string member of a NavigableText object returns the NavigableText
874object instead of throwing an error.
875
876= 2.0.1 (20050412) =
877
878Fixed a bug that caused bad results when you tried to reference a tag
879name shorter than 3 characters as a member of a Tag, eg. tag.table.td.
880
881Made sure all Tags have the 'hidden' attribute so that an attempt to
882access tag.hidden doesn't spawn an attempt to find a tag named
883'hidden'.
884
885Fixed a bug in the comparison operator.
886
887= 2.0.0 "Who cares for fish?" (20050410) =
888
889Beautiful Soup version 1 was very useful but also pretty stupid. I
890originally wrote it without noticing any of the problems inherent in
891trying to build a parse tree out of ambiguous HTML tags. This version
892solves all of those problems to my satisfaction. It also adds many new
893clever things to make up for the removal of the stupid things.
894
895== Parsing ==
896
897The parser logic has been greatly improved, and the BeautifulSoup
898class should much more reliably yield a parse tree that looks like
899what the page author intended. For a particular class of odd edge
900cases that now causes problems, there is a new class,
901ICantBelieveItsBeautifulSoup.
902
903By default, Beautiful Soup now performs some cleanup operations on
904text before parsing it. This is to avoid common problems with bad
905definitions and self-closing tags that crash SGMLParser. You can
906provide your own set of cleanup operations, or turn it off
907altogether. The cleanup operations include fixing self-closing tags
908that don't close, and replacing Microsoft smart quotes and similar
909characters with their HTML entity equivalents.
910
911You can now get a pretty-print version of parsed HTML to get a visual
912picture of how Beautiful Soup parses it, with the Tag.prettify()
913method.
914
915== Strings and Unicode ==
916
917There are separate NavigableText subclasses for ASCII and Unicode
918strings. These classes directly subclass the corresponding base data
919types. This means you can treat NavigableText objects as strings
920instead of having to call methods on them to get the strings.
921
922str() on a Tag always returns a string, and unicode() always returns
923Unicode. Previously it was inconsistent.
924
925== Tree traversal ==
926
927In a first() or fetch() call, the tag name or the desired value of an
928attribute can now be any of the following:
929
930 * A string (matches that specific tag or that specific attribute value)
931 * A list of strings (matches any tag or attribute value in the list)
932 * A compiled regular expression object (matches any tag or attribute
933 value that matches the regular expression)
934 * A callable object that takes the Tag object or attribute value as a
935 string. It returns None/false/empty string if the given string
936 doesn't match, and any other value if it does.
937
938This is much easier to use than SQL-style wildcards (see, regular
939expressions are good for something). Because of this, I took out
940SQL-style wildcards. I'll put them back if someone complains, but
941their removal simplifies the code a lot.
942
943You can use fetch() and first() to search for text in the parse tree,
944not just tags. There are new alias methods fetchText() and firstText()
945designed for this purpose. As with searching for tags, you can pass in
946a string, a regular expression object, or a method to match your text.
947
948If you pass in something besides a map to the attrs argument of
949fetch() or first(), Beautiful Soup will assume you want to match that
950thing against the "class" attribute. When you're scraping
951well-structured HTML, this makes your code a lot cleaner.
952
9531.x and 2.x both let you call a Tag object as a shorthand for
954fetch(). For instance, foo("bar") is a shorthand for
955foo.fetch("bar"). In 2.x, you can also access a specially-named member
956of a Tag object as a shorthand for first(). For instance, foo.barTag
957is a shorthand for foo.first("bar"). By chaining these shortcuts you
958traverse a tree in very little code: for header in
959soup.bodyTag.pTag.tableTag('th'):
960
961If an element relationship (like parent or next) doesn't apply to a
962tag, it'll now show up as Null instead of None. first() will also return
963Null if you ask it for a nonexistent tag. Null is an object that's
964just like None, except you can do whatever you want to it and it'll
965give you Null instead of throwing an error.
966
967This lets you do tree traversals like soup.htmlTag.headTag.titleTag
968without having to worry if the intermediate stages are actually
969there. Previously, if there was no 'head' tag in the document, headTag
970in that instance would have been None, and accessing its 'titleTag'
971member would have thrown an AttributeError. Now, you can get what you
972want when it exists, and get Null when it doesn't, without having to
973do a lot of conditionals checking to see if every stage is None.
974
975There are two new relations between page elements: previousSibling and
976nextSibling. They reference the previous and next element at the same
977level of the parse tree. For instance, if you have HTML like this:
978
979 <p><ul><li>Foo<br /><li>Bar</ul>
980
981The first 'li' tag has a previousSibling of Null and its nextSibling
982is the second 'li' tag. The second 'li' tag has a nextSibling of Null
983and its previousSibling is the first 'li' tag. The previousSibling of
984the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the
985'br' tag.
986
987I took out the ability to use fetch() to find tags that have a
988specific list of contents. See, I can't even explain it well. It was
989really difficult to use, I never used it, and I don't think anyone
990else ever used it. To the extent anyone did, they can probably use
991fetchText() instead. If it turns out someone needs it I'll think of
992another solution.
993
994== Tree manipulation ==
995
996You can add new attributes to a tag, and delete attributes from a
997tag. In 1.x you could only change a tag's existing attributes.
998
999== Porting Considerations ==
1000
1001There are three changes in 2.0 that break old code:
1002
1003In the post-1.2 release you could pass in a function into fetch(). The
1004function took a string, the tag name. In 2.0, the function takes the
1005actual Tag object.
1006
1007It's no longer possible to pass in SQL-style wildcards to fetch(). Use a
1008regular expression instead.
1009
1010The different parsing algorithm means the parse tree may not be shaped
1011like you expect. This will only actually affect you if your code uses
1012one of the affected parts. I haven't run into this problem yet while
1013porting my code.
1014
1015= Between 1.2 and 2.0 =
1016
1017This is the release to get if you want Python 1.5 compatibility.
1018
1019The desired value of an attribute can now be any of the following:
1020
1021 * A string
1022 * A string with SQL-style wildcards
1023 * A compiled RE object
1024 * A callable that returns None/false/empty string if the given value
1025 doesn't match, and any other value otherwise.
1026
1027This is much easier to use than SQL-style wildcards (see, regular
1028expressions are good for something). Because of this, I no longer
1029recommend you use SQL-style wildcards. They may go away in a future
1030release to clean up the code.
1031
1032Made Beautiful Soup handle processing instructions as text instead of
1033ignoring them.
1034
1035Applied patch from Richie Hindle (richie at entrian dot com) that
1036makes tag.string a shorthand for tag.contents[0].string when the tag
1037has only one string-owning child.
1038
1039Added still more nestable tags. The nestable tags thing won't work in
1040a lot of cases and needs to be rethought.
1041
1042Fixed an edge case where searching for "%foo" would match any string
1043shorter than "foo".
1044
1045= 1.2 "Who for such dainties would not stoop?" (20040708) =
1046
1047Applied patch from Ben Last (ben at benlast dot com) that made
1048Tag.renderContents() correctly handle Unicode.
1049
1050Made BeautifulStoneSoup even dumber by making it not implicitly close
1051a tag when another tag of the same type is encountered; only when an
1052actual closing tag is encountered. This change courtesy of Fuzzy (mike
1053at pcblokes dot com). BeautifulSoup still works as before.
1054
1055= 1.1 "Swimming in a hot tureen" =
1056
1057Added more 'nestable' tags. Changed popping semantics so that when a
1058nestable tag is encountered, tags are popped up to the previously
1059encountered nestable tag (of whatever kind). I will revert this if
1060enough people complain, but it should make more people's lives easier
1061than harder. This enhancement was suggested by Anthony Baxter (anthony
1062at interlink dot com dot au).
1063
1064= 1.0 "So rich and green" (20040420) =
1065
1066Initial release.
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py
new file mode 100644
index 0000000000..7ba34269af
--- /dev/null
+++ b/bitbake/lib/bs4/__init__.py
@@ -0,0 +1,406 @@
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup uses a pluggable XML or HTML parser to parse a
7(possibly invalid) document into a tree representation. Beautiful Soup
8provides methods and Pythonic idioms that make it easy to
9navigate, search, and modify the parse tree.
10
11Beautiful Soup works with Python 2.6 and up. It works better if lxml
12and/or html5lib is installed.
13
14For more than you ever wanted to know about Beautiful Soup, see the
15documentation:
16http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17"""
18
19__author__ = "Leonard Richardson (leonardr@segfault.org)"
20__version__ = "4.3.2"
21__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
22__license__ = "MIT"
23
24__all__ = ['BeautifulSoup']
25
26import os
27import re
28import warnings
29
30from .builder import builder_registry, ParserRejectedMarkup
31from .dammit import UnicodeDammit
32from .element import (
33 CData,
34 Comment,
35 DEFAULT_OUTPUT_ENCODING,
36 Declaration,
37 Doctype,
38 NavigableString,
39 PageElement,
40 ProcessingInstruction,
41 ResultSet,
42 SoupStrainer,
43 Tag,
44 )
45
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): the name is never read in the visible code. Presumably the
# u'' literal is meant to be rejected by Python 3 interpreters that do not
# accept that prefix, so that the message appears in the reported source
# line -- confirm which interpreter versions this actually covers.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49
class BeautifulSoup(Tag):
    """Root of the parse tree, and the interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about
                                                        # return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the root tag, i.e. to the BeautifulSoup object itself.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """Initialize the soup as the 'root tag' and parse `markup` into it.

        :param markup: A string, or an object with a read() method whose
            contents are read and used as the markup.
        :param features: A feature name or a list of feature names used to
            look up a tree builder class in builder_registry. Ignored when
            `builder` is given. Defaults to DEFAULT_BUILDER_FEATURES.
        :param builder: A tree builder instance to use as-is.
        :param parse_only: Optional filter consulted by endData() and
            handle_starttag() to decide which top-level elements are kept
            (presumably a SoupStrainer -- only its `text`, `search` and
            `search_tag` members are used in this class).
        :param from_encoding: Passed through to the builder's
            prepare_markup().
        :param kwargs: Only Beautiful Soup 3 era argument names are
            accepted. Most are ignored with a warning; parseOnlyThese and
            fromEncoding are mapped onto their new names.

        :raises TypeError: if any other keyword argument is passed.
        :raises FeatureNotFound: if no registered tree builder offers the
            requested features.
        """
        # Legacy arguments that no longer have any effect: warn, and drop
        # them so they are not reported as unexpected further down.
        if 'convertEntities' in kwargs:
            # This key used to be left in kwargs, which turned the warning
            # into a TypeError a few lines later.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed argument from kwargs, warning about the rename."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        # The old spelling is only consulted (and consumed) when the new
        # one was not supplied or is falsy.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            # Anything still left is unknown. Report one of the names the
            # same way Python itself would. next(iter(...)) avoids relying
            # on keys() returning a list.
            arg = next(iter(kwargs))
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # basestring/unicode are Python 2 names; this module is meant
            # to be run through 2to3 before use on Python 3.
            if isinstance(features, basestring):
                features = [features]
            if not features:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and b' ' not in markup)
                    or (isinstance(markup, unicode) and u' ' not in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

        # The builder may offer several renderings of the markup; use the
        # first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Feed self.markup to the builder, then close everything left open."""
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Re-initialize this object as an empty root tag and clear all
        parsing state (builder, pending data, tag stacks)."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # The soup itself is the bottom entry of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup.

        The keyword arguments become the tag's attributes.
        """
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup.

        :param s: The text of the new string.
        :param subclass: The class to instantiate with `s`.
        """
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Not supported on the root object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the root object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Remove the top of the tag stack.

        Returns the tag that is current afterwards (the new top of the
        stack), not the tag that was removed.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the text collected by handle_data() into one string object
        and add it to the tree.

        Does nothing when no text is pending. The pending text is always
        cleared, even if parse_only causes the string to be discarded.

        :param containerClass: The class to instantiate with the text.
        """
        if not self.current_data:
            return

        current_data = u''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if (not self.preserve_whitespace_tag_stack
            and all(ch in self.ASCII_SPACES for ch in current_data)):
            current_data = '\n' if '\n' in current_data else ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if (self.parse_only and len(self.tagStack) <= 1
            and (not self.parse_only.text
                 or not self.parse_only.search(current_data))):
            return

        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The object to add; its setup() method is called.
        :param parent: The tag to append to. Defaults to the current tag.
        :param most_recent_element: The element `o` follows in document
            order. Defaults to self._most_recent_element.
        """
        # NOTE(review): _most_recent_element is not assigned in reset();
        # presumably Tag supplies it or resolves it to None -- confirm.
        parent = parent or self.currentTag
        most_recent_element = most_recent_element or self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made here, or None if
        nothing was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk down from the top of the stack. Index 0 (the soup itself)
        # is never visited, so it always survives.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # NOTE(review): this looks unreachable for an ordinary class
        # constructor; kept in case Tag customizes instance creation.
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Finish any pending text, then pop the stack through the most
        recent open tag with this name and prefix."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Buffer a chunk of text until the next endData() call."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Return the document as text, preceded by an XML declaration
        when this is an XML document.

        :param pretty_print: If true, the tree is rendered starting at
            indent level 0; otherwise no indent level is passed on.
        :param eventual_encoding: Named in the XML declaration unless it
            is None, and passed on to Tag.decode().
        :param formatter: Passed on to Tag.decode().
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        indent_level = 0 if pretty_print else None
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
379
# Short aliases for the BeautifulSoup class, to make the import easier to
# type: 'from bs4 import _soup' (or 'from bs4 import _s'). Neither name is
# listed in __all__.
_s = BeautifulSoup
_soup = BeautifulSoup
383
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Behaves like BeautifulSoup, except that the 'features' keyword is
    always set to 'xml' and a warning is issued on construction.
    """

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn about the deprecation, and hand
        everything to BeautifulSoup.__init__()."""
        kwargs.update(features='xml')
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
393
394
class StopParsing(Exception):
    """Exception type exported by this module; nothing in this module
    raises it."""
397
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder offers the requested features."""
400
401
# By default, act as an HTML pretty-printer: when this module is run as a
# script, parse whatever arrives on standard input and print the
# prettified tree.
if __name__ == '__main__':
    import sys
    # sys.stdin has a read() method, so the constructor reads it as a file.
    soup = BeautifulSoup(sys.stdin)
    # Python 2 print statement, like the rest of this unconverted module.
    print soup.prettify()
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
new file mode 100644
index 0000000000..740f5f29cd
--- /dev/null
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -0,0 +1,321 @@
1from collections import defaultdict
2import itertools
3import sys
4from bs4.element import (
5 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue,
7 whitespace_re
8 )
9
10__all__ = [
11 'HTMLTreeBuilder',
12 'SAXTreeBuilder',
13 'TreeBuilder',
14 'TreeBuilderRegistry',
15 ]
16
# Feature names that a TreeBuilder may list in its ``features`` attribute.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"
24
25
class TreeBuilderRegistry(object):
    """Keeps track of tree builder classes and the features they advertise.

    Builders are stored most-recently-registered first, both in the
    overall list and in each per-feature list.
    """

    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the preferred builder class matching ``features``.

        With no features, the most recently registered builder is
        returned. Features for which no builder is registered are
        ignored. Returns None when the registry is empty, when none of
        the features is known, or when no builder has all of the known
        features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # No features requested: hand back the newest builder.
            return self.builders[0]

        # ``ordered`` remembers the priority order given by the first
        # recognized feature; ``allowed`` is narrowed by every later one.
        ordered = None
        allowed = None
        for feature in features:
            matching = self.builders_for_feature.get(feature, [])
            if not matching:
                continue
            if ordered is None:
                ordered = matching
                allowed = set(matching)
            else:
                allowed = allowed.intersection(matching)

        if allowed is None:
            return None

        # Pick the highest-priority builder that survived every filter.
        for candidate in ordered:
            if candidate in allowed:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use
# them to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
79
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()

    # When this is None, a tag is considered an empty-element tag when
    # and only when it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Do nothing; the base class has no state to reset."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        known = self.empty_element_tags
        return known is None or tag_name in known

    def feed(self, markup):
        """Parse ``markup``; must be implemented by subclasses."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return ``(markup, None, None, False)`` without touching markup.

        NOTE(review): the html5lib and HTMLParser builders implement this
        method as a generator that yields such tuples, whereas this base
        version returns one tuple directly -- confirm what callers expect.
        """
        return (markup, None, None, False)

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Return False; the base class sets up no substitutions."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if not attrs:
            return attrs
        list_attributes = self.cdata_list_attributes
        if not list_attributes:
            return attrs

        universal = list_attributes.get('*', [])
        tag_specific = list_attributes.get(tag_name.lower(), None)
        for attr in attrs.keys():
            is_list_attribute = attr in universal or (
                tag_specific and attr in tag_specific)
            if not is_list_attribute:
                continue
            # A "class"-type attribute: its string value is a
            # whitespace-separated list, so split it into a list.
            #
            # html5lib sometimes calls setAttributes twice for the same
            # tag when rearranging the parse tree. On the second call
            # the value is already a list, so it is stored back as is
            # rather than split again.
            value = attrs[attr]
            attrs[attr] = (
                whitespace_re.split(value)
                if isinstance(value, basestring) else value)
        return attrs
174
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        """Parse ``markup``; not implemented for this builder."""
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward a start-element event to the soup.

        Attribute keys are indexable pairs; only the second item of each
        key is kept as the attribute name.

        NOTE(review): elsewhere in this package ``handle_starttag`` is
        called with four positional arguments; here it gets two --
        confirm against the soup's signature.
        """
        flattened = {}
        for key, value in list(attrs.items()):
            flattened[key[1]] = value
        self.soup.handle_starttag(name, flattened)

    def endElement(self, name):
        """Forward an end-element event to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
220
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # (The original literal listed the "td" key twice with the same
    # value; the redundant entry has been dropped.)
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    def set_up_substitutions(self, tag):
        """Replace encoding-bearing <meta> attribute values with standins.

        Only <meta> tags are considered. An HTML 5-style ``charset``
        attribute is replaced with a CharsetMetaAttributeValue; an HTML
        4-style ``content`` attribute (when ``http-equiv`` is
        "content-type", case-insensitively) is replaced with a
        ContentMetaAttributeValue.

        :return: True only when a ``charset`` attribute was found.
            NOTE(review): the HTML 4 branch installs a standin but never
            sets ``meta_encoding``, so it returns False -- confirm that
            this is intended.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. In both supported styles the value
        # of the appropriate attribute is replaced with a standin object
        # that can take on any encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
287
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a TreeBuilder
    subclass is set as an attribute of ``bs4.builder``, appended to that
    module's ``__all__`` and registered with its ``builder_registry``.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)
        if not issubclass(obj, TreeBuilder):
            continue
        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
300
class ParserRejectedMarkup(Exception):
    """Exception subclass that adds no behavior of its own."""
303
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)

try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    pass  # html5lib is not installed.

try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    pass  # lxml is not installed.
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
new file mode 100644
index 0000000000..7de36ae75e
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -0,0 +1,285 @@
1__all__ = [
2 'HTML5TreeBuilder',
3 ]
4
5import warnings
6from bs4.builder import (
7 PERMISSIVE,
8 HTML,
9 HTML_5,
10 HTMLTreeBuilder,
11 )
12from bs4.element import NamespacedAttribute
13import html5lib
14from html5lib.constants import namespaces
15from bs4.element import (
16 Comment,
17 Doctype,
18 NavigableString,
19 Tag,
20 )
21
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield a single ``(markup, None, None, False)`` tuple.

        The signature mirrors the other tree builders in this package
        (same parameter names and defaults), so the builders can be
        called interchangeably. ``document_declared_encoding`` is
        accepted for that reason and is not used here.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse ``markup`` with html5lib and record the encoding used."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
55
56
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that builds into a Beautiful Soup object."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from the token and hand it to the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag through the soup and wrap it in an Element."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap a new Comment in a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with an empty one named as a fragment."""
        # BeautifulSoup is not imported at module level in this file;
        # import it here, at call time, so the name is defined.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup element behind html5lib's fragment."""
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
96
class AttrList(object):
    """Mapping-like view over a snapshot of an element's attributes.

    The attributes are copied into ``self.attrs`` at construction time.
    Reads are served from that copy; writes go to the element itself and
    are not reflected in the copy.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterates over (name, value) pairs, not just names.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        # Written through to the element; self.attrs is left untouched.
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
116
117
class Element(html5lib.treebuilders._base.Node):
    """html5lib tree node that wraps a Beautiful Soup element."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append ``node`` (a wrapper node, a Tag or a string) to this one."""
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, basestring) and child.parent is not None:
            # Detach the child from its current parent. ``child`` is
            # ``node.element`` for wrapper nodes and the node itself for
            # a bare Tag, which has no wrapper to dereference.
            child.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy ``attributes`` onto the wrapped element.

        Tuple keys are replaced by NamespacedAttribute objects and
        list-valued attributes are split before being stored. The
        ``attributes`` mapping is modified in place.
        """
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy because keys are replaced in the loop.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert text at the end, or before the ``insertBefore`` node."""
        if insertBefore:
            # insertBefore() reads ``node.element``, so it must be given
            # the TextNode wrapper rather than the raw string.
            text = TextNode(self.soup.new_string(data), self.soup)
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        """Insert ``node`` directly before ``refNode`` among the children."""
        index = self.element.index(refNode.element)
        # Only merge with a preceding string when there is a preceding
        # sibling; without the ``index > 0`` guard, ``contents[index-1]``
        # would wrap around to the last child when index is 0.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            first_child.previous_element = new_parents_last_descendant
            first_child.previous_sibling = new_parents_last_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; callers rely on truthiness.
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
277
class TextNode(Element):
    """Element variant for text and comments; it cannot be cloned."""

    def __init__(self, element, soup):
        # The html5lib node is initialized with no name.
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        raise NotImplementedError
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000000..ca8d8b892b
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -0,0 +1,258 @@
1"""Use the HTMLParser library to parse HTML files that aren't too bad."""
2
3__all__ = [
4 'HTMLParserTreeBuilder',
5 ]
6
7from HTMLParser import (
8 HTMLParser,
9 HTMLParseError,
10 )
11import sys
12import warnings
13
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
#
# The flag is true for Python 3.2.3 and every later version.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (major, minor, release) >= (3, 2, 3)
26
27from bs4.element import (
28 CData,
29 Comment,
30 Declaration,
31 Doctype,
32 ProcessingInstruction,
33 )
34from bs4.dammit import EntitySubstitution, UnicodeDammit
35
36from bs4.builder import (
37 HTML,
38 HTMLTreeBuilder,
39 STRICT,
40 )
41
42
# Feature name listed in HTMLParserTreeBuilder.features.
HTMLPARSER = "html.parser"
44
class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to ``self.soup``.

    ``soup`` is not set here; it must be assigned on the instance before
    markup is fed to the parser.
    """

    def handle_starttag(self, name, attrs):
        """Forward a start tag, passing the attributes as a dict."""
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Convert a numeric character reference to text and forward it."""
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            # The code point cannot be represented as a character.
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Forward a named entity as its character, or as literal text."""
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # Unknown entity: pass the original markup through.
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Forward a declaration as a Doctype, minus the DOCTYPE keyword."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Forward a CDATA section as CData, anything else as Declaration."""
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Forward a processing instruction."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
126
127
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by BeautifulSoupHTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Remember the arguments used to construct each parser.

        When CONSTRUCTOR_TAKES_STRICT is true, ``strict=False`` is forced
        into the keyword arguments.
        """
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield the markup converted to Unicode.

        This is a generator that yields exactly one 4-tuple: (markup,
        original encoding, encoding declared within markup, whether any
        characters had to be replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            # Already Unicode: nothing to detect or convert.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse ``markup`` with a new parser attached to ``self.soup``.

        :raises HTMLParseError: re-raised after a RuntimeWarning is
            issued when the built-in parser rejects the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # A bare raise keeps the original traceback intact.
            raise
165
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this is attached to the tree builder class, while the
    # other patches below target BeautifulSoupHTMLParser; parse_starttag
    # below reads the module-level name either way.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at offset ``i`` of ``self.rawdata``.

        Returns the offset just past the tag, or the negative value
        reported by ``check_for_whole_start_tag`` when the tag is not
        complete yet.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        pos = match.end()
        self.lasttag = tag = rawdata[i+1:pos].lower()
        while pos < endpos:
            if self.strict:
                attr_match = attrfind.match(rawdata, pos)
            else:
                attr_match = attrfind_tolerant.match(rawdata, pos)
            if not attr_match:
                break
            attrname, rest, attrvalue = attr_match.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:]
                  or attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip the matching pair of quotes around the value.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            pos = attr_match.end()

        end = rawdata[pos:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = (len(self.__starttag_text)
                          - self.__starttag_text.rfind("\n"))
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[pos:endpos][:20],))
            # Malformed tag: pass the raw text through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Record the CDATA element and the pattern matching its end tag."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches applied, the constructor is treated as accepting
    # the 'strict' argument.
    CONSTRUCTOR_TAKES_STRICT = True
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
new file mode 100644
index 0000000000..fa5d49875e
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -0,0 +1,233 @@
1__all__ = [
2 'LXMLTreeBuilderForXML',
3 'LXMLTreeBuilder',
4 ]
5
6from io import BytesIO
7from StringIO import StringIO
8import collections
9from lxml import etree
10from bs4.element import Comment, Doctype, NamespacedAttribute
11from bs4.builder import (
12 FAST,
13 HTML,
14 HTMLTreeBuilder,
15 PERMISSIVE,
16 ParserRejectedMarkup,
17 TreeBuilder,
18 XML)
19from bs4.dammit import EncodingDetector
20
21LXML = 'lxml'
22
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that acts as the parser target for lxml's XML parser.

    lxml calls start()/end()/data()/comment()/doctype()/pi()/close() on
    this object while parsing; each callback forwards the event to the
    soup object stored in ``self.soup``.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Number of bytes/characters handed to the parser per feed() call.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    def default_parser(self, encoding):
        """Return the parser to use for `encoding`.

        This can either return a parser object or a class, which
        will be instantiated with default arguments by parser_for().
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Return a parser object for `encoding`, instantiating if needed."""
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # We were given a class (or factory): instantiate the
            # parser with default arguments.
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

    def __init__(self, parser=None, empty_element_tags=None):
        """Remember an optional parser (object or class) and tag set.

        :param parser: A parser object or a callable that creates one;
            None means default_parser() builds an etree.XMLParser.
        :param empty_element_tags: Optional iterable of tag names,
            stored as a set on the instance.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (namespace URL -> prefix) maps; entries are
        # None for tags that introduced no namespaces of their own.
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        """Split '{namespace}name' into (namespace, name).

        Returns (None, tag) when the tag is not namespace-qualified.
        Copied from lxml's src/lxml/sax.py.
        """
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)

        Each 4-tuple represents a strategy for parsing the document.
        """
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(markup, try_encodings, is_html)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Feed `markup` to a fresh parser in CHUNK_SIZE pieces.

        :raise ParserRejectedMarkup: if lxml (or the codec machinery)
            raises UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, unicode):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def close(self):
        """Reset the namespace stack once parsing is finished."""
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap={}):
        """Handle an opening tag reported by lxml.

        :param name: Tag name, possibly in '{namespace}name' form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix -> namespace URL newly declared
            on this tag. The default dict is never mutated.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play. This must be
            # recorded even when other mappings are already active,
            # otherwise prefixes declared on nested tags are lost.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Innermost mappings win, so walk the stack from the top.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag reported by lxml."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Ignore processing instructions."""
        pass

    def data(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Turn a doctype declaration into a Doctype object."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
211
212
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML flavour of the lxml tree builder (uses etree.HTMLParser)."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        """Return the parser class; parser_for() instantiates it."""
        return etree.HTMLParser

    def feed(self, markup):
        """Feed the whole of `markup` to a fresh HTML parser in one call.

        :raise ParserRejectedMarkup: if parsing raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py
new file mode 100644
index 0000000000..59640b7ce3
--- /dev/null
+++ b/bitbake/lib/bs4/dammit.py
@@ -0,0 +1,829 @@
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
3
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job.
8"""
9
10import codecs
11from htmlentitydefs import codepoint2name
12import re
13import logging
14import string
15
16# Import a library to autodetect character encodings.
17chardet_type = None
18try:
19 # First try the fast C implementation.
20 # PyPI package: cchardet
21 import cchardet
22 def chardet_dammit(s):
23 return cchardet.detect(s)['encoding']
24except ImportError:
25 try:
26 # Fall back to the pure Python implementation
27 # Debian package: python-chardet
28 # PyPI package: chardet
29 import chardet
30 def chardet_dammit(s):
31 return chardet.detect(s)['encoding']
32 #import chardet.constants
33 #chardet.constants._debug = 1
34 except ImportError:
35 # No chardet available.
36 def chardet_dammit(s):
37 return None
38
39# Available from http://cjkpython.i18n.org/.
40try:
41 import iconv_codec
42except ImportError:
43 pass
44
45xml_encoding_re = re.compile(
46 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
47html_meta_re = re.compile(
48 '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
49
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        # Build both directions of the character <-> entity-name mapping
        # from codepoint2name, plus a regex matching every character
        # that has a named entity.
        char_to_name = {}
        name_to_char = {}
        for code, entity in codepoint2name.items():
            ch = unichr(code)
            # Every entity, including &quot;, can be turned back into
            # its character.
            name_to_char[entity] = ch
            if code == 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value,
                # which is handled elsewhere.
                continue
            char_to_name[ch] = entity
        pattern = re.compile("[%s]" % "".join(char_to_name))
        return char_to_name, name_to_char, pattern
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # An angle bracket, or an ampersand that does not start something
    # shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: return the named HTML entity for the match."""
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: return the XML entity for the matched
        XML special character."""
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

    @classmethod
    def quoted_attribute_value(cls, value):
        """Wrap a value in quotes so it can be used as an XML attribute.

        Double quotes are used unless the value itself contains them:

         Bob's Bar -> "Bob's Bar"

        A value containing double quotes but no single quotes is
        wrapped in single quotes:

         Welcome to "my bar" -> 'Welcome to "my bar"'

        A value containing both kinds has its double quotes replaced
        with &quot; and is wrapped in double quotes:

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Only double quotes present: single quotes are safe.
            return "'" + value + "'"
        # Both kinds present. The double quotes are the ones escaped
        # because "&quot;" is the entity name in both HTML and XML.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Replace every <, > and & in `value` with an XML entity.

        :param value: The string to escape. Ampersands are escaped
         unconditionally; use substitute_xml_containing_entities() to
         leave existing entity references alone.

        :param make_quoted_attribute: If True, the escaped string is
         additionally passed through quoted_attribute_value().
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Replace <, > and bare ampersands in `value` with XML entities.

        :param value: The string to escape. Ampersands that already
         begin an entity reference (as matched by
         BARE_AMPERSAND_OR_BRACKET) are left untouched.

        :param make_quoted_attribute: If True, the escaped string is
         additionally passed through quoted_attribute_value().
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        Unlike data.encode(encoding, 'xmlcharrefreplace'), the aim is
        readability on ASCII displays rather than error recovery: a
        LATIN SMALL LETTER E WITH ACUTE is perfectly valid, but
        "&eacute;" is easier for some people to read.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
193
194
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Store the inputs and strip any byte-order mark from `markup`.

        :param markup: The document to examine.
        :param override_encodings: Encodings to suggest first; entries
            that are None are skipped.
        :param is_html: If True, <meta> charset declarations are also
            considered when looking for a declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True if `encoding` is set and has not been tried yet.

        The lower-cased name is added to `tried` as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :return: A 2-tuple (data without the BOM, encoding name or None).
        """
        encoding = None
        # A UTF-16 BOM followed by two NUL bytes is really the start of
        # a UTF-32 BOM, so those cases are excluded here. The NULs are
        # compared as bytes: comparing a bytes slice to a text literal
        # is always unequal wherever the two types are distinct, which
        # made UTF-32LE input look like UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :return: The declared encoding, lower-cased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # A garbage declaration may contain non-ASCII bytes; decode
            # leniently so it yields an unusable name, not an exception.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
313
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Try to convert `markup` to Unicode.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encodings to try before any others.
        :param smart_quotes_to: None to leave bytes 0x80-0x9f alone, or
            'ascii', 'xml' or 'html' (any other non-None value behaves
            like 'html') to replace them before decoding.
        :param is_html: Passed to the EncodingDetector and consulted by
            declared_html_encoding.

        Afterwards ``unicode_markup`` holds the converted document (or
        None if nothing worked) and ``original_encoding`` the codec
        name that succeeded (or None).
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        Regex callback: `match` must have the single byte as group 1.
        Returns the replacement as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hex code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup as `proposed`.

        Returns the decoded markup on success (also stored on
        self.markup, with self.original_encoding set), or None if the
        codec is unknown, was already tried with the same `errors`
        mode, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means "this encoding
            # didn't work", and the caller moves on to the next one.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared, for HTML only."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map `charset` to a lower-cased codec name.

        Tries the CHARSET_ALIASES entry (or the name itself), then the
        name with hyphens removed, then with hyphens turned into
        underscores; if no codec is found the lower-cased name is
        returned anyway. Returns None for an empty/None charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python knows a codec by that name, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hex code point); plain strings
    # are used verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code point here produced "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á (the \xc3 lead byte was missing)
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raise NotImplementedError: for any other pair of encodings.
        """
        # Underscores and case are not significant in the name.
        embedded = embedded_encoding.replace('_', '-').lower()
        if embedded not in ('windows-1252', 'iso-8859-1'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
829
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
new file mode 100644
index 0000000000..4d0b00afad
--- /dev/null
+++ b/bitbake/lib/bs4/diagnose.py
@@ -0,0 +1,204 @@
1"""Diagnostic functions, mainly for use when doing tech support."""
2import cProfile
3from StringIO import StringIO
4from HTMLParser import HTMLParser
5import bs4
6from bs4 import BeautifulSoup, __version__
7from bs4.builder import builder_registry
8
9import os
10import pstats
11import random
12import tempfile
13import time
14import traceback
15import sys
16import cProfile
17
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    basic parsers are registered, and then shows what each available
    parser makes of the markup.

    :param data: The markup to examine. This may be an object with a
        read() method, the path of an existing file, or a string of
        markup. A string that looks like an http(s) URL is rejected
        with an explanation; nothing is fetched.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Walk a snapshot of the list: removing an entry from the list that
    # is being iterated would silently skip the entry that follows it.
    for name in list(basic_parsers):
        installed = any(
            name in builder.features for builder in builder_registry.builders)
        if not installed:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # lxml can also be asked for its XML parser.
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Close the file as soon as its contents have been read.
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, parser)
        except Exception:
            # Diagnostic boundary: report the failure and move on to
            # the next parser.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
68
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    The markup is handed straight to lxml's iterparse(), so the output
    shows how lxml treats a document when no Beautiful Soup code is
    running. Extra keyword arguments are passed through to iterparse().
    """
    from lxml import etree
    source = StringIO(data)
    parse_events = etree.iterparse(source, html=html, **kwargs)
    for event, element in parse_events:
        print("%s, %4s, %s" % (event, element.tag, element.text))
78
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single output point shared by every handler below.
        print(s)

    def handle_starttag(self, name, attrs):
        message = "%s START" % name
        self._p(message)

    def handle_endtag(self, name):
        message = "%s END" % name
        self._p(message)

    def handle_data(self, data):
        message = "%s DATA" % data
        self._p(message)

    def handle_charref(self, name):
        message = "%s CHARREF" % name
        self._p(message)

    def handle_entityref(self, name):
        message = "%s ENTITYREF" % name
        self._p(message)

    def handle_comment(self, data):
        message = "%s COMMENT" % data
        self._p(message)

    def handle_decl(self, data):
        message = "%s DECL" % data
        self._p(message)

    def unknown_decl(self, data):
        message = "%s UNKNOWN-DECL" % data
        self._p(message)

    def handle_pi(self, data):
        message = "%s PI" % data
        self._p(message)
111
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    The markup is fed to an AnnouncingParser, so the output shows how
    HTMLParser treats a document when no Beautiful Soup code is
    running.
    """
    AnnouncingParser().feed(data)
120
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    "Generate a random word-like string."
    # Even positions draw from the consonants, odd positions from the
    # vowels, so the result is vaguely pronounceable.
    alphabets = (_consonants, _vowels)
    return ''.join(
        random.choice(alphabets[index % 2]) for index in range(length))
134
def rsentence(length=4):
    "Generate a random sentence-like string."
    words = []
    for _ in range(length):
        # Each word is between four and nine letters long.
        word_length = random.randint(4, 9)
        words.append(rword(word_length))
    return " ".join(words)
138
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of the num_elements rounds rolls a number from 0 to 3 and
    emits an opening tag (0), a random sentence (1) or a closing tag
    (2). A roll of 3 emits nothing, so the document may hold fewer
    than num_elements pieces.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 0:
            # New tag.
            pieces.append("<%s>" % random.choice(tag_names))
        elif roll == 1:
            sentence_length = random.randint(1, 4)
            pieces.append(rsentence(sentence_length))
        elif roll == 2:
            # Close a tag.
            pieces.append("</%s>" % random.choice(tag_names))
    body = "\n".join(pieces)
    return "<html>" + body + "</html>"
156
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random invalid document with rdoc(), times Beautiful
    Soup on it with each parser, and then times lxml and html5lib on
    the same document without Beautiful Soup. Results are printed.

    :param num_elements: Passed to rdoc() to size the test document.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            BeautifulSoup(data, parser)
            finished = time.time()
        except Exception:
            # A parser that fails (or is missing) must not stop the
            # remaining measurements.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (
                parser, finished - started))

    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished - started))

    import html5lib
    parser = html5lib.HTMLParser()
    started = time.time()
    parser.parse(data)
    finished = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (finished - started))
188
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    The profile is written to a temporary file, and the 50 entries
    with the highest cumulative time whose names match
    '_html5lib|bs4' are printed.

    :param num_elements: Passed to rdoc() to size the test document.
    :param parser: The parser name handed to BeautifulSoup.
    """
    filehandle = tempfile.NamedTemporaryFile()
    try:
        filename = filehandle.name

        data = rdoc(num_elements)
        # The same mapping serves as globals and locals for the
        # profiled statement.
        namespace = dict(bs4=bs4, data=data, parser=parser)
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        # stats.strip_dirs()
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
    finally:
        # Release the temporary file even if profiling fails.
        filehandle.close()
202
if __name__ == '__main__':
    # Run as a script: diagnose whatever markup arrives on stdin.
    markup = sys.stdin.read()
    diagnose(markup)
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
new file mode 100644
index 0000000000..da9afdf48e
--- /dev/null
+++ b/bitbake/lib/bs4/element.py
@@ -0,0 +1,1611 @@
1import collections
2import re
3import sys
4import warnings
5from bs4.dammit import EntitySubstitution
6
# Encoding assumed when a byte string has to be turned into Unicode.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when running on Python 3 or later.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more whitespace characters. Written as a raw
# string so that "\s" reaches the regex engine as intended.
whitespace_re = re.compile(r"\s+")
11
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    :param attr: Name of the attribute the alias stands for.
    :return: A property that reads from, and writes to, that attribute.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value; pass it
        # on to the real attribute.
        return setattr(self, attr, value)
    return alias
22
23
class NamespacedAttribute(unicode):
    """An attribute name that remembers its prefix, local name and
    namespace.

    The string value is "prefix:name" when both parts are given, and
    otherwise whichever single part was supplied.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = unicode.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
38
class AttributeValueWithCharsetSubstitution(unicode):
    """A stand-in object for a character encoding specified in HTML.

    This is the common base of the attribute-value classes whose
    encode() method substitutes the target encoding.
    """
41
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        instance = unicode.__new__(cls, original_value)
        # Keep the value exactly as it appeared in the markup.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        # The "encoded" form of a charset declaration is simply the
        # name of the encoding being written.
        return encoding
56
57
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the declared encoding. A raw string keeps "\s" intact for the
    # regex engine.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary: hand back an ordinary string.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
83
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution rules that are aware of some HTML quirks.

    Specifically, the contents of <script> and <style> tags should not
    undergo entity substitution.

    Incoming NavigableString objects are checked to see if they're the
    direct children of a <script> or <style> tag.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Apply `f` to `ns` unless `ns` is a string sitting directly
        inside one of the cdata_containing_tags."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Do nothing.
            return ns
        # Substitute.
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
118
119class PageElement(object):
120 """Contains the navigational information for some part of the page
121 (either a tag or a piece of text)"""
122
# There are four possible values for the "formatter" argument passed in
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
#   are converted to those entities on output.
# "minimal" - Bare ampersands and angle brackets are converted to
#   XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
#   converted to entities.  This is not recommended, but it's
#   faster than "minimal".
# A function - This function will be called on every string that
#  needs to undergo entity substitution.
#

# In an HTML document, the default "html" and "minimal" functions
# will leave the contents of <script> and <style> tags alone. For
# an XML document, all tags will be given the same treatment.

HTML_FORMATTERS = {
    "html": HTMLAwareEntitySubstitution.substitute_html,
    "minimal": HTMLAwareEntitySubstitution.substitute_xml,
    None: None,
}

XML_FORMATTERS = {
    "html": EntitySubstitution.substitute_html,
    "minimal": EntitySubstitution.substitute_xml,
    None: None,
}
152
def format_string(self, s, formatter='minimal'):
    """Format the given string using the given formatter.

    `formatter` is either a callable, which is used as-is, or a
    formatter name, which is looked up with _formatter_for_name(). If
    the lookup yields None the string is returned untouched.
    """
    substitute = formatter
    if not callable(substitute):
        substitute = self._formatter_for_name(substitute)
    if substitute is None:
        return s
    return substitute(s)
162
163 @property
164 def _is_xml(self):
165 """Is this element part of an XML tree or an HTML tree?
166
167 This is used when mapping a formatter name ("minimal") to an
168 appropriate function (one that performs entity-substitution on
169 the contents of <script> and <style> tags, or not). It's
170 inefficient, but it should be called very rarely.
171 """
172 if self.parent is None:
173 # This is the top-level object. It should have .is_xml set
174 # from tree creation. If not, take a guess--BS is usually
175 # used on HTML markup.
176 return getattr(self, 'is_xml', False)
177 return self.parent._is_xml
178
def _formatter_for_name(self, name):
    """Look up a formatter function based on its name and the tree.

    Unknown names fall back to the XML-style substitution that suits
    the kind of tree (XML or HTML) this element belongs to.
    """
    if self._is_xml:
        table = self.XML_FORMATTERS
        fallback = EntitySubstitution.substitute_xml
    else:
        table = self.HTML_FORMATTERS
        fallback = HTMLAwareEntitySubstitution.substitute_xml
    return table.get(name, fallback)
187
188 def setup(self, parent=None, previous_element=None):
189 """Sets up the initial relations between this element and
190 other elements."""
191 self.parent = parent
192 self.previous_element = previous_element
193 if previous_element is not None:
194 self.previous_element.next_element = self
195 self.next_element = None
196 self.previous_sibling = None
197 self.next_sibling = None
198 if self.parent is not None and self.parent.contents:
199 self.previous_sibling = self.parent.contents[-1]
200 self.previous_sibling.next_sibling = self
201
202 nextSibling = _alias("next_sibling") # BS3
203 previousSibling = _alias("previous_sibling") # BS3
204
205 def replace_with(self, replace_with):
206 if replace_with is self:
207 return
208 if replace_with is self.parent:
209 raise ValueError("Cannot replace a Tag with its parent.")
210 old_parent = self.parent
211 my_index = self.parent.index(self)
212 self.extract()
213 old_parent.insert(my_index, replace_with)
214 return self
215 replaceWith = replace_with # BS3
216
217 def unwrap(self):
218 my_parent = self.parent
219 my_index = self.parent.index(self)
220 self.extract()
221 for child in reversed(self.contents[:]):
222 my_parent.insert(my_index, child)
223 return self
224 replace_with_children = unwrap
225 replaceWithChildren = unwrap # BS3
226
def wrap(self, wrap_inside):
    """Put `wrap_inside` where this element is, then move this element
    inside it. Returns `wrap_inside`."""
    displaced = self.replace_with(wrap_inside)
    wrap_inside.append(displaced)
    return wrap_inside
231
def extract(self):
    """Destructively rips this element out of the tree.

    The element is removed from its parent's contents, and the
    document-order and sibling links around it are joined up again.
    Returns the element itself.
    """
    if self.parent is not None:
        del self.parent.contents[self.parent.index(self)]

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two. This has to happen before the sibling links are
    # cleared, because _last_descendant() consults them.
    tail = self._last_descendant()
    following = tail.next_element

    if self.previous_element is not None:
        self.previous_element.next_element = following
    if following is not None:
        following.previous_element = self.previous_element
    self.previous_element = None
    tail.next_element = None

    self.parent = None
    # Let the neighbouring siblings point at each other.
    if self.previous_sibling is not None:
        self.previous_sibling.next_sibling = self.next_sibling
    if self.next_sibling is not None:
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
257
258 def _last_descendant(self, is_initialized=True, accept_self=True):
259 "Finds the last element beneath this object to be parsed."
260 if is_initialized and self.next_sibling:
261 last_child = self.next_sibling.previous_element
262 else:
263 last_child = self
264 while isinstance(last_child, Tag) and last_child.contents:
265 last_child = last_child.contents[-1]
266 if not accept_self and last_child == self:
267 last_child = None
268 return last_child
269 # BS3: Not part of the API!
270 _lastRecursiveChild = _last_descendant
271
def insert(self, position, new_child):
    """Insert `new_child` into this object's contents at `position`.

    A plain string is wrapped in a NavigableString first. An element
    that already has a parent is extracted from it before being
    inserted. The sibling and document-order links of the new child
    and of its neighbours are rewired.

    Raises ValueError when asked to insert an object into itself.
    """
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    is_plain_string = (isinstance(new_child, basestring)
                       and not isinstance(new_child, NavigableString))
    if is_plain_string:
        new_child = NavigableString(new_child)

    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            if self.index(new_child) < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        new_child.extract()

    new_child.parent = self
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        older_sibling = self.contents[position - 1]
        new_child.previous_sibling = older_sibling
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = older_sibling._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    tail = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb towards the root until some ancestor has a next
        # sibling: that is the element that comes next in the
        # document. If none does, the new child's last element is the
        # last element in the document and `following` stays None.
        ancestor = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        younger_sibling = self.contents[position]
        new_child.next_sibling = younger_sibling
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        tail.next_element = younger_sibling

    if tail.next_element is not None:
        tail.next_element.previous_element = tail
    self.contents.insert(position, new_child)
335
def append(self, tag):
    """Appends the given tag to the contents of this tag."""
    end_position = len(self.contents)
    self.insert(end_position, tag)
339
340 def insert_before(self, predecessor):
341 """Makes the given element the immediate predecessor of this one.
342
343 The two elements will have the same parent, and the given element
344 will be immediately before this one.
345 """
346 if self is predecessor:
347 raise ValueError("Can't insert an element before itself.")
348 parent = self.parent
349 if parent is None:
350 raise ValueError(
351 "Element has no parent, so 'before' has no meaning.")
352 # Extract first so that the index won't be screwed up if they
353 # are siblings.
354 if isinstance(predecessor, PageElement):
355 predecessor.extract()
356 index = parent.index(self)
357 parent.insert(index, predecessor)
358
359 def insert_after(self, successor):
360 """Makes the given element the immediate successor of this one.
361
362 The two elements will have the same parent, and the given element
363 will be immediately after this one.
364 """
365 if self is successor:
366 raise ValueError("Can't insert an element after itself.")
367 parent = self.parent
368 if parent is None:
369 raise ValueError(
370 "Element has no parent, so 'after' has no meaning.")
371 # Extract first so that the index won't be screwed up if they
372 # are siblings.
373 if isinstance(successor, PageElement):
374 successor.extract()
375 index = parent.index(self)
376 parent.insert(index+1, successor)
377
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears after this Tag in the document."""
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, text, **kwargs)
findNext = find_next  # BS3
383
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Returns all items that match the given criteria and appear
    after this Tag in the document."""
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllNext = find_all_next  # BS3
391
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears after this Tag in the document."""
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findNextSibling = find_next_sibling  # BS3
398
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear after this Tag in the document."""
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2
407
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears before this Tag in the document."""
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
414
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Returns all items that match the given criteria and appear
    before this Tag in the document."""
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous  # BS2
423
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears before this Tag in the document."""
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findPreviousSibling = find_previous_sibling  # BS3
430
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear before this Tag in the document."""
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
439
def find_parent(self, name=None, attrs={}, **kwargs):
    """Returns the closest parent of this Tag that matches the given
    criteria."""
    # NOTE: We can't use _find_one because findParents takes a different
    # set of arguments.
    matches = self.find_parents(name, attrs, 1, **kwargs)
    if matches:
        return matches[0]
    return None
findParent = find_parent  # BS3
451
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Returns the parents of this Tag that match the given
    criteria."""
    ancestors = self.parents
    # Parents are never matched on text, hence the fixed None.
    return self._find_all(name, attrs, None, limit, ancestors, **kwargs)
findParents = find_parents  # BS3
fetchParents = find_parents  # BS2
460
461 @property
462 def next(self):
463 return self.next_element
464
465 @property
466 def previous(self):
467 return self.previous_element
468
469 #These methods do the real heavy lifting.
470
471 def _find_one(self, method, name, attrs, text, **kwargs):
472 r = None
473 l = method(name, attrs, text, 1, **kwargs)
474 if l:
475 r = l[0]
476 return r
477
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterates over a generator looking for things that match.

    `name` may itself be a SoupStrainer, in which case it is used
    directly; otherwise one is built from the criteria. Matching
    stops early once `limit` results have been collected.
    """
    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    # With nothing but (at most) a tag name to go on, the strainer can
    # be bypassed and the results produced lazily.
    unfiltered = text is None and not limit and not attrs and not kwargs
    if unfiltered and (name is True or name is None):
        # Optimization to find all tags.
        every_tag = (node for node in generator
                     if isinstance(node, Tag))
        return ResultSet(strainer, every_tag)
    if unfiltered and isinstance(name, basestring):
        # Optimization to find all tags with a given name.
        named_tags = (node for node in generator
                      if isinstance(node, Tag)
                      and node.name == name)
        return ResultSet(strainer, named_tags)

    matches = ResultSet(strainer)
    for candidate in generator:
        if not candidate:
            continue
        found = strainer.search(candidate)
        if found:
            matches.append(found)
            if limit and len(matches) >= limit:
                break
    return matches
511
512 #These generators can be used to navigate starting from both
513 #NavigableStrings and Tags.
514 @property
515 def next_elements(self):
516 i = self.next_element
517 while i is not None:
518 yield i
519 i = i.next_element
520
521 @property
522 def next_siblings(self):
523 i = self.next_sibling
524 while i is not None:
525 yield i
526 i = i.next_sibling
527
528 @property
529 def previous_elements(self):
530 i = self.previous_element
531 while i is not None:
532 yield i
533 i = i.previous_element
534
535 @property
536 def previous_siblings(self):
537 i = self.previous_sibling
538 while i is not None:
539 yield i
540 i = i.previous_sibling
541
542 @property
543 def parents(self):
544 i = self.parent
545 while i is not None:
546 yield i
547 i = i.parent
548
549 # Methods for supporting CSS selectors.
550
# A bare tag name: lowercase letters and digits only.
tag_name_re = re.compile('^[a-z0-9]+$')

# An attribute selector such as  a[href^="http"] :
#
#   tag        optional word in front of the bracket
#   attribute  the word straight after the opening bracket
#   operator   at most one of  = ~ | ^ $ *
#   value      everything up to the closing bracket, minus any
#              surrounding double quotes
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)'
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)
564
565 def _attr_value_as_string(self, value, default=None):
566 """Force an attribute value into a string representation.
567
568 A multi-valued attribute will be converted into a
569 space-separated stirng.
570 """
571 value = self.get(value, default)
572 if isinstance(value, list) or isinstance(value, tuple):
573 value =" ".join(value)
574 return value
575
576 def _tag_name_matches_and(self, function, tag_name):
577 if not tag_name:
578 return function
579 else:
580 def _match(tag):
581 return tag.name == tag_name and function(tag)
582 return _match
583
584 def _attribute_checker(self, operator, attribute, value=''):
585 """Create a function that performs a CSS selector operation.
586
587 Takes an operator, attribute and optional value. Returns a
588 function that will return True for elements that match that
589 combination.
590 """
591 if operator == '=':
592 # string representation of `attribute` is equal to `value`
593 return lambda el: el._attr_value_as_string(attribute) == value
594 elif operator == '~':
595 # space-separated list representation of `attribute`
596 # contains `value`
597 def _includes_value(element):
598 attribute_value = element.get(attribute, [])
599 if not isinstance(attribute_value, list):
600 attribute_value = attribute_value.split()
601 return value in attribute_value
602 return _includes_value
603 elif operator == '^':
604 # string representation of `attribute` starts with `value`
605 return lambda el: el._attr_value_as_string(
606 attribute, '').startswith(value)
607 elif operator == '$':
608 # string represenation of `attribute` ends with `value`
609 return lambda el: el._attr_value_as_string(
610 attribute, '').endswith(value)
611 elif operator == '*':
612 # string representation of `attribute` contains `value`
613 return lambda el: value in el._attr_value_as_string(attribute, '')
614 elif operator == '|':
615 # string representation of `attribute` is either exactly
616 # `value` or starts with `value` and then a dash.
617 def _is_or_starts_with_dash(element):
618 attribute_value = element._attr_value_as_string(attribute, '')
619 return (attribute_value == value or attribute_value.startswith(
620 value + '-'))
621 return _is_or_starts_with_dash
622 else:
623 return lambda el: el.has_attr(attribute)
624
# Old non-property versions of the generators, for backwards
# compatibility with BS3. Each one simply hands back the matching
# generator property.
def nextGenerator(self):
    """BS3 spelling of .next_elements."""
    return self.next_elements

def nextSiblingGenerator(self):
    """BS3 spelling of .next_siblings."""
    return self.next_siblings

def previousGenerator(self):
    """BS3 spelling of .previous_elements."""
    return self.previous_elements

def previousSiblingGenerator(self):
    """BS3 spelling of .previous_siblings."""
    return self.previous_siblings

def parentGenerator(self):
    """BS3 spelling of .parents."""
    return self.parents
641
642
class NavigableString(unicode, PageElement):
    """A Unicode string that is also an element of the parse tree."""

    # Text emitted before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __copy__(self):
        return self

    def __getnewargs__(self):
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        # A string has no tag name.
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
688
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """Run the formatter over the string, discard what it returns,
        and emit the original text between PREFIX and SUFFIX."""
        self.format_string(self, formatter)
        unchanged = self
        return self.PREFIX + unchanged + self.SUFFIX
701
class CData(PreformattedString):
    """A string that is written out wrapped as a CDATA section."""

    PREFIX = u'<![CDATA['
    SUFFIX = u']]>'
706
class ProcessingInstruction(PreformattedString):
    """A string that is written out between '<?' and '?>'."""

    PREFIX = u'<?'
    SUFFIX = u'?>'
711
class Comment(PreformattedString):
    """A string that is written out between '<!--' and '-->'."""

    PREFIX = u'<!--'
    SUFFIX = u'-->'
716
717
class Declaration(PreformattedString):
    """A string that is written out between '<!' and '!>'."""

    PREFIX = u'<!'
    SUFFIX = u'!>'
721
722
class Doctype(PreformattedString):
    """A string that is written out as a <!DOCTYPE ...> declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public and system
        identifiers.

        A public id is written as PUBLIC "..." and may be followed by
        the quoted system id; a system id on its own is written as
        SYSTEM "...".
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'
739
740
741class Tag(PageElement):
742
743 """Represents a found HTML tag with its attributes and contents."""
744
745 def __init__(self, parser=None, builder=None, name=None, namespace=None,
746 prefix=None, attrs=None, parent=None, previous=None):
747 "Basic constructor."
748
749 if parser is None:
750 self.parser_class = None
751 else:
752 # We don't actually store the parser object: that lets extracted
753 # chunks be garbage-collected.
754 self.parser_class = parser.__class__
755 if name is None:
756 raise ValueError("No value provided for new tag's name.")
757 self.name = name
758 self.namespace = namespace
759 self.prefix = prefix
760 if attrs is None:
761 attrs = {}
762 elif attrs and builder.cdata_list_attributes:
763 attrs = builder._replace_cdata_list_attribute_values(
764 self.name, attrs)
765 else:
766 attrs = dict(attrs)
767 self.attrs = attrs
768 self.contents = []
769 self.setup(parent, previous)
770 self.hidden = False
771
772 # Set up any substitutions, such as the charset in a META tag.
773 if builder is not None:
774 builder.set_up_substitutions(self)
775 self.can_be_empty_element = builder.can_be_empty_element(name)
776 else:
777 self.can_be_empty_element = False
778
779 parserClass = _alias("parser_class") # BS3
780
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the builder used to create the tag. If the
    builder has a designated list of empty-element tags, then only
    a tag whose name shows up in that list is considered an
    empty-element tag.

    If the builder has no designated list of empty-element tags,
    then any tag with no contents is an empty-element tag.
    """
    has_no_contents = len(self.contents) == 0
    return has_no_contents and self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
798
799 @property
800 def string(self):
801 """Convenience property to get the single string within this tag.
802
803 :Return: If this tag has a single string child, return value
804 is that string. If this tag has no children, or more than one
805 child, return value is None. If this tag has one child tag,
806 return value is the 'string' attribute of the child tag,
807 recursively.
808 """
809 if len(self.contents) != 1:
810 return None
811 child = self.contents[0]
812 if isinstance(child, NavigableString):
813 return child
814 return child.string
815
816 @string.setter
817 def string(self, string):
818 self.clear()
819 self.append(string.__class__(string))
820
821 def _all_strings(self, strip=False, types=(NavigableString, CData)):
822 """Yield all strings of certain classes, possibly stripping them.
823
824 By default, yields only NavigableString and CData objects. So
825 no comments, processing instructions, etc.
826 """
827 for descendant in self.descendants:
828 if (
829 (types is None and not isinstance(descendant, NavigableString))
830 or
831 (types is not None and type(descendant) not in types)):
832 continue
833 if strip:
834 descendant = descendant.strip()
835 if len(descendant) == 0:
836 continue
837 yield descendant
838
839 strings = property(_all_strings)
840
841 @property
842 def stripped_strings(self):
843 for string in self._all_strings(True):
844 yield string
845
846 def get_text(self, separator=u"", strip=False,
847 types=(NavigableString, CData)):
848 """
849 Get all child strings, concatenated using the given separator.
850 """
851 return separator.join([s for s in self._all_strings(
852 strip, types=types)])
853 getText = get_text
854 text = property(get_text)
855
856 def decompose(self):
857 """Recursively destroys the contents of this tree."""
858 self.extract()
859 i = self
860 while i is not None:
861 next = i.next_element
862 i.__dict__.clear()
863 i.contents = []
864 i = next
865
866 def clear(self, decompose=False):
867 """
868 Extract all children. If decompose is True, decompose instead.
869 """
870 if decompose:
871 for element in self.contents[:]:
872 if isinstance(element, Tag):
873 element.decompose()
874 else:
875 element.extract()
876 else:
877 for element in self.contents[:]:
878 element.extract()
879
880 def index(self, element):
881 """
882 Find the index of a child by identity, not value. Avoids issues with
883 tag.contents.index(element) getting the index of equal elements.
884 """
885 for i, child in enumerate(self.contents):
886 if child is element:
887 return i
888 raise ValueError("Tag.index: element not in tag")
889
890 def get(self, key, default=None):
891 """Returns the value of the 'key' attribute for the tag, or
892 the value given for 'default' if it doesn't have that
893 attribute."""
894 return self.attrs.get(key, default)
895
896 def has_attr(self, key):
897 return key in self.attrs
898
899 def __hash__(self):
900 return str(self).__hash__()
901
902 def __getitem__(self, key):
903 """tag[key] returns the value of the 'key' attribute for the tag,
904 and throws an exception if it's not there."""
905 return self.attrs[key]
906
907 def __iter__(self):
908 "Iterating over a tag iterates over its contents."
909 return iter(self.contents)
910
911 def __len__(self):
912 "The length of a tag is the length of its list of contents."
913 return len(self.contents)
914
915 def __contains__(self, x):
916 return x in self.contents
917
918 def __nonzero__(self):
919 "A tag is non-None even if it has no contents."
920 return True
921
922 def __setitem__(self, key, value):
923 """Setting tag[key] sets the value of the 'key' attribute for the
924 tag."""
925 self.attrs[key] = value
926
927 def __delitem__(self, key):
928 "Deleting tag[key] deletes all 'key' attributes for the tag."
929 self.attrs.pop(key, None)
930
931 def __call__(self, *args, **kwargs):
932 """Calling a tag like a function is the same as calling its
933 find_all() method. Eg. tag('a') returns a list of all the A tags
934 found within this tag."""
935 return self.find_all(*args, **kwargs)
936
937 def __getattr__(self, tag):
938 #print "Getattr %s.%s" % (self.__class__, tag)
939 if len(tag) > 3 and tag.endswith('Tag'):
940 # BS3: soup.aTag -> "soup.find("a")
941 tag_name = tag[:-3]
942 warnings.warn(
943 '.%sTag is deprecated, use .find("%s") instead.' % (
944 tag_name, tag_name))
945 return self.find(tag_name)
946 # We special case contents to avoid recursion.
947 elif not tag.startswith("__") and not tag=="contents":
948 return self.find(tag)
949 raise AttributeError(
950 "'%s' object has no attribute '%s'" % (self.__class__, tag))
951
952 def __eq__(self, other):
953 """Returns true iff this tag has the same name, the same attributes,
954 and the same contents (recursively) as the given tag."""
955 if self is other:
956 return True
957 if (not hasattr(other, 'name') or
958 not hasattr(other, 'attrs') or
959 not hasattr(other, 'contents') or
960 self.name != other.name or
961 self.attrs != other.attrs or
962 len(self) != len(other)):
963 return False
964 for i, my_child in enumerate(self.contents):
965 if my_child != other.contents[i]:
966 return False
967 return True
968
969 def __ne__(self, other):
970 """Returns true iff this tag is not identical to the other tag,
971 as defined in __eq__."""
972 return not self == other
973
974 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
975 """Renders this tag as a string."""
976 return self.encode(encoding)
977
978 def __unicode__(self):
979 return self.decode()
980
981 def __str__(self):
982 return self.encode()
983
984 if PY3K:
985 __str__ = __repr__ = __unicode__
986
987 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
988 indent_level=None, formatter="minimal",
989 errors="xmlcharrefreplace"):
990 # Turn the data structure into Unicode, then encode the
991 # Unicode.
992 u = self.decode(indent_level, encoding, formatter)
993 return u.encode(encoding, errors)
994
995 def _should_pretty_print(self, indent_level):
996 """Should this tag be pretty-printed?"""
997 return (
998 indent_level is not None and
999 (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1000 or self._is_xml))
1001
1002 def decode(self, indent_level=None,
1003 eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1004 formatter="minimal"):
1005 """Returns a Unicode representation of this tag and its contents.
1006
1007 :param eventual_encoding: The tag is destined to be
1008 encoded into this encoding. This method is _not_
1009 responsible for performing that encoding. This information
1010 is passed in so that it can be substituted in if the
1011 document contains a <META> tag that mentions the document's
1012 encoding.
1013 """
1014
1015 # First off, turn a string formatter into a function. This
1016 # will stop the lookup from happening over and over again.
1017 if not callable(formatter):
1018 formatter = self._formatter_for_name(formatter)
1019
1020 attrs = []
1021 if self.attrs:
1022 for key, val in sorted(self.attrs.items()):
1023 if val is None:
1024 decoded = key
1025 else:
1026 if isinstance(val, list) or isinstance(val, tuple):
1027 val = ' '.join(val)
1028 elif not isinstance(val, basestring):
1029 val = unicode(val)
1030 elif (
1031 isinstance(val, AttributeValueWithCharsetSubstitution)
1032 and eventual_encoding is not None):
1033 val = val.encode(eventual_encoding)
1034
1035 text = self.format_string(val, formatter)
1036 decoded = (
1037 unicode(key) + '='
1038 + EntitySubstitution.quoted_attribute_value(text))
1039 attrs.append(decoded)
1040 close = ''
1041 closeTag = ''
1042
1043 prefix = ''
1044 if self.prefix:
1045 prefix = self.prefix + ":"
1046
1047 if self.is_empty_element:
1048 close = '/'
1049 else:
1050 closeTag = '</%s%s>' % (prefix, self.name)
1051
1052 pretty_print = self._should_pretty_print(indent_level)
1053 space = ''
1054 indent_space = ''
1055 if indent_level is not None:
1056 indent_space = (' ' * (indent_level - 1))
1057 if pretty_print:
1058 space = indent_space
1059 indent_contents = indent_level + 1
1060 else:
1061 indent_contents = None
1062 contents = self.decode_contents(
1063 indent_contents, eventual_encoding, formatter)
1064
1065 if self.hidden:
1066 # This is the 'document root' object.
1067 s = contents
1068 else:
1069 s = []
1070 attribute_string = ''
1071 if attrs:
1072 attribute_string = ' ' + ' '.join(attrs)
1073 if indent_level is not None:
1074 # Even if this particular tag is not pretty-printed,
1075 # we should indent up to the start of the tag.
1076 s.append(indent_space)
1077 s.append('<%s%s%s%s>' % (
1078 prefix, self.name, attribute_string, close))
1079 if pretty_print:
1080 s.append("\n")
1081 s.append(contents)
1082 if pretty_print and contents and contents[-1] != "\n":
1083 s.append("\n")
1084 if pretty_print and closeTag:
1085 s.append(space)
1086 s.append(closeTag)
1087 if indent_level is not None and closeTag and self.next_sibling:
1088 # Even if this particular tag is not pretty-printed,
1089 # we're now done with the tag, and we should add a
1090 # newline if appropriate.
1091 s.append("\n")
1092 s = ''.join(s)
1093 return s
1094
1095 def prettify(self, encoding=None, formatter="minimal"):
1096 if encoding is None:
1097 return self.decode(True, formatter=formatter)
1098 else:
1099 return self.encode(encoding, True, formatter=formatter)
1100
1101 def decode_contents(self, indent_level=None,
1102 eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1103 formatter="minimal"):
1104 """Renders the contents of this tag as a Unicode string.
1105
1106 :param eventual_encoding: The tag is destined to be
1107 encoded into this encoding. This method is _not_
1108 responsible for performing that encoding. This information
1109 is passed in so that it can be substituted in if the
1110 document contains a <META> tag that mentions the document's
1111 encoding.
1112 """
1113 # First off, turn a string formatter into a function. This
1114 # will stop the lookup from happening over and over again.
1115 if not callable(formatter):
1116 formatter = self._formatter_for_name(formatter)
1117
1118 pretty_print = (indent_level is not None)
1119 s = []
1120 for c in self:
1121 text = None
1122 if isinstance(c, NavigableString):
1123 text = c.output_ready(formatter)
1124 elif isinstance(c, Tag):
1125 s.append(c.decode(indent_level, eventual_encoding,
1126 formatter))
1127 if text and indent_level and not self.name == 'pre':
1128 text = text.strip()
1129 if text:
1130 if pretty_print and not self.name == 'pre':
1131 s.append(" " * (indent_level - 1))
1132 s.append(text)
1133 if pretty_print and not self.name == 'pre':
1134 s.append("\n")
1135 return ''.join(s)
1136
1137 def encode_contents(
1138 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1139 formatter="minimal"):
1140 """Renders the contents of this tag as a bytestring."""
1141 contents = self.decode_contents(indent_level, encoding, formatter)
1142 return contents.encode(encoding)
1143
1144 # Old method for BS3 compatibility
1145 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1146 prettyPrint=False, indentLevel=0):
1147 if not prettyPrint:
1148 indentLevel = None
1149 return self.encode_contents(
1150 indent_level=indentLevel, encoding=encoding)
1151
1152 #Soup methods
1153
1154 def find(self, name=None, attrs={}, recursive=True, text=None,
1155 **kwargs):
1156 """Return only the first child of this Tag matching the given
1157 criteria."""
1158 r = None
1159 l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1160 if l:
1161 r = l[0]
1162 return r
1163 findChild = find
1164
1165 def find_all(self, name=None, attrs={}, recursive=True, text=None,
1166 limit=None, **kwargs):
1167 """Extracts a list of Tag objects that match the given
1168 criteria. You can specify the name of the Tag and any
1169 attributes you want the Tag to have.
1170
1171 The value of a key-value pair in the 'attrs' map can be a
1172 string, a list of strings, a regular expression object, or a
1173 callable that takes a string and returns whether or not the
1174 string matches for some custom definition of 'matches'. The
1175 same is true of the tag name."""
1176
1177 generator = self.descendants
1178 if not recursive:
1179 generator = self.children
1180 return self._find_all(name, attrs, text, limit, generator, **kwargs)
1181 findAll = find_all # BS3
1182 findChildren = find_all # BS2
1183
1184 #Generator methods
1185 @property
1186 def children(self):
1187 # return iter() to make the purpose of the method clear
1188 return iter(self.contents) # XXX This seems to be untested.
1189
1190 @property
1191 def descendants(self):
1192 if not len(self.contents):
1193 return
1194 stopNode = self._last_descendant().next_element
1195 current = self.contents[0]
1196 while current is not stopNode:
1197 yield current
1198 current = current.next_element
1199
1200 # CSS selector code
1201
1202 _selector_combinators = ['>', '+', '~']
1203 _select_debug = False
1204 def select(self, selector, _candidate_generator=None):
1205 """Perform a CSS selection operation on the current element."""
1206 tokens = selector.split()
1207 current_context = [self]
1208
1209 if tokens[-1] in self._selector_combinators:
1210 raise ValueError(
1211 'Final combinator "%s" is missing an argument.' % tokens[-1])
1212 if self._select_debug:
1213 print 'Running CSS selector "%s"' % selector
1214 for index, token in enumerate(tokens):
1215 if self._select_debug:
1216 print ' Considering token "%s"' % token
1217 recursive_candidate_generator = None
1218 tag_name = None
1219 if tokens[index-1] in self._selector_combinators:
1220 # This token was consumed by the previous combinator. Skip it.
1221 if self._select_debug:
1222 print ' Token was consumed by the previous combinator.'
1223 continue
1224 # Each operation corresponds to a checker function, a rule
1225 # for determining whether a candidate matches the
1226 # selector. Candidates are generated by the active
1227 # iterator.
1228 checker = None
1229
1230 m = self.attribselect_re.match(token)
1231 if m is not None:
1232 # Attribute selector
1233 tag_name, attribute, operator, value = m.groups()
1234 checker = self._attribute_checker(operator, attribute, value)
1235
1236 elif '#' in token:
1237 # ID selector
1238 tag_name, tag_id = token.split('#', 1)
1239 def id_matches(tag):
1240 return tag.get('id', None) == tag_id
1241 checker = id_matches
1242
1243 elif '.' in token:
1244 # Class selector
1245 tag_name, klass = token.split('.', 1)
1246 classes = set(klass.split('.'))
1247 def classes_match(candidate):
1248 return classes.issubset(candidate.get('class', []))
1249 checker = classes_match
1250
1251 elif ':' in token:
1252 # Pseudo-class
1253 tag_name, pseudo = token.split(':', 1)
1254 if tag_name == '':
1255 raise ValueError(
1256 "A pseudo-class must be prefixed with a tag name.")
1257 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1258 found = []
1259 if pseudo_attributes is not None:
1260 pseudo_type, pseudo_value = pseudo_attributes.groups()
1261 if pseudo_type == 'nth-of-type':
1262 try:
1263 pseudo_value = int(pseudo_value)
1264 except:
1265 raise NotImplementedError(
1266 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1267 if pseudo_value < 1:
1268 raise ValueError(
1269 'nth-of-type pseudo-class value must be at least 1.')
1270 class Counter(object):
1271 def __init__(self, destination):
1272 self.count = 0
1273 self.destination = destination
1274
1275 def nth_child_of_type(self, tag):
1276 self.count += 1
1277 if self.count == self.destination:
1278 return True
1279 if self.count > self.destination:
1280 # Stop the generator that's sending us
1281 # these things.
1282 raise StopIteration()
1283 return False
1284 checker = Counter(pseudo_value).nth_child_of_type
1285 else:
1286 raise NotImplementedError(
1287 'Only the following pseudo-classes are implemented: nth-of-type.')
1288
1289 elif token == '*':
1290 # Star selector -- matches everything
1291 pass
1292 elif token == '>':
1293 # Run the next token as a CSS selector against the
1294 # direct children of each tag in the current context.
1295 recursive_candidate_generator = lambda tag: tag.children
1296 elif token == '~':
1297 # Run the next token as a CSS selector against the
1298 # siblings of each tag in the current context.
1299 recursive_candidate_generator = lambda tag: tag.next_siblings
1300 elif token == '+':
1301 # For each tag in the current context, run the next
1302 # token as a CSS selector against the tag's next
1303 # sibling that's a tag.
1304 def next_tag_sibling(tag):
1305 yield tag.find_next_sibling(True)
1306 recursive_candidate_generator = next_tag_sibling
1307
1308 elif self.tag_name_re.match(token):
1309 # Just a tag name.
1310 tag_name = token
1311 else:
1312 raise ValueError(
1313 'Unsupported or invalid CSS selector: "%s"' % token)
1314
1315 if recursive_candidate_generator:
1316 # This happens when the selector looks like "> foo".
1317 #
1318 # The generator calls select() recursively on every
1319 # member of the current context, passing in a different
1320 # candidate generator and a different selector.
1321 #
1322 # In the case of "> foo", the candidate generator is
1323 # one that yields a tag's direct children (">"), and
1324 # the selector is "foo".
1325 next_token = tokens[index+1]
1326 def recursive_select(tag):
1327 if self._select_debug:
1328 print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
1329 print '-' * 40
1330 for i in tag.select(next_token, recursive_candidate_generator):
1331 if self._select_debug:
1332 print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
1333 yield i
1334 if self._select_debug:
1335 print '-' * 40
1336 _use_candidate_generator = recursive_select
1337 elif _candidate_generator is None:
1338 # By default, a tag's candidates are all of its
1339 # children. If tag_name is defined, only yield tags
1340 # with that name.
1341 if self._select_debug:
1342 if tag_name:
1343 check = "[any]"
1344 else:
1345 check = tag_name
1346 print ' Default candidate generator, tag name="%s"' % check
1347 if self._select_debug:
1348 # This is redundant with later code, but it stops
1349 # a bunch of bogus tags from cluttering up the
1350 # debug log.
1351 def default_candidate_generator(tag):
1352 for child in tag.descendants:
1353 if not isinstance(child, Tag):
1354 continue
1355 if tag_name and not child.name == tag_name:
1356 continue
1357 yield child
1358 _use_candidate_generator = default_candidate_generator
1359 else:
1360 _use_candidate_generator = lambda tag: tag.descendants
1361 else:
1362 _use_candidate_generator = _candidate_generator
1363
1364 new_context = []
1365 new_context_ids = set([])
1366 for tag in current_context:
1367 if self._select_debug:
1368 print " Running candidate generator on %s %s" % (
1369 tag.name, repr(tag.attrs))
1370 for candidate in _use_candidate_generator(tag):
1371 if not isinstance(candidate, Tag):
1372 continue
1373 if tag_name and candidate.name != tag_name:
1374 continue
1375 if checker is not None:
1376 try:
1377 result = checker(candidate)
1378 except StopIteration:
1379 # The checker has decided we should no longer
1380 # run the generator.
1381 break
1382 if checker is None or result:
1383 if self._select_debug:
1384 print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
1385 if id(candidate) not in new_context_ids:
1386 # If a tag matches a selector more than once,
1387 # don't include it in the context more than once.
1388 new_context.append(candidate)
1389 new_context_ids.add(id(candidate))
1390 elif self._select_debug:
1391 print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
1392
1393 current_context = new_context
1394
1395 if self._select_debug:
1396 print "Final verdict:"
1397 for i in current_context:
1398 print " %s %s" % (i.name, i.attrs)
1399 return current_context
1400
1401 # Old names for backwards compatibility
1402 def childGenerator(self):
1403 return self.children
1404
1405 def recursiveChildGenerator(self):
1406 return self.descendants
1407
1408 def has_key(self, key):
1409 """This was kind of misleading because has_key() (attributes)
1410 was different from __in__ (contents). has_key() is gone in
1411 Python 3, anyway."""
1412 warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
1413 key))
1414 return self.has_attr(key)
1415
1416# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store normalized search criteria.

        :param name: Criterion for the tag name.
        :param attrs: A dict of attribute criteria. Any non-dict value
            is treated as a criterion for the 'class' attribute.
        :param text: Criterion for text.
        :param kwargs: Extra attribute criteria. ``class_`` is an
            alias for the 'class' attribute.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dict is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in attrs.items():
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() can use.

        Unicode strings, callables, objects with a ``match`` attribute,
        booleans and None pass through unchanged. Bytestrings are
        decoded as UTF-8, iterables become lists of normalized items,
        and anything else is converted to a Unicode string.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, unicode)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on Python 2
        # and Python 3.
        return unicode(str(value))

    def __str__(self):
        """The text criterion if it is truthy, else "name|attrs"."""
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check a tag, or a tag name plus attributes, against this strainer.

        :param markup_name: A Tag, or a tag name.
        :param markup_attrs: The attributes that go with a tag name:
            an object with a ``get`` method or an iterable of
            (key, value) pairs. Replaced by the tag itself when
            `markup_name` is a Tag.
        :return: The Tag (or `markup_name`) on a match, otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion is called with the raw name and
        # attributes, but only when we were not handed a Tag object.
        call_function_with_tag_data = (
            callable(self.name)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Check arbitrary markup against this strainer.

        :param markup: A Tag, a string, or an iterable of elements.
        :return: The matching element, or None.
        :raises Exception: If `markup` is of a kind that cannot be
            matched.
        """
        # print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Return a truthy value if `markup` satisfies `match_against`.

        The result is not always a bool: a callable criterion's return
        value and a regular expression's match object are passed back
        as they are, and an unrecognised criterion yields None.
        """
        # print u"Matching %s against %s" % (markup, match_against)
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, unicode)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                return (whitespace_re.split(match_against) == markup)
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if callable(match_against):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, unicode):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # Any other kind of criterion falls through and yields None.
1605
class ResultSet(list):
    """A plain list of results that also remembers, in `source`, the
    SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        # Fill the list part first, then record where it came from.
        list.__init__(self, result)
        self.source = source
diff --git a/bitbake/lib/bs4/testing.py b/bitbake/lib/bs4/testing.py
new file mode 100644
index 0000000000..fd4495ac58
--- /dev/null
+++ b/bitbake/lib/bs4/testing.py
@@ -0,0 +1,592 @@
1"""Helper classes for tests."""
2
3import copy
4import functools
5import unittest
6from unittest import TestCase
7from bs4 import BeautifulSoup
8from bs4.element import (
9 CharsetMetaAttributeValue,
10 Comment,
11 ContentMetaAttributeValue,
12 Doctype,
13 SoupStrainer,
14)
15
16from bs4.builder import HTMLParserTreeBuilder
17default_builder = HTMLParserTreeBuilder
18
19
class SoupTest(unittest.TestCase):
    """Base TestCase with helpers for parsing markup with one builder."""

    @property
    def default_builder(self):
        """A new instance of the module-level ``default_builder`` class."""
        return default_builder()

    def soup(self, markup, **kwargs):
        """Parse `markup` and return the resulting BeautifulSoup object.

        A ``builder`` keyword argument replaces the default builder;
        every other keyword argument is handed to BeautifulSoup as-is.
        """
        fallback = self.default_builder
        chosen = kwargs.pop('builder', fallback)
        return BeautifulSoup(markup, builder=chosen, **kwargs)

    def document_for(self, markup):
        """Expand an HTML fragment into a whole document.

        The expansion is delegated to the builder, so the result
        depends on which builder is in use.
        """
        builder = self.default_builder
        return builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Parse `to_parse` and compare its decoded form to a document.

        The expected document is built from `compare_parsed_to`, or
        from `to_parse` itself when `compare_parsed_to` is None.
        """
        parsed = BeautifulSoup(to_parse, builder=self.default_builder)
        if compare_parsed_to is None:
            expected_fragment = to_parse
        else:
            expected_fragment = compare_parsed_to

        self.assertEqual(
            parsed.decode(), self.document_for(expected_fragment))
45
46
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.

    This class is a mixin: its methods call ``self.soup``,
    ``self.assertSoupEquals``, ``self.default_builder`` and the
    unittest assertion methods, all of which must be provided by the
    class it is combined with.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype.

        Returns a (doctype declaration string, parsed soup) pair.
        """
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        """An empty doctype declaration yields an empty first node."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        """A PUBLIC doctype that also carries a URL is handled."""
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        """A SYSTEM doctype is handled."""
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are stripped from both sides, so only the
        # non-newline content has to round-trip.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        """A formatting element nested in itself round-trips unchanged."""
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Each nesting depth is checked with its own markup.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        """Literal < and > in an attribute value come out as entities."""
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        """Numeric and named entities in attributes become Unicode."""
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        """Numeric and named entities in text become Unicode."""
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        """&quot; in text is output as a plain quotation mark."""
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        """Out-of-range numeric entities become U+FFFD."""
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        """A space-separated class attribute is exposed as a list."""
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        """Single-quoted attribute values are output double-quoted."""
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        """A single-quoted value containing double quotes round-trips."""
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        """A value holding both quote kinds has its double quotes escaped."""
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        """A bare ampersand in an attribute value is output as &amp;."""
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        """An already-escaped ampersand is not escaped a second time."""
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        """&nbsp; entities are parsed into NO-BREAK SPACE characters."""
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        """Encoding emits &lt;/&gt; as entities and the rest as UTF-8."""
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        """An attribute set on an attribute-less tag shows up in output."""
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
463
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests for a tree builder that parses XML.

    Mixin: the methods use ``self.soup``, ``self.assertSoupEquals`` and
    the unittest assertion methods supplied by the class this is
    combined with.
    """

    def test_docstring_generated(self):
        """Encoding a parsed document starts with an XML declaration."""
        encoded = self.soup("<root/>").encode()
        self.assertEqual(
            encoded, b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        document = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        tree = self.soup(document)
        self.assertEqual(tree.encode("utf-8"), document)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """Angle brackets inside <script> are escaped in XML output."""
        doc = """
 <script type="text/javascript">
 </script>
"""
        tree = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        tree.script.string = 'console.log("< < hey > > ");'
        output = tree.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in output)

    def test_can_parse_unicode_document(self):
        """A Unicode document parses despite its declared encoding."""
        document = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        tree = self.soup(document)
        self.assertEqual(u'Sacr\xe9 bleu!', tree.root.string)

    def test_popping_namespaced_tag(self):
        """Namespaced children followed by a plain tag round-trip."""
        document = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        tree = self.soup(document)
        self.assertEqual(unicode(tree.rss), document)

    def test_docstring_includes_correct_encoding(self):
        """The XML declaration names the encoding passed to encode()."""
        encoded = self.soup("<root/>").encode("latin1")
        self.assertEqual(
            encoded, b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        filler = b'0' * (2**12)
        document = b''.join(
            [b'<?xml version="1.0" encoding="utf-8"?>\n<root>',
             filler,
             b'</root>'])
        tree = self.soup(document)
        self.assertEqual(tree.encode("utf-8"), document)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """Only a tag with no contents is written in empty-element form."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """xmlns:* declarations stay readable as attributes of the root."""
        document = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        top = self.soup(document).root
        self.assertEqual("http://example.com/", top['xmlns:a'])
        self.assertEqual("http://example.net/", top['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A namespaced child tag round-trips inside its parent."""
        document = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        tree = self.soup(document)
        self.assertEqual(unicode(tree.p), document)

    def test_namespaced_attributes(self):
        """An attribute with a declared namespace prefix round-trips."""
        document = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        tree = self.soup(document)
        self.assertEqual(unicode(tree.foo), document)

    def test_namespaced_attributes_xml_namespace(self):
        """An attribute using the built-in xml: prefix round-trips."""
        document = '<foo xml:lang="fr">bar</foo>'
        tree = self.soup(document)
        self.assertEqual(unicode(tree.foo), document)
546
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way, so the inherited test
        # is replaced by a no-op.
        return None

    def test_html_tags_have_namespace(self):
        """A plain HTML tag is placed in the XHTML namespace."""
        tree = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", tree.a.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and its children are placed in the SVG namespace."""
        svg_ns = "http://www.w3.org/2000/svg"
        tree = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_ns, tree.svg.namespace)
        self.assertEqual(svg_ns, tree.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and its children are placed in the MathML namespace."""
        mathml_ns = 'http://www.w3.org/1998/Math/MathML'
        tree = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_ns, tree.math.namespace)
        self.assertEqual(mathml_ns, tree.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        """A leading XML declaration is parsed as a Comment node."""
        tree = self.soup(
            '<?xml version="1.0" encoding="utf-8"?><html></html>')
        first = tree.contents[0]
        self.assertTrue(isinstance(first, Comment))
        self.assertEqual(first, '?xml version="1.0" encoding="utf-8"?')
        # The comment is linked to the <html> tag that follows it.
        self.assertEqual("html", tree.contents[0].next_element.name)
581
def skipIf(condition, reason):
    """Return a decorator that disables its target when `condition` is true.

    With a true `condition` the decorated object is swapped for a stub
    that accepts one or more arguments and returns None; with a false
    `condition` the decorated object is returned untouched. `reason`
    is accepted but not used.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in for a disabled item: ignore everything, do nothing.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator
diff --git a/bitbake/lib/bs4/tests/__init__.py b/bitbake/lib/bs4/tests/__init__.py
new file mode 100644
index 0000000000..142c8cc3f1
--- /dev/null
+++ b/bitbake/lib/bs4/tests/__init__.py
@@ -0,0 +1 @@
"The beautifulsoup tests."
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py
new file mode 100644
index 0000000000..92ad10fb04
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_builder_registry.py
@@ -0,0 +1,141 @@
1"""Tests of the builder registry."""
2
3import unittest
4
5from bs4 import BeautifulSoup
6from bs4.builder import (
7 builder_registry as registry,
8 HTMLParserTreeBuilder,
9 TreeBuilderRegistry,
10)
11
12try:
13 from bs4.builder import HTML5TreeBuilder
14 HTML5LIB_PRESENT = True
15except ImportError:
16 HTML5LIB_PRESENT = False
17
18try:
19 from bs4.builder import (
20 LXMLTreeBuilderForXML,
21 LXMLTreeBuilder,
22 )
23 LXML_PRESENT = True
24except ImportError:
25 LXML_PRESENT = False
26
27
class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        """Feature combinations resolve to the expected builder class."""
        lookup = registry.lookup
        if LXML_PRESENT:
            self.assertEqual(lookup('fast', 'html'), LXMLTreeBuilder)
            self.assertEqual(lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)
        self.assertEqual(lookup('strict', 'html'), HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(lookup('html5lib', 'html'), HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        """Lookup by markup type alone depends on the installed parsers."""
        lookup = registry.lookup
        if LXML_PRESENT:
            self.assertEqual(lookup('html'), LXMLTreeBuilder)
            self.assertEqual(lookup('xml'), LXMLTreeBuilderForXML)
            return

        # Without lxml there is no XML builder, and HTML falls back to
        # html5lib or, failing that, the standard-library parser.
        self.assertEqual(lookup('xml'), None)
        if HTML5LIB_PRESENT:
            self.assertEqual(lookup('html'), HTML5TreeBuilder)
        else:
            self.assertEqual(lookup('html'), HTMLParserTreeBuilder)

    def test_named_library(self):
        """A builder can be looked up by the name of its parser library."""
        lookup = registry.lookup
        if LXML_PRESENT:
            self.assertEqual(lookup('lxml', 'xml'), LXMLTreeBuilderForXML)
            self.assertEqual(lookup('lxml', 'html'), LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(lookup('html5lib'), HTML5TreeBuilder)

        self.assertEqual(lookup('html.parser'), HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        """The BeautifulSoup constructor resolves `features` itself."""
        # You can pass in a string.
        BeautifulSoup("", features="html")
        # Or a list of strings.
        BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(
            ValueError, BeautifulSoup, "", features="no-such-feature")
79
80class RegistryTest(unittest.TestCase):
81 """Test the TreeBuilderRegistry class in general."""
82
83 def setUp(self):
84 self.registry = TreeBuilderRegistry()
85
86 def builder_for_features(self, *feature_list):
87 cls = type('Builder_' + '_'.join(feature_list),
88 (object,), {'features' : feature_list})
89
90 self.registry.register(cls)
91 return cls
92
93 def test_register_with_no_features(self):
94 builder = self.builder_for_features()
95
96 # Since the builder advertises no features, you can't find it
97 # by looking up features.
98 self.assertEqual(self.registry.lookup('foo'), None)
99
100 # But you can find it by doing a lookup with no features, if
101 # this happens to be the only registered builder.
102 self.assertEqual(self.registry.lookup(), builder)
103
104 def test_register_with_features_makes_lookup_succeed(self):
105 builder = self.builder_for_features('foo', 'bar')
106 self.assertEqual(self.registry.lookup('foo'), builder)
107 self.assertEqual(self.registry.lookup('bar'), builder)
108
109 def test_lookup_fails_when_no_builder_implements_feature(self):
110 builder = self.builder_for_features('foo', 'bar')
111 self.assertEqual(self.registry.lookup('baz'), None)
112
113 def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
114 builder1 = self.builder_for_features('foo')
115 builder2 = self.builder_for_features('bar')
116 self.assertEqual(self.registry.lookup(), builder2)
117
118 def test_lookup_fails_when_no_tree_builders_registered(self):
119 self.assertEqual(self.registry.lookup(), None)
120
121 def test_lookup_gets_most_recent_builder_supporting_all_features(self):
122 has_one = self.builder_for_features('foo')
123 has_the_other = self.builder_for_features('bar')
124 has_both_early = self.builder_for_features('foo', 'bar', 'baz')
125 has_both_late = self.builder_for_features('foo', 'bar', 'quux')
126 lacks_one = self.builder_for_features('bar')
127 has_the_other = self.builder_for_features('foo')
128
129 # There are two builders featuring 'foo' and 'bar', but
130 # the one that also features 'quux' was registered later.
131 self.assertEqual(self.registry.lookup('foo', 'bar'),
132 has_both_late)
133
134 # There is only one builder featuring 'foo', 'bar', and 'baz'.
135 self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
136 has_both_early)
137
138 def test_lookup_fails_when_cannot_reconcile_requested_features(self):
139 builder1 = self.builder_for_features('foo', 'bar')
140 builder2 = self.builder_for_features('foo', 'baz')
141 self.assertEqual(self.registry.lookup('bar', 'baz'), None)
diff --git a/bitbake/lib/bs4/tests/test_docs.py b/bitbake/lib/bs4/tests/test_docs.py
new file mode 100644
index 0000000000..5b9f677093
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_docs.py
@@ -0,0 +1,36 @@
1"Test harness for doctests."
2
3# pylint: disable-msg=E0611,W0142
4
5__metaclass__ = type
6__all__ = [
7 'additional_tests',
8 ]
9
10import atexit
11import doctest
12import os
13#from pkg_resources import (
14# resource_filename, resource_exists, resource_listdir, cleanup_resources)
15import unittest
16
17DOCTEST_FLAGS = (
18 doctest.ELLIPSIS |
19 doctest.NORMALIZE_WHITESPACE |
20 doctest.REPORT_NDIFF)
21
22
23# def additional_tests():
24# "Run the doc tests (README.txt and docs/*, if any exist)"
25# doctest_files = [
26# os.path.abspath(resource_filename('bs4', 'README.txt'))]
27# if resource_exists('bs4', 'docs'):
28# for name in resource_listdir('bs4', 'docs'):
29# if name.endswith('.txt'):
30# doctest_files.append(
31# os.path.abspath(
32# resource_filename('bs4', 'docs/%s' % name)))
33# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
34# atexit.register(cleanup_resources)
35# return unittest.TestSuite((
36# doctest.DocFileSuite(*doctest_files, **kwargs)))
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py
new file mode 100644
index 0000000000..594c3e1f26
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_html5lib.py
@@ -0,0 +1,85 @@
1"""Tests to ensure that the html5lib tree builder generates good trees."""
2
3import warnings
4
5try:
6 from bs4.builder import HTML5TreeBuilder
7 HTML5LIB_PRESENT = True
8except ImportError, e:
9 HTML5LIB_PRESENT = False
10from bs4.element import SoupStrainer
11from bs4.testing import (
12 HTML5TreeBuilderSmokeTest,
13 SoupTest,
14 skipIf,
15)
16
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """A new HTML5TreeBuilder instance."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.
        only_b = SoupStrainer("b")
        fragment = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as caught:
            tree = self.soup(fragment, parse_only=only_b)
        # The strainer is ignored, so the whole fragment is parsed.
        self.assertEqual(tree.decode(), self.document_for(fragment))

        first_warning = str(caught[0].message)
        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            first_warning)

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        unclosed = ('<table id="1">'
                    '<tr>'
                    "<td>Here's another table:"
                    '<table id="2">'
                    '<tr><td>foo</td></tr>'
                    '</table></td>')
        with_tbody = (
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')
        self.assertSoupEquals(unclosed, with_tbody)

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        """An XML declaration before the doctype keeps the tree connected."""
        document = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
 <head>
 </head>
 <body>
 <p>foo</p>
 </body>
</html>'''
        tree = self.soup(document)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", tree.p.encode())

    def test_reparented_markup(self):
        """Mis-nested <em> markup is reparented, leaving two <p> tags."""
        tree = self.soup('<p><em>foo</p>\n<p>bar<a></a></em></p>')
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", tree.body.decode())
        paragraphs = tree.find_all('p')
        self.assertEqual(2, len(paragraphs))

    def test_reparented_markup_ends_with_whitespace(self):
        """Same as above, with a trailing newline kept inside <body>."""
        tree = self.soup('<p><em>foo</p>\n<p>bar<a></a></em></p>\n')
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", tree.body.decode())
        paragraphs = tree.find_all('p')
        self.assertEqual(2, len(paragraphs))
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py
new file mode 100644
index 0000000000..bcb5ed232f
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_htmlparser.py
@@ -0,0 +1,19 @@
1"""Tests to ensure that the html.parser tree builder generates good
2trees."""
3
4from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5from bs4.builder import HTMLParserTreeBuilder
6
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the HTMLTreeBuilderSmokeTest suite against HTMLParserTreeBuilder."""

    @property
    def default_builder(self):
        """Return a new HTMLParserTreeBuilder on every access."""
        builder = HTMLParserTreeBuilder()
        return builder

    def test_namespaced_system_doctype(self):
        """No-op override: html.parser can't handle namespaced doctypes."""
        return None

    def test_namespaced_public_doctype(self):
        """No-op override: html.parser can't handle namespaced doctypes."""
        return None
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py
new file mode 100644
index 0000000000..2b2e9b7e78
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_lxml.py
@@ -0,0 +1,91 @@
1"""Tests to ensure that the lxml tree builder generates good trees."""
2
3import re
4import warnings
5
# lxml is an optional dependency. Record whether it can be imported,
# and which version it is, so the lxml-specific tests can be skipped.
try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
    # The exception object was never used, and the old
    # "except ImportError, e" spelling only parses on Python 2.
    LXML_PRESENT = False
    # Placeholder that compares lower than any real version tuple.
    LXML_VERSION = (0,)

if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16
17from bs4 import (
18 BeautifulSoup,
19 BeautifulStoneSoup,
20 )
21from bs4.element import Comment, Doctype, SoupStrainer
22from bs4.testing import skipIf
23from bs4.tests import test_htmlparser
24from bs4.testing import (
25 HTMLTreeBuilderSmokeTest,
26 XMLTreeBuilderSmokeTest,
27 SoupTest,
28 skipIf,
29)
30
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the HTMLTreeBuilderSmokeTest suite against LXMLTreeBuilder."""

    @property
    def default_builder(self):
        """Return a new LXMLTreeBuilder on every access."""
        builder = LXMLTreeBuilder()
        return builder

    def test_out_of_range_entity(self):
        """Numeric character references that are too large are dropped."""
        out_of_range_references = (
            "&#10000000000000;",
            "&#x10000000000000;",
            "&#1000000000;",
        )
        for reference in out_of_range_references:
            self.assertSoupEquals(
                "<p>foo" + reference + "bar</p>", "<p>foobar</p>")

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        """An empty <!DOCTYPE> becomes a doctype node with blank content."""
        parsed = self.soup("<!DOCTYPE>")
        first_node = parsed.contents[0]
        self.assertEqual("", first_node.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        """The deprecated BeautifulStoneSoup class uses an XML builder."""
        # Record warnings so the deprecation message can be inspected.
        with warnings.catch_warnings(record=True) as caught:
            stone_soup = BeautifulStoneSoup("<b />")
        # "<b/>" (self-closing) is the XML-style serialization.
        self.assertEqual(u"<b/>", unicode(stone_soup.b))
        first_warning = str(caught[0].message)
        self.assertTrue(
            "BeautifulStoneSoup class is deprecated" in first_warning)

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        parsed = self.soup(markup)
        # Compare with newlines removed; the expected value is the input
        # minus its XML declaration.
        actual = parsed.encode("utf-8").replace(b"\n", b'')
        expected = markup.replace(b'\n', b'').replace(
            b'<?xml version="1.0" encoding="utf-8"?>', b'')
        self.assertEqual(actual, expected)
81
82
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``.

    Runs that suite with LXMLTreeBuilderForXML as the default builder.
    """

    @property
    def default_builder(self):
        # A new builder instance is created on every access.
        return LXMLTreeBuilderForXML()
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py
new file mode 100644
index 0000000000..47ac245f99
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_soup.py
@@ -0,0 +1,434 @@
1# -*- coding: utf-8 -*-
2"""Tests of Beautiful Soup as a whole."""
3
4import logging
5import unittest
6import sys
7import tempfile
8
9from bs4 import (
10 BeautifulSoup,
11 BeautifulStoneSoup,
12)
13from bs4.element import (
14 CharsetMetaAttributeValue,
15 ContentMetaAttributeValue,
16 SoupStrainer,
17 NamespacedAttribute,
18 )
19import bs4.dammit
20from bs4.dammit import (
21 EntitySubstitution,
22 UnicodeDammit,
23)
24from bs4.testing import (
25 SoupTest,
26 skipIf,
27)
28import warnings
29
# The lxml builders are only importable from bs4.builder when lxml is
# installed; use that to decide whether lxml-dependent tests can run.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    # The exception object was never used, and the old
    # "except ImportError, e" spelling only parses on Python 2.
    LXML_PRESENT = False

# Interpreter-version flags used by skipIf decorators in this module.
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
38
class TestConstructor(SoupTest):
    """Constructor checks on short and unusual Unicode input."""

    def test_short_unicode_input(self):
        """A very short non-ASCII Unicode document keeps its text."""
        soup = self.soup(u"<h1>éé</h1>")
        heading_text = soup.h1.string
        self.assertEqual(u"éé", heading_text)

    def test_embedded_null(self):
        """A NUL character inside the text is preserved."""
        soup = self.soup(u"<h1>foo\0bar</h1>")
        heading_text = soup.h1.string
        self.assertEqual(u"foo\0bar", heading_text)
50
51
class TestDeprecatedConstructorArguments(SoupTest):
    """Old camelCase constructor keywords still work, with a warning."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        """parseOnlyThese warns, naming both spellings, and still strains."""
        with warnings.catch_warnings(record=True) as caught:
            strainer = SoupStrainer("b")
            soup = self.soup("<a><b></b></a>", parseOnlyThese=strainer)
        warning_text = str(caught[0].message)
        for keyword in ("parseOnlyThese", "parse_only"):
            self.assertTrue(keyword in warning_text)
        # Only the <b> tag survives the strainer.
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        """fromEncoding warns, naming both spellings, and is honored."""
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup(b"\xc3\xa9", fromEncoding="utf8")
        warning_text = str(caught[0].message)
        for keyword in ("fromEncoding", "from_encoding"):
            self.assertTrue(keyword in warning_text)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        """An unknown keyword argument raises TypeError."""
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
74
class TestWarnings(SoupTest):
    """Warnings issued when the markup looks like a filename or a URL."""

    def test_disk_file_warning(self):
        """The name of an existing file triggers a warning."""
        temp_file = tempfile.NamedTemporaryFile()
        path = temp_file.name
        try:
            with warnings.catch_warnings(record=True) as caught:
                self.soup(path)
            warning_text = str(caught[0].message)
            self.assertTrue("looks like a filename" in warning_text)
        finally:
            temp_file.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as caught:
            self.soup(path)
        self.assertEqual(0, len(caught))

    def test_url_warning(self):
        """A bare URL triggers a warning; a URL inside a sentence doesn't."""
        with warnings.catch_warnings(record=True) as caught:
            self.soup("http://www.crummy.com/")
        warning_text = str(caught[0].message)
        self.assertTrue("looks like a URL" in warning_text)

        with warnings.catch_warnings(record=True) as caught:
            self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(caught))
102
class TestSelectiveParsing(SoupTest):
    """Parsing restricted by a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        """Only <b> tags, with everything inside them, end up in the soup."""
        only_b_tags = SoupStrainer("b")
        soup = self.soup(
            "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>", parse_only=only_b_tags)
        encoded = soup.encode()
        self.assertEqual(encoded, b"<b>Yes</b><b>Yes <c>Yes</c></b>")
110
111
class TestEntitySubstitution(unittest.TestCase):
    """Exercise EntitySubstitution directly, without building a soup."""

    def setUp(self):
        # The class object itself is stored and called through, not an
        # instance.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        original = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        substituted = self.sub.substitute_html(original)
        self.assertEqual(substituted, u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so they
        # get a test of their own.
        smart_quoted = UnicodeDammit(b"\x91\x92foo\x93\x94")
        substituted = self.sub.substitute_html(smart_quoted.markup)
        self.assertEqual(substituted, "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        text = 'Welcome to "my bar"'
        unquoted = self.sub.substitute_xml(text, False)
        self.assertEqual(unquoted, text)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        # A lone apostrophe doesn't change the choice of quote character.
        expectations = (
            ("Welcome", '"Welcome"'),
            ("Bob's Bar", '"Bob\'s Bar"'),
        )
        for raw, quoted in expectations:
            self.assertEqual(self.sub.substitute_xml(raw, True), quoted)

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        text = 'Welcome to "my bar"'
        quoted = self.sub.substitute_xml(text, True)
        self.assertEqual(quoted, "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both kinds present, the expected output escapes the
        # double quotes as &quot; and wraps the value in double quotes.
        text = 'Welcome to "Bob\'s Bar"'
        quoted = self.sub.substitute_xml(text, True)
        self.assertEqual(quoted, '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        text = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(text), text)

    def test_xml_quoting_handles_angle_brackets(self):
        escaped = self.sub.substitute_xml("foo<bar>")
        self.assertEqual(escaped, "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        escaped = self.sub.substitute_xml("AT&T")
        self.assertEqual(escaped, "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that starts
        # an entity reference.
        escaped = self.sub.substitute_xml("&Aacute;T&T")
        self.assertEqual(escaped, "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves existing entity
        # references alone and escapes only the bare ampersand.
        escaped = self.sub.substitute_xml_containing_entities("&Aacute;T&T")
        self.assertEqual(escaped, "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        unchanged = self.sub.substitute_html(text)
        self.assertEqual(unchanged, text)
179
180
class TestEncodingConversion(SoupTest):
    """Decoding from, and encoding to, various encodings."""

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Spell out the UTF-8 bytes, so the fixture documents itself.
        expected_bytes = b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        self.assertEqual(self.utf8_data, expected_bytes)

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        saved_detector = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def no_detection(_markup):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = no_detection
            ascii_markup = b"<foo>a</foo>"
            soup = self.soup(ascii_markup)
            decoded = soup.decode()
            self.assertTrue(isinstance(decoded, unicode))
            self.assertEqual(decoded, self.document_for(ascii_markup.decode()))
            self.assertEqual(soup.original_encoding.lower(), "utf-8")
        finally:
            # Undo both global changes, whatever the outcome.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_detector

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup = self.soup(self.unicode_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup = self.soup(self.utf8_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup = self.soup(self.unicode_data)
        encoded = soup.encode('utf-8')
        self.assertEqual(encoded, self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        """A non-ASCII attribute name survives parsing and re-encoding."""
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        encoded_div = self.soup(markup).div.encode("utf8")
        self.assertEqual(encoded_div, markup.encode("utf8"))
240
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        """Markup that is already Unicode is returned unchanged."""
        markup = u"I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        """By default smart-quote bytes become the Unicode quote characters."""
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        """smart_quotes_to="xml" yields numeric character references."""
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        """smart_quotes_to="html" yields named HTML entities."""
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        """smart_quotes_to="ascii" yields plain ASCII quote characters."""
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        """UTF-8 bytes are decoded and reported as UTF-8."""
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_convert_hebrew(self):
        """A caller-suggested encoding that fits the data is used."""
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        """Valid UTF-8 is left intact and reported as UTF-8."""
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        """A suggested encoding that doesn't fit the data is passed over."""
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        """Suggested encoding names that are not real codecs are ignored."""
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        """<meta charset=...> is honored whichever way it is quoted."""
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore logging and chardet whatever the outcome.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        """detwingle() turns mixed UTF-8/Windows-1252 bytes into pure UTF-8."""
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into mojibake: each three-byte UTF-8 snowman
        # comes out as three Windows-1252 characters.

        # But if we run it through detwingle, it's fixed. The expected
        # text is written with \N escapes so that it cannot be damaged
        # by a source-encoding mix-up.
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"\N{SNOWMAN}\N{SNOWMAN}\N{SNOWMAN}"
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}"
            u"\N{SNOWMAN}\N{SNOWMAN}\N{SNOWMAN}",
            fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            # NOTE(review): in a Unicode literal this is four separate
            # code points (U+00F0, U+0090, U+0090, U+0093), not one
            # 4-byte character; its UTF-8 form still ends in \x93.
            u"\xf0\x90\x90\x93",
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)
393
class TestNamedspacedAttribute(SoupTest):
    """String value and equality of NamespacedAttribute objects."""

    def test_name_may_be_none(self):
        """With no name, the attribute equals its prefix alone."""
        prefix_only = NamespacedAttribute("xmlns", None)
        self.assertEqual(prefix_only, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        """Prefix and name are joined with a colon."""
        attribute = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attribute)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        """Equality depends on prefix and name, not on the namespace."""
        reference = NamespacedAttribute("a", "b", "c")
        identical = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, identical)

        # The actual namespace is not considered.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, no_namespace)

        # But name and prefix are important.
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(reference, other_name)

        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(reference, other_prefix)
419
420
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Attribute values whose charset is replaced when they are encoded."""

    # This test used to be named test_content_meta_attribute_value, the
    # same as the one below, so the later definition replaced it and it
    # never ran.
    def test_charset_meta_attribute_value(self):
        """CharsetMetaAttributeValue encodes to the target encoding's name."""
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        """ContentMetaAttributeValue swaps only the charset when encoded."""
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py
new file mode 100644
index 0000000000..f8515c0ea1
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_tree.py
@@ -0,0 +1,1829 @@
1# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12import copy
13import pickle
14import re
15import warnings
16from bs4 import BeautifulSoup
17from bs4.builder import (
18 builder_registry,
19 HTMLParserTreeBuilder,
20)
21from bs4.element import (
22 CData,
23 Comment,
24 Doctype,
25 NavigableString,
26 SoupStrainer,
27 Tag,
28)
29from bs4.testing import (
30 SoupTest,
31 skipIf,
32)
33
# Flags recording whether the builder registry can supply a tree builder
# for the "xml" and "lxml" features; lookup() returning None means it can't.
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
36
class TreeTest(SoupTest):
    """SoupTest plus assertions about which tags a search selected."""

    def assertSelects(self, tags, should_match):
        """Assert that the tags' strings, in order, equal should_match.

        Meant for tests that build several tags, each holding a single
        string, and then select some of them.
        """
        selected_strings = [element.string for element in tags]
        self.assertEqual(selected_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that the tags' 'id' attributes, in order, equal should_match.

        Meant for tests that build several tags, each with an id, and
        then select some of them.
        """
        selected_ids = [element['id'] for element in tags]
        self.assertEqual(selected_ids, should_match)
56
57
class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        """find() returns the first tag with the given name."""
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = soup.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        """find(text=...) matches non-ASCII text."""
        soup = self.soup(u'<h1>Räksmörgås</h1>')
        found_text = soup.find(text=u'Räksmörgås')
        self.assertEqual(found_text, u'Räksmörgås')

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        every_tag = soup.find_all()
        self.assertEqual(2, len(every_tag))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        every_a = soup.find_all('a')
        self.assertEqual(2, len(every_a))
82
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = [u"Foo", u"bar", u'\xbb']
        # Exact match.
        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
        # Match any of a number of strings.
        self.assertEqual(
            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
        # Match a regular expression.
        self.assertEqual(soup.find_all(text=re.compile('.*')), every_string)
        # Match anything.
        self.assertEqual(soup.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        all_five = ["1", "2", "3", "4", "5"]
        self.assertSelects(soup.find_all('a', limit=3), all_five[:3])
        self.assertSelects(soup.find_all('a', limit=1), all_five[:1])
        self.assertSelects(soup.find_all('a', limit=10), all_five)

        # A limit of 0 means no limit.
        self.assertSelects(soup.find_all('a', limit=0), all_five)

    def test_calling_a_tag_is_calling_findall(self):
        """Calling a soup or a tag is the same as calling find_all on it."""
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(soup('a', limit=1), ["1"])
        b_tag = soup.b
        self.assertSelects(b_tag(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        soup = self.soup("<a></a>")
        # Create a self-referential list.
        cyclic = []
        cyclic.append(cyclic)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(cyclic))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")
        # A "source" attribute is what marks the result as a ResultSet here.
        by_name = soup.find_all("a")
        self.assertTrue(hasattr(by_name, "source"))

        everything = soup.find_all(True)
        self.assertTrue(hasattr(everything, "source"))

        by_text = soup.find_all(text="foo")
        self.assertTrue(hasattr(by_text, "source"))
139
140
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by prefixed tag names and prefixed attribute names."""

    def test_find_by_namespaced_name(self):
        """A "prefix:name" string works as a tag name and as an attribute key."""
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
        square_root = soup.find("mathml:msqrt")
        self.assertEqual("4", square_root.string)
        red_filled = soup.find(attrs= { "svg:fill" : "red" })
        self.assertEqual("a", red_filled.name)
147
148
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Name this class, not its parent, in super(): the old
        # super(TreeTest, self) started the MRO lookup after TreeTest
        # and would skip any setUp() defined on TreeTest.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        """A tag name can be combined with a text= filter."""
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        """Calling the tree object is the same as calling find_all on it."""
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        """A SoupStrainer can stand in for the tag name."""
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        """A list of names matches tags with any of those names."""
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        """A dict of names matches tags named by any of its keys."""
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        """A compiled regular expression is matched against tag names."""
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
214
215
216class TestFindAllByAttribute(TreeTest):
217
218 def test_find_all_by_attribute_name(self):
219 # You can pass in keyword arguments to find_all to search by
220 # attribute.
221 tree = self.soup("""
222 <a id="first">Matching a.</a>
223 <a id="second">
224 Non-matching <b id="first">Matching b.</b>a.
225 </a>""")
226 self.assertSelects(tree.find_all(id='first'),
227 ["Matching a.", "Matching b."])
228
229 def test_find_all_by_utf8_attribute_value(self):
230 peace = u"×ולש".encode("utf8")
231 data = u'<a title="×ולש"></a>'.encode("utf8")
232 soup = self.soup(data)
233 self.assertEqual([soup.a], soup.find_all(title=peace))
234 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
235 self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))
236
237 def test_find_all_by_attribute_dict(self):
238 # You can pass in a dictionary as the argument 'attrs'. This
239 # lets you search for attributes like 'name' (a fixed argument
240 # to find_all) and 'class' (a reserved word in Python.)
241 tree = self.soup("""
242 <a name="name1" class="class1">Name match.</a>
243 <a name="name2" class="class2">Class match.</a>
244 <a name="name3" class="class3">Non-match.</a>
245 <name1>A tag called 'name1'.</name1>
246 """)
247
248 # This doesn't do what you want.
249 self.assertSelects(tree.find_all(name='name1'),
250 ["A tag called 'name1'."])
251 # This does what you want.
252 self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
253 ["Name match."])
254
255 self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
256 ["Class match."])
257
258 def test_find_all_by_class(self):
259 tree = self.soup("""
260 <a class="1">Class 1.</a>
261 <a class="2">Class 2.</a>
262 <b class="1">Class 1.</b>
263 <c class="3 4">Class 3 and 4.</c>
264 """)
265
266 # Passing in the class_ keyword argument will search against
267 # the 'class' attribute.
268 self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
269 self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
270 self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
271
272 # Passing in a string to 'attrs' will also search the CSS class.
273 self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
274 self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
275 self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
276 self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
277
278 def test_find_by_class_when_multiple_classes_present(self):
279 tree = self.soup("<gar class='foo bar'>Found it</gar>")
280
281 f = tree.find_all("gar", class_=re.compile("o"))
282 self.assertSelects(f, ["Found it"])
283
284 f = tree.find_all("gar", class_=re.compile("a"))
285 self.assertSelects(f, ["Found it"])
286
287 # Since the class is not the string "foo bar", but the two
288 # strings "foo" and "bar", this will not find anything.
289 f = tree.find_all("gar", class_=re.compile("o b"))
290 self.assertSelects(f, [])
291
292 def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
293 soup = self.soup("<a class='bar'>Found it</a>")
294
295 self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
296
297 def big_attribute_value(value):
298 return len(value) > 3
299
300 self.assertSelects(soup.find_all("a", big_attribute_value), [])
301
302 def small_attribute_value(value):
303 return len(value) <= 3
304
305 self.assertSelects(
306 soup.find_all("a", small_attribute_value), ["Found it"])
307
308 def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
309 soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
310 a, a2 = soup.find_all("a")
311 self.assertEqual([a, a2], soup.find_all("a", "foo"))
312 self.assertEqual([a], soup.find_all("a", "bar"))
313
314 # If you specify the class as a string that contains a
315 # space, only that specific value will be found.
316 self.assertEqual([a], soup.find_all("a", class_="foo bar"))
317 self.assertEqual([a], soup.find_all("a", "foo bar"))
318 self.assertEqual([], soup.find_all("a", "bar foo"))
319
320 def test_find_all_by_attribute_soupstrainer(self):
321 tree = self.soup("""
322 <a id="first">Match.</a>
323 <a id="second">Non-match.</a>""")
324
325 strainer = SoupStrainer(attrs={'id' : 'first'})
326 self.assertSelects(tree.find_all(strainer), ['Match.'])
327
328 def test_find_all_with_missing_atribute(self):
329 # You can pass in None as the value of an attribute to find_all.
330 # This will match tags that do not have that attribute set.
331 tree = self.soup("""<a id="1">ID present.</a>
332 <a>No ID present.</a>
333 <a id="">ID is empty.</a>""")
334 self.assertSelects(tree.find_all('a', id=None), ["No ID present."])
335
    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])
344
345 def test_find_all_with_numeric_attribute(self):
346 # If you search for a number, it's treated as a string.
347 tree = self.soup("""<a id=1>Unquoted attribute.</a>
348 <a id="1">Quoted attribute.</a>""")
349
350 expected = ["Unquoted attribute.", "Quoted attribute."]
351 self.assertSelects(tree.find_all(id=1), expected)
352 self.assertSelects(tree.find_all(id="1"), expected)
353
354 def test_find_all_with_list_attribute_values(self):
355 # You can pass a list of attribute values instead of just one,
356 # and you'll get tags that match any of the values.
357 tree = self.soup("""<a id="1">1</a>
358 <a id="2">2</a>
359 <a id="3">3</a>
360 <a>No ID.</a>""")
361 self.assertSelects(tree.find_all(id=["1", "3", "4"]),
362 ["1", "3"])
363
364 def test_find_all_with_regular_expression_attribute_value(self):
365 # You can pass a regular expression as an attribute value, and
366 # you'll get tags whose values for that attribute match the
367 # regular expression.
368 tree = self.soup("""<a id="a">One a.</a>
369 <a id="aa">Two as.</a>
370 <a id="ab">Mixed as and bs.</a>
371 <a id="b">One b.</a>
372 <a>No ID.</a>""")
373
374 self.assertSelects(tree.find_all(id=re.compile("^a+$")),
375 ["One a.", "Two as."])
376
377 def test_find_by_name_and_containing_string(self):
378 soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
379 a = soup.a
380
381 self.assertEqual([a], soup.find_all("a", text="foo"))
382 self.assertEqual([], soup.find_all("a", text="bar"))
383 self.assertEqual([], soup.find_all("a", text="bar"))
384
385 def test_find_by_name_and_containing_string_when_string_is_buried(self):
386 soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
387 self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))
388
389 def test_find_by_attribute_and_containing_string(self):
390 soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
391 a = soup.a
392
393 self.assertEqual([a], soup.find_all(id=2, text="foo"))
394 self.assertEqual([], soup.find_all(id=1, text="bar"))
395
396
397
398
class TestIndex(TreeTest):
    """Test Tag.index"""
    def test_index(self):
        # The <div> holds several children that look identical to one
        # another; index() must still report each child's own position.
        markup = """<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                        </div>"""
        tree = self.soup(markup)
        div = tree.div

        position = 0
        for child in div.contents:
            self.assertEqual(position, div.index(child))
            position += 1

        # Something that is not a child of the tree cannot be indexed.
        with self.assertRaises(ValueError):
            tree.index(1)
415
416
class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        # An empty list followed by nested lists; the <b> tag inside
        # the innermost list is where every test starts from.
        markup = '''<ul id="empty"></ul>
                 <ul id="top">
                  <ul id="middle">
                   <ul id="bottom">
                    <b>Start here</b>
                   </ul>
                  </ul>'''
        self.tree = self.soup(markup)
        self.start = self.tree.b

    def test_parent(self):
        # Walk upwards one .parent at a time.
        bottom = self.start.parent
        self.assertEqual(bottom['id'], 'bottom')
        middle = bottom.parent
        self.assertEqual(middle['id'], 'middle')
        top = middle.parent
        self.assertEqual(top['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        first_child = self.tree.contents[0]
        self.assertEqual(first_child.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        # Results are ordered from the nearest parent outwards.
        all_lists = self.start.find_parents('ul')
        self.assertSelectsIDs(all_lists, ['bottom', 'middle', 'top'])

        middle_only = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(middle_only, ['middle'])

    def test_find_parent(self):
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')
        topmost = self.start.find_parent('ul', id='top')
        self.assertEqual(topmost['id'], 'top')

    def test_parent_of_text_element(self):
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.find_parent('ul')['id'], 'bottom')

    def test_parent_generator(self):
        # Collect the id of every parent that has one, skipping None
        # and any parent without an 'id' attribute.
        found_ids = []
        for ancestor in self.start.parents:
            if ancestor is None:
                continue
            if 'id' not in ancestor.attrs:
                continue
            found_ids.append(ancestor['id'])
        self.assertEqual(found_ids, ['bottom', 'middle', 'top'])
466
467
class ProximityTest(TreeTest):
    """Shared fixture for the next/previous element tests.

    setUp() parses a small document with three <b> tags (ids 1, 2 and
    3) into self.tree.
    """

    def setUp(self):
        # Name this class, not its parent, in super(): naming TreeTest
        # would start the lookup after TreeTest and skip any setUp()
        # that TreeTest itself defines.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
474
475
class TestNextOperations(ProximityTest):
    """Test next_element and the find_next family, starting at the first <b>."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        # The element after a tag is its first child (here, its text);
        # the element after that text is the following tag.
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        # A bare find_all_next(id=3) call used to sit here with its
        # result discarded; the assertion below makes the same call.
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
515
class TestPreviousOperations(ProximityTest):
    """Test previous_element and the find_previous family, starting at "Three"."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        # Stepping back from the text reaches its own <b> tag first,
        # then the text of the <b> tag before it.
        enclosing_tag = self.end.previous_element
        self.assertEqual(enclosing_tag['id'], "3")
        self.assertEqual(enclosing_tag.previous_element, "Two")

    def test_previous_of_first_item_is_none(self):
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # The <b> tag that contains "Three" comes before the "Three"
        # node itself, so it is part of the results.
        earlier_b_tags = self.end.find_all_previous('b')
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

        first_b = self.end.find_all_previous(id=1)
        self.assertSelects(first_b, ["One"])

    def test_find_previous(self):
        nearest_b = self.end.find_previous('b')
        self.assertEqual(nearest_b['id'], '3')
        self.assertEqual(self.end.find_previous(text="One"), "One")

    def test_find_previous_for_text_element(self):
        text_node = self.tree.find(text="Three")
        self.assertEqual(text_node.find_previous("b").string, "Three")

        earlier_b_tags = text_node.find_all_previous("b")
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

    def test_previous_generator(self):
        first_text = self.tree.find(text="One")
        predecessors = list(first_text.previous_elements)

        # Four predecessors, nearest first: the <b> tag containing
        # "One", then the <body>, <head> and <html> tags.
        b, body, head, html = predecessors
        self.assertEqual(b['id'], '1')
        for tag, expected_name in ((body, "body"), (head, "head"), (html, "html")):
            self.assertEqual(tag.name, expected_name)
565
566
class SiblingTest(TreeTest):
    """Shared fixture for the sibling tests.

    setUp() parses four top-level <span> tags (ids 1 to 4) into
    self.tree; the first three each contain one nested <span>
    (ids 1.1, 2.1 and 3.1).
    """

    def setUp(self):
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
        # The pattern is a raw string: "\s" in a plain literal is an
        # invalid escape sequence that newer Python versions warn about.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)
587
588
class TestNextSibling(SiblingTest):
    """Test next_sibling and find_next_sibling(s), starting at span "1"."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        root = self.tree
        self.assertEqual(root.next_sibling, None)

    def test_next_sibling(self):
        second = self.start.next_sibling
        self.assertEqual(second['id'], '2')
        self.assertEqual(second.next_sibling['id'], '3')

        # Unlike next_sibling, next_element steps into the nested span.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        # The <html> tag, an only child, and the last of several
        # children all lack a following sibling.
        self.assertEqual(self.tree.html.next_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.next_sibling, None)

        final_span = self.tree.find(id="4")
        self.assertEqual(final_span.next_sibling, None)

    def test_find_next_sibling(self):
        following = self.start.find_next_sibling('span')
        self.assertEqual(following['id'], '2')

    def test_next_siblings(self):
        following_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(following_spans, ['2', '3', '4'])

        third_only = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(third_only, ['3'])

    def test_next_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        foo = soup.find(text="Foo")

        b_tag = foo.next_sibling
        self.assertEqual(b_tag.name, 'b')
        self.assertEqual(b_tag.next_sibling, 'baz')

        self.assertSelects(foo.find_next_siblings('b'), ['bar'])
        self.assertEqual(foo.find_next_sibling(text="baz"), "baz")
        self.assertEqual(foo.find_next_sibling(text="nonesuch"), None)
632
633
class TestPreviousSibling(SiblingTest):
    """Test previous_sibling and find_previous_sibling(s), starting at span "4"."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        root = self.tree
        self.assertEqual(root.previous_sibling, None)

    def test_previous_sibling(self):
        third = self.end.previous_sibling
        self.assertEqual(third['id'], '3')
        self.assertEqual(third.previous_sibling['id'], '2')

        # Unlike previous_sibling, previous_element lands on the span
        # nested inside span "3".
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        # The <html> tag, an only child, and the first of several
        # children all lack a preceding sibling.
        self.assertEqual(self.tree.html.previous_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.previous_sibling, None)

        leading_span = self.tree.find(id="1")
        self.assertEqual(leading_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        preceding = self.end.find_previous_sibling('span')
        self.assertEqual(preceding['id'], '3')

    def test_previous_siblings(self):
        # Results are ordered from the nearest sibling backwards.
        preceding_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(preceding_spans, ['3', '2', '1'])

        first_only = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(first_only, ['1'])

    def test_previous_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        baz = soup.find(text="baz")

        b_tag = baz.previous_sibling
        self.assertEqual(b_tag.name, 'b')
        self.assertEqual(b_tag.previous_sibling, 'Foo')

        self.assertSelects(baz.find_previous_siblings('b'), ['bar'])
        self.assertEqual(baz.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(baz.find_previous_sibling(text="nonesuch"), None)
677
678
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""
    def test_new_tag(self):
        # A freshly created tag has the requested name and attributes
        # and is not attached to any parent.
        empty_soup = self.soup("")
        created = empty_soup.new_tag("foo", bar="baz")
        self.assertIsInstance(created, Tag)
        self.assertEqual("foo", created.name)
        self.assertEqual(dict(bar="baz"), created.attrs)
        self.assertEqual(None, created.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "xml")

            # With the XML builder both <br> and <p> are rendered as
            # empty-element tags, simply because they have no contents.
            rendered_br = xml_soup.new_tag("br").encode()
            rendered_p = xml_soup.new_tag("p").encode()
            self.assertEqual(b"<br/>", rendered_br)
            self.assertEqual(b"<p/>", rendered_p)

        html_soup = BeautifulSoup("", "html")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        rendered_br = html_soup.new_tag("br").encode()
        rendered_p = html_soup.new_tag("p").encode()
        self.assertEqual(b"<br/>", rendered_br)
        self.assertEqual(b"<p></p>", rendered_p)

    def test_new_string_creates_navigablestring(self):
        empty_soup = self.soup("")
        created = empty_soup.new_string("foo")
        self.assertEqual("foo", created)
        self.assertIsInstance(created, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        # The optional second argument selects the string class to use.
        empty_soup = self.soup("")
        created = empty_soup.new_string("foo", Comment)
        self.assertEqual("foo", created)
        self.assertIsInstance(created, Comment)
720
class TestTreeModification(SoupTest):
    """Test operations that change an already-parsed tree.

    Covers attribute assignment, insert/append, insert_before/after,
    replace_with, wrap/unwrap, extract, clear and assignment to .string.
    """

    def test_attribute_modification(self):
        soup = self.soup('<a id="1"></a>')
        # Overwrite an existing attribute.
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        # Delete it.
        del soup.a['id']
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        # Add a brand-new attribute.
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        doc = """<p id="1">Don't leave me <b>here</b>.</p>
                <p id="2">Don\'t leave!</p>"""
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_tag_with_itself(self):
        # Replacing a tag with itself leaves the document unchanged.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replaceWith('')
        right.replaceWith('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they are linked into the next_element chain.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        # Inserting a tag that is already in the tree moves it.
        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        # Inserting a tag that is already in the tree moves it.
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        # Neither the string nor the tag is attached to a tree here.
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        # Neither the string nor the tag is attached to a tree here.
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        show.replace_with(no)
        # Moving <b>no</b> away leaves the text nodes "There's " and
        # " business like " next to each other, hence the two spaces.
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        # The <em> tag is gone but its text stayed in place.
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        # wrap() returns the wrapper, which now holds the string.
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_extracts_tag_from_elsewhere(self):
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        # The text nodes on either side of the <div> ("Some content. "
        # and " More content.") are now adjacent, hence the two spaces.
        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
        self.assertEqual(len(soup.body.contents), 2)
        self.assertEqual(extracted.parent, None)
        self.assertEqual(extracted.previous_element, None)
        self.assertEqual(extracted.next_element.next_element, None)

        # The gap where the extracted tag used to be has been mended.
        content_1 = soup.find(text="Some content. ")
        content_2 = soup.find(text=" More content.")
        self.assertEqual(content_1.next_element, content_2)
        self.assertEqual(content_1.next_sibling, content_2)
        self.assertEqual(content_2.previous_element, content_1)
        self.assertEqual(content_2.previous_sibling, content_1)

    def test_extract_distinguishes_between_identical_strings(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        foo_1 = soup.a.string
        bar_1 = soup.b.string
        foo_2 = soup.new_string("foo")
        bar_2 = soup.new_string("bar")
        soup.a.append(foo_2)
        soup.b.append(bar_2)

        # Now there are two identical strings in the <a> tag, and two
        # in the <b> tag. Let's remove the first "foo" and the second
        # "bar".
        foo_1.extract()
        bar_2.extract()
        self.assertEqual(foo_2, soup.a.string)
        self.assertEqual(bar_2, soup.b.string)

    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
        # clear using extract()
        a = soup.a
        soup.p.clear()
        self.assertEqual(len(soup.p.contents), 0)
        self.assertTrue(hasattr(a, "contents"))

        # clear using decompose()
        em = a.em
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
        soup.a.string = "foo"
        self.assertEqual(soup.a.contents, ["foo"])
        # Assigning to .string replaces whatever the tag contained.
        soup.b.string = "bar"
        self.assertEqual(soup.b.contents, ["bar"])

    def test_string_set_does_not_affect_original_string(self):
        soup = self.soup("<a><b>foo</b><c>bar</c>")
        soup.b.string = soup.c.string
        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")

    def test_set_string_preserves_class_of_string(self):
        soup = self.soup("<a></a>")
        cdata = CData("foo")
        soup.a.string = cdata
        self.assertTrue(isinstance(soup.a.string, CData))
1102
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Force the warning to be recorded even if the same warning
            # was already shown earlier in this process; otherwise the
            # default filter may suppress it and w[0] raises IndexError.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from
        __contains__. has_attr() checks the tag's attributes and
        __contains__ checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))


    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with more than one child has no .string. The <a> tag is
        # the one with several children here; the original assertions
        # on the empty <b> tag are kept alongside.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.a.string, None)
        self.assertEqual(soup.b.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.a.string, None)
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
        # The pieces are "a", "r", " " and " t ", so the plain
        # concatenation has two spaces between "r" and "t".
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        # Comments are included when asked for explicitly, or when no
        # type filter is applied at all.
        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
1212
class TestCDAtaListAttributes(SoupTest):

    """Testing cdata-list attributes like 'class'.
    """
    def test_single_value_becomes_list(self):
        parsed = self.soup("<a class='foo'>")
        class_value = parsed.a['class']
        self.assertEqual(["foo"], class_value)

    def test_multiple_values_becomes_list(self):
        parsed = self.soup("<a class='foo bar'>")
        class_value = parsed.a['class']
        self.assertEqual(["foo", "bar"], class_value)

    def test_multiple_values_separated_by_weird_whitespace(self):
        # Tabs and newlines separate the values just like spaces do.
        parsed = self.soup("<a class='foo\tbar\nbaz'>")
        class_value = parsed.a['class']
        self.assertEqual(["foo", "bar", "baz"], class_value)

    def test_attributes_joined_into_string_on_output(self):
        # On output the values are joined with a single space,
        # whatever whitespace separated them in the input.
        parsed = self.soup("<a class='foo\tbar'>")
        rendered = parsed.a.encode()
        self.assertEqual(b'<a class="foo bar"></a>', rendered)

    def test_accept_charset(self):
        parsed = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
        charsets = parsed.form['accept-charset']
        self.assertEqual(['ISO-8859-1', 'UTF-8'], charsets)

    def test_cdata_attribute_applying_only_to_one_tag(self):
        # accept-charset is split into a list on a <form> tag (see
        # test_accept_charset); on an <a> tag the value is expected to
        # stay a single string.
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        parsed = self.soup(markup)
        self.assertEqual('ISO-8859-1 UTF-8', parsed.a['accept-charset'])

    def test_string_has_immutable_name_property(self):
        string = self.soup("s").string
        self.assertEqual(None, string.name)

        # Assigning to .name on a string must be rejected.
        def assign_name():
            string.name = 'foo'
        self.assertRaises(AttributeError, assign_name)
1251
class TestPersistence(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        # Parse one fixed HTML page; the tests below copy or pickle it.
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # A round trip through pickle (protocol 2) must give back a
        # BeautifulSoup object that renders the same as the original.
        restored = pickle.loads(pickle.dumps(self.tree, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        # A deep copy must render the same as the tree it was made from.
        duplicate = copy.deepcopy(self.tree)
        self.assertEqual(duplicate.decode(), self.tree.decode())

    def test_unicode_pickle(self):
        # Non-ASCII text must survive pickling at the highest protocol.
        snowman_soup = self.soup(u"<b>\N{SNOWMAN}</b>")
        payload = pickle.dumps(snowman_soup, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(payload)
        self.assertEqual(restored.decode(), snowman_soup.decode())
1295
1296
class TestSubstitutions(SoupTest):
    """Tests for output formatters, prettifying and encoding substitution."""

    def test_default_formatter_is_minimal(self):
        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        # Deliberately pass no formatter: this test is about the default,
        # and passing formatter="minimal" would merely duplicate
        # test_formatter_minimal.
        decoded = soup.decode()
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        # Both the angle brackets and the e-with-acute become entities.
        self.assertEqual(
            decoded,
            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(decoded,
                          self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter = lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for(u"<b><FOO></b><b>BAR</b>"))

    def test_formatter_is_run_on_attribute_values(self):
        markup = u'<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        # Minimal formatting (explicit or default) escapes only the
        # ampersand in the attribute value.
        expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'

        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        # The html formatter also turns the e-with-acute into an entity.
        expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        # formatter=None leaves the value untouched; a callable is
        # applied to attribute values as well as to text.
        self.assertEqual(markup, a.decode(formatter=None))
        expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        doc = """
 <script type="text/javascript">
 console.log("< < hey > > ");
 </script>
"""
        encoded = BeautifulSoup(doc).encode()
        # The angle brackets inside <script> must come out unescaped.
        self.assertTrue(b"< < hey > >" in encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        doc = """
 <style type="text/css">
 console.log("< < hey > > ");
 </style>
"""
        encoded = BeautifulSoup(doc).encode()
        # The angle brackets inside <style> must come out unescaped.
        self.assertTrue(b"< < hey > >" in encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
        # Everything outside the <pre> tag is reformatted, but everything
        # inside is left alone.
        self.assertEqual(
            u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter(self):
        soup = BeautifulSoup("<html><body>foo</body></html>")
        pretty = soup.prettify(formatter = lambda x: x.upper())
        self.assertTrue("FOO" in pretty)

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        self.assertEqual(unicode, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        # The e-with-acute is emitted as UTF-8 bytes, not as an entity.
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertTrue(b"charset=utf-8" in utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertTrue(b"charset=euc_jp" in euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertTrue(b"charset=shift-jis" in shift_jis)

        # UTF-16 output is decoded again so the check can use text.
        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertTrue("charset=utf-16" in utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
1442
class TestEncoding(SoupTest):
    """Test the ability to encode objects into strings."""

    def test_unicode_string_can_be_encoded(self):
        # A text node encodes to the same bytes as the bare character.
        text_node = self.soup(u"<b>\N{SNOWMAN}</b>").b.string
        self.assertEqual(text_node.encode("utf-8"),
                         u"\N{SNOWMAN}".encode("utf-8"))

    def test_tag_containing_unicode_string_can_be_encoded(self):
        # Encoding the tag gives the same bytes as encoding its markup.
        markup = u"<b>\N{SNOWMAN}</b>"
        bold = self.soup(markup).b
        self.assertEqual(bold.encode("utf-8"), markup.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        # A character ASCII cannot hold becomes a numeric reference.
        bold = self.soup(u"<b>\N{SNOWMAN}</b>").b
        self.assertEqual(bold.encode("ascii"), b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        # errors="strict" makes the same situation raise instead.
        tree = self.soup(u"<b>\N{SNOWMAN}</b>")
        self.assertRaises(
            UnicodeEncodeError, tree.encode, "ascii", errors="strict")

    def test_decode_contents(self):
        bold = self.soup(u"<b>\N{SNOWMAN}</b>").b
        self.assertEqual(u"\N{SNOWMAN}", bold.decode_contents())

    def test_encode_contents(self):
        bold = self.soup(u"<b>\N{SNOWMAN}</b>").b
        expected = u"\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.encode_contents(encoding="utf8"))

    def test_deprecated_renderContents(self):
        bold = self.soup(u"<b>\N{SNOWMAN}</b>").b
        expected = u"\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.renderContents())
1486
class TestNavigableStringSubclasses(SoupTest):

    def test_cdata(self):
        # The builders never produce CData objects themselves, so one
        # is built by hand and inserted into an empty document.
        document = self.soup("")
        document.insert(1, CData("foo"))
        self.assertEqual(str(document), "<![CDATA[foo]]>")
        self.assertEqual(document.find(text="foo"), "foo")
        self.assertEqual(document.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0

        def count_and_mangle(*args):
            # Record the call, then return text that must never reach
            # the output.
            self.count += 1
            return "BITTER FAILURE"

        document = self.soup("")
        document.insert(1, CData("<><><>"))
        rendered = document.encode(formatter=count_and_mangle)
        self.assertEqual(b"<![CDATA[<><><>]]>", rendered)
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # A DOCTYPE, unlike the other NavigableString subclasses, is
        # always rendered with a trailing newline.
        document = self.soup("")
        document.insert(1, Doctype("foo"))
        self.assertEqual(document.encode(), b"<!DOCTYPE foo>\n")
1524
1525
class TestSoupSelector(TreeTest):
    """Tests for CSS selector support via select().

    Every test runs against the document in HTML, which setUp parses
    into self.soup.
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>

<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
</span>
</div>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setUp(self):
        # In this class self.soup is the parsed HTML document itself,
        # not a callable that parses markup.
        self.soup = BeautifulSoup(self.HTML)

    def assertSelects(self, selector, expected_ids):
        """Assert that `selector` matches exactly the given element ids.

        The comparison ignores order: both id lists are sorted first.
        Sorted copies are used so the caller's `expected_ids` list is
        never modified.
        """
        el_ids = sorted(el['id'] for el in self.soup.select(selector))
        wanted_ids = sorted(expected_ids)
        self.assertEqual(wanted_ids, el_ids,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(wanted_ids), ', '.join(el_ids)
            )
        )

    assertSelect = assertSelects

    def assertSelectMultiple(self, *tests):
        """Run assertSelect for each (selector, expected_ids) pair."""
        for selector, expected_ids in tests:
            self.assertSelect(selector, expected_ids)

    def test_one_tag_one(self):
        els = self.soup.select('title')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].name, 'title')
        self.assertEqual(els[0].contents, [u'The title'])

    def test_one_tag_many(self):
        els = self.soup.select('div')
        self.assertEqual(len(els), 3)
        for div in els:
            self.assertEqual(div.name, 'div')

    def test_tag_in_tag_one(self):
        self.assertSelects('div div', ['inner'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['main', 'inner', 'footer'])

    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(ValueError, self.soup.select, 'tag%t')

    def test_header_tags(self):
        self.assertSelectMultiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            self.assertEqual(len(els), 1)
            self.assertEqual(els[0].name, 'p')
            self.assertEqual(els[0]['class'], ['onep'])

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        self.assertEqual(len(els), 0)

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assertSelects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        self.assertEqual(len(els), 0)

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        self.assertEqual(len(els), 3)
        for el in els:
            self.assertEqual(el.name, 'p')
        # The second <p> carries class "onep"; the first has no class.
        self.assertEqual(els[1]['class'], ['onep'])
        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            self.assertEqual(len(self.soup.select(selector)), 0)

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
            '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assertSelects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assertSelects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
        )

    def test_attribute_endswith(self):
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
            ('div[id$="1"]', []),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('a[href*="http://"]', ['bob', 'me']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
            ('div[id*="1"]', []),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
        )

    def test_attribute_exact_or_hypen(self):
        # |= matches the exact value or the value followed by a hyphen.
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
        )

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Pass in an invalid value.
        self.assertRaises(
            ValueError, self.soup.select, 'div p:nth-of-type(0)')

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Some text')

    def test_id_child_selector_nth_of_type(self):
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        # NOTE(review): assertSelectsIDs is not defined in this class;
        # presumably inherited from TreeTest -- confirm.
        self.assertSelectsIDs(selected, ['inner'])

    def test_overspecified_child_id(self):
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        self.assertEqual([], self.soup.select('#p1 + p'))

    def test_general_sibling_selector(self):
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(ValueError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])