diff options
Diffstat (limited to 'bitbake/lib/bs4')
-rw-r--r-- | bitbake/lib/bs4/AUTHORS.txt | 43 | ||||
-rw-r--r-- | bitbake/lib/bs4/COPYING.txt | 26 | ||||
-rw-r--r-- | bitbake/lib/bs4/NEWS.txt | 1066 | ||||
-rw-r--r-- | bitbake/lib/bs4/__init__.py | 406 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/__init__.py | 321 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_html5lib.py | 285 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_htmlparser.py | 258 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 233 | ||||
-rw-r--r-- | bitbake/lib/bs4/dammit.py | 829 | ||||
-rw-r--r-- | bitbake/lib/bs4/diagnose.py | 204 | ||||
-rw-r--r-- | bitbake/lib/bs4/element.py | 1611 | ||||
-rw-r--r-- | bitbake/lib/bs4/testing.py | 592 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/__init__.py | 1 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_builder_registry.py | 141 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_docs.py | 36 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_html5lib.py | 85 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_htmlparser.py | 19 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_lxml.py | 91 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_soup.py | 434 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_tree.py | 1829 |
20 files changed, 8510 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/AUTHORS.txt b/bitbake/lib/bs4/AUTHORS.txt new file mode 100644 index 0000000000..2ac8fcc8cc --- /dev/null +++ b/bitbake/lib/bs4/AUTHORS.txt | |||
@@ -0,0 +1,43 @@ | |||
1 | Behold, mortal, the origins of Beautiful Soup... | ||
2 | ================================================ | ||
3 | |||
4 | Leonard Richardson is the primary programmer. | ||
5 | |||
6 | Aaron DeVore is awesome. | ||
7 | |||
8 | Mark Pilgrim provided the encoding detection code that forms the base | ||
9 | of UnicodeDammit. | ||
10 | |||
11 | Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful | ||
12 | Soup 4 working under Python 3. | ||
13 | |||
14 | Simon Willison wrote soupselect, which was used to make Beautiful Soup | ||
15 | support CSS selectors. | ||
16 | |||
17 | Sam Ruby helped with a lot of edge cases. | ||
18 | |||
19 | Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his | ||
20 | work in solving the nestable tags conundrum. | ||
21 | |||
22 | An incomplete list of people who have contributed patches to Beautiful | ||
23 | Soup: | ||
24 | |||
25 | Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, | ||
26 | Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris | ||
27 | Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, | ||
28 | Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed | ||
29 | Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko | ||
30 | Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn | ||
31 | Webster, Paul Wright, Danny Yoo | ||
32 | |||
33 | An incomplete list of people who made suggestions or found bugs or | ||
34 | found ways to break Beautiful Soup: | ||
35 | |||
36 | Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, | ||
37 | Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, | ||
38 | Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams, | ||
39 | warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, | ||
40 | Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed | ||
41 | Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart | ||
42 | Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de | ||
43 | Sousa Rocha, Yichun Wei, Per Vognsen | ||
diff --git a/bitbake/lib/bs4/COPYING.txt b/bitbake/lib/bs4/COPYING.txt new file mode 100644 index 0000000000..d668d13f04 --- /dev/null +++ b/bitbake/lib/bs4/COPYING.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | Beautiful Soup is made available under the MIT license: | ||
2 | |||
3 | Copyright (c) 2004-2012 Leonard Richardson | ||
4 | |||
5 | Permission is hereby granted, free of charge, to any person obtaining | ||
6 | a copy of this software and associated documentation files (the | ||
7 | "Software"), to deal in the Software without restriction, including | ||
8 | without limitation the rights to use, copy, modify, merge, publish, | ||
9 | distribute, sublicense, and/or sell copies of the Software, and to | ||
10 | permit persons to whom the Software is furnished to do so, subject to | ||
11 | the following conditions: | ||
12 | |||
13 | The above copyright notice and this permission notice shall be | ||
14 | included in all copies or substantial portions of the Software. | ||
15 | |||
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
23 | SOFTWARE, DAMMIT. | ||
24 | |||
25 | Beautiful Soup incorporates code from the html5lib library, which is | ||
26 | also made available under the MIT license. | ||
diff --git a/bitbake/lib/bs4/NEWS.txt b/bitbake/lib/bs4/NEWS.txt new file mode 100644 index 0000000000..88a60a2458 --- /dev/null +++ b/bitbake/lib/bs4/NEWS.txt | |||
@@ -0,0 +1,1066 @@ | |||
1 | = 4.3.2 (20131002) = | ||
2 | |||
3 | * Fixed a bug in which short Unicode input was improperly encoded to | ||
4 | ASCII when checking whether or not it was the name of a file on | ||
5 | disk. [bug=1227016] | ||
6 | |||
7 | * Fixed a crash when a short input contains data not valid in | ||
8 | filenames. [bug=1232604] | ||
9 | |||
10 | * Fixed a bug that caused Unicode data put into UnicodeDammit to | ||
11 | return None instead of the original data. [bug=1214983] | ||
12 | |||
13 | * Combined two tests to stop a spurious test failure when tests are | ||
14 | run by nosetests. [bug=1212445] | ||
15 | |||
16 | = 4.3.1 (20130815) = | ||
17 | |||
18 | * Fixed yet another problem with the html5lib tree builder, caused by | ||
19 | html5lib's tendency to rearrange the tree during | ||
20 | parsing. [bug=1189267] | ||
21 | |||
22 | * Fixed a bug that caused the optimized version of find_all() to | ||
23 | return nothing. [bug=1212655] | ||
24 | |||
25 | = 4.3.0 (20130812) = | ||
26 | |||
27 | * Instead of converting incoming data to Unicode and feeding it to the | ||
28 | lxml tree builder in chunks, Beautiful Soup now makes successive | ||
29 | guesses at the encoding of the incoming data, and tells lxml to | ||
30 | parse the data as that encoding. Giving lxml more control over the | ||
31 | parsing process improves performance and avoids a number of bugs and | ||
32 | issues with the lxml parser which had previously required elaborate | ||
33 | workarounds: | ||
34 | |||
35 | - An issue in which lxml refuses to parse Unicode strings on some | ||
36 | systems. [bug=1180527] | ||
37 | |||
38 | - A returning bug that truncated documents longer than a (very | ||
39 | small) size. [bug=963880] | ||
40 | |||
41 | - A returning bug in which extra spaces were added to a document if | ||
42 | the document defined a charset other than UTF-8. [bug=972466] | ||
43 | |||
44 | This required a major overhaul of the tree builder architecture. If | ||
45 | you wrote your own tree builder and didn't tell me, you'll need to | ||
46 | modify your prepare_markup() method. | ||
47 | |||
48 | * The UnicodeDammit code that makes guesses at encodings has been | ||
49 | split into its own class, EncodingDetector. A lot of apparently | ||
50 | redundant code has been removed from Unicode, Dammit, and some | ||
51 | undocumented features have also been removed. | ||
52 | |||
53 | * Beautiful Soup will issue a warning if instead of markup you pass it | ||
54 | a URL or the name of a file on disk (a common beginner's mistake). | ||
55 | |||
56 | * A number of optimizations improve the performance of the lxml tree | ||
57 | builder by about 33%, the html.parser tree builder by about 20%, and | ||
58 | the html5lib tree builder by about 15%. | ||
59 | |||
60 | * All find_all calls should now return a ResultSet object. Patch by | ||
61 | Aaron DeVore. [bug=1194034] | ||
62 | |||
63 | = 4.2.1 (20130531) = | ||
64 | |||
65 | * The default XML formatter will now replace ampersands even if they | ||
66 | appear to be part of entities. That is, "&lt;" will become | ||
67 | "&amp;lt;". The old code was left over from Beautiful Soup 3, which | ||
68 | didn't always turn entities into Unicode characters. | ||
69 | |||
70 | If you really want the old behavior (maybe because you add new | ||
71 | strings to the tree, those strings include entities, and you want | ||
72 | the formatter to leave them alone on output), it can be found in | ||
73 | EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183] | ||
74 | |||
75 | * Gave new_string() the ability to create subclasses of | ||
76 | NavigableString. [bug=1181986] | ||
77 | |||
78 | * Fixed another bug by which the html5lib tree builder could create a | ||
79 | disconnected tree. [bug=1182089] | ||
80 | |||
81 | * The .previous_element of a BeautifulSoup object is now always None, | ||
82 | not the last element to be parsed. [bug=1182089] | ||
83 | |||
84 | * Fixed test failures when lxml is not installed. [bug=1181589] | ||
85 | |||
86 | * html5lib now supports Python 3. Fixed some Python 2-specific | ||
87 | code in the html5lib test suite. [bug=1181624] | ||
88 | |||
89 | * The html.parser treebuilder can now handle numeric attributes in | ||
90 | text when the hexadecimal name of the attribute starts with a | ||
91 | capital X. Patch by Tim Shirley. [bug=1186242] | ||
92 | |||
93 | = 4.2.0 (20130514) = | ||
94 | |||
95 | * The Tag.select() method now supports a much wider variety of CSS | ||
96 | selectors. | ||
97 | |||
98 | - Added support for the adjacent sibling combinator (+) and the | ||
99 | general sibling combinator (~). Tests by "liquider". [bug=1082144] | ||
100 | |||
101 | - The combinators (>, +, and ~) can now combine with any supported | ||
102 | selector, not just one that selects based on tag name. | ||
103 | |||
104 | - Added limited support for the "nth-of-type" pseudo-class. Code | ||
105 | by Sven Slootweg. [bug=1109952] | ||
106 | |||
107 | * The BeautifulSoup class is now aliased to "_s" and "_soup", making | ||
108 | it quicker to type the import statement in an interactive session: | ||
109 | |||
110 | from bs4 import _s | ||
111 | or | ||
112 | from bs4 import _soup | ||
113 | |||
114 | The alias may change in the future, so don't use this in code you're | ||
115 | going to run more than once. | ||
116 | |||
117 | * Added the 'diagnose' submodule, which includes several useful | ||
118 | functions for reporting problems and doing tech support. | ||
119 | |||
120 | - diagnose(data) tries the given markup on every installed parser, | ||
121 | reporting exceptions and displaying successes. If a parser is not | ||
122 | installed, diagnose() mentions this fact. | ||
123 | |||
124 | - lxml_trace(data, html=True) runs the given markup through lxml's | ||
125 | XML parser or HTML parser, and prints out the parser events as | ||
126 | they happen. This helps you quickly determine whether a given | ||
127 | problem occurs in lxml code or Beautiful Soup code. | ||
128 | |||
129 | - htmlparser_trace(data) is the same thing, but for Python's | ||
130 | built-in HTMLParser class. | ||
131 | |||
132 | * In an HTML document, the contents of a <script> or <style> tag will | ||
133 | no longer undergo entity substitution by default. XML documents work | ||
134 | the same way they did before. [bug=1085953] | ||
135 | |||
136 | * Methods like get_text() and properties like .strings now only give | ||
137 | you strings that are visible in the document--no comments or | ||
138 | processing commands. [bug=1050164] | ||
139 | |||
140 | * The prettify() method now leaves the contents of <pre> tags | ||
141 | alone. [bug=1095654] | ||
142 | |||
143 | * Fix a bug in the html5lib treebuilder which sometimes created | ||
144 | disconnected trees. [bug=1039527] | ||
145 | |||
146 | * Fix a bug in the lxml treebuilder which crashed when a tag included | ||
147 | an attribute from the predefined "xml:" namespace. [bug=1065617] | ||
148 | |||
149 | * Fix a bug by which keyword arguments to find_parent() were not | ||
150 | being passed on. [bug=1126734] | ||
151 | |||
152 | * Stop a crash when unwisely messing with a tag that's been | ||
153 | decomposed. [bug=1097699] | ||
154 | |||
155 | * Now that lxml's segfault on invalid doctype has been fixed, fixed a | ||
156 | corresponding problem on the Beautiful Soup end that was previously | ||
157 | invisible. [bug=984936] | ||
158 | |||
159 | * Fixed an exception when an overspecified CSS selector didn't match | ||
160 | anything. Code by Stefaan Lippens. [bug=1168167] | ||
161 | |||
162 | = 4.1.3 (20120820) = | ||
163 | |||
164 | * Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious | ||
165 | test failure caused by the lousy HTMLParser in those | ||
166 | versions. [bug=1038503] | ||
167 | |||
168 | * Raise a more specific error (FeatureNotFound) when a requested | ||
169 | parser or parser feature is not installed. Raise NotImplementedError | ||
170 | instead of ValueError when the user calls insert_before() or | ||
171 | insert_after() on the BeautifulSoup object itself. Patch by Aaron | ||
172 | DeVore. [bug=1038301] | ||
173 | |||
174 | = 4.1.2 (20120817) = | ||
175 | |||
176 | * As per PEP-8, allow searching by CSS class using the 'class_' | ||
177 | keyword argument. [bug=1037624] | ||
178 | |||
179 | * Display namespace prefixes for namespaced attribute names, instead of | ||
180 | the fully-qualified names given by the lxml parser. [bug=1037597] | ||
181 | |||
182 | * Fixed a crash on encoding when an attribute name contained | ||
183 | non-ASCII characters. | ||
184 | |||
185 | * When sniffing encodings, if the cchardet library is installed, | ||
186 | Beautiful Soup uses it instead of chardet. cchardet is much | ||
187 | faster. [bug=1020748] | ||
188 | |||
189 | * Use logging.warning() instead of warnings.warn() to notify the user | ||
190 | that characters were replaced with REPLACEMENT | ||
191 | CHARACTER. [bug=1013862] | ||
192 | |||
193 | = 4.1.1 (20120703) = | ||
194 | |||
195 | * Fixed an html5lib tree builder crash which happened when html5lib | ||
196 | moved a tag with a multivalued attribute from one part of the tree | ||
197 | to another. [bug=1019603] | ||
198 | |||
199 | * Correctly display closing tags with an XML namespace declared. Patch | ||
200 | by Andreas Kostyrka. [bug=1019635] | ||
201 | |||
202 | * Fixed a typo that made parsing significantly slower than it should | ||
203 | have been, and also waited too long to close tags with XML | ||
204 | namespaces. [bug=1020268] | ||
205 | |||
206 | * get_text() now returns an empty Unicode string if there is no text, | ||
207 | rather than an empty bytestring. [bug=1020387] | ||
208 | |||
209 | = 4.1.0 (20120529) = | ||
210 | |||
211 | * Added experimental support for fixing Windows-1252 characters | ||
212 | embedded in UTF-8 documents. (UnicodeDammit.detwingle()) | ||
213 | |||
214 | * Fixed the handling of &quot; with the built-in parser. [bug=993871] | ||
215 | |||
216 | * Comments, processing instructions, document type declarations, and | ||
217 | markup declarations are now treated as preformatted strings, the way | ||
218 | CData blocks are. [bug=1001025] | ||
219 | |||
220 | * Fixed a bug with the lxml treebuilder that prevented the user from | ||
221 | adding attributes to a tag that didn't originally have | ||
222 | attributes. [bug=1002378] Thanks to Oliver Beattie for the patch. | ||
223 | |||
224 | * Fixed some edge-case bugs having to do with inserting an element | ||
225 | into a tag it's already inside, and replacing one of a tag's | ||
226 | children with another. [bug=997529] | ||
227 | |||
228 | * Added the ability to search for attribute values specified in UTF-8. [bug=1003974] | ||
229 | |||
230 | This caused a major refactoring of the search code. All the tests | ||
231 | pass, but it's possible that some searches will behave differently. | ||
232 | |||
233 | = 4.0.5 (20120427) = | ||
234 | |||
235 | * Added a new method, wrap(), which wraps an element in a tag. | ||
236 | |||
237 | * Renamed replace_with_children() to unwrap(), which is easier to | ||
238 | understand and also the jQuery name of the function. | ||
239 | |||
240 | * Made encoding substitution in <meta> tags completely transparent (no | ||
241 | more %SOUP-ENCODING%). | ||
242 | |||
243 | * Fixed a bug in decoding data that contained a byte-order mark, such | ||
244 | as data encoded in UTF-16LE. [bug=988980] | ||
245 | |||
246 | * Fixed a bug that made the HTMLParser treebuilder generate XML | ||
247 | definitions ending with two question marks instead of | ||
248 | one. [bug=984258] | ||
249 | |||
250 | * Upon document generation, CData objects are no longer run through | ||
251 | the formatter. [bug=988905] | ||
252 | |||
253 | * The test suite now passes when lxml is not installed, whether or not | ||
254 | html5lib is installed. [bug=987004] | ||
255 | |||
256 | * Print a warning on HTMLParseErrors to let people know they should | ||
257 | install a better parser library. | ||
258 | |||
259 | = 4.0.4 (20120416) = | ||
260 | |||
261 | * Fixed a bug that sometimes created disconnected trees. | ||
262 | |||
263 | * Fixed a bug with the string setter that moved a string around the | ||
264 | tree instead of copying it. [bug=983050] | ||
265 | |||
266 | * Attribute values are now run through the provided output formatter. | ||
267 | Previously they were always run through the 'minimal' formatter. In | ||
268 | the future I may make it possible to specify different formatters | ||
269 | for attribute values and strings, but for now, consistent behavior | ||
270 | is better than inconsistent behavior. [bug=980237] | ||
271 | |||
272 | * Added the missing renderContents method from Beautiful Soup 3. Also | ||
273 | added an encode_contents() method to go along with decode_contents(). | ||
274 | |||
275 | * Give a more useful error when the user tries to run the Python 2 | ||
276 | version of BS under Python 3. | ||
277 | |||
278 | * UnicodeDammit can now convert Microsoft smart quotes to ASCII with | ||
279 | UnicodeDammit(markup, smart_quotes_to="ascii"). | ||
280 | |||
281 | = 4.0.3 (20120403) = | ||
282 | |||
283 | * Fixed a typo that caused some versions of Python 3 to convert the | ||
284 | Beautiful Soup codebase incorrectly. | ||
285 | |||
286 | * Got rid of the 4.0.2 workaround for HTML documents--it was | ||
287 | unnecessary and the workaround was triggering a (possibly different, | ||
288 | but related) bug in lxml. [bug=972466] | ||
289 | |||
290 | = 4.0.2 (20120326) = | ||
291 | |||
292 | * Worked around a possible bug in lxml that prevents non-tiny XML | ||
293 | documents from being parsed. [bug=963880, bug=963936] | ||
294 | |||
295 | * Fixed a bug where specifying `text` while also searching for a tag | ||
296 | only worked if `text` wanted an exact string match. [bug=955942] | ||
297 | |||
298 | = 4.0.1 (20120314) = | ||
299 | |||
300 | * This is the first official release of Beautiful Soup 4. There is no | ||
301 | 4.0.0 release, to eliminate any possibility that packaging software | ||
302 | might treat "4.0.0" as being an earlier version than "4.0.0b10". | ||
303 | |||
304 | * Brought BS up to date with the latest release of soupselect, adding | ||
305 | CSS selector support for direct descendant matches and multiple CSS | ||
306 | class matches. | ||
307 | |||
308 | = 4.0.0b10 (20120302) = | ||
309 | |||
310 | * Added support for simple CSS selectors, taken from the soupselect project. | ||
311 | |||
312 | * Fixed a crash when using html5lib. [bug=943246] | ||
313 | |||
314 | * In HTML5-style <meta charset="foo"> tags, the value of the "charset" | ||
315 | attribute is now replaced with the appropriate encoding on | ||
316 | output. [bug=942714] | ||
317 | |||
318 | * Fixed a bug that caused calling a tag to sometimes call find_all() | ||
319 | with the wrong arguments. [bug=944426] | ||
320 | |||
321 | * For backwards compatibility, brought back the BeautifulStoneSoup | ||
322 | class as a deprecated wrapper around BeautifulSoup. | ||
323 | |||
324 | = 4.0.0b9 (20120228) = | ||
325 | |||
326 | * Fixed the string representation of DOCTYPEs that have both a public | ||
327 | ID and a system ID. | ||
328 | |||
329 | * Fixed the generated XML declaration. | ||
330 | |||
331 | * Renamed Tag.nsprefix to Tag.prefix, for consistency with | ||
332 | NamespacedAttribute. | ||
333 | |||
334 | * Fixed a test failure that occurred on Python 3.x when chardet was | ||
335 | installed. | ||
336 | |||
337 | * Made prettify() return Unicode by default, so it will look nice on | ||
338 | Python 3 when passed into print(). | ||
339 | |||
340 | = 4.0.0b8 (20120224) = | ||
341 | |||
342 | * All tree builders now preserve namespace information in the | ||
343 | documents they parse. If you use the html5lib parser or lxml's XML | ||
344 | parser, you can access the namespace URL for a tag as tag.namespace. | ||
345 | |||
346 | However, there is no special support for namespace-oriented | ||
347 | searching or tree manipulation. When you search the tree, you need | ||
348 | to use namespace prefixes exactly as they're used in the original | ||
349 | document. | ||
350 | |||
351 | * The string representation of a DOCTYPE always ends in a newline. | ||
352 | |||
353 | * Issue a warning if the user tries to use a SoupStrainer in | ||
354 | conjunction with the html5lib tree builder, which doesn't support | ||
355 | them. | ||
356 | |||
357 | = 4.0.0b7 (20120223) = | ||
358 | |||
359 | * Upon decoding to string, any characters that can't be represented in | ||
360 | your chosen encoding will be converted into numeric XML entity | ||
361 | references. | ||
362 | |||
363 | * Issue a warning if characters were replaced with REPLACEMENT | ||
364 | CHARACTER during Unicode conversion. | ||
365 | |||
366 | * Restored compatibility with Python 2.6. | ||
367 | |||
368 | * The install process no longer installs docs or auxiliary text files. | ||
369 | |||
370 | * It's now possible to deepcopy a BeautifulSoup object created with | ||
371 | Python's built-in HTML parser. | ||
372 | |||
373 | * About 100 unit tests that "test" the behavior of various parsers on | ||
374 | invalid markup have been removed. Legitimate changes to those | ||
375 | parsers caused these tests to fail, indicating that perhaps | ||
376 | Beautiful Soup should not test the behavior of foreign | ||
377 | libraries. | ||
378 | |||
379 | The problematic unit tests have been reformulated as informational | ||
380 | comparisons generated by the script | ||
381 | scripts/demonstrate_parser_differences.py. | ||
382 | |||
383 | This makes Beautiful Soup compatible with html5lib version 0.95 and | ||
384 | future versions of HTMLParser. | ||
385 | |||
386 | = 4.0.0b6 (20120216) = | ||
387 | |||
388 | * Multi-valued attributes like "class" always have a list of values, | ||
389 | even if there's only one value in the list. | ||
390 | |||
391 | * Added a number of multi-valued attributes defined in HTML5. | ||
392 | |||
393 | * Stopped generating a space before the slash that closes an | ||
394 | empty-element tag. This may come back if I add a special XHTML mode | ||
395 | (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty | ||
396 | useless. | ||
397 | |||
398 | * Passing text along with tag-specific arguments to a find* method: | ||
399 | |||
400 | find("a", text="Click here") | ||
401 | |||
402 | will find tags that contain the given text as their | ||
403 | .string. Previously, the tag-specific arguments were ignored and | ||
404 | only strings were searched. | ||
405 | |||
406 | * Fixed a bug that caused the html5lib tree builder to build a | ||
407 | partially disconnected tree. Generally cleaned up the html5lib tree | ||
408 | builder. | ||
409 | |||
410 | * If you restrict a multi-valued attribute like "class" to a string | ||
411 | that contains spaces, Beautiful Soup will only consider it a match | ||
412 | if the values correspond to that specific string. | ||
413 | |||
414 | = 4.0.0b5 (20120209) = | ||
415 | |||
416 | * Rationalized Beautiful Soup's treatment of CSS class. A tag | ||
417 | belonging to multiple CSS classes is treated as having a list of | ||
418 | values for the 'class' attribute. Searching for a CSS class will | ||
419 | match *any* of the CSS classes. | ||
420 | |||
421 | This actually affects all attributes that the HTML standard defines | ||
422 | as taking multiple values (class, rel, rev, archive, accept-charset, | ||
423 | and headers), but 'class' is by far the most common. [bug=41034] | ||
424 | |||
425 | * If you pass anything other than a dictionary as the second argument | ||
426 | to one of the find* methods, it'll assume you want to use that | ||
427 | object to search against a tag's CSS classes. Previously this only | ||
428 | worked if you passed in a string. | ||
429 | |||
430 | * Fixed a bug that caused a crash when you passed a dictionary as an | ||
431 | attribute value (possibly because you mistyped "attrs"). [bug=842419] | ||
432 | |||
433 | * Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags | ||
434 | like <meta charset="utf-8" />. [bug=837268] | ||
435 | |||
436 | * If Unicode, Dammit can't figure out a consistent encoding for a | ||
437 | page, it will try each of its guesses again, with errors="replace" | ||
438 | instead of errors="strict". This may mean that some data gets | ||
439 | replaced with REPLACEMENT CHARACTER, but at least most of it will | ||
440 | get turned into Unicode. [bug=754903] | ||
441 | |||
442 | * Patched over a bug in html5lib (?) that was crashing Beautiful Soup | ||
443 | on certain kinds of markup. [bug=838800] | ||
444 | |||
445 | * Fixed a bug that wrecked the tree if you replaced an element with an | ||
446 | empty string. [bug=728697] | ||
447 | |||
448 | * Improved Unicode, Dammit's behavior when you give it Unicode to | ||
449 | begin with. | ||
450 | |||
451 | = 4.0.0b4 (20120208) = | ||
452 | |||
453 | * Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() | ||
454 | |||
455 | * BeautifulSoup.new_tag() will follow the rules of whatever | ||
456 | tree-builder was used to create the original BeautifulSoup object. A | ||
457 | new <p> tag will look like "<p />" if the soup object was created to | ||
458 | parse XML, but it will look like "<p></p>" if the soup object was | ||
459 | created to parse HTML. | ||
460 | |||
461 | * We pass in strict=False to html.parser on Python 3, greatly | ||
462 | improving html.parser's ability to handle bad HTML. | ||
463 | |||
464 | * We also monkeypatch a serious bug in html.parser that made | ||
465 | strict=False disastrous on Python 3.2.2. | ||
466 | |||
467 | * Replaced the "substitute_html_entities" argument with the | ||
468 | more general "formatter" argument. | ||
469 | |||
470 | * Bare ampersands and angle brackets are always converted to XML | ||
471 | entities unless the user prevents it. | ||
472 | |||
473 | * Added PageElement.insert_before() and PageElement.insert_after(), | ||
474 | which let you put an element into the parse tree with respect to | ||
475 | some other element. | ||
476 | |||
477 | * Raise an exception when the user tries to do something nonsensical | ||
478 | like insert a tag into itself. | ||
479 | |||
480 | |||
481 | = 4.0.0b3 (20120203) = | ||
482 | |||
483 | Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful | ||
484 | Soup's custom HTML parser in favor of a system that lets you write a | ||
485 | little glue code and plug in any HTML or XML parser you want. | ||
486 | |||
487 | Beautiful Soup 4.0 comes with glue code for four parsers: | ||
488 | |||
489 | * Python's standard HTMLParser (html.parser in Python 3) | ||
490 | * lxml's HTML and XML parsers | ||
491 | * html5lib's HTML parser | ||
492 | |||
493 | HTMLParser is the default, but I recommend you install lxml if you | ||
494 | can. | ||
495 | |||
496 | For complete documentation, see the Sphinx documentation in | ||
497 | bs4/doc/source/. What follows is a summary of the changes from | ||
498 | Beautiful Soup 3. | ||
499 | |||
500 | === The module name has changed === | ||
501 | |||
502 | Previously you imported the BeautifulSoup class from a module also | ||
503 | called BeautifulSoup. To save keystrokes and make it clear which | ||
504 | version of the API is in use, the module is now called 'bs4': | ||
505 | |||
506 | >>> from bs4 import BeautifulSoup | ||
507 | |||
508 | === It works with Python 3 === | ||
509 | |||
510 | Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was | ||
511 | so bad that it barely worked at all. Beautiful Soup 4 works with | ||
512 | Python 3, and since its parser is pluggable, you don't sacrifice | ||
513 | quality. | ||
514 | |||
515 | Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3 | ||
516 | support to the finish line. Ezio Melotti is also to thank for greatly | ||
517 | improving the HTML parser that comes with Python 3.2. | ||
518 | |||
519 | === CDATA sections are normal text, if they're understood at all. === | ||
520 | |||
521 | Currently, the lxml and html5lib HTML parsers ignore CDATA sections in | ||
522 | markup: | ||
523 | |||
524 | <p><![CDATA[foo]]></p> => <p></p> | ||
525 | |||
526 | A future version of html5lib will turn CDATA sections into text nodes, | ||
527 | but only within tags like <svg> and <math>: | ||
528 | |||
529 | <svg><![CDATA[foo]]></svg> => <svg>foo</svg> | ||
530 | |||
531 | The default XML parser (which uses lxml behind the scenes) turns CDATA | ||
532 | sections into ordinary text elements: | ||
533 | |||
534 | <p><![CDATA[foo]]></p> => <p>foo</p> | ||
535 | |||
536 | In theory it's possible to preserve the CDATA sections when using the | ||
537 | XML parser, but I don't see how to get it to work in practice. | ||
538 | |||
539 | === Miscellaneous other stuff === | ||
540 | |||
541 | If the BeautifulSoup instance has .is_xml set to True, an appropriate | ||
542 | XML declaration will be emitted when the tree is transformed into a | ||
543 | string: | ||
544 | |||
545 | <?xml version="1.0" encoding="utf-8"?> | ||
546 | <markup> | ||
547 | ... | ||
548 | </markup> | ||
549 | |||
550 | The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree | ||
551 | builders set it to False. If you want to parse XHTML with an HTML | ||
552 | parser, you can set it manually. | ||
553 | |||
554 | |||
555 | = 3.2.0 = | ||
556 | |||
557 | The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2 | ||
558 | to make it obvious which one you should use. | ||
559 | |||
560 | = 3.1.0 = | ||
561 | |||
562 | A hybrid version that supports 2.4 and can be automatically converted | ||
563 | to run under Python 3.0. There are three backwards-incompatible | ||
564 | changes you should be aware of, but no new features or deliberate | ||
565 | behavior changes. | ||
566 | |||
567 | 1. str() may no longer do what you want. This is because the meaning | ||
568 | of str() inverts between Python 2 and 3; in Python 2 it gives you a | ||
569 | byte string, in Python 3 it gives you a Unicode string. | ||
570 | |||
571 | The effect of this is that you can't pass an encoding to .__str__ | ||
572 | anymore. Use encode() to get a string and decode() to get Unicode, and | ||
573 | you'll be ready (well, readier) for Python 3. | ||
574 | |||
575 | 2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, | ||
576 | which is gone in Python 3. There's some bad HTML that SGMLParser | ||
577 | handled but HTMLParser doesn't, usually to do with attribute values | ||
578 | that aren't closed or have brackets inside them: | ||
579 | |||
580 | <a href="foo</a>, </a><a href="bar">baz</a> | ||
581 | <a b="<a>">', '<a b="<a>"></a><a>"></a> | ||
582 | |||
583 | A later version of Beautiful Soup will allow you to plug in different | ||
584 | parsers to make tradeoffs between speed and the ability to handle bad | ||
585 | HTML. | ||
586 | |||
587 | 3. In Python 3 (but not Python 2), HTMLParser converts entities within | ||
588 | attributes to the corresponding Unicode characters. In Python 2 it's | ||
589 | possible to parse this string and leave the &eacute; intact. | ||
590 | |||
591 | <a href="http://crummy.com?sacr&eacute;&bleu"> | ||
592 | |||
593 | In Python 3, the &eacute; is always converted to \xe9 during | ||
594 | parsing. | ||
595 | |||
596 | |||
597 | = 3.0.7a = | ||
598 | |||
599 | Added an import that makes BS work in Python 2.3. | ||
600 | |||
601 | |||
602 | = 3.0.7 = | ||
603 | |||
604 | Fixed a UnicodeDecodeError when unpickling documents that contain | ||
605 | non-ASCII characters. | ||
606 | |||
607 | Fixed a TypeError that occurred in some circumstances when a tag | ||
608 | contained no text. | ||
609 | |||
610 | Jump through hoops to avoid the use of chardet, which can be extremely | ||
611 | slow in some circumstances. UTF-8 documents should never trigger the | ||
612 | use of chardet. | ||
613 | |||
614 | Whitespace is preserved inside <pre> and <textarea> tags that contain | ||
615 | nothing but whitespace. | ||
616 | |||
617 | Beautiful Soup can now parse a doctype that's scoped to an XML namespace. | ||
618 | |||
619 | |||
620 | = 3.0.6 = | ||
621 | |||
622 | Got rid of a very old debug line that prevented chardet from working. | ||
623 | |||
624 | Added a Tag.decompose() method that completely disconnects a tree or a | ||
625 | subset of a tree, breaking it up into bite-sized pieces that are | ||
626 | easy for the garbage collector to collect. | ||
627 | |||
628 | Tag.extract() now returns the tag that was extracted. | ||
629 | |||
630 | Tag.findNext() now does something with the keyword arguments you pass | ||
631 | it instead of dropping them on the floor. | ||
632 | |||
633 | Fixed a Unicode conversion bug. | ||
634 | |||
635 | Fixed a bug that garbled some <meta> tags when rewriting them. | ||
636 | |||
637 | |||
638 | = 3.0.5 = | ||
639 | |||
640 | Soup objects can now be pickled, and copied with copy.deepcopy. | ||
641 | |||
642 | Tag.append now works properly on existing BS objects. (It wasn't | ||
643 | originally intended for outside use, but it can be now.) (Giles | ||
644 | Radford) | ||
645 | |||
646 | Passing in a nonexistent encoding will no longer crash the parser on | ||
647 | Python 2.4 (John Nagle). | ||
648 | |||
649 | Fixed an underlying bug in SGMLParser that thinks ASCII has 255 | ||
650 | characters instead of 127 (John Nagle). | ||
651 | |||
652 | Entities are converted more consistently to Unicode characters. | ||
653 | |||
654 | Entity references in attribute values are now converted to Unicode | ||
655 | characters when appropriate. Numeric entities are always converted, | ||
656 | because SGMLParser always converts them outside of attribute values. | ||
657 | |||
658 | ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to | ||
659 | XHTML_ENTITIES. | ||
660 | |||
661 | The regular expression for bare ampersands was too loose. In some | ||
662 | cases ampersands were not being escaped. (Sam Ruby?) | ||
663 | |||
664 | Non-breaking spaces and other special Unicode space characters are no | ||
665 | longer folded to ASCII spaces. (Robert Leftwich) | ||
666 | |||
667 | Information inside a TEXTAREA tag is now parsed literally, not as HTML | ||
668 | tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang) | ||
669 | |||
670 | = 3.0.4 = | ||
671 | |||
672 | Fixed a bug that crashed Unicode conversion in some cases. | ||
673 | |||
674 | Fixed a bug that prevented UnicodeDammit from being used as a | ||
675 | general-purpose data scrubber. | ||
676 | |||
677 | Fixed some unit test failures when running against Python 2.5. | ||
678 | |||
679 | When considering whether to convert smart quotes, UnicodeDammit now | ||
680 | looks at the original encoding in a case-insensitive way. | ||
681 | |||
682 | = 3.0.3 (20060606) = | ||
683 | |||
684 | Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be | ||
685 | sure to pass in an appropriate value for convertEntities, or XML/HTML | ||
686 | entities might stick around that aren't valid in HTML/XML). The result | ||
687 | may not validate, but it should be good enough to not choke a | ||
688 | real-world XML parser. Specifically, the output of a properly | ||
689 | constructed soup object should always be valid as part of an XML | ||
690 | document, but parts may be missing if they were missing in the | ||
691 | original. As always, if the input is valid XML, the output will also | ||
692 | be valid. | ||
693 | |||
694 | = 3.0.2 (20060602) = | ||
695 | |||
696 | Previously, Beautiful Soup correctly handled attribute values that | ||
697 | contained embedded quotes (sometimes by escaping), but not other kinds | ||
698 | of XML character. Now, it correctly handles or escapes all special XML | ||
699 | characters in attribute values. | ||
700 | |||
701 | I aliased methods to the 2.x names (fetch, find, findText, etc.) for | ||
702 | backwards compatibility purposes. Those names are deprecated and if I | ||
703 | ever do a 4.0 I will remove them. I will, I tell you! | ||
704 | |||
705 | Fixed a bug where the findAll method wasn't passing along any keyword | ||
706 | arguments. | ||
707 | |||
708 | When run from the command line, Beautiful Soup now acts as an HTML | ||
709 | pretty-printer, not an XML pretty-printer. | ||
710 | |||
711 | = 3.0.1 (20060530) = | ||
712 | |||
713 | Reintroduced the "fetch by CSS class" shortcut. I thought keyword | ||
714 | arguments would replace it, but they don't. You can't call soup('a', | ||
715 | class='foo') because class is a Python keyword. | ||
716 | |||
717 | If Beautiful Soup encounters a meta tag that declares the encoding, | ||
718 | but a SoupStrainer tells it not to parse that tag, Beautiful Soup will | ||
719 | no longer try to rewrite the meta tag to mention the new | ||
720 | encoding. Basically, this makes SoupStrainers work in real-world | ||
721 | applications instead of crashing the parser. | ||
722 | |||
723 | = 3.0.0 "Who would not give all else for two p" (20060528) = | ||
724 | |||
725 | This release is not backward-compatible with previous releases. If | ||
726 | you've got code written with a previous version of the library, go | ||
727 | ahead and keep using it, unless one of the features mentioned here | ||
728 | really makes your life easier. Since the library is self-contained, | ||
729 | you can include an old copy of the library in your old applications, | ||
730 | and use the new version for everything else. | ||
731 | |||
732 | The documentation has been rewritten and greatly expanded with many | ||
733 | more examples. | ||
734 | |||
735 | Beautiful Soup autodetects the encoding of a document (or uses the one | ||
736 | you specify), and converts it from its native encoding to | ||
737 | Unicode. Internally, it only deals with Unicode strings. When you | ||
738 | print out the document, it converts to UTF-8 (or another encoding you | ||
739 | specify). [Doc reference] | ||
740 | |||
741 | It's now easy to make large-scale changes to the parse tree without | ||
742 | screwing up the navigation members. The methods are extract, | ||
743 | replaceWith, and insert. [Doc reference. See also Improving Memory | ||
744 | Usage with extract] | ||
745 | |||
746 | Passing True in as an attribute value gives you tags that have any | ||
747 | value for that attribute. You don't have to create a regular | ||
748 | expression. Passing None for an attribute value gives you tags that | ||
749 | don't have that attribute at all. | ||
750 | |||
751 | Tag objects now know whether or not they're self-closing. This avoids | ||
752 | the problem where Beautiful Soup thought that tags like <BR /> were | ||
753 | self-closing even in XML documents. You can customize the self-closing | ||
754 | tags for a parser object by passing them in as a list of | ||
755 | selfClosingTags: you don't have to subclass anymore. | ||
756 | |||
757 | There's a new built-in parser, MinimalSoup, which has most of | ||
758 | BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc | ||
759 | reference] | ||
760 | |||
761 | You can use a SoupStrainer to tell Beautiful Soup to parse only part | ||
762 | of a document. This saves time and memory, often making Beautiful Soup | ||
763 | about as fast as a custom-built SGMLParser subclass. [Doc reference, | ||
764 | SoupStrainer reference] | ||
765 | |||
766 | You can (usually) use keyword arguments instead of passing a | ||
767 | dictionary of attributes to a search method. That is, you can replace | ||
768 | soup(args={"id" : "5"}) with soup(id="5"). You can still use args if | ||
769 | (for instance) you need to find an attribute whose name clashes with | ||
770 | the name of an argument to findAll. [Doc reference: **kwargs attrs] | ||
771 | |||
772 | The method names have changed to the better method names used in | ||
773 | Rubyful Soup. Instead of find methods and fetch methods, there are | ||
774 | only find methods. Instead of a scheme where you can't remember which | ||
775 | method finds one element and which one finds them all, we have find | ||
776 | and findAll. In general, if the method name mentions All or a plural | ||
777 | noun (eg. findNextSiblings), then the method finds many | ||
778 | elements. Otherwise, it only finds one element. [Doc reference] | ||
779 | |||
780 | Some of the argument names have been renamed for clarity. For instance | ||
781 | avoidParserProblems is now parserMassage. | ||
782 | |||
783 | Beautiful Soup no longer implements a feed method. You need to pass a | ||
784 | string or a filehandle into the soup constructor, not with feed after | ||
785 | the soup has been created. There is still a feed method, but it's the | ||
786 | feed method implemented by SGMLParser and calling it will bypass | ||
787 | Beautiful Soup and cause problems. | ||
788 | |||
789 | The NavigableText class has been renamed to NavigableString. There is | ||
790 | no NavigableUnicodeString anymore, because every string inside a | ||
791 | Beautiful Soup parse tree is a Unicode string. | ||
792 | |||
793 | findText and fetchText are gone. Just pass a text argument into find | ||
794 | or findAll. | ||
795 | |||
796 | Null was more trouble than it was worth, so I got rid of it. Anything | ||
797 | that used to return Null now returns None. | ||
798 | |||
799 | Special XML constructs like comments and CDATA now have their own | ||
800 | NavigableString subclasses, instead of being treated as oddly-formed | ||
801 | data. If you parse a document that contains CDATA and write it back | ||
802 | out, the CDATA will still be there. | ||
803 | |||
804 | When you're parsing a document, you can get Beautiful Soup to convert | ||
805 | XML or HTML entities into the corresponding Unicode characters. [Doc | ||
806 | reference] | ||
807 | |||
808 | = 2.1.1 (20050918) = | ||
809 | |||
810 | Fixed a serious performance bug in BeautifulStoneSoup which was | ||
811 | causing parsing to be incredibly slow. | ||
812 | |||
813 | Corrected several entities that were previously being incorrectly | ||
814 | translated from Microsoft smart-quote-like characters. | ||
815 | |||
816 | Fixed a bug that was breaking text fetch. | ||
817 | |||
818 | Fixed a bug that crashed the parser when text chunks that look like | ||
819 | HTML tag names showed up within a SCRIPT tag. | ||
820 | |||
821 | THEAD, TBODY, and TFOOT tags are now nestable within TABLE | ||
822 | tags. Nested tables should parse more sensibly now. | ||
823 | |||
824 | BASE is now considered a self-closing tag. | ||
825 | |||
826 | = 2.1.0 "Game, or any other dish?" (20050504) = | ||
827 | |||
828 | Added a wide variety of new search methods which, given a starting | ||
829 | point inside the tree, follow a particular navigation member (like | ||
830 | nextSibling) over and over again, looking for Tag and NavigableText | ||
831 | objects that match certain criteria. The new methods are findNext, | ||
832 | fetchNext, findPrevious, fetchPrevious, findNextSibling, | ||
833 | fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings, | ||
834 | findParent, and fetchParents. All of these use the same basic code | ||
835 | used by first and fetch, so you can pass your weird ways of matching | ||
836 | things into these methods. | ||
837 | |||
838 | The fetch method and its derivatives now accept a limit argument. | ||
839 | |||
840 | You can now pass keyword arguments when calling a Tag object as though | ||
841 | it were a method. | ||
842 | |||
843 | Fixed a bug that caused all hand-created tags to share a single set of | ||
844 | attributes. | ||
845 | |||
846 | = 2.0.3 (20050501) = | ||
847 | |||
848 | Fixed Python 2.2 support for iterators. | ||
849 | |||
850 | Fixed a bug that gave the wrong representation to tags within quote | ||
851 | tags like <script>. | ||
852 | |||
853 | Took some code from Mark Pilgrim that treats CDATA declarations as | ||
854 | data instead of ignoring them. | ||
855 | |||
856 | Beautiful Soup's setup.py will now do an install even if the unit | ||
857 | tests fail. It won't build a source distribution if the unit tests | ||
858 | fail, so I can't release a new version unless they pass. | ||
859 | |||
860 | = 2.0.2 (20050416) = | ||
861 | |||
862 | Added the unit tests in a separate module, and packaged it with | ||
863 | distutils. | ||
864 | |||
865 | Fixed a bug that sometimes caused renderContents() to return a Unicode | ||
866 | string even if there was no Unicode in the original string. | ||
867 | |||
868 | Added the done() method, which closes all of the parser's open | ||
869 | tags. It gets called automatically when you pass in some text to the | ||
870 | constructor of a parser class; otherwise you must call it yourself. | ||
871 | |||
872 | Reinstated some backwards compatibility with 1.x versions: referencing | ||
873 | the string member of a NavigableText object returns the NavigableText | ||
874 | object instead of throwing an error. | ||
875 | |||
876 | = 2.0.1 (20050412) = | ||
877 | |||
878 | Fixed a bug that caused bad results when you tried to reference a tag | ||
879 | name shorter than 3 characters as a member of a Tag, eg. tag.table.td. | ||
880 | |||
881 | Made sure all Tags have the 'hidden' attribute so that an attempt to | ||
882 | access tag.hidden doesn't spawn an attempt to find a tag named | ||
883 | 'hidden'. | ||
884 | |||
885 | Fixed a bug in the comparison operator. | ||
886 | |||
887 | = 2.0.0 "Who cares for fish?" (20050410) = | ||
888 | |||
889 | Beautiful Soup version 1 was very useful but also pretty stupid. I | ||
890 | originally wrote it without noticing any of the problems inherent in | ||
891 | trying to build a parse tree out of ambiguous HTML tags. This version | ||
892 | solves all of those problems to my satisfaction. It also adds many new | ||
893 | clever things to make up for the removal of the stupid things. | ||
894 | |||
895 | == Parsing == | ||
896 | |||
897 | The parser logic has been greatly improved, and the BeautifulSoup | ||
898 | class should much more reliably yield a parse tree that looks like | ||
899 | what the page author intended. For a particular class of odd edge | ||
900 | cases that now causes problems, there is a new class, | ||
901 | ICantBelieveItsBeautifulSoup. | ||
902 | |||
903 | By default, Beautiful Soup now performs some cleanup operations on | ||
904 | text before parsing it. This is to avoid common problems with bad | ||
905 | definitions and self-closing tags that crash SGMLParser. You can | ||
906 | provide your own set of cleanup operations, or turn it off | ||
907 | altogether. The cleanup operations include fixing self-closing tags | ||
908 | that don't close, and replacing Microsoft smart quotes and similar | ||
909 | characters with their HTML entity equivalents. | ||
910 | |||
911 | You can now get a pretty-print version of parsed HTML to get a visual | ||
912 | picture of how Beautiful Soup parses it, with the Tag.prettify() | ||
913 | method. | ||
914 | |||
915 | == Strings and Unicode == | ||
916 | |||
917 | There are separate NavigableText subclasses for ASCII and Unicode | ||
918 | strings. These classes directly subclass the corresponding base data | ||
919 | types. This means you can treat NavigableText objects as strings | ||
920 | instead of having to call methods on them to get the strings. | ||
921 | |||
922 | str() on a Tag always returns a string, and unicode() always returns | ||
923 | Unicode. Previously it was inconsistent. | ||
924 | |||
925 | == Tree traversal == | ||
926 | |||
927 | In a first() or fetch() call, the tag name or the desired value of an | ||
928 | attribute can now be any of the following: | ||
929 | |||
930 | * A string (matches that specific tag or that specific attribute value) | ||
931 | * A list of strings (matches any tag or attribute value in the list) | ||
932 | * A compiled regular expression object (matches any tag or attribute | ||
933 | value that matches the regular expression) | ||
934 | * A callable object that takes the Tag object or attribute value as a | ||
935 | string. It returns None/false/empty string if the given string | ||
936 | doesn't match, and any other value if it does. | ||
937 | |||
938 | This is much easier to use than SQL-style wildcards (see, regular | ||
939 | expressions are good for something). Because of this, I took out | ||
940 | SQL-style wildcards. I'll put them back if someone complains, but | ||
941 | their removal simplifies the code a lot. | ||
942 | |||
943 | You can use fetch() and first() to search for text in the parse tree, | ||
944 | not just tags. There are new alias methods fetchText() and firstText() | ||
945 | designed for this purpose. As with searching for tags, you can pass in | ||
946 | a string, a regular expression object, or a method to match your text. | ||
947 | |||
948 | If you pass in something besides a map to the attrs argument of | ||
949 | fetch() or first(), Beautiful Soup will assume you want to match that | ||
950 | thing against the "class" attribute. When you're scraping | ||
951 | well-structured HTML, this makes your code a lot cleaner. | ||
952 | |||
953 | 1.x and 2.x both let you call a Tag object as a shorthand for | ||
954 | fetch(). For instance, foo("bar") is a shorthand for | ||
955 | foo.fetch("bar"). In 2.x, you can also access a specially-named member | ||
956 | of a Tag object as a shorthand for first(). For instance, foo.barTag | ||
957 | is a shorthand for foo.first("bar"). By chaining these shortcuts you | ||
958 | traverse a tree in very little code: for header in | ||
959 | soup.bodyTag.pTag.tableTag('th'): | ||
960 | |||
961 | If an element relationship (like parent or next) doesn't apply to a | ||
962 | tag, it'll now show up Null instead of None. first() will also return | ||
963 | Null if you ask it for a nonexistent tag. Null is an object that's | ||
964 | just like None, except you can do whatever you want to it and it'll | ||
965 | give you Null instead of throwing an error. | ||
966 | |||
967 | This lets you do tree traversals like soup.htmlTag.headTag.titleTag | ||
968 | without having to worry if the intermediate stages are actually | ||
969 | there. Previously, if there was no 'head' tag in the document, headTag | ||
970 | in that instance would have been None, and accessing its 'titleTag' | ||
971 | member would have thrown an AttributeError. Now, you can get what you | ||
972 | want when it exists, and get Null when it doesn't, without having to | ||
973 | do a lot of conditionals checking to see if every stage is None. | ||
974 | |||
975 | There are two new relations between page elements: previousSibling and | ||
976 | nextSibling. They reference the previous and next element at the same | ||
977 | level of the parse tree. For instance, if you have HTML like this: | ||
978 | |||
979 | <p><ul><li>Foo<br /><li>Bar</ul> | ||
980 | |||
981 | The first 'li' tag has a previousSibling of Null and its nextSibling | ||
982 | is the second 'li' tag. The second 'li' tag has a nextSibling of Null | ||
983 | and its previousSibling is the first 'li' tag. The previousSibling of | ||
984 | the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the | ||
985 | 'br' tag. | ||
986 | |||
987 | I took out the ability to use fetch() to find tags that have a | ||
988 | specific list of contents. See, I can't even explain it well. It was | ||
989 | really difficult to use, I never used it, and I don't think anyone | ||
990 | else ever used it. To the extent anyone did, they can probably use | ||
991 | fetchText() instead. If it turns out someone needs it I'll think of | ||
992 | another solution. | ||
993 | |||
994 | == Tree manipulation == | ||
995 | |||
996 | You can add new attributes to a tag, and delete attributes from a | ||
997 | tag. In 1.x you could only change a tag's existing attributes. | ||
998 | |||
999 | == Porting Considerations == | ||
1000 | |||
1001 | There are three changes in 2.0 that break old code: | ||
1002 | |||
1003 | In the post-1.2 release you could pass in a function into fetch(). The | ||
1004 | function took a string, the tag name. In 2.0, the function takes the | ||
1005 | actual Tag object. | ||
1006 | |||
1007 | It's no longer possible to pass in SQL-style wildcards to fetch(). Use a | ||
1008 | regular expression instead. | ||
1009 | |||
1010 | The different parsing algorithm means the parse tree may not be shaped | ||
1011 | like you expect. This will only actually affect you if your code uses | ||
1012 | one of the affected parts. I haven't run into this problem yet while | ||
1013 | porting my code. | ||
1014 | |||
1015 | = Between 1.2 and 2.0 = | ||
1016 | |||
1017 | This is the release to get if you want Python 1.5 compatibility. | ||
1018 | |||
1019 | The desired value of an attribute can now be any of the following: | ||
1020 | |||
1021 | * A string | ||
1022 | * A string with SQL-style wildcards | ||
1023 | * A compiled RE object | ||
1024 | * A callable that returns None/false/empty string if the given value | ||
1025 | doesn't match, and any other value otherwise. | ||
1026 | |||
1027 | This is much easier to use than SQL-style wildcards (see, regular | ||
1028 | expressions are good for something). Because of this, I no longer | ||
1029 | recommend you use SQL-style wildcards. They may go away in a future | ||
1030 | release to clean up the code. | ||
1031 | |||
1032 | Made Beautiful Soup handle processing instructions as text instead of | ||
1033 | ignoring them. | ||
1034 | |||
1035 | Applied patch from Richie Hindle (richie at entrian dot com) that | ||
1036 | makes tag.string a shorthand for tag.contents[0].string when the tag | ||
1037 | has only one string-owning child. | ||
1038 | |||
1039 | Added still more nestable tags. The nestable tags thing won't work in | ||
1040 | a lot of cases and needs to be rethought. | ||
1041 | |||
1042 | Fixed an edge case where searching for "%foo" would match any string | ||
1043 | shorter than "foo". | ||
1044 | |||
1045 | = 1.2 "Who for such dainties would not stoop?" (20040708) = | ||
1046 | |||
1047 | Applied patch from Ben Last (ben at benlast dot com) that made | ||
1048 | Tag.renderContents() correctly handle Unicode. | ||
1049 | |||
1050 | Made BeautifulStoneSoup even dumber by making it not implicitly close | ||
1051 | a tag when another tag of the same type is encountered; only when an | ||
1052 | actual closing tag is encountered. This change courtesy of Fuzzy (mike | ||
1053 | at pcblokes dot com). BeautifulSoup still works as before. | ||
1054 | |||
1055 | = 1.1 "Swimming in a hot tureen" = | ||
1056 | |||
1057 | Added more 'nestable' tags. Changed popping semantics so that when a | ||
1058 | nestable tag is encountered, tags are popped up to the previously | ||
1059 | encountered nestable tag (of whatever kind). I will revert this if | ||
1060 | enough people complain, but it should make more people's lives easier | ||
1061 | than harder. This enhancement was suggested by Anthony Baxter (anthony | ||
1062 | at interlink dot com dot au). | ||
1063 | |||
1064 | = 1.0 "So rich and green" (20040420) = | ||
1065 | |||
1066 | Initial release. | ||
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py new file mode 100644 index 0000000000..7ba34269af --- /dev/null +++ b/bitbake/lib/bs4/__init__.py | |||
@@ -0,0 +1,406 @@ | |||
1 | """Beautiful Soup | ||
2 | Elixir and Tonic | ||
3 | "The Screen-Scraper's Friend" | ||
4 | http://www.crummy.com/software/BeautifulSoup/ | ||
5 | |||
6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a | ||
7 | (possibly invalid) document into a tree representation. Beautiful Soup | ||
8 | provides methods and Pythonic idioms that make it easy to | ||
9 | navigate, search, and modify the parse tree. | ||
10 | |||
11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml | ||
12 | and/or html5lib is installed. | ||
13 | |||
14 | For more than you ever wanted to know about Beautiful Soup, see the | ||
15 | documentation: | ||
16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ | ||
17 | """ | ||
18 | |||
19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" | ||
20 | __version__ = "4.3.2" | ||
21 | __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" | ||
22 | __license__ = "MIT" | ||
23 | |||
24 | __all__ = ['BeautifulSoup'] | ||
25 | |||
26 | import os | ||
27 | import re | ||
28 | import warnings | ||
29 | |||
30 | from .builder import builder_registry, ParserRejectedMarkup | ||
31 | from .dammit import UnicodeDammit | ||
32 | from .element import ( | ||
33 | CData, | ||
34 | Comment, | ||
35 | DEFAULT_OUTPUT_ENCODING, | ||
36 | Declaration, | ||
37 | Doctype, | ||
38 | NavigableString, | ||
39 | PageElement, | ||
40 | ProcessingInstruction, | ||
41 | ResultSet, | ||
42 | SoupStrainer, | ||
43 | Tag, | ||
44 | ) | ||
45 | |||
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): presumably the mechanism is the u'' prefix on the literal
# below -- a Python 3 interpreter that rejects that prefix stops with a
# SyntaxError on this line, and the error report then quotes the line,
# message included. Nothing in the visible part of the file reads this
# name, so the assignment matters only for that side effect -- confirm
# before removing or reformatting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49 | |||
50 | class BeautifulSoup(Tag): | ||
51 | """ | ||
52 | This class defines the basic interface called by the tree builders. | ||
53 | |||
54 | These methods will be called by the parser: | ||
55 | reset() | ||
56 | feed(markup) | ||
57 | |||
58 | The tree builder may call these methods from its feed() implementation: | ||
59 | handle_starttag(name, attrs) # See note about return value | ||
60 | handle_endtag(name) | ||
61 | handle_data(data) # Appends to the current data node | ||
62 | endData(containerClass=NavigableString) # Ends the current data node | ||
63 | |||
64 | No matter how complicated the underlying parser is, you should be | ||
65 | able to build a tree using 'start tag' events, 'end tag' events, | ||
66 | 'data' events, and "done with data" events. | ||
67 | |||
68 | If you encounter an empty-element tag (aka a self-closing tag, | ||
69 | like HTML's <br> tag), call handle_starttag and then | ||
70 | handle_endtag. | ||
71 | """ | ||
72 | ROOT_TAG_NAME = u'[document]' | ||
73 | |||
74 | # If the end-user gives no indication which tree builder they | ||
75 | # want, look for one with these features. | ||
76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] | ||
77 | |||
78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' | ||
79 | |||
def __init__(self, markup="", features=None, builder=None,
             parse_only=None, from_encoding=None, **kwargs):
    """Initialize the soup as the 'root tag' and parse the markup.

    The provided markup (which can be a string or a file-like object)
    is fed into the underlying parser.

    Arguments:
     markup -- A string, or an object with a read() method whose
       result is used as the markup.
     features -- A feature name, or a list of feature names, used to
       look up a tree builder in the builder registry when `builder`
       is not given. Defaults to DEFAULT_BUILDER_FEATURES.
     builder -- A tree builder instance to use as-is, bypassing the
       registry lookup.
     parse_only -- Stored on the soup as self.parse_only. The old
       name parseOnlyThese is still accepted, with a warning.
     from_encoding -- Handed to the builder's prepare_markup() along
       with the markup. The old name fromEncoding is still accepted,
       with a warning.
     kwargs -- Only there to catch Beautiful Soup 3 argument names.
       convertEntities, markupMassage, smartQuotesTo, selfClosingTags
       and isHTML are discarded with a warning.

    Raises TypeError for any other keyword argument, and
    FeatureNotFound when no registered tree builder offers the
    requested features.
    """
    # Beautiful Soup 3 constructor arguments that no longer do
    # anything, each with the explanation appended to its warning.
    # Every one of them is removed from kwargs so that passing it
    # produces a warning rather than the TypeError raised below for
    # genuinely unknown arguments.
    ignored_bs3_arguments = (
        ('convertEntities',
         "Entities are always converted to Unicode characters."),
        ('markupMassage',
         "The tree builder is responsible for any necessary markup "
         "massage."),
        ('smartQuotesTo',
         "Smart quotes are always converted to Unicode characters."),
        ('selfClosingTags',
         "The tree builder is responsible for understanding "
         "self-closing tags."),
        ('isHTML',
         "You can pass in features='html' or features='xml' to get a "
         "builder capable of handling one or the other."),
    )
    for old_name, advice in ignored_bs3_arguments:
        if old_name in kwargs:
            del kwargs[old_name]
            warnings.warn(
                "BS4 does not respect the %s argument to the "
                "BeautifulSoup constructor. %s" % (old_name, advice))

    def deprecated_argument(old_name, new_name):
        # Pull a renamed argument out of kwargs, warning about the
        # rename. Returns None when the old name was not used.
        if old_name in kwargs:
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            return kwargs.pop(old_name)
        return None

    parse_only = parse_only or deprecated_argument(
        "parseOnlyThese", "parse_only")

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding")

    # Anything still left in kwargs is not an argument we know about.
    if kwargs:
        arg = list(kwargs).pop()
        raise TypeError(
            "__init__() got an unexpected keyword argument '%s'" % arg)

    if builder is None:
        # No builder was handed in: find one by feature.
        if isinstance(features, basestring):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        builder_class = builder_registry.lookup(*features)
        if builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features))
        builder = builder_class()
    self.builder = builder
    self.is_xml = builder.is_xml
    self.builder.soup = self

    self.parse_only = parse_only

    if hasattr(markup, 'read'):        # It's a file-type object.
        markup = markup.read()
    elif len(markup) <= 256:
        # Print out warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # just in case that's what the user really wants.
        if (isinstance(markup, unicode)
            and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        is_file = False
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. The check is only advisory, so treat the
            # markup as "not a file" and carry on.
            pass
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
        if markup[:5] == "http:" or markup[:6] == "https:":
            # TODO: This is ugly but I couldn't get it to work in
            # Python 3 otherwise.
            if ((isinstance(markup, bytes) and b' ' not in markup)
                or (isinstance(markup, unicode) and u' ' not in markup)):
                warnings.warn(
                    '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

    # The builder offers one or more candidate versions of the
    # markup; take the first one the parser does not reject.
    for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
            self.builder.prepare_markup(markup, from_encoding)):
        self.reset()
        try:
            self._feed()
            break
        except ParserRejectedMarkup:
            pass

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None
205 | |||
206 | def _feed(self): | ||
207 | # Convert the document to Unicode. | ||
208 | self.builder.reset() | ||
209 | |||
210 | self.builder.feed(self.markup) | ||
211 | # Close out any unfinished strings and close all the open tags. | ||
212 | self.endData() | ||
213 | while self.currentTag.name != self.ROOT_TAG_NAME: | ||
214 | self.popTag() | ||
215 | |||
def reset(self):
    """Reinitialise this object as an empty root tag and clear parse state."""
    # Re-run the Tag constructor on ourselves, using the root tag name.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = 1
    self.builder.reset()
    self.current_data = []
    self.currentTag = None
    self.tagStack = []
    self.preserve_whitespace_tag_stack = []
    # currentTag is None at this point, so pushTag() only records this
    # object at the bottom of the tag stack and makes it the current tag.
    self.pushTag(self)
225 | |||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
    """Return a freshly built Tag that uses this soup's tree builder.

    Extra keyword arguments become the tag's attributes.
    """
    created = Tag(None, self.builder, name, namespace, nsprefix, attrs)
    return created
229 | |||
def new_string(self, s, subclass=NavigableString):
    """Wrap `s` in `subclass` (NavigableString by default) and set it up."""
    text_node = subclass(s)
    text_node.setup()
    return text_node
235 | |||
def insert_before(self, successor):
    """Always raise: the root object cannot be given siblings."""
    message = "BeautifulSoup objects don't support insert_before()."
    raise NotImplementedError(message)
238 | |||
def insert_after(self, successor):
    """Always raise: the root object cannot be given siblings."""
    message = "BeautifulSoup objects don't support insert_after()."
    raise NotImplementedError(message)
241 | |||
def popTag(self):
    """Remove the innermost open tag and return the new current tag.

    Note that the return value is the tag that is current *after* the
    pop, not the tag that was removed.
    """
    removed = self.tagStack.pop()
    whitespace_stack = self.preserve_whitespace_tag_stack
    if whitespace_stack and removed == whitespace_stack[-1]:
        whitespace_stack.pop()
    # If the stack is now empty, currentTag is left as it was.
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
250 | |||
def pushTag(self, tag):
    """Open `tag`: attach it under the current tag and make it current."""
    parent = self.currentTag
    if parent:
        parent.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = tag
    # Remember tags inside which whitespace must be left untouched.
    if tag.name in self.builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
259 | |||
def endData(self, containerClass=NavigableString):
    """Turn the text collected so far into a node and add it to the tree.

    Does nothing when no text has been collected. The collected text is
    wrapped in `containerClass` and passed to object_was_parsed(), unless
    the parse_only filter rejects it.
    """
    if not self.current_data:
        return

    text = u''.join(self.current_data)
    # Outside whitespace-preserving tags, a run made up only of ASCII
    # spaces collapses to a single newline or a single space.
    if not self.preserve_whitespace_tag_stack:
        if all(ch in self.ASCII_SPACES for ch in text):
            text = '\n' if '\n' in text else ' '

    # Reset the data collector.
    self.current_data = []

    # At the top of the tree, only keep strings the filter asks for.
    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1:
        if not strainer.text or not strainer.search(text):
            return

    self.object_was_parsed(containerClass(text))
289 | |||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
    """Add an object to the parse tree.

    `parent` falls back to the current tag and `most_recent_element`
    to the element most recently added by this soup.
    """
    if not parent:
        parent = self.currentTag
    if not most_recent_element:
        most_recent_element = self._most_recent_element

    o.setup(parent, most_recent_element)

    # Link the previously parsed element forward to the new object.
    if most_recent_element is not None:
        most_recent_element.next_element = o
    self._most_recent_element = o
    parent.contents.append(o)
300 | |||
301 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): | ||
302 | """Pops the tag stack up to and including the most recent | ||
303 | instance of the given tag. If inclusivePop is false, pops the tag | ||
304 | stack up to but *not* including the most recent instqance of | ||
305 | the given tag.""" | ||
306 | #print "Popping to %s" % name | ||
307 | if name == self.ROOT_TAG_NAME: | ||
308 | # The BeautifulSoup object itself can never be popped. | ||
309 | return | ||
310 | |||
311 | most_recently_popped = None | ||
312 | |||
313 | stack_size = len(self.tagStack) | ||
314 | for i in range(stack_size - 1, 0, -1): | ||
315 | t = self.tagStack[i] | ||
316 | if (name == t.name and nsprefix == t.prefix): | ||
317 | if inclusivePop: | ||
318 | most_recently_popped = self.popTag() | ||
319 | break | ||
320 | most_recently_popped = self.popTag() | ||
321 | |||
322 | return most_recently_popped | ||
323 | |||
def handle_starttag(self, name, namespace, nsprefix, attrs):
    """Push a start tag on to the stack.

    If this method returns None, the tag was rejected by the
    SoupStrainer. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.
    """
    # Any text collected before this tag belongs to the previous node.
    self.endData()

    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1:
        if strainer.text or not strainer.search_tag(name, attrs):
            return None

    tag_obj = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
    if tag_obj is None:
        return tag_obj

    previous = self._most_recent_element
    if previous:
        previous.next_element = tag_obj
    self._most_recent_element = tag_obj
    self.pushTag(tag_obj)
    return tag_obj
350 | |||
def handle_endtag(self, name, nsprefix=None):
    """Flush pending text, then pop the stack back past the named tag."""
    self.endData()
    self._popToTag(name, nsprefix)
355 | |||
def handle_data(self, data):
    """Queue a chunk of text; endData() later joins the queued chunks."""
    pending = self.current_data
    pending.append(data)
358 | |||
def decode(self, pretty_print=False,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a string or Unicode representation of this document.
    To get Unicode, pass None for encoding.

    For XML documents the output starts with an XML declaration, which
    names `eventual_encoding` unless that is None.
    """
    if self.is_xml:
        # Print the XML declaration
        encoding_part = ''
        if eventual_encoding is not None:
            encoding_part = ' encoding="%s"' % eventual_encoding
        prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = u''

    # The parent class takes an indent level: None when pretty-printing
    # is off, 0 to start pretty-printing at the left margin.
    if not pretty_print:
        indent_level = None
    else:
        indent_level = 0
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter)
379 | |||
380 | # Alias to make it easier to type import: 'from bs4 import _soup' | ||
381 | _s = BeautifulSoup | ||
382 | _soup = BeautifulSoup | ||
383 | |||
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        # Always ask for the XML builder, replacing any caller-supplied
        # 'features' value.
        kwargs.update(features='xml')
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
393 | |||
394 | |||
class StopParsing(Exception):
    """Exception type exported by this module; it carries no extra state."""
397 | |||
class FeatureNotFound(ValueError):
    """Raised by the constructor when no tree builder has the requested features."""
400 | |||
401 | |||
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Parse the document arriving on standard input and print it back
    # out in prettified form.
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py new file mode 100644 index 0000000000..740f5f29cd --- /dev/null +++ b/bitbake/lib/bs4/builder/__init__.py | |||
@@ -0,0 +1,321 @@ | |||
1 | from collections import defaultdict | ||
2 | import itertools | ||
3 | import sys | ||
4 | from bs4.element import ( | ||
5 | CharsetMetaAttributeValue, | ||
6 | ContentMetaAttributeValue, | ||
7 | whitespace_re | ||
8 | ) | ||
9 | |||
10 | __all__ = [ | ||
11 | 'HTMLTreeBuilder', | ||
12 | 'SAXTreeBuilder', | ||
13 | 'TreeBuilder', | ||
14 | 'TreeBuilderRegistry', | ||
15 | ] | ||
16 | |||
17 | # Some useful features for a TreeBuilder to have. | ||
18 | FAST = 'fast' | ||
19 | PERMISSIVE = 'permissive' | ||
20 | STRICT = 'strict' | ||
21 | XML = 'xml' | ||
22 | HTML = 'html' | ||
23 | HTML_5 = 'html5' | ||
24 | |||
25 | |||
class TreeBuilderRegistry(object):
    """Keeps track of tree builder classes and finds them by feature."""

    def __init__(self):
        # feature name -> builder classes, most recently registered first
        self.builders_for_feature = defaultdict(list)
        # every registered builder class, most recently registered first
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the best builder class for `features`, or None.

        With no features, the most recently registered builder wins.
        Features that no builder advertises are ignored. Among builders
        that have every remaining feature, the one listed first for the
        first recognised feature is returned.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        ordered = None   # builders for the first recognised feature
        allowed = None   # builders that have every feature seen so far
        for feature in features:
            # .get() avoids creating empty entries in the defaultdict.
            providers = self.builders_for_feature.get(feature, [])
            if not providers:
                continue
            if ordered is None:
                ordered = providers
                allowed = set(providers)
            else:
                # Eliminate any candidates that don't have this feature.
                allowed = allowed.intersection(providers)

        if allowed is None:
            return None
        # Preserve registration priority: pick the first builder from
        # the ordered list that survived every intersection.
        for candidate in ordered:
            if candidate in allowed:
                return candidate
        return None
75 | |||
76 | # The BeautifulSoup class will take feature lists from developers and use them | ||
77 | # to look up builders in this registry. | ||
78 | builder_registry = TreeBuilderRegistry() | ||
79 | |||
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}


    def __init__(self):
        self.soup = None

    def reset(self):
        """Hook for subclasses; the base builder keeps no state to clear."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse `markup`; must be provided by a concrete subclass."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield candidate parses of `markup` as 4-tuples.

        Each tuple is (markup, original encoding, declared encoding,
        contains-replacement-characters flag). The BeautifulSoup
        constructor iterates over the result and unpacks every item
        into four values, so this has to be an iterable of tuples
        rather than one bare tuple. The base builder offers the markup
        unchanged, exactly once.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for subclasses; the base builder makes no substitutions."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            # Iterate over a copy of the keys because values are
            # reassigned inside the loop.
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
174 | |||
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        """Parse `markup`; must be provided by a concrete subclass."""
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward an element start to the soup.

        Attribute keys are indexed with [1] to drop their first
        component, leaving plain names in the dict given to the soup.
        """
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        # handle_starttag takes (name, namespace, nsprefix, attrs); no
        # namespace information is tracked here, so pass None for both.
        self.soup.handle_starttag(name, None, None, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
219 | |||
220 | |||
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace encoding-bearing <meta> attribute values with stand-ins.

        Returns True only when a `charset` attribute was found.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            # NOTE(review): meta_encoding is not set on this branch, so
            # the method returns False even though a substitution was
            # installed -- confirm whether callers rely on that.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
287 | |||
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that is a TreeBuilder subclass is
    set as an attribute of bs4.builder, appended to its __all__, and
    registered with the builder registry.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for exported_name in module.__all__:
        exported = getattr(module, exported_name)
        if not issubclass(exported, TreeBuilder):
            continue
        setattr(this_module, exported_name, exported)
        this_module.__all__.append(exported_name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(exported)
300 | |||
class ParserRejectedMarkup(Exception):
    """Signals that a parser could not handle the markup it was fed.

    The BeautifulSoup constructor catches it and moves on to the next
    candidate produced by prepare_markup().
    """
303 | |||
304 | # Builders are registered in reverse order of priority, so that custom | ||
305 | # builder registrations will take precedence. In general, we want lxml | ||
306 | # to take precedence over html5lib, because it's faster. And we only | ||
307 | # want to use HTMLParser as a last resort. | ||
308 | from . import _htmlparser | ||
309 | register_treebuilders_from(_htmlparser) | ||
310 | try: | ||
311 | from . import _html5lib | ||
312 | register_treebuilders_from(_html5lib) | ||
313 | except ImportError: | ||
314 | # They don't have html5lib installed. | ||
315 | pass | ||
316 | try: | ||
317 | from . import _lxml | ||
318 | register_treebuilders_from(_lxml) | ||
319 | except ImportError: | ||
320 | # They don't have lxml installed. | ||
321 | pass | ||
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py new file mode 100644 index 0000000000..7de36ae75e --- /dev/null +++ b/bitbake/lib/bs4/builder/_html5lib.py | |||
@@ -0,0 +1,285 @@ | |||
1 | __all__ = [ | ||
2 | 'HTML5TreeBuilder', | ||
3 | ] | ||
4 | |||
5 | import warnings | ||
6 | from bs4.builder import ( | ||
7 | PERMISSIVE, | ||
8 | HTML, | ||
9 | HTML_5, | ||
10 | HTMLTreeBuilder, | ||
11 | ) | ||
12 | from bs4.element import NamespacedAttribute | ||
13 | import html5lib | ||
14 | from html5lib.constants import namespaces | ||
15 | from bs4.element import ( | ||
16 | Comment, | ||
17 | Doctype, | ||
18 | NavigableString, | ||
19 | Tag, | ||
20 | ) | ||
21 | |||
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Yield the markup untouched, remembering the requested encoding."""
        # feed() passes this encoding on to html5lib later.
        self.user_specified_encoding = user_specified_encoding
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, building the tree into self.soup."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        html_parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        document = html_parser.parse(
            markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        # We need to special-case Unicode input because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        document.original_encoding = (
            None if isinstance(markup, unicode)
            else html_parser.tokenizer.stream.charEncoding[0])

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory handed to html5lib; keeps a reference to what it builds."""
        underlying = TreeBuilderForHtml5lib(self.soup, namespaceHTMLElements)
        self.underlying_builder = underlying
        return underlying

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
55 | |||
56 | |||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that builds Beautiful Soup objects."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and wrap it as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from the token and add it to the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag through the soup and wrap it in an Element."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap a Comment built from `data` in a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace self.soup with a fresh, empty soup used as fragment root."""
        # Imported here because this module does not import
        # BeautifulSoup at file level.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind html5lib's fragment node."""
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
96 | |||
class AttrList(object):
    """Dict-like view of an element's attributes.

    Reads come from a copy of the element's attributes taken at
    construction time. Writes go straight to the element and are not
    reflected in that copy.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterates over (name, value) pairs, not just names.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
116 | |||
117 | |||
class Element(html5lib.treebuilders._base.Node):
    """html5lib node wrapping a Beautiful Soup object (`self.element`)."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of the wrapped element.

        `node` may be a plain string, a Tag, or a wrapper object with
        an `.element` attribute. A string appended right after another
        NavigableString is merged into it.
        """
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, basestring) and child.parent is not None:
            # Detach the child from its old parent. `child` is used
            # rather than `node.element` so that this also works when
            # a bare Tag was passed in.
            child.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple-valued names are converted to NamespacedAttribute and
        list-type attribute values are split, both in place in the
        mapping that was passed in.
        """
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy because the mapping is modified below.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert the string `data`, before `insertBefore` or at the end."""
        if insertBefore:
            # insertBefore() reads `.element` from its first argument,
            # so it must be given the wrapper rather than the raw string.
            text = TextNode(self.soup.new_string(data), self.soup)
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before `refNode` among the children."""
        index = self.element.index(refNode.element)
        # Only merge with a preceding string when one actually exists;
        # at index 0, contents[index-1] would wrap around to the last
        # child.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            first_child.previous_element = new_parents_last_descendant
            first_child.previous_sibling = new_parents_last_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        # Iterating the attributes property yields (name, value) pairs.
        for key,value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; it is truthy when non-empty.
        return self.element.contents

    def getNameTuple(self):
        """Return (namespace, name), defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
277 | |||
class TextNode(Element):
    """Element variant wrapping a string-like object; it has no node name."""

    def __init__(self, element, soup):
        # Skip Element.__init__ and initialise the html5lib base node
        # directly, with no name.
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        raise NotImplementedError
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py new file mode 100644 index 0000000000..ca8d8b892b --- /dev/null +++ b/bitbake/lib/bs4/builder/_htmlparser.py | |||
@@ -0,0 +1,258 @@ | |||
1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" | ||
2 | |||
3 | __all__ = [ | ||
4 | 'HTMLParserTreeBuilder', | ||
5 | ] | ||
6 | |||
7 | from HTMLParser import ( | ||
8 | HTMLParser, | ||
9 | HTMLParseError, | ||
10 | ) | ||
11 | import sys | ||
12 | import warnings | ||
13 | |||
14 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' | ||
15 | # argument, which we'd like to set to False. Unfortunately, | ||
16 | # http://bugs.python.org/issue13273 makes strict=True a better bet | ||
17 | # before Python 3.2.3. | ||
18 | # | ||
19 | # At the end of this file, we monkeypatch HTMLParser so that | ||
20 | # strict=True works well on Python 3.2.2. | ||
# Interpreter version components; also used by the monkeypatch guard at the
# end of this file.
major, minor, release = sys.version_info[:3]
# True on Python 3.2.3 and anything newer (lexicographic tuple comparison).
CONSTRUCTOR_TAKES_STRICT = (major, minor, release) >= (3, 2, 3)
26 | |||
27 | from bs4.element import ( | ||
28 | CData, | ||
29 | Comment, | ||
30 | Declaration, | ||
31 | Doctype, | ||
32 | ProcessingInstruction, | ||
33 | ) | ||
34 | from bs4.dammit import EntitySubstitution, UnicodeDammit | ||
35 | |||
36 | from bs4.builder import ( | ||
37 | HTML, | ||
38 | HTMLTreeBuilder, | ||
39 | STRICT, | ||
40 | ) | ||
41 | |||
42 | |||
# Feature string listed in HTMLParserTreeBuilder.features for this builder.
HTMLPARSER = 'html.parser'
44 | |||
class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to ``self.soup``.

    The ``soup`` attribute is not set by this class; it must be assigned on
    the instance before any markup is fed to it.
    """

    def handle_starttag(self, name, attrs):
        """Open tag `name` on the soup, passing `attrs` as a dict."""
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        """Close tag `name` on the soup."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Pass character data through to the soup."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Convert a numeric character reference to text and emit it as data.

        `name` is the reference without ``&#`` and ``;``; a leading ``x`` or
        ``X`` marks it as hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            # Code point cannot be represented; substitute U+FFFD.
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Emit the character for a named entity, or the raw ``&name;`` text
        when the entity is not in the lookup table."""
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Flush pending text, then record `data` as a Comment."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Flush pending text, then record a declaration as a Doctype,
        dropping a leading ``DOCTYPE`` keyword."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Record a ``CDATA[`` section as CData and anything else as a
        Declaration."""
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Flush pending text, then record a ProcessingInstruction."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
126 | |||
127 | |||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder that parses markup with BeautifulSoupHTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Remember the arguments to use for each parser created by feed()."""
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield one parsing strategy for `markup`.

        :yield: A 4-tuple (markup, original encoding, encoding
         declared within markup, whether any characters had to be
         replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            # Already text: nothing to detect or convert.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse `markup` with a fresh parser wired to ``self.soup``.

        :raises HTMLParseError: re-raised, after a RuntimeWarning is
         issued, when the built-in parser rejects the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback intact.
            raise
165 | |||
166 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some | ||
167 | # 3.2.3 code. This ensures they don't treat markup like <p></p> as a | ||
168 | # string. | ||
169 | # | ||
170 | # XXX This code can be removed once most Python 3 users are on 3.2.3. | ||
# Only runs on Python 3.2.0-3.2.2: CONSTRUCTOR_TAKES_STRICT is False there.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Lenient attribute matcher used by parse_starttag() below when the
    # parser is not in strict mode.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this is stored on the tree builder class, while the other
    # patches below target BeautifulSoupHTMLParser. parse_starttag() reads the
    # module-level name, not this attribute -- confirm the target is intended.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        # Replacement for HTMLParser.parse_starttag: parses the start tag that
        # begins at rawdata[i] and returns the index just past it, or a
        # negative value when the tag is not yet complete.
        #
        # NOTE(review): this function is defined outside a class body, so
        # "self.__starttag_text" is not name-mangled and sets a plain
        # "__starttag_text" attribute -- confirm nothing relies on the
        # mangled "_HTMLParser__starttag_text" attribute being updated.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Leftover text that is not a tag terminator: compute the error
            # position, then (in non-strict mode) emit the whole tag as data.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        # Remember the element name and make "interesting" match only that
        # element's closing tag (case-insensitively).
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches installed, HTMLParserTreeBuilder.__init__ will pass
    # strict=False from now on.
    CONSTRUCTOR_TAKES_STRICT = True
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py new file mode 100644 index 0000000000..fa5d49875e --- /dev/null +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
@@ -0,0 +1,233 @@ | |||
1 | __all__ = [ | ||
2 | 'LXMLTreeBuilderForXML', | ||
3 | 'LXMLTreeBuilder', | ||
4 | ] | ||
5 | |||
6 | from io import BytesIO | ||
7 | from StringIO import StringIO | ||
8 | import collections | ||
9 | from lxml import etree | ||
10 | from bs4.element import Comment, Doctype, NamespacedAttribute | ||
11 | from bs4.builder import ( | ||
12 | FAST, | ||
13 | HTML, | ||
14 | HTMLTreeBuilder, | ||
15 | PERMISSIVE, | ||
16 | ParserRejectedMarkup, | ||
17 | TreeBuilder, | ||
18 | XML) | ||
19 | from bs4.dammit import EncodingDetector | ||
20 | |||
# Feature string listed in the `features` of both lxml-based tree builders.
LXML = 'lxml'
22 | |||
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that parses XML with lxml, acting as the parser target.

    The parser is created with ``target=self``, so the start(), end(),
    data(), comment(), doctype(), pi() and close() methods below receive the
    parse events and translate them into calls on ``self.soup``.
    """
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Number of bytes/characters handed to the parser per feed() call.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    def default_parser(self, encoding):
        """Return the parser given to the constructor, or a new XMLParser.

        This can either return a parser object or a class, which
        will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Return a parser object to use for a document in `encoding`."""
        # Use the default parser.
        parser = self.default_parser(encoding)

        # callable() replaces isinstance(parser, collections.Callable), which
        # no longer exists on current Python versions.
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

    def __init__(self, parser=None, empty_element_tags=None):
        """Store the optional parser (object or class) and reset state."""
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (namespace URL -> prefix) maps; None entries are
        # placeholders pushed for tags that declare no namespaces.
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        """Split ``{namespace}name`` into ``(namespace, name)``.

        Names without a leading ``{`` yield ``(None, tag)``.
        Copied from lxml's src/lxml/sax.py.
        """
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)

        Each 4-tuple represents a strategy for parsing the document.
        """
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(markup, try_encodings, is_html)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Feed `markup` to a new parser in CHUNK_SIZE pieces.

        :raises ParserRejectedMarkup: if the parser raises
         UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, unicode):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def close(self):
        """Reset the namespace stack to its initial state."""
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap=None):
        """Handle an opening tag reported by the parser.

        :param name: Tag name, possibly in ``{namespace}name`` form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix -> namespace URL newly declared on
         this tag; None or empty when the tag declares none.
        """
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            # (The nsmap check matters: without it, a nested tag that does
            # declare namespaces would have them discarded here.)
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        Returns None when `namespace` is None or no active map contains it.
        """
        if namespace is None:
            return None
        # Search innermost declarations first.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag reported by the parser."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Ignore processing instructions."""
        pass

    def data(self, content):
        """Pass character data through to the soup."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Flush pending text and add a Doctype object to the soup."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
211 | |||
212 | |||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML flavour of the lxml tree builder, using etree.HTMLParser."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        """Return the HTMLParser class; parser_for() instantiates it."""
        return etree.HTMLParser

    def feed(self, markup):
        """Feed all of `markup` to a new parser in a single call.

        :raises ParserRejectedMarkup: if the parser raises
         UnicodeDecodeError, LookupError or etree.ParserError.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py new file mode 100644 index 0000000000..59640b7ce3 --- /dev/null +++ b/bitbake/lib/bs4/dammit.py | |||
@@ -0,0 +1,829 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Beautiful Soup bonus library: Unicode, Dammit | ||
3 | |||
4 | This library converts a bytestream to Unicode through any means | ||
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | ||
6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the | ||
7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | ||
8 | """ | ||
9 | |||
10 | import codecs | ||
11 | from htmlentitydefs import codepoint2name | ||
12 | import re | ||
13 | import logging | ||
14 | import string | ||
15 | |||
# Import a library to autodetect character encodings.
#
# Whichever branch below succeeds defines chardet_dammit(s), which returns
# the detector's 'encoding' result for s, or always None when neither
# library can be imported.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            return None
38 | |||
39 | # Available from http://cjkpython.i18n.org/. | ||
40 | try: | ||
41 | import iconv_codec | ||
42 | except ImportError: | ||
43 | pass | ||
44 | |||
# Byte patterns that capture a declared encoding name in group 1.
# Raw strings keep the regex escapes (\?, \s) from being read as
# (invalid) string escape sequences.

# encoding="..." inside an XML declaration at the very start of the data.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
# charset=... inside an HTML <meta> tag.
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
49 | |||
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity tables from ``codepoint2name``.

        Returns (character -> entity name, entity name -> character,
        compiled regex matching any character that has an entity name,
        except the double quote).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Angle brackets, plus any ampersand that does not start a numeric,
    # hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
193 | |||
194 | |||
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Store the settings and strip any byte-order mark from `markup`.

        :param markup: The data whose encoding is to be guessed.
        :param override_encodings: Encodings to suggest before any other.
        :param is_html: If True, <meta> tags are also searched for a
         declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True if `encoding` is not None and has not been tried yet.

        The lowercased name is added to the `tried` set as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :return: A 2-tuple (data without the mark, encoding name or None).
        """
        encoding = None
        # The UTF-16 checks must reject a following pair of NUL bytes, which
        # would make the mark a UTF-32 one. The comparison uses a bytes
        # literal: a text literal never equals a bytes slice on Python 3,
        # which made UTF-32LE data look like UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :return: The declared encoding in lowercase, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only look at the start of the document: the first 1024 units
            # for XML, and the larger of 2048 or 5% of the document for HTML.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii')
        if declared_encoding:
            return declared_encoding.lower()
        return None
313 | |||
314 | class UnicodeDammit: | ||
315 | """A class for detecting the encoding of a *ML document and | ||
316 | converting it to a Unicode string. If the source encoding is | ||
317 | windows-1252, can replace MS smart quotes with their HTML or XML | ||
318 | equivalents.""" | ||
319 | |||
320 | # This dictionary maps commonly seen values for "charset" in HTML | ||
321 | # meta tags to the corresponding Python codec names. It only covers | ||
322 | # values that aren't in Python's aliases and can't be determined | ||
323 | # by the heuristics in find_codec. | ||
324 | CHARSET_ALIASES = {"macintosh": "mac-roman", | ||
325 | "x-sjis": "shift-jis"} | ||
326 | |||
327 | ENCODINGS_WITH_SMART_QUOTES = [ | ||
328 | "windows-1252", | ||
329 | "iso-8859-1", | ||
330 | "iso-8859-2", | ||
331 | ] | ||
332 | |||
333 | def __init__(self, markup, override_encodings=[], | ||
334 | smart_quotes_to=None, is_html=False): | ||
335 | self.smart_quotes_to = smart_quotes_to | ||
336 | self.tried_encodings = [] | ||
337 | self.contains_replacement_characters = False | ||
338 | self.is_html = is_html | ||
339 | |||
340 | self.detector = EncodingDetector(markup, override_encodings, is_html) | ||
341 | |||
342 | # Short-circuit if the data is in Unicode to begin with. | ||
343 | if isinstance(markup, unicode) or markup == '': | ||
344 | self.markup = markup | ||
345 | self.unicode_markup = unicode(markup) | ||
346 | self.original_encoding = None | ||
347 | return | ||
348 | |||
349 | # The encoding detector may have stripped a byte-order mark. | ||
350 | # Use the stripped markup from this point on. | ||
351 | self.markup = self.detector.markup | ||
352 | |||
353 | u = None | ||
354 | for encoding in self.detector.encodings: | ||
355 | markup = self.detector.markup | ||
356 | u = self._convert_from(encoding) | ||
357 | if u is not None: | ||
358 | break | ||
359 | |||
360 | if not u: | ||
361 | # None of the encodings worked. As an absolute last resort, | ||
362 | # try them again with character replacement. | ||
363 | |||
364 | for encoding in self.detector.encodings: | ||
365 | if encoding != "ascii": | ||
366 | u = self._convert_from(encoding, "replace") | ||
367 | if u is not None: | ||
368 | logging.warning( | ||
369 | "Some characters could not be decoded, and were " | ||
370 | "replaced with REPLACEMENT CHARACTER.") | ||
371 | self.contains_replacement_characters = True | ||
372 | break | ||
373 | |||
374 | # If none of that worked, we could at this point force it to | ||
375 | # ASCII, but that would destroy so much data that I think | ||
376 | # giving up is better. | ||
377 | self.unicode_markup = u | ||
378 | if not u: | ||
379 | self.original_encoding = None | ||
380 | |||
381 | def _sub_ms_char(self, match): | ||
382 | """Changes a MS smart quote character to an XML or HTML | ||
383 | entity, or an ASCII character.""" | ||
384 | orig = match.group(1) | ||
385 | if self.smart_quotes_to == 'ascii': | ||
386 | sub = self.MS_CHARS_TO_ASCII.get(orig).encode() | ||
387 | else: | ||
388 | sub = self.MS_CHARS.get(orig) | ||
389 | if type(sub) == tuple: | ||
390 | if self.smart_quotes_to == 'xml': | ||
391 | sub = '&#x'.encode() + sub[1].encode() + ';'.encode() | ||
392 | else: | ||
393 | sub = '&'.encode() + sub[0].encode() + ';'.encode() | ||
394 | else: | ||
395 | sub = sub.encode() | ||
396 | return sub | ||
397 | |||
398 | def _convert_from(self, proposed, errors="strict"): | ||
399 | proposed = self.find_codec(proposed) | ||
400 | if not proposed or (proposed, errors) in self.tried_encodings: | ||
401 | return None | ||
402 | self.tried_encodings.append((proposed, errors)) | ||
403 | markup = self.markup | ||
404 | # Convert smart quotes to HTML if coming from an encoding | ||
405 | # that might have them. | ||
406 | if (self.smart_quotes_to is not None | ||
407 | and proposed in self.ENCODINGS_WITH_SMART_QUOTES): | ||
408 | smart_quotes_re = b"([\x80-\x9f])" | ||
409 | smart_quotes_compiled = re.compile(smart_quotes_re) | ||
410 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) | ||
411 | |||
412 | try: | ||
413 | #print "Trying to convert document to %s (errors=%s)" % ( | ||
414 | # proposed, errors) | ||
415 | u = self._to_unicode(markup, proposed, errors) | ||
416 | self.markup = u | ||
417 | self.original_encoding = proposed | ||
418 | except Exception as e: | ||
419 | #print "That didn't work!" | ||
420 | #print e | ||
421 | return None | ||
422 | #print "Correct encoding: %s" % proposed | ||
423 | return self.markup | ||
424 | |||
425 | def _to_unicode(self, data, encoding, errors="strict"): | ||
426 | '''Given a string and its encoding, decodes the string into Unicode. | ||
427 | %encoding is a string recognized by encodings.aliases''' | ||
428 | return unicode(data, encoding, errors) | ||
429 | |||
430 | @property | ||
431 | def declared_html_encoding(self): | ||
432 | if not self.is_html: | ||
433 | return None | ||
434 | return self.detector.declared_encoding | ||
435 | |||
436 | def find_codec(self, charset): | ||
437 | value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) | ||
438 | or (charset and self._codec(charset.replace("-", ""))) | ||
439 | or (charset and self._codec(charset.replace("-", "_"))) | ||
440 | or (charset and charset.lower()) | ||
441 | or charset | ||
442 | ) | ||
443 | if value: | ||
444 | return value.lower() | ||
445 | return None | ||
446 | |||
447 | def _codec(self, charset): | ||
448 | if not charset: | ||
449 | return charset | ||
450 | codec = None | ||
451 | try: | ||
452 | codecs.lookup(charset) | ||
453 | codec = charset | ||
454 | except (LookupError, ValueError): | ||
455 | pass | ||
456 | return codec | ||
457 | |||
458 | |||
459 | # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. | ||
460 | MS_CHARS = {b'\x80': ('euro', '20AC'), | ||
461 | b'\x81': ' ', | ||
462 | b'\x82': ('sbquo', '201A'), | ||
463 | b'\x83': ('fnof', '192'), | ||
464 | b'\x84': ('bdquo', '201E'), | ||
465 | b'\x85': ('hellip', '2026'), | ||
466 | b'\x86': ('dagger', '2020'), | ||
467 | b'\x87': ('Dagger', '2021'), | ||
468 | b'\x88': ('circ', '2C6'), | ||
469 | b'\x89': ('permil', '2030'), | ||
470 | b'\x8A': ('Scaron', '160'), | ||
471 | b'\x8B': ('lsaquo', '2039'), | ||
472 | b'\x8C': ('OElig', '152'), | ||
473 | b'\x8D': '?', | ||
474 | b'\x8E': ('#x17D', '17D'), | ||
475 | b'\x8F': '?', | ||
476 | b'\x90': '?', | ||
477 | b'\x91': ('lsquo', '2018'), | ||
478 | b'\x92': ('rsquo', '2019'), | ||
479 | b'\x93': ('ldquo', '201C'), | ||
480 | b'\x94': ('rdquo', '201D'), | ||
481 | b'\x95': ('bull', '2022'), | ||
482 | b'\x96': ('ndash', '2013'), | ||
483 | b'\x97': ('mdash', '2014'), | ||
484 | b'\x98': ('tilde', '2DC'), | ||
485 | b'\x99': ('trade', '2122'), | ||
486 | b'\x9a': ('scaron', '161'), | ||
487 | b'\x9b': ('rsaquo', '203A'), | ||
488 | b'\x9c': ('oelig', '153'), | ||
489 | b'\x9d': '?', | ||
490 | b'\x9e': ('#x17E', '17E'), | ||
491 | b'\x9f': ('Yuml', ''),} | ||
492 | |||
493 | # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains | ||
494 | # horrors like stripping diacritical marks to turn á into a, but also | ||
495 | # contains non-horrors like turning “ into ". | ||
496 | MS_CHARS_TO_ASCII = { | ||
497 | b'\x80' : 'EUR', | ||
498 | b'\x81' : ' ', | ||
499 | b'\x82' : ',', | ||
500 | b'\x83' : 'f', | ||
501 | b'\x84' : ',,', | ||
502 | b'\x85' : '...', | ||
503 | b'\x86' : '+', | ||
504 | b'\x87' : '++', | ||
505 | b'\x88' : '^', | ||
506 | b'\x89' : '%', | ||
507 | b'\x8a' : 'S', | ||
508 | b'\x8b' : '<', | ||
509 | b'\x8c' : 'OE', | ||
510 | b'\x8d' : '?', | ||
511 | b'\x8e' : 'Z', | ||
512 | b'\x8f' : '?', | ||
513 | b'\x90' : '?', | ||
514 | b'\x91' : "'", | ||
515 | b'\x92' : "'", | ||
516 | b'\x93' : '"', | ||
517 | b'\x94' : '"', | ||
518 | b'\x95' : '*', | ||
519 | b'\x96' : '-', | ||
520 | b'\x97' : '--', | ||
521 | b'\x98' : '~', | ||
522 | b'\x99' : '(TM)', | ||
523 | b'\x9a' : 's', | ||
524 | b'\x9b' : '>', | ||
525 | b'\x9c' : 'oe', | ||
526 | b'\x9d' : '?', | ||
527 | b'\x9e' : 'z', | ||
528 | b'\x9f' : 'Y', | ||
529 | b'\xa0' : ' ', | ||
530 | b'\xa1' : '!', | ||
531 | b'\xa2' : 'c', | ||
532 | b'\xa3' : 'GBP', | ||
533 | b'\xa4' : '$', #This approximation is especially parochial--this is the | ||
534 | #generic currency symbol. | ||
535 | b'\xa5' : 'YEN', | ||
536 | b'\xa6' : '|', | ||
537 | b'\xa7' : 'S', | ||
538 | b'\xa8' : '..', | ||
539 | b'\xa9' : '', | ||
540 | b'\xaa' : '(th)', | ||
541 | b'\xab' : '<<', | ||
542 | b'\xac' : '!', | ||
543 | b'\xad' : ' ', | ||
544 | b'\xae' : '(R)', | ||
545 | b'\xaf' : '-', | ||
546 | b'\xb0' : 'o', | ||
547 | b'\xb1' : '+-', | ||
548 | b'\xb2' : '2', | ||
549 | b'\xb3' : '3', | ||
550 | b'\xb4' : ("'", 'acute'), | ||
551 | b'\xb5' : 'u', | ||
552 | b'\xb6' : 'P', | ||
553 | b'\xb7' : '*', | ||
554 | b'\xb8' : ',', | ||
555 | b'\xb9' : '1', | ||
556 | b'\xba' : '(th)', | ||
557 | b'\xbb' : '>>', | ||
558 | b'\xbc' : '1/4', | ||
559 | b'\xbd' : '1/2', | ||
560 | b'\xbe' : '3/4', | ||
561 | b'\xbf' : '?', | ||
562 | b'\xc0' : 'A', | ||
563 | b'\xc1' : 'A', | ||
564 | b'\xc2' : 'A', | ||
565 | b'\xc3' : 'A', | ||
566 | b'\xc4' : 'A', | ||
567 | b'\xc5' : 'A', | ||
568 | b'\xc6' : 'AE', | ||
569 | b'\xc7' : 'C', | ||
570 | b'\xc8' : 'E', | ||
571 | b'\xc9' : 'E', | ||
572 | b'\xca' : 'E', | ||
573 | b'\xcb' : 'E', | ||
574 | b'\xcc' : 'I', | ||
575 | b'\xcd' : 'I', | ||
576 | b'\xce' : 'I', | ||
577 | b'\xcf' : 'I', | ||
578 | b'\xd0' : 'D', | ||
579 | b'\xd1' : 'N', | ||
580 | b'\xd2' : 'O', | ||
581 | b'\xd3' : 'O', | ||
582 | b'\xd4' : 'O', | ||
583 | b'\xd5' : 'O', | ||
584 | b'\xd6' : 'O', | ||
585 | b'\xd7' : '*', | ||
586 | b'\xd8' : 'O', | ||
587 | b'\xd9' : 'U', | ||
588 | b'\xda' : 'U', | ||
589 | b'\xdb' : 'U', | ||
590 | b'\xdc' : 'U', | ||
591 | b'\xdd' : 'Y', | ||
592 | b'\xde' : 'b', | ||
593 | b'\xdf' : 'B', | ||
594 | b'\xe0' : 'a', | ||
595 | b'\xe1' : 'a', | ||
596 | b'\xe2' : 'a', | ||
597 | b'\xe3' : 'a', | ||
598 | b'\xe4' : 'a', | ||
599 | b'\xe5' : 'a', | ||
600 | b'\xe6' : 'ae', | ||
601 | b'\xe7' : 'c', | ||
602 | b'\xe8' : 'e', | ||
603 | b'\xe9' : 'e', | ||
604 | b'\xea' : 'e', | ||
605 | b'\xeb' : 'e', | ||
606 | b'\xec' : 'i', | ||
607 | b'\xed' : 'i', | ||
608 | b'\xee' : 'i', | ||
609 | b'\xef' : 'i', | ||
610 | b'\xf0' : 'o', | ||
611 | b'\xf1' : 'n', | ||
612 | b'\xf2' : 'o', | ||
613 | b'\xf3' : 'o', | ||
614 | b'\xf4' : 'o', | ||
615 | b'\xf5' : 'o', | ||
616 | b'\xf6' : 'o', | ||
617 | b'\xf7' : '/', | ||
618 | b'\xf8' : 'o', | ||
619 | b'\xf9' : 'u', | ||
620 | b'\xfa' : 'u', | ||
621 | b'\xfb' : 'u', | ||
622 | b'\xfc' : 'u', | ||
623 | b'\xfd' : 'y', | ||
624 | b'\xfe' : 'b', | ||
625 | b'\xff' : 'y', | ||
626 | } | ||
627 | |||
628 | # A map used when removing rogue Windows-1252/ISO-8859-1 | ||
629 | # characters in otherwise UTF-8 documents. | ||
630 | # | ||
631 | # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in | ||
632 | # Windows-1252. | ||
633 | WINDOWS_1252_TO_UTF8 = { | ||
634 | 0x80 : b'\xe2\x82\xac', # € | ||
635 | 0x82 : b'\xe2\x80\x9a', # ‚ | ||
636 | 0x83 : b'\xc6\x92', # Æ’ | ||
637 | 0x84 : b'\xe2\x80\x9e', # „ | ||
638 | 0x85 : b'\xe2\x80\xa6', # … | ||
639 | 0x86 : b'\xe2\x80\xa0', # †| ||
640 | 0x87 : b'\xe2\x80\xa1', # ‡ | ||
641 | 0x88 : b'\xcb\x86', # ˆ | ||
642 | 0x89 : b'\xe2\x80\xb0', # ‰ | ||
643 | 0x8a : b'\xc5\xa0', # Å | ||
644 | 0x8b : b'\xe2\x80\xb9', # ‹ | ||
645 | 0x8c : b'\xc5\x92', # Å’ | ||
646 | 0x8e : b'\xc5\xbd', # Ž | ||
647 | 0x91 : b'\xe2\x80\x98', # ‘ | ||
648 | 0x92 : b'\xe2\x80\x99', # ’ | ||
649 | 0x93 : b'\xe2\x80\x9c', # “ | ||
650 | 0x94 : b'\xe2\x80\x9d', # †| ||
651 | 0x95 : b'\xe2\x80\xa2', # • | ||
652 | 0x96 : b'\xe2\x80\x93', # – | ||
653 | 0x97 : b'\xe2\x80\x94', # — | ||
654 | 0x98 : b'\xcb\x9c', # ˜ | ||
655 | 0x99 : b'\xe2\x84\xa2', # â„¢ | ||
656 | 0x9a : b'\xc5\xa1', # Å¡ | ||
657 | 0x9b : b'\xe2\x80\xba', # › | ||
658 | 0x9c : b'\xc5\x93', # Å“ | ||
659 | 0x9e : b'\xc5\xbe', # ž | ||
660 | 0x9f : b'\xc5\xb8', # Ÿ | ||
661 | 0xa0 : b'\xc2\xa0', # Â | ||
662 | 0xa1 : b'\xc2\xa1', # ¡ | ||
663 | 0xa2 : b'\xc2\xa2', # ¢ | ||
664 | 0xa3 : b'\xc2\xa3', # £ | ||
665 | 0xa4 : b'\xc2\xa4', # ¤ | ||
666 | 0xa5 : b'\xc2\xa5', # ¥ | ||
667 | 0xa6 : b'\xc2\xa6', # ¦ | ||
668 | 0xa7 : b'\xc2\xa7', # § | ||
669 | 0xa8 : b'\xc2\xa8', # ¨ | ||
670 | 0xa9 : b'\xc2\xa9', # © | ||
671 | 0xaa : b'\xc2\xaa', # ª | ||
672 | 0xab : b'\xc2\xab', # « | ||
673 | 0xac : b'\xc2\xac', # ¬ | ||
674 | 0xad : b'\xc2\xad', # Â | ||
675 | 0xae : b'\xc2\xae', # ® | ||
676 | 0xaf : b'\xc2\xaf', # ¯ | ||
677 | 0xb0 : b'\xc2\xb0', # ° | ||
678 | 0xb1 : b'\xc2\xb1', # ± | ||
679 | 0xb2 : b'\xc2\xb2', # ² | ||
680 | 0xb3 : b'\xc2\xb3', # ³ | ||
681 | 0xb4 : b'\xc2\xb4', # ´ | ||
682 | 0xb5 : b'\xc2\xb5', # µ | ||
683 | 0xb6 : b'\xc2\xb6', # ¶ | ||
684 | 0xb7 : b'\xc2\xb7', # · | ||
685 | 0xb8 : b'\xc2\xb8', # ¸ | ||
686 | 0xb9 : b'\xc2\xb9', # ¹ | ||
687 | 0xba : b'\xc2\xba', # º | ||
688 | 0xbb : b'\xc2\xbb', # » | ||
689 | 0xbc : b'\xc2\xbc', # ¼ | ||
690 | 0xbd : b'\xc2\xbd', # ½ | ||
691 | 0xbe : b'\xc2\xbe', # ¾ | ||
692 | 0xbf : b'\xc2\xbf', # ¿ | ||
693 | 0xc0 : b'\xc3\x80', # À | ||
694 | 0xc1 : b'\xc3\x81', # Ã | ||
695 | 0xc2 : b'\xc3\x82', # Â | ||
696 | 0xc3 : b'\xc3\x83', # Ã | ||
697 | 0xc4 : b'\xc3\x84', # Ä | ||
698 | 0xc5 : b'\xc3\x85', # Ã… | ||
699 | 0xc6 : b'\xc3\x86', # Æ | ||
700 | 0xc7 : b'\xc3\x87', # Ç | ||
701 | 0xc8 : b'\xc3\x88', # È | ||
702 | 0xc9 : b'\xc3\x89', # É | ||
703 | 0xca : b'\xc3\x8a', # Ê | ||
704 | 0xcb : b'\xc3\x8b', # Ë | ||
705 | 0xcc : b'\xc3\x8c', # Ì | ||
706 | 0xcd : b'\xc3\x8d', # Ã | ||
707 | 0xce : b'\xc3\x8e', # ÃŽ | ||
708 | 0xcf : b'\xc3\x8f', # Ã | ||
709 | 0xd0 : b'\xc3\x90', # Ã | ||
710 | 0xd1 : b'\xc3\x91', # Ñ | ||
711 | 0xd2 : b'\xc3\x92', # Ã’ | ||
712 | 0xd3 : b'\xc3\x93', # Ó | ||
713 | 0xd4 : b'\xc3\x94', # Ô | ||
714 | 0xd5 : b'\xc3\x95', # Õ | ||
715 | 0xd6 : b'\xc3\x96', # Ö | ||
716 | 0xd7 : b'\xc3\x97', # × | ||
717 | 0xd8 : b'\xc3\x98', # Ø | ||
718 | 0xd9 : b'\xc3\x99', # Ù | ||
719 | 0xda : b'\xc3\x9a', # Ú | ||
720 | 0xdb : b'\xc3\x9b', # Û | ||
721 | 0xdc : b'\xc3\x9c', # Ü | ||
722 | 0xdd : b'\xc3\x9d', # Ã | ||
723 | 0xde : b'\xc3\x9e', # Þ | ||
724 | 0xdf : b'\xc3\x9f', # ß | ||
725 | 0xe0 : b'\xc3\xa0', # Ã | ||
726 | 0xe1 : b'\xa1', # á | ||
727 | 0xe2 : b'\xc3\xa2', # â | ||
728 | 0xe3 : b'\xc3\xa3', # ã | ||
729 | 0xe4 : b'\xc3\xa4', # ä | ||
730 | 0xe5 : b'\xc3\xa5', # å | ||
731 | 0xe6 : b'\xc3\xa6', # æ | ||
732 | 0xe7 : b'\xc3\xa7', # ç | ||
733 | 0xe8 : b'\xc3\xa8', # è | ||
734 | 0xe9 : b'\xc3\xa9', # é | ||
735 | 0xea : b'\xc3\xaa', # ê | ||
736 | 0xeb : b'\xc3\xab', # ë | ||
737 | 0xec : b'\xc3\xac', # ì | ||
738 | 0xed : b'\xc3\xad', # Ã | ||
739 | 0xee : b'\xc3\xae', # î | ||
740 | 0xef : b'\xc3\xaf', # ï | ||
741 | 0xf0 : b'\xc3\xb0', # ð | ||
742 | 0xf1 : b'\xc3\xb1', # ñ | ||
743 | 0xf2 : b'\xc3\xb2', # ò | ||
744 | 0xf3 : b'\xc3\xb3', # ó | ||
745 | 0xf4 : b'\xc3\xb4', # ô | ||
746 | 0xf5 : b'\xc3\xb5', # õ | ||
747 | 0xf6 : b'\xc3\xb6', # ö | ||
748 | 0xf7 : b'\xc3\xb7', # ÷ | ||
749 | 0xf8 : b'\xc3\xb8', # ø | ||
750 | 0xf9 : b'\xc3\xb9', # ù | ||
751 | 0xfa : b'\xc3\xba', # ú | ||
752 | 0xfb : b'\xc3\xbb', # û | ||
753 | 0xfc : b'\xc3\xbc', # ü | ||
754 | 0xfd : b'\xc3\xbd', # ý | ||
755 | 0xfe : b'\xc3\xbe', # þ | ||
756 | } | ||
757 | |||
758 | MULTIBYTE_MARKERS_AND_SIZES = [ | ||
759 | (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF | ||
760 | (0xe0, 0xef, 3), # 3-byte characters start with E0-EF | ||
761 | (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 | ||
762 | ] | ||
763 | |||
764 | FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] | ||
765 | LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] | ||
766 | |||
767 | @classmethod | ||
768 | def detwingle(cls, in_bytes, main_encoding="utf8", | ||
769 | embedded_encoding="windows-1252"): | ||
770 | """Fix characters from one encoding embedded in some other encoding. | ||
771 | |||
772 | Currently the only situation supported is Windows-1252 (or its | ||
773 | subset ISO-8859-1), embedded in UTF-8. | ||
774 | |||
775 | The input must be a bytestring. If you've already converted | ||
776 | the document to Unicode, you're too late. | ||
777 | |||
778 | The output is a bytestring in which `embedded_encoding` | ||
779 | characters have been converted to their `main_encoding` | ||
780 | equivalents. | ||
781 | """ | ||
782 | if embedded_encoding.replace('_', '-').lower() not in ( | ||
783 | 'windows-1252', 'windows_1252'): | ||
784 | raise NotImplementedError( | ||
785 | "Windows-1252 and ISO-8859-1 are the only currently supported " | ||
786 | "embedded encodings.") | ||
787 | |||
788 | if main_encoding.lower() not in ('utf8', 'utf-8'): | ||
789 | raise NotImplementedError( | ||
790 | "UTF-8 is the only currently supported main encoding.") | ||
791 | |||
792 | byte_chunks = [] | ||
793 | |||
794 | chunk_start = 0 | ||
795 | pos = 0 | ||
796 | while pos < len(in_bytes): | ||
797 | byte = in_bytes[pos] | ||
798 | if not isinstance(byte, int): | ||
799 | # Python 2.x | ||
800 | byte = ord(byte) | ||
801 | if (byte >= cls.FIRST_MULTIBYTE_MARKER | ||
802 | and byte <= cls.LAST_MULTIBYTE_MARKER): | ||
803 | # This is the start of a UTF-8 multibyte character. Skip | ||
804 | # to the end. | ||
805 | for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: | ||
806 | if byte >= start and byte <= end: | ||
807 | pos += size | ||
808 | break | ||
809 | elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: | ||
810 | # We found a Windows-1252 character! | ||
811 | # Save the string up to this point as a chunk. | ||
812 | byte_chunks.append(in_bytes[chunk_start:pos]) | ||
813 | |||
814 | # Now translate the Windows-1252 character into UTF-8 | ||
815 | # and add it as another, one-byte chunk. | ||
816 | byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) | ||
817 | pos += 1 | ||
818 | chunk_start = pos | ||
819 | else: | ||
820 | # Go on to the next character. | ||
821 | pos += 1 | ||
822 | if chunk_start == 0: | ||
823 | # The string is unchanged. | ||
824 | return in_bytes | ||
825 | else: | ||
826 | # Store the final chunk. | ||
827 | byte_chunks.append(in_bytes[chunk_start:]) | ||
828 | return b''.join(byte_chunks) | ||
829 | |||
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py new file mode 100644 index 0000000000..4d0b00afad --- /dev/null +++ b/bitbake/lib/bs4/diagnose.py | |||
@@ -0,0 +1,204 @@ | |||
1 | """Diagnostic functions, mainly for use when doing tech support.""" | ||
2 | import cProfile | ||
3 | from StringIO import StringIO | ||
4 | from HTMLParser import HTMLParser | ||
5 | import bs4 | ||
6 | from bs4 import BeautifulSoup, __version__ | ||
7 | from bs4.builder import builder_registry | ||
8 | |||
9 | import os | ||
10 | import pstats | ||
11 | import random | ||
12 | import tempfile | ||
13 | import time | ||
14 | import traceback | ||
15 | import sys | ||
16 | import cProfile | ||
17 | |||
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints version information, reports which of the basic parsers are
    not registered, then tries to parse `data` with every available
    parser and prints either the prettified result or the traceback.

    :param data: Markup to diagnose. May also be an object with a
        read() method, or the name of an existing file, in which case
        the markup is read from it. A string that looks like an
        http(s) URL is rejected with an explanatory message.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy: removing from the list being iterated would
    # make the loop skip the parser that follows each removed one.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Close the file promptly instead of leaking the handle.
        with open(data) as infile:
            data = infile.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            # Deliberately broad: the point is to report whatever went
            # wrong with this parser and move on to the next one.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
68 | |||
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    Shows how lxml itself parses a document, with no Beautiful Soup
    code involved. `html` and any extra keyword arguments are passed
    straight to lxml's iterparse().
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for event, element in events:
        print("%s, %4s, %s" % (event, element.tag, element.text))
78 | |||
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else.

    Every handler reports its argument followed by the event kind
    through `_p`, which prints it.
    """

    def _p(self, s):
        print(s)

    def _announce(self, payload, kind):
        # Single place where the "<payload> <KIND>" line is built.
        self._p("%s %s" % (payload, kind))

    def handle_starttag(self, name, attrs):
        self._announce(name, "START")

    def handle_endtag(self, name):
        self._announce(name, "END")

    def handle_data(self, data):
        self._announce(data, "DATA")

    def handle_charref(self, name):
        self._announce(name, "CHARREF")

    def handle_entityref(self, name):
        self._announce(name, "ENTITYREF")

    def handle_comment(self, data):
        self._announce(data, "COMMENT")

    def handle_decl(self, data):
        self._announce(data, "DECL")

    def unknown_decl(self, data):
        self._announce(data, "UNKNOWN-DECL")

    def handle_pi(self, data):
        self._announce(data, "PI")
111 | |||
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    Shows how HTMLParser itself parses a document, with no Beautiful
    Soup code involved.
    """
    AnnouncingParser().feed(data)
120 | |||
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    "Generate a random word-like string."
    # Even positions draw a consonant, odd positions a vowel.
    alphabets = (_consonants, _vowels)
    return ''.join(random.choice(alphabets[i % 2]) for i in range(length))
134 | |||
def rsentence(length=4):
    "Generate a random sentence-like string."
    # `length` random words of 4 to 9 letters each, separated by spaces.
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
138 | |||
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of the `num_elements` steps randomly emits an opening tag, a
    short random sentence, a closing tag, or nothing at all. The pieces
    are joined with newlines and wrapped in <html>...</html>.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 1:
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll in (0, 2):
            # 0 opens a new tag, 2 closes a tag.
            template = "<%s>" if roll == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
        # A roll of 3 contributes nothing.
    return "<html>" + "\n".join(pieces) + "</html>"
156 | |||
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates a random invalid document of `num_elements` elements,
    times Beautiful Soup on it with each parser, then times raw lxml
    and raw html5lib on the same document. Results are printed.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            # Deliberately broad: report the failure and keep
            # benchmarking the remaining parsers.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
188 | |||
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    Builds a document of `num_elements` elements with rdoc(), runs
    BeautifulSoup on it with `parser` under cProfile, and prints the
    top 50 entries matching '_html5lib|bs4', sorted by cumulative time.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # The temporary file only carries the profiler output to pstats.
    # The with-block guarantees the handle is closed afterwards.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name
        cProfile.runctx('bs4.BeautifulSoup(data, parser)',
                        namespace, namespace, filename)

        stats = pstats.Stats(filename)
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
202 | |||
203 | if __name__ == '__main__': | ||
204 | diagnose(sys.stdin.read()) | ||
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py new file mode 100644 index 0000000000..da9afdf48e --- /dev/null +++ b/bitbake/lib/bs4/element.py | |||
@@ -0,0 +1,1611 @@ | |||
1 | import collections | ||
2 | import re | ||
3 | import sys | ||
4 | import warnings | ||
5 | from bs4.dammit import EntitySubstitution | ||
6 | |||
# Encoding used for output when the caller does not specify one.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when running under Python 3 or later.
PY3K = (sys.version_info[0] > 2)

# Matches a run of whitespace. Raw string, so that "\s" reaches the
# regex engine instead of being an invalid string escape.
whitespace_re = re.compile(r"\s+")
11 | |||
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property that reads and writes the attribute named
    `attr` on the instance it is accessed through.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter receives the assigned value; forward it to
        # the real attribute. (The setter used to accept no value, so
        # every assignment through an alias raised TypeError.)
        setattr(self, attr, value)
    return alias
22 | |||
23 | |||
class NamespacedAttribute(unicode):
    """A string that also remembers its namespace prefix, local name
    and namespace.

    The string value is "prefix:name" when both parts are given, and
    just the part that is present when the other one is None.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        obj = unicode.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
38 | |||
class AttributeValueWithCharsetSubstitution(unicode):
    """Base class for attribute values that stand in for a character
    encoding specified in HTML."""
41 | |||
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    The value as parsed is kept in `original_value`.
    """

    def __new__(cls, original_value):
        instance = unicode.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        # The attribute's whole value is the charset, so "encoding" it
        # simply yields the name of the target encoding.
        return encoding
56 | |||
57 | |||
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name itself. Raw string, so that "\s" reaches the
    # regex engine instead of being an invalid string escape.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary: hand back a plain string.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
83 | |||
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution rules that are aware of some HTML quirks.

    Specifically, the contents of <script> and <style> tags should not
    undergo entity substitution.

    Incoming NavigableString objects are checked to see if they're the
    direct children of a <script> or <style> tag.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return `ns` untouched if it is a NavigableString directly
        inside one of the cdata_containing_tags; otherwise return
        f(ns)."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        return ns if inside_cdata_tag else f(ns)

    @classmethod
    def substitute_html(cls, ns):
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
118 | |||
119 | class PageElement(object): | ||
120 | """Contains the navigational information for some part of the page | ||
121 | (either a tag or a piece of text)""" | ||
122 | |||
123 | # There are five possible values for the "formatter" argument passed in | ||
124 | # to methods like encode() and prettify(): | ||
125 | # | ||
126 | # "html" - All Unicode characters with corresponding HTML entities | ||
127 | # are converted to those entities on output. | ||
128 | # "minimal" - Bare ampersands and angle brackets are converted to | ||
129 | # XML entities: & < > | ||
130 | # None - The null formatter. Unicode characters are never | ||
131 | # converted to entities. This is not recommended, but it's | ||
132 | # faster than "minimal". | ||
133 | # A function - This function will be called on every string that | ||
134 | # needs to undergo entity substitution. | ||
135 | # | ||
136 | |||
137 | # In an HTML document, the default "html" and "minimal" functions | ||
138 | # will leave the contents of <script> and <style> tags alone. For | ||
139 | # an XML document, all tags will be given the same treatment. | ||
140 | |||
141 | HTML_FORMATTERS = { | ||
142 | "html" : HTMLAwareEntitySubstitution.substitute_html, | ||
143 | "minimal" : HTMLAwareEntitySubstitution.substitute_xml, | ||
144 | None : None | ||
145 | } | ||
146 | |||
147 | XML_FORMATTERS = { | ||
148 | "html" : EntitySubstitution.substitute_html, | ||
149 | "minimal" : EntitySubstitution.substitute_xml, | ||
150 | None : None | ||
151 | } | ||
152 | |||
153 | def format_string(self, s, formatter='minimal'): | ||
154 | """Format the given string using the given formatter.""" | ||
155 | if not callable(formatter): | ||
156 | formatter = self._formatter_for_name(formatter) | ||
157 | if formatter is None: | ||
158 | output = s | ||
159 | else: | ||
160 | output = formatter(s) | ||
161 | return output | ||
162 | |||
@property
def _is_xml(self):
    """Report whether this element belongs to an XML tree.

    The answer is taken from the top of the tree: every element with a
    parent defers to that parent. The top-level object is expected to
    carry an .is_xml attribute set at tree creation; when it is absent
    the guess is False (HTML), since BS is usually used on HTML markup.

    Used when mapping a formatter name ("minimal") to a concrete
    function. It walks up the tree on each call, so it is inefficient,
    but it should be called very rarely.
    """
    parent = self.parent
    if parent is not None:
        return parent._is_xml
    return getattr(self, 'is_xml', False)
178 | |||
def _formatter_for_name(self, name):
    """Return the formatter function registered under `name`.

    The lookup table depends on whether this element is in an XML or
    an HTML tree. Unknown names fall back to the XML-entity
    substitution function for that kind of tree.
    """
    if self._is_xml:
        table = self.XML_FORMATTERS
        fallback = EntitySubstitution.substitute_xml
    else:
        table = self.HTML_FORMATTERS
        fallback = HTMLAwareEntitySubstitution.substitute_xml
    return table.get(name, fallback)
187 | |||
188 | def setup(self, parent=None, previous_element=None): | ||
189 | """Sets up the initial relations between this element and | ||
190 | other elements.""" | ||
191 | self.parent = parent | ||
192 | self.previous_element = previous_element | ||
193 | if previous_element is not None: | ||
194 | self.previous_element.next_element = self | ||
195 | self.next_element = None | ||
196 | self.previous_sibling = None | ||
197 | self.next_sibling = None | ||
198 | if self.parent is not None and self.parent.contents: | ||
199 | self.previous_sibling = self.parent.contents[-1] | ||
200 | self.previous_sibling.next_sibling = self | ||
201 | |||
202 | nextSibling = _alias("next_sibling") # BS3 | ||
203 | previousSibling = _alias("previous_sibling") # BS3 | ||
204 | |||
def replace_with(self, replace_with):
    """Replace this element in its parent with `replace_with`.

    Returns this element, now extracted from the tree. Replacing an
    element with itself does nothing and returns None.

    Raises ValueError if this element has no parent (there is nothing
    to replace it in), or if `replace_with` is this element's parent.
    """
    if replace_with is self:
        return
    old_parent = self.parent
    if old_parent is None:
        # Without this check the call below would fail with an opaque
        # AttributeError on None.index.
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if replace_with is old_parent:
        raise ValueError("Cannot replace a Tag with its parent.")
    # Remember the position before extracting, since extract() removes
    # this element from the parent's contents.
    my_index = old_parent.index(self)
    self.extract()
    old_parent.insert(my_index, replace_with)
    return self
replaceWith = replace_with  # BS3
216 | |||
def unwrap(self):
    """Replace this element with its own children.

    The element is extracted from the tree and each of its children is
    inserted into the former parent at the position the element
    occupied, preserving their order. Returns this element.

    Raises ValueError if this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # Without this check the call below would fail with an opaque
        # AttributeError on None.index.
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract()
    # Iterate over a reversed copy: inserting a child elsewhere removes
    # it from self.contents, and inserting back-to-front at a fixed
    # index keeps the original order.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap  # BS3
226 | |||
def wrap(self, wrap_inside):
    """Put this element inside `wrap_inside`.

    `wrap_inside` takes this element's place in the tree and this
    element becomes its last child. Returns `wrap_inside`.
    """
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside
231 | |||
def extract(self):
    """Destructively rips this element out of the tree.

    Removes this element from its parent's contents (if it has a
    parent), splices it and its descendants out of the
    next_element/previous_element chain, and unlinks it from its
    siblings. Returns this element, which keeps its own descendants.
    """
    if self.parent is not None:
        # Look the position up through the parent's index() method and
        # delete by position.
        del self.parent.contents[self.parent.index(self)]

    #Find the two elements that would be next to each other if
    #this element (and any children) hadn't been parsed. Connect
    #the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element

    if self.previous_element is not None:
        self.previous_element.next_element = next_element
    if next_element is not None:
        next_element.previous_element = self.previous_element
    # Detach both ends of the extracted run from the document order.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Join the former siblings to each other before clearing our links.
    if self.previous_sibling is not None:
        self.previous_sibling.next_sibling = self.next_sibling
    if self.next_sibling is not None:
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
257 | |||
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Find the last element beneath this object to be parsed.

    If `is_initialized` is true and this element has a next sibling,
    the answer is read from that sibling's previous_element. Otherwise
    the tree is walked down through the last child of each Tag.

    If `accept_self` is false and the search ends on this very object
    (it has no descendants), None is returned instead.
    """
    if is_initialized and self.next_sibling:
        last_child = self.next_sibling.previous_element
    else:
        last_child = self
        while isinstance(last_child, Tag) and last_child.contents:
            last_child = last_child.contents[-1]
    # Identity, not equality: a different element that merely compares
    # equal to this one is still a valid result.
    if not accept_self and last_child is self:
        last_child = None
    return last_child
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
271 | |||
def insert(self, position, new_child):
    """Insert `new_child` into this element's contents at `position`.

    A plain string is wrapped in a NavigableString first. A position
    past the end is clamped to the end. If `new_child` already has a
    parent it is extracted from it before being linked in here. Both
    the sibling links and the next_element/previous_element chain are
    updated.

    Raises ValueError if `new_child` is this element itself.
    """
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, basestring)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: it directly follows this element in document
        # order and has no previous sibling.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        # In document order the new child follows the deepest last
        # descendant of its previous sibling.
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Walk up the ancestors until one of them has a next sibling;
        # that sibling is what follows the new child's subtree.
        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Close the chain in the backward direction as well.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
335 | |||
def append(self, tag):
    """Add `tag` as the last child of this element."""
    end = len(self.contents)
    self.insert(end, tag)
339 | |||
def insert_before(self, predecessor):
    """Place `predecessor` directly in front of this element.

    Afterwards both elements share the same parent and `predecessor`
    is this element's immediate previous sibling.

    Raises ValueError if `predecessor` is this element, or if this
    element has no parent.
    """
    if predecessor is self:
        raise ValueError("Can't insert an element before itself.")
    container = self.parent
    if container is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    # Detach first: if both are already siblings, removing the
    # predecessor may shift this element's index.
    if isinstance(predecessor, PageElement):
        predecessor.extract()
    container.insert(container.index(self), predecessor)
358 | |||
def insert_after(self, successor):
    """Place `successor` directly behind this element.

    Afterwards both elements share the same parent and `successor` is
    this element's immediate next sibling.

    Raises ValueError if `successor` is this element, or if this
    element has no parent.
    """
    if successor is self:
        raise ValueError("Can't insert an element after itself.")
    container = self.parent
    if container is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    # Detach first: if both are already siblings, removing the
    # successor may shift this element's index.
    if isinstance(successor, PageElement):
        successor.extract()
    container.insert(container.index(self) + 1, successor)
377 | |||
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first match that comes after this element in the
    document, or None if nothing matches."""
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, text, **kwargs)
findNext = find_next  # BS3
383 | |||
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every match that comes after this element in the
    document."""
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllNext = find_all_next  # BS3
391 | |||
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest following sibling that matches, or None if
    nothing matches."""
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findNextSibling = find_next_sibling  # BS3
398 | |||
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return the following siblings of this element that match."""
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findNextSiblings = find_next_siblings   # BS3
fetchNextSiblings = find_next_siblings  # BS2
407 | |||
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first match that comes before this element in the
    document, or None if nothing matches."""
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
414 | |||
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Return every match that comes before this element in the
    document."""
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous    # BS2
423 | |||
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest preceding sibling that matches, or None if
    nothing matches."""
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findPreviousSibling = find_previous_sibling  # BS3
430 | |||
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return the preceding siblings of this element that match."""
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findPreviousSiblings = find_previous_siblings   # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
439 | |||
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest ancestor that matches, or None if nothing
    matches."""
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments (no `text`).
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
findParent = find_parent  # BS3
451 | |||
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return the ancestors of this element that match."""
    ancestors = self.parents
    return self._find_all(name, attrs, None, limit, ancestors, **kwargs)
findParents = find_parents   # BS3
fetchParents = find_parents  # BS2
460 | |||
@property
def next(self):
    """Shorthand for .next_element."""
    following = self.next_element
    return following
464 | |||
@property
def previous(self):
    """Shorthand for .previous_element."""
    preceding = self.previous_element
    return preceding
468 | |||
469 | #These methods do the real heavy lifting. | ||
470 | |||
def _find_one(self, method, name, attrs, text, **kwargs):
    """Call the find-all style `method` with a limit of 1 and return
    its first result, or None when it found nothing."""
    matches = method(name, attrs, text, 1, **kwargs)
    return matches[0] if matches else None
477 | |||
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Collect the elements produced by `generator` that match.

    `name` may be a ready-made SoupStrainer; otherwise one is built
    from name, attrs, text and kwargs. Returns a ResultSet. Collection
    stops once `limit` results have been found (if a limit is given).
    """
    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    unfiltered = text is None and not limit and not attrs and not kwargs
    if unfiltered:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, basestring):
            # Optimization to find all tags with a given name.
            result = (element for element in generator
                      if isinstance(element, Tag)
                        and element.name == name)
            return ResultSet(strainer, result)

    results = ResultSet(strainer)
    for candidate in generator:
        if not candidate:
            continue
        found = strainer.search(candidate)
        if found:
            results.append(found)
            if limit and len(results) >= limit:
                break
    return results
511 | |||
512 | #These generators can be used to navigate starting from both | ||
513 | #NavigableStrings and Tags. | ||
@property
def next_elements(self):
    """Yield every element after this one, following .next_element."""
    cursor = self
    while True:
        cursor = cursor.next_element
        if cursor is None:
            break
        yield cursor
520 | |||
@property
def next_siblings(self):
    """Yield every sibling after this one, following .next_sibling."""
    cursor = self
    while True:
        cursor = cursor.next_sibling
        if cursor is None:
            break
        yield cursor
527 | |||
@property
def previous_elements(self):
    """Yield every element before this one, following
    .previous_element."""
    cursor = self
    while True:
        cursor = cursor.previous_element
        if cursor is None:
            break
        yield cursor
534 | |||
@property
def previous_siblings(self):
    """Yield every sibling before this one, following
    .previous_sibling."""
    cursor = self
    while True:
        cursor = cursor.previous_sibling
        if cursor is None:
            break
        yield cursor
541 | |||
@property
def parents(self):
    """Yield each ancestor of this element, nearest first."""
    cursor = self
    while True:
        cursor = cursor.parent
        if cursor is None:
            break
        yield cursor
548 | |||
# Methods for supporting CSS selectors.

# Matches a string made up only of lowercase ASCII letters and digits.
tag_name_re = re.compile('^[a-z0-9]+$')

# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/  \---/\-------------/    \-------/
#     |      |         |               |
#     |      |         |           The value
#     |      |    ~,|,^,$,* or =
#     |   Attribute
#    Tag
#
# The compiled pattern below exposes those parts as the named groups
# "tag" (optional), "attribute", "operator" and "value".
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
    r'=?"?(?P<value>[^\]"]*)"?\]$'
    )
564 | |||
def _attr_value_as_string(self, value, default=None):
    """Return the attribute named `value` as a single string.

    The attribute is looked up with self.get(), using `default` when
    it is missing. A multi-valued attribute (list or tuple) is
    converted into a space-separated string.
    """
    resolved = self.get(value, default)
    if isinstance(resolved, (list, tuple)):
        resolved = " ".join(resolved)
    return resolved
575 | |||
def _tag_name_matches_and(self, function, tag_name):
    """Combine a tag-name test with the predicate `function`.

    With an empty `tag_name`, `function` itself is returned. Otherwise
    the result is a predicate that requires tag.name == tag_name as
    well as function(tag).
    """
    if tag_name:
        def _match(tag):
            return tag.name == tag_name and function(tag)
        return _match
    return function
583 | |||
def _attribute_checker(self, operator, attribute, value=''):
    """Build a predicate implementing one CSS attribute selector.

    Takes an operator, an attribute name and an optional value, and
    returns a one-argument function that is true for elements matching
    that combination. An unrecognised operator yields a plain
    "has this attribute" test.
    """
    def _as_text(element):
        # String form of the attribute, '' when it is absent.
        return element._attr_value_as_string(attribute, '')

    if operator == '=':
        # string representation of `attribute` is equal to `value`
        def _equals(element):
            return element._attr_value_as_string(attribute) == value
        return _equals
    if operator == '~':
        # space-separated list representation of `attribute`
        # contains `value`
        def _includes_value(element):
            tokens = element.get(attribute, [])
            if not isinstance(tokens, list):
                tokens = tokens.split()
            return value in tokens
        return _includes_value
    if operator == '^':
        # string representation of `attribute` starts with `value`
        return lambda el: _as_text(el).startswith(value)
    if operator == '$':
        # string representation of `attribute` ends with `value`
        return lambda el: _as_text(el).endswith(value)
    if operator == '*':
        # string representation of `attribute` contains `value`
        return lambda el: value in _as_text(el)
    if operator == '|':
        # string representation of `attribute` is either exactly
        # `value` or starts with `value` and then a dash.
        def _is_or_starts_with_dash(element):
            text = _as_text(element)
            return text == value or text.startswith(value + '-')
        return _is_or_starts_with_dash
    return lambda el: el.has_attr(attribute)
624 | |||
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
    "BS3 name: returns the .next_elements generator."
    return self.next_elements

def nextSiblingGenerator(self):
    "BS3 name: returns the .next_siblings generator."
    return self.next_siblings

def previousGenerator(self):
    "BS3 name: returns the .previous_elements generator."
    return self.previous_elements

def previousSiblingGenerator(self):
    "BS3 name: returns the .previous_siblings generator."
    return self.previous_siblings

def parentGenerator(self):
    "BS3 name: returns the .parents generator."
    return self.parents
641 | |||
642 | |||
class NavigableString(unicode, PageElement):
    """A Unicode string that is also an element of the parse tree."""

    # Text emitted before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __copy__(self):
        "Copying returns this same object."
        return self

    def __getnewargs__(self):
        "Arguments for __new__ when unpickling: the plain Unicode value."
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        "Return the formatted string wrapped in PREFIX and SUFFIX."
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        "A string has no tag name; always None."
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
688 | |||
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """CData strings are passed into the formatter.
        But the return value is ignored.

        Returns the raw string wrapped in this class's PREFIX and SUFFIX.
        """
        # Called only for its side effects; the result is discarded.
        self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
701 | |||
class CData(PreformattedString):
    """A string that is output wrapped in <![CDATA[ ... ]]>."""

    PREFIX = u'<![CDATA['
    SUFFIX = u']]>'
706 | |||
class ProcessingInstruction(PreformattedString):
    """A string that is output wrapped in <? ... ?>."""

    PREFIX = u'<?'
    SUFFIX = u'?>'
711 | |||
class Comment(PreformattedString):
    """A string that is output wrapped in <!-- ... -->."""

    PREFIX = u'<!--'
    SUFFIX = u'-->'
716 | |||
717 | |||
class Declaration(PreformattedString):
    """A string that is output wrapped in <! ... !>."""
    PREFIX = u'<!'
    # NOTE(review): the '!' before '>' looks unintended for a markup
    # declaration -- confirm the desired output before changing it.
    SUFFIX = u'!>'
721 | |||
722 | |||
class Doctype(PreformattedString):
    """A string that is output as a <!DOCTYPE ...> line."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system ids.

        With a public id the text is `name PUBLIC "pub"`, followed by
        ` "sys"` when a system id is also given. With only a system id
        it is `name SYSTEM "sys"`. A missing name is treated as ''.
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'
739 | |||
740 | |||
741 | class Tag(PageElement): | ||
742 | |||
743 | """Represents a found HTML tag with its attributes and contents.""" | ||
744 | |||
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None):
    """Basic constructor.

    :param parser: Optional parser object; only its class is kept.
    :param builder: Optional tree builder. When given, it may rewrite
        multi-valued attributes, set up substitutions on the tag, and
        decide whether the tag can be an empty-element tag.
    :param name: The tag name. Required.
    :param namespace: Stored as .namespace.
    :param prefix: Stored as .prefix.
    :param attrs: Optional mapping of attributes; it is copied.
    :param parent: Passed to setup() as the parent element.
    :param previous: Passed to setup() as the previous element.

    Raises ValueError if `name` is None.
    """

    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix
    if attrs is None:
        attrs = {}
    elif (attrs and builder is not None
          and builder.cdata_list_attributes):
        # Only a builder can rewrite multi-valued attributes; without
        # one (builder=None) the attributes are simply copied below.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        attrs = dict(attrs)
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    # Set up any substitutions, such as the charset in a META tag.
    if builder is not None:
        builder.set_up_substitutions(self)
        self.can_be_empty_element = builder.can_be_empty_element(name)
    else:
        self.can_be_empty_element = False
778 | |||
779 | parserClass = _alias("parser_class") # BS3 | ||
780 | |||
@property
def is_empty_element(self):
    """Is this tag an empty-element (self-closing) tag?

    A tag with contents never is. A tag without contents is one only
    if its can_be_empty_element flag says so; that flag is set in the
    constructor from the builder used to create the tag:

    * If the builder has a designated list of empty-element tags,
      only a tag whose name is in that list qualifies.
    * If the builder has no such list, any tag with no contents
      qualifies.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
798 | |||
@property
def string(self):
    """Convenience property for the single string inside this tag.

    :Return: None unless the tag has exactly one child. If that child
     is a NavigableString, the child itself; if it is a tag, that
     tag's own .string (recursively).
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if not isinstance(only_child, NavigableString):
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    "Replace all of this tag's children with a copy of `string`."
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
820 | |||
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield the strings beneath this tag, optionally stripped.

    Only descendants whose exact type is listed in `types` are
    yielded; by default that means NavigableString and CData objects,
    so no comments, processing instructions, etc. With types=None,
    every NavigableString instance is yielded. With strip=True each
    string is stripped and empty results are skipped.
    """
    for descendant in self.descendants:
        if types is None:
            wanted = isinstance(descendant, NavigableString)
        else:
            wanted = type(descendant) in types
        if not wanted:
            continue
        if strip:
            descendant = descendant.strip()
            if len(descendant) == 0:
                continue
        yield descendant

strings = property(_all_strings)
840 | |||
@property
def stripped_strings(self):
    """Yield this tag's strings with surrounding whitespace removed,
    skipping the ones that end up empty."""
    for text in self._all_strings(True):
        yield text
845 | |||
def get_text(self, separator=u"", strip=False,
             types=(NavigableString, CData)):
    """
    Return all strings beneath this tag joined with `separator`.

    `strip` and `types` are passed on to _all_strings().
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
855 | |||
def decompose(self):
    """Recursively destroy this element and everything beneath it.

    The element is extracted from the tree, then it and every element
    reachable from it through .next_element has its attributes wiped
    and is left with only an empty .contents list.
    """
    self.extract()
    node = self
    while node is not None:
        # Read the link before wiping the attributes that hold it.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node = following
865 | |||
def clear(self, decompose=False):
    """
    Remove every child of this tag.

    Children are extracted; with decompose=True, children that are
    Tags are decomposed instead.
    """
    # Work on a copy: removing a child mutates self.contents.
    for element in self.contents[:]:
        if decompose and isinstance(element, Tag):
            element.decompose()
        else:
            element.extract()
879 | |||
def index(self, element):
    """
    Return the position of `element` among this tag's children.

    Children are compared by identity, not value, which avoids the
    problem of tag.contents.index(element) stopping at an earlier
    child that merely compares equal. Raises ValueError when
    `element` is not a child.
    """
    position = 0
    for child in self.contents:
        if child is element:
            return position
        position += 1
    raise ValueError("Tag.index: element not in tag")
889 | |||
def get(self, key, default=None):
    """Return the value of the attribute `key`, or `default` when the
    tag has no such attribute."""
    attributes = self.attrs
    return attributes.get(key, default)
895 | |||
def has_attr(self, key):
    "Does this tag have an attribute named `key`?"
    attributes = self.attrs
    return key in attributes
898 | |||
def __hash__(self):
    "Hash a tag by the hash of its str() rendering."
    rendered = str(self)
    return hash(rendered)
901 | |||
def __getitem__(self, key):
    """tag[key] is the value of the attribute `key`; a missing
    attribute raises the underlying lookup error."""
    attributes = self.attrs
    return attributes[key]
906 | |||
def __iter__(self):
    "Iterating over a tag walks its direct children."
    children = self.contents
    return iter(children)
910 | |||
def __len__(self):
    "len(tag) is the number of direct children."
    children = self.contents
    return len(children)
914 | |||
def __contains__(self, x):
    "`x in tag` tests membership in the tag's direct children."
    children = self.contents
    return x in children
917 | |||
def __nonzero__(self):
    "A tag is non-None even if it has no contents."
    # Always true, so truth-testing a tag does not depend on how many
    # children it has (see __len__).
    # NOTE(review): __nonzero__ is the Python 2 hook name -- confirm
    # how Python 3 builds pick this up.
    return True
921 | |||
def __setitem__(self, key, value):
    """tag[key] = value sets the attribute `key` on the tag."""
    attributes = self.attrs
    attributes[key] = value
926 | |||
def __delitem__(self, key):
    "del tag[key] removes the attribute `key`; a missing key is ignored."
    attributes = self.attrs
    attributes.pop(key, None)
930 | |||
def __call__(self, *args, **kwargs):
    """Calling a tag is shorthand for its find_all() method, e.g.
    tag('a') returns what tag.find_all('a') returns."""
    finder = self.find_all
    return finder(*args, **kwargs)
936 | |||
937 | def __getattr__(self, tag): | ||
938 | #print "Getattr %s.%s" % (self.__class__, tag) | ||
939 | if len(tag) > 3 and tag.endswith('Tag'): | ||
940 | # BS3: soup.aTag -> "soup.find("a") | ||
941 | tag_name = tag[:-3] | ||
942 | warnings.warn( | ||
943 | '.%sTag is deprecated, use .find("%s") instead.' % ( | ||
944 | tag_name, tag_name)) | ||
945 | return self.find(tag_name) | ||
946 | # We special case contents to avoid recursion. | ||
947 | elif not tag.startswith("__") and not tag=="contents": | ||
948 | return self.find(tag) | ||
949 | raise AttributeError( | ||
950 | "'%s' object has no attribute '%s'" % (self.__class__, tag)) | ||
951 | |||
def __eq__(self, other):
    """Two tags are equal iff they have the same name, the same
    attributes, and the same contents (compared recursively)."""
    if self is other:
        return True
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if (self.name != other.name or
        self.attrs != other.attrs or
        len(self) != len(other)):
        return False
    for i, my_child in enumerate(self.contents):
        if my_child != other.contents[i]:
            return False
    return True
968 | |||
def __ne__(self, other):
    """The exact negation of ==, as defined in __eq__."""
    same = self == other
    return not same
973 | |||
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag by delegating to ``encode(encoding)``."""
    rendered = self.encode(encoding)
    return rendered
977 | |||
978 | def __unicode__(self): | ||
979 | return self.decode() | ||
980 | |||
def __str__(self):
    """Return the result of ``encode()`` called with its defaults."""
    encoded = self.encode()
    return encoded

# When the PY3K flag is set, both string conversions are rebound to
# __unicode__, so they go through decode() rather than encode().
if PY3K:
    __repr__ = __unicode__
    __str__ = __unicode__
986 | |||
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this tag with ``decode()`` and encode the result.

    :param encoding: passed to ``decode()`` as the eventual encoding
        and then used to encode the rendered text.
    :param indent_level: forwarded to ``decode()``.
    :param formatter: forwarded to ``decode()``.
    :param errors: error handler given to the final ``.encode()`` call.
    """
    # Build the Unicode form first; the encoding step comes last.
    as_unicode = self.decode(indent_level, encoding, formatter)
    return as_unicode.encode(encoding, errors)
994 | |||
995 | def _should_pretty_print(self, indent_level): | ||
996 | """Should this tag be pretty-printed?""" | ||
997 | return ( | ||
998 | indent_level is not None and | ||
999 | (self.name not in HTMLAwareEntitySubstitution.preformatted_tags | ||
1000 | or self._is_xml)) | ||
1001 | |||
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param indent_level: None for no indentation. Otherwise the tag
       is prefixed with ``' ' * (indent_level - 1)`` and, when the tag
       is pretty-printed, its contents are rendered one level deeper.
    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. This method is _not_
       responsible for performing that encoding. This information
       is passed in so that it can be substituted in if the
       document contains a <META> tag that mentions the document's
       encoding.
    :param formatter: A callable, or a name that is turned into a
       callable with ``self._formatter_for_name()``.
    """

    # First off, turn a string formatter into a function. This
    # will stop the lookup from happening over and over again.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    # Render the attributes, in sorted key order, as key="value"
    # strings (or as a bare key when the value is None).
    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                decoded = key
            else:
                if isinstance(val, list) or isinstance(val, tuple):
                    # Multi-valued attribute: join with spaces.
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = unicode(val)
                elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None):
                    # Let the value substitute in the eventual encoding.
                    val = val.encode(eventual_encoding)

                text = self.format_string(val, formatter)
                decoded = (
                    unicode(key) + '='
                    + EntitySubstitution.quoted_attribute_value(text))
            attrs.append(decoded)
    # `close` is the "/" of an empty-element tag; `closeTag` is the
    # full closing tag. At most one of them ends up non-empty.
    close = ''
    closeTag = ''

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    if self.is_empty_element:
        close = '/'
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        # Contents of a tag that is not pretty-printed are rendered
        # without any indentation.
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        s = contents
    else:
        s = []
        attribute_string = ''
        if attrs:
            attribute_string = ' ' + ' '.join(attrs)
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            s.append(indent_space)
        s.append('<%s%s%s%s>' % (
                prefix, self.name, attribute_string, close))
        if pretty_print:
            s.append("\n")
        s.append(contents)
        # Make sure the closing tag starts on its own line.
        if pretty_print and contents and contents[-1] != "\n":
            s.append("\n")
        if pretty_print and closeTag:
            s.append(space)
        s.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            s.append("\n")
        s = ''.join(s)
    return s
1094 | |||
def prettify(self, encoding=None, formatter="minimal"):
    """Render this tag with indentation switched on.

    With no ``encoding`` the result comes from ``decode()``; with an
    encoding it comes from ``encode()``.  Either way ``True`` is passed
    as the indent level.
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1100 | |||
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render the children of this tag as one Unicode string.

    :param indent_level: None for no indentation; otherwise text
       children are stripped, indented by ``indent_level - 1`` spaces
       and newline-terminated (except inside a ``pre`` tag).
    :param eventual_encoding: The encoding the output will eventually
       be given. It is only forwarded to child tags' ``decode()``;
       no encoding is performed here.
    :param formatter: A callable, or a name that is turned into a
       callable with ``self._formatter_for_name()``.
    """
    # Resolve a formatter name once, up front.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    indenting = indent_level is not None
    pieces = []
    for child in self:
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        else:
            text = None
            if isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
        if not text:
            continue

        outside_pre = not self.name == 'pre'
        if indent_level and outside_pre:
            text = text.strip()
            if not text:
                # Whitespace-only text vanishes when pretty-printing.
                continue

        if indenting and outside_pre:
            pieces.append(" " * (indent_level - 1))
            pieces.append(text)
            pieces.append("\n")
        else:
            pieces.append(text)
    return ''.join(pieces)
1136 | |||
def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Render the children of this tag as a bytestring.

    The text comes from ``decode_contents()`` and is then encoded
    with ``encoding``.
    """
    return self.decode_contents(
        indent_level, encoding, formatter).encode(encoding)
1143 | |||
1144 | # Old method for BS3 compatibility | ||
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """BS3-compatible wrapper around ``encode_contents()``.

    ``indentLevel`` is only honoured when ``prettyPrint`` is true;
    otherwise no indent level is passed on.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
1151 | |||
1152 | #Soup methods | ||
1153 | |||
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first match for the given criteria, or None.

    This is ``find_all()`` with a limit of 1; the first element of
    its result is returned when the result is non-empty.
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find
1164 | |||
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Collect the elements below this tag that match the criteria.

    The tag name, and each value in the ``attrs`` map, may be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and reports whether it matches.

    :param recursive: search all descendants when true, otherwise
        only the direct children.
    :param limit: forwarded to ``_find_all()``.
    """
    if recursive:
        candidates = self.descendants
    else:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1183 | |||
1184 | #Generator methods | ||
@property
def children(self):
    """An iterator over ``self.contents``."""
    # Wrapping in iter() makes it explicit that callers get an
    # iterator and not the underlying list.
    direct_children = iter(self.contents)
    return direct_children  # XXX This seems to be untested.
1189 | |||
@property
def descendants(self):
    """Generate every element below this tag, in ``next_element`` order.

    Iteration starts at the first child and stops on reaching the
    element that follows ``self._last_descendant()``.  Yields nothing
    when the tag has no contents.
    """
    if len(self.contents) == 0:
        return
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element
1199 | |||
1200 | # CSS selector code | ||
1201 | |||
# Tokens that combine the selector before them with the one after them.
_selector_combinators = ['>', '+', '~']
# Set to True to print a trace of every step select() takes.
_select_debug = False
def select(self, selector, _candidate_generator=None):
    """Perform a CSS selection operation on the current element.

    The selector is split on whitespace and its tokens are applied
    from left to right.  Each token narrows the current context (a
    list of tags, initially just this tag) to the candidates that
    match it.

    :param selector: a whitespace-separated CSS selector string.
    :param _candidate_generator: internal.  A callable that takes a
        tag and yields the candidates to test; it is supplied by the
        recursive call made on behalf of a combinator.
    :return: a list of the matching Tag objects, each at most once.
    :raises ValueError: if the selector is empty, ends in a
        combinator, uses a pseudo-class without a tag name, gives
        nth-of-type a value below 1, or contains a token that is not
        recognized.
    :raises NotImplementedError: for a pseudo-class other than
        nth-of-type, or a non-numeric nth-of-type value.
    """
    tokens = selector.split()
    if not tokens:
        # An empty selector used to fail with an IndexError below.
        raise ValueError(
            'Unsupported or invalid CSS selector: "%s"' % selector)
    current_context = [self]

    if tokens[-1] in self._selector_combinators:
        raise ValueError(
            'Final combinator "%s" is missing an argument.' % tokens[-1])
    if self._select_debug:
        print('Running CSS selector "%s"' % selector)
    for index, token in enumerate(tokens):
        if self._select_debug:
            print(' Considering token "%s"' % token)
        recursive_candidate_generator = None
        tag_name = None
        # For index 0 this looks at the last token, which was checked
        # above and is never a combinator.
        if tokens[index-1] in self._selector_combinators:
            # This token was consumed by the previous combinator. Skip it.
            if self._select_debug:
                print(' Token was consumed by the previous combinator.')
            continue
        # Each operation corresponds to a checker function, a rule
        # for determining whether a candidate matches the
        # selector. Candidates are generated by the active
        # iterator.
        checker = None

        m = self.attribselect_re.match(token)
        if m is not None:
            # Attribute selector
            tag_name, attribute, operator, value = m.groups()
            checker = self._attribute_checker(operator, attribute, value)

        elif '#' in token:
            # ID selector
            tag_name, tag_id = token.split('#', 1)
            def id_matches(tag):
                return tag.get('id', None) == tag_id
            checker = id_matches

        elif '.' in token:
            # Class selector
            tag_name, klass = token.split('.', 1)
            classes = set(klass.split('.'))
            def classes_match(candidate):
                return classes.issubset(candidate.get('class', []))
            checker = classes_match

        elif ':' in token:
            # Pseudo-class
            tag_name, pseudo = token.split(':', 1)
            if tag_name == '':
                raise ValueError(
                    "A pseudo-class must be prefixed with a tag name.")
            pseudo_attributes = re.match(
                r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
            if pseudo_attributes is not None:
                pseudo_type, pseudo_value = pseudo_attributes.groups()
                if pseudo_type == 'nth-of-type':
                    try:
                        pseudo_value = int(pseudo_value)
                    except ValueError:
                        raise NotImplementedError(
                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                    if pseudo_value < 1:
                        raise ValueError(
                            'nth-of-type pseudo-class value must be at least 1.')
                    class Counter(object):
                        def __init__(self, destination):
                            self.count = 0
                            self.destination = destination

                        def nth_child_of_type(self, tag):
                            self.count += 1
                            if self.count == self.destination:
                                return True
                            if self.count > self.destination:
                                # Stop the generator that's sending us
                                # these things.
                                raise StopIteration()
                            return False
                    checker = Counter(pseudo_value).nth_child_of_type
                else:
                    raise NotImplementedError(
                        'Only the following pseudo-classes are implemented: nth-of-type.')

        elif token == '*':
            # Star selector -- matches everything
            pass
        elif token == '>':
            # Run the next token as a CSS selector against the
            # direct children of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.children
        elif token == '~':
            # Run the next token as a CSS selector against the
            # siblings of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.next_siblings
        elif token == '+':
            # For each tag in the current context, run the next
            # token as a CSS selector against the tag's next
            # sibling that's a tag.
            def next_tag_sibling(tag):
                yield tag.find_next_sibling(True)
            recursive_candidate_generator = next_tag_sibling

        elif self.tag_name_re.match(token):
            # Just a tag name.
            tag_name = token
        else:
            raise ValueError(
                'Unsupported or invalid CSS selector: "%s"' % token)

        if recursive_candidate_generator:
            # This happens when the selector looks like  "> foo".
            #
            # The generator calls select() recursively on every
            # member of the current context, passing in a different
            # candidate generator and a different selector.
            #
            # In the case of "> foo", the candidate generator is
            # one that yields a tag's direct children (">"), and
            # the selector is "foo".
            next_token = tokens[index+1]
            def recursive_select(tag):
                if self._select_debug:
                    print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                    print('-' * 40)
                for i in tag.select(next_token, recursive_candidate_generator):
                    if self._select_debug:
                        print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                    yield i
                if self._select_debug:
                    print('-' * 40)
            _use_candidate_generator = recursive_select
        elif _candidate_generator is None:
            # By default, a tag's candidates are all of its
            # children. If tag_name is defined, only yield tags
            # with that name.
            if self._select_debug:
                # Show the tag name when there is one, "[any]" when
                # there is not (the two were previously swapped).
                if tag_name:
                    check = tag_name
                else:
                    check = "[any]"
                print(' Default candidate generator, tag name="%s"' % check)
            if self._select_debug:
                # This is redundant with later code, but it stops
                # a bunch of bogus tags from cluttering up the
                # debug log.
                def default_candidate_generator(tag):
                    for child in tag.descendants:
                        if not isinstance(child, Tag):
                            continue
                        if tag_name and not child.name == tag_name:
                            continue
                        yield child
                _use_candidate_generator = default_candidate_generator
            else:
                _use_candidate_generator = lambda tag: tag.descendants
        else:
            _use_candidate_generator = _candidate_generator

        new_context = []
        new_context_ids = set([])
        for tag in current_context:
            if self._select_debug:
                print(" Running candidate generator on %s %s" % (
                    tag.name, repr(tag.attrs)))
            for candidate in _use_candidate_generator(tag):
                if not isinstance(candidate, Tag):
                    continue
                if tag_name and candidate.name != tag_name:
                    continue
                if checker is not None:
                    try:
                        result = checker(candidate)
                    except StopIteration:
                        # The checker has decided we should no longer
                        # run the generator.
                        break
                if checker is None or result:
                    if self._select_debug:
                        print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                    if id(candidate) not in new_context_ids:
                        # If a tag matches a selector more than once,
                        # don't include it in the context more than once.
                        new_context.append(candidate)
                        new_context_ids.add(id(candidate))
                elif self._select_debug:
                    print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

        current_context = new_context

    if self._select_debug:
        print("Final verdict:")
        for i in current_context:
            print(" %s %s" % (i.name, i.attrs))
    return current_context
1400 | |||
1401 | # Old names for backwards compatibility | ||
def childGenerator(self):
    """Backwards-compatible method form of the ``children`` property."""
    direct_children = self.children
    return direct_children
1404 | |||
def recursiveChildGenerator(self):
    """Backwards-compatible method form of the ``descendants`` property."""
    everything_below = self.descendants
    return everything_below
1407 | |||
def has_key(self, key):
    """Deprecated spelling of ``has_attr(key)``.

    This was kind of misleading because has_key() (attributes) was
    different from __in__ (contents). has_key() is gone in Python 3,
    anyway.  A warning is issued on every call.
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
1415 | |||
1416 | # Next, a couple classes to represent queries and their results. | ||
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer stores three normalized criteria: ``name``, ``attrs``
    (a dict of attribute name to criterion) and ``text``.  Each
    criterion may be a Unicode string, a callable, an object with a
    ``match`` attribute (a regular expression), a boolean, None, or a
    list of those.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Build a strainer from the search criteria.

        :param name: criterion for the tag name.
        :param attrs: dict of attribute criteria.  A non-dict value is
            treated as a criterion for the ``class`` attribute.  The
            default dict is never mutated (it is copied before any
            update).
        :param text: criterion for the text.
        :param kwargs: further attribute criteria; ``class_`` is
            accepted as a spelling of ``class``.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs['class_']
            del kwargs['class_']

        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in attrs.items():
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a criterion into the form ``_matches()`` works with.

        Unicode strings, callables, regular expressions, booleans and
        None pass through; bytestrings are decoded as UTF-8; iterables
        become lists of normalized items; anything else is converted
        to a Unicode string.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, unicode)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on Python 2
        # and Python 3.
        return unicode(str(value))

    def __str__(self):
        """Return the text criterion if set, else ``name|attrs``."""
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Test a tag against the name, attribute and text criteria.

        :param markup_name: a Tag object, or a tag name.
        :param markup_attrs: the attributes to test when ``markup_name``
            is not a Tag: something with a ``get`` method, or an
            iterable of (key, value) pairs.
        :return: the Tag (or ``markup_name``) on a match, else None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion is handed the raw name and
        # attributes, but only when no Tag object is available.
        # callable() replaces collections.Callable, which no longer
        # exists on Python 3.10+, and matches _normalize_search_value.
        call_function_with_tag_data = (
            callable(self.name)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Test a piece of markup against this strainer.

        :param markup: a Tag, a string, or an iterable of elements.
        :return: the matching element, or None.
        :raises Exception: if ``markup`` is none of the supported kinds.
        """
        # print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Report whether ``markup`` satisfies the criterion.

        The result is only meant to be used for its truth value: the
        regular-expression branch returns the match object, and a
        criterion of an unrecognized kind falls through and returns
        None.
        """
        # print u"Matching %s against %s" % (markup, match_against)
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, unicode)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                return (whitespace_re.split(match_against) == markup)
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if callable(match_against):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, unicode):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against
1605 | |||
class ResultSet(list):
    """A plain list of results that also records, as ``source``, the
    SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        # Record the origin, then let list() absorb the results.
        self.source = source
        super(ResultSet, self).__init__(result)
diff --git a/bitbake/lib/bs4/testing.py b/bitbake/lib/bs4/testing.py new file mode 100644 index 0000000000..fd4495ac58 --- /dev/null +++ b/bitbake/lib/bs4/testing.py | |||
@@ -0,0 +1,592 @@ | |||
1 | """Helper classes for tests.""" | ||
2 | |||
3 | import copy | ||
4 | import functools | ||
5 | import unittest | ||
6 | from unittest import TestCase | ||
7 | from bs4 import BeautifulSoup | ||
8 | from bs4.element import ( | ||
9 | CharsetMetaAttributeValue, | ||
10 | Comment, | ||
11 | ContentMetaAttributeValue, | ||
12 | Doctype, | ||
13 | SoupStrainer, | ||
14 | ) | ||
15 | |||
16 | from bs4.builder import HTMLParserTreeBuilder | ||
17 | default_builder = HTMLParserTreeBuilder | ||
18 | |||
19 | |||
class SoupTest(unittest.TestCase):
    """Base class for tests that turn markup into a parse tree."""

    @property
    def default_builder(self):
        """A new instance of the module-level default tree builder."""
        builder_class = default_builder
        return builder_class()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        A `builder` keyword argument overrides the default builder; all
        remaining keyword arguments go to the BeautifulSoup constructor.
        """
        fallback = self.default_builder
        chosen_builder = kwargs.pop('builder', fallback)
        return BeautifulSoup(markup, builder=chosen_builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        builder = self.default_builder
        return builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing `to_parse` serializes to the expected document.

        When `compare_parsed_to` is None the markup is expected to
        come back out the same as it went in.
        """
        parsed = BeautifulSoup(to_parse, builder=self.default_builder)
        expected_fragment = (
            to_parse if compare_parsed_to is None else compare_parsed_to)
        self.assertEqual(
            parsed.decode(), self.document_for(expected_fragment))
45 | |||
46 | |||
47 | class HTMLTreeBuilderSmokeTest(object): | ||
48 | |||
49 | """A basic test of a treebuilder's competence. | ||
50 | |||
51 | Any HTML treebuilder, present or future, should be able to pass | ||
52 | these tests. With invalid markup, there's room for interpretation, | ||
53 | and different parsers can handle it differently. But with the | ||
54 | markup in these tests, there's not much room for interpretation. | ||
55 | """ | ||
56 | |||
57 | def assertDoctypeHandled(self, doctype_fragment): | ||
58 | """Assert that a given doctype string is handled correctly.""" | ||
59 | doctype_str, soup = self._document_with_doctype(doctype_fragment) | ||
60 | |||
61 | # Make sure a Doctype object was created. | ||
62 | doctype = soup.contents[0] | ||
63 | self.assertEqual(doctype.__class__, Doctype) | ||
64 | self.assertEqual(doctype, doctype_fragment) | ||
65 | self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) | ||
66 | |||
67 | # Make sure that the doctype was correctly associated with the | ||
68 | # parse tree and that the rest of the document parsed. | ||
69 | self.assertEqual(soup.p.contents[0], 'foo') | ||
70 | |||
71 | def _document_with_doctype(self, doctype_fragment): | ||
72 | """Generate and parse a document with the given doctype.""" | ||
73 | doctype = '<!DOCTYPE %s>' % doctype_fragment | ||
74 | markup = doctype + '\n<p>foo</p>' | ||
75 | soup = self.soup(markup) | ||
76 | return doctype, soup | ||
77 | |||
78 | def test_normal_doctypes(self): | ||
79 | """Make sure normal, everyday HTML doctypes are handled correctly.""" | ||
80 | self.assertDoctypeHandled("html") | ||
81 | self.assertDoctypeHandled( | ||
82 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') | ||
83 | |||
84 | def test_empty_doctype(self): | ||
85 | soup = self.soup("<!DOCTYPE>") | ||
86 | doctype = soup.contents[0] | ||
87 | self.assertEqual("", doctype.strip()) | ||
88 | |||
89 | def test_public_doctype_with_url(self): | ||
90 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' | ||
91 | self.assertDoctypeHandled(doctype) | ||
92 | |||
93 | def test_system_doctype(self): | ||
94 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') | ||
95 | |||
96 | def test_namespaced_system_doctype(self): | ||
97 | # We can handle a namespaced doctype with a system ID. | ||
98 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') | ||
99 | |||
100 | def test_namespaced_public_doctype(self): | ||
101 | # Test a namespaced doctype with a public id. | ||
102 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') | ||
103 | |||
104 | def test_real_xhtml_document(self): | ||
105 | """A real XHTML document should come out more or less the same as it went in.""" | ||
106 | markup = b"""<?xml version="1.0" encoding="utf-8"?> | ||
107 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> | ||
108 | <html xmlns="http://www.w3.org/1999/xhtml"> | ||
109 | <head><title>Hello.</title></head> | ||
110 | <body>Goodbye.</body> | ||
111 | </html>""" | ||
112 | soup = self.soup(markup) | ||
113 | self.assertEqual( | ||
114 | soup.encode("utf-8").replace(b"\n", b""), | ||
115 | markup.replace(b"\n", b"")) | ||
116 | |||
117 | def test_deepcopy(self): | ||
118 | """Make sure you can copy the tree builder. | ||
119 | |||
120 | This is important because the builder is part of a | ||
121 | BeautifulSoup object, and we want to be able to copy that. | ||
122 | """ | ||
123 | copy.deepcopy(self.default_builder) | ||
124 | |||
125 | def test_p_tag_is_never_empty_element(self): | ||
126 | """A <p> tag is never designated as an empty-element tag. | ||
127 | |||
128 | Even if the markup shows it as an empty-element tag, it | ||
129 | shouldn't be presented that way. | ||
130 | """ | ||
131 | soup = self.soup("<p/>") | ||
132 | self.assertFalse(soup.p.is_empty_element) | ||
133 | self.assertEqual(str(soup.p), "<p></p>") | ||
134 | |||
135 | def test_unclosed_tags_get_closed(self): | ||
136 | """A tag that's not closed by the end of the document should be closed. | ||
137 | |||
138 | This applies to all tags except empty-element tags. | ||
139 | """ | ||
140 | self.assertSoupEquals("<p>", "<p></p>") | ||
141 | self.assertSoupEquals("<b>", "<b></b>") | ||
142 | |||
143 | self.assertSoupEquals("<br>", "<br/>") | ||
144 | |||
145 | def test_br_is_always_empty_element_tag(self): | ||
146 | """A <br> tag is designated as an empty-element tag. | ||
147 | |||
148 | Some parsers treat <br></br> as one <br/> tag, some parsers as | ||
149 | two tags, but it should always be an empty-element tag. | ||
150 | """ | ||
151 | soup = self.soup("<br></br>") | ||
152 | self.assertTrue(soup.br.is_empty_element) | ||
153 | self.assertEqual(str(soup.br), "<br/>") | ||
154 | |||
155 | def test_nested_formatting_elements(self): | ||
156 | self.assertSoupEquals("<em><em></em></em>") | ||
157 | |||
158 | def test_comment(self): | ||
159 | # Comments are represented as Comment objects. | ||
160 | markup = "<p>foo<!--foobar-->baz</p>" | ||
161 | self.assertSoupEquals(markup) | ||
162 | |||
163 | soup = self.soup(markup) | ||
164 | comment = soup.find(text="foobar") | ||
165 | self.assertEqual(comment.__class__, Comment) | ||
166 | |||
167 | # The comment is properly integrated into the tree. | ||
168 | foo = soup.find(text="foo") | ||
169 | self.assertEqual(comment, foo.next_element) | ||
170 | baz = soup.find(text="baz") | ||
171 | self.assertEqual(comment, baz.previous_element) | ||
172 | |||
173 | def test_preserved_whitespace_in_pre_and_textarea(self): | ||
174 | """Whitespace must be preserved in <pre> and <textarea> tags.""" | ||
175 | self.assertSoupEquals("<pre> </pre>") | ||
176 | self.assertSoupEquals("<textarea> woo </textarea>") | ||
177 | |||
178 | def test_nested_inline_elements(self): | ||
179 | """Inline elements can be nested indefinitely.""" | ||
180 | b_tag = "<b>Inside a B tag</b>" | ||
181 | self.assertSoupEquals(b_tag) | ||
182 | |||
183 | nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" | ||
184 | self.assertSoupEquals(nested_b_tag) | ||
185 | |||
186 | double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" | ||
187 | self.assertSoupEquals(nested_b_tag) | ||
188 | |||
189 | def test_nested_block_level_elements(self): | ||
190 | """Block elements can be nested.""" | ||
191 | soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>') | ||
192 | blockquote = soup.blockquote | ||
193 | self.assertEqual(blockquote.p.b.string, 'Foo') | ||
194 | self.assertEqual(blockquote.b.string, 'Foo') | ||
195 | |||
196 | def test_correctly_nested_tables(self): | ||
197 | """One table can go inside another one.""" | ||
198 | markup = ('<table id="1">' | ||
199 | '<tr>' | ||
200 | "<td>Here's another table:" | ||
201 | '<table id="2">' | ||
202 | '<tr><td>foo</td></tr>' | ||
203 | '</table></td>') | ||
204 | |||
205 | self.assertSoupEquals( | ||
206 | markup, | ||
207 | '<table id="1"><tr><td>Here\'s another table:' | ||
208 | '<table id="2"><tr><td>foo</td></tr></table>' | ||
209 | '</td></tr></table>') | ||
210 | |||
211 | self.assertSoupEquals( | ||
212 | "<table><thead><tr><td>Foo</td></tr></thead>" | ||
213 | "<tbody><tr><td>Bar</td></tr></tbody>" | ||
214 | "<tfoot><tr><td>Baz</td></tr></tfoot></table>") | ||
215 | |||
216 | def test_deeply_nested_multivalued_attribute(self): | ||
217 | # html5lib can set the attributes of the same tag many times | ||
218 | # as it rearranges the tree. This has caused problems with | ||
219 | # multivalued attributes. | ||
220 | markup = '<table><div><div class="css"></div></div></table>' | ||
221 | soup = self.soup(markup) | ||
222 | self.assertEqual(["css"], soup.div.div['class']) | ||
223 | |||
224 | def test_angle_brackets_in_attribute_values_are_escaped(self): | ||
225 | self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') | ||
226 | |||
227 | def test_entities_in_attributes_converted_to_unicode(self): | ||
228 | expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' | ||
229 | self.assertSoupEquals('<p id="piñata"></p>', expect) | ||
230 | self.assertSoupEquals('<p id="piñata"></p>', expect) | ||
231 | self.assertSoupEquals('<p id="piñata"></p>', expect) | ||
232 | self.assertSoupEquals('<p id="piñata"></p>', expect) | ||
233 | |||
234 | def test_entities_in_text_converted_to_unicode(self): | ||
235 | expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' | ||
236 | self.assertSoupEquals("<p>piñata</p>", expect) | ||
237 | self.assertSoupEquals("<p>piñata</p>", expect) | ||
238 | self.assertSoupEquals("<p>piñata</p>", expect) | ||
239 | self.assertSoupEquals("<p>piñata</p>", expect) | ||
240 | |||
241 | def test_quot_entity_converted_to_quotation_mark(self): | ||
242 | self.assertSoupEquals("<p>I said "good day!"</p>", | ||
243 | '<p>I said "good day!"</p>') | ||
244 | |||
245 | def test_out_of_range_entity(self): | ||
246 | expect = u"\N{REPLACEMENT CHARACTER}" | ||
247 | self.assertSoupEquals("�", expect) | ||
248 | self.assertSoupEquals("�", expect) | ||
249 | self.assertSoupEquals("�", expect) | ||
250 | |||
251 | def test_multipart_strings(self): | ||
252 | "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." | ||
253 | soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") | ||
254 | self.assertEqual("p", soup.h2.string.next_element.name) | ||
255 | self.assertEqual("p", soup.p.name) | ||
256 | |||
257 | def test_basic_namespaces(self): | ||
258 | """Parsers don't need to *understand* namespaces, but at the | ||
259 | very least they should not choke on namespaces or lose | ||
260 | data.""" | ||
261 | |||
262 | markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>' | ||
263 | soup = self.soup(markup) | ||
264 | self.assertEqual(markup, soup.encode()) | ||
265 | html = soup.html | ||
266 | self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) | ||
267 | self.assertEqual( | ||
268 | 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) | ||
269 | self.assertEqual( | ||
270 | 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) | ||
271 | |||
272 | def test_multivalued_attribute_value_becomes_list(self): | ||
273 | markup = b'<a class="foo bar">' | ||
274 | soup = self.soup(markup) | ||
275 | self.assertEqual(['foo', 'bar'], soup.a['class']) | ||
276 | |||
277 | # | ||
278 | # Generally speaking, tests below this point are more tests of | ||
279 | # Beautiful Soup than tests of the tree builders. But parsers are | ||
280 | # weird, so we run these tests separately for every tree builder | ||
281 | # to detect any differences between them. | ||
282 | # | ||
283 | |||
284 | def test_can_parse_unicode_document(self): | ||
285 | # A seemingly innocuous document... but it's in Unicode! And | ||
286 | # it contains characters that can't be represented in the | ||
287 | # encoding found in the declaration! The horror! | ||
288 | markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' | ||
289 | soup = self.soup(markup) | ||
290 | self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) | ||
291 | |||
292 | def test_soupstrainer(self): | ||
293 | """Parsers should be able to work with SoupStrainers.""" | ||
294 | strainer = SoupStrainer("b") | ||
295 | soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>", | ||
296 | parse_only=strainer) | ||
297 | self.assertEqual(soup.decode(), "<b>bold</b>") | ||
298 | |||
299 | def test_single_quote_attribute_values_become_double_quotes(self): | ||
300 | self.assertSoupEquals("<foo attr='bar'></foo>", | ||
301 | '<foo attr="bar"></foo>') | ||
302 | |||
303 | def test_attribute_values_with_nested_quotes_are_left_alone(self): | ||
304 | text = """<foo attr='bar "brawls" happen'>a</foo>""" | ||
305 | self.assertSoupEquals(text) | ||
306 | |||
307 | def test_attribute_values_with_double_nested_quotes_get_quoted(self): | ||
308 | text = """<foo attr='bar "brawls" happen'>a</foo>""" | ||
309 | soup = self.soup(text) | ||
310 | soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' | ||
311 | self.assertSoupEquals( | ||
312 | soup.foo.decode(), | ||
313 | """<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""") | ||
314 | |||
315 | def test_ampersand_in_attribute_value_gets_escaped(self): | ||
316 | self.assertSoupEquals('<this is="really messed up & stuff"></this>', | ||
317 | '<this is="really messed up & stuff"></this>') | ||
318 | |||
319 | self.assertSoupEquals( | ||
320 | '<a href="http://example.org?a=1&b=2;3">foo</a>', | ||
321 | '<a href="http://example.org?a=1&b=2;3">foo</a>') | ||
322 | |||
323 | def test_escaped_ampersand_in_attribute_value_is_left_alone(self): | ||
324 | self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>') | ||
325 | |||
326 | def test_entities_in_strings_converted_during_parsing(self): | ||
327 | # Both XML and HTML entities are converted to Unicode characters | ||
328 | # during parsing. | ||
329 | text = "<p><<sacré bleu!>></p>" | ||
330 | expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" | ||
331 | self.assertSoupEquals(text, expected) | ||
332 | |||
333 | def test_smart_quotes_converted_on_the_way_in(self): | ||
334 | # Microsoft smart quotes are converted to Unicode characters during | ||
335 | # parsing. | ||
336 | quote = b"<p>\x91Foo\x92</p>" | ||
337 | soup = self.soup(quote) | ||
338 | self.assertEqual( | ||
339 | soup.p.string, | ||
340 | u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") | ||
341 | |||
342 | def test_non_breaking_spaces_converted_on_the_way_in(self): | ||
343 | soup = self.soup("<a> </a>") | ||
344 | self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) | ||
345 | |||
346 | def test_entities_converted_on_the_way_out(self): | ||
347 | text = "<p><<sacré bleu!>></p>" | ||
348 | expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") | ||
349 | soup = self.soup(text) | ||
350 | self.assertEqual(soup.p.encode("utf-8"), expected) | ||
351 | |||
352 | def test_real_iso_latin_document(self): | ||
353 | # Smoke test of interrelated functionality, using an | ||
354 | # easy-to-understand document. | ||
355 | |||
356 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. | ||
357 | unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' | ||
358 | |||
359 | # That's because we're going to encode it into ISO-Latin-1, and use | ||
360 | # that to test. | ||
361 | iso_latin_html = unicode_html.encode("iso-8859-1") | ||
362 | |||
363 | # Parse the ISO-Latin-1 HTML. | ||
364 | soup = self.soup(iso_latin_html) | ||
365 | # Encode it to UTF-8. | ||
366 | result = soup.encode("utf-8") | ||
367 | |||
368 | # What do we expect the result to look like? Well, it would | ||
369 | # look like unicode_html, except that the META tag would say | ||
370 | # UTF-8 instead of ISO-Latin-1. | ||
371 | expected = unicode_html.replace("ISO-Latin-1", "utf-8") | ||
372 | |||
373 | # And, of course, it would be in UTF-8, not Unicode. | ||
374 | expected = expected.encode("utf-8") | ||
375 | |||
376 | # Ta-da! | ||
377 | self.assertEqual(result, expected) | ||
378 | |||
379 | def test_real_shift_jis_document(self): | ||
380 | # Smoke test to make sure the parser can handle a document in | ||
381 | # Shift-JIS encoding, without choking. | ||
382 | shift_jis_html = ( | ||
383 | b'<html><head></head><body><pre>' | ||
384 | b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' | ||
385 | b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' | ||
386 | b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B' | ||
387 | b'</pre></body></html>') | ||
388 | unicode_html = shift_jis_html.decode("shift-jis") | ||
389 | soup = self.soup(unicode_html) | ||
390 | |||
391 | # Make sure the parse tree is correctly encoded to various | ||
392 | # encodings. | ||
393 | self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) | ||
394 | self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) | ||
395 | |||
    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        # The encoding is passed explicitly via from_encoding.
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        # Re-encoding the tree as UTF-8 must match a direct
        # decode/encode of the original bytes.
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))
406 | |||
407 | def test_meta_tag_reflects_current_encoding(self): | ||
408 | # Here's the <meta> tag saying that a document is | ||
409 | # encoded in Shift-JIS. | ||
410 | meta_tag = ('<meta content="text/html; charset=x-sjis" ' | ||
411 | 'http-equiv="Content-type"/>') | ||
412 | |||
413 | # Here's a document incorporating that meta tag. | ||
414 | shift_jis_html = ( | ||
415 | '<html><head>\n%s\n' | ||
416 | '<meta http-equiv="Content-language" content="ja"/>' | ||
417 | '</head><body>Shift-JIS markup goes here.') % meta_tag | ||
418 | soup = self.soup(shift_jis_html) | ||
419 | |||
420 | # Parse the document, and the charset is seemingly unaffected. | ||
421 | parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) | ||
422 | content = parsed_meta['content'] | ||
423 | self.assertEqual('text/html; charset=x-sjis', content) | ||
424 | |||
425 | # But that value is actually a ContentMetaAttributeValue object. | ||
426 | self.assertTrue(isinstance(content, ContentMetaAttributeValue)) | ||
427 | |||
428 | # And it will take on a value that reflects its current | ||
429 | # encoding. | ||
430 | self.assertEqual('text/html; charset=utf8', content.encode("utf8")) | ||
431 | |||
432 | # For the rest of the story, see TestSubstitutions in | ||
433 | # test_tree.py. | ||
434 | |||
435 | def test_html5_style_meta_tag_reflects_current_encoding(self): | ||
436 | # Here's the <meta> tag saying that a document is | ||
437 | # encoded in Shift-JIS. | ||
438 | meta_tag = ('<meta id="encoding" charset="x-sjis" />') | ||
439 | |||
440 | # Here's a document incorporating that meta tag. | ||
441 | shift_jis_html = ( | ||
442 | '<html><head>\n%s\n' | ||
443 | '<meta http-equiv="Content-language" content="ja"/>' | ||
444 | '</head><body>Shift-JIS markup goes here.') % meta_tag | ||
445 | soup = self.soup(shift_jis_html) | ||
446 | |||
447 | # Parse the document, and the charset is seemingly unaffected. | ||
448 | parsed_meta = soup.find('meta', id="encoding") | ||
449 | charset = parsed_meta['charset'] | ||
450 | self.assertEqual('x-sjis', charset) | ||
451 | |||
452 | # But that value is actually a CharsetMetaAttributeValue object. | ||
453 | self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) | ||
454 | |||
455 | # And it will take on a value that reflects its current | ||
456 | # encoding. | ||
457 | self.assertEqual('utf8', charset.encode("utf8")) | ||
458 | |||
459 | def test_tag_with_no_attributes_can_have_attributes_added(self): | ||
460 | data = self.soup("<a>text</a>") | ||
461 | data.a['foo'] = 'bar' | ||
462 | self.assertEqual('<a foo="bar">text</a>', data.a.decode()) | ||
463 | |||
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests for an XML tree builder.

    The host test class supplies `soup` and `assertSoupEquals`.
    """

    def test_docstring_generated(self):
        # Encoding an XML document prepends an XML declaration.
        expected = b'<?xml version="1.0" encoding="utf-8"?>\n<root/>'
        tree = self.soup("<root/>")
        self.assertEqual(tree.encode(), expected)

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
<script type="text/javascript">
</script>
"""
        tree = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        tree.script.string = 'console.log("< < hey > > ");'
        output = tree.encode()
        # The angle brackets must come out as entities.
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in output)

    def test_can_parse_unicode_document(self):
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        tree = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', tree.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        tree = self.soup(markup)
        serialized = unicode(tree.rss)
        self.assertEqual(serialized, markup)

    def test_docstring_includes_correct_encoding(self):
        # The declaration names whatever encoding was requested.
        expected = b'<?xml version="1.0" encoding="latin1"?>\n<root/>'
        tree = self.soup("<root/>")
        self.assertEqual(tree.encode("latin1"), expected)

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        filler = b'0' * (2**12)
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + filler
                  + b'</root>')
        tree = self.soup(markup)
        self.assertEqual(tree.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root = self.soup(markup).root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        tree = self.soup(markup)
        serialized = unicode(tree.p)
        self.assertEqual(serialized, markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        tree = self.soup(markup)
        serialized = unicode(tree.foo)
        self.assertEqual(serialized, markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        tree = self.soup(markup)
        serialized = unicode(tree.foo)
        self.assertEqual(serialized, markup)
546 | |||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way. This override
        # deliberately disables the inherited test.
        pass

    def test_html_tags_have_namespace(self):
        tree = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", tree.a.namespace)

    def test_svg_tags_have_namespace(self):
        svg_namespace = "http://www.w3.org/2000/svg"
        tree = self.soup('<svg><circle/></svg>')
        # Both the <svg> tag and its child are in the SVG namespace.
        self.assertEqual(svg_namespace, tree.svg.namespace)
        self.assertEqual(svg_namespace, tree.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        mathml_namespace = 'http://www.w3.org/1998/Math/MathML'
        tree = self.soup('<math><msqrt>5</msqrt></math>')
        # Both the <math> tag and its child are in the MathML namespace.
        self.assertEqual(mathml_namespace, tree.math.namespace)
        self.assertEqual(mathml_namespace, tree.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        tree = self.soup(markup)
        # The declaration is kept as a Comment node, followed by <html>.
        first_node = tree.contents[0]
        self.assertTrue(isinstance(first_node, Comment))
        self.assertEqual(first_node, '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", first_node.next_element.name)
581 | |||
def skipIf(condition, reason):
    """Return a decorator that disables an item when `condition` is true.

    When `condition` is truthy the decorated item is replaced by a
    do-nothing function that takes one required positional argument
    plus any others, and returns None. Otherwise the item is returned
    unchanged. `reason` is accepted but not used.
    """
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        # The condition is evaluated at decoration time.
        return nothing if condition else test_item

    return decorator
diff --git a/bitbake/lib/bs4/tests/__init__.py b/bitbake/lib/bs4/tests/__init__.py new file mode 100644 index 0000000000..142c8cc3f1 --- /dev/null +++ b/bitbake/lib/bs4/tests/__init__.py | |||
@@ -0,0 +1 @@ | |||
"The beautifulsoup tests." | |||
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py new file mode 100644 index 0000000000..92ad10fb04 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_builder_registry.py | |||
@@ -0,0 +1,141 @@ | |||
1 | """Tests of the builder registry.""" | ||
2 | |||
3 | import unittest | ||
4 | |||
5 | from bs4 import BeautifulSoup | ||
6 | from bs4.builder import ( | ||
7 | builder_registry as registry, | ||
8 | HTMLParserTreeBuilder, | ||
9 | TreeBuilderRegistry, | ||
10 | ) | ||
11 | |||
12 | try: | ||
13 | from bs4.builder import HTML5TreeBuilder | ||
14 | HTML5LIB_PRESENT = True | ||
15 | except ImportError: | ||
16 | HTML5LIB_PRESENT = False | ||
17 | |||
18 | try: | ||
19 | from bs4.builder import ( | ||
20 | LXMLTreeBuilderForXML, | ||
21 | LXMLTreeBuilder, | ||
22 | ) | ||
23 | LXML_PRESENT = True | ||
24 | except ImportError: | ||
25 | LXML_PRESENT = False | ||
26 | |||
27 | |||
class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        # Feature combinations resolve to the expected builder; the
        # optional builders are only checked when they are installed.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)

        self.assertEqual(registry.lookup('strict', 'html'),
                         HTMLParserTreeBuilder)

        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                             HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
        else:
            # Without lxml there is no XML builder at all, and HTML
            # falls back to html5lib or the standard-library parser.
            self.assertEqual(registry.lookup('xml'), None)
            if HTML5LIB_PRESENT:
                expected_html_builder = HTML5TreeBuilder
            else:
                expected_html_builder = HTMLParserTreeBuilder
            self.assertEqual(registry.lookup('html'), expected_html_builder)

    def test_named_library(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                             HTML5TreeBuilder)

        # The standard-library parser is looked up unconditionally.
        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        # You can pass in a string.
        BeautifulSoup("", features="html")
        # Or a list of strings.
        BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")
79 | |||
class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""

    def setUp(self):
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        """Create a stub builder class with the given features and register it.

        Returns the newly created class.
        """
        class_name = 'Builder_' + '_'.join(feature_list)
        stub = type(class_name, (object,), {'features' : feature_list})
        self.registry.register(stub)
        return stub

    def test_register_with_no_features(self):
        featureless = self.builder_for_features()

        # Since the builder advertises no features, you can't find it
        # by looking up features.
        self.assertEqual(self.registry.lookup('foo'), None)

        # But you can find it by doing a lookup with no features, if
        # this happens to be the only registered builder.
        self.assertEqual(self.registry.lookup(), featureless)

    def test_register_with_features_makes_lookup_succeed(self):
        registered = self.builder_for_features('foo', 'bar')
        for feature in ('foo', 'bar'):
            self.assertEqual(self.registry.lookup(feature), registered)

    def test_lookup_fails_when_no_builder_implements_feature(self):
        self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('baz'), None)

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        self.builder_for_features('foo')
        latest = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), latest)

    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertEqual(self.registry.lookup(), None)

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        # Registration order matters, so keep it exactly as listed.
        self.builder_for_features('foo')
        self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        self.builder_for_features('bar')
        self.builder_for_features('foo')

        # There are two builders featuring 'foo' and 'bar', but
        # the one that also features 'quux' was registered later.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                         has_both_late)

        # There is only one builder featuring 'foo', 'bar', and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                         has_both_early)

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        # No single builder offers both 'bar' and 'baz'.
        self.builder_for_features('foo', 'bar')
        self.builder_for_features('foo', 'baz')
        self.assertEqual(self.registry.lookup('bar', 'baz'), None)
diff --git a/bitbake/lib/bs4/tests/test_docs.py b/bitbake/lib/bs4/tests/test_docs.py new file mode 100644 index 0000000000..5b9f677093 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_docs.py | |||
@@ -0,0 +1,36 @@ | |||
1 | "Test harness for doctests." | ||
2 | |||
3 | # pylint: disable-msg=E0611,W0142 | ||
4 | |||
5 | __metaclass__ = type | ||
# additional_tests() exists only as commented-out code further down in this
# module, so exporting its name would make ``from <module> import *`` fail
# with AttributeError.  Export nothing until the function is restored.
__all__ = []
9 | |||
10 | import atexit | ||
11 | import doctest | ||
12 | import os | ||
13 | #from pkg_resources import ( | ||
14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) | ||
15 | import unittest | ||
16 | |||
17 | DOCTEST_FLAGS = ( | ||
18 | doctest.ELLIPSIS | | ||
19 | doctest.NORMALIZE_WHITESPACE | | ||
20 | doctest.REPORT_NDIFF) | ||
21 | |||
22 | |||
23 | # def additional_tests(): | ||
24 | # "Run the doc tests (README.txt and docs/*, if any exist)" | ||
25 | # doctest_files = [ | ||
26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] | ||
27 | # if resource_exists('bs4', 'docs'): | ||
28 | # for name in resource_listdir('bs4', 'docs'): | ||
29 | # if name.endswith('.txt'): | ||
30 | # doctest_files.append( | ||
31 | # os.path.abspath( | ||
32 | # resource_filename('bs4', 'docs/%s' % name))) | ||
33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) | ||
34 | # atexit.register(cleanup_resources) | ||
35 | # return unittest.TestSuite(( | ||
36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) | ||
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py new file mode 100644 index 0000000000..594c3e1f26 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_html5lib.py | |||
@@ -0,0 +1,85 @@ | |||
1 | """Tests to ensure that the html5lib tree builder generates good trees.""" | ||
2 | |||
3 | import warnings | ||
4 | |||
# Record whether the html5lib tree builder can be imported, so the test
# class below can be skipped when it is missing.  The exception object is
# not needed, and the bare ``except ImportError:`` form parses on both
# Python 2 and Python 3.
try:
    from bs4.builder import HTML5TreeBuilder
except ImportError:
    HTML5LIB_PRESENT = False
else:
    HTML5LIB_PRESENT = True
10 | from bs4.element import SoupStrainer | ||
11 | from bs4.testing import ( | ||
12 | HTML5TreeBuilderSmokeTest, | ||
13 | SoupTest, | ||
14 | skipIf, | ||
15 | ) | ||
16 | |||
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """Smoke tests for the html5lib builder.

    See ``HTML5TreeBuilderSmokeTest`` for the inherited tests.
    """

    @property
    def default_builder(self):
        """Return a new html5lib-backed tree builder."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers: the
        # full document comes back and a warning is recorded instead.
        markup = "<p>A <b>bold</b> statement.</p>"
        only_bold = SoupStrainer("b")
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup(markup, parse_only=only_bold)

        expected_document = self.document_for(markup)
        self.assertEqual(soup.decode(), expected_document)

        first_message = str(caught[0].message)
        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only"
            in first_message)

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        nested = (
            '<table id="1">'
            '<tr>'
            "<td>Here's another table:"
            '<table id="2">'
            '<tr><td>foo</td></tr>'
            '</table></td>')
        nested_with_tbody = (
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')
        self.assertSoupEquals(nested, nested_with_tbody)

        # Only one argument is passed here; assertSoupEquals lives in
        # bs4.testing (not shown) and presumably compares the parsed
        # result against the input itself.
        sectioned = (
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
        self.assertSoupEquals(sectioned)

    def test_xml_declaration_followed_by_doctype(self):
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        paragraph = soup.p
        self.assertEqual(b"<p>foo</p>", paragraph.encode())

    def test_reparented_markup(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        expected_body = u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>"
        soup = self.soup(markup)
        self.assertEqual(expected_body, soup.body.decode())
        paragraphs = soup.find_all('p')
        self.assertEqual(2, len(paragraphs))

    def test_reparented_markup_ends_with_whitespace(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        expected_body = u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>"
        soup = self.soup(markup)
        self.assertEqual(expected_body, soup.body.decode())
        paragraphs = soup.find_all('p')
        self.assertEqual(2, len(paragraphs))
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py new file mode 100644 index 0000000000..bcb5ed232f --- /dev/null +++ b/bitbake/lib/bs4/tests/test_htmlparser.py | |||
@@ -0,0 +1,19 @@ | |||
1 | """Tests to ensure that the html.parser tree builder generates good | ||
2 | trees.""" | ||
3 | |||
4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest | ||
5 | from bs4.builder import HTMLParserTreeBuilder | ||
6 | |||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke tests for the html.parser tree builder."""

    @property
    def default_builder(self):
        builder = HTMLParserTreeBuilder()
        return builder

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so this test is
        # deliberately a no-op.
        return None

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so this test is
        # deliberately a no-op.
        return None
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py new file mode 100644 index 0000000000..2b2e9b7e78 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_lxml.py | |||
@@ -0,0 +1,91 @@ | |||
1 | """Tests to ensure that the lxml tree builder generates good trees.""" | ||
2 | |||
3 | import re | ||
4 | import warnings | ||
5 | |||
# Detect lxml once at import time.  LXML_VERSION falls back to (0,) so
# that version comparisons made further down still work when lxml is
# missing.  The bare ``except ImportError:`` form parses on both Python 2
# and Python 3; the exception object was never used.
try:
    import lxml.etree
except ImportError:
    LXML_PRESENT = False
    LXML_VERSION = (0,)
else:
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION

# The lxml-backed builders are imported only when lxml itself is present.
if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16 | |||
17 | from bs4 import ( | ||
18 | BeautifulSoup, | ||
19 | BeautifulStoneSoup, | ||
20 | ) | ||
21 | from bs4.element import Comment, Doctype, SoupStrainer | ||
22 | from bs4.testing import skipIf | ||
23 | from bs4.tests import test_htmlparser | ||
24 | from bs4.testing import ( | ||
25 | HTMLTreeBuilderSmokeTest, | ||
26 | XMLTreeBuilderSmokeTest, | ||
27 | SoupTest, | ||
28 | skipIf, | ||
29 | ) | ||
30 | |||
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke tests for the lxml HTML builder.

    See ``HTMLTreeBuilderSmokeTest`` for the inherited tests.
    """

    @property
    def default_builder(self):
        """Return a new lxml-backed HTML tree builder."""
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # Numeric character references far beyond the Unicode range are
        # expected to vanish from the output.  Three spellings are
        # checked: a huge decimal, a huge hexadecimal and a smaller
        # decimal reference.
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))
        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Compare with newlines removed, and with the XML declaration
        # removed from the expected side only.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))
81 | |||
82 | |||
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """Smoke tests for the lxml XML builder.

    See ``XMLTreeBuilderSmokeTest`` for the inherited tests.
    """

    @property
    def default_builder(self):
        """Return a new lxml-backed XML tree builder."""
        return LXMLTreeBuilderForXML()
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py new file mode 100644 index 0000000000..47ac245f99 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_soup.py | |||
@@ -0,0 +1,434 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Tests of Beautiful Soup as a whole.""" | ||
3 | |||
4 | import logging | ||
5 | import unittest | ||
6 | import sys | ||
7 | import tempfile | ||
8 | |||
9 | from bs4 import ( | ||
10 | BeautifulSoup, | ||
11 | BeautifulStoneSoup, | ||
12 | ) | ||
13 | from bs4.element import ( | ||
14 | CharsetMetaAttributeValue, | ||
15 | ContentMetaAttributeValue, | ||
16 | SoupStrainer, | ||
17 | NamespacedAttribute, | ||
18 | ) | ||
19 | import bs4.dammit | ||
20 | from bs4.dammit import ( | ||
21 | EntitySubstitution, | ||
22 | UnicodeDammit, | ||
23 | ) | ||
24 | from bs4.testing import ( | ||
25 | SoupTest, | ||
26 | skipIf, | ||
27 | ) | ||
28 | import warnings | ||
29 | |||
# Record whether the lxml-backed builders can be imported.  The bare
# ``except ImportError:`` form parses on both Python 2 and Python 3; the
# exception object was never used.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
except ImportError:
    LXML_PRESENT = False
else:
    LXML_PRESENT = True

# Interpreter-version flags used in skipIf conditions further down.
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
38 | |||
class TestConstructor(SoupTest):
    """Checks on what the soup constructor accepts as input."""

    def test_short_unicode_input(self):
        # Two LATIN SMALL LETTER E WITH ACUTE characters, written as
        # escapes so the literal cannot be damaged by a wrong source
        # encoding.
        data = u"<h1>\xe9\xe9</h1>"
        soup = self.soup(data)
        self.assertEqual(u"\xe9\xe9", soup.h1.string)

    def test_embedded_null(self):
        # A NUL character inside the text must survive parsing.
        data = u"<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual(u"foo\0bar", soup.h1.string)
50 | |||
51 | |||
class TestDeprecatedConstructorArguments(SoupTest):
    """The old camelCase constructor keywords still work, with a warning."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        message = str(caught[0].message)
        # The warning has to mention both the old and the new spelling.
        for keyword in ("parseOnlyThese", "parse_only"):
            self.assertTrue(keyword in message)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as caught:
            encoded = b"\xc3\xa9"
            soup = self.soup(encoded, fromEncoding="utf8")
        message = str(caught[0].message)
        # The warning has to mention both the old and the new spelling.
        for keyword in ("fromEncoding", "from_encoding"):
            self.assertTrue(keyword in message)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # A keyword that is neither current nor deprecated is rejected.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
74 | |||
class TestWarnings(SoupTest):
    """Warnings issued when the markup looks like a filename or a URL."""

    def test_disk_file_warning(self):
        # Leaving the ``with`` block closes the temporary file, exactly
        # as an explicit close() in a ``finally`` clause would.
        with tempfile.NamedTemporaryFile() as handle:
            filename = handle.name
            with warnings.catch_warnings(record=True) as caught:
                soup = self.soup(filename)
            message = str(caught[0].message)
            self.assertTrue("looks like a filename" in message)

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup(filename)
        self.assertEqual(0, len(caught))

    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup("http://www.crummy.com/")
        message = str(caught[0].message)
        self.assertTrue("looks like a URL" in message)

        # A URL followed by other text is expected to record no warning.
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(caught))
102 | |||
class TestSelectiveParsing(SoupTest):
    """Parsing restricted to part of the document by a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags and their contents are expected in the result.
        only_bold = SoupStrainer("b")
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        soup = self.soup(markup, parse_only=only_bold)
        encoded = soup.encode()
        self.assertEqual(encoded, b"<b>Yes</b><b>Yes <c>Yes</c></b>")
110 | |||
111 | |||
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        # The class itself is used; no instance is created.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both kinds of quote in the value, the inner double quotes
        # are expected to be escaped as &quot; and the result wrapped in
        # double quotes.
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, including the one that
        # starts an existing entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves the existing entity
        # reference alone and escapes only the bare ampersand.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
179 | |||
180 | |||
class TestEncodingConversion(SoupTest):
    """Beautiful Soup's ability to decode from and encode to various encodings."""

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        document = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.unicode_data = document
        self.utf8_data = document.encode("utf-8")
        # Spell the encoded bytes out so the fixture can be read at a glance.
        spelled_out = b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        self.assertEqual(self.utf8_data, spelled_out)

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        def without_chardet(_markup):
            return None

        real_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = without_chardet
            plain = b"<foo>a</foo>"
            soup = self.soup(plain)
            decoded = soup.decode()
            self.assertTrue(isinstance(decoded, unicode))
            self.assertEqual(decoded, self.document_for(plain.decode()))
            self.assertEqual(soup.original_encoding.lower(), "utf-8")
        finally:
            # Undo both global changes even if an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = real_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup = self.soup(self.unicode_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup = self.soup(self.utf8_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup = self.soup(self.unicode_data)
        encoded = soup.encode('utf-8')
        self.assertEqual(encoded, self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        round_tripped = self.soup(markup).div.encode("utf8")
        self.assertEqual(round_tripped, markup.encode("utf8"))
240 | |||
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = u"I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # "xml" mode is expected to produce numeric character references.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        # "html" mode is expected to produce named entities.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # The sample word is written with escapes so the literal cannot
        # be damaged by a wrong source encoding.
        utf8_data = u"R\xe4ksm\xf6rg\xe5s".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"R\xe4ksm\xf6rg\xe5s".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        # The same charset declaration in four quoting/spacing variants.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # The parameter is deliberately not called ``str`` so that
            # the builtin is not shadowed.
            def noop(markup):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Undo both global changes even if an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>\xe1\xe9</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes the snowmen into mojibake.

        # But if we run it through detwingle, it's fixed.  The expected
        # text is written with escapes: three snowmen, the quoted
        # sentence in curly double quotes, then three more snowmen.
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"\u2603\u2603\u2603\u201cHi, I like Windows!\u201d"
            u"\u2603\u2603\u2603",
            fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            # NOTE(review): this literal is four separate code points,
            # not one 4-byte character; its UTF-8 form still ends in
            # \x93, so the check below holds.  Confirm the intent.
            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)
393 | |||
class TestNamedspacedAttribute(SoupTest):
    """Equality behaviour of NamespacedAttribute."""

    def test_name_may_be_none(self):
        prefix_only = NamespacedAttribute("xmlns", None)
        self.assertEqual(prefix_only, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attribute = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attribute)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")

        same_everything = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, same_everything)

        # The actual namespace is not considered.
        other_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, other_namespace)

        # But name and prefix are important.
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(reference, other_name)

        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(reference, other_prefix)
419 | |||
420 | |||
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Attribute values that swap in a new charset name when encoded."""

    def test_charset_meta_attribute_value(self):
        # This method needs its own name.  When it shared a name with
        # the test below, the later definition replaced it in the class
        # body and this test never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py new file mode 100644 index 0000000000..f8515c0ea1 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_tree.py | |||
@@ -0,0 +1,1829 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Tests for Beautiful Soup's tree traversal methods. | ||
3 | |||
4 | The tree traversal methods are the main advantage of using Beautiful | ||
5 | Soup over just using a parser. | ||
6 | |||
7 | Different parsers will build different Beautiful Soup trees given the | ||
8 | same markup, but all Beautiful Soup trees can be traversed with the | ||
9 | methods tested here. | ||
10 | """ | ||
11 | |||
12 | import copy | ||
13 | import pickle | ||
14 | import re | ||
15 | import warnings | ||
16 | from bs4 import BeautifulSoup | ||
17 | from bs4.builder import ( | ||
18 | builder_registry, | ||
19 | HTMLParserTreeBuilder, | ||
20 | ) | ||
21 | from bs4.element import ( | ||
22 | CData, | ||
23 | Comment, | ||
24 | Doctype, | ||
25 | NavigableString, | ||
26 | SoupStrainer, | ||
27 | Tag, | ||
28 | ) | ||
29 | from bs4.testing import ( | ||
30 | SoupTest, | ||
31 | skipIf, | ||
32 | ) | ||
33 | |||
# Record which optional tree builders the registry can supply, so tests
# that need one of them can check for it first.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
36 | |||
class TreeTest(SoupTest):
    """Base class providing assertions about a sequence of selected tags."""

    def assertSelects(self, tags, should_match):
        """Assert that the .string of each tag, in order, is should_match.

        Meant for tests whose markup defines several tags holding a
        single string each, some of which are then picked out by a
        search.
        """
        found_strings = [element.string for element in tags]
        self.assertEqual(found_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that the 'id' of each tag, in order, is should_match.

        Meant for tests whose markup defines several tags carrying an
        id each, some of which are then picked out by a search.
        """
        found_ids = [element['id'] for element in tags]
        self.assertEqual(found_ids, should_match)
56 | |||
57 | |||
class TestFind(TreeTest):
    """Smoke tests for find().

    find() just calls find_all() with limit=1, so the thorough coverage
    lives in the find_all() tests rather than here.
    """

    def test_find_tag(self):
        """find() returns the first tag with the requested name."""
        document = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = document.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        """A non-ASCII string can be located by its text."""
        document = self.soup(u'<h1>Räksmörgås</h1>')
        match = document.find(text=u'Räksmörgås')
        self.assertEqual(match, u'Räksmörgås')

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        document = self.soup("<a>foo</a><b>bar</b>")
        every_tag = document.find_all()
        self.assertEqual(2, len(every_tag))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        document = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        anchors = document.find_all('a')
        self.assertEqual(2, len(anchors))
82 | |||
class TestFindAll(TreeTest):
    """Core behaviour of the find_all() method."""

    def test_find_all_text_nodes(self):
        """Text nodes can be searched by string, list, regex or True."""
        document = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = [u"Foo", u"bar", u'\xbb']

        # A plain string has to match exactly.
        self.assertEqual(document.find_all(text="bar"), [u"bar"])
        # A list matches any one of its members.
        self.assertEqual(
            document.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
        # A compiled regular expression is tried against each string.
        any_text = re.compile('.*')
        self.assertEqual(document.find_all(text=any_text), every_string)
        # True matches every text node.
        self.assertEqual(document.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """The limit argument caps how many results come back."""
        document = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        all_five = ["1", "2", "3", "4", "5"]

        self.assertSelects(document.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(document.find_all('a', limit=1), ["1"])
        self.assertSelects(document.find_all('a', limit=10), all_five)

        # A limit of 0 means no limit.
        self.assertSelects(document.find_all('a', limit=0), all_five)

    def test_calling_a_tag_is_calling_findall(self):
        """Calling the soup or a tag behaves like calling find_all()."""
        document = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(document('a', limit=1), ["1"])
        self.assertSelects(document.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        """A list that contains itself is a harmless search value."""
        document = self.soup("<a></a>")
        # Build a list whose only member is the list itself.
        cyclic = []
        cyclic.append(cyclic)

        # Without special code in _normalize_search_value, this would
        # cause infinite recursion.
        self.assertEqual([], document.find_all(cyclic))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        document = self.soup("<a></a>")

        by_name = document.find_all("a")
        self.assertTrue(hasattr(by_name, "source"))

        by_true = document.find_all(True)
        self.assertTrue(hasattr(by_true, "source"))

        by_text = document.find_all(text="foo")
        self.assertTrue(hasattr(by_text, "source"))
139 | |||
140 | |||
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by colon-qualified tag and attribute names."""

    def test_find_by_namespaced_name(self):
        """Prefixed names work both as a tag name and as an attrs key."""
        document = self.soup(
            '<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')

        square_root = document.find("mathml:msqrt")
        self.assertEqual("4", square_root.string)

        filled = document.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", filled.name)
147 | |||
148 | |||
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        """Parse a small document with a nested <a> tag into self.tree."""
        # Name this class (not TreeTest) so the lookup starts right after
        # this class and no setUp() in the inheritance chain is skipped.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup(
            "<a>First tag.</a>\n"
            "<b>Second tag.</b>\n"
            "<c>Third <a>Nested tag.</a> tag.</c>")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        # A tag name can be combined with a text restriction given as
        # an exact string, True, or a regular expression.
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        # Calling the tree directly gives the same result as find_all.
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        # A SoupStrainer can stand in for the tag name.
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        # A list of names matches tags with any of those names.
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        # A dict of names matches tags named by any of its keys.
        self.assertSelects(
            self.tree.find_all({'a': True, 'b': True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        # A regular expression is matched against the tag names.
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup(
            '<a id="a">Match 1.</a>\n'
            '<a id="1">Does not match.</a>\n'
            '<b id="b">Match 2.</a>')

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
214 | |||
215 | |||
class TestFindAllByAttribute(TreeTest):
    """Tests for find_all() searches keyed on attribute values."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup(
            '\n'
            '<a id="first">Matching a.</a>\n'
            '<a id="second">\n'
            'Non-matching <b id="first">Matching b.</b>a.\n'
            '</a>')
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        # The Hebrew title is spelled with escapes so the literal cannot
        # be damaged by a lossy re-encoding of this file.  The value can
        # be searched for as UTF-8 bytes, as text, or inside a list.
        hebrew = u"\u05dd\u05d5\u05dc\u05e9"
        peace = hebrew.encode("utf8")
        data = (u'<a title="' + hebrew + u'"></a>').encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual(
            [soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup(
            '\n'
            '<a name="name1" class="class1">Name match.</a>\n'
            '<a name="name2" class="class2">Class match.</a>\n'
            '<a name="name3" class="class3">Non-match.</a>\n'
            "<name1>A tag called 'name1'.</name1>\n")

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name': 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class': 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup(
            '\n'
            '<a class="1">Class 1.</a>\n'
            '<a class="2">Class 2.</a>\n'
            '<b class="1">Class 1.</b>\n'
            '<c class="3 4">Class 3 and 4.</c>\n')

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        # A regular expression or a function given as 'attrs' is
        # applied to the CSS class.
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        # A SoupStrainer built from an attrs dict can drive the search.
        tree = self.soup(
            '\n'
            '<a id="first">Match.</a>\n'
            '<a id="second">Non-match.</a>')

        strainer = SoupStrainer(attrs={'id': 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup(
            '<a id="1">ID present.</a>\n'
            '<a>No ID present.</a>\n'
            '<a id="">ID is empty.</a>')
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value.
        tree = self.soup(
            '<a id="1">ID present.</a>\n'
            '<a>No ID present.</a>\n'
            '<a id="">ID is empty.</a>')
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup(
            '<a id=1>Unquoted attribute.</a>\n'
            '<a id="1">Quoted attribute.</a>')

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup(
            '<a id="1">1</a>\n'
            '<a id="2">2</a>\n'
            '<a id="3">3</a>\n'
            '<a>No ID.</a>')
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup(
            '<a id="a">One a.</a>\n'
            '<a id="aa">Two as.</a>\n'
            '<a id="ab">Mixed as and bs.</a>\n'
            '<a id="b">One b.</a>\n'
            '<a>No ID.</a>')

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        # The name and the text restriction must both be satisfied.
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        # The text restriction also matches a string nested further
        # down inside the tag.
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        # An attribute restriction can be combined with a text one.
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
395 | |||
396 | |||
397 | |||
398 | |||
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        """index() reports each child's own position, even among
        look-alike siblings, and raises ValueError for a non-child."""
        markup = (
            "<div>\n"
            "<a>Identical</a>\n"
            "<b>Not identical</b>\n"
            "<a>Identical</a>\n"
            "\n"
            "<c><d>Identical with child</d></c>\n"
            "<b>Also not identical</b>\n"
            "<c><d>Identical with child</d></c>\n"
            "</div>")
        tree = self.soup(markup)
        container = tree.div
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))
        self.assertRaises(ValueError, tree.index, 1)
415 | |||
416 | |||
class TestParentOperations(TreeTest):
    """Navigating and searching upward through an element's ancestors."""

    def setUp(self):
        """Parse nested <ul> tags and remember the innermost <b> tag."""
        super(TestParentOperations, self).setUp()
        markup = (
            '<ul id="empty"></ul>\n'
            '<ul id="top">\n'
            '<ul id="middle">\n'
            '<ul id="bottom">\n'
            '<b>Start here</b>\n'
            '</ul>\n'
            '</ul>')
        self.tree = self.soup(markup)
        self.start = self.tree.b

    def test_parent(self):
        """Chained .parent lookups walk outward one level at a time."""
        parent = self.start.parent
        self.assertEqual(parent['id'], 'bottom')
        grandparent = self.start.parent.parent
        self.assertEqual(grandparent['id'], 'middle')
        great_grandparent = self.start.parent.parent.parent
        self.assertEqual(great_grandparent['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        """The first top-level node has the soup object as its parent."""
        first_node = self.tree.contents[0]
        self.assertEqual(first_node.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        """The soup object's own parent is None."""
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        """find_parents() lists matching ancestors, nearest first."""
        every_ul = self.start.find_parents('ul')
        self.assertSelectsIDs(every_ul, ['bottom', 'middle', 'top'])

        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        """find_parent() returns the nearest matching ancestor."""
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')
        topmost = self.start.find_parent('ul', id='top')
        self.assertEqual(topmost['id'], 'top')

    def test_parent_of_text_element(self):
        """A text node's parent is the tag that contains it."""
        string_node = self.tree.find(text="Start here")
        self.assertEqual(string_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        """find_parent() can be called on a text node."""
        string_node = self.tree.find(text="Start here")
        self.assertEqual(string_node.find_parent('ul')['id'], 'bottom')

    def test_parent_generator(self):
        """The .parents generator yields ancestors, nearest first."""
        ancestor_ids = []
        for ancestor in self.start.parents:
            # Skip anything that has no 'id' to report.
            if ancestor is None or 'id' not in ancestor.attrs:
                continue
            ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
466 | |||
467 | |||
class ProximityTest(TreeTest):
    """Base class supplying a small document with three <b> tags."""

    def setUp(self):
        """Parse the shared document into self.tree."""
        # Name this class (not TreeTest) so the lookup starts right after
        # this class and no setUp() in the inheritance chain is skipped.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
474 | |||
475 | |||
class TestNextOperations(ProximityTest):
    """Tests for next_element and the find_next / find_all_next searches."""

    def setUp(self):
        """Start every test from the first <b> tag of the document."""
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        # The element after a tag is its own text; the one after that
        # is the following tag.
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        # Search forward by tag name, then by id.  (A stray call that
        # repeated the id search and threw its result away was removed.)
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        # The same searches work when starting from a text node.
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
515 | |||
class TestPreviousOperations(ProximityTest):
    """previous_element and the find_previous / find_all_previous searches."""

    def setUp(self):
        """Start every test from the final text node, "Three"."""
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        """Stepping backwards reaches the enclosing tag, then older text."""
        one_back = self.end.previous_element
        self.assertEqual(one_back['id'], "3")
        two_back = self.end.previous_element.previous_element
        self.assertEqual(two_back, "Two")

    def test_previous_of_first_item_is_none(self):
        """Nothing precedes the <html> tag."""
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        """Placeholder: the assertion below is disabled."""
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        """find_all_previous() lists matches walking backwards."""
        # The <b> tag holding "Three" precedes the "Three" text node,
        # so that tag is itself part of the results.
        preceding_bold = self.end.find_all_previous('b')
        self.assertSelects(preceding_bold, ["Three", "Two", "One"])
        self.assertSelects(self.end.find_all_previous(id=1), ["One"])

    def test_find_previous(self):
        """find_previous() returns the closest preceding match."""
        nearest_bold = self.end.find_previous('b')
        self.assertEqual(nearest_bold['id'], '3')
        self.assertEqual(self.end.find_previous(text="One"), "One")

    def test_find_previous_for_text_element(self):
        """The backward searches work when starting from a text node."""
        string_node = self.tree.find(text="Three")
        self.assertEqual(string_node.find_previous("b").string, "Three")
        self.assertSelects(
            string_node.find_all_previous("b"), ["Three", "Two", "One"])

    def test_previous_generator(self):
        """The .previous_elements generator walks back to <html>."""
        origin = self.tree.find(text="One")
        earlier = list(origin.previous_elements)

        # Four elements come before "One": the <b> tag around it, then
        # the <body>, <head> and <html> tags.
        bold, body, head, html = earlier
        self.assertEqual(bold['id'], '1')
        self.assertEqual(body.name, "body")
        self.assertEqual(head.name, "head")
        self.assertEqual(html.name, "html")
565 | |||
566 | |||
class SiblingTest(TreeTest):
    """Base class supplying a document of sibling and nested <span> tags."""

    def setUp(self):
        """Parse the shared markup, minus its layout whitespace, into
        self.tree."""
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.  The pattern is a raw string so that
        # \s reaches the regex engine instead of being an invalid
        # string escape.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)
587 | |||
588 | |||
class TestNextSibling(SiblingTest):
    """next_sibling and the find_next_sibling(s) searches."""

    def setUp(self):
        """Start every test from the span whose id is "1"."""
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        """The soup object itself has no next sibling."""
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        """next_sibling stays on one level of the tree."""
        following = self.start.next_sibling
        self.assertEqual(following['id'], '2')
        two_along = self.start.next_sibling.next_sibling
        self.assertEqual(two_along['id'], '3')

        # next_element, by contrast, descends into the nested span.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        """An element that is last within its parent has no next sibling."""
        self.assertEqual(self.tree.html.next_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.next_sibling, None)

        final_span = self.tree.find(id="4")
        self.assertEqual(final_span.next_sibling, None)

    def test_find_next_sibling(self):
        """find_next_sibling() returns the nearest matching sibling."""
        nearest = self.start.find_next_sibling('span')
        self.assertEqual(nearest['id'], '2')

    def test_next_siblings(self):
        """find_next_siblings() lists the matching later siblings."""
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        only_third = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(only_third, ['3'])

    def test_next_sibling_for_text_element(self):
        """Sibling navigation and searches work from a text node."""
        document = self.soup("Foo<b>bar</b>baz")
        origin = document.find(text="Foo")
        self.assertEqual(origin.next_sibling.name, 'b')
        self.assertEqual(origin.next_sibling.next_sibling, 'baz')

        self.assertSelects(origin.find_next_siblings('b'), ['bar'])
        self.assertEqual(origin.find_next_sibling(text="baz"), "baz")
        self.assertEqual(origin.find_next_sibling(text="nonesuch"), None)
632 | |||
633 | |||
class TestPreviousSibling(SiblingTest):
    """previous_sibling and the find_previous_sibling(s) searches."""

    def setUp(self):
        """Start every test from the span whose id is "4"."""
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        """The soup object itself has no previous sibling."""
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        """previous_sibling stays on one level of the tree."""
        preceding = self.end.previous_sibling
        self.assertEqual(preceding['id'], '3')
        two_before = self.end.previous_sibling.previous_sibling
        self.assertEqual(two_before['id'], '2')

        # previous_element, by contrast, lands on the nested span.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        """An element that is first within its parent has no previous
        sibling."""
        self.assertEqual(self.tree.html.previous_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.previous_sibling, None)

        leading_span = self.tree.find(id="1")
        self.assertEqual(leading_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        """find_previous_sibling() returns the nearest matching sibling."""
        nearest = self.end.find_previous_sibling('span')
        self.assertEqual(nearest['id'], '3')

    def test_previous_siblings(self):
        """find_previous_siblings() lists matches, nearest first."""
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        only_first = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(only_first, ['1'])

    def test_previous_sibling_for_text_element(self):
        """Sibling navigation and searches work from a text node."""
        document = self.soup("Foo<b>bar</b>baz")
        origin = document.find(text="baz")
        self.assertEqual(origin.previous_sibling.name, 'b')
        self.assertEqual(origin.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(origin.find_previous_siblings('b'), ['bar'])
        self.assertEqual(origin.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(origin.find_previous_sibling(text="nonesuch"), None)
677 | |||
678 | |||
class TestTagCreation(SoupTest):
    """Creating brand-new tags and strings through the soup object."""

    def test_new_tag(self):
        """new_tag() builds a parentless Tag with the given attributes."""
        document = self.soup("")
        created = document.new_tag("foo", bar="baz")
        self.assertTrue(isinstance(created, Tag))
        self.assertEqual("foo", created.name)
        self.assertEqual({"bar": "baz"}, created.attrs)
        self.assertEqual(None, created.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        """A new tag follows its soup's builder when rendered empty."""
        if XML_BUILDER_PRESENT:
            xml_document = BeautifulSoup("", "xml")
            xml_break = xml_document.new_tag("br")
            xml_paragraph = xml_document.new_tag("p")

            # With no contents, <br> and <p> alike are rendered as
            # empty-element tags.
            self.assertEqual(b"<br/>", xml_break.encode())
            self.assertEqual(b"<p/>", xml_paragraph.encode())

        html_document = BeautifulSoup("", "html")
        html_break = html_document.new_tag("br")
        html_paragraph = html_document.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect those rules.
        self.assertEqual(b"<br/>", html_break.encode())
        self.assertEqual(b"<p></p>", html_paragraph.encode())

    def test_new_string_creates_navigablestring(self):
        """new_string() returns a NavigableString by default."""
        document = self.soup("")
        created = document.new_string("foo")
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, NavigableString))

    def test_new_string_can_create_navigablestring_subclass(self):
        """new_string() can be told to build a subclass such as Comment."""
        document = self.soup("")
        created = document.new_string("foo", Comment)
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, Comment))
720 | |||
721 | class TestTreeModification(SoupTest): | ||
722 | |||
def test_attribute_modification(self):
    """Setting, deleting and adding attributes shows up in the output."""
    document = self.soup('<a id="1"></a>')

    # Overwrite the existing attribute.
    document.a['id'] = 2
    expected = self.document_for('<a id="2"></a>')
    self.assertEqual(document.decode(), expected)

    # Remove it again.
    del document.a['id']
    expected = self.document_for('<a></a>')
    self.assertEqual(document.decode(), expected)

    # Add an attribute that was not there before.
    document.a['id2'] = 'foo'
    expected = self.document_for('<a id2="foo"></a>')
    self.assertEqual(document.decode(), expected)
731 | |||
def test_new_tag_creation(self):
    """Tags built with the Tag constructor can be inserted in a tree."""
    builder_class = builder_registry.lookup('html')
    builder = builder_class()
    document = self.soup("<body></body>", builder=builder)

    anchor = Tag(document, builder, 'a')
    ordered_list = Tag(document, builder, 'ol')
    anchor['href'] = 'http://foo.com/'

    document.body.insert(0, anchor)
    document.body.insert(1, ordered_list)

    rendered = document.body.encode()
    self.assertEqual(
        rendered,
        b'<body><a href="http://foo.com/"></a><ol></ol></body>')
743 | |||
def test_append_to_contents_moves_tag(self):
    """Appending a tag that is already in the tree moves it."""
    markup = (
        '<p id="1">Don\'t leave me <b>here</b>.</p>\n'
        '<p id="2">Don\'t leave!</p>')
    document = self.soup(markup)
    destination = document.find(id='2')
    moved = document.b

    # Append the <b> tag to the second paragraph.
    document.find(id='2').append(document.b)

    # The second paragraph is now the <b> tag's parent ...
    self.assertEqual(moved.parent, destination)

    # ... and the tag is gone from the first paragraph.
    expected = self.document_for(
        '<p id="1">Don\'t leave me .</p>\n'
        '<p id="2">Don\'t leave!<b>here</b></p>')
    self.assertEqual(document.decode(), expected)
761 | |||
def test_replace_with_returns_thing_that_was_replaced(self):
    """replace_with() returns the element it was called on."""
    document = self.soup("<a></a><b><c></c></b>")
    original = document.a
    returned = original.replace_with(document.c)
    self.assertEqual(original, returned)
768 | |||
769 | def test_unwrap_returns_thing_that_was_replaced(self): | ||
770 | text = "<a><b></b><c></c></a>" | ||
771 | soup = self.soup(text) | ||
772 | a = soup.a | ||
773 | new_a = a.unwrap() | ||
774 | self.assertEqual(a, new_a) | ||
775 | |||
776 | def test_replace_tag_with_itself(self): | ||
777 | text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" | ||
778 | soup = self.soup(text) | ||
779 | c = soup.c | ||
780 | soup.c.replace_with(c) | ||
781 | self.assertEqual(soup.decode(), self.document_for(text)) | ||
782 | |||
783 | def test_replace_tag_with_its_parent_raises_exception(self): | ||
784 | text = "<a><b></b></a>" | ||
785 | soup = self.soup(text) | ||
786 | self.assertRaises(ValueError, soup.b.replace_with, soup.a) | ||
787 | |||
788 | def test_insert_tag_into_itself_raises_exception(self): | ||
789 | text = "<a><b></b></a>" | ||
790 | soup = self.soup(text) | ||
791 | self.assertRaises(ValueError, soup.a.insert, 0, soup.a) | ||
792 | |||
793 | def test_replace_with_maintains_next_element_throughout(self): | ||
794 | soup = self.soup('<p><a>one</a><b>three</b></p>') | ||
795 | a = soup.a | ||
796 | b = a.contents[0] | ||
797 | # Make it so the <a> tag has two text children. | ||
798 | a.insert(1, "two") | ||
799 | |||
800 | # Now replace each one with the empty string. | ||
801 | left, right = a.contents | ||
802 | left.replaceWith('') | ||
803 | right.replaceWith('') | ||
804 | |||
805 | # The <b> tag is still connected to the tree. | ||
806 | self.assertEqual("three", soup.b.string) | ||
807 | |||
808 | def test_replace_final_node(self): | ||
809 | soup = self.soup("<b>Argh!</b>") | ||
810 | soup.find(text="Argh!").replace_with("Hooray!") | ||
811 | new_text = soup.find(text="Hooray!") | ||
812 | b = soup.b | ||
813 | self.assertEqual(new_text.previous_element, b) | ||
814 | self.assertEqual(new_text.parent, b) | ||
815 | self.assertEqual(new_text.previous_element.next_element, new_text) | ||
816 | self.assertEqual(new_text.next_element, None) | ||
817 | |||
818 | def test_consecutive_text_nodes(self): | ||
819 | # A builder should never create two consecutive text nodes, | ||
820 | # but if you insert one next to another, Beautiful Soup will | ||
821 | # handle it correctly. | ||
822 | soup = self.soup("<a><b>Argh!</b><c></c></a>") | ||
823 | soup.b.insert(1, "Hooray!") | ||
824 | |||
825 | self.assertEqual( | ||
826 | soup.decode(), self.document_for( | ||
827 | "<a><b>Argh!Hooray!</b><c></c></a>")) | ||
828 | |||
829 | new_text = soup.find(text="Hooray!") | ||
830 | self.assertEqual(new_text.previous_element, "Argh!") | ||
831 | self.assertEqual(new_text.previous_element.next_element, new_text) | ||
832 | |||
833 | self.assertEqual(new_text.previous_sibling, "Argh!") | ||
834 | self.assertEqual(new_text.previous_sibling.next_sibling, new_text) | ||
835 | |||
836 | self.assertEqual(new_text.next_sibling, None) | ||
837 | self.assertEqual(new_text.next_element, soup.c) | ||
838 | |||
839 | def test_insert_string(self): | ||
840 | soup = self.soup("<a></a>") | ||
841 | soup.a.insert(0, "bar") | ||
842 | soup.a.insert(0, "foo") | ||
843 | # The strings were added to the tag. | ||
844 | self.assertEqual(["foo", "bar"], soup.a.contents) | ||
845 | # And they were converted to NavigableStrings. | ||
846 | self.assertEqual(soup.a.contents[0].next_element, "bar") | ||
847 | |||
848 | def test_insert_tag(self): | ||
849 | builder = self.default_builder | ||
850 | soup = self.soup( | ||
851 | "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) | ||
852 | magic_tag = Tag(soup, builder, 'magictag') | ||
853 | magic_tag.insert(0, "the") | ||
854 | soup.a.insert(1, magic_tag) | ||
855 | |||
856 | self.assertEqual( | ||
857 | soup.decode(), self.document_for( | ||
858 | "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>")) | ||
859 | |||
860 | # Make sure all the relationships are hooked up correctly. | ||
861 | b_tag = soup.b | ||
862 | self.assertEqual(b_tag.next_sibling, magic_tag) | ||
863 | self.assertEqual(magic_tag.previous_sibling, b_tag) | ||
864 | |||
865 | find = b_tag.find(text="Find") | ||
866 | self.assertEqual(find.next_element, magic_tag) | ||
867 | self.assertEqual(magic_tag.previous_element, find) | ||
868 | |||
869 | c_tag = soup.c | ||
870 | self.assertEqual(magic_tag.next_sibling, c_tag) | ||
871 | self.assertEqual(c_tag.previous_sibling, magic_tag) | ||
872 | |||
873 | the = magic_tag.find(text="the") | ||
874 | self.assertEqual(the.parent, magic_tag) | ||
875 | self.assertEqual(the.next_element, c_tag) | ||
876 | self.assertEqual(c_tag.previous_element, the) | ||
877 | |||
878 | def test_append_child_thats_already_at_the_end(self): | ||
879 | data = "<a><b></b></a>" | ||
880 | soup = self.soup(data) | ||
881 | soup.a.append(soup.b) | ||
882 | self.assertEqual(data, soup.decode()) | ||
883 | |||
884 | def test_move_tag_to_beginning_of_parent(self): | ||
885 | data = "<a><b></b><c></c><d></d></a>" | ||
886 | soup = self.soup(data) | ||
887 | soup.a.insert(0, soup.d) | ||
888 | self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode()) | ||
889 | |||
890 | def test_insert_works_on_empty_element_tag(self): | ||
891 | # This is a little strange, since most HTML parsers don't allow | ||
892 | # markup like this to come through. But in general, we don't | ||
893 | # know what the parser would or wouldn't have allowed, so | ||
894 | # I'm letting this succeed for now. | ||
895 | soup = self.soup("<br/>") | ||
896 | soup.br.insert(1, "Contents") | ||
897 | self.assertEqual(str(soup.br), "<br>Contents</br>") | ||
898 | |||
899 | def test_insert_before(self): | ||
900 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
901 | soup.b.insert_before("BAZ") | ||
902 | soup.a.insert_before("QUUX") | ||
903 | self.assertEqual( | ||
904 | soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>")) | ||
905 | |||
906 | soup.a.insert_before(soup.b) | ||
907 | self.assertEqual( | ||
908 | soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) | ||
909 | |||
910 | def test_insert_after(self): | ||
911 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
912 | soup.b.insert_after("BAZ") | ||
913 | soup.a.insert_after("QUUX") | ||
914 | self.assertEqual( | ||
915 | soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ")) | ||
916 | soup.b.insert_after(soup.a) | ||
917 | self.assertEqual( | ||
918 | soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) | ||
919 | |||
920 | def test_insert_after_raises_exception_if_after_has_no_meaning(self): | ||
921 | soup = self.soup("") | ||
922 | tag = soup.new_tag("a") | ||
923 | string = soup.new_string("") | ||
924 | self.assertRaises(ValueError, string.insert_after, tag) | ||
925 | self.assertRaises(NotImplementedError, soup.insert_after, tag) | ||
926 | self.assertRaises(ValueError, tag.insert_after, tag) | ||
927 | |||
928 | def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): | ||
929 | soup = self.soup("") | ||
930 | tag = soup.new_tag("a") | ||
931 | string = soup.new_string("") | ||
932 | self.assertRaises(ValueError, string.insert_before, tag) | ||
933 | self.assertRaises(NotImplementedError, soup.insert_before, tag) | ||
934 | self.assertRaises(ValueError, tag.insert_before, tag) | ||
935 | |||
936 | def test_replace_with(self): | ||
937 | soup = self.soup( | ||
938 | "<p>There's <b>no</b> business like <b>show</b> business</p>") | ||
939 | no, show = soup.find_all('b') | ||
940 | show.replace_with(no) | ||
941 | self.assertEqual( | ||
942 | soup.decode(), | ||
943 | self.document_for( | ||
944 | "<p>There's business like <b>no</b> business</p>")) | ||
945 | |||
946 | self.assertEqual(show.parent, None) | ||
947 | self.assertEqual(no.parent, soup.p) | ||
948 | self.assertEqual(no.next_element, "no") | ||
949 | self.assertEqual(no.next_sibling, " business") | ||
950 | |||
951 | def test_replace_first_child(self): | ||
952 | data = "<a><b></b><c></c></a>" | ||
953 | soup = self.soup(data) | ||
954 | soup.b.replace_with(soup.c) | ||
955 | self.assertEqual("<a><c></c></a>", soup.decode()) | ||
956 | |||
957 | def test_replace_last_child(self): | ||
958 | data = "<a><b></b><c></c></a>" | ||
959 | soup = self.soup(data) | ||
960 | soup.c.replace_with(soup.b) | ||
961 | self.assertEqual("<a><b></b></a>", soup.decode()) | ||
962 | |||
963 | def test_nested_tag_replace_with(self): | ||
964 | soup = self.soup( | ||
965 | """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""") | ||
966 | |||
967 | # Replace the entire <b> tag and its contents ("reserve the | ||
968 | # right") with the <f> tag ("refuse"). | ||
969 | remove_tag = soup.b | ||
970 | move_tag = soup.f | ||
971 | remove_tag.replace_with(move_tag) | ||
972 | |||
973 | self.assertEqual( | ||
974 | soup.decode(), self.document_for( | ||
975 | "<a>We<f>refuse</f></a><e>to<g>service</g></e>")) | ||
976 | |||
977 | # The <b> tag is now an orphan. | ||
978 | self.assertEqual(remove_tag.parent, None) | ||
979 | self.assertEqual(remove_tag.find(text="right").next_element, None) | ||
980 | self.assertEqual(remove_tag.previous_element, None) | ||
981 | self.assertEqual(remove_tag.next_sibling, None) | ||
982 | self.assertEqual(remove_tag.previous_sibling, None) | ||
983 | |||
984 | # The <f> tag is now connected to the <a> tag. | ||
985 | self.assertEqual(move_tag.parent, soup.a) | ||
986 | self.assertEqual(move_tag.previous_element, "We") | ||
987 | self.assertEqual(move_tag.next_element.next_element, soup.e) | ||
988 | self.assertEqual(move_tag.next_sibling, None) | ||
989 | |||
990 | # The gap where the <f> tag used to be has been mended, and | ||
991 | # the word "to" is now connected to the <g> tag. | ||
992 | to_text = soup.find(text="to") | ||
993 | g_tag = soup.g | ||
994 | self.assertEqual(to_text.next_element, g_tag) | ||
995 | self.assertEqual(to_text.next_sibling, g_tag) | ||
996 | self.assertEqual(g_tag.previous_element, to_text) | ||
997 | self.assertEqual(g_tag.previous_sibling, to_text) | ||
998 | |||
999 | def test_unwrap(self): | ||
1000 | tree = self.soup(""" | ||
1001 | <p>Unneeded <em>formatting</em> is unneeded</p> | ||
1002 | """) | ||
1003 | tree.em.unwrap() | ||
1004 | self.assertEqual(tree.em, None) | ||
1005 | self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") | ||
1006 | |||
1007 | def test_wrap(self): | ||
1008 | soup = self.soup("I wish I was bold.") | ||
1009 | value = soup.string.wrap(soup.new_tag("b")) | ||
1010 | self.assertEqual(value.decode(), "<b>I wish I was bold.</b>") | ||
1011 | self.assertEqual( | ||
1012 | soup.decode(), self.document_for("<b>I wish I was bold.</b>")) | ||
1013 | |||
1014 | def test_wrap_extracts_tag_from_elsewhere(self): | ||
1015 | soup = self.soup("<b></b>I wish I was bold.") | ||
1016 | soup.b.next_sibling.wrap(soup.b) | ||
1017 | self.assertEqual( | ||
1018 | soup.decode(), self.document_for("<b>I wish I was bold.</b>")) | ||
1019 | |||
1020 | def test_wrap_puts_new_contents_at_the_end(self): | ||
1021 | soup = self.soup("<b>I like being bold.</b>I wish I was bold.") | ||
1022 | soup.b.next_sibling.wrap(soup.b) | ||
1023 | self.assertEqual(2, len(soup.b.contents)) | ||
1024 | self.assertEqual( | ||
1025 | soup.decode(), self.document_for( | ||
1026 | "<b>I like being bold.I wish I was bold.</b>")) | ||
1027 | |||
1028 | def test_extract(self): | ||
1029 | soup = self.soup( | ||
1030 | '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>') | ||
1031 | |||
1032 | self.assertEqual(len(soup.body.contents), 3) | ||
1033 | extracted = soup.find(id="nav").extract() | ||
1034 | |||
1035 | self.assertEqual( | ||
1036 | soup.decode(), "<html><body>Some content. More content.</body></html>") | ||
1037 | self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>') | ||
1038 | |||
1039 | # The extracted tag is now an orphan. | ||
1040 | self.assertEqual(len(soup.body.contents), 2) | ||
1041 | self.assertEqual(extracted.parent, None) | ||
1042 | self.assertEqual(extracted.previous_element, None) | ||
1043 | self.assertEqual(extracted.next_element.next_element, None) | ||
1044 | |||
1045 | # The gap where the extracted tag used to be has been mended. | ||
1046 | content_1 = soup.find(text="Some content. ") | ||
1047 | content_2 = soup.find(text=" More content.") | ||
1048 | self.assertEqual(content_1.next_element, content_2) | ||
1049 | self.assertEqual(content_1.next_sibling, content_2) | ||
1050 | self.assertEqual(content_2.previous_element, content_1) | ||
1051 | self.assertEqual(content_2.previous_sibling, content_1) | ||
1052 | |||
1053 | def test_extract_distinguishes_between_identical_strings(self): | ||
1054 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
1055 | foo_1 = soup.a.string | ||
1056 | bar_1 = soup.b.string | ||
1057 | foo_2 = soup.new_string("foo") | ||
1058 | bar_2 = soup.new_string("bar") | ||
1059 | soup.a.append(foo_2) | ||
1060 | soup.b.append(bar_2) | ||
1061 | |||
1062 | # Now there are two identical strings in the <a> tag, and two | ||
1063 | # in the <b> tag. Let's remove the first "foo" and the second | ||
1064 | # "bar". | ||
1065 | foo_1.extract() | ||
1066 | bar_2.extract() | ||
1067 | self.assertEqual(foo_2, soup.a.string) | ||
1068 | self.assertEqual(bar_2, soup.b.string) | ||
1069 | |||
1070 | def test_clear(self): | ||
1071 | """Tag.clear()""" | ||
1072 | soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") | ||
1073 | # clear using extract() | ||
1074 | a = soup.a | ||
1075 | soup.p.clear() | ||
1076 | self.assertEqual(len(soup.p.contents), 0) | ||
1077 | self.assertTrue(hasattr(a, "contents")) | ||
1078 | |||
1079 | # clear using decompose() | ||
1080 | em = a.em | ||
1081 | a.clear(decompose=True) | ||
1082 | self.assertEqual(0, len(em.contents)) | ||
1083 | |||
1084 | def test_string_set(self): | ||
1085 | """Tag.string = 'string'""" | ||
1086 | soup = self.soup("<a></a> <b><c></c></b>") | ||
1087 | soup.a.string = "foo" | ||
1088 | self.assertEqual(soup.a.contents, ["foo"]) | ||
1089 | soup.b.string = "bar" | ||
1090 | self.assertEqual(soup.b.contents, ["bar"]) | ||
1091 | |||
1092 | def test_string_set_does_not_affect_original_string(self): | ||
1093 | soup = self.soup("<a><b>foo</b><c>bar</c>") | ||
1094 | soup.b.string = soup.c.string | ||
1095 | self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>") | ||
1096 | |||
1097 | def test_set_string_preserves_class_of_string(self): | ||
1098 | soup = self.soup("<a></a>") | ||
1099 | cdata = CData("foo") | ||
1100 | soup.a.string = cdata | ||
1101 | self.assertTrue(isinstance(soup.a.string, CData)) | ||
1102 | |||
1103 | class TestElementObjects(SoupTest): | ||
1104 | """Test various features of element objects.""" | ||
1105 | |||
1106 | def test_len(self): | ||
1107 | """The length of an element is its number of children.""" | ||
1108 | soup = self.soup("<top>1<b>2</b>3</top>") | ||
1109 | |||
1110 | # The BeautifulSoup object itself contains one element: the | ||
1111 | # <top> tag. | ||
1112 | self.assertEqual(len(soup.contents), 1) | ||
1113 | self.assertEqual(len(soup), 1) | ||
1114 | |||
1115 | # The <top> tag contains three elements: the text node "1", the | ||
1116 | # <b> tag, and the text node "3". | ||
1117 | self.assertEqual(len(soup.top), 3) | ||
1118 | self.assertEqual(len(soup.top.contents), 3) | ||
1119 | |||
1120 | def test_member_access_invokes_find(self): | ||
1121 | """Accessing a Python member .foo invokes find('foo')""" | ||
1122 | soup = self.soup('<b><i></i></b>') | ||
1123 | self.assertEqual(soup.b, soup.find('b')) | ||
1124 | self.assertEqual(soup.b.i, soup.find('b').find('i')) | ||
1125 | self.assertEqual(soup.a, None) | ||
1126 | |||
1127 | def test_deprecated_member_access(self): | ||
1128 | soup = self.soup('<b><i></i></b>') | ||
1129 | with warnings.catch_warnings(record=True) as w: | ||
1130 | tag = soup.bTag | ||
1131 | self.assertEqual(soup.b, tag) | ||
1132 | self.assertEqual( | ||
1133 | '.bTag is deprecated, use .find("b") instead.', | ||
1134 | str(w[0].message)) | ||
1135 | |||
1136 | def test_has_attr(self): | ||
1137 | """has_attr() checks for the presence of an attribute. | ||
1138 | |||
1139 | Please note: has_attr() is different from | ||
1140 | __contains__. has_attr() checks the tag's attributes and __contains__ | ||
1141 | checks the tag's children. | ||
1142 | """ | ||
1143 | soup = self.soup("<foo attr='bar'>") | ||
1144 | self.assertTrue(soup.foo.has_attr('attr')) | ||
1145 | self.assertFalse(soup.foo.has_attr('attr2')) | ||
1146 | |||
1147 | |||
1148 | def test_attributes_come_out_in_alphabetical_order(self): | ||
1149 | markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' | ||
1150 | self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') | ||
1151 | |||
1152 | def test_string(self): | ||
1153 | # A tag that contains only a text node makes that node | ||
1154 | # available as .string. | ||
1155 | soup = self.soup("<b>foo</b>") | ||
1156 | self.assertEqual(soup.b.string, 'foo') | ||
1157 | |||
1158 | def test_empty_tag_has_no_string(self): | ||
1159 | # A tag with no children has no .string. | ||
1160 | soup = self.soup("<b></b>") | ||
1161 | self.assertEqual(soup.b.string, None) | ||
1162 | |||
1163 | def test_tag_with_multiple_children_has_no_string(self): | ||
1164 | # A tag with multiple children has no .string. | ||
1165 | soup = self.soup("<a>foo<b></b><b></b></b>") | ||
1166 | self.assertEqual(soup.b.string, None) | ||
1167 | |||
1168 | soup = self.soup("<a>foo<b></b>bar</b>") | ||
1169 | self.assertEqual(soup.b.string, None) | ||
1170 | |||
1171 | # Even if all the children are strings, due to trickery, | ||
1172 | # it won't work--but this would be a good optimization. | ||
1173 | soup = self.soup("<a>foo</b>") | ||
1174 | soup.a.insert(1, "bar") | ||
1175 | self.assertEqual(soup.a.string, None) | ||
1176 | |||
1177 | def test_tag_with_recursive_string_has_string(self): | ||
1178 | # A tag with a single child which has a .string inherits that | ||
1179 | # .string. | ||
1180 | soup = self.soup("<a><b>foo</b></a>") | ||
1181 | self.assertEqual(soup.a.string, "foo") | ||
1182 | self.assertEqual(soup.string, "foo") | ||
1183 | |||
1184 | def test_lack_of_string(self): | ||
1185 | """Only a tag containing a single text node has a .string.""" | ||
1186 | soup = self.soup("<b>f<i>e</i>o</b>") | ||
1187 | self.assertFalse(soup.b.string) | ||
1188 | |||
1189 | soup = self.soup("<b></b>") | ||
1190 | self.assertFalse(soup.b.string) | ||
1191 | |||
1192 | def test_all_text(self): | ||
1193 | """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" | ||
1194 | soup = self.soup("<a>a<b>r</b> <r> t </r></a>") | ||
1195 | self.assertEqual(soup.a.text, "ar t ") | ||
1196 | self.assertEqual(soup.a.get_text(strip=True), "art") | ||
1197 | self.assertEqual(soup.a.get_text(","), "a,r, , t ") | ||
1198 | self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") | ||
1199 | |||
1200 | def test_get_text_ignores_comments(self): | ||
1201 | soup = self.soup("foo<!--IGNORE-->bar") | ||
1202 | self.assertEqual(soup.get_text(), "foobar") | ||
1203 | |||
1204 | self.assertEqual( | ||
1205 | soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") | ||
1206 | self.assertEqual( | ||
1207 | soup.get_text(types=None), "fooIGNOREbar") | ||
1208 | |||
1209 | def test_all_strings_ignores_comments(self): | ||
1210 | soup = self.soup("foo<!--IGNORE-->bar") | ||
1211 | self.assertEqual(['foo', 'bar'], list(soup.strings)) | ||
1212 | |||
1213 | class TestCDAtaListAttributes(SoupTest): | ||
1214 | |||
1215 | """Testing cdata-list attributes like 'class'. | ||
1216 | """ | ||
1217 | def test_single_value_becomes_list(self): | ||
1218 | soup = self.soup("<a class='foo'>") | ||
1219 | self.assertEqual(["foo"],soup.a['class']) | ||
1220 | |||
1221 | def test_multiple_values_becomes_list(self): | ||
1222 | soup = self.soup("<a class='foo bar'>") | ||
1223 | self.assertEqual(["foo", "bar"], soup.a['class']) | ||
1224 | |||
1225 | def test_multiple_values_separated_by_weird_whitespace(self): | ||
1226 | soup = self.soup("<a class='foo\tbar\nbaz'>") | ||
1227 | self.assertEqual(["foo", "bar", "baz"],soup.a['class']) | ||
1228 | |||
1229 | def test_attributes_joined_into_string_on_output(self): | ||
1230 | soup = self.soup("<a class='foo\tbar'>") | ||
1231 | self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) | ||
1232 | |||
1233 | def test_accept_charset(self): | ||
1234 | soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') | ||
1235 | self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) | ||
1236 | |||
1237 | def test_cdata_attribute_applying_only_to_one_tag(self): | ||
1238 | data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' | ||
1239 | soup = self.soup(data) | ||
1240 | # We saw in another test that accept-charset is a cdata-list | ||
1241 | # attribute for the <form> tag. But it's not a cdata-list | ||
1242 | # attribute for any other tag. | ||
1243 | self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) | ||
1244 | |||
1245 | def test_string_has_immutable_name_property(self): | ||
1246 | string = self.soup("s").string | ||
1247 | self.assertEqual(None, string.name) | ||
1248 | def t(): | ||
1249 | string.name = 'foo' | ||
1250 | self.assertRaises(AttributeError, t) | ||
1251 | |||
1252 | class TestPersistence(SoupTest): | ||
1253 | "Testing features like pickle and deepcopy." | ||
1254 | |||
1255 | def setUp(self): | ||
1256 | super(TestPersistence, self).setUp() | ||
1257 | self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" | ||
1258 | "http://www.w3.org/TR/REC-html40/transitional.dtd"> | ||
1259 | <html> | ||
1260 | <head> | ||
1261 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | ||
1262 | <title>Beautiful Soup: We called him Tortoise because he taught us.</title> | ||
1263 | <link rev="made" href="mailto:leonardr@segfault.org"> | ||
1264 | <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> | ||
1265 | <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> | ||
1266 | <meta name="author" content="Leonard Richardson"> | ||
1267 | </head> | ||
1268 | <body> | ||
1269 | <a href="foo">foo</a> | ||
1270 | <a href="foo"><b>bar</b></a> | ||
1271 | </body> | ||
1272 | </html>""" | ||
1273 | self.tree = self.soup(self.page) | ||
1274 | |||
1275 | def test_pickle_and_unpickle_identity(self): | ||
1276 | # Pickling a tree, then unpickling it, yields a tree identical | ||
1277 | # to the original. | ||
1278 | dumped = pickle.dumps(self.tree, 2) | ||
1279 | loaded = pickle.loads(dumped) | ||
1280 | self.assertEqual(loaded.__class__, BeautifulSoup) | ||
1281 | self.assertEqual(loaded.decode(), self.tree.decode()) | ||
1282 | |||
1283 | def test_deepcopy_identity(self): | ||
1284 | # Making a deepcopy of a tree yields an identical tree. | ||
1285 | copied = copy.deepcopy(self.tree) | ||
1286 | self.assertEqual(copied.decode(), self.tree.decode()) | ||
1287 | |||
1288 | def test_unicode_pickle(self): | ||
1289 | # A tree containing Unicode characters can be pickled. | ||
1290 | html = u"<b>\N{SNOWMAN}</b>" | ||
1291 | soup = self.soup(html) | ||
1292 | dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) | ||
1293 | loaded = pickle.loads(dumped) | ||
1294 | self.assertEqual(loaded.decode(), soup.decode()) | ||
1295 | |||
1296 | |||
1297 | class TestSubstitutions(SoupTest): | ||
1298 | |||
1299 | def test_default_formatter_is_minimal(self): | ||
1300 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1301 | soup = self.soup(markup) | ||
1302 | decoded = soup.decode(formatter="minimal") | ||
1303 | # The < is converted back into &lt; but the e-with-acute is left alone. | ||
1304 | self.assertEqual( | ||
1305 | decoded, | ||
1306 | self.document_for( | ||
1307 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | ||
1308 | |||
1309 | def test_formatter_html(self): | ||
1310 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1311 | soup = self.soup(markup) | ||
1312 | decoded = soup.decode(formatter="html") | ||
1313 | self.assertEqual( | ||
1314 | decoded, | ||
1315 | self.document_for("<b><<Sacré bleu!>></b>")) | ||
1316 | |||
1317 | def test_formatter_minimal(self): | ||
1318 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1319 | soup = self.soup(markup) | ||
1320 | decoded = soup.decode(formatter="minimal") | ||
1321 | # The < is converted back into &lt; but the e-with-acute is left alone. | ||
1322 | self.assertEqual( | ||
1323 | decoded, | ||
1324 | self.document_for( | ||
1325 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | ||
1326 | |||
1327 | def test_formatter_null(self): | ||
1328 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1329 | soup = self.soup(markup) | ||
1330 | decoded = soup.decode(formatter=None) | ||
1331 | # Neither the angle brackets nor the e-with-acute are converted. | ||
1332 | # This is not valid HTML, but it's what the user wanted. | ||
1333 | self.assertEqual(decoded, | ||
1334 | self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | ||
1335 | |||
1336 | def test_formatter_custom(self): | ||
1337 | markup = u"<b><foo></b><b>bar</b>" | ||
1338 | soup = self.soup(markup) | ||
1339 | decoded = soup.decode(formatter = lambda x: x.upper()) | ||
1340 | # Instead of normal entity conversion code, the custom | ||
1341 | # callable is called on every string. | ||
1342 | self.assertEqual( | ||
1343 | decoded, | ||
1344 | self.document_for(u"<b><FOO></b><b>BAR</b>")) | ||
1345 | |||
1346 | def test_formatter_is_run_on_attribute_values(self): | ||
1347 | markup = u'<a href="http://a.com?a=b&c=é">e</a>' | ||
1348 | soup = self.soup(markup) | ||
1349 | a = soup.a | ||
1350 | |||
1351 | expect_minimal = u'<a href="http://a.com?a=b&c=é">e</a>' | ||
1352 | |||
1353 | self.assertEqual(expect_minimal, a.decode()) | ||
1354 | self.assertEqual(expect_minimal, a.decode(formatter="minimal")) | ||
1355 | |||
1356 | expect_html = u'<a href="http://a.com?a=b&c=é">e</a>' | ||
1357 | self.assertEqual(expect_html, a.decode(formatter="html")) | ||
1358 | |||
1359 | self.assertEqual(markup, a.decode(formatter=None)) | ||
1360 | expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' | ||
1361 | self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) | ||
1362 | |||
1363 | def test_formatter_skips_script_tag_for_html_documents(self): | ||
1364 | doc = """ | ||
1365 | <script type="text/javascript"> | ||
1366 | console.log("< < hey > > "); | ||
1367 | </script> | ||
1368 | """ | ||
1369 | encoded = BeautifulSoup(doc).encode() | ||
1370 | self.assertTrue(b"< < hey > >" in encoded) | ||
1371 | |||
1372 | def test_formatter_skips_style_tag_for_html_documents(self): | ||
1373 | doc = """ | ||
1374 | <style type="text/css"> | ||
1375 | console.log("< < hey > > "); | ||
1376 | </style> | ||
1377 | """ | ||
1378 | encoded = BeautifulSoup(doc).encode() | ||
1379 | self.assertTrue(b"< < hey > >" in encoded) | ||
1380 | |||
1381 | def test_prettify_leaves_preformatted_text_alone(self): | ||
1382 | soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") | ||
1383 | # Everything outside the <pre> tag is reformatted, but everything | ||
1384 | # inside is left alone. | ||
1385 | self.assertEqual( | ||
1386 | u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', | ||
1387 | soup.div.prettify()) | ||
1388 | |||
1389 | def test_prettify_accepts_formatter(self): | ||
1390 | soup = BeautifulSoup("<html><body>foo</body></html>") | ||
1391 | pretty = soup.prettify(formatter = lambda x: x.upper()) | ||
1392 | self.assertTrue("FOO" in pretty) | ||
1393 | |||
1394 | def test_prettify_outputs_unicode_by_default(self): | ||
1395 | soup = self.soup("<a></a>") | ||
1396 | self.assertEqual(unicode, type(soup.prettify())) | ||
1397 | |||
1398 | def test_prettify_can_encode_data(self): | ||
1399 | soup = self.soup("<a></a>") | ||
1400 | self.assertEqual(bytes, type(soup.prettify("utf-8"))) | ||
1401 | |||
1402 | def test_html_entity_substitution_off_by_default(self): | ||
1403 | markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" | ||
1404 | soup = self.soup(markup) | ||
1405 | encoded = soup.b.encode("utf-8") | ||
1406 | self.assertEqual(encoded, markup.encode('utf-8')) | ||
1407 | |||
1408 | def test_encoding_substitution(self): | ||
1409 | # Here's the <meta> tag saying that a document is | ||
1410 | # encoded in Shift-JIS. | ||
1411 | meta_tag = ('<meta content="text/html; charset=x-sjis" ' | ||
1412 | 'http-equiv="Content-type"/>') | ||
1413 | soup = self.soup(meta_tag) | ||
1414 | |||
1415 | # Parse the document, and the charset appears unchanged. | ||
1416 | self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') | ||
1417 | |||
1418 | # Encode the document into some encoding, and the encoding is | ||
1419 | # substituted into the meta tag. | ||
1420 | utf_8 = soup.encode("utf-8") | ||
1421 | self.assertTrue(b"charset=utf-8" in utf_8) | ||
1422 | |||
1423 | euc_jp = soup.encode("euc_jp") | ||
1424 | self.assertTrue(b"charset=euc_jp" in euc_jp) | ||
1425 | |||
1426 | shift_jis = soup.encode("shift-jis") | ||
1427 | self.assertTrue(b"charset=shift-jis" in shift_jis) | ||
1428 | |||
1429 | utf_16_u = soup.encode("utf-16").decode("utf-16") | ||
1430 | self.assertTrue("charset=utf-16" in utf_16_u) | ||
1431 | |||
1432 | def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): | ||
1433 | markup = ('<head><meta content="text/html; charset=x-sjis" ' | ||
1434 | 'http-equiv="Content-type"/></head><pre>foo</pre>') | ||
1435 | |||
1436 | # Beautiful Soup used to try to rewrite the meta tag even if the | ||
1437 | # meta tag got filtered out by the strainer. This test makes | ||
1438 | # sure that doesn't happen. | ||
1439 | strainer = SoupStrainer('pre') | ||
1440 | soup = self.soup(markup, parse_only=strainer) | ||
1441 | self.assertEqual(soup.contents[0].name, 'pre') | ||
1442 | |||
1443 | class TestEncoding(SoupTest): | ||
1444 | """Test the ability to encode objects into strings.""" | ||
1445 | |||
1446 | def test_unicode_string_can_be_encoded(self): | ||
1447 | html = u"<b>\N{SNOWMAN}</b>" | ||
1448 | soup = self.soup(html) | ||
1449 | self.assertEqual(soup.b.string.encode("utf-8"), | ||
1450 | u"\N{SNOWMAN}".encode("utf-8")) | ||
1451 | |||
1452 | def test_tag_containing_unicode_string_can_be_encoded(self): | ||
1453 | html = u"<b>\N{SNOWMAN}</b>" | ||
1454 | soup = self.soup(html) | ||
1455 | self.assertEqual( | ||
1456 | soup.b.encode("utf-8"), html.encode("utf-8")) | ||
1457 | |||
1458 | def test_encoding_substitutes_unrecognized_characters_by_default(self): | ||
1459 | html = u"<b>\N{SNOWMAN}</b>" | ||
1460 | soup = self.soup(html) | ||
1461 | self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") | ||
1462 | |||
1463 | def test_encoding_can_be_made_strict(self): | ||
1464 | html = u"<b>\N{SNOWMAN}</b>" | ||
1465 | soup = self.soup(html) | ||
1466 | self.assertRaises( | ||
1467 | UnicodeEncodeError, soup.encode, "ascii", errors="strict") | ||
1468 | |||
1469 | def test_decode_contents(self): | ||
1470 | html = u"<b>\N{SNOWMAN}</b>" | ||
1471 | soup = self.soup(html) | ||
1472 | self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) | ||
1473 | |||
1474 | def test_encode_contents(self): | ||
1475 | html = u"<b>\N{SNOWMAN}</b>" | ||
1476 | soup = self.soup(html) | ||
1477 | self.assertEqual( | ||
1478 | u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( | ||
1479 | encoding="utf8")) | ||
1480 | |||
1481 | def test_deprecated_renderContents(self): | ||
1482 | html = u"<b>\N{SNOWMAN}</b>" | ||
1483 | soup = self.soup(html) | ||
1484 | self.assertEqual( | ||
1485 | u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) | ||
1486 | |||
class TestNavigableStringSubclasses(SoupTest):
    """Exercise hand-built CData and Doctype objects inserted into a soup."""

    def test_cdata(self):
        # None of the current builders turn CDATA sections into CData
        # objects, so one is constructed by hand and inserted.
        document = self.soup("")
        document.insert(1, CData("foo"))

        self.assertEqual(str(document), "<![CDATA[foo]]>")
        self.assertEqual(document.find(text="foo"), "foo")
        self.assertEqual(document.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        The formatter's return value is discarded, though.
        """
        # The call counter lives on the test instance so the nested
        # formatter can update it.
        self.count = 0

        def counting_formatter(*args):
            self.count += 1
            return "BITTER FAILURE"

        document = self.soup("")
        document.insert(1, CData("<><><>"))

        rendered = document.encode(formatter=counting_formatter)
        # The output still holds the original text, not the formatter's
        # replacement, and the formatter ran exactly once.
        self.assertEqual(b"<![CDATA[<><><>]]>", rendered)
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # Unlike other NavigableString subclasses, a DOCTYPE always
        # ends in a newline.
        document = self.soup("")
        document.insert(1, Doctype("foo"))
        self.assertEqual(document.encode(), b"<!DOCTYPE foo>\n")
1524 | |||
1525 | |||
class TestSoupSelector(TreeTest):
    """Test CSS selector support (``select()``) against one fixed document.

    ``HTML`` is parsed afresh in ``setUp`` and every test queries the
    resulting tree. Most tests identify the matched elements by their
    ``id`` attribute.
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>

<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
</span>
</div>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setUp(self):
        # Name the parser explicitly. When none is given, Beautiful
        # Soup uses whichever installed parser it ranks best, so the
        # tree under test could differ from one machine to the next.
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assertSelects(self, selector, expected_ids):
        """Assert that `selector` matches exactly the given element ids.

        :param selector: CSS selector string passed to ``select()``.
        :param expected_ids: iterable of the ``id`` values the matched
            elements should have; order is irrelevant.

        Every matched element must have an ``id`` attribute, since the
        ids are looked up with ``el['id']``.
        """
        # Compare sorted copies so the caller's list is left untouched.
        el_ids = sorted(el['id'] for el in self.soup.select(selector))
        expected_ids = sorted(expected_ids)
        self.assertEqual(expected_ids, el_ids,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(expected_ids), ', '.join(el_ids)
            )
        )

    assertSelect = assertSelects

    def assertSelectMultiple(self, *tests):
        """Run assertSelect() on each ``(selector, expected_ids)`` pair."""
        for selector, expected_ids in tests:
            self.assertSelect(selector, expected_ids)

    def test_one_tag_one(self):
        els = self.soup.select('title')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].name, 'title')
        self.assertEqual(els[0].contents, [u'The title'])

    def test_one_tag_many(self):
        els = self.soup.select('div')
        self.assertEqual(len(els), 3)
        for div in els:
            self.assertEqual(div.name, 'div')

    def test_tag_in_tag_one(self):
        self.assertSelects('div div', ['inner'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['main', 'inner', 'footer'])

    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(ValueError, self.soup.select, 'tag%t')

    def test_header_tags(self):
        self.assertSelectMultiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            self.assertEqual(len(els), 1)
            self.assertEqual(els[0].name, 'p')
            self.assertEqual(els[0]['class'], ['onep'])

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        self.assertEqual(len(els), 0)

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assertSelects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        self.assertEqual(len(els), 0)

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        self.assertEqual(len(els), 3)
        for el in els:
            self.assertEqual(el.name, 'p')
        self.assertEqual(els[1]['class'], ['onep'])
        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            self.assertEqual(len(self.soup.select(selector)), 0)

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
                         '.class3', 'p.class3', 'html p.class2',
                         'div#inner .class2'):
            self.assertSelects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assertSelects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
        )

    def test_attribute_endswith(self):
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
            ('div[id$="1"]', []),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('a[href*="http://"]', ['bob', 'me']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
            ('div[id*="1"]', []),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
        )

    def test_attribute_exact_or_hypen(self):
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
        )

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Pass in an invalid value.
        self.assertRaises(
            ValueError, self.soup.select, 'div p:nth-of-type(0)')

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Some text')

    def test_id_child_selector_nth_of_type(self):
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        # NOTE(review): assertSelectsIDs is not defined in this class;
        # presumably it is inherited from TreeTest -- confirm.
        self.assertSelectsIDs(selected, ['inner'])

    def test_overspecified_child_id(self):
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        self.assertEqual([], self.soup.select('#p1 + p'))

    def test_general_sibling_selector(self):
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(ValueError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])