diff options
Diffstat (limited to 'bitbake/lib/bs4/tests/test_tree.py')
-rw-r--r-- | bitbake/lib/bs4/tests/test_tree.py | 2004 |
1 files changed, 0 insertions, 2004 deletions
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py deleted file mode 100644 index cf0f1abe0c..0000000000 --- a/bitbake/lib/bs4/tests/test_tree.py +++ /dev/null | |||
@@ -1,2004 +0,0 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Tests for Beautiful Soup's tree traversal methods. | ||
3 | |||
4 | The tree traversal methods are the main advantage of using Beautiful | ||
5 | Soup over just using a parser. | ||
6 | |||
7 | Different parsers will build different Beautiful Soup trees given the | ||
8 | same markup, but all Beautiful Soup trees can be traversed with the | ||
9 | methods tested here. | ||
10 | """ | ||
11 | |||
12 | import copy | ||
13 | import pickle | ||
14 | import re | ||
15 | import warnings | ||
16 | from bs4 import BeautifulSoup | ||
17 | from bs4.builder import builder_registry | ||
18 | from bs4.element import ( | ||
19 | PY3K, | ||
20 | CData, | ||
21 | Comment, | ||
22 | Declaration, | ||
23 | Doctype, | ||
24 | NavigableString, | ||
25 | SoupStrainer, | ||
26 | Tag, | ||
27 | ) | ||
28 | from bs4.testing import SoupTest | ||
29 | |||
30 | XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) | ||
31 | LXML_PRESENT = (builder_registry.lookup("lxml") is not None) | ||
32 | |||
33 | class TreeTest(SoupTest): | ||
34 | |||
35 | def assertSelects(self, tags, should_match): | ||
36 | """Make sure that the given tags have the correct text. | ||
37 | |||
38 | This is used in tests that define a bunch of tags, each | ||
39 | containing a single string, and then select certain strings by | ||
40 | some mechanism. | ||
41 | """ | ||
42 | self.assertEqual([tag.string for tag in tags], should_match) | ||
43 | |||
44 | def assertSelectsIDs(self, tags, should_match): | ||
45 | """Make sure that the given tags have the correct IDs. | ||
46 | |||
47 | This is used in tests that define a bunch of tags, each | ||
48 | containing a single string, and then select certain strings by | ||
49 | some mechanism. | ||
50 | """ | ||
51 | self.assertEqual([tag['id'] for tag in tags], should_match) | ||
52 | |||
53 | |||
54 | class TestFind(TreeTest): | ||
55 | """Basic tests of the find() method. | ||
56 | |||
57 | find() just calls find_all() with limit=1, so it's not tested all | ||
58 | that thouroughly here. | ||
59 | """ | ||
60 | |||
61 | def test_find_tag(self): | ||
62 | soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>") | ||
63 | self.assertEqual(soup.find("b").string, "2") | ||
64 | |||
65 | def test_unicode_text_find(self): | ||
66 | soup = self.soup('<h1>Räksmörgås</h1>') | ||
67 | self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') | ||
68 | |||
69 | def test_unicode_attribute_find(self): | ||
70 | soup = self.soup('<h1 id="Räksmörgås">here it is</h1>') | ||
71 | str(soup) | ||
72 | self.assertEqual("here it is", soup.find(id='Räksmörgås').text) | ||
73 | |||
74 | |||
75 | def test_find_everything(self): | ||
76 | """Test an optimization that finds all tags.""" | ||
77 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
78 | self.assertEqual(2, len(soup.find_all())) | ||
79 | |||
80 | def test_find_everything_with_name(self): | ||
81 | """Test an optimization that finds all tags with a given name.""" | ||
82 | soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>") | ||
83 | self.assertEqual(2, len(soup.find_all('a'))) | ||
84 | |||
85 | class TestFindAll(TreeTest): | ||
86 | """Basic tests of the find_all() method.""" | ||
87 | |||
88 | def test_find_all_text_nodes(self): | ||
89 | """You can search the tree for text nodes.""" | ||
90 | soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") | ||
91 | # Exact match. | ||
92 | self.assertEqual(soup.find_all(string="bar"), ["bar"]) | ||
93 | self.assertEqual(soup.find_all(text="bar"), ["bar"]) | ||
94 | # Match any of a number of strings. | ||
95 | self.assertEqual( | ||
96 | soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) | ||
97 | # Match a regular expression. | ||
98 | self.assertEqual(soup.find_all(text=re.compile('.*')), | ||
99 | ["Foo", "bar", '\xbb']) | ||
100 | # Match anything. | ||
101 | self.assertEqual(soup.find_all(text=True), | ||
102 | ["Foo", "bar", '\xbb']) | ||
103 | |||
104 | def test_find_all_limit(self): | ||
105 | """You can limit the number of items returned by find_all.""" | ||
106 | soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>") | ||
107 | self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) | ||
108 | self.assertSelects(soup.find_all('a', limit=1), ["1"]) | ||
109 | self.assertSelects( | ||
110 | soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) | ||
111 | |||
112 | # A limit of 0 means no limit. | ||
113 | self.assertSelects( | ||
114 | soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) | ||
115 | |||
116 | def test_calling_a_tag_is_calling_findall(self): | ||
117 | soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>") | ||
118 | self.assertSelects(soup('a', limit=1), ["1"]) | ||
119 | self.assertSelects(soup.b(id="foo"), ["3"]) | ||
120 | |||
121 | def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): | ||
122 | soup = self.soup("<a></a>") | ||
123 | # Create a self-referential list. | ||
124 | l = [] | ||
125 | l.append(l) | ||
126 | |||
127 | # Without special code in _normalize_search_value, this would cause infinite | ||
128 | # recursion. | ||
129 | self.assertEqual([], soup.find_all(l)) | ||
130 | |||
131 | def test_find_all_resultset(self): | ||
132 | """All find_all calls return a ResultSet""" | ||
133 | soup = self.soup("<a></a>") | ||
134 | result = soup.find_all("a") | ||
135 | self.assertTrue(hasattr(result, "source")) | ||
136 | |||
137 | result = soup.find_all(True) | ||
138 | self.assertTrue(hasattr(result, "source")) | ||
139 | |||
140 | result = soup.find_all(text="foo") | ||
141 | self.assertTrue(hasattr(result, "source")) | ||
142 | |||
143 | |||
144 | class TestFindAllBasicNamespaces(TreeTest): | ||
145 | |||
146 | def test_find_by_namespaced_name(self): | ||
147 | soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') | ||
148 | self.assertEqual("4", soup.find("mathml:msqrt").string) | ||
149 | self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) | ||
150 | |||
151 | |||
152 | class TestFindAllByName(TreeTest): | ||
153 | """Test ways of finding tags by tag name.""" | ||
154 | |||
155 | def setUp(self): | ||
156 | super(TreeTest, self).setUp() | ||
157 | self.tree = self.soup("""<a>First tag.</a> | ||
158 | <b>Second tag.</b> | ||
159 | <c>Third <a>Nested tag.</a> tag.</c>""") | ||
160 | |||
161 | def test_find_all_by_tag_name(self): | ||
162 | # Find all the <a> tags. | ||
163 | self.assertSelects( | ||
164 | self.tree.find_all('a'), ['First tag.', 'Nested tag.']) | ||
165 | |||
166 | def test_find_all_by_name_and_text(self): | ||
167 | self.assertSelects( | ||
168 | self.tree.find_all('a', text='First tag.'), ['First tag.']) | ||
169 | |||
170 | self.assertSelects( | ||
171 | self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) | ||
172 | |||
173 | self.assertSelects( | ||
174 | self.tree.find_all('a', text=re.compile("tag")), | ||
175 | ['First tag.', 'Nested tag.']) | ||
176 | |||
177 | |||
178 | def test_find_all_on_non_root_element(self): | ||
179 | # You can call find_all on any node, not just the root. | ||
180 | self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) | ||
181 | |||
182 | def test_calling_element_invokes_find_all(self): | ||
183 | self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) | ||
184 | |||
185 | def test_find_all_by_tag_strainer(self): | ||
186 | self.assertSelects( | ||
187 | self.tree.find_all(SoupStrainer('a')), | ||
188 | ['First tag.', 'Nested tag.']) | ||
189 | |||
190 | def test_find_all_by_tag_names(self): | ||
191 | self.assertSelects( | ||
192 | self.tree.find_all(['a', 'b']), | ||
193 | ['First tag.', 'Second tag.', 'Nested tag.']) | ||
194 | |||
195 | def test_find_all_by_tag_dict(self): | ||
196 | self.assertSelects( | ||
197 | self.tree.find_all({'a' : True, 'b' : True}), | ||
198 | ['First tag.', 'Second tag.', 'Nested tag.']) | ||
199 | |||
200 | def test_find_all_by_tag_re(self): | ||
201 | self.assertSelects( | ||
202 | self.tree.find_all(re.compile('^[ab]$')), | ||
203 | ['First tag.', 'Second tag.', 'Nested tag.']) | ||
204 | |||
205 | def test_find_all_with_tags_matching_method(self): | ||
206 | # You can define an oracle method that determines whether | ||
207 | # a tag matches the search. | ||
208 | def id_matches_name(tag): | ||
209 | return tag.name == tag.get('id') | ||
210 | |||
211 | tree = self.soup("""<a id="a">Match 1.</a> | ||
212 | <a id="1">Does not match.</a> | ||
213 | <b id="b">Match 2.</a>""") | ||
214 | |||
215 | self.assertSelects( | ||
216 | tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) | ||
217 | |||
218 | |||
219 | class TestFindAllByAttribute(TreeTest): | ||
220 | |||
221 | def test_find_all_by_attribute_name(self): | ||
222 | # You can pass in keyword arguments to find_all to search by | ||
223 | # attribute. | ||
224 | tree = self.soup(""" | ||
225 | <a id="first">Matching a.</a> | ||
226 | <a id="second"> | ||
227 | Non-matching <b id="first">Matching b.</b>a. | ||
228 | </a>""") | ||
229 | self.assertSelects(tree.find_all(id='first'), | ||
230 | ["Matching a.", "Matching b."]) | ||
231 | |||
232 | def test_find_all_by_utf8_attribute_value(self): | ||
233 | peace = "םולש".encode("utf8") | ||
234 | data = '<a title="םולש"></a>'.encode("utf8") | ||
235 | soup = self.soup(data) | ||
236 | self.assertEqual([soup.a], soup.find_all(title=peace)) | ||
237 | self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) | ||
238 | self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) | ||
239 | |||
240 | def test_find_all_by_attribute_dict(self): | ||
241 | # You can pass in a dictionary as the argument 'attrs'. This | ||
242 | # lets you search for attributes like 'name' (a fixed argument | ||
243 | # to find_all) and 'class' (a reserved word in Python.) | ||
244 | tree = self.soup(""" | ||
245 | <a name="name1" class="class1">Name match.</a> | ||
246 | <a name="name2" class="class2">Class match.</a> | ||
247 | <a name="name3" class="class3">Non-match.</a> | ||
248 | <name1>A tag called 'name1'.</name1> | ||
249 | """) | ||
250 | |||
251 | # This doesn't do what you want. | ||
252 | self.assertSelects(tree.find_all(name='name1'), | ||
253 | ["A tag called 'name1'."]) | ||
254 | # This does what you want. | ||
255 | self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), | ||
256 | ["Name match."]) | ||
257 | |||
258 | self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), | ||
259 | ["Class match."]) | ||
260 | |||
261 | def test_find_all_by_class(self): | ||
262 | tree = self.soup(""" | ||
263 | <a class="1">Class 1.</a> | ||
264 | <a class="2">Class 2.</a> | ||
265 | <b class="1">Class 1.</b> | ||
266 | <c class="3 4">Class 3 and 4.</c> | ||
267 | """) | ||
268 | |||
269 | # Passing in the class_ keyword argument will search against | ||
270 | # the 'class' attribute. | ||
271 | self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) | ||
272 | self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) | ||
273 | self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) | ||
274 | |||
275 | # Passing in a string to 'attrs' will also search the CSS class. | ||
276 | self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) | ||
277 | self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) | ||
278 | self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) | ||
279 | self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) | ||
280 | |||
281 | def test_find_by_class_when_multiple_classes_present(self): | ||
282 | tree = self.soup("<gar class='foo bar'>Found it</gar>") | ||
283 | |||
284 | f = tree.find_all("gar", class_=re.compile("o")) | ||
285 | self.assertSelects(f, ["Found it"]) | ||
286 | |||
287 | f = tree.find_all("gar", class_=re.compile("a")) | ||
288 | self.assertSelects(f, ["Found it"]) | ||
289 | |||
290 | # Since the class is not the string "foo bar", but the two | ||
291 | # strings "foo" and "bar", this will not find anything. | ||
292 | f = tree.find_all("gar", class_=re.compile("o b")) | ||
293 | self.assertSelects(f, []) | ||
294 | |||
295 | def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): | ||
296 | soup = self.soup("<a class='bar'>Found it</a>") | ||
297 | |||
298 | self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) | ||
299 | |||
300 | def big_attribute_value(value): | ||
301 | return len(value) > 3 | ||
302 | |||
303 | self.assertSelects(soup.find_all("a", big_attribute_value), []) | ||
304 | |||
305 | def small_attribute_value(value): | ||
306 | return len(value) <= 3 | ||
307 | |||
308 | self.assertSelects( | ||
309 | soup.find_all("a", small_attribute_value), ["Found it"]) | ||
310 | |||
311 | def test_find_all_with_string_for_attrs_finds_multiple_classes(self): | ||
312 | soup = self.soup('<a class="foo bar"></a><a class="foo"></a>') | ||
313 | a, a2 = soup.find_all("a") | ||
314 | self.assertEqual([a, a2], soup.find_all("a", "foo")) | ||
315 | self.assertEqual([a], soup.find_all("a", "bar")) | ||
316 | |||
317 | # If you specify the class as a string that contains a | ||
318 | # space, only that specific value will be found. | ||
319 | self.assertEqual([a], soup.find_all("a", class_="foo bar")) | ||
320 | self.assertEqual([a], soup.find_all("a", "foo bar")) | ||
321 | self.assertEqual([], soup.find_all("a", "bar foo")) | ||
322 | |||
323 | def test_find_all_by_attribute_soupstrainer(self): | ||
324 | tree = self.soup(""" | ||
325 | <a id="first">Match.</a> | ||
326 | <a id="second">Non-match.</a>""") | ||
327 | |||
328 | strainer = SoupStrainer(attrs={'id' : 'first'}) | ||
329 | self.assertSelects(tree.find_all(strainer), ['Match.']) | ||
330 | |||
331 | def test_find_all_with_missing_atribute(self): | ||
332 | # You can pass in None as the value of an attribute to find_all. | ||
333 | # This will match tags that do not have that attribute set. | ||
334 | tree = self.soup("""<a id="1">ID present.</a> | ||
335 | <a>No ID present.</a> | ||
336 | <a id="">ID is empty.</a>""") | ||
337 | self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) | ||
338 | |||
339 | def test_find_all_with_defined_attribute(self): | ||
340 | # You can pass in None as the value of an attribute to find_all. | ||
341 | # This will match tags that have that attribute set to any value. | ||
342 | tree = self.soup("""<a id="1">ID present.</a> | ||
343 | <a>No ID present.</a> | ||
344 | <a id="">ID is empty.</a>""") | ||
345 | self.assertSelects( | ||
346 | tree.find_all(id=True), ["ID present.", "ID is empty."]) | ||
347 | |||
348 | def test_find_all_with_numeric_attribute(self): | ||
349 | # If you search for a number, it's treated as a string. | ||
350 | tree = self.soup("""<a id=1>Unquoted attribute.</a> | ||
351 | <a id="1">Quoted attribute.</a>""") | ||
352 | |||
353 | expected = ["Unquoted attribute.", "Quoted attribute."] | ||
354 | self.assertSelects(tree.find_all(id=1), expected) | ||
355 | self.assertSelects(tree.find_all(id="1"), expected) | ||
356 | |||
357 | def test_find_all_with_list_attribute_values(self): | ||
358 | # You can pass a list of attribute values instead of just one, | ||
359 | # and you'll get tags that match any of the values. | ||
360 | tree = self.soup("""<a id="1">1</a> | ||
361 | <a id="2">2</a> | ||
362 | <a id="3">3</a> | ||
363 | <a>No ID.</a>""") | ||
364 | self.assertSelects(tree.find_all(id=["1", "3", "4"]), | ||
365 | ["1", "3"]) | ||
366 | |||
367 | def test_find_all_with_regular_expression_attribute_value(self): | ||
368 | # You can pass a regular expression as an attribute value, and | ||
369 | # you'll get tags whose values for that attribute match the | ||
370 | # regular expression. | ||
371 | tree = self.soup("""<a id="a">One a.</a> | ||
372 | <a id="aa">Two as.</a> | ||
373 | <a id="ab">Mixed as and bs.</a> | ||
374 | <a id="b">One b.</a> | ||
375 | <a>No ID.</a>""") | ||
376 | |||
377 | self.assertSelects(tree.find_all(id=re.compile("^a+$")), | ||
378 | ["One a.", "Two as."]) | ||
379 | |||
380 | def test_find_by_name_and_containing_string(self): | ||
381 | soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>") | ||
382 | a = soup.a | ||
383 | |||
384 | self.assertEqual([a], soup.find_all("a", text="foo")) | ||
385 | self.assertEqual([], soup.find_all("a", text="bar")) | ||
386 | self.assertEqual([], soup.find_all("a", text="bar")) | ||
387 | |||
388 | def test_find_by_name_and_containing_string_when_string_is_buried(self): | ||
389 | soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>") | ||
390 | self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) | ||
391 | |||
392 | def test_find_by_attribute_and_containing_string(self): | ||
393 | soup = self.soup('<b id="1">foo</b><a id="2">foo</a>') | ||
394 | a = soup.a | ||
395 | |||
396 | self.assertEqual([a], soup.find_all(id=2, text="foo")) | ||
397 | self.assertEqual([], soup.find_all(id=1, text="bar")) | ||
398 | |||
399 | |||
400 | |||
401 | |||
402 | class TestIndex(TreeTest): | ||
403 | """Test Tag.index""" | ||
404 | def test_index(self): | ||
405 | tree = self.soup("""<div> | ||
406 | <a>Identical</a> | ||
407 | <b>Not identical</b> | ||
408 | <a>Identical</a> | ||
409 | |||
410 | <c><d>Identical with child</d></c> | ||
411 | <b>Also not identical</b> | ||
412 | <c><d>Identical with child</d></c> | ||
413 | </div>""") | ||
414 | div = tree.div | ||
415 | for i, element in enumerate(div.contents): | ||
416 | self.assertEqual(i, div.index(element)) | ||
417 | self.assertRaises(ValueError, tree.index, 1) | ||
418 | |||
419 | |||
420 | class TestParentOperations(TreeTest): | ||
421 | """Test navigation and searching through an element's parents.""" | ||
422 | |||
423 | def setUp(self): | ||
424 | super(TestParentOperations, self).setUp() | ||
425 | self.tree = self.soup('''<ul id="empty"></ul> | ||
426 | <ul id="top"> | ||
427 | <ul id="middle"> | ||
428 | <ul id="bottom"> | ||
429 | <b>Start here</b> | ||
430 | </ul> | ||
431 | </ul>''') | ||
432 | self.start = self.tree.b | ||
433 | |||
434 | |||
435 | def test_parent(self): | ||
436 | self.assertEqual(self.start.parent['id'], 'bottom') | ||
437 | self.assertEqual(self.start.parent.parent['id'], 'middle') | ||
438 | self.assertEqual(self.start.parent.parent.parent['id'], 'top') | ||
439 | |||
440 | def test_parent_of_top_tag_is_soup_object(self): | ||
441 | top_tag = self.tree.contents[0] | ||
442 | self.assertEqual(top_tag.parent, self.tree) | ||
443 | |||
444 | def test_soup_object_has_no_parent(self): | ||
445 | self.assertEqual(None, self.tree.parent) | ||
446 | |||
447 | def test_find_parents(self): | ||
448 | self.assertSelectsIDs( | ||
449 | self.start.find_parents('ul'), ['bottom', 'middle', 'top']) | ||
450 | self.assertSelectsIDs( | ||
451 | self.start.find_parents('ul', id="middle"), ['middle']) | ||
452 | |||
453 | def test_find_parent(self): | ||
454 | self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') | ||
455 | self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') | ||
456 | |||
457 | def test_parent_of_text_element(self): | ||
458 | text = self.tree.find(text="Start here") | ||
459 | self.assertEqual(text.parent.name, 'b') | ||
460 | |||
461 | def test_text_element_find_parent(self): | ||
462 | text = self.tree.find(text="Start here") | ||
463 | self.assertEqual(text.find_parent('ul')['id'], 'bottom') | ||
464 | |||
465 | def test_parent_generator(self): | ||
466 | parents = [parent['id'] for parent in self.start.parents | ||
467 | if parent is not None and 'id' in parent.attrs] | ||
468 | self.assertEqual(parents, ['bottom', 'middle', 'top']) | ||
469 | |||
470 | |||
471 | class ProximityTest(TreeTest): | ||
472 | |||
473 | def setUp(self): | ||
474 | super(TreeTest, self).setUp() | ||
475 | self.tree = self.soup( | ||
476 | '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>') | ||
477 | |||
478 | |||
479 | class TestNextOperations(ProximityTest): | ||
480 | |||
481 | def setUp(self): | ||
482 | super(TestNextOperations, self).setUp() | ||
483 | self.start = self.tree.b | ||
484 | |||
485 | def test_next(self): | ||
486 | self.assertEqual(self.start.next_element, "One") | ||
487 | self.assertEqual(self.start.next_element.next_element['id'], "2") | ||
488 | |||
489 | def test_next_of_last_item_is_none(self): | ||
490 | last = self.tree.find(text="Three") | ||
491 | self.assertEqual(last.next_element, None) | ||
492 | |||
493 | def test_next_of_root_is_none(self): | ||
494 | # The document root is outside the next/previous chain. | ||
495 | self.assertEqual(self.tree.next_element, None) | ||
496 | |||
497 | def test_find_all_next(self): | ||
498 | self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) | ||
499 | self.start.find_all_next(id=3) | ||
500 | self.assertSelects(self.start.find_all_next(id=3), ["Three"]) | ||
501 | |||
502 | def test_find_next(self): | ||
503 | self.assertEqual(self.start.find_next('b')['id'], '2') | ||
504 | self.assertEqual(self.start.find_next(text="Three"), "Three") | ||
505 | |||
506 | def test_find_next_for_text_element(self): | ||
507 | text = self.tree.find(text="One") | ||
508 | self.assertEqual(text.find_next("b").string, "Two") | ||
509 | self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) | ||
510 | |||
511 | def test_next_generator(self): | ||
512 | start = self.tree.find(text="Two") | ||
513 | successors = [node for node in start.next_elements] | ||
514 | # There are two successors: the final <b> tag and its text contents. | ||
515 | tag, contents = successors | ||
516 | self.assertEqual(tag['id'], '3') | ||
517 | self.assertEqual(contents, "Three") | ||
518 | |||
519 | class TestPreviousOperations(ProximityTest): | ||
520 | |||
521 | def setUp(self): | ||
522 | super(TestPreviousOperations, self).setUp() | ||
523 | self.end = self.tree.find(text="Three") | ||
524 | |||
525 | def test_previous(self): | ||
526 | self.assertEqual(self.end.previous_element['id'], "3") | ||
527 | self.assertEqual(self.end.previous_element.previous_element, "Two") | ||
528 | |||
529 | def test_previous_of_first_item_is_none(self): | ||
530 | first = self.tree.find('html') | ||
531 | self.assertEqual(first.previous_element, None) | ||
532 | |||
533 | def test_previous_of_root_is_none(self): | ||
534 | # The document root is outside the next/previous chain. | ||
535 | # XXX This is broken! | ||
536 | #self.assertEqual(self.tree.previous_element, None) | ||
537 | pass | ||
538 | |||
539 | def test_find_all_previous(self): | ||
540 | # The <b> tag containing the "Three" node is the predecessor | ||
541 | # of the "Three" node itself, which is why "Three" shows up | ||
542 | # here. | ||
543 | self.assertSelects( | ||
544 | self.end.find_all_previous('b'), ["Three", "Two", "One"]) | ||
545 | self.assertSelects(self.end.find_all_previous(id=1), ["One"]) | ||
546 | |||
547 | def test_find_previous(self): | ||
548 | self.assertEqual(self.end.find_previous('b')['id'], '3') | ||
549 | self.assertEqual(self.end.find_previous(text="One"), "One") | ||
550 | |||
551 | def test_find_previous_for_text_element(self): | ||
552 | text = self.tree.find(text="Three") | ||
553 | self.assertEqual(text.find_previous("b").string, "Three") | ||
554 | self.assertSelects( | ||
555 | text.find_all_previous("b"), ["Three", "Two", "One"]) | ||
556 | |||
557 | def test_previous_generator(self): | ||
558 | start = self.tree.find(text="One") | ||
559 | predecessors = [node for node in start.previous_elements] | ||
560 | |||
561 | # There are four predecessors: the <b> tag containing "One" | ||
562 | # the <body> tag, the <head> tag, and the <html> tag. | ||
563 | b, body, head, html = predecessors | ||
564 | self.assertEqual(b['id'], '1') | ||
565 | self.assertEqual(body.name, "body") | ||
566 | self.assertEqual(head.name, "head") | ||
567 | self.assertEqual(html.name, "html") | ||
568 | |||
569 | |||
570 | class SiblingTest(TreeTest): | ||
571 | |||
572 | def setUp(self): | ||
573 | super(SiblingTest, self).setUp() | ||
574 | markup = '''<html> | ||
575 | <span id="1"> | ||
576 | <span id="1.1"></span> | ||
577 | </span> | ||
578 | <span id="2"> | ||
579 | <span id="2.1"></span> | ||
580 | </span> | ||
581 | <span id="3"> | ||
582 | <span id="3.1"></span> | ||
583 | </span> | ||
584 | <span id="4"></span> | ||
585 | </html>''' | ||
586 | # All that whitespace looks good but makes the tests more | ||
587 | # difficult. Get rid of it. | ||
588 | markup = re.compile(r"\n\s*").sub("", markup) | ||
589 | self.tree = self.soup(markup) | ||
590 | |||
591 | |||
592 | class TestNextSibling(SiblingTest): | ||
593 | |||
594 | def setUp(self): | ||
595 | super(TestNextSibling, self).setUp() | ||
596 | self.start = self.tree.find(id="1") | ||
597 | |||
598 | def test_next_sibling_of_root_is_none(self): | ||
599 | self.assertEqual(self.tree.next_sibling, None) | ||
600 | |||
601 | def test_next_sibling(self): | ||
602 | self.assertEqual(self.start.next_sibling['id'], '2') | ||
603 | self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') | ||
604 | |||
605 | # Note the difference between next_sibling and next_element. | ||
606 | self.assertEqual(self.start.next_element['id'], '1.1') | ||
607 | |||
608 | def test_next_sibling_may_not_exist(self): | ||
609 | self.assertEqual(self.tree.html.next_sibling, None) | ||
610 | |||
611 | nested_span = self.tree.find(id="1.1") | ||
612 | self.assertEqual(nested_span.next_sibling, None) | ||
613 | |||
614 | last_span = self.tree.find(id="4") | ||
615 | self.assertEqual(last_span.next_sibling, None) | ||
616 | |||
617 | def test_find_next_sibling(self): | ||
618 | self.assertEqual(self.start.find_next_sibling('span')['id'], '2') | ||
619 | |||
620 | def test_next_siblings(self): | ||
621 | self.assertSelectsIDs(self.start.find_next_siblings("span"), | ||
622 | ['2', '3', '4']) | ||
623 | |||
624 | self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) | ||
625 | |||
626 | def test_next_sibling_for_text_element(self): | ||
627 | soup = self.soup("Foo<b>bar</b>baz") | ||
628 | start = soup.find(text="Foo") | ||
629 | self.assertEqual(start.next_sibling.name, 'b') | ||
630 | self.assertEqual(start.next_sibling.next_sibling, 'baz') | ||
631 | |||
632 | self.assertSelects(start.find_next_siblings('b'), ['bar']) | ||
633 | self.assertEqual(start.find_next_sibling(text="baz"), "baz") | ||
634 | self.assertEqual(start.find_next_sibling(text="nonesuch"), None) | ||
635 | |||
636 | |||
637 | class TestPreviousSibling(SiblingTest): | ||
638 | |||
639 | def setUp(self): | ||
640 | super(TestPreviousSibling, self).setUp() | ||
641 | self.end = self.tree.find(id="4") | ||
642 | |||
643 | def test_previous_sibling_of_root_is_none(self): | ||
644 | self.assertEqual(self.tree.previous_sibling, None) | ||
645 | |||
646 | def test_previous_sibling(self): | ||
647 | self.assertEqual(self.end.previous_sibling['id'], '3') | ||
648 | self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') | ||
649 | |||
650 | # Note the difference between previous_sibling and previous_element. | ||
651 | self.assertEqual(self.end.previous_element['id'], '3.1') | ||
652 | |||
653 | def test_previous_sibling_may_not_exist(self): | ||
654 | self.assertEqual(self.tree.html.previous_sibling, None) | ||
655 | |||
656 | nested_span = self.tree.find(id="1.1") | ||
657 | self.assertEqual(nested_span.previous_sibling, None) | ||
658 | |||
659 | first_span = self.tree.find(id="1") | ||
660 | self.assertEqual(first_span.previous_sibling, None) | ||
661 | |||
662 | def test_find_previous_sibling(self): | ||
663 | self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') | ||
664 | |||
665 | def test_previous_siblings(self): | ||
666 | self.assertSelectsIDs(self.end.find_previous_siblings("span"), | ||
667 | ['3', '2', '1']) | ||
668 | |||
669 | self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) | ||
670 | |||
671 | def test_previous_sibling_for_text_element(self): | ||
672 | soup = self.soup("Foo<b>bar</b>baz") | ||
673 | start = soup.find(text="baz") | ||
674 | self.assertEqual(start.previous_sibling.name, 'b') | ||
675 | self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') | ||
676 | |||
677 | self.assertSelects(start.find_previous_siblings('b'), ['bar']) | ||
678 | self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") | ||
679 | self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) | ||
680 | |||
681 | |||
682 | class TestTagCreation(SoupTest): | ||
683 | """Test the ability to create new tags.""" | ||
684 | def test_new_tag(self): | ||
685 | soup = self.soup("") | ||
686 | new_tag = soup.new_tag("foo", bar="baz") | ||
687 | self.assertTrue(isinstance(new_tag, Tag)) | ||
688 | self.assertEqual("foo", new_tag.name) | ||
689 | self.assertEqual(dict(bar="baz"), new_tag.attrs) | ||
690 | self.assertEqual(None, new_tag.parent) | ||
691 | |||
692 | def test_tag_inherits_self_closing_rules_from_builder(self): | ||
693 | if XML_BUILDER_PRESENT: | ||
694 | xml_soup = BeautifulSoup("", "lxml-xml") | ||
695 | xml_br = xml_soup.new_tag("br") | ||
696 | xml_p = xml_soup.new_tag("p") | ||
697 | |||
698 | # Both the <br> and <p> tag are empty-element, just because | ||
699 | # they have no contents. | ||
700 | self.assertEqual(b"<br/>", xml_br.encode()) | ||
701 | self.assertEqual(b"<p/>", xml_p.encode()) | ||
702 | |||
703 | html_soup = BeautifulSoup("", "html.parser") | ||
704 | html_br = html_soup.new_tag("br") | ||
705 | html_p = html_soup.new_tag("p") | ||
706 | |||
707 | # The HTML builder users HTML's rules about which tags are | ||
708 | # empty-element tags, and the new tags reflect these rules. | ||
709 | self.assertEqual(b"<br/>", html_br.encode()) | ||
710 | self.assertEqual(b"<p></p>", html_p.encode()) | ||
711 | |||
712 | def test_new_string_creates_navigablestring(self): | ||
713 | soup = self.soup("") | ||
714 | s = soup.new_string("foo") | ||
715 | self.assertEqual("foo", s) | ||
716 | self.assertTrue(isinstance(s, NavigableString)) | ||
717 | |||
718 | def test_new_string_can_create_navigablestring_subclass(self): | ||
719 | soup = self.soup("") | ||
720 | s = soup.new_string("foo", Comment) | ||
721 | self.assertEqual("foo", s) | ||
722 | self.assertTrue(isinstance(s, Comment)) | ||
723 | |||
class TestTreeModification(SoupTest):
    """Tests for modifying a parse tree in place: attribute assignment,
    insert/append, insert_before/insert_after, replace_with, unwrap,
    wrap, extract, clear, and .string assignment."""

    def test_attribute_modification(self):
        # Attributes can be set, deleted, and added through dict-style access.
        soup = self.soup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        del(soup.a['id'])
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        # Tags constructed directly (rather than by a parser) can be
        # inserted into an existing tree.
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        doc = """<p id="1">Don't leave me <b>here</b>.</p>
<p id="2">Don\'t leave!</p>"""
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
        # Once a tag is extracted it has no parent, so neither operation
        # has a defined meaning.
        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
        a = soup.a
        a.extract()
        self.assertEqual(None, a.parent)
        self.assertRaises(ValueError, a.unwrap)
        self.assertRaises(ValueError, a.replace_with, soup.c)

    def test_replace_tag_with_itself(self):
        # Replacing a tag with itself is a no-op.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        b = a.contents[0]
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replaceWith('')
        right.replaceWith('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        # Replacing the very last node in the document keeps the
        # next_element/previous_element chain consistent.
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they were converted to NavigableStrings.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        # Appending a tag to its current position is a no-op.
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        # insert_before() on a tag that's already in the tree moves it.
        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        # "After" is meaningless for an element with no parent, for the
        # BeautifulSoup object itself, and for an element relative to itself.
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        show.replace_with(no)
        # Note the two spaces left behind where <b>no</b> used to be.
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_extracts_tag_from_elsewhere(self):
        # Wrapping with a tag that's already in the tree moves that tag.
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        # The two spaces that surrounded the <div> remain.
        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
        self.assertEqual(len(soup.body.contents), 2)
        self.assertEqual(extracted.parent, None)
        self.assertEqual(extracted.previous_element, None)
        self.assertEqual(extracted.next_element.next_element, None)

        # The gap where the extracted tag used to be has been mended.
        content_1 = soup.find(text="Some content. ")
        content_2 = soup.find(text=" More content.")
        self.assertEqual(content_1.next_element, content_2)
        self.assertEqual(content_1.next_sibling, content_2)
        self.assertEqual(content_2.previous_element, content_1)
        self.assertEqual(content_2.previous_sibling, content_1)

    def test_extract_distinguishes_between_identical_strings(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        foo_1 = soup.a.string
        bar_1 = soup.b.string
        foo_2 = soup.new_string("foo")
        bar_2 = soup.new_string("bar")
        soup.a.append(foo_2)
        soup.b.append(bar_2)

        # Now there are two identical strings in the <a> tag, and two
        # in the <b> tag. Let's remove the first "foo" and the second
        # "bar".
        foo_1.extract()
        bar_2.extract()
        self.assertEqual(foo_2, soup.a.string)
        self.assertEqual(bar_2, soup.b.string)

    def test_extract_multiples_of_same_tag(self):
        soup = self.soup("""
<html>
<head>
<script>foo</script>
</head>
<body>
<script>bar</script>
<a></a>
</body>
<script>baz</script>
</html>""")
        # Each extract() removes the first remaining <script>.
        [soup.script.extract() for i in soup.find_all("script")]
        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))


    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
        soup = self.soup(
            '<html>\n'
            '<body>hi</body>\n'
            '</html>')
        soup.find('body').extract()
        self.assertEqual(None, soup.find('body'))


    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
        # clear using extract()
        a = soup.a
        soup.p.clear()
        self.assertEqual(len(soup.p.contents), 0)
        self.assertTrue(hasattr(a, "contents"))

        # clear using decompose()
        em = a.em
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
        soup.a.string = "foo"
        self.assertEqual(soup.a.contents, ["foo"])
        # Setting .string replaces any existing children.
        soup.b.string = "bar"
        self.assertEqual(soup.b.contents, ["bar"])

    def test_string_set_does_not_affect_original_string(self):
        soup = self.soup("<a><b>foo</b><c>bar</c>")
        soup.b.string = soup.c.string
        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")

    def test_set_string_preserves_class_of_string(self):
        soup = self.soup("<a></a>")
        cdata = CData("foo")
        soup.a.string = cdata
        self.assertTrue(isinstance(soup.a.string, CData))
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        # Old-style .fooTag access still works but issues a warning.
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from
        __in__. has_attr() checks the tag's attributes and __in__
        checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))


    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with multiple children has no .string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.b.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        # Comments are included only when explicitly requested via types=.
        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
class TestCDAtaListAttributes(SoupTest):
    """Testing cdata-list attributes like 'class'."""

    def test_single_value_becomes_list(self):
        # Even a single value is presented as a one-element list.
        doc = self.soup("<a class='foo'>")
        self.assertEqual(["foo"], doc.a['class'])

    def test_multiple_values_becomes_list(self):
        doc = self.soup("<a class='foo bar'>")
        self.assertEqual(["foo", "bar"], doc.a['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        # Tabs and newlines split values just like spaces do.
        doc = self.soup("<a class='foo\tbar\nbaz'>")
        self.assertEqual(["foo", "bar", "baz"], doc.a['class'])

    def test_attributes_joined_into_string_on_output(self):
        # On output the list is joined back into a space-separated string.
        doc = self.soup("<a class='foo\tbar'>")
        self.assertEqual(b'<a class="foo bar"></a>', doc.a.encode())

    def test_accept_charset(self):
        doc = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
        self.assertEqual(['ISO-8859-1', 'UTF-8'], doc.form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        # accept-charset is a cdata-list attribute for the <form> tag,
        # but not for any other tag, so on <a> it stays a plain string.
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        doc = self.soup(markup)
        self.assertEqual('ISO-8859-1 UTF-8', doc.a['accept-charset'])

    def test_string_has_immutable_name_property(self):
        # A NavigableString has no name, and its name cannot be assigned.
        navigable = self.soup("s").string
        self.assertEqual(None, navigable.name)
        def assign():
            navigable.name = 'foo'
        self.assertRaises(AttributeError, assign)
class TestPersistence(SoupTest):
    """Testing features like pickle and deepcopy."""

    def setUp(self):
        super(TestPersistence, self).setUp()
        # A realistic document used as the round-trip fixture.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        dumped = pickle.dumps(self.tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        # Making a deepcopy of a tree yields an identical tree.
        copied = copy.deepcopy(self.tree)
        self.assertEqual(copied.decode(), self.tree.decode())

    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        # A copied NavigableString keeps its value but loses all of its
        # connections to the original tree.
        html = "<b>Foo<a></a></b><b>Bar</b>"
        soup = self.soup(html)
        s1 = soup.find(string="Foo")
        s2 = copy.copy(s1)
        self.assertEqual(s1, s2)
        self.assertEqual(None, s2.parent)
        self.assertEqual(None, s2.next_element)
        self.assertNotEqual(None, s1.next_sibling)
        self.assertEqual(None, s2.next_sibling)
        self.assertEqual(None, s2.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
        html = "<b><!--Foo--></b>"
        soup = self.soup(html)
        s1 = soup.string
        s2 = copy.copy(s1)
        self.assertEqual(s1, s2)
        self.assertTrue(isinstance(s2, Comment))

    def test_copy_entire_soup(self):
        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        soup_copy = copy.copy(soup)
        self.assertEqual(soup, soup_copy)

    def test_copy_tag_copies_contents(self):
        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        div = soup.div
        div_copy = copy.copy(div)

        # The two tags look the same, and evaluate to equal.
        self.assertEqual(str(div), str(div_copy))
        self.assertEqual(div, div_copy)

        # But they're not the same object.
        self.assertFalse(div is div_copy)

        # And they don't have the same relation to the parse tree. The
        # copy is not associated with a parse tree at all.
        self.assertEqual(None, div_copy.parent)
        self.assertEqual(None, div_copy.previous_element)
        self.assertEqual(None, div_copy.find(string='Bar').next_element)
        self.assertNotEqual(None, div.find(string='Bar').next_element)
class TestSubstitutions(SoupTest):
    """Tests for the output formatters that control entity substitution
    when a tree is rendered back out as a string."""

    def test_default_formatter_is_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        # The "html" formatter also converts the e-with-acute into a
        # named HTML entity.
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        self.assertEqual(
            decoded,
            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(decoded,
                         self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        markup = "<b>&lt;foo&gt;</b><b>bar</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter = lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for("<b><FOO></b><b>BAR</b>"))

    def test_formatter_is_run_on_attribute_values(self):
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        self.assertEqual(markup, a.decode(formatter=None))
        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        # The contents of <script> are never entity-escaped.
        doc = """
  <script type="text/javascript">
   console.log("< < hey > > ");
  </script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertTrue(b"< < hey > >" in encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        # The contents of <style> are never entity-escaped.
        doc = """
  <style type="text/css">
   console.log("< < hey > > ");
  </style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertTrue(b"< < hey > >" in encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
        # Everything outside the <pre> tag is reformatted, but everything
        # inside is left alone.
        self.assertEqual(
            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter(self):
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter = lambda x: x.upper())
        self.assertTrue("FOO" in pretty)

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        self.assertEqual(str, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertTrue(b"charset=utf-8" in utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertTrue(b"charset=euc_jp" in euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertTrue(b"charset=shift-jis" in shift_jis)

        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertTrue("charset=utf-16" in utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
class TestEncoding(SoupTest):
    """Test the ability to encode objects into strings."""

    def test_unicode_string_can_be_encoded(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        self.assertEqual("\N{SNOWMAN}".encode("utf-8"),
                         soup.b.string.encode("utf-8"))

    def test_tag_containing_unicode_string_can_be_encoded(self):
        markup = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(markup)
        self.assertEqual(markup.encode("utf-8"), soup.b.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        # A character the target codec cannot represent becomes a
        # numeric character reference instead of raising.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        self.assertEqual(b"<b>&#9731;</b>", soup.b.encode("ascii"))

    def test_encoding_can_be_made_strict(self):
        # errors="strict" propagates the usual UnicodeEncodeError.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        self.assertRaises(
            UnicodeEncodeError, soup.encode, "ascii", errors="strict")

    def test_decode_contents(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())

    def test_encode_contents(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        self.assertEqual("\N{SNOWMAN}".encode("utf8"),
                         soup.b.encode_contents(encoding="utf8"))

    def test_deprecated_renderContents(self):
        # renderContents() is the old (pre-4.x) spelling of
        # encode_contents().
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        self.assertEqual("\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())

    def test_repr(self):
        markup = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(markup)
        if PY3K:
            self.assertEqual(markup, repr(soup))
        else:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
class TestNavigableStringSubclasses(SoupTest):

    def test_cdata(self):
        # No current builder turns CDATA sections into CData objects,
        # but they can be created by hand and inserted into a tree.
        soup = self.soup("")
        soup.insert(1, CData("foo"))
        self.assertEqual("<![CDATA[foo]]>", str(soup))
        self.assertEqual("foo", soup.find(text="foo"))
        self.assertEqual("foo", soup.contents[0])

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """
        self.count = 0
        def increment(*args):
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        soup.insert(1, CData("<><><>"))
        # The formatter ran exactly once, but its output was discarded.
        self.assertEqual(
            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # Unlike the other NavigableString subclasses, a Doctype always
        # renders with a trailing newline.
        soup = self.soup("")
        soup.insert(1, Doctype("foo"))
        self.assertEqual(b"<!DOCTYPE foo>\n", soup.encode())

    def test_declaration(self):
        self.assertEqual("<?foo?>", Declaration("foo").output_ready())
class TestSoupSelector(TreeTest):
    """Tests for the CSS selector support exposed through select().

    The fixture document below gives every interesting tag a
    distinctive id, so each test can assert exactly which elements a
    selector matched.
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setUp(self):
        # Parse the shared fixture with the always-available
        # html.parser builder.
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assertSelects(self, selector, expected_ids):
        """Run `selector` and check the ids of the tags it matched.

        Match order is ignored: both lists are sorted before the
        comparison.
        """
        el_ids = [el['id'] for el in self.soup.select(selector)]
        el_ids.sort()
        expected_ids.sort()
        self.assertEqual(expected_ids, el_ids,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(expected_ids), ', '.join(el_ids)
            )
        )

    # Alias so tests can use either name.
    assertSelect = assertSelects

    def assertSelectMultiple(self, *tests):
        # Each test is a (selector, expected-id-list) pair.
        for selector, expected_ids in tests:
            self.assertSelect(selector, expected_ids)

    def test_one_tag_one(self):
        els = self.soup.select('title')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].name, 'title')
        self.assertEqual(els[0].contents, ['The title'])

    def test_one_tag_many(self):
        els = self.soup.select('div')
        self.assertEqual(len(els), 4)
        for div in els:
            self.assertEqual(div.name, 'div')

        # select_one() returns only the first match.
        el = self.soup.select_one('div')
        self.assertEqual('main', el['id'])

    def test_select_one_returns_none_if_no_match(self):
        match = self.soup.select_one('nonexistenttag')
        self.assertEqual(None, match)


    def test_tag_in_tag_one(self):
        # NOTE(review): `els` is unused; assertSelects runs the same
        # select() again internally.
        els = self.soup.select('div div')
        self.assertSelects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(ValueError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        # Tag names containing dashes (custom elements) are selectable.
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
        self.assertEqual(dashed[0]['id'], 'dash2')

    def test_dashed_tag_text(self):
        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')

    def test_select_dashed_matches_find_all(self):
        self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))

    def test_header_tags(self):
        self.assertSelectMultiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            self.assertEqual(len(els), 1)
            self.assertEqual(els[0].name, 'p')
            self.assertEqual(els[0]['class'], ['onep'])

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        self.assertEqual(len(els), 0)

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assertSelects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        self.assertEqual(len(els), 0)

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        self.assertEqual(len(els), 3)
        for el in els:
            self.assertEqual(el.name, 'p')
        self.assertEqual(els[1]['class'], ['onep'])
        # The first <p> in the fixture has no class attribute at all.
        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            self.assertEqual(len(self.soup.select(selector)), 0)

    def test_multi_class_support(self):
        # A tag with class="class1 class2 class3" matches a selector
        # naming any one of those classes.
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
            '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assertSelects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        # Chained class selectors require all named classes to be present.
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assertSelects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
            ('[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        # [attr~=val] matches when val is one of the attribute's
        # whitespace-separated words.
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1'])
        )

    # NOTE(review): "hypen" is a typo for "hyphen", but renaming the
    # test method would change the test's identity, so it is kept.
    def test_attribute_exact_or_hypen(self):
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1'])
        )

    def test_unsupported_pseudoclass(self):
        self.assertRaises(
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        # nth-of-type requires a numeric argument.
        self.assertRaises(
            NotImplementedError, self.soup.select, "a:nth-of-type(a)")


    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Pass in an invalid value.
        self.assertRaises(
            ValueError, self.soup.select, 'div p:nth-of-type(0)')

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

    def test_id_child_selector_nth_of_type(self):
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        self.assertSelectsIDs(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        self.assertEqual([], self.soup.select('#p1 + p'))

    def test_general_sibling_selector(self):
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(ValueError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        self.assertSelects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        self.assertSelects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        self.assertSelects('x,    y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        self.assertSelects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        self.assertSelects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        self.assertSelects('div x,y,  z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        self.assertRaises(ValueError, self.soup.select, ',x, y')
        self.assertRaises(ValueError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
2004 | |||