diff options
author | Aníbal Limón <anibal.limon@linux.intel.com> | 2014-11-05 12:10:27 -0600 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2014-11-06 16:45:23 +0000 |
commit | 25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch) | |
tree | 7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/tests | |
parent | bc6330cb7f288e76209410b0812aff1dbfa90950 (diff) | |
download | poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz |
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added Beautifulsoup module because fetch/wget latest_versionstring
method depends on it.
This adds support to the fetch/wget.py module for searching for new package
versions on upstream sites.
(Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a)
Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/tests')
-rw-r--r-- | bitbake/lib/bs4/tests/__init__.py | 1 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_builder_registry.py | 141 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_docs.py | 36 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_html5lib.py | 85 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_htmlparser.py | 19 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_lxml.py | 91 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_soup.py | 434 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_tree.py | 1829 |
8 files changed, 2636 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/tests/__init__.py b/bitbake/lib/bs4/tests/__init__.py new file mode 100644 index 0000000000..142c8cc3f1 --- /dev/null +++ b/bitbake/lib/bs4/tests/__init__.py | |||
@@ -0,0 +1 @@ | |||
"The beautifulsoup tests." | |||
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py new file mode 100644 index 0000000000..92ad10fb04 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_builder_registry.py | |||
@@ -0,0 +1,141 @@ | |||
1 | """Tests of the builder registry.""" | ||
2 | |||
3 | import unittest | ||
4 | |||
5 | from bs4 import BeautifulSoup | ||
6 | from bs4.builder import ( | ||
7 | builder_registry as registry, | ||
8 | HTMLParserTreeBuilder, | ||
9 | TreeBuilderRegistry, | ||
10 | ) | ||
11 | |||
12 | try: | ||
13 | from bs4.builder import HTML5TreeBuilder | ||
14 | HTML5LIB_PRESENT = True | ||
15 | except ImportError: | ||
16 | HTML5LIB_PRESENT = False | ||
17 | |||
18 | try: | ||
19 | from bs4.builder import ( | ||
20 | LXMLTreeBuilderForXML, | ||
21 | LXMLTreeBuilder, | ||
22 | ) | ||
23 | LXML_PRESENT = True | ||
24 | except ImportError: | ||
25 | LXML_PRESENT = False | ||
26 | |||
27 | |||
class BuiltInRegistryTest(unittest.TestCase):
    """Exercise the module-level registry with the stock builders registered."""

    def test_combination(self):
        # A pair of features resolves to one concrete builder class.
        if LXML_PRESENT:
            self.assertEqual(LXMLTreeBuilder,
                             registry.lookup('fast', 'html'))
        if LXML_PRESENT:
            self.assertEqual(LXMLTreeBuilderForXML,
                             registry.lookup('permissive', 'xml'))
        self.assertEqual(HTMLParserTreeBuilder,
                         registry.lookup('strict', 'html'))
        if HTML5LIB_PRESENT:
            self.assertEqual(HTML5TreeBuilder,
                             registry.lookup('html5lib', 'html'))

    def test_lookup_by_markup_type(self):
        # A bare markup type ('html'/'xml') picks the best available builder.
        if LXML_PRESENT:
            self.assertEqual(LXMLTreeBuilder, registry.lookup('html'))
            self.assertEqual(LXMLTreeBuilderForXML, registry.lookup('xml'))
        else:
            # Without lxml there is no XML builder at all.
            self.assertEqual(None, registry.lookup('xml'))
            if HTML5LIB_PRESENT:
                self.assertEqual(HTML5TreeBuilder, registry.lookup('html'))
            else:
                self.assertEqual(HTMLParserTreeBuilder,
                                 registry.lookup('html'))

    def test_named_library(self):
        # Each parser library can also be looked up by its own name.
        if LXML_PRESENT:
            self.assertEqual(LXMLTreeBuilderForXML,
                             registry.lookup('lxml', 'xml'))
            self.assertEqual(LXMLTreeBuilder,
                             registry.lookup('lxml', 'html'))
        if HTML5LIB_PRESENT:
            self.assertEqual(HTML5TreeBuilder,
                             registry.lookup('html5lib'))

        self.assertEqual(HTMLParserTreeBuilder,
                         registry.lookup('html.parser'))

    def test_beautifulsoup_constructor_does_lookup(self):
        # The constructor accepts a single feature string...
        BeautifulSoup("", features="html")
        # ...or a list of feature strings.
        BeautifulSoup("", features=["html", "fast"])

        # An unsatisfiable feature request raises ValueError.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")
79 | |||
class RegistryTest(unittest.TestCase):
    """Behavior of the TreeBuilderRegistry class itself, using a fresh
    registry and throwaway dummy builder classes."""

    def setUp(self):
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        # Manufacture a dummy builder class advertising exactly the given
        # features, register it, and hand it back for comparison.
        dummy = type('Builder_' + '_'.join(feature_list),
                     (object,), {'features': feature_list})
        self.registry.register(dummy)
        return dummy

    def test_register_with_no_features(self):
        featureless = self.builder_for_features()

        # A builder with no advertised features is invisible to
        # feature-based lookup...
        self.assertEqual(None, self.registry.lookup('foo'))

        # ...but a lookup with no features finds it, since it is the
        # only builder registered.
        self.assertEqual(featureless, self.registry.lookup())

    def test_register_with_features_makes_lookup_succeed(self):
        two_features = self.builder_for_features('foo', 'bar')
        self.assertEqual(two_features, self.registry.lookup('foo'))
        self.assertEqual(two_features, self.registry.lookup('bar'))

    def test_lookup_fails_when_no_builder_implements_feature(self):
        self.builder_for_features('foo', 'bar')
        self.assertEqual(None, self.registry.lookup('baz'))

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        self.builder_for_features('foo')
        registered_last = self.builder_for_features('bar')
        self.assertEqual(registered_last, self.registry.lookup())

    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertEqual(None, self.registry.lookup())

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        self.builder_for_features('foo')
        self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        self.builder_for_features('bar')
        self.builder_for_features('foo')

        # Two builders feature both 'foo' and 'bar'; the one registered
        # later (with 'quux') wins.
        self.assertEqual(has_both_late,
                         self.registry.lookup('foo', 'bar'))

        # Only one builder features 'foo', 'bar', AND 'baz'.
        self.assertEqual(has_both_early,
                         self.registry.lookup('foo', 'bar', 'baz'))

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        self.builder_for_features('foo', 'bar')
        self.builder_for_features('foo', 'baz')
        # No single builder has both 'bar' and 'baz'.
        self.assertEqual(None, self.registry.lookup('bar', 'baz'))
diff --git a/bitbake/lib/bs4/tests/test_docs.py b/bitbake/lib/bs4/tests/test_docs.py new file mode 100644 index 0000000000..5b9f677093 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_docs.py | |||
@@ -0,0 +1,36 @@ | |||
"Test harness for doctests."

# pylint: disable-msg=E0611,W0142

__metaclass__ = type
__all__ = [
    'additional_tests',
    ]

import atexit
import doctest
import os
#from pkg_resources import (
#    resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest

# Flags shared by every doctest run: allow '...' ellipses, collapse
# whitespace differences, and report failures as an ndiff.
DOCTEST_FLAGS = (
    doctest.ELLIPSIS |
    doctest.NORMALIZE_WHITESPACE |
    doctest.REPORT_NDIFF)


# NOTE(review): additional_tests is commented out (it depends on
# pkg_resources), yet __all__ still advertises it, so a star-import of
# this module will fail -- confirm this is intended.
# def additional_tests():
#     "Run the doc tests (README.txt and docs/*, if any exist)"
#     doctest_files = [
#         os.path.abspath(resource_filename('bs4', 'README.txt'))]
#     if resource_exists('bs4', 'docs'):
#         for name in resource_listdir('bs4', 'docs'):
#             if name.endswith('.txt'):
#                 doctest_files.append(
#                     os.path.abspath(
#                         resource_filename('bs4', 'docs/%s' % name)))
#     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
#     atexit.register(cleanup_resources)
#     return unittest.TestSuite((
#         doctest.DocFileSuite(*doctest_files, **kwargs)))
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py new file mode 100644 index 0000000000..594c3e1f26 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_html5lib.py | |||
@@ -0,0 +1,85 @@ | |||
1 | """Tests to ensure that the html5lib tree builder generates good trees.""" | ||
2 | |||
3 | import warnings | ||
4 | |||
# html5lib is optional: probe for its tree builder and record the result
# so the test class below can be skipped when it is unavailable.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # Fixed from the Python 2-only "except ImportError, e:" form; the
    # bound exception object was never used.
    HTML5LIB_PRESENT = False
10 | from bs4.element import SoupStrainer | ||
11 | from bs4.testing import ( | ||
12 | HTML5TreeBuilderSmokeTest, | ||
13 | SoupTest, | ||
14 | skipIf, | ||
15 | ) | ||
16 | |||
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # html5lib cannot restrict parsing to part of a document, so a
        # SoupStrainer is ignored and a warning is issued instead.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(caught[0].message))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        # NOTE(review): the continuation lines' indentation was lost in
        # extraction; reconstructed from upstream bs4 4.3.2.
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
   <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Reaching the <p> tag proves the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py new file mode 100644 index 0000000000..bcb5ed232f --- /dev/null +++ b/bitbake/lib/bs4/tests/test_htmlparser.py | |||
@@ -0,0 +1,19 @@ | |||
1 | """Tests to ensure that the html.parser tree builder generates good | ||
2 | trees.""" | ||
3 | |||
4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest | ||
5 | from bs4.builder import HTMLParserTreeBuilder | ||
6 | |||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke-test the tree builder backed by the stdlib HTML parser."""

    @property
    def default_builder(self):
        return HTMLParserTreeBuilder()

    # html.parser can't handle namespaced doctypes, so the two inherited
    # doctype tests are overridden with no-ops.

    def test_namespaced_system_doctype(self):
        pass

    def test_namespaced_public_doctype(self):
        pass
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py new file mode 100644 index 0000000000..2b2e9b7e78 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_lxml.py | |||
@@ -0,0 +1,91 @@ | |||
1 | """Tests to ensure that the lxml tree builder generates good trees.""" | ||
2 | |||
3 | import re | ||
4 | import warnings | ||
5 | |||
# lxml is optional.  Record whether it imports, and its version, so the
# smoke tests below can be skipped (or version-gated) cleanly.
try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
    # Fixed from the Python 2-only "except ImportError, e:" form; the
    # bound exception object was never used.
    LXML_PRESENT = False
    LXML_VERSION = (0,)
13 | |||
14 | if LXML_PRESENT: | ||
15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML | ||
16 | |||
17 | from bs4 import ( | ||
18 | BeautifulSoup, | ||
19 | BeautifulStoneSoup, | ||
20 | ) | ||
21 | from bs4.element import Comment, Doctype, SoupStrainer | ||
22 | from bs4.testing import skipIf | ||
23 | from bs4.tests import test_htmlparser | ||
24 | from bs4.testing import ( | ||
25 | HTMLTreeBuilderSmokeTest, | ||
26 | XMLTreeBuilderSmokeTest, | ||
27 | SoupTest, | ||
28 | skipIf, | ||
29 | ) | ||
30 | |||
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # Numeric character references beyond the Unicode range are
        # simply dropped.  (NOTE(review): the entity text was mangled by
        # the HTML rendering; reconstructed from upstream bs4 4.3.2.)
        self.assertSoupEquals(
            "<p>foo&#10000000000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.
    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        # The deprecated BSS class must pick an XML builder when one is
        # installed, and warn about its own deprecation.
        with warnings.catch_warnings(record=True) as caught:
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))
        self.assertTrue(
            "BeautifulStoneSoup class is deprecated" in str(caught[0].message))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))
81 | |||
82 | |||
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    # All actual tests are inherited; this class only supplies the
    # XML-mode lxml builder under test.
    @property
    def default_builder(self):
        return LXMLTreeBuilderForXML()
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py new file mode 100644 index 0000000000..47ac245f99 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_soup.py | |||
@@ -0,0 +1,434 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Tests of Beautiful Soup as a whole.""" | ||
3 | |||
4 | import logging | ||
5 | import unittest | ||
6 | import sys | ||
7 | import tempfile | ||
8 | |||
9 | from bs4 import ( | ||
10 | BeautifulSoup, | ||
11 | BeautifulStoneSoup, | ||
12 | ) | ||
13 | from bs4.element import ( | ||
14 | CharsetMetaAttributeValue, | ||
15 | ContentMetaAttributeValue, | ||
16 | SoupStrainer, | ||
17 | NamespacedAttribute, | ||
18 | ) | ||
19 | import bs4.dammit | ||
20 | from bs4.dammit import ( | ||
21 | EntitySubstitution, | ||
22 | UnicodeDammit, | ||
23 | ) | ||
24 | from bs4.testing import ( | ||
25 | SoupTest, | ||
26 | skipIf, | ||
27 | ) | ||
28 | import warnings | ||
29 | |||
# lxml is optional; some tests below only apply when it is installed.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    # Fixed from the Python 2-only "except ImportError, e:" form; the
    # bound exception object was never used.
    LXML_PRESENT = False
35 | |||
# Interpreter-version gates consumed by @skipIf below: the stdlib HTML
# parser on very old interpreters breaks some of these tests.
PYTHON_2_PRE_2_7 = (sys.version_info < (2, 7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))
38 | |||
class TestConstructor(SoupTest):
    """Sanity checks on feeding unusual Unicode input to the constructor."""

    def test_short_unicode_input(self):
        # Very short non-ASCII input must survive encoding detection.
        parsed = self.soup(u"<h1>éé</h1>")
        self.assertEqual(u"éé", parsed.h1.string)

    def test_embedded_null(self):
        # An embedded NUL byte is preserved, not treated as a terminator.
        parsed = self.soup(u"<h1>foo\0bar</h1>")
        self.assertEqual(u"foo\0bar", parsed.h1.string)
50 | |||
51 | |||
class TestDeprecatedConstructorArguments(SoupTest):
    """Old camelCase constructor arguments still work but raise warnings."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup("<a><b></b></a>",
                             parseOnlyThese=SoupStrainer("b"))
        warning_text = str(caught[0].message)
        # The warning names both the old and the new argument.
        self.assertTrue("parseOnlyThese" in warning_text)
        self.assertTrue("parse_only" in warning_text)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as caught:
            utf8_bytes = b"\xc3\xa9"
            soup = self.soup(utf8_bytes, fromEncoding="utf8")
        warning_text = str(caught[0].message)
        self.assertTrue("fromEncoding" in warning_text)
        self.assertTrue("from_encoding" in warning_text)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # A keyword that was never valid is a TypeError, not a warning.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
74 | |||
class TestWarnings(SoupTest):
    """Warnings emitted when the 'markup' looks like something else."""

    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as caught:
                soup = self.soup(filename)
                warning_text = str(caught[0].message)
                self.assertTrue("looks like a filename" in warning_text)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup(filename)
        self.assertEqual(0, len(caught))

    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup("http://www.crummy.com/")
        warning_text = str(caught[0].message)
        self.assertTrue("looks like a URL" in warning_text)

        # A URL embedded in a longer string is not mistaken for one.
        with warnings.catch_warnings(record=True) as caught:
            soup = self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(caught))
102 | |||
class TestSelectiveParsing(SoupTest):
    """Parsing only the parts of a document matched by a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        soup = self.soup(markup, parse_only=only_b_tags)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
110 | |||
111 | |||
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class.

    NOTE(review): the expected strings containing HTML entities were
    decoded by the HTML rendering of this file; they are reconstructed
    here from upstream bs4 4.3.2.
    """

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters with named HTML entities are substituted;
        # everything else passes through.
        original = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(original),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so they
        # get a dedicated test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        unquoted = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(unquoted, False), unquoted)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual('"Welcome"',
                         self.sub.substitute_xml("Welcome", True))
        self.assertEqual('"Bob\'s Bar"',
                         self.sub.substitute_xml("Bob's Bar", True))

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        self.assertEqual("'Welcome to \"my bar\"'",
                         self.sub.substitute_xml('Welcome to "my bar"', True))

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        self.assertEqual(
            self.sub.substitute_xml('Welcome to "Bob\'s Bar"', True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
179 | |||
180 | |||
class TestEncodingConversion(SoupTest):
    # Beautiful Soup's decoding of input and re-encoding of output in
    # various character encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode; original_encoding is set
        # to 'utf-8', a superset of ASCII.
        saved_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # Disable chardet, which would recognize the ASCII as ASCII.
            def disabled_chardet(markup):
                return None
            bs4.dammit.chardet_dammit = disabled_chardet
            ascii_bytes = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_bytes)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, unicode))
            self.assertEqual(unicode_output,
                             self.document_for(ascii_bytes.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(),
                             "utf-8")
        finally:
            # Restore chardet and normal logging no matter what.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone; original_encoding stays unset.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is decoded to Unicode; original_encoding records it.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal tree can be re-encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"),
                         markup.encode("utf8"))
240 | |||
241 | class TestUnicodeDammit(unittest.TestCase): | ||
242 | """Standalone tests of UnicodeDammit.""" | ||
243 | |||
244 | def test_unicode_input(self): | ||
245 | markup = u"I'm already Unicode! \N{SNOWMAN}" | ||
246 | dammit = UnicodeDammit(markup) | ||
247 | self.assertEqual(dammit.unicode_markup, markup) | ||
248 | |||
249 | def test_smart_quotes_to_unicode(self): | ||
250 | markup = b"<foo>\x91\x92\x93\x94</foo>" | ||
251 | dammit = UnicodeDammit(markup) | ||
252 | self.assertEqual( | ||
253 | dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") | ||
254 | |||
255 | def test_smart_quotes_to_xml_entities(self): | ||
256 | markup = b"<foo>\x91\x92\x93\x94</foo>" | ||
257 | dammit = UnicodeDammit(markup, smart_quotes_to="xml") | ||
258 | self.assertEqual( | ||
259 | dammit.unicode_markup, "<foo>‘’“”</foo>") | ||
260 | |||
261 | def test_smart_quotes_to_html_entities(self): | ||
262 | markup = b"<foo>\x91\x92\x93\x94</foo>" | ||
263 | dammit = UnicodeDammit(markup, smart_quotes_to="html") | ||
264 | self.assertEqual( | ||
265 | dammit.unicode_markup, "<foo>‘’“”</foo>") | ||
266 | |||
267 | def test_smart_quotes_to_ascii(self): | ||
268 | markup = b"<foo>\x91\x92\x93\x94</foo>" | ||
269 | dammit = UnicodeDammit(markup, smart_quotes_to="ascii") | ||
270 | self.assertEqual( | ||
271 | dammit.unicode_markup, """<foo>''""</foo>""") | ||
272 | |||
273 | def test_detect_utf8(self): | ||
274 | utf8 = b"\xc3\xa9" | ||
275 | dammit = UnicodeDammit(utf8) | ||
276 | self.assertEqual(dammit.unicode_markup, u'\xe9') | ||
277 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | ||
278 | |||
279 | def test_convert_hebrew(self): | ||
280 | hebrew = b"\xed\xe5\xec\xf9" | ||
281 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) | ||
282 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') | ||
283 | self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') | ||
284 | |||
285 | def test_dont_see_smart_quotes_where_there_are_none(self): | ||
286 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" | ||
287 | dammit = UnicodeDammit(utf_8) | ||
288 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | ||
289 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) | ||
290 | |||
291 | def test_ignore_inappropriate_codecs(self): | ||
292 | utf8_data = u"Räksmörgås".encode("utf-8") | ||
293 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) | ||
294 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | ||
295 | |||
296 | def test_ignore_invalid_codecs(self): | ||
297 | utf8_data = u"Räksmörgås".encode("utf-8") | ||
298 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: | ||
299 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) | ||
300 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | ||
301 | |||
302 | def test_detect_html5_style_meta_tag(self): | ||
303 | |||
304 | for data in ( | ||
305 | b'<html><meta charset="euc-jp" /></html>', | ||
306 | b"<html><meta charset='euc-jp' /></html>", | ||
307 | b"<html><meta charset=euc-jp /></html>", | ||
308 | b"<html><meta charset=euc-jp/></html>"): | ||
309 | dammit = UnicodeDammit(data, is_html=True) | ||
310 | self.assertEqual( | ||
311 | "euc-jp", dammit.original_encoding) | ||
312 | |||
313 | def test_last_ditch_entity_replacement(self): | ||
314 | # This is a UTF-8 document that contains bytestrings | ||
315 | # completely incompatible with UTF-8 (ie. encoded with some other | ||
316 | # encoding). | ||
317 | # | ||
318 | # Since there is no consistent encoding for the document, | ||
319 | # Unicode, Dammit will eventually encode the document as UTF-8 | ||
320 | # and encode the incompatible characters as REPLACEMENT | ||
321 | # CHARACTER. | ||
322 | # | ||
323 | # If chardet is installed, it will detect that the document | ||
324 | # can be converted into ISO-8859-1 without errors. This happens | ||
325 | # to be the wrong encoding, but it is a consistent encoding, so the | ||
326 | # code we're testing here won't run. | ||
327 | # | ||
328 | # So we temporarily disable chardet if it's present. | ||
329 | doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> | ||
330 | <html><b>\330\250\330\252\330\261</b> | ||
331 | <i>\310\322\321\220\312\321\355\344</i></html>""" | ||
332 | chardet = bs4.dammit.chardet_dammit | ||
333 | logging.disable(logging.WARNING) | ||
334 | try: | ||
335 | def noop(str): | ||
336 | return None | ||
337 | bs4.dammit.chardet_dammit = noop | ||
338 | dammit = UnicodeDammit(doc) | ||
339 | self.assertEqual(True, dammit.contains_replacement_characters) | ||
340 | self.assertTrue(u"\ufffd" in dammit.unicode_markup) | ||
341 | |||
342 | soup = BeautifulSoup(doc, "html.parser") | ||
343 | self.assertTrue(soup.contains_replacement_characters) | ||
344 | finally: | ||
345 | logging.disable(logging.NOTSET) | ||
346 | bs4.dammit.chardet_dammit = chardet | ||
347 | |||
348 | def test_byte_order_mark_removed(self): | ||
349 | # A document written in UTF-16LE will have its byte order marker stripped. | ||
350 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' | ||
351 | dammit = UnicodeDammit(data) | ||
352 | self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) | ||
353 | self.assertEqual("utf-16le", dammit.original_encoding) | ||
354 | |||
355 | def test_detwingle(self): | ||
356 | # Here's a UTF8 document. | ||
357 | utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") | ||
358 | |||
359 | # Here's a Windows-1252 document. | ||
360 | windows_1252 = ( | ||
361 | u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" | ||
362 | u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") | ||
363 | |||
364 | # Through some unholy alchemy, they've been stuck together. | ||
365 | doc = utf8 + windows_1252 + utf8 | ||
366 | |||
367 | # The document can't be turned into UTF-8: | ||
368 | self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") | ||
369 | |||
370 | # Unicode, Dammit thinks the whole document is Windows-1252, | ||
371 | # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" | ||
372 | |||
373 | # But if we run it through fix_embedded_windows_1252, it's fixed: | ||
374 | |||
375 | fixed = UnicodeDammit.detwingle(doc) | ||
376 | self.assertEqual( | ||
377 | u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) | ||
378 | |||
379 | def test_detwingle_ignores_multibyte_characters(self): | ||
380 | # Each of these characters has a UTF-8 representation ending | ||
381 | # in \x93. \x93 is a smart quote if interpreted as | ||
382 | # Windows-1252. But our code knows to skip over multibyte | ||
383 | # UTF-8 characters, so they'll survive the process unscathed. | ||
384 | for tricky_unicode_char in ( | ||
385 | u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' | ||
386 | u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' | ||
387 | u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. | ||
388 | ): | ||
389 | input = tricky_unicode_char.encode("utf8") | ||
390 | self.assertTrue(input.endswith(b'\x93')) | ||
391 | output = UnicodeDammit.detwingle(input) | ||
392 | self.assertEqual(output, input) | ||
393 | |||
class TestNamedspacedAttribute(SoupTest):
    """Equality semantics of NamespacedAttribute objects.

    NOTE: the "Namedspaced" typo in this class name is historical;
    renaming it would change which tests tooling selects by name.
    """

    def test_name_may_be_none(self):
        # With no name, the attribute collapses to just its prefix.
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual(attr, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        first = NamespacedAttribute("a", "b", "c")
        second = NamespacedAttribute("a", "b", "c")
        self.assertEqual(first, second)

        # The actual namespace is not considered.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(first, no_namespace)

        # But name and prefix are important.
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(first, other_name)

        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(first, other_prefix)
419 | |||
420 | |||
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Attribute values that embed a charset are rewritten when the
    document is re-encoded: encode() substitutes the new charset name
    instead of encoding the text."""

    def test_charset_meta_attribute_value(self):
        # Fixed: this was a duplicate 'test_content_meta_attribute_value',
        # which was shadowed by the method below and therefore never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # encode() yields the target charset name, not encoded text.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # Only the charset= portion of the content value is rewritten.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py new file mode 100644 index 0000000000..f8515c0ea1 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_tree.py | |||
@@ -0,0 +1,1829 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Tests for Beautiful Soup's tree traversal methods. | ||
3 | |||
4 | The tree traversal methods are the main advantage of using Beautiful | ||
5 | Soup over just using a parser. | ||
6 | |||
7 | Different parsers will build different Beautiful Soup trees given the | ||
8 | same markup, but all Beautiful Soup trees can be traversed with the | ||
9 | methods tested here. | ||
10 | """ | ||
11 | |||
12 | import copy | ||
13 | import pickle | ||
14 | import re | ||
15 | import warnings | ||
16 | from bs4 import BeautifulSoup | ||
17 | from bs4.builder import ( | ||
18 | builder_registry, | ||
19 | HTMLParserTreeBuilder, | ||
20 | ) | ||
21 | from bs4.element import ( | ||
22 | CData, | ||
23 | Comment, | ||
24 | Doctype, | ||
25 | NavigableString, | ||
26 | SoupStrainer, | ||
27 | Tag, | ||
28 | ) | ||
29 | from bs4.testing import ( | ||
30 | SoupTest, | ||
31 | skipIf, | ||
32 | ) | ||
33 | |||
34 | XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) | ||
35 | LXML_PRESENT = (builder_registry.lookup("lxml") is not None) | ||
36 | |||
class TreeTest(SoupTest):
    """Shared assertion helpers for the tree-traversal tests."""

    def assertSelects(self, tags, should_match):
        """Assert that `tags` contain exactly the strings `should_match`.

        Used by tests that define a bunch of tags, each containing a
        single string, and then select certain strings by some
        mechanism.
        """
        found = [tag.string for tag in tags]
        self.assertEqual(found, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that `tags` carry exactly the 'id' values in
        `should_match`, in order."""
        found = [tag['id'] for tag in tags]
        self.assertEqual(found, should_match)
56 | |||
57 | |||
class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        # Only the first matching tag is returned.
        self.assertEqual("2", soup.find("b").string)

    def test_unicode_text_find(self):
        soup = self.soup(u'<h1>Räksmörgås</h1>')
        self.assertEqual(u'Räksmörgås', soup.find(text=u'Räksmörgås'))

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        self.assertEqual(2, len(soup.find_all()))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        self.assertEqual(2, len(soup.find_all('a')))
82 | |||
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        # Exact match.
        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
        # Match any of a number of strings.
        self.assertEqual(
            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
        # Match a regular expression.
        self.assertEqual(soup.find_all(text=re.compile('.*')),
                         [u"Foo", u"bar", u'\xbb'])
        # Match anything.
        self.assertEqual(soup.find_all(text=True),
                         [u"Foo", u"bar", u'\xbb'])

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        self.assertSelects(
            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])

        # A limit of 0 means no limit.
        self.assertSelects(
            soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])

    def test_calling_a_tag_is_calling_findall(self):
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(soup('a', limit=1), ["1"])
        self.assertSelects(soup.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        soup = self.soup("<a></a>")
        # Create a self-referential list. (Fixed: the local was named
        # 'l', which is easily mistaken for '1'.)
        selfref = []
        selfref.append(selfref)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(selfref))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")
        result = soup.find_all("a")
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(True)
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(text="foo")
        self.assertTrue(hasattr(result, "source"))
139 | |||
140 | |||
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by namespace-prefixed tag and attribute names."""

    def test_find_by_namespaced_name(self):
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
        # Prefixed names are matched verbatim, prefix included.
        self.assertEqual(soup.find("mathml:msqrt").string, "4")
        self.assertEqual(soup.find(attrs={"svg:fill": "red"}).name, "a")
147 | |||
148 | |||
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Fixed: this called super(TreeTest, self).setUp(), naming the
        # parent class instead of this one, which skips TreeTest itself
        # in the MRO. Harmless today only because TreeTest defines no
        # setUp of its own.
        super(TestFindAllByName, self).setUp()
        self.tree =  self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
214 | |||
215 | |||
class TestFindAllByAttribute(TreeTest):
    """Tests of finding tags by attribute value: keyword arguments, the
    'attrs' dict, CSS classes, regular expressions, lists of values,
    callables, and SoupStrainers."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        # A UTF-8 bytestring and its unicode equivalent both match.
        peace = u"םולש".encode("utf8")
        data = u'<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        # A regexp is tested against each class value separately.
        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        # A callable passed as 'attrs' is applied to the class value.
        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        # NOTE(review): the method name misspells "attribute"; renaming
        # it would change the reported test id, so it is left as-is.
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))
        # NOTE(review): this assertion duplicates the one above it.
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        # The text match applies even when the string is nested deeper
        # inside the named tag.
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
395 | |||
396 | |||
397 | |||
398 | |||
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        div = tree.div
        # index() must locate each child by identity, not equality,
        # even though equal-looking siblings are present.
        for position, child in enumerate(div.contents):
            self.assertEqual(position, div.index(child))
        # Asking for an object that is not a child raises ValueError.
        self.assertRaises(ValueError, tree.index, 1)
415 | |||
416 | |||
class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        self.tree = self.soup('''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>''')
        self.start = self.tree.b

    def test_parent(self):
        # Each .parent step climbs one <ul> level.
        parent = self.start.parent
        self.assertEqual('bottom', parent['id'])
        self.assertEqual('middle', parent.parent['id'])
        self.assertEqual('top', parent.parent.parent['id'])

    def test_parent_of_top_tag_is_soup_object(self):
        top_tag = self.tree.contents[0]
        self.assertEqual(self.tree, top_tag.parent)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        self.assertSelectsIDs(
            self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
        self.assertSelectsIDs(
            self.start.find_parents('ul', id="middle"), ['middle'])

    def test_find_parent(self):
        self.assertEqual('bottom', self.start.find_parent('ul')['id'])
        self.assertEqual('top', self.start.find_parent('ul', id='top')['id'])

    def test_parent_of_text_element(self):
        text = self.tree.find(text="Start here")
        self.assertEqual('b', text.parent.name)

    def test_text_element_find_parent(self):
        text = self.tree.find(text="Start here")
        self.assertEqual('bottom', text.find_parent('ul')['id'])

    def test_parent_generator(self):
        # Walk the .parents generator, keeping only tags with an id.
        seen = [ancestor['id'] for ancestor in self.start.parents
                if ancestor is not None and 'id' in ancestor.attrs]
        self.assertEqual(['bottom', 'middle', 'top'], seen)
466 | |||
467 | |||
class ProximityTest(TreeTest):
    """Shared fixture for the next/previous-element tests: a flat
    document with three <b> siblings."""

    def setUp(self):
        # Fixed: this called super(TreeTest, self).setUp(), naming the
        # parent class instead of this one, which skips TreeTest itself
        # in the MRO. Harmless today only because TreeTest defines no
        # setUp of its own.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
474 | |||
475 | |||
class TestNextOperations(ProximityTest):
    """Navigation forward through the parse order."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        # The first <b>'s next element is its own text child; the text
        # child's next element is the second <b>.
        self.assertEqual("One", self.start.next_element)
        self.assertEqual("2", self.start.next_element.next_element['id'])

    def test_next_of_last_item_is_none(self):
        final = self.tree.find(text="Three")
        self.assertEqual(None, final.next_element)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(None, self.tree.next_element)

    def test_find_all_next(self):
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual('2', self.start.find_next('b')['id'])
        self.assertEqual("Three", self.start.find_next(text="Three"))

    def test_find_next_for_text_element(self):
        text = self.tree.find(text="One")
        self.assertEqual("Two", text.find_next("b").string)
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual('3', tag['id'])
        self.assertEqual("Three", contents)
515 | |||
class TestPreviousOperations(ProximityTest):
    """Navigation backward through the parse order."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        # The "Three" text node is preceded by its containing <b>,
        # which is preceded by the "Two" text node.
        self.assertEqual("3", self.end.previous_element['id'])
        self.assertEqual("Two", self.end.previous_element.previous_element)

    def test_previous_of_first_item_is_none(self):
        first = self.tree.find('html')
        self.assertEqual(None, first.previous_element)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        self.assertSelects(
            self.end.find_all_previous('b'), ["Three", "Two", "One"])
        self.assertSelects(self.end.find_all_previous(id=1), ["One"])

    def test_find_previous(self):
        self.assertEqual('3', self.end.find_previous('b')['id'])
        self.assertEqual("One", self.end.find_previous(text="One"))

    def test_find_previous_for_text_element(self):
        text = self.tree.find(text="Three")
        self.assertEqual("Three", text.find_previous("b").string)
        self.assertSelects(
            text.find_all_previous("b"), ["Three", "Two", "One"])

    def test_previous_generator(self):
        start = self.tree.find(text="One")
        predecessors = list(start.previous_elements)

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b, body, head, html = predecessors
        self.assertEqual('1', b['id'])
        self.assertEqual("body", body.name)
        self.assertEqual("head", head.name)
        self.assertEqual("html", html.name)
565 | |||
566 | |||
class SiblingTest(TreeTest):
    """Shared fixture for the sibling-navigation tests: four <span>
    siblings, the first three each containing one nested <span>."""

    def setUp(self):
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
        # Fixed: the pattern is now a raw string; "\n\s*" relied on the
        # unrecognized escape \s surviving verbatim, which Python has
        # deprecated (SyntaxWarning, eventually an error).
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)
587 | |||
588 | |||
class TestNextSibling(SiblingTest):
    """Navigation to following siblings."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(None, self.tree.next_sibling)

    def test_next_sibling(self):
        self.assertEqual('2', self.start.next_sibling['id'])
        self.assertEqual('3', self.start.next_sibling.next_sibling['id'])

        # Note the difference between next_sibling and next_element.
        self.assertEqual('1.1', self.start.next_element['id'])

    def test_next_sibling_may_not_exist(self):
        # The lone <html>, a nested last child, and the last top-level
        # <span> all lack a next sibling.
        self.assertEqual(None, self.tree.html.next_sibling)

        nested = self.tree.find(id="1.1")
        self.assertEqual(None, nested.next_sibling)

        last = self.tree.find(id="4")
        self.assertEqual(None, last.next_sibling)

    def test_find_next_sibling(self):
        self.assertEqual('2', self.start.find_next_sibling('span')['id'])

    def test_next_siblings(self):
        self.assertSelectsIDs(self.start.find_next_siblings("span"),
                              ['2', '3', '4'])

        self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])

    def test_next_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        text = soup.find(text="Foo")
        self.assertEqual('b', text.next_sibling.name)
        self.assertEqual('baz', text.next_sibling.next_sibling)

        self.assertSelects(text.find_next_siblings('b'), ['bar'])
        self.assertEqual("baz", text.find_next_sibling(text="baz"))
        self.assertEqual(None, text.find_next_sibling(text="nonesuch"))
632 | |||
633 | |||
class TestPreviousSibling(SiblingTest):
    """Navigation to preceding siblings."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(None, self.tree.previous_sibling)

    def test_previous_sibling(self):
        self.assertEqual('3', self.end.previous_sibling['id'])
        self.assertEqual('2', self.end.previous_sibling.previous_sibling['id'])

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual('3.1', self.end.previous_element['id'])

    def test_previous_sibling_may_not_exist(self):
        # The lone <html>, a nested first child, and the first top-level
        # <span> all lack a previous sibling.
        self.assertEqual(None, self.tree.html.previous_sibling)

        nested = self.tree.find(id="1.1")
        self.assertEqual(None, nested.previous_sibling)

        first = self.tree.find(id="1")
        self.assertEqual(None, first.previous_sibling)

    def test_find_previous_sibling(self):
        self.assertEqual('3', self.end.find_previous_sibling('span')['id'])

    def test_previous_siblings(self):
        self.assertSelectsIDs(self.end.find_previous_siblings("span"),
                              ['3', '2', '1'])

        self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])

    def test_previous_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        text = soup.find(text="baz")
        self.assertEqual('b', text.previous_sibling.name)
        self.assertEqual('Foo', text.previous_sibling.previous_sibling)

        self.assertSelects(text.find_previous_siblings('b'), ['bar'])
        self.assertEqual("Foo", text.find_previous_sibling(text="Foo"))
        self.assertEqual(None, text.find_previous_sibling(text="nonesuch"))
677 | |||
678 | |||
679 | class TestTagCreation(SoupTest): | ||
680 | """Test the ability to create new tags.""" | ||
681 | def test_new_tag(self): | ||
682 | soup = self.soup("") | ||
683 | new_tag = soup.new_tag("foo", bar="baz") | ||
684 | self.assertTrue(isinstance(new_tag, Tag)) | ||
685 | self.assertEqual("foo", new_tag.name) | ||
686 | self.assertEqual(dict(bar="baz"), new_tag.attrs) | ||
687 | self.assertEqual(None, new_tag.parent) | ||
688 | |||
689 | def test_tag_inherits_self_closing_rules_from_builder(self): | ||
690 | if XML_BUILDER_PRESENT: | ||
691 | xml_soup = BeautifulSoup("", "xml") | ||
692 | xml_br = xml_soup.new_tag("br") | ||
693 | xml_p = xml_soup.new_tag("p") | ||
694 | |||
695 | # Both the <br> and <p> tag are empty-element, just because | ||
696 | # they have no contents. | ||
697 | self.assertEqual(b"<br/>", xml_br.encode()) | ||
698 | self.assertEqual(b"<p/>", xml_p.encode()) | ||
699 | |||
700 | html_soup = BeautifulSoup("", "html") | ||
701 | html_br = html_soup.new_tag("br") | ||
702 | html_p = html_soup.new_tag("p") | ||
703 | |||
704 | # The HTML builder uses HTML's rules about which tags are | ||
705 | # empty-element tags, and the new tags reflect these rules. | ||
706 | self.assertEqual(b"<br/>", html_br.encode()) | ||
707 | self.assertEqual(b"<p></p>", html_p.encode()) | ||
708 | |||
709 | def test_new_string_creates_navigablestring(self): | ||
710 | soup = self.soup("") | ||
711 | s = soup.new_string("foo") | ||
712 | self.assertEqual("foo", s) | ||
713 | self.assertTrue(isinstance(s, NavigableString)) | ||
714 | |||
715 | def test_new_string_can_create_navigablestring_subclass(self): | ||
716 | soup = self.soup("") | ||
717 | s = soup.new_string("foo", Comment) | ||
718 | self.assertEqual("foo", s) | ||
719 | self.assertTrue(isinstance(s, Comment)) | ||
720 | |||
721 | class TestTreeModification(SoupTest): | ||
722 | |||
723 | def test_attribute_modification(self): | ||
724 | soup = self.soup('<a id="1"></a>') | ||
725 | soup.a['id'] = 2 | ||
726 | self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>')) | ||
727 | del(soup.a['id']) | ||
728 | self.assertEqual(soup.decode(), self.document_for('<a></a>')) | ||
729 | soup.a['id2'] = 'foo' | ||
730 | self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>')) | ||
731 | |||
732 | def test_new_tag_creation(self): | ||
733 | builder = builder_registry.lookup('html')() | ||
734 | soup = self.soup("<body></body>", builder=builder) | ||
735 | a = Tag(soup, builder, 'a') | ||
736 | ol = Tag(soup, builder, 'ol') | ||
737 | a['href'] = 'http://foo.com/' | ||
738 | soup.body.insert(0, a) | ||
739 | soup.body.insert(1, ol) | ||
740 | self.assertEqual( | ||
741 | soup.body.encode(), | ||
742 | b'<body><a href="http://foo.com/"></a><ol></ol></body>') | ||
743 | |||
744 | def test_append_to_contents_moves_tag(self): | ||
745 | doc = """<p id="1">Don't leave me <b>here</b>.</p> | ||
746 | <p id="2">Don\'t leave!</p>""" | ||
747 | soup = self.soup(doc) | ||
748 | second_para = soup.find(id='2') | ||
749 | bold = soup.b | ||
750 | |||
751 | # Move the <b> tag to the end of the second paragraph. | ||
752 | soup.find(id='2').append(soup.b) | ||
753 | |||
754 | # The <b> tag is now a child of the second paragraph. | ||
755 | self.assertEqual(bold.parent, second_para) | ||
756 | |||
757 | self.assertEqual( | ||
758 | soup.decode(), self.document_for( | ||
759 | '<p id="1">Don\'t leave me .</p>\n' | ||
760 | '<p id="2">Don\'t leave!<b>here</b></p>')) | ||
761 | |||
762 | def test_replace_with_returns_thing_that_was_replaced(self): | ||
763 | text = "<a></a><b><c></c></b>" | ||
764 | soup = self.soup(text) | ||
765 | a = soup.a | ||
766 | new_a = a.replace_with(soup.c) | ||
767 | self.assertEqual(a, new_a) | ||
768 | |||
769 | def test_unwrap_returns_thing_that_was_replaced(self): | ||
770 | text = "<a><b></b><c></c></a>" | ||
771 | soup = self.soup(text) | ||
772 | a = soup.a | ||
773 | new_a = a.unwrap() | ||
774 | self.assertEqual(a, new_a) | ||
775 | |||
776 | def test_replace_tag_with_itself(self): | ||
777 | text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" | ||
778 | soup = self.soup(text) | ||
779 | c = soup.c | ||
780 | soup.c.replace_with(c) | ||
781 | self.assertEqual(soup.decode(), self.document_for(text)) | ||
782 | |||
783 | def test_replace_tag_with_its_parent_raises_exception(self): | ||
784 | text = "<a><b></b></a>" | ||
785 | soup = self.soup(text) | ||
786 | self.assertRaises(ValueError, soup.b.replace_with, soup.a) | ||
787 | |||
788 | def test_insert_tag_into_itself_raises_exception(self): | ||
789 | text = "<a><b></b></a>" | ||
790 | soup = self.soup(text) | ||
791 | self.assertRaises(ValueError, soup.a.insert, 0, soup.a) | ||
792 | |||
793 | def test_replace_with_maintains_next_element_throughout(self): | ||
794 | soup = self.soup('<p><a>one</a><b>three</b></p>') | ||
795 | a = soup.a | ||
796 | b = a.contents[0] | ||
797 | # Make it so the <a> tag has two text children. | ||
798 | a.insert(1, "two") | ||
799 | |||
800 | # Now replace each one with the empty string. | ||
801 | left, right = a.contents | ||
802 | left.replaceWith('') | ||
803 | right.replaceWith('') | ||
804 | |||
805 | # The <b> tag is still connected to the tree. | ||
806 | self.assertEqual("three", soup.b.string) | ||
807 | |||
808 | def test_replace_final_node(self): | ||
809 | soup = self.soup("<b>Argh!</b>") | ||
810 | soup.find(text="Argh!").replace_with("Hooray!") | ||
811 | new_text = soup.find(text="Hooray!") | ||
812 | b = soup.b | ||
813 | self.assertEqual(new_text.previous_element, b) | ||
814 | self.assertEqual(new_text.parent, b) | ||
815 | self.assertEqual(new_text.previous_element.next_element, new_text) | ||
816 | self.assertEqual(new_text.next_element, None) | ||
817 | |||
818 | def test_consecutive_text_nodes(self): | ||
819 | # A builder should never create two consecutive text nodes, | ||
820 | # but if you insert one next to another, Beautiful Soup will | ||
821 | # handle it correctly. | ||
822 | soup = self.soup("<a><b>Argh!</b><c></c></a>") | ||
823 | soup.b.insert(1, "Hooray!") | ||
824 | |||
825 | self.assertEqual( | ||
826 | soup.decode(), self.document_for( | ||
827 | "<a><b>Argh!Hooray!</b><c></c></a>")) | ||
828 | |||
829 | new_text = soup.find(text="Hooray!") | ||
830 | self.assertEqual(new_text.previous_element, "Argh!") | ||
831 | self.assertEqual(new_text.previous_element.next_element, new_text) | ||
832 | |||
833 | self.assertEqual(new_text.previous_sibling, "Argh!") | ||
834 | self.assertEqual(new_text.previous_sibling.next_sibling, new_text) | ||
835 | |||
836 | self.assertEqual(new_text.next_sibling, None) | ||
837 | self.assertEqual(new_text.next_element, soup.c) | ||
838 | |||
839 | def test_insert_string(self): | ||
840 | soup = self.soup("<a></a>") | ||
841 | soup.a.insert(0, "bar") | ||
842 | soup.a.insert(0, "foo") | ||
843 | # The strings were added to the tag. | ||
844 | self.assertEqual(["foo", "bar"], soup.a.contents) | ||
845 | # And they were converted to NavigableStrings. | ||
846 | self.assertEqual(soup.a.contents[0].next_element, "bar") | ||
847 | |||
848 | def test_insert_tag(self): | ||
849 | builder = self.default_builder | ||
850 | soup = self.soup( | ||
851 | "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) | ||
852 | magic_tag = Tag(soup, builder, 'magictag') | ||
853 | magic_tag.insert(0, "the") | ||
854 | soup.a.insert(1, magic_tag) | ||
855 | |||
856 | self.assertEqual( | ||
857 | soup.decode(), self.document_for( | ||
858 | "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>")) | ||
859 | |||
860 | # Make sure all the relationships are hooked up correctly. | ||
861 | b_tag = soup.b | ||
862 | self.assertEqual(b_tag.next_sibling, magic_tag) | ||
863 | self.assertEqual(magic_tag.previous_sibling, b_tag) | ||
864 | |||
865 | find = b_tag.find(text="Find") | ||
866 | self.assertEqual(find.next_element, magic_tag) | ||
867 | self.assertEqual(magic_tag.previous_element, find) | ||
868 | |||
869 | c_tag = soup.c | ||
870 | self.assertEqual(magic_tag.next_sibling, c_tag) | ||
871 | self.assertEqual(c_tag.previous_sibling, magic_tag) | ||
872 | |||
873 | the = magic_tag.find(text="the") | ||
874 | self.assertEqual(the.parent, magic_tag) | ||
875 | self.assertEqual(the.next_element, c_tag) | ||
876 | self.assertEqual(c_tag.previous_element, the) | ||
877 | |||
878 | def test_append_child_thats_already_at_the_end(self): | ||
879 | data = "<a><b></b></a>" | ||
880 | soup = self.soup(data) | ||
881 | soup.a.append(soup.b) | ||
882 | self.assertEqual(data, soup.decode()) | ||
883 | |||
884 | def test_move_tag_to_beginning_of_parent(self): | ||
885 | data = "<a><b></b><c></c><d></d></a>" | ||
886 | soup = self.soup(data) | ||
887 | soup.a.insert(0, soup.d) | ||
888 | self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode()) | ||
889 | |||
890 | def test_insert_works_on_empty_element_tag(self): | ||
891 | # This is a little strange, since most HTML parsers don't allow | ||
892 | # markup like this to come through. But in general, we don't | ||
893 | # know what the parser would or wouldn't have allowed, so | ||
894 | # I'm letting this succeed for now. | ||
895 | soup = self.soup("<br/>") | ||
896 | soup.br.insert(1, "Contents") | ||
897 | self.assertEqual(str(soup.br), "<br>Contents</br>") | ||
898 | |||
899 | def test_insert_before(self): | ||
900 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
901 | soup.b.insert_before("BAZ") | ||
902 | soup.a.insert_before("QUUX") | ||
903 | self.assertEqual( | ||
904 | soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>")) | ||
905 | |||
906 | soup.a.insert_before(soup.b) | ||
907 | self.assertEqual( | ||
908 | soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) | ||
909 | |||
910 | def test_insert_after(self): | ||
911 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
912 | soup.b.insert_after("BAZ") | ||
913 | soup.a.insert_after("QUUX") | ||
914 | self.assertEqual( | ||
915 | soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ")) | ||
916 | soup.b.insert_after(soup.a) | ||
917 | self.assertEqual( | ||
918 | soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) | ||
919 | |||
920 | def test_insert_after_raises_exception_if_after_has_no_meaning(self): | ||
921 | soup = self.soup("") | ||
922 | tag = soup.new_tag("a") | ||
923 | string = soup.new_string("") | ||
924 | self.assertRaises(ValueError, string.insert_after, tag) | ||
925 | self.assertRaises(NotImplementedError, soup.insert_after, tag) | ||
926 | self.assertRaises(ValueError, tag.insert_after, tag) | ||
927 | |||
928 | def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): | ||
929 | soup = self.soup("") | ||
930 | tag = soup.new_tag("a") | ||
931 | string = soup.new_string("") | ||
932 | self.assertRaises(ValueError, string.insert_before, tag) | ||
933 | self.assertRaises(NotImplementedError, soup.insert_before, tag) | ||
934 | self.assertRaises(ValueError, tag.insert_before, tag) | ||
935 | |||
936 | def test_replace_with(self): | ||
937 | soup = self.soup( | ||
938 | "<p>There's <b>no</b> business like <b>show</b> business</p>") | ||
939 | no, show = soup.find_all('b') | ||
940 | show.replace_with(no) | ||
941 | self.assertEqual( | ||
942 | soup.decode(), | ||
943 | self.document_for( | ||
944 | "<p>There's business like <b>no</b> business</p>")) | ||
945 | |||
946 | self.assertEqual(show.parent, None) | ||
947 | self.assertEqual(no.parent, soup.p) | ||
948 | self.assertEqual(no.next_element, "no") | ||
949 | self.assertEqual(no.next_sibling, " business") | ||
950 | |||
951 | def test_replace_first_child(self): | ||
952 | data = "<a><b></b><c></c></a>" | ||
953 | soup = self.soup(data) | ||
954 | soup.b.replace_with(soup.c) | ||
955 | self.assertEqual("<a><c></c></a>", soup.decode()) | ||
956 | |||
957 | def test_replace_last_child(self): | ||
958 | data = "<a><b></b><c></c></a>" | ||
959 | soup = self.soup(data) | ||
960 | soup.c.replace_with(soup.b) | ||
961 | self.assertEqual("<a><b></b></a>", soup.decode()) | ||
962 | |||
963 | def test_nested_tag_replace_with(self): | ||
964 | soup = self.soup( | ||
965 | """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""") | ||
966 | |||
967 | # Replace the entire <b> tag and its contents ("reserve the | ||
968 | # right") with the <f> tag ("refuse"). | ||
969 | remove_tag = soup.b | ||
970 | move_tag = soup.f | ||
971 | remove_tag.replace_with(move_tag) | ||
972 | |||
973 | self.assertEqual( | ||
974 | soup.decode(), self.document_for( | ||
975 | "<a>We<f>refuse</f></a><e>to<g>service</g></e>")) | ||
976 | |||
977 | # The <b> tag is now an orphan. | ||
978 | self.assertEqual(remove_tag.parent, None) | ||
979 | self.assertEqual(remove_tag.find(text="right").next_element, None) | ||
980 | self.assertEqual(remove_tag.previous_element, None) | ||
981 | self.assertEqual(remove_tag.next_sibling, None) | ||
982 | self.assertEqual(remove_tag.previous_sibling, None) | ||
983 | |||
984 | # The <f> tag is now connected to the <a> tag. | ||
985 | self.assertEqual(move_tag.parent, soup.a) | ||
986 | self.assertEqual(move_tag.previous_element, "We") | ||
987 | self.assertEqual(move_tag.next_element.next_element, soup.e) | ||
988 | self.assertEqual(move_tag.next_sibling, None) | ||
989 | |||
990 | # The gap where the <f> tag used to be has been mended, and | ||
991 | # the word "to" is now connected to the <g> tag. | ||
992 | to_text = soup.find(text="to") | ||
993 | g_tag = soup.g | ||
994 | self.assertEqual(to_text.next_element, g_tag) | ||
995 | self.assertEqual(to_text.next_sibling, g_tag) | ||
996 | self.assertEqual(g_tag.previous_element, to_text) | ||
997 | self.assertEqual(g_tag.previous_sibling, to_text) | ||
998 | |||
999 | def test_unwrap(self): | ||
1000 | tree = self.soup(""" | ||
1001 | <p>Unneeded <em>formatting</em> is unneeded</p> | ||
1002 | """) | ||
1003 | tree.em.unwrap() | ||
1004 | self.assertEqual(tree.em, None) | ||
1005 | self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") | ||
1006 | |||
1007 | def test_wrap(self): | ||
1008 | soup = self.soup("I wish I was bold.") | ||
1009 | value = soup.string.wrap(soup.new_tag("b")) | ||
1010 | self.assertEqual(value.decode(), "<b>I wish I was bold.</b>") | ||
1011 | self.assertEqual( | ||
1012 | soup.decode(), self.document_for("<b>I wish I was bold.</b>")) | ||
1013 | |||
1014 | def test_wrap_extracts_tag_from_elsewhere(self): | ||
1015 | soup = self.soup("<b></b>I wish I was bold.") | ||
1016 | soup.b.next_sibling.wrap(soup.b) | ||
1017 | self.assertEqual( | ||
1018 | soup.decode(), self.document_for("<b>I wish I was bold.</b>")) | ||
1019 | |||
1020 | def test_wrap_puts_new_contents_at_the_end(self): | ||
1021 | soup = self.soup("<b>I like being bold.</b>I wish I was bold.") | ||
1022 | soup.b.next_sibling.wrap(soup.b) | ||
1023 | self.assertEqual(2, len(soup.b.contents)) | ||
1024 | self.assertEqual( | ||
1025 | soup.decode(), self.document_for( | ||
1026 | "<b>I like being bold.I wish I was bold.</b>")) | ||
1027 | |||
1028 | def test_extract(self): | ||
1029 | soup = self.soup( | ||
1030 | '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>') | ||
1031 | |||
1032 | self.assertEqual(len(soup.body.contents), 3) | ||
1033 | extracted = soup.find(id="nav").extract() | ||
1034 | |||
1035 | self.assertEqual( | ||
1036 | soup.decode(), "<html><body>Some content. More content.</body></html>") | ||
1037 | self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>') | ||
1038 | |||
1039 | # The extracted tag is now an orphan. | ||
1040 | self.assertEqual(len(soup.body.contents), 2) | ||
1041 | self.assertEqual(extracted.parent, None) | ||
1042 | self.assertEqual(extracted.previous_element, None) | ||
1043 | self.assertEqual(extracted.next_element.next_element, None) | ||
1044 | |||
1045 | # The gap where the extracted tag used to be has been mended. | ||
1046 | content_1 = soup.find(text="Some content. ") | ||
1047 | content_2 = soup.find(text=" More content.") | ||
1048 | self.assertEqual(content_1.next_element, content_2) | ||
1049 | self.assertEqual(content_1.next_sibling, content_2) | ||
1050 | self.assertEqual(content_2.previous_element, content_1) | ||
1051 | self.assertEqual(content_2.previous_sibling, content_1) | ||
1052 | |||
1053 | def test_extract_distinguishes_between_identical_strings(self): | ||
1054 | soup = self.soup("<a>foo</a><b>bar</b>") | ||
1055 | foo_1 = soup.a.string | ||
1056 | bar_1 = soup.b.string | ||
1057 | foo_2 = soup.new_string("foo") | ||
1058 | bar_2 = soup.new_string("bar") | ||
1059 | soup.a.append(foo_2) | ||
1060 | soup.b.append(bar_2) | ||
1061 | |||
1062 | # Now there are two identical strings in the <a> tag, and two | ||
1063 | # in the <b> tag. Let's remove the first "foo" and the second | ||
1064 | # "bar". | ||
1065 | foo_1.extract() | ||
1066 | bar_2.extract() | ||
1067 | self.assertEqual(foo_2, soup.a.string) | ||
1068 | self.assertEqual(bar_2, soup.b.string) | ||
1069 | |||
1070 | def test_clear(self): | ||
1071 | """Tag.clear()""" | ||
1072 | soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") | ||
1073 | # clear using extract() | ||
1074 | a = soup.a | ||
1075 | soup.p.clear() | ||
1076 | self.assertEqual(len(soup.p.contents), 0) | ||
1077 | self.assertTrue(hasattr(a, "contents")) | ||
1078 | |||
1079 | # clear using decompose() | ||
1080 | em = a.em | ||
1081 | a.clear(decompose=True) | ||
1082 | self.assertEqual(0, len(em.contents)) | ||
1083 | |||
1084 | def test_string_set(self): | ||
1085 | """Tag.string = 'string'""" | ||
1086 | soup = self.soup("<a></a> <b><c></c></b>") | ||
1087 | soup.a.string = "foo" | ||
1088 | self.assertEqual(soup.a.contents, ["foo"]) | ||
1089 | soup.b.string = "bar" | ||
1090 | self.assertEqual(soup.b.contents, ["bar"]) | ||
1091 | |||
1092 | def test_string_set_does_not_affect_original_string(self): | ||
1093 | soup = self.soup("<a><b>foo</b><c>bar</c>") | ||
1094 | soup.b.string = soup.c.string | ||
1095 | self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>") | ||
1096 | |||
1097 | def test_set_string_preserves_class_of_string(self): | ||
1098 | soup = self.soup("<a></a>") | ||
1099 | cdata = CData("foo") | ||
1100 | soup.a.string = cdata | ||
1101 | self.assertTrue(isinstance(soup.a.string, CData)) | ||
1102 | |||
1103 | class TestElementObjects(SoupTest): | ||
1104 | """Test various features of element objects.""" | ||
1105 | |||
1106 | def test_len(self): | ||
1107 | """The length of an element is its number of children.""" | ||
1108 | soup = self.soup("<top>1<b>2</b>3</top>") | ||
1109 | |||
1110 | # The BeautifulSoup object itself contains one element: the | ||
1111 | # <top> tag. | ||
1112 | self.assertEqual(len(soup.contents), 1) | ||
1113 | self.assertEqual(len(soup), 1) | ||
1114 | |||
1115 | # The <top> tag contains three elements: the text node "1", the | ||
1116 | # <b> tag, and the text node "3". | ||
1117 | self.assertEqual(len(soup.top), 3) | ||
1118 | self.assertEqual(len(soup.top.contents), 3) | ||
1119 | |||
1120 | def test_member_access_invokes_find(self): | ||
1121 | """Accessing a Python member .foo invokes find('foo')""" | ||
1122 | soup = self.soup('<b><i></i></b>') | ||
1123 | self.assertEqual(soup.b, soup.find('b')) | ||
1124 | self.assertEqual(soup.b.i, soup.find('b').find('i')) | ||
1125 | self.assertEqual(soup.a, None) | ||
1126 | |||
1127 | def test_deprecated_member_access(self): | ||
1128 | soup = self.soup('<b><i></i></b>') | ||
1129 | with warnings.catch_warnings(record=True) as w: | ||
1130 | tag = soup.bTag | ||
1131 | self.assertEqual(soup.b, tag) | ||
1132 | self.assertEqual( | ||
1133 | '.bTag is deprecated, use .find("b") instead.', | ||
1134 | str(w[0].message)) | ||
1135 | |||
1136 | def test_has_attr(self): | ||
1137 | """has_attr() checks for the presence of an attribute. | ||
1138 | |||
1139 | Please note: has_attr() is different from | ||
1140 | __in__. has_attr() checks the tag's attributes and __in__ | ||
1141 | checks the tag's children. | ||
1142 | """ | ||
1143 | soup = self.soup("<foo attr='bar'>") | ||
1144 | self.assertTrue(soup.foo.has_attr('attr')) | ||
1145 | self.assertFalse(soup.foo.has_attr('attr2')) | ||
1146 | |||
1147 | |||
1148 | def test_attributes_come_out_in_alphabetical_order(self): | ||
1149 | markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' | ||
1150 | self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') | ||
1151 | |||
1152 | def test_string(self): | ||
1153 | # A tag that contains only a text node makes that node | ||
1154 | # available as .string. | ||
1155 | soup = self.soup("<b>foo</b>") | ||
1156 | self.assertEqual(soup.b.string, 'foo') | ||
1157 | |||
1158 | def test_empty_tag_has_no_string(self): | ||
1159 | # A tag with no children has no .string. | ||
1160 | soup = self.soup("<b></b>") | ||
1161 | self.assertEqual(soup.b.string, None) | ||
1162 | |||
1163 | def test_tag_with_multiple_children_has_no_string(self): | ||
1164 | # A tag with multiple children has no .string. | ||
1165 | soup = self.soup("<a>foo<b></b><b></b></b>") | ||
1166 | self.assertEqual(soup.b.string, None) | ||
1167 | |||
1168 | soup = self.soup("<a>foo<b></b>bar</b>") | ||
1169 | self.assertEqual(soup.b.string, None) | ||
1170 | |||
1171 | # Even if all the children are strings, due to trickery, | ||
1172 | # it won't work--but this would be a good optimization. | ||
1173 | soup = self.soup("<a>foo</b>") | ||
1174 | soup.a.insert(1, "bar") | ||
1175 | self.assertEqual(soup.a.string, None) | ||
1176 | |||
1177 | def test_tag_with_recursive_string_has_string(self): | ||
1178 | # A tag with a single child which has a .string inherits that | ||
1179 | # .string. | ||
1180 | soup = self.soup("<a><b>foo</b></a>") | ||
1181 | self.assertEqual(soup.a.string, "foo") | ||
1182 | self.assertEqual(soup.string, "foo") | ||
1183 | |||
1184 | def test_lack_of_string(self): | ||
1185 | """Only a tag containing a single text node has a .string.""" | ||
1186 | soup = self.soup("<b>f<i>e</i>o</b>") | ||
1187 | self.assertFalse(soup.b.string) | ||
1188 | |||
1189 | soup = self.soup("<b></b>") | ||
1190 | self.assertFalse(soup.b.string) | ||
1191 | |||
1192 | def test_all_text(self): | ||
1193 | """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" | ||
1194 | soup = self.soup("<a>a<b>r</b> <r> t </r></a>") | ||
1195 | self.assertEqual(soup.a.text, "ar t ") | ||
1196 | self.assertEqual(soup.a.get_text(strip=True), "art") | ||
1197 | self.assertEqual(soup.a.get_text(","), "a,r, , t ") | ||
1198 | self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") | ||
1199 | |||
1200 | def test_get_text_ignores_comments(self): | ||
1201 | soup = self.soup("foo<!--IGNORE-->bar") | ||
1202 | self.assertEqual(soup.get_text(), "foobar") | ||
1203 | |||
1204 | self.assertEqual( | ||
1205 | soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") | ||
1206 | self.assertEqual( | ||
1207 | soup.get_text(types=None), "fooIGNOREbar") | ||
1208 | |||
1209 | def test_all_strings_ignores_comments(self): | ||
1210 | soup = self.soup("foo<!--IGNORE-->bar") | ||
1211 | self.assertEqual(['foo', 'bar'], list(soup.strings)) | ||
1212 | |||
1213 | class TestCDAtaListAttributes(SoupTest): | ||
1214 | |||
1215 | """Testing cdata-list attributes like 'class'. | ||
1216 | """ | ||
1217 | def test_single_value_becomes_list(self): | ||
1218 | soup = self.soup("<a class='foo'>") | ||
1219 | self.assertEqual(["foo"],soup.a['class']) | ||
1220 | |||
1221 | def test_multiple_values_becomes_list(self): | ||
1222 | soup = self.soup("<a class='foo bar'>") | ||
1223 | self.assertEqual(["foo", "bar"], soup.a['class']) | ||
1224 | |||
1225 | def test_multiple_values_separated_by_weird_whitespace(self): | ||
1226 | soup = self.soup("<a class='foo\tbar\nbaz'>") | ||
1227 | self.assertEqual(["foo", "bar", "baz"],soup.a['class']) | ||
1228 | |||
1229 | def test_attributes_joined_into_string_on_output(self): | ||
1230 | soup = self.soup("<a class='foo\tbar'>") | ||
1231 | self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) | ||
1232 | |||
1233 | def test_accept_charset(self): | ||
1234 | soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') | ||
1235 | self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) | ||
1236 | |||
1237 | def test_cdata_attribute_applying_only_to_one_tag(self): | ||
1238 | data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' | ||
1239 | soup = self.soup(data) | ||
1240 | # We saw in another test that accept-charset is a cdata-list | ||
1241 | # attribute for the <form> tag. But it's not a cdata-list | ||
1242 | # attribute for any other tag. | ||
1243 | self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) | ||
1244 | |||
1245 | def test_string_has_immutable_name_property(self): | ||
1246 | string = self.soup("s").string | ||
1247 | self.assertEqual(None, string.name) | ||
1248 | def t(): | ||
1249 | string.name = 'foo' | ||
1250 | self.assertRaises(AttributeError, t) | ||
1251 | |||
1252 | class TestPersistence(SoupTest): | ||
1253 | "Testing features like pickle and deepcopy." | ||
1254 | |||
1255 | def setUp(self): | ||
1256 | super(TestPersistence, self).setUp() | ||
1257 | self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" | ||
1258 | "http://www.w3.org/TR/REC-html40/transitional.dtd"> | ||
1259 | <html> | ||
1260 | <head> | ||
1261 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | ||
1262 | <title>Beautiful Soup: We called him Tortoise because he taught us.</title> | ||
1263 | <link rev="made" href="mailto:leonardr@segfault.org"> | ||
1264 | <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> | ||
1265 | <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> | ||
1266 | <meta name="author" content="Leonard Richardson"> | ||
1267 | </head> | ||
1268 | <body> | ||
1269 | <a href="foo">foo</a> | ||
1270 | <a href="foo"><b>bar</b></a> | ||
1271 | </body> | ||
1272 | </html>""" | ||
1273 | self.tree = self.soup(self.page) | ||
1274 | |||
1275 | def test_pickle_and_unpickle_identity(self): | ||
1276 | # Pickling a tree, then unpickling it, yields a tree identical | ||
1277 | # to the original. | ||
1278 | dumped = pickle.dumps(self.tree, 2) | ||
1279 | loaded = pickle.loads(dumped) | ||
1280 | self.assertEqual(loaded.__class__, BeautifulSoup) | ||
1281 | self.assertEqual(loaded.decode(), self.tree.decode()) | ||
1282 | |||
1283 | def test_deepcopy_identity(self): | ||
1284 | # Making a deepcopy of a tree yields an identical tree. | ||
1285 | copied = copy.deepcopy(self.tree) | ||
1286 | self.assertEqual(copied.decode(), self.tree.decode()) | ||
1287 | |||
1288 | def test_unicode_pickle(self): | ||
1289 | # A tree containing Unicode characters can be pickled. | ||
1290 | html = u"<b>\N{SNOWMAN}</b>" | ||
1291 | soup = self.soup(html) | ||
1292 | dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) | ||
1293 | loaded = pickle.loads(dumped) | ||
1294 | self.assertEqual(loaded.decode(), soup.decode()) | ||
1295 | |||
1296 | |||
1297 | class TestSubstitutions(SoupTest): | ||
1298 | |||
1299 | def test_default_formatter_is_minimal(self): | ||
1300 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1301 | soup = self.soup(markup) | ||
1302 | decoded = soup.decode(formatter="minimal") | ||
1303 | # The < is converted back into < but the e-with-acute is left alone. | ||
1304 | self.assertEqual( | ||
1305 | decoded, | ||
1306 | self.document_for( | ||
1307 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | ||
1308 | |||
1309 | def test_formatter_html(self): | ||
1310 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1311 | soup = self.soup(markup) | ||
1312 | decoded = soup.decode(formatter="html") | ||
1313 | self.assertEqual( | ||
1314 | decoded, | ||
1315 | self.document_for("<b><<Sacré bleu!>></b>")) | ||
1316 | |||
1317 | def test_formatter_minimal(self): | ||
1318 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1319 | soup = self.soup(markup) | ||
1320 | decoded = soup.decode(formatter="minimal") | ||
1321 | # The < is converted back into < but the e-with-acute is left alone. | ||
1322 | self.assertEqual( | ||
1323 | decoded, | ||
1324 | self.document_for( | ||
1325 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | ||
1326 | |||
1327 | def test_formatter_null(self): | ||
1328 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | ||
1329 | soup = self.soup(markup) | ||
1330 | decoded = soup.decode(formatter=None) | ||
1331 | # Neither the angle brackets nor the e-with-acute are converted. | ||
1332 | # This is not valid HTML, but it's what the user wanted. | ||
1333 | self.assertEqual(decoded, | ||
1334 | self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | ||
1335 | |||
1336 | def test_formatter_custom(self): | ||
1337 | markup = u"<b><foo></b><b>bar</b>" | ||
1338 | soup = self.soup(markup) | ||
1339 | decoded = soup.decode(formatter = lambda x: x.upper()) | ||
1340 | # Instead of normal entity conversion code, the custom | ||
1341 | # callable is called on every string. | ||
1342 | self.assertEqual( | ||
1343 | decoded, | ||
1344 | self.document_for(u"<b><FOO></b><b>BAR</b>")) | ||
1345 | |||
1346 | def test_formatter_is_run_on_attribute_values(self): | ||
1347 | markup = u'<a href="http://a.com?a=b&c=é">e</a>' | ||
1348 | soup = self.soup(markup) | ||
1349 | a = soup.a | ||
1350 | |||
1351 | expect_minimal = u'<a href="http://a.com?a=b&c=é">e</a>' | ||
1352 | |||
1353 | self.assertEqual(expect_minimal, a.decode()) | ||
1354 | self.assertEqual(expect_minimal, a.decode(formatter="minimal")) | ||
1355 | |||
1356 | expect_html = u'<a href="http://a.com?a=b&c=é">e</a>' | ||
1357 | self.assertEqual(expect_html, a.decode(formatter="html")) | ||
1358 | |||
1359 | self.assertEqual(markup, a.decode(formatter=None)) | ||
1360 | expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' | ||
1361 | self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) | ||
1362 | |||
1363 | def test_formatter_skips_script_tag_for_html_documents(self): | ||
1364 | doc = """ | ||
1365 | <script type="text/javascript"> | ||
1366 | console.log("< < hey > > "); | ||
1367 | </script> | ||
1368 | """ | ||
1369 | encoded = BeautifulSoup(doc).encode() | ||
1370 | self.assertTrue(b"< < hey > >" in encoded) | ||
1371 | |||
1372 | def test_formatter_skips_style_tag_for_html_documents(self): | ||
1373 | doc = """ | ||
1374 | <style type="text/css"> | ||
1375 | console.log("< < hey > > "); | ||
1376 | </style> | ||
1377 | """ | ||
1378 | encoded = BeautifulSoup(doc).encode() | ||
1379 | self.assertTrue(b"< < hey > >" in encoded) | ||
1380 | |||
1381 | def test_prettify_leaves_preformatted_text_alone(self): | ||
1382 | soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") | ||
1383 | # Everything outside the <pre> tag is reformatted, but everything | ||
1384 | # inside is left alone. | ||
1385 | self.assertEqual( | ||
1386 | u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', | ||
1387 | soup.div.prettify()) | ||
1388 | |||
1389 | def test_prettify_accepts_formatter(self): | ||
1390 | soup = BeautifulSoup("<html><body>foo</body></html>") | ||
1391 | pretty = soup.prettify(formatter = lambda x: x.upper()) | ||
1392 | self.assertTrue("FOO" in pretty) | ||
1393 | |||
1394 | def test_prettify_outputs_unicode_by_default(self): | ||
1395 | soup = self.soup("<a></a>") | ||
1396 | self.assertEqual(unicode, type(soup.prettify())) | ||
1397 | |||
1398 | def test_prettify_can_encode_data(self): | ||
1399 | soup = self.soup("<a></a>") | ||
1400 | self.assertEqual(bytes, type(soup.prettify("utf-8"))) | ||
1401 | |||
1402 | def test_html_entity_substitution_off_by_default(self): | ||
1403 | markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" | ||
1404 | soup = self.soup(markup) | ||
1405 | encoded = soup.b.encode("utf-8") | ||
1406 | self.assertEqual(encoded, markup.encode('utf-8')) | ||
1407 | |||
    def test_encoding_substitution(self):
        """Encoding a document rewrites the charset in its <meta> tag."""
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertTrue(b"charset=utf-8" in utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertTrue(b"charset=euc_jp" in euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertTrue(b"charset=shift-jis" in shift_jis)

        # utf-16 output is checked after decoding, since the raw bytes
        # would interleave NULs.
        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertTrue("charset=utf-16" in utf_16_u)
1431 | |||
1432 | def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): | ||
1433 | markup = ('<head><meta content="text/html; charset=x-sjis" ' | ||
1434 | 'http-equiv="Content-type"/></head><pre>foo</pre>') | ||
1435 | |||
1436 | # Beautiful Soup used to try to rewrite the meta tag even if the | ||
1437 | # meta tag got filtered out by the strainer. This test makes | ||
1438 | # sure that doesn't happen. | ||
1439 | strainer = SoupStrainer('pre') | ||
1440 | soup = self.soup(markup, parse_only=strainer) | ||
1441 | self.assertEqual(soup.contents[0].name, 'pre') | ||
1442 | |||
class TestEncoding(SoupTest):
    """Test the ability to encode objects into strings."""

    def test_unicode_string_can_be_encoded(self):
        # A NavigableString encodes like the unicode string it wraps.
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertEqual(soup.b.string.encode("utf-8"),
                         u"\N{SNOWMAN}".encode("utf-8"))

    def test_tag_containing_unicode_string_can_be_encoded(self):
        # Encoding a Tag round-trips its markup through the codec.
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertEqual(
            soup.b.encode("utf-8"), html.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        # Characters the target codec can't represent are substituted
        # instead of raising.
        # NOTE(review): the expected value reads b"<b>☃</b>" here, which
        # cannot be the result of an ASCII encode; the literal was
        # probably b"<b>&#9731;</b>" (an XML character reference) and
        # lost its entity escaping in transit — verify upstream.
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>")

    def test_encoding_can_be_made_strict(self):
        # errors="strict" propagates the codec's UnicodeEncodeError.
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertRaises(
            UnicodeEncodeError, soup.encode, "ascii", errors="strict")

    def test_decode_contents(self):
        # decode_contents() returns only the children, without the
        # enclosing tag.
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())

    def test_encode_contents(self):
        # Byte-string counterpart of decode_contents().
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertEqual(
            u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
                encoding="utf8"))

    def test_deprecated_renderContents(self):
        # The old BS3 name renderContents() still works and behaves
        # like encode_contents().
        html = u"<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        self.assertEqual(
            u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
1486 | |||
class TestNavigableStringSubclasses(SoupTest):
    """Behavior specific to NavigableString subclasses (CData, Doctype)."""

    def test_cdata(self):
        # No current builder produces CData nodes on its own, but they
        # can be created manually and render as CDATA sections.
        soup = self.soup("")
        soup.insert(1, CData("foo"))
        self.assertEqual("<![CDATA[foo]]>", str(soup))
        self.assertEqual("foo", soup.find(text="foo"))
        self.assertEqual("foo", soup.contents[0])

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0
        def watcher(*args):
            # Record that the formatter was consulted; its replacement
            # text must NOT appear in the output.
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        soup.insert(1, CData("<><><>"))
        self.assertEqual(
            b"<![CDATA[<><><>]]>", soup.encode(formatter=watcher))
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # Unlike other NavigableString subclasses, a DOCTYPE always
        # renders with a trailing newline.
        soup = self.soup("")
        soup.insert(1, Doctype("foo"))
        self.assertEqual(b"<!DOCTYPE foo>\n", soup.encode())
1524 | |||
1525 | |||
class TestSoupSelector(TreeTest):
    """Tests for the CSS selector support in Tag.select()."""

    # Shared fixture document: a small page exercising ids, single and
    # multiple classes, lang attributes, rel lists, and nested spans.
    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>

<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
</span>
</div>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""
1566 | |||
    def setUp(self):
        # Parse the shared fixture document fresh for each test.
        self.soup = BeautifulSoup(self.HTML)
1569 | |||
1570 | def assertSelects(self, selector, expected_ids): | ||
1571 | el_ids = [el['id'] for el in self.soup.select(selector)] | ||
1572 | el_ids.sort() | ||
1573 | expected_ids.sort() | ||
1574 | self.assertEqual(expected_ids, el_ids, | ||
1575 | "Selector %s, expected [%s], got [%s]" % ( | ||
1576 | selector, ', '.join(expected_ids), ', '.join(el_ids) | ||
1577 | ) | ||
1578 | ) | ||
1579 | |||
1580 | assertSelect = assertSelects | ||
1581 | |||
1582 | def assertSelectMultiple(self, *tests): | ||
1583 | for selector, expected_ids in tests: | ||
1584 | self.assertSelect(selector, expected_ids) | ||
1585 | |||
1586 | def test_one_tag_one(self): | ||
1587 | els = self.soup.select('title') | ||
1588 | self.assertEqual(len(els), 1) | ||
1589 | self.assertEqual(els[0].name, 'title') | ||
1590 | self.assertEqual(els[0].contents, [u'The title']) | ||
1591 | |||
1592 | def test_one_tag_many(self): | ||
1593 | els = self.soup.select('div') | ||
1594 | self.assertEqual(len(els), 3) | ||
1595 | for div in els: | ||
1596 | self.assertEqual(div.name, 'div') | ||
1597 | |||
1598 | def test_tag_in_tag_one(self): | ||
1599 | els = self.soup.select('div div') | ||
1600 | self.assertSelects('div div', ['inner']) | ||
1601 | |||
1602 | def test_tag_in_tag_many(self): | ||
1603 | for selector in ('html div', 'html body div', 'body div'): | ||
1604 | self.assertSelects(selector, ['main', 'inner', 'footer']) | ||
1605 | |||
1606 | def test_tag_no_match(self): | ||
1607 | self.assertEqual(len(self.soup.select('del')), 0) | ||
1608 | |||
1609 | def test_invalid_tag(self): | ||
1610 | self.assertRaises(ValueError, self.soup.select, 'tag%t') | ||
1611 | |||
1612 | def test_header_tags(self): | ||
1613 | self.assertSelectMultiple( | ||
1614 | ('h1', ['header1']), | ||
1615 | ('h2', ['header2', 'header3']), | ||
1616 | ) | ||
1617 | |||
1618 | def test_class_one(self): | ||
1619 | for selector in ('.onep', 'p.onep', 'html p.onep'): | ||
1620 | els = self.soup.select(selector) | ||
1621 | self.assertEqual(len(els), 1) | ||
1622 | self.assertEqual(els[0].name, 'p') | ||
1623 | self.assertEqual(els[0]['class'], ['onep']) | ||
1624 | |||
1625 | def test_class_mismatched_tag(self): | ||
1626 | els = self.soup.select('div.onep') | ||
1627 | self.assertEqual(len(els), 0) | ||
1628 | |||
1629 | def test_one_id(self): | ||
1630 | for selector in ('div#inner', '#inner', 'div div#inner'): | ||
1631 | self.assertSelects(selector, ['inner']) | ||
1632 | |||
1633 | def test_bad_id(self): | ||
1634 | els = self.soup.select('#doesnotexist') | ||
1635 | self.assertEqual(len(els), 0) | ||
1636 | |||
1637 | def test_items_in_id(self): | ||
1638 | els = self.soup.select('div#inner p') | ||
1639 | self.assertEqual(len(els), 3) | ||
1640 | for el in els: | ||
1641 | self.assertEqual(el.name, 'p') | ||
1642 | self.assertEqual(els[1]['class'], ['onep']) | ||
1643 | self.assertFalse(els[0].has_attr('class')) | ||
1644 | |||
1645 | def test_a_bunch_of_emptys(self): | ||
1646 | for selector in ('div#main del', 'div#main div.oops', 'div div#main'): | ||
1647 | self.assertEqual(len(self.soup.select(selector)), 0) | ||
1648 | |||
1649 | def test_multi_class_support(self): | ||
1650 | for selector in ('.class1', 'p.class1', '.class2', 'p.class2', | ||
1651 | '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): | ||
1652 | self.assertSelects(selector, ['pmulti']) | ||
1653 | |||
1654 | def test_multi_class_selection(self): | ||
1655 | for selector in ('.class1.class3', '.class3.class2', | ||
1656 | '.class1.class2.class3'): | ||
1657 | self.assertSelects(selector, ['pmulti']) | ||
1658 | |||
    def test_child_selector(self):
        # '>' restricts the match to direct children of .s1; adding a
        # descendant part then reaches the deeper span.
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])
1662 | |||
    def test_child_selector_id(self):
        # Child combinator combined with an id-qualified tag.
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
1665 | |||
    def test_attribute_equals(self):
        # [attr="value"] requires an exact attribute-value match, with
        # or without a tag qualifier; unknown values or tags match
        # nothing.
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
            ('[href="no-blah.css"]', []),
        )
1683 | |||
    def test_attribute_tilde(self):
        # [attr~="value"] matches one token of a whitespace-separated
        # attribute value (class lists, rel lists).
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )
1697 | |||
    def test_attribute_startswith(self):
        # [attr^="prefix"] matches attribute values beginning with the
        # prefix.
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
        )
1713 | |||
    def test_attribute_endswith(self):
        # [attr$="suffix"] matches attribute values ending with the
        # suffix.
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
            ('div[id$="1"]', []),
            ('[id$="noending"]', []),
        )
1723 | |||
    def test_attribute_contains(self):
        # [attr*="substring"] matches attribute values containing the
        # substring anywhere; it subsumes the ^= and $= cases above.
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('a[href*="http://"]', ['bob', 'me']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
            ('div[id*="1"]', []),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
        )
1752 | |||
    def test_attribute_exact_or_hypen(self):
        # [attr|="value"] matches "value" exactly or any "value-"
        # prefix (the language-subcode selector).
        # NOTE: the "hypen" typo in the method name is kept; renaming
        # would change the test id.
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )
1760 | |||
    def test_attribute_exists(self):
        # Bare [attr] matches any element carrying the attribute at
        # all, regardless of its value.
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
        )
1771 | |||
    def test_nth_of_type(self):
        """:nth-of-type(n) is 1-based and counts same-type siblings."""
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, u'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Pass in an invalid value: counting starts at 1, so 0 raises.
        self.assertRaises(
            ValueError, self.soup.select, 'div p:nth-of-type(0)')
1790 | |||
1791 | def test_nth_of_type_direct_descendant(self): | ||
1792 | els = self.soup.select('div#inner > p:nth-of-type(1)') | ||
1793 | self.assertEqual(len(els), 1) | ||
1794 | self.assertEqual(els[0].string, u'Some text') | ||
1795 | |||
    def test_id_child_selector_nth_of_type(self):
        # The second <p> child of #inner is the one with id="p1".
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
1798 | |||
    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree, so only that element's descendants match.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not, because it is not inside div#main.
        self.assertSelectsIDs(selected, ['inner'])
1807 | |||
    def test_overspecified_child_id(self):
        # The ancestor part must actually match for the id to be found.
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])
1811 | |||
    def test_adjacent_sibling_selector(self):
        # '+' matches only the immediately following sibling.
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        # The element right after #p1 is an h2, so no <p> matches.
        self.assertEqual([], self.soup.select('#p1 + p'))
1817 | |||
    def test_general_sibling_selector(self):
        # '~' matches any later sibling, not just the adjacent one.
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        # #inner has no later h2 siblings.
        self.assertEqual([], self.soup.select('#inner ~ h2'))
1824 | |||
1825 | def test_dangling_combinator(self): | ||
1826 | self.assertRaises(ValueError, self.soup.select, 'h1 >') | ||
1827 | |||
    def test_sibling_combinator_wont_select_same_tag_twice(self):
        # Each later p[lang] sibling appears once in the result, even
        # though several earlier siblings reach it via '~'.
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])