author    Aníbal Limón <anibal.limon@linux.intel.com>    2014-11-05 12:10:27 -0600
committer Richard Purdie <richard.purdie@linuxfoundation.org>    2014-11-06 16:45:23 +0000
commit    25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch)
tree      7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/tests
parent    bc6330cb7f288e76209410b0812aff1dbfa90950 (diff)
download  poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added the Beautiful Soup module because the fetch/wget latest_versionstring method depends on it. This gives the fetch/wget.py module support for searching upstream sites for new package versions.

(Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a)

Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/tests')
-rw-r--r--  bitbake/lib/bs4/tests/__init__.py               |    1
-rw-r--r--  bitbake/lib/bs4/tests/test_builder_registry.py  |  141
-rw-r--r--  bitbake/lib/bs4/tests/test_docs.py              |   36
-rw-r--r--  bitbake/lib/bs4/tests/test_html5lib.py          |   85
-rw-r--r--  bitbake/lib/bs4/tests/test_htmlparser.py        |   19
-rw-r--r--  bitbake/lib/bs4/tests/test_lxml.py              |   91
-rw-r--r--  bitbake/lib/bs4/tests/test_soup.py              |  434
-rw-r--r--  bitbake/lib/bs4/tests/test_tree.py              | 1829
8 files changed, 2636 insertions(+), 0 deletions(-)
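The commit message above explains why Beautiful Soup is vendored here: the wget fetcher's latest_versionstring support needs an HTML parser to scan upstream index pages for newer release tarballs. As a rough illustration of that idea only (this is not the actual fetch/wget.py code; the sample markup, the version regex, and the find_latest_version helper are assumptions made for this sketch), an upstream directory listing could be searched like this:

import re
from bs4 import BeautifulSoup

def find_latest_version(index_html, package):
    # Hypothetical helper, not bitbake's implementation: return the highest
    # version tuple found in links such as 'package-1.2.3.tar.gz'.
    soup = BeautifulSoup(index_html, "html.parser")
    pattern = re.compile(r"%s-(\d+(?:\.\d+)*)\.tar\.(?:gz|bz2|xz)$" % re.escape(package))
    versions = []
    for link in soup.find_all("a", href=True):
        match = pattern.search(link["href"])
        if match:
            versions.append(tuple(int(part) for part in match.group(1).split(".")))
    return max(versions) if versions else None

sample = ('<html><body>'
          '<a href="foo-1.2.0.tar.gz">foo-1.2.0.tar.gz</a>'
          '<a href="foo-1.10.1.tar.gz">foo-1.10.1.tar.gz</a>'
          '</body></html>')
print(find_latest_version(sample, "foo"))  # (1, 10, 1)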
diff --git a/bitbake/lib/bs4/tests/__init__.py b/bitbake/lib/bs4/tests/__init__.py
new file mode 100644
index 0000000000..142c8cc3f1
--- /dev/null
+++ b/bitbake/lib/bs4/tests/__init__.py
@@ -0,0 +1 @@
"The beautifulsoup tests."
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py
new file mode 100644
index 0000000000..92ad10fb04
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_builder_registry.py
@@ -0,0 +1,141 @@
1"""Tests of the builder registry."""
2
3import unittest
4
5from bs4 import BeautifulSoup
6from bs4.builder import (
7 builder_registry as registry,
8 HTMLParserTreeBuilder,
9 TreeBuilderRegistry,
10)
11
12try:
13 from bs4.builder import HTML5TreeBuilder
14 HTML5LIB_PRESENT = True
15except ImportError:
16 HTML5LIB_PRESENT = False
17
18try:
19 from bs4.builder import (
20 LXMLTreeBuilderForXML,
21 LXMLTreeBuilder,
22 )
23 LXML_PRESENT = True
24except ImportError:
25 LXML_PRESENT = False
26
27
28class BuiltInRegistryTest(unittest.TestCase):
29 """Test the built-in registry with the default builders registered."""
30
31 def test_combination(self):
32 if LXML_PRESENT:
33 self.assertEqual(registry.lookup('fast', 'html'),
34 LXMLTreeBuilder)
35
36 if LXML_PRESENT:
37 self.assertEqual(registry.lookup('permissive', 'xml'),
38 LXMLTreeBuilderForXML)
39 self.assertEqual(registry.lookup('strict', 'html'),
40 HTMLParserTreeBuilder)
41 if HTML5LIB_PRESENT:
42 self.assertEqual(registry.lookup('html5lib', 'html'),
43 HTML5TreeBuilder)
44
45 def test_lookup_by_markup_type(self):
46 if LXML_PRESENT:
47 self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
48 self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
49 else:
50 self.assertEqual(registry.lookup('xml'), None)
51 if HTML5LIB_PRESENT:
52 self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
53 else:
54 self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
55
56 def test_named_library(self):
57 if LXML_PRESENT:
58 self.assertEqual(registry.lookup('lxml', 'xml'),
59 LXMLTreeBuilderForXML)
60 self.assertEqual(registry.lookup('lxml', 'html'),
61 LXMLTreeBuilder)
62 if HTML5LIB_PRESENT:
63 self.assertEqual(registry.lookup('html5lib'),
64 HTML5TreeBuilder)
65
66 self.assertEqual(registry.lookup('html.parser'),
67 HTMLParserTreeBuilder)
68
69 def test_beautifulsoup_constructor_does_lookup(self):
70 # You can pass in a string.
71 BeautifulSoup("", features="html")
72 # Or a list of strings.
73 BeautifulSoup("", features=["html", "fast"])
74
75 # You'll get an exception if BS can't find an appropriate
76 # builder.
77 self.assertRaises(ValueError, BeautifulSoup,
78 "", features="no-such-feature")
79
80class RegistryTest(unittest.TestCase):
81 """Test the TreeBuilderRegistry class in general."""
82
83 def setUp(self):
84 self.registry = TreeBuilderRegistry()
85
86 def builder_for_features(self, *feature_list):
87 cls = type('Builder_' + '_'.join(feature_list),
88 (object,), {'features' : feature_list})
89
90 self.registry.register(cls)
91 return cls
92
93 def test_register_with_no_features(self):
94 builder = self.builder_for_features()
95
96 # Since the builder advertises no features, you can't find it
97 # by looking up features.
98 self.assertEqual(self.registry.lookup('foo'), None)
99
100 # But you can find it by doing a lookup with no features, if
101 # this happens to be the only registered builder.
102 self.assertEqual(self.registry.lookup(), builder)
103
104 def test_register_with_features_makes_lookup_succeed(self):
105 builder = self.builder_for_features('foo', 'bar')
106 self.assertEqual(self.registry.lookup('foo'), builder)
107 self.assertEqual(self.registry.lookup('bar'), builder)
108
109 def test_lookup_fails_when_no_builder_implements_feature(self):
110 builder = self.builder_for_features('foo', 'bar')
111 self.assertEqual(self.registry.lookup('baz'), None)
112
113 def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
114 builder1 = self.builder_for_features('foo')
115 builder2 = self.builder_for_features('bar')
116 self.assertEqual(self.registry.lookup(), builder2)
117
118 def test_lookup_fails_when_no_tree_builders_registered(self):
119 self.assertEqual(self.registry.lookup(), None)
120
121 def test_lookup_gets_most_recent_builder_supporting_all_features(self):
122 has_one = self.builder_for_features('foo')
123 has_the_other = self.builder_for_features('bar')
124 has_both_early = self.builder_for_features('foo', 'bar', 'baz')
125 has_both_late = self.builder_for_features('foo', 'bar', 'quux')
126 lacks_one = self.builder_for_features('bar')
127 has_the_other = self.builder_for_features('foo')
128
129 # There are two builders featuring 'foo' and 'bar', but
130 # the one that also features 'quux' was registered later.
131 self.assertEqual(self.registry.lookup('foo', 'bar'),
132 has_both_late)
133
134 # There is only one builder featuring 'foo', 'bar', and 'baz'.
135 self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
136 has_both_early)
137
138 def test_lookup_fails_when_cannot_reconcile_requested_features(self):
139 builder1 = self.builder_for_features('foo', 'bar')
140 builder2 = self.builder_for_features('foo', 'baz')
141 self.assertEqual(self.registry.lookup('bar', 'baz'), None)
diff --git a/bitbake/lib/bs4/tests/test_docs.py b/bitbake/lib/bs4/tests/test_docs.py
new file mode 100644
index 0000000000..5b9f677093
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_docs.py
@@ -0,0 +1,36 @@
1"Test harness for doctests."
2
3# pylint: disable-msg=E0611,W0142
4
5__metaclass__ = type
6__all__ = [
7 'additional_tests',
8 ]
9
10import atexit
11import doctest
12import os
13#from pkg_resources import (
14# resource_filename, resource_exists, resource_listdir, cleanup_resources)
15import unittest
16
17DOCTEST_FLAGS = (
18 doctest.ELLIPSIS |
19 doctest.NORMALIZE_WHITESPACE |
20 doctest.REPORT_NDIFF)
21
22
23# def additional_tests():
24# "Run the doc tests (README.txt and docs/*, if any exist)"
25# doctest_files = [
26# os.path.abspath(resource_filename('bs4', 'README.txt'))]
27# if resource_exists('bs4', 'docs'):
28# for name in resource_listdir('bs4', 'docs'):
29# if name.endswith('.txt'):
30# doctest_files.append(
31# os.path.abspath(
32# resource_filename('bs4', 'docs/%s' % name)))
33# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
34# atexit.register(cleanup_resources)
35# return unittest.TestSuite((
36# doctest.DocFileSuite(*doctest_files, **kwargs)))
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py
new file mode 100644
index 0000000000..594c3e1f26
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_html5lib.py
@@ -0,0 +1,85 @@
1"""Tests to ensure that the html5lib tree builder generates good trees."""
2
3import warnings
4
5try:
6 from bs4.builder import HTML5TreeBuilder
7 HTML5LIB_PRESENT = True
8except ImportError, e:
9 HTML5LIB_PRESENT = False
10from bs4.element import SoupStrainer
11from bs4.testing import (
12 HTML5TreeBuilderSmokeTest,
13 SoupTest,
14 skipIf,
15)
16
17@skipIf(
18 not HTML5LIB_PRESENT,
19 "html5lib seems not to be present, not testing its tree builder.")
20class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
21 """See ``HTML5TreeBuilderSmokeTest``."""
22
23 @property
24 def default_builder(self):
25 return HTML5TreeBuilder()
26
27 def test_soupstrainer(self):
28 # The html5lib tree builder does not support SoupStrainers.
29 strainer = SoupStrainer("b")
30 markup = "<p>A <b>bold</b> statement.</p>"
31 with warnings.catch_warnings(record=True) as w:
32 soup = self.soup(markup, parse_only=strainer)
33 self.assertEqual(
34 soup.decode(), self.document_for(markup))
35
36 self.assertTrue(
37 "the html5lib tree builder doesn't support parse_only" in
38 str(w[0].message))
39
40 def test_correctly_nested_tables(self):
41 """html5lib inserts <tbody> tags where other parsers don't."""
42 markup = ('<table id="1">'
43 '<tr>'
44 "<td>Here's another table:"
45 '<table id="2">'
46 '<tr><td>foo</td></tr>'
47 '</table></td>')
48
49 self.assertSoupEquals(
50 markup,
51 '<table id="1"><tbody><tr><td>Here\'s another table:'
52 '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
53 '</td></tr></tbody></table>')
54
55 self.assertSoupEquals(
56 "<table><thead><tr><td>Foo</td></tr></thead>"
57 "<tbody><tr><td>Bar</td></tr></tbody>"
58 "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
59
60 def test_xml_declaration_followed_by_doctype(self):
61 markup = '''<?xml version="1.0" encoding="utf-8"?>
62<!DOCTYPE html>
63<html>
64 <head>
65 </head>
66 <body>
67 <p>foo</p>
68 </body>
69</html>'''
70 soup = self.soup(markup)
71 # Verify that we can reach the <p> tag; this means the tree is connected.
72 self.assertEqual(b"<p>foo</p>", soup.p.encode())
73
74 def test_reparented_markup(self):
75 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
76 soup = self.soup(markup)
77 self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
78 self.assertEqual(2, len(soup.find_all('p')))
79
80
81 def test_reparented_markup_ends_with_whitespace(self):
82 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
83 soup = self.soup(markup)
84 self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
85 self.assertEqual(2, len(soup.find_all('p')))
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py
new file mode 100644
index 0000000000..bcb5ed232f
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_htmlparser.py
@@ -0,0 +1,19 @@
1"""Tests to ensure that the html.parser tree builder generates good
2trees."""
3
4from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5from bs4.builder import HTMLParserTreeBuilder
6
7class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
8
9 @property
10 def default_builder(self):
11 return HTMLParserTreeBuilder()
12
13 def test_namespaced_system_doctype(self):
14 # html.parser can't handle namespaced doctypes, so skip this one.
15 pass
16
17 def test_namespaced_public_doctype(self):
18 # html.parser can't handle namespaced doctypes, so skip this one.
19 pass
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py
new file mode 100644
index 0000000000..2b2e9b7e78
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_lxml.py
@@ -0,0 +1,91 @@
1"""Tests to ensure that the lxml tree builder generates good trees."""
2
3import re
4import warnings
5
6try:
7 import lxml.etree
8 LXML_PRESENT = True
9 LXML_VERSION = lxml.etree.LXML_VERSION
10except ImportError, e:
11 LXML_PRESENT = False
12 LXML_VERSION = (0,)
13
14if LXML_PRESENT:
15 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16
17from bs4 import (
18 BeautifulSoup,
19 BeautifulStoneSoup,
20 )
21from bs4.element import Comment, Doctype, SoupStrainer
22from bs4.testing import skipIf
23from bs4.tests import test_htmlparser
24from bs4.testing import (
25 HTMLTreeBuilderSmokeTest,
26 XMLTreeBuilderSmokeTest,
27 SoupTest,
28 skipIf,
29)
30
31@skipIf(
32 not LXML_PRESENT,
33 "lxml seems not to be present, not testing its tree builder.")
34class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
35 """See ``HTMLTreeBuilderSmokeTest``."""
36
37 @property
38 def default_builder(self):
39 return LXMLTreeBuilder()
40
41 def test_out_of_range_entity(self):
42 self.assertSoupEquals(
43 "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
44 self.assertSoupEquals(
45 "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
46 self.assertSoupEquals(
47 "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
48
49 # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50 # test if an old version of lxml is installed.
51
52 @skipIf(
53 not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54 "Skipping doctype test for old version of lxml to avoid segfault.")
55 def test_empty_doctype(self):
56 soup = self.soup("<!DOCTYPE>")
57 doctype = soup.contents[0]
58 self.assertEqual("", doctype.strip())
59
60 def test_beautifulstonesoup_is_xml_parser(self):
61 # Make sure that the deprecated BSS class uses an xml builder
62 # if one is installed.
63 with warnings.catch_warnings(record=True) as w:
64 soup = BeautifulStoneSoup("<b />")
65 self.assertEqual(u"<b/>", unicode(soup.b))
66 self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67
68 def test_real_xhtml_document(self):
69 """lxml strips the XML definition from an XHTML doc, which is fine."""
70 markup = b"""<?xml version="1.0" encoding="utf-8"?>
71<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
72<html xmlns="http://www.w3.org/1999/xhtml">
73<head><title>Hello.</title></head>
74<body>Goodbye.</body>
75</html>"""
76 soup = self.soup(markup)
77 self.assertEqual(
78 soup.encode("utf-8").replace(b"\n", b''),
79 markup.replace(b'\n', b'').replace(
80 b'<?xml version="1.0" encoding="utf-8"?>', b''))
81
82
83@skipIf(
84 not LXML_PRESENT,
85 "lxml seems not to be present, not testing its XML tree builder.")
86class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
87 """See ``HTMLTreeBuilderSmokeTest``."""
88
89 @property
90 def default_builder(self):
91 return LXMLTreeBuilderForXML()
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py
new file mode 100644
index 0000000000..47ac245f99
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_soup.py
@@ -0,0 +1,434 @@
1# -*- coding: utf-8 -*-
2"""Tests of Beautiful Soup as a whole."""
3
4import logging
5import unittest
6import sys
7import tempfile
8
9from bs4 import (
10 BeautifulSoup,
11 BeautifulStoneSoup,
12)
13from bs4.element import (
14 CharsetMetaAttributeValue,
15 ContentMetaAttributeValue,
16 SoupStrainer,
17 NamespacedAttribute,
18 )
19import bs4.dammit
20from bs4.dammit import (
21 EntitySubstitution,
22 UnicodeDammit,
23)
24from bs4.testing import (
25 SoupTest,
26 skipIf,
27)
28import warnings
29
30try:
31 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
32 LXML_PRESENT = True
33except ImportError, e:
34 LXML_PRESENT = False
35
36PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
37PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
38
39class TestConstructor(SoupTest):
40
41 def test_short_unicode_input(self):
42 data = u"<h1>éé</h1>"
43 soup = self.soup(data)
44 self.assertEqual(u"éé", soup.h1.string)
45
46 def test_embedded_null(self):
47 data = u"<h1>foo\0bar</h1>"
48 soup = self.soup(data)
49 self.assertEqual(u"foo\0bar", soup.h1.string)
50
51
52class TestDeprecatedConstructorArguments(SoupTest):
53
54 def test_parseOnlyThese_renamed_to_parse_only(self):
55 with warnings.catch_warnings(record=True) as w:
56 soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
57 msg = str(w[0].message)
58 self.assertTrue("parseOnlyThese" in msg)
59 self.assertTrue("parse_only" in msg)
60 self.assertEqual(b"<b></b>", soup.encode())
61
62 def test_fromEncoding_renamed_to_from_encoding(self):
63 with warnings.catch_warnings(record=True) as w:
64 utf8 = b"\xc3\xa9"
65 soup = self.soup(utf8, fromEncoding="utf8")
66 msg = str(w[0].message)
67 self.assertTrue("fromEncoding" in msg)
68 self.assertTrue("from_encoding" in msg)
69 self.assertEqual("utf8", soup.original_encoding)
70
71 def test_unrecognized_keyword_argument(self):
72 self.assertRaises(
73 TypeError, self.soup, "<a>", no_such_argument=True)
74
75class TestWarnings(SoupTest):
76
77 def test_disk_file_warning(self):
78 filehandle = tempfile.NamedTemporaryFile()
79 filename = filehandle.name
80 try:
81 with warnings.catch_warnings(record=True) as w:
82 soup = self.soup(filename)
83 msg = str(w[0].message)
84 self.assertTrue("looks like a filename" in msg)
85 finally:
86 filehandle.close()
87
88 # The file no longer exists, so Beautiful Soup will no longer issue the warning.
89 with warnings.catch_warnings(record=True) as w:
90 soup = self.soup(filename)
91 self.assertEqual(0, len(w))
92
93 def test_url_warning(self):
94 with warnings.catch_warnings(record=True) as w:
95 soup = self.soup("http://www.crummy.com/")
96 msg = str(w[0].message)
97 self.assertTrue("looks like a URL" in msg)
98
99 with warnings.catch_warnings(record=True) as w:
100 soup = self.soup("http://www.crummy.com/ is great")
101 self.assertEqual(0, len(w))
102
103class TestSelectiveParsing(SoupTest):
104
105 def test_parse_with_soupstrainer(self):
106 markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
107 strainer = SoupStrainer("b")
108 soup = self.soup(markup, parse_only=strainer)
109 self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
110
111
112class TestEntitySubstitution(unittest.TestCase):
113 """Standalone tests of the EntitySubstitution class."""
114 def setUp(self):
115 self.sub = EntitySubstitution
116
117 def test_simple_html_substitution(self):
118 # Unicode characters corresponding to named HTML entities
119 # are substituted, and no others.
120 s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
121 self.assertEqual(self.sub.substitute_html(s),
122 u"foo&forall;\N{SNOWMAN}&otilde;bar")
123
124 def test_smart_quote_substitution(self):
125 # MS smart quotes are a common source of frustration, so we
126 # give them a special test.
127 quotes = b"\x91\x92foo\x93\x94"
128 dammit = UnicodeDammit(quotes)
129 self.assertEqual(self.sub.substitute_html(dammit.markup),
130 "&lsquo;&rsquo;foo&ldquo;&rdquo;")
131
132 def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
133 s = 'Welcome to "my bar"'
134 self.assertEqual(self.sub.substitute_xml(s, False), s)
135
136 def test_xml_attribute_quoting_normally_uses_double_quotes(self):
137 self.assertEqual(self.sub.substitute_xml("Welcome", True),
138 '"Welcome"')
139 self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
140 '"Bob\'s Bar"')
141
142 def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
143 s = 'Welcome to "my bar"'
144 self.assertEqual(self.sub.substitute_xml(s, True),
145 "'Welcome to \"my bar\"'")
146
147 def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
148 s = 'Welcome to "Bob\'s Bar"'
149 self.assertEqual(
150 self.sub.substitute_xml(s, True),
151 '"Welcome to &quot;Bob\'s Bar&quot;"')
152
153 def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
154 quoted = 'Welcome to "Bob\'s Bar"'
155 self.assertEqual(self.sub.substitute_xml(quoted), quoted)
156
157 def test_xml_quoting_handles_angle_brackets(self):
158 self.assertEqual(
159 self.sub.substitute_xml("foo<bar>"),
160 "foo&lt;bar&gt;")
161
162 def test_xml_quoting_handles_ampersands(self):
163 self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
164
165 def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
166 self.assertEqual(
167 self.sub.substitute_xml("&Aacute;T&T"),
168 "&amp;Aacute;T&amp;T")
169
170 def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
171 self.assertEqual(
172 self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
173 "&Aacute;T&amp;T")
174
175 def test_quotes_not_html_substituted(self):
176 """There's no need to do this except inside attribute values."""
177 text = 'Bob\'s "bar"'
178 self.assertEqual(self.sub.substitute_html(text), text)
179
180
181class TestEncodingConversion(SoupTest):
182 # Test Beautiful Soup's ability to decode and encode from various
183 # encodings.
184
185 def setUp(self):
186 super(TestEncodingConversion, self).setUp()
187 self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
188 self.utf8_data = self.unicode_data.encode("utf-8")
189 # Just so you know what it looks like.
190 self.assertEqual(
191 self.utf8_data,
192 b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
193
194 def test_ascii_in_unicode_out(self):
195 # ASCII input is converted to Unicode. The original_encoding
196 # attribute is set to 'utf-8', a superset of ASCII.
197 chardet = bs4.dammit.chardet_dammit
198 logging.disable(logging.WARNING)
199 try:
200 def noop(str):
201 return None
202 # Disable chardet, which will realize that the ASCII is ASCII.
203 bs4.dammit.chardet_dammit = noop
204 ascii = b"<foo>a</foo>"
205 soup_from_ascii = self.soup(ascii)
206 unicode_output = soup_from_ascii.decode()
207 self.assertTrue(isinstance(unicode_output, unicode))
208 self.assertEqual(unicode_output, self.document_for(ascii.decode()))
209 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
210 finally:
211 logging.disable(logging.NOTSET)
212 bs4.dammit.chardet_dammit = chardet
213
214 def test_unicode_in_unicode_out(self):
215 # Unicode input is left alone. The original_encoding attribute
216 # is not set.
217 soup_from_unicode = self.soup(self.unicode_data)
218 self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
219 self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
220 self.assertEqual(soup_from_unicode.original_encoding, None)
221
222 def test_utf8_in_unicode_out(self):
223 # UTF-8 input is converted to Unicode. The original_encoding
224 # attribute is set.
225 soup_from_utf8 = self.soup(self.utf8_data)
226 self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
227 self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
228
229 def test_utf8_out(self):
230 # The internal data structures can be encoded as UTF-8.
231 soup_from_unicode = self.soup(self.unicode_data)
232 self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
233
234 @skipIf(
235 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
236 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
237 def test_attribute_name_containing_unicode_characters(self):
238 markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
239 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
240
241class TestUnicodeDammit(unittest.TestCase):
242 """Standalone tests of UnicodeDammit."""
243
244 def test_unicode_input(self):
245 markup = u"I'm already Unicode! \N{SNOWMAN}"
246 dammit = UnicodeDammit(markup)
247 self.assertEqual(dammit.unicode_markup, markup)
248
249 def test_smart_quotes_to_unicode(self):
250 markup = b"<foo>\x91\x92\x93\x94</foo>"
251 dammit = UnicodeDammit(markup)
252 self.assertEqual(
253 dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
254
255 def test_smart_quotes_to_xml_entities(self):
256 markup = b"<foo>\x91\x92\x93\x94</foo>"
257 dammit = UnicodeDammit(markup, smart_quotes_to="xml")
258 self.assertEqual(
259 dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
260
261 def test_smart_quotes_to_html_entities(self):
262 markup = b"<foo>\x91\x92\x93\x94</foo>"
263 dammit = UnicodeDammit(markup, smart_quotes_to="html")
264 self.assertEqual(
265 dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
266
267 def test_smart_quotes_to_ascii(self):
268 markup = b"<foo>\x91\x92\x93\x94</foo>"
269 dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
270 self.assertEqual(
271 dammit.unicode_markup, """<foo>''""</foo>""")
272
273 def test_detect_utf8(self):
274 utf8 = b"\xc3\xa9"
275 dammit = UnicodeDammit(utf8)
276 self.assertEqual(dammit.unicode_markup, u'\xe9')
277 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
278
279 def test_convert_hebrew(self):
280 hebrew = b"\xed\xe5\xec\xf9"
281 dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
282 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
283 self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
284
285 def test_dont_see_smart_quotes_where_there_are_none(self):
286 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
287 dammit = UnicodeDammit(utf_8)
288 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
289 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
290
291 def test_ignore_inappropriate_codecs(self):
292 utf8_data = u"Räksmörgås".encode("utf-8")
293 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
294 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
295
296 def test_ignore_invalid_codecs(self):
297 utf8_data = u"Räksmörgås".encode("utf-8")
298 for bad_encoding in ['.utf8', '...', 'utF---16.!']:
299 dammit = UnicodeDammit(utf8_data, [bad_encoding])
300 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
301
302 def test_detect_html5_style_meta_tag(self):
303
304 for data in (
305 b'<html><meta charset="euc-jp" /></html>',
306 b"<html><meta charset='euc-jp' /></html>",
307 b"<html><meta charset=euc-jp /></html>",
308 b"<html><meta charset=euc-jp/></html>"):
309 dammit = UnicodeDammit(data, is_html=True)
310 self.assertEqual(
311 "euc-jp", dammit.original_encoding)
312
313 def test_last_ditch_entity_replacement(self):
314 # This is a UTF-8 document that contains bytestrings
315 # completely incompatible with UTF-8 (ie. encoded with some other
316 # encoding).
317 #
318 # Since there is no consistent encoding for the document,
319 # Unicode, Dammit will eventually encode the document as UTF-8
320 # and encode the incompatible characters as REPLACEMENT
321 # CHARACTER.
322 #
323 # If chardet is installed, it will detect that the document
324 # can be converted into ISO-8859-1 without errors. This happens
325 # to be the wrong encoding, but it is a consistent encoding, so the
326 # code we're testing here won't run.
327 #
328 # So we temporarily disable chardet if it's present.
329 doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
330<html><b>\330\250\330\252\330\261</b>
331<i>\310\322\321\220\312\321\355\344</i></html>"""
332 chardet = bs4.dammit.chardet_dammit
333 logging.disable(logging.WARNING)
334 try:
335 def noop(str):
336 return None
337 bs4.dammit.chardet_dammit = noop
338 dammit = UnicodeDammit(doc)
339 self.assertEqual(True, dammit.contains_replacement_characters)
340 self.assertTrue(u"\ufffd" in dammit.unicode_markup)
341
342 soup = BeautifulSoup(doc, "html.parser")
343 self.assertTrue(soup.contains_replacement_characters)
344 finally:
345 logging.disable(logging.NOTSET)
346 bs4.dammit.chardet_dammit = chardet
347
348 def test_byte_order_mark_removed(self):
349 # A document written in UTF-16LE will have its byte order marker stripped.
350 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
351 dammit = UnicodeDammit(data)
352 self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
353 self.assertEqual("utf-16le", dammit.original_encoding)
354
355 def test_detwingle(self):
356 # Here's a UTF8 document.
357 utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
358
359 # Here's a Windows-1252 document.
360 windows_1252 = (
361 u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
362 u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
363
364 # Through some unholy alchemy, they've been stuck together.
365 doc = utf8 + windows_1252 + utf8
366
367 # The document can't be turned into UTF-8:
368 self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
369
370 # Unicode, Dammit thinks the whole document is Windows-1252,
371 # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
372
373 # But if we run it through fix_embedded_windows_1252, it's fixed:
374
375 fixed = UnicodeDammit.detwingle(doc)
376 self.assertEqual(
377 u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
378
379 def test_detwingle_ignores_multibyte_characters(self):
380 # Each of these characters has a UTF-8 representation ending
381 # in \x93. \x93 is a smart quote if interpreted as
382 # Windows-1252. But our code knows to skip over multibyte
383 # UTF-8 characters, so they'll survive the process unscathed.
384 for tricky_unicode_char in (
385 u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
386 u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
387 u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
388 ):
389 input = tricky_unicode_char.encode("utf8")
390 self.assertTrue(input.endswith(b'\x93'))
391 output = UnicodeDammit.detwingle(input)
392 self.assertEqual(output, input)
393
394class TestNamedspacedAttribute(SoupTest):
395
396 def test_name_may_be_none(self):
397 a = NamespacedAttribute("xmlns", None)
398 self.assertEqual(a, "xmlns")
399
400 def test_attribute_is_equivalent_to_colon_separated_string(self):
401 a = NamespacedAttribute("a", "b")
402 self.assertEqual("a:b", a)
403
404 def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
405 a = NamespacedAttribute("a", "b", "c")
406 b = NamespacedAttribute("a", "b", "c")
407 self.assertEqual(a, b)
408
409 # The actual namespace is not considered.
410 c = NamespacedAttribute("a", "b", None)
411 self.assertEqual(a, c)
412
413 # But name and prefix are important.
414 d = NamespacedAttribute("a", "z", "c")
415 self.assertNotEqual(a, d)
416
417 e = NamespacedAttribute("z", "b", "c")
418 self.assertNotEqual(a, e)
419
420
421class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
422
423 def test_content_meta_attribute_value(self):
424 value = CharsetMetaAttributeValue("euc-jp")
425 self.assertEqual("euc-jp", value)
426 self.assertEqual("euc-jp", value.original_value)
427 self.assertEqual("utf8", value.encode("utf8"))
428
429
430 def test_content_meta_attribute_value(self):
431 value = ContentMetaAttributeValue("text/html; charset=euc-jp")
432 self.assertEqual("text/html; charset=euc-jp", value)
433 self.assertEqual("text/html; charset=euc-jp", value.original_value)
434 self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py
new file mode 100644
index 0000000000..f8515c0ea1
--- /dev/null
+++ b/bitbake/lib/bs4/tests/test_tree.py
@@ -0,0 +1,1829 @@
1# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12import copy
13import pickle
14import re
15import warnings
16from bs4 import BeautifulSoup
17from bs4.builder import (
18 builder_registry,
19 HTMLParserTreeBuilder,
20)
21from bs4.element import (
22 CData,
23 Comment,
24 Doctype,
25 NavigableString,
26 SoupStrainer,
27 Tag,
28)
29from bs4.testing import (
30 SoupTest,
31 skipIf,
32)
33
34XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
35LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
36
37class TreeTest(SoupTest):
38
39 def assertSelects(self, tags, should_match):
40 """Make sure that the given tags have the correct text.
41
42 This is used in tests that define a bunch of tags, each
43 containing a single string, and then select certain strings by
44 some mechanism.
45 """
46 self.assertEqual([tag.string for tag in tags], should_match)
47
48 def assertSelectsIDs(self, tags, should_match):
49 """Make sure that the given tags have the correct IDs.
50
51 This is used in tests that define a bunch of tags, each
52 containing a single string, and then select certain strings by
53 some mechanism.
54 """
55 self.assertEqual([tag['id'] for tag in tags], should_match)
56
57
58class TestFind(TreeTest):
59 """Basic tests of the find() method.
60
61 find() just calls find_all() with limit=1, so it's not tested all
62 that thouroughly here.
63 """
64
65 def test_find_tag(self):
66 soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
67 self.assertEqual(soup.find("b").string, "2")
68
69 def test_unicode_text_find(self):
70 soup = self.soup(u'<h1>Räksmörgås</h1>')
71 self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')
72
73 def test_find_everything(self):
74 """Test an optimization that finds all tags."""
75 soup = self.soup("<a>foo</a><b>bar</b>")
76 self.assertEqual(2, len(soup.find_all()))
77
78 def test_find_everything_with_name(self):
79 """Test an optimization that finds all tags with a given name."""
80 soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
81 self.assertEqual(2, len(soup.find_all('a')))
82
83class TestFindAll(TreeTest):
84 """Basic tests of the find_all() method."""
85
86 def test_find_all_text_nodes(self):
87 """You can search the tree for text nodes."""
88 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
89 # Exact match.
90 self.assertEqual(soup.find_all(text="bar"), [u"bar"])
91 # Match any of a number of strings.
92 self.assertEqual(
93 soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
94 # Match a regular expression.
95 self.assertEqual(soup.find_all(text=re.compile('.*')),
96 [u"Foo", u"bar", u'\xbb'])
97 # Match anything.
98 self.assertEqual(soup.find_all(text=True),
99 [u"Foo", u"bar", u'\xbb'])
100
101 def test_find_all_limit(self):
102 """You can limit the number of items returned by find_all."""
103 soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
104 self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
105 self.assertSelects(soup.find_all('a', limit=1), ["1"])
106 self.assertSelects(
107 soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
108
109 # A limit of 0 means no limit.
110 self.assertSelects(
111 soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
112
113 def test_calling_a_tag_is_calling_findall(self):
114 soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
115 self.assertSelects(soup('a', limit=1), ["1"])
116 self.assertSelects(soup.b(id="foo"), ["3"])
117
118 def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
119 soup = self.soup("<a></a>")
120 # Create a self-referential list.
121 l = []
122 l.append(l)
123
124 # Without special code in _normalize_search_value, this would cause infinite
125 # recursion.
126 self.assertEqual([], soup.find_all(l))
127
128 def test_find_all_resultset(self):
129 """All find_all calls return a ResultSet"""
130 soup = self.soup("<a></a>")
131 result = soup.find_all("a")
132 self.assertTrue(hasattr(result, "source"))
133
134 result = soup.find_all(True)
135 self.assertTrue(hasattr(result, "source"))
136
137 result = soup.find_all(text="foo")
138 self.assertTrue(hasattr(result, "source"))
139
140
141class TestFindAllBasicNamespaces(TreeTest):
142
143 def test_find_by_namespaced_name(self):
144 soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
145 self.assertEqual("4", soup.find("mathml:msqrt").string)
146 self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
147
148
149class TestFindAllByName(TreeTest):
150 """Test ways of finding tags by tag name."""
151
152 def setUp(self):
153 super(TreeTest, self).setUp()
154 self.tree = self.soup("""<a>First tag.</a>
155 <b>Second tag.</b>
156 <c>Third <a>Nested tag.</a> tag.</c>""")
157
158 def test_find_all_by_tag_name(self):
159 # Find all the <a> tags.
160 self.assertSelects(
161 self.tree.find_all('a'), ['First tag.', 'Nested tag.'])
162
163 def test_find_all_by_name_and_text(self):
164 self.assertSelects(
165 self.tree.find_all('a', text='First tag.'), ['First tag.'])
166
167 self.assertSelects(
168 self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])
169
170 self.assertSelects(
171 self.tree.find_all('a', text=re.compile("tag")),
172 ['First tag.', 'Nested tag.'])
173
174
175 def test_find_all_on_non_root_element(self):
176 # You can call find_all on any node, not just the root.
177 self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])
178
179 def test_calling_element_invokes_find_all(self):
180 self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])
181
182 def test_find_all_by_tag_strainer(self):
183 self.assertSelects(
184 self.tree.find_all(SoupStrainer('a')),
185 ['First tag.', 'Nested tag.'])
186
187 def test_find_all_by_tag_names(self):
188 self.assertSelects(
189 self.tree.find_all(['a', 'b']),
190 ['First tag.', 'Second tag.', 'Nested tag.'])
191
192 def test_find_all_by_tag_dict(self):
193 self.assertSelects(
194 self.tree.find_all({'a' : True, 'b' : True}),
195 ['First tag.', 'Second tag.', 'Nested tag.'])
196
197 def test_find_all_by_tag_re(self):
198 self.assertSelects(
199 self.tree.find_all(re.compile('^[ab]$')),
200 ['First tag.', 'Second tag.', 'Nested tag.'])
201
202 def test_find_all_with_tags_matching_method(self):
203 # You can define an oracle method that determines whether
204 # a tag matches the search.
205 def id_matches_name(tag):
206 return tag.name == tag.get('id')
207
208 tree = self.soup("""<a id="a">Match 1.</a>
209 <a id="1">Does not match.</a>
210 <b id="b">Match 2.</a>""")
211
212 self.assertSelects(
213 tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
214
215
216class TestFindAllByAttribute(TreeTest):
217
218 def test_find_all_by_attribute_name(self):
219 # You can pass in keyword arguments to find_all to search by
220 # attribute.
221 tree = self.soup("""
222 <a id="first">Matching a.</a>
223 <a id="second">
224 Non-matching <b id="first">Matching b.</b>a.
225 </a>""")
226 self.assertSelects(tree.find_all(id='first'),
227 ["Matching a.", "Matching b."])
228
229 def test_find_all_by_utf8_attribute_value(self):
230 peace = u"םולש".encode("utf8")
231 data = u'<a title="םולש"></a>'.encode("utf8")
232 soup = self.soup(data)
233 self.assertEqual([soup.a], soup.find_all(title=peace))
234 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
235 self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))
236
237 def test_find_all_by_attribute_dict(self):
238 # You can pass in a dictionary as the argument 'attrs'. This
239 # lets you search for attributes like 'name' (a fixed argument
240 # to find_all) and 'class' (a reserved word in Python.)
241 tree = self.soup("""
242 <a name="name1" class="class1">Name match.</a>
243 <a name="name2" class="class2">Class match.</a>
244 <a name="name3" class="class3">Non-match.</a>
245 <name1>A tag called 'name1'.</name1>
246 """)
247
248 # This doesn't do what you want.
249 self.assertSelects(tree.find_all(name='name1'),
250 ["A tag called 'name1'."])
251 # This does what you want.
252 self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
253 ["Name match."])
254
255 self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
256 ["Class match."])
257
258 def test_find_all_by_class(self):
259 tree = self.soup("""
260 <a class="1">Class 1.</a>
261 <a class="2">Class 2.</a>
262 <b class="1">Class 1.</b>
263 <c class="3 4">Class 3 and 4.</c>
264 """)
265
266 # Passing in the class_ keyword argument will search against
267 # the 'class' attribute.
268 self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
269 self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
270 self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
271
272 # Passing in a string to 'attrs' will also search the CSS class.
273 self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
274 self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
275 self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
276 self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
277
278 def test_find_by_class_when_multiple_classes_present(self):
279 tree = self.soup("<gar class='foo bar'>Found it</gar>")
280
281 f = tree.find_all("gar", class_=re.compile("o"))
282 self.assertSelects(f, ["Found it"])
283
284 f = tree.find_all("gar", class_=re.compile("a"))
285 self.assertSelects(f, ["Found it"])
286
287 # Since the class is not the string "foo bar", but the two
288 # strings "foo" and "bar", this will not find anything.
289 f = tree.find_all("gar", class_=re.compile("o b"))
290 self.assertSelects(f, [])
291
292 def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
293 soup = self.soup("<a class='bar'>Found it</a>")
294
295 self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
296
297 def big_attribute_value(value):
298 return len(value) > 3
299
300 self.assertSelects(soup.find_all("a", big_attribute_value), [])
301
302 def small_attribute_value(value):
303 return len(value) <= 3
304
305 self.assertSelects(
306 soup.find_all("a", small_attribute_value), ["Found it"])
307
308 def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
309 soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
310 a, a2 = soup.find_all("a")
311 self.assertEqual([a, a2], soup.find_all("a", "foo"))
312 self.assertEqual([a], soup.find_all("a", "bar"))
313
314 # If you specify the class as a string that contains a
315 # space, only that specific value will be found.
316 self.assertEqual([a], soup.find_all("a", class_="foo bar"))
317 self.assertEqual([a], soup.find_all("a", "foo bar"))
318 self.assertEqual([], soup.find_all("a", "bar foo"))
319
320 def test_find_all_by_attribute_soupstrainer(self):
321 tree = self.soup("""
322 <a id="first">Match.</a>
323 <a id="second">Non-match.</a>""")
324
325 strainer = SoupStrainer(attrs={'id' : 'first'})
326 self.assertSelects(tree.find_all(strainer), ['Match.'])
327
328 def test_find_all_with_missing_atribute(self):
329 # You can pass in None as the value of an attribute to find_all.
330 # This will match tags that do not have that attribute set.
331 tree = self.soup("""<a id="1">ID present.</a>
332 <a>No ID present.</a>
333 <a id="">ID is empty.</a>""")
334 self.assertSelects(tree.find_all('a', id=None), ["No ID present."])
335
336 def test_find_all_with_defined_attribute(self):
337 # You can pass in None as the value of an attribute to find_all.
338 # This will match tags that have that attribute set to any value.
339 tree = self.soup("""<a id="1">ID present.</a>
340 <a>No ID present.</a>
341 <a id="">ID is empty.</a>""")
342 self.assertSelects(
343 tree.find_all(id=True), ["ID present.", "ID is empty."])
344
345 def test_find_all_with_numeric_attribute(self):
346 # If you search for a number, it's treated as a string.
347 tree = self.soup("""<a id=1>Unquoted attribute.</a>
348 <a id="1">Quoted attribute.</a>""")
349
350 expected = ["Unquoted attribute.", "Quoted attribute."]
351 self.assertSelects(tree.find_all(id=1), expected)
352 self.assertSelects(tree.find_all(id="1"), expected)
353
354 def test_find_all_with_list_attribute_values(self):
355 # You can pass a list of attribute values instead of just one,
356 # and you'll get tags that match any of the values.
357 tree = self.soup("""<a id="1">1</a>
358 <a id="2">2</a>
359 <a id="3">3</a>
360 <a>No ID.</a>""")
361 self.assertSelects(tree.find_all(id=["1", "3", "4"]),
362 ["1", "3"])
363
364 def test_find_all_with_regular_expression_attribute_value(self):
365 # You can pass a regular expression as an attribute value, and
366 # you'll get tags whose values for that attribute match the
367 # regular expression.
368 tree = self.soup("""<a id="a">One a.</a>
369 <a id="aa">Two as.</a>
370 <a id="ab">Mixed as and bs.</a>
371 <a id="b">One b.</a>
372 <a>No ID.</a>""")
373
374 self.assertSelects(tree.find_all(id=re.compile("^a+$")),
375 ["One a.", "Two as."])
376
377 def test_find_by_name_and_containing_string(self):
378 soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
379 a = soup.a
380
381 self.assertEqual([a], soup.find_all("a", text="foo"))
382 self.assertEqual([], soup.find_all("a", text="bar"))
383 self.assertEqual([], soup.find_all("a", text="bar"))
384
385 def test_find_by_name_and_containing_string_when_string_is_buried(self):
386 soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
387 self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))
388
389 def test_find_by_attribute_and_containing_string(self):
390 soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
391 a = soup.a
392
393 self.assertEqual([a], soup.find_all(id=2, text="foo"))
394 self.assertEqual([], soup.find_all(id=1, text="bar"))
395
396
397
398
399class TestIndex(TreeTest):
400 """Test Tag.index"""
401 def test_index(self):
402 tree = self.soup("""<div>
403 <a>Identical</a>
404 <b>Not identical</b>
405 <a>Identical</a>
406
407 <c><d>Identical with child</d></c>
408 <b>Also not identical</b>
409 <c><d>Identical with child</d></c>
410 </div>""")
411 div = tree.div
412 for i, element in enumerate(div.contents):
413 self.assertEqual(i, div.index(element))
414 self.assertRaises(ValueError, tree.index, 1)
415
416
417class TestParentOperations(TreeTest):
418 """Test navigation and searching through an element's parents."""
419
420 def setUp(self):
421 super(TestParentOperations, self).setUp()
422 self.tree = self.soup('''<ul id="empty"></ul>
423 <ul id="top">
424 <ul id="middle">
425 <ul id="bottom">
426 <b>Start here</b>
427 </ul>
428 </ul>''')
429 self.start = self.tree.b
430
431
432 def test_parent(self):
433 self.assertEqual(self.start.parent['id'], 'bottom')
434 self.assertEqual(self.start.parent.parent['id'], 'middle')
435 self.assertEqual(self.start.parent.parent.parent['id'], 'top')
436
437 def test_parent_of_top_tag_is_soup_object(self):
438 top_tag = self.tree.contents[0]
439 self.assertEqual(top_tag.parent, self.tree)
440
441 def test_soup_object_has_no_parent(self):
442 self.assertEqual(None, self.tree.parent)
443
444 def test_find_parents(self):
445 self.assertSelectsIDs(
446 self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
447 self.assertSelectsIDs(
448 self.start.find_parents('ul', id="middle"), ['middle'])
449
450 def test_find_parent(self):
451 self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
452 self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
453
454 def test_parent_of_text_element(self):
455 text = self.tree.find(text="Start here")
456 self.assertEqual(text.parent.name, 'b')
457
458 def test_text_element_find_parent(self):
459 text = self.tree.find(text="Start here")
460 self.assertEqual(text.find_parent('ul')['id'], 'bottom')
461
462 def test_parent_generator(self):
463 parents = [parent['id'] for parent in self.start.parents
464 if parent is not None and 'id' in parent.attrs]
465 self.assertEqual(parents, ['bottom', 'middle', 'top'])
466
467
468class ProximityTest(TreeTest):
469
470 def setUp(self):
471 super(TreeTest, self).setUp()
472 self.tree = self.soup(
473 '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
474
475
476class TestNextOperations(ProximityTest):
477
478 def setUp(self):
479 super(TestNextOperations, self).setUp()
480 self.start = self.tree.b
481
482 def test_next(self):
483 self.assertEqual(self.start.next_element, "One")
484 self.assertEqual(self.start.next_element.next_element['id'], "2")
485
486 def test_next_of_last_item_is_none(self):
487 last = self.tree.find(text="Three")
488 self.assertEqual(last.next_element, None)
489
490 def test_next_of_root_is_none(self):
491 # The document root is outside the next/previous chain.
492 self.assertEqual(self.tree.next_element, None)
493
494 def test_find_all_next(self):
495 self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
496 self.start.find_all_next(id=3)
497 self.assertSelects(self.start.find_all_next(id=3), ["Three"])
498
499 def test_find_next(self):
500 self.assertEqual(self.start.find_next('b')['id'], '2')
501 self.assertEqual(self.start.find_next(text="Three"), "Three")
502
503 def test_find_next_for_text_element(self):
504 text = self.tree.find(text="One")
505 self.assertEqual(text.find_next("b").string, "Two")
506 self.assertSelects(text.find_all_next("b"), ["Two", "Three"])
507
508 def test_next_generator(self):
509 start = self.tree.find(text="Two")
510 successors = [node for node in start.next_elements]
511 # There are two successors: the final <b> tag and its text contents.
512 tag, contents = successors
513 self.assertEqual(tag['id'], '3')
514 self.assertEqual(contents, "Three")
515
516class TestPreviousOperations(ProximityTest):
517
518 def setUp(self):
519 super(TestPreviousOperations, self).setUp()
520 self.end = self.tree.find(text="Three")
521
522 def test_previous(self):
523 self.assertEqual(self.end.previous_element['id'], "3")
524 self.assertEqual(self.end.previous_element.previous_element, "Two")
525
526 def test_previous_of_first_item_is_none(self):
527 first = self.tree.find('html')
528 self.assertEqual(first.previous_element, None)
529
530 def test_previous_of_root_is_none(self):
531 # The document root is outside the next/previous chain.
532 # XXX This is broken!
533 #self.assertEqual(self.tree.previous_element, None)
534 pass
535
536 def test_find_all_previous(self):
537 # The <b> tag containing the "Three" node is the predecessor
538 # of the "Three" node itself, which is why "Three" shows up
539 # here.
540 self.assertSelects(
541 self.end.find_all_previous('b'), ["Three", "Two", "One"])
542 self.assertSelects(self.end.find_all_previous(id=1), ["One"])
543
544 def test_find_previous(self):
545 self.assertEqual(self.end.find_previous('b')['id'], '3')
546 self.assertEqual(self.end.find_previous(text="One"), "One")
547
548 def test_find_previous_for_text_element(self):
549 text = self.tree.find(text="Three")
550 self.assertEqual(text.find_previous("b").string, "Three")
551 self.assertSelects(
552 text.find_all_previous("b"), ["Three", "Two", "One"])
553
554 def test_previous_generator(self):
555 start = self.tree.find(text="One")
556 predecessors = [node for node in start.previous_elements]
557
558 # There are four predecessors: the <b> tag containing "One"
559 # the <body> tag, the <head> tag, and the <html> tag.
560 b, body, head, html = predecessors
561 self.assertEqual(b['id'], '1')
562 self.assertEqual(body.name, "body")
563 self.assertEqual(head.name, "head")
564 self.assertEqual(html.name, "html")
565
566
567class SiblingTest(TreeTest):
568
569 def setUp(self):
570 super(SiblingTest, self).setUp()
571 markup = '''<html>
572 <span id="1">
573 <span id="1.1"></span>
574 </span>
575 <span id="2">
576 <span id="2.1"></span>
577 </span>
578 <span id="3">
579 <span id="3.1"></span>
580 </span>
581 <span id="4"></span>
582 </html>'''
583 # All that whitespace looks good but makes the tests more
584 # difficult. Get rid of it.
585 markup = re.compile("\n\s*").sub("", markup)
586 self.tree = self.soup(markup)
587
588
589class TestNextSibling(SiblingTest):
590
591 def setUp(self):
592 super(TestNextSibling, self).setUp()
593 self.start = self.tree.find(id="1")
594
595 def test_next_sibling_of_root_is_none(self):
596 self.assertEqual(self.tree.next_sibling, None)
597
598 def test_next_sibling(self):
599 self.assertEqual(self.start.next_sibling['id'], '2')
600 self.assertEqual(self.start.next_sibling.next_sibling['id'], '3')
601
602 # Note the difference between next_sibling and next_element.
603 self.assertEqual(self.start.next_element['id'], '1.1')
604
605 def test_next_sibling_may_not_exist(self):
606 self.assertEqual(self.tree.html.next_sibling, None)
607
608 nested_span = self.tree.find(id="1.1")
609 self.assertEqual(nested_span.next_sibling, None)
610
611 last_span = self.tree.find(id="4")
612 self.assertEqual(last_span.next_sibling, None)
613
614 def test_find_next_sibling(self):
615 self.assertEqual(self.start.find_next_sibling('span')['id'], '2')
616
617 def test_next_siblings(self):
618 self.assertSelectsIDs(self.start.find_next_siblings("span"),
619 ['2', '3', '4'])
620
621 self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])
622
623 def test_next_sibling_for_text_element(self):
624 soup = self.soup("Foo<b>bar</b>baz")
625 start = soup.find(text="Foo")
626 self.assertEqual(start.next_sibling.name, 'b')
627 self.assertEqual(start.next_sibling.next_sibling, 'baz')
628
629 self.assertSelects(start.find_next_siblings('b'), ['bar'])
630 self.assertEqual(start.find_next_sibling(text="baz"), "baz")
631 self.assertEqual(start.find_next_sibling(text="nonesuch"), None)
632
633
634class TestPreviousSibling(SiblingTest):
635
636 def setUp(self):
637 super(TestPreviousSibling, self).setUp()
638 self.end = self.tree.find(id="4")
639
640 def test_previous_sibling_of_root_is_none(self):
641 self.assertEqual(self.tree.previous_sibling, None)
642
643 def test_previous_sibling(self):
644 self.assertEqual(self.end.previous_sibling['id'], '3')
645 self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2')
646
647 # Note the difference between previous_sibling and previous_element.
648 self.assertEqual(self.end.previous_element['id'], '3.1')
649
650 def test_previous_sibling_may_not_exist(self):
651 self.assertEqual(self.tree.html.previous_sibling, None)
652
653 nested_span = self.tree.find(id="1.1")
654 self.assertEqual(nested_span.previous_sibling, None)
655
656 first_span = self.tree.find(id="1")
657 self.assertEqual(first_span.previous_sibling, None)
658
659 def test_find_previous_sibling(self):
660 self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')
661
662 def test_previous_siblings(self):
663 self.assertSelectsIDs(self.end.find_previous_siblings("span"),
664 ['3', '2', '1'])
665
666 self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])
667
668 def test_previous_sibling_for_text_element(self):
669 soup = self.soup("Foo<b>bar</b>baz")
670 start = soup.find(text="baz")
671 self.assertEqual(start.previous_sibling.name, 'b')
672 self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')
673
674 self.assertSelects(start.find_previous_siblings('b'), ['bar'])
675 self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
676 self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)
677
678
679class TestTagCreation(SoupTest):
680 """Test the ability to create new tags."""
681 def test_new_tag(self):
682 soup = self.soup("")
683 new_tag = soup.new_tag("foo", bar="baz")
684 self.assertTrue(isinstance(new_tag, Tag))
685 self.assertEqual("foo", new_tag.name)
686 self.assertEqual(dict(bar="baz"), new_tag.attrs)
687 self.assertEqual(None, new_tag.parent)
688
689 def test_tag_inherits_self_closing_rules_from_builder(self):
690 if XML_BUILDER_PRESENT:
691 xml_soup = BeautifulSoup("", "xml")
692 xml_br = xml_soup.new_tag("br")
693 xml_p = xml_soup.new_tag("p")
694
695 # Both the <br> and <p> tag are empty-element, just because
696 # they have no contents.
697 self.assertEqual(b"<br/>", xml_br.encode())
698 self.assertEqual(b"<p/>", xml_p.encode())
699
700 html_soup = BeautifulSoup("", "html")
701 html_br = html_soup.new_tag("br")
702 html_p = html_soup.new_tag("p")
703
704 # The HTML builder uses HTML's rules about which tags are
705 # empty-element tags, and the new tags reflect these rules.
706 self.assertEqual(b"<br/>", html_br.encode())
707 self.assertEqual(b"<p></p>", html_p.encode())
708
709 def test_new_string_creates_navigablestring(self):
710 soup = self.soup("")
711 s = soup.new_string("foo")
712 self.assertEqual("foo", s)
713 self.assertTrue(isinstance(s, NavigableString))
714
715 def test_new_string_can_create_navigablestring_subclass(self):
716 soup = self.soup("")
717 s = soup.new_string("foo", Comment)
718 self.assertEqual("foo", s)
719 self.assertTrue(isinstance(s, Comment))
720
721class TestTreeModification(SoupTest):
722
723 def test_attribute_modification(self):
724 soup = self.soup('<a id="1"></a>')
725 soup.a['id'] = 2
726 self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
727 del(soup.a['id'])
728 self.assertEqual(soup.decode(), self.document_for('<a></a>'))
729 soup.a['id2'] = 'foo'
730 self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
731
732 def test_new_tag_creation(self):
733 builder = builder_registry.lookup('html')()
734 soup = self.soup("<body></body>", builder=builder)
735 a = Tag(soup, builder, 'a')
736 ol = Tag(soup, builder, 'ol')
737 a['href'] = 'http://foo.com/'
738 soup.body.insert(0, a)
739 soup.body.insert(1, ol)
740 self.assertEqual(
741 soup.body.encode(),
742 b'<body><a href="http://foo.com/"></a><ol></ol></body>')
743
744 def test_append_to_contents_moves_tag(self):
745 doc = """<p id="1">Don't leave me <b>here</b>.</p>
746 <p id="2">Don\'t leave!</p>"""
747 soup = self.soup(doc)
748 second_para = soup.find(id='2')
749 bold = soup.b
750
751 # Move the <b> tag to the end of the second paragraph.
752 soup.find(id='2').append(soup.b)
753
754 # The <b> tag is now a child of the second paragraph.
755 self.assertEqual(bold.parent, second_para)
756
757 self.assertEqual(
758 soup.decode(), self.document_for(
759 '<p id="1">Don\'t leave me .</p>\n'
760 '<p id="2">Don\'t leave!<b>here</b></p>'))
761
762 def test_replace_with_returns_thing_that_was_replaced(self):
763 text = "<a></a><b><c></c></b>"
764 soup = self.soup(text)
765 a = soup.a
766 new_a = a.replace_with(soup.c)
767 self.assertEqual(a, new_a)
768
769 def test_unwrap_returns_thing_that_was_replaced(self):
770 text = "<a><b></b><c></c></a>"
771 soup = self.soup(text)
772 a = soup.a
773 new_a = a.unwrap()
774 self.assertEqual(a, new_a)
775
776 def test_replace_tag_with_itself(self):
777 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
778 soup = self.soup(text)
779 c = soup.c
780 soup.c.replace_with(c)
781 self.assertEqual(soup.decode(), self.document_for(text))
782
783 def test_replace_tag_with_its_parent_raises_exception(self):
784 text = "<a><b></b></a>"
785 soup = self.soup(text)
786 self.assertRaises(ValueError, soup.b.replace_with, soup.a)
787
788 def test_insert_tag_into_itself_raises_exception(self):
789 text = "<a><b></b></a>"
790 soup = self.soup(text)
791 self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
792
793 def test_replace_with_maintains_next_element_throughout(self):
794 soup = self.soup('<p><a>one</a><b>three</b></p>')
795 a = soup.a
796 b = a.contents[0]
797 # Make it so the <a> tag has two text children.
798 a.insert(1, "two")
799
800 # Now replace each one with the empty string.
801 left, right = a.contents
802 left.replaceWith('')
803 right.replaceWith('')
804
805 # The <b> tag is still connected to the tree.
806 self.assertEqual("three", soup.b.string)
807
808 def test_replace_final_node(self):
809 soup = self.soup("<b>Argh!</b>")
810 soup.find(text="Argh!").replace_with("Hooray!")
811 new_text = soup.find(text="Hooray!")
812 b = soup.b
813 self.assertEqual(new_text.previous_element, b)
814 self.assertEqual(new_text.parent, b)
815 self.assertEqual(new_text.previous_element.next_element, new_text)
816 self.assertEqual(new_text.next_element, None)
817
818 def test_consecutive_text_nodes(self):
819 # A builder should never create two consecutive text nodes,
820 # but if you insert one next to another, Beautiful Soup will
821 # handle it correctly.
822 soup = self.soup("<a><b>Argh!</b><c></c></a>")
823 soup.b.insert(1, "Hooray!")
824
825 self.assertEqual(
826 soup.decode(), self.document_for(
827 "<a><b>Argh!Hooray!</b><c></c></a>"))
828
829 new_text = soup.find(text="Hooray!")
830 self.assertEqual(new_text.previous_element, "Argh!")
831 self.assertEqual(new_text.previous_element.next_element, new_text)
832
833 self.assertEqual(new_text.previous_sibling, "Argh!")
834 self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
835
836 self.assertEqual(new_text.next_sibling, None)
837 self.assertEqual(new_text.next_element, soup.c)
838
839 def test_insert_string(self):
840 soup = self.soup("<a></a>")
841 soup.a.insert(0, "bar")
842 soup.a.insert(0, "foo")
843 # The strings were added to the tag.
844 self.assertEqual(["foo", "bar"], soup.a.contents)
845 # And they were converted to NavigableStrings.
846 self.assertEqual(soup.a.contents[0].next_element, "bar")
847
848 def test_insert_tag(self):
849 builder = self.default_builder
850 soup = self.soup(
851 "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
852 magic_tag = Tag(soup, builder, 'magictag')
853 magic_tag.insert(0, "the")
854 soup.a.insert(1, magic_tag)
855
856 self.assertEqual(
857 soup.decode(), self.document_for(
858 "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
859
860 # Make sure all the relationships are hooked up correctly.
861 b_tag = soup.b
862 self.assertEqual(b_tag.next_sibling, magic_tag)
863 self.assertEqual(magic_tag.previous_sibling, b_tag)
864
865 find = b_tag.find(text="Find")
866 self.assertEqual(find.next_element, magic_tag)
867 self.assertEqual(magic_tag.previous_element, find)
868
869 c_tag = soup.c
870 self.assertEqual(magic_tag.next_sibling, c_tag)
871 self.assertEqual(c_tag.previous_sibling, magic_tag)
872
873 the = magic_tag.find(text="the")
874 self.assertEqual(the.parent, magic_tag)
875 self.assertEqual(the.next_element, c_tag)
876 self.assertEqual(c_tag.previous_element, the)
877
878 def test_append_child_thats_already_at_the_end(self):
879 data = "<a><b></b></a>"
880 soup = self.soup(data)
881 soup.a.append(soup.b)
882 self.assertEqual(data, soup.decode())
883
884 def test_move_tag_to_beginning_of_parent(self):
885 data = "<a><b></b><c></c><d></d></a>"
886 soup = self.soup(data)
887 soup.a.insert(0, soup.d)
888 self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
889
890 def test_insert_works_on_empty_element_tag(self):
891 # This is a little strange, since most HTML parsers don't allow
892 # markup like this to come through. But in general, we don't
893 # know what the parser would or wouldn't have allowed, so
894 # I'm letting this succeed for now.
895 soup = self.soup("<br/>")
896 soup.br.insert(1, "Contents")
897 self.assertEqual(str(soup.br), "<br>Contents</br>")
898
899 def test_insert_before(self):
900 soup = self.soup("<a>foo</a><b>bar</b>")
901 soup.b.insert_before("BAZ")
902 soup.a.insert_before("QUUX")
903 self.assertEqual(
904 soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))
905
906 soup.a.insert_before(soup.b)
907 self.assertEqual(
908 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
909
910 def test_insert_after(self):
911 soup = self.soup("<a>foo</a><b>bar</b>")
912 soup.b.insert_after("BAZ")
913 soup.a.insert_after("QUUX")
914 self.assertEqual(
915 soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
916 soup.b.insert_after(soup.a)
917 self.assertEqual(
918 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
919
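
For reference, a minimal sketch of the insert_before()/insert_after() behaviour covered by these tests, assuming the stock html.parser (which adds no <html>/<body> wrapper):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>foo</a><b>bar</b>", "html.parser")
    soup.b.insert_after("BAZ")      # plain strings become NavigableStrings
    soup.a.insert_before("QUUX")
    soup.b.insert_after(soup.a)     # a tag already in the tree is moved, not copied
    soup.decode()                   # 'QUUX<b>bar</b><a>foo</a>BAZ'
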
920 def test_insert_after_raises_exception_if_after_has_no_meaning(self):
921 soup = self.soup("")
922 tag = soup.new_tag("a")
923 string = soup.new_string("")
924 self.assertRaises(ValueError, string.insert_after, tag)
925 self.assertRaises(NotImplementedError, soup.insert_after, tag)
926 self.assertRaises(ValueError, tag.insert_after, tag)
927
928 def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
929 soup = self.soup("")
930 tag = soup.new_tag("a")
931 string = soup.new_string("")
932 self.assertRaises(ValueError, string.insert_before, tag)
933 self.assertRaises(NotImplementedError, soup.insert_before, tag)
934 self.assertRaises(ValueError, tag.insert_before, tag)
935
936 def test_replace_with(self):
937 soup = self.soup(
938 "<p>There's <b>no</b> business like <b>show</b> business</p>")
939 no, show = soup.find_all('b')
940 show.replace_with(no)
941 self.assertEqual(
942 soup.decode(),
943 self.document_for(
944 "<p>There's business like <b>no</b> business</p>"))
945
946 self.assertEqual(show.parent, None)
947 self.assertEqual(no.parent, soup.p)
948 self.assertEqual(no.next_element, "no")
949 self.assertEqual(no.next_sibling, " business")
950
951 def test_replace_first_child(self):
952 data = "<a><b></b><c></c></a>"
953 soup = self.soup(data)
954 soup.b.replace_with(soup.c)
955 self.assertEqual("<a><c></c></a>", soup.decode())
956
957 def test_replace_last_child(self):
958 data = "<a><b></b><c></c></a>"
959 soup = self.soup(data)
960 soup.c.replace_with(soup.b)
961 self.assertEqual("<a><b></b></a>", soup.decode())
962
963 def test_nested_tag_replace_with(self):
964 soup = self.soup(
965 """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
966
967 # Replace the entire <b> tag and its contents ("reserve the
968 # right") with the <f> tag ("refuse").
969 remove_tag = soup.b
970 move_tag = soup.f
971 remove_tag.replace_with(move_tag)
972
973 self.assertEqual(
974 soup.decode(), self.document_for(
975 "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))
976
977 # The <b> tag is now an orphan.
978 self.assertEqual(remove_tag.parent, None)
979 self.assertEqual(remove_tag.find(text="right").next_element, None)
980 self.assertEqual(remove_tag.previous_element, None)
981 self.assertEqual(remove_tag.next_sibling, None)
982 self.assertEqual(remove_tag.previous_sibling, None)
983
984 # The <f> tag is now connected to the <a> tag.
985 self.assertEqual(move_tag.parent, soup.a)
986 self.assertEqual(move_tag.previous_element, "We")
987 self.assertEqual(move_tag.next_element.next_element, soup.e)
988 self.assertEqual(move_tag.next_sibling, None)
989
990 # The gap where the <f> tag used to be has been mended, and
991 # the word "to" is now connected to the <g> tag.
992 to_text = soup.find(text="to")
993 g_tag = soup.g
994 self.assertEqual(to_text.next_element, g_tag)
995 self.assertEqual(to_text.next_sibling, g_tag)
996 self.assertEqual(g_tag.previous_element, to_text)
997 self.assertEqual(g_tag.previous_sibling, to_text)
998
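
A minimal sketch of the replace_with() semantics the test above relies on (illustrative only, assuming the stock html.parser):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>We<b>reserve</b></a><e>to<f>refuse</f></e>", "html.parser")
    old = soup.b.replace_with(soup.f)   # returns the element that was replaced
    soup.decode()                       # '<a>We<f>refuse</f></a><e>to</e>'
    old.parent                          # None -- the replaced <b> tag is an orphan
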
999 def test_unwrap(self):
1000 tree = self.soup("""
1001 <p>Unneeded <em>formatting</em> is unneeded</p>
1002 """)
1003 tree.em.unwrap()
1004 self.assertEqual(tree.em, None)
1005 self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
1006
1007 def test_wrap(self):
1008 soup = self.soup("I wish I was bold.")
1009 value = soup.string.wrap(soup.new_tag("b"))
1010 self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
1011 self.assertEqual(
1012 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1013
1014 def test_wrap_extracts_tag_from_elsewhere(self):
1015 soup = self.soup("<b></b>I wish I was bold.")
1016 soup.b.next_sibling.wrap(soup.b)
1017 self.assertEqual(
1018 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1019
1020 def test_wrap_puts_new_contents_at_the_end(self):
1021 soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
1022 soup.b.next_sibling.wrap(soup.b)
1023 self.assertEqual(2, len(soup.b.contents))
1024 self.assertEqual(
1025 soup.decode(), self.document_for(
1026 "<b>I like being bold.I wish I was bold.</b>"))
1027
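
A minimal sketch of wrap() and its inverse unwrap(), assuming the stock html.parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("I wish I was bold.", "html.parser")
    soup.string.wrap(soup.new_tag("b"))   # returns the wrapping tag
    soup.decode()                         # '<b>I wish I was bold.</b>'
    soup.b.unwrap()                       # replaces the tag with its contents
    soup.decode()                         # 'I wish I was bold.'
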
1028 def test_extract(self):
1029 soup = self.soup(
1030 '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
1031
1032 self.assertEqual(len(soup.body.contents), 3)
1033 extracted = soup.find(id="nav").extract()
1034
1035 self.assertEqual(
1036 soup.decode(), "<html><body>Some content. More content.</body></html>")
1037 self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
1038
1039 # The extracted tag is now an orphan.
1040 self.assertEqual(len(soup.body.contents), 2)
1041 self.assertEqual(extracted.parent, None)
1042 self.assertEqual(extracted.previous_element, None)
1043 self.assertEqual(extracted.next_element.next_element, None)
1044
1045 # The gap where the extracted tag used to be has been mended.
1046 content_1 = soup.find(text="Some content. ")
1047 content_2 = soup.find(text=" More content.")
1048 self.assertEqual(content_1.next_element, content_2)
1049 self.assertEqual(content_1.next_sibling, content_2)
1050 self.assertEqual(content_2.previous_element, content_1)
1051 self.assertEqual(content_2.previous_sibling, content_1)
1052
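
The extract() behaviour asserted above, as a minimal standalone sketch (assuming the stock html.parser):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<body>Some content. <div id="nav">Nav</div> More.</body>', "html.parser")
    nav = soup.find(id="nav").extract()            # removes the tag and returns it
    nav.parent                                     # None -- the extracted tag is an orphan
    soup.find(text="Some content. ").next_sibling  # ' More.' -- the gap is mended
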
1053 def test_extract_distinguishes_between_identical_strings(self):
1054 soup = self.soup("<a>foo</a><b>bar</b>")
1055 foo_1 = soup.a.string
1056 bar_1 = soup.b.string
1057 foo_2 = soup.new_string("foo")
1058 bar_2 = soup.new_string("bar")
1059 soup.a.append(foo_2)
1060 soup.b.append(bar_2)
1061
1062 # Now there are two identical strings in the <a> tag, and two
1063 # in the <b> tag. Let's remove the first "foo" and the second
1064 # "bar".
1065 foo_1.extract()
1066 bar_2.extract()
1067 self.assertEqual(foo_2, soup.a.string)
1068 self.assertEqual(bar_2, soup.b.string)
1069
1070 def test_clear(self):
1071 """Tag.clear()"""
1072 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
1073 # clear using extract()
1074 a = soup.a
1075 soup.p.clear()
1076 self.assertEqual(len(soup.p.contents), 0)
1077 self.assertTrue(hasattr(a, "contents"))
1078
1079 # clear using decompose()
1080 em = a.em
1081 a.clear(decompose=True)
1082 self.assertEqual(0, len(em.contents))
1083
1084 def test_string_set(self):
1085 """Tag.string = 'string'"""
1086 soup = self.soup("<a></a> <b><c></c></b>")
1087 soup.a.string = "foo"
1088 self.assertEqual(soup.a.contents, ["foo"])
1089 soup.b.string = "bar"
1090 self.assertEqual(soup.b.contents, ["bar"])
1091
1092 def test_string_set_does_not_affect_original_string(self):
1093 soup = self.soup("<a><b>foo</b><c>bar</c>")
1094 soup.b.string = soup.c.string
1095 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
1096
1097 def test_set_string_preserves_class_of_string(self):
1098 soup = self.soup("<a></a>")
1099 cdata = CData("foo")
1100 soup.a.string = cdata
1101 self.assertTrue(isinstance(soup.a.string, CData))
1102
1103class TestElementObjects(SoupTest):
1104 """Test various features of element objects."""
1105
1106 def test_len(self):
1107 """The length of an element is its number of children."""
1108 soup = self.soup("<top>1<b>2</b>3</top>")
1109
1110 # The BeautifulSoup object itself contains one element: the
1111 # <top> tag.
1112 self.assertEqual(len(soup.contents), 1)
1113 self.assertEqual(len(soup), 1)
1114
1115 # The <top> tag contains three elements: the text node "1", the
1116 # <b> tag, and the text node "3".
1117 self.assertEqual(len(soup.top), 3)
1118 self.assertEqual(len(soup.top.contents), 3)
1119
1120 def test_member_access_invokes_find(self):
1121 """Accessing a Python member .foo invokes find('foo')"""
1122 soup = self.soup('<b><i></i></b>')
1123 self.assertEqual(soup.b, soup.find('b'))
1124 self.assertEqual(soup.b.i, soup.find('b').find('i'))
1125 self.assertEqual(soup.a, None)
1126
1127 def test_deprecated_member_access(self):
1128 soup = self.soup('<b><i></i></b>')
1129 with warnings.catch_warnings(record=True) as w:
1130 tag = soup.bTag
1131 self.assertEqual(soup.b, tag)
1132 self.assertEqual(
1133 '.bTag is deprecated, use .find("b") instead.',
1134 str(w[0].message))
1135
1136 def test_has_attr(self):
1137 """has_attr() checks for the presence of an attribute.
1138
1139 Please note: has_attr() is different from
1140 __in__. has_attr() checks the tag's attributes and __in__
1141 checks the tag's children.
1142 """
1143 soup = self.soup("<foo attr='bar'>")
1144 self.assertTrue(soup.foo.has_attr('attr'))
1145 self.assertFalse(soup.foo.has_attr('attr2'))
1146
1147
1148 def test_attributes_come_out_in_alphabetical_order(self):
1149 markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
1150 self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
1151
1152 def test_string(self):
1153 # A tag that contains only a text node makes that node
1154 # available as .string.
1155 soup = self.soup("<b>foo</b>")
1156 self.assertEqual(soup.b.string, 'foo')
1157
1158 def test_empty_tag_has_no_string(self):
1159 # A tag with no children has no .string.
1160 soup = self.soup("<b></b>")
1161 self.assertEqual(soup.b.string, None)
1162
1163 def test_tag_with_multiple_children_has_no_string(self):
1164 # A tag with multiple children has no .string.
1165 soup = self.soup("<a>foo<b></b><b></b></b>")
1166 self.assertEqual(soup.b.string, None)
1167
1168 soup = self.soup("<a>foo<b></b>bar</b>")
1169 self.assertEqual(soup.b.string, None)
1170
1171 # Even if all the children are strings, due to trickery,
1172 # it won't work--but this would be a good optimization.
1173 soup = self.soup("<a>foo</b>")
1174 soup.a.insert(1, "bar")
1175 self.assertEqual(soup.a.string, None)
1176
1177 def test_tag_with_recursive_string_has_string(self):
1178 # A tag with a single child which has a .string inherits that
1179 # .string.
1180 soup = self.soup("<a><b>foo</b></a>")
1181 self.assertEqual(soup.a.string, "foo")
1182 self.assertEqual(soup.string, "foo")
1183
1184 def test_lack_of_string(self):
1185 """Only a tag containing a single text node has a .string."""
1186 soup = self.soup("<b>f<i>e</i>o</b>")
1187 self.assertFalse(soup.b.string)
1188
1189 soup = self.soup("<b></b>")
1190 self.assertFalse(soup.b.string)
1191
1192 def test_all_text(self):
1193 """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
1194 soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
1195 self.assertEqual(soup.a.text, "ar t ")
1196 self.assertEqual(soup.a.get_text(strip=True), "art")
1197 self.assertEqual(soup.a.get_text(","), "a,r, , t ")
1198 self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
1199
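
A minimal sketch of .text versus get_text() with a separator and strip=True (illustrative, assuming the stock html.parser):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>a<b>r</b> <r> t </r></a>", "html.parser")
    soup.a.text                       # 'ar t '     -- every descendant string, concatenated
    soup.a.get_text(",")              # 'a,r, , t ' -- separator inserted between strings
    soup.a.get_text(",", strip=True)  # 'a,r,t'     -- strings stripped, empty ones dropped
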
1200 def test_get_text_ignores_comments(self):
1201 soup = self.soup("foo<!--IGNORE-->bar")
1202 self.assertEqual(soup.get_text(), "foobar")
1203
1204 self.assertEqual(
1205 soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
1206 self.assertEqual(
1207 soup.get_text(types=None), "fooIGNOREbar")
1208
1209 def test_all_strings_ignores_comments(self):
1210 soup = self.soup("foo<!--IGNORE-->bar")
1211 self.assertEqual(['foo', 'bar'], list(soup.strings))
1212
1213class TestCDAtaListAttributes(SoupTest):
1214
1215 """Testing cdata-list attributes like 'class'.
1216 """
1217 def test_single_value_becomes_list(self):
1218 soup = self.soup("<a class='foo'>")
1219 self.assertEqual(["foo"], soup.a['class'])
1220
1221 def test_multiple_values_becomes_list(self):
1222 soup = self.soup("<a class='foo bar'>")
1223 self.assertEqual(["foo", "bar"], soup.a['class'])
1224
1225 def test_multiple_values_separated_by_weird_whitespace(self):
1226 soup = self.soup("<a class='foo\tbar\nbaz'>")
1227 self.assertEqual(["foo", "bar", "baz"], soup.a['class'])
1228
1229 def test_attributes_joined_into_string_on_output(self):
1230 soup = self.soup("<a class='foo\tbar'>")
1231 self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
1232
1233 def test_accept_charset(self):
1234 soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
1235 self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
1236
1237 def test_cdata_attribute_applying_only_to_one_tag(self):
1238 data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
1239 soup = self.soup(data)
1240 # We saw in another test that accept-charset is a cdata-list
1241 # attribute for the <form> tag. But it's not a cdata-list
1242 # attribute for any other tag.
1243 self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
1244
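
A minimal sketch of how cdata-list attributes behave, assuming the stock html.parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a class="foo bar" rel="nofollow me">', "html.parser")
    soup.a['class']           # ['foo', 'bar'] -- 'class' is a cdata-list attribute on any tag
    soup.a['rel']             # ['nofollow', 'me'] -- 'rel' is one on <a>
    soup = BeautifulSoup('<a accept-charset="ISO-8859-1 UTF-8">', "html.parser")
    soup.a['accept-charset']  # 'ISO-8859-1 UTF-8' -- a plain string; only <form> splits it
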
1245 def test_string_has_immutable_name_property(self):
1246 string = self.soup("s").string
1247 self.assertEqual(None, string.name)
1248 def t():
1249 string.name = 'foo'
1250 self.assertRaises(AttributeError, t)
1251
1252class TestPersistence(SoupTest):
1253 "Testing features like pickle and deepcopy."
1254
1255 def setUp(self):
1256 super(TestPersistence, self).setUp()
1257 self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
1258"http://www.w3.org/TR/REC-html40/transitional.dtd">
1259<html>
1260<head>
1261<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
1262<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
1263<link rev="made" href="mailto:leonardr@segfault.org">
1264<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
1265<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
1266<meta name="author" content="Leonard Richardson">
1267</head>
1268<body>
1269<a href="foo">foo</a>
1270<a href="foo"><b>bar</b></a>
1271</body>
1272</html>"""
1273 self.tree = self.soup(self.page)
1274
1275 def test_pickle_and_unpickle_identity(self):
1276 # Pickling a tree, then unpickling it, yields a tree identical
1277 # to the original.
1278 dumped = pickle.dumps(self.tree, 2)
1279 loaded = pickle.loads(dumped)
1280 self.assertEqual(loaded.__class__, BeautifulSoup)
1281 self.assertEqual(loaded.decode(), self.tree.decode())
1282
1283 def test_deepcopy_identity(self):
1284 # Making a deepcopy of a tree yields an identical tree.
1285 copied = copy.deepcopy(self.tree)
1286 self.assertEqual(copied.decode(), self.tree.decode())
1287
1288 def test_unicode_pickle(self):
1289 # A tree containing Unicode characters can be pickled.
1290 html = u"<b>\N{SNOWMAN}</b>"
1291 soup = self.soup(html)
1292 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
1293 loaded = pickle.loads(dumped)
1294 self.assertEqual(loaded.decode(), soup.decode())
1295
1296
1297class TestSubstitutions(SoupTest):
1298
1299 def test_default_formatter_is_minimal(self):
1300 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1301 soup = self.soup(markup)
1302 decoded = soup.decode(formatter="minimal")
1303 # The < is converted back into &lt; but the e-with-acute is left alone.
1304 self.assertEqual(
1305 decoded,
1306 self.document_for(
1307 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1308
1309 def test_formatter_html(self):
1310 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1311 soup = self.soup(markup)
1312 decoded = soup.decode(formatter="html")
1313 self.assertEqual(
1314 decoded,
1315 self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
1316
1317 def test_formatter_minimal(self):
1318 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1319 soup = self.soup(markup)
1320 decoded = soup.decode(formatter="minimal")
1321 # The < is converted back into &lt; but the e-with-acute is left alone.
1322 self.assertEqual(
1323 decoded,
1324 self.document_for(
1325 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1326
1327 def test_formatter_null(self):
1328 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1329 soup = self.soup(markup)
1330 decoded = soup.decode(formatter=None)
1331 # Neither the angle brackets nor the e-with-acute are converted.
1332 # This is not valid HTML, but it's what the user wanted.
1333 self.assertEqual(decoded,
1334 self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
1335
1336 def test_formatter_custom(self):
1337 markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
1338 soup = self.soup(markup)
1339 decoded = soup.decode(formatter = lambda x: x.upper())
1340 # Instead of normal entity conversion code, the custom
1341 # callable is called on every string.
1342 self.assertEqual(
1343 decoded,
1344 self.document_for(u"<b><FOO></b><b>BAR</b>"))
1345
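
A minimal sketch of the built-in and custom formatters exercised above (illustrative only, assuming the stock html.parser):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>", "html.parser")
    soup.decode(formatter="minimal")            # only &, <, > escaped (the default)
    soup.decode(formatter="html")               # named entities: '<b>Sacr&eacute; bleu!</b>'
    soup.decode(formatter=None)                 # no escaping at all
    soup.decode(formatter=lambda s: s.upper())  # the callable runs on every string
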
1346 def test_formatter_is_run_on_attribute_values(self):
1347 markup = u'<a href="http://a.com?a=b&c=é">e</a>'
1348 soup = self.soup(markup)
1349 a = soup.a
1350
1351 expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
1352
1353 self.assertEqual(expect_minimal, a.decode())
1354 self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
1355
1356 expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
1357 self.assertEqual(expect_html, a.decode(formatter="html"))
1358
1359 self.assertEqual(markup, a.decode(formatter=None))
1360 expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
1361 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
1362
1363 def test_formatter_skips_script_tag_for_html_documents(self):
1364 doc = """
1365 <script type="text/javascript">
1366 console.log("< < hey > > ");
1367 </script>
1368"""
1369 encoded = BeautifulSoup(doc).encode()
1370 self.assertTrue(b"< < hey > >" in encoded)
1371
1372 def test_formatter_skips_style_tag_for_html_documents(self):
1373 doc = """
1374 <style type="text/css">
1375 console.log("< < hey > > ");
1376 </style>
1377"""
1378 encoded = BeautifulSoup(doc).encode()
1379 self.assertTrue(b"< < hey > >" in encoded)
1380
1381 def test_prettify_leaves_preformatted_text_alone(self):
1382 soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
1383 # Everything outside the <pre> tag is reformatted, but everything
1384 # inside is left alone.
1385 self.assertEqual(
1386 u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
1387 soup.div.prettify())
1388
1389 def test_prettify_accepts_formatter(self):
1390 soup = BeautifulSoup("<html><body>foo</body></html>")
1391 pretty = soup.prettify(formatter = lambda x: x.upper())
1392 self.assertTrue("FOO" in pretty)
1393
1394 def test_prettify_outputs_unicode_by_default(self):
1395 soup = self.soup("<a></a>")
1396 self.assertEqual(unicode, type(soup.prettify()))
1397
1398 def test_prettify_can_encode_data(self):
1399 soup = self.soup("<a></a>")
1400 self.assertEqual(bytes, type(soup.prettify("utf-8")))
1401
1402 def test_html_entity_substitution_off_by_default(self):
1403 markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
1404 soup = self.soup(markup)
1405 encoded = soup.b.encode("utf-8")
1406 self.assertEqual(encoded, markup.encode('utf-8'))
1407
1408 def test_encoding_substitution(self):
1409 # Here's the <meta> tag saying that a document is
1410 # encoded in Shift-JIS.
1411 meta_tag = ('<meta content="text/html; charset=x-sjis" '
1412 'http-equiv="Content-type"/>')
1413 soup = self.soup(meta_tag)
1414
1415 # Parse the document, and the charset appears unchanged.
1416 self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
1417
1418 # Encode the document into some encoding, and the encoding is
1419 # substituted into the meta tag.
1420 utf_8 = soup.encode("utf-8")
1421 self.assertTrue(b"charset=utf-8" in utf_8)
1422
1423 euc_jp = soup.encode("euc_jp")
1424 self.assertTrue(b"charset=euc_jp" in euc_jp)
1425
1426 shift_jis = soup.encode("shift-jis")
1427 self.assertTrue(b"charset=shift-jis" in shift_jis)
1428
1429 utf_16_u = soup.encode("utf-16").decode("utf-16")
1430 self.assertTrue("charset=utf-16" in utf_16_u)
1431
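
The charset rewriting asserted above, as a minimal standalone sketch (assuming the stock html.parser):

    from bs4 import BeautifulSoup

    meta = '<meta content="text/html; charset=x-sjis" http-equiv="Content-type"/>'
    soup = BeautifulSoup(meta, "html.parser")
    soup.meta['content']   # 'text/html; charset=x-sjis' -- unchanged after parsing
    soup.encode("utf-8")   # b'...charset=utf-8...' -- rewritten to the output encoding
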
1432 def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
1433 markup = ('<head><meta content="text/html; charset=x-sjis" '
1434 'http-equiv="Content-type"/></head><pre>foo</pre>')
1435
1436 # Beautiful Soup used to try to rewrite the meta tag even if the
1437 # meta tag got filtered out by the strainer. This test makes
1438 # sure that doesn't happen.
1439 strainer = SoupStrainer('pre')
1440 soup = self.soup(markup, parse_only=strainer)
1441 self.assertEqual(soup.contents[0].name, 'pre')
1442
1443class TestEncoding(SoupTest):
1444 """Test the ability to encode objects into strings."""
1445
1446 def test_unicode_string_can_be_encoded(self):
1447 html = u"<b>\N{SNOWMAN}</b>"
1448 soup = self.soup(html)
1449 self.assertEqual(soup.b.string.encode("utf-8"),
1450 u"\N{SNOWMAN}".encode("utf-8"))
1451
1452 def test_tag_containing_unicode_string_can_be_encoded(self):
1453 html = u"<b>\N{SNOWMAN}</b>"
1454 soup = self.soup(html)
1455 self.assertEqual(
1456 soup.b.encode("utf-8"), html.encode("utf-8"))
1457
1458 def test_encoding_substitutes_unrecognized_characters_by_default(self):
1459 html = u"<b>\N{SNOWMAN}</b>"
1460 soup = self.soup(html)
1461 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
1462
1463 def test_encoding_can_be_made_strict(self):
1464 html = u"<b>\N{SNOWMAN}</b>"
1465 soup = self.soup(html)
1466 self.assertRaises(
1467 UnicodeEncodeError, soup.encode, "ascii", errors="strict")
1468
1469 def test_decode_contents(self):
1470 html = u"<b>\N{SNOWMAN}</b>"
1471 soup = self.soup(html)
1472 self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
1473
1474 def test_encode_contents(self):
1475 html = u"<b>\N{SNOWMAN}</b>"
1476 soup = self.soup(html)
1477 self.assertEqual(
1478 u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
1479 encoding="utf8"))
1480
1481 def test_deprecated_renderContents(self):
1482 html = u"<b>\N{SNOWMAN}</b>"
1483 soup = self.soup(html)
1484 self.assertEqual(
1485 u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
1486
1487class TestNavigableStringSubclasses(SoupTest):
1488
1489 def test_cdata(self):
1490 # None of the current builders turn CDATA sections into CData
1491 # objects, but you can create them manually.
1492 soup = self.soup("")
1493 cdata = CData("foo")
1494 soup.insert(1, cdata)
1495 self.assertEqual(str(soup), "<![CDATA[foo]]>")
1496 self.assertEqual(soup.find(text="foo"), "foo")
1497 self.assertEqual(soup.contents[0], "foo")
1498
1499 def test_cdata_is_never_formatted(self):
1500 """Text inside a CData object is passed into the formatter.
1501
1502 But the return value is ignored.
1503 """
1504
1505 self.count = 0
1506 def increment(*args):
1507 self.count += 1
1508 return "BITTER FAILURE"
1509
1510 soup = self.soup("")
1511 cdata = CData("<><><>")
1512 soup.insert(1, cdata)
1513 self.assertEqual(
1514 b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
1515 self.assertEqual(1, self.count)
1516
1517 def test_doctype_ends_in_newline(self):
1518 # Unlike other NavigableString subclasses, a DOCTYPE always ends
1519 # in a newline.
1520 doctype = Doctype("foo")
1521 soup = self.soup("")
1522 soup.insert(1, doctype)
1523 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
1524
1525
1526class TestSoupSelector(TreeTest):
1527
1528 HTML = """
1529<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
1530"http://www.w3.org/TR/html4/strict.dtd">
1531<html>
1532<head>
1533<title>The title</title>
1534<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
1535</head>
1536<body>
1537
1538<div id="main" class="fancy">
1539<div id="inner">
1540<h1 id="header1">An H1</h1>
1541<p>Some text</p>
1542<p class="onep" id="p1">Some more text</p>
1543<h2 id="header2">An H2</h2>
1544<p class="class1 class2 class3" id="pmulti">Another</p>
1545<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
1546<h2 id="header3">Another H2</h2>
1547<a id="me" href="http://simonwillison.net/" rel="me">me</a>
1548<span class="s1">
1549<a href="#" id="s1a1">span1a1</a>
1550<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
1551<span class="span2">
1552<a href="#" id="s2a1">span2a1</a>
1553</span>
1554<span class="span3"></span>
1555</span>
1556</div>
1557<p lang="en" id="lang-en">English</p>
1558<p lang="en-gb" id="lang-en-gb">English UK</p>
1559<p lang="en-us" id="lang-en-us">English US</p>
1560<p lang="fr" id="lang-fr">French</p>
1561</div>
1562
1563<div id="footer">
1564</div>
1565"""
1566
1567 def setUp(self):
1568 self.soup = BeautifulSoup(self.HTML)
1569
1570 def assertSelects(self, selector, expected_ids):
1571 el_ids = [el['id'] for el in self.soup.select(selector)]
1572 el_ids.sort()
1573 expected_ids.sort()
1574 self.assertEqual(expected_ids, el_ids,
1575 "Selector %s, expected [%s], got [%s]" % (
1576 selector, ', '.join(expected_ids), ', '.join(el_ids)
1577 )
1578 )
1579
1580 assertSelect = assertSelects
1581
1582 def assertSelectMultiple(self, *tests):
1583 for selector, expected_ids in tests:
1584 self.assertSelect(selector, expected_ids)
1585
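
For orientation, a minimal sketch of the select() calls these helpers drive, assuming the stock html.parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div id="main"><p class="onep" id="p1">Hi</p></div>', "html.parser")
    [el['id'] for el in soup.select('div p.onep')]        # ['p1']
    [el['id'] for el in soup.select('p[class~="onep"]')]  # ['p1']
    soup.select('#doesnotexist')                          # []
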
1586 def test_one_tag_one(self):
1587 els = self.soup.select('title')
1588 self.assertEqual(len(els), 1)
1589 self.assertEqual(els[0].name, 'title')
1590 self.assertEqual(els[0].contents, [u'The title'])
1591
1592 def test_one_tag_many(self):
1593 els = self.soup.select('div')
1594 self.assertEqual(len(els), 3)
1595 for div in els:
1596 self.assertEqual(div.name, 'div')
1597
1598 def test_tag_in_tag_one(self):
1599 els = self.soup.select('div div')
1600 self.assertSelects('div div', ['inner'])
1601
1602 def test_tag_in_tag_many(self):
1603 for selector in ('html div', 'html body div', 'body div'):
1604 self.assertSelects(selector, ['main', 'inner', 'footer'])
1605
1606 def test_tag_no_match(self):
1607 self.assertEqual(len(self.soup.select('del')), 0)
1608
1609 def test_invalid_tag(self):
1610 self.assertRaises(ValueError, self.soup.select, 'tag%t')
1611
1612 def test_header_tags(self):
1613 self.assertSelectMultiple(
1614 ('h1', ['header1']),
1615 ('h2', ['header2', 'header3']),
1616 )
1617
1618 def test_class_one(self):
1619 for selector in ('.onep', 'p.onep', 'html p.onep'):
1620 els = self.soup.select(selector)
1621 self.assertEqual(len(els), 1)
1622 self.assertEqual(els[0].name, 'p')
1623 self.assertEqual(els[0]['class'], ['onep'])
1624
1625 def test_class_mismatched_tag(self):
1626 els = self.soup.select('div.onep')
1627 self.assertEqual(len(els), 0)
1628
1629 def test_one_id(self):
1630 for selector in ('div#inner', '#inner', 'div div#inner'):
1631 self.assertSelects(selector, ['inner'])
1632
1633 def test_bad_id(self):
1634 els = self.soup.select('#doesnotexist')
1635 self.assertEqual(len(els), 0)
1636
1637 def test_items_in_id(self):
1638 els = self.soup.select('div#inner p')
1639 self.assertEqual(len(els), 3)
1640 for el in els:
1641 self.assertEqual(el.name, 'p')
1642 self.assertEqual(els[1]['class'], ['onep'])
1643 self.assertFalse(els[0].has_attr('class'))
1644
1645 def test_a_bunch_of_emptys(self):
1646 for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
1647 self.assertEqual(len(self.soup.select(selector)), 0)
1648
1649 def test_multi_class_support(self):
1650 for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
1651 '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
1652 self.assertSelects(selector, ['pmulti'])
1653
1654 def test_multi_class_selection(self):
1655 for selector in ('.class1.class3', '.class3.class2',
1656 '.class1.class2.class3'):
1657 self.assertSelects(selector, ['pmulti'])
1658
1659 def test_child_selector(self):
1660 self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
1661 self.assertSelects('.s1 > a span', ['s1a2s1'])
1662
1663 def test_child_selector_id(self):
1664 self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
1665
1666 def test_attribute_equals(self):
1667 self.assertSelectMultiple(
1668 ('p[class="onep"]', ['p1']),
1669 ('p[id="p1"]', ['p1']),
1670 ('[class="onep"]', ['p1']),
1671 ('[id="p1"]', ['p1']),
1672 ('link[rel="stylesheet"]', ['l1']),
1673 ('link[type="text/css"]', ['l1']),
1674 ('link[href="blah.css"]', ['l1']),
1675 ('link[href="no-blah.css"]', []),
1676 ('[rel="stylesheet"]', ['l1']),
1677 ('[type="text/css"]', ['l1']),
1678 ('[href="blah.css"]', ['l1']),
1679 ('[href="no-blah.css"]', []),
1680 ('p[href="no-blah.css"]', []),
1681 ('[href="no-blah.css"]', []),
1682 )
1683
1684 def test_attribute_tilde(self):
1685 self.assertSelectMultiple(
1686 ('p[class~="class1"]', ['pmulti']),
1687 ('p[class~="class2"]', ['pmulti']),
1688 ('p[class~="class3"]', ['pmulti']),
1689 ('[class~="class1"]', ['pmulti']),
1690 ('[class~="class2"]', ['pmulti']),
1691 ('[class~="class3"]', ['pmulti']),
1692 ('a[rel~="friend"]', ['bob']),
1693 ('a[rel~="met"]', ['bob']),
1694 ('[rel~="friend"]', ['bob']),
1695 ('[rel~="met"]', ['bob']),
1696 )
1697
1698 def test_attribute_startswith(self):
1699 self.assertSelectMultiple(
1700 ('[rel^="style"]', ['l1']),
1701 ('link[rel^="style"]', ['l1']),
1702 ('notlink[rel^="notstyle"]', []),
1703 ('[rel^="notstyle"]', []),
1704 ('link[rel^="notstyle"]', []),
1705 ('link[href^="bla"]', ['l1']),
1706 ('a[href^="http://"]', ['bob', 'me']),
1707 ('[href^="http://"]', ['bob', 'me']),
1708 ('[id^="p"]', ['pmulti', 'p1']),
1709 ('[id^="m"]', ['me', 'main']),
1710 ('div[id^="m"]', ['main']),
1711 ('a[id^="m"]', ['me']),
1712 )
1713
1714 def test_attribute_endswith(self):
1715 self.assertSelectMultiple(
1716 ('[href$=".css"]', ['l1']),
1717 ('link[href$=".css"]', ['l1']),
1718 ('link[id$="1"]', ['l1']),
1719 ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
1720 ('div[id$="1"]', []),
1721 ('[id$="noending"]', []),
1722 )
1723
1724 def test_attribute_contains(self):
1725 self.assertSelectMultiple(
1726 # From test_attribute_startswith
1727 ('[rel*="style"]', ['l1']),
1728 ('link[rel*="style"]', ['l1']),
1729 ('notlink[rel*="notstyle"]', []),
1730 ('[rel*="notstyle"]', []),
1731 ('link[rel*="notstyle"]', []),
1732 ('link[href*="bla"]', ['l1']),
1733 ('a[href*="http://"]', ['bob', 'me']),
1734 ('[href*="http://"]', ['bob', 'me']),
1735 ('[id*="p"]', ['pmulti', 'p1']),
1736 ('div[id*="m"]', ['main']),
1737 ('a[id*="m"]', ['me']),
1738 # From test_attribute_endswith
1739 ('[href*=".css"]', ['l1']),
1740 ('link[href*=".css"]', ['l1']),
1741 ('link[id*="1"]', ['l1']),
1742 ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
1743 ('div[id*="1"]', []),
1744 ('[id*="noending"]', []),
1745 # New for this test
1746 ('[href*="."]', ['bob', 'me', 'l1']),
1747 ('a[href*="."]', ['bob', 'me']),
1748 ('link[href*="."]', ['l1']),
1749 ('div[id*="n"]', ['main', 'inner']),
1750 ('div[id*="nn"]', ['inner']),
1751 )
1752
1753 def test_attribute_exact_or_hypen(self):
1754 self.assertSelectMultiple(
1755 ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
1756 ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
1757 ('p[lang|="fr"]', ['lang-fr']),
1758 ('p[lang|="gb"]', []),
1759 )
1760
1761 def test_attribute_exists(self):
1762 self.assertSelectMultiple(
1763 ('[rel]', ['l1', 'bob', 'me']),
1764 ('link[rel]', ['l1']),
1765 ('a[rel]', ['bob', 'me']),
1766 ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
1767 ('p[class]', ['p1', 'pmulti']),
1768 ('[blah]', []),
1769 ('p[blah]', []),
1770 )
1771
1772 def test_nth_of_type(self):
1773 # Try to select first paragraph
1774 els = self.soup.select('div#inner p:nth-of-type(1)')
1775 self.assertEqual(len(els), 1)
1776 self.assertEqual(els[0].string, u'Some text')
1777
1778 # Try to select third paragraph
1779 els = self.soup.select('div#inner p:nth-of-type(3)')
1780 self.assertEqual(len(els), 1)
1781 self.assertEqual(els[0].string, u'Another')
1782
1783 # Try to select (non-existent!) fourth paragraph
1784 els = self.soup.select('div#inner p:nth-of-type(4)')
1785 self.assertEqual(len(els), 0)
1786
1787 # Pass in an invalid value.
1788 self.assertRaises(
1789 ValueError, self.soup.select, 'div p:nth-of-type(0)')
1790
1791 def test_nth_of_type_direct_descendant(self):
1792 els = self.soup.select('div#inner > p:nth-of-type(1)')
1793 self.assertEqual(len(els), 1)
1794 self.assertEqual(els[0].string, u'Some text')
1795
1796 def test_id_child_selector_nth_of_type(self):
1797 self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
1798
1799 def test_select_on_element(self):
1800 # Other tests operate on the tree; this operates on an element
1801 # within the tree.
1802 inner = self.soup.find("div", id="main")
1803 selected = inner.select("div")
1804 # The <div id="inner"> tag was selected. The <div id="footer">
1805 # tag was not.
1806 self.assertSelectsIDs(selected, ['inner'])
1807
1808 def test_overspecified_child_id(self):
1809 self.assertSelects(".fancy #inner", ['inner'])
1810 self.assertSelects(".normal #inner", [])
1811
1812 def test_adjacent_sibling_selector(self):
1813 self.assertSelects('#p1 + h2', ['header2'])
1814 self.assertSelects('#p1 + h2 + p', ['pmulti'])
1815 self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
1816 self.assertEqual([], self.soup.select('#p1 + p'))
1817
1818 def test_general_sibling_selector(self):
1819 self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
1820 self.assertSelects('#p1 ~ #header2', ['header2'])
1821 self.assertSelects('#p1 ~ h2 + a', ['me'])
1822 self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
1823 self.assertEqual([], self.soup.select('#inner ~ h2'))
1824
1825 def test_dangling_combinator(self):
1826 self.assertRaises(ValueError, self.soup.select, 'h1 >')
1827
1828 def test_sibling_combinator_wont_select_same_tag_twice(self):
1829 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])