path: root/bitbake/lib/bs4/tests
author    Richard Purdie <richard.purdie@linuxfoundation.org>  2016-05-06 09:06:51 +0100
committer Richard Purdie <richard.purdie@linuxfoundation.org>  2016-06-02 08:24:02 +0100
commit    822eabf32dd69346071bd25fc3639db252d2f346 (patch)
tree      edac6d1d0d5114a4e3c72fea5589c069453b72d2 /bitbake/lib/bs4/tests
parent    4f8959324df3b89487973bd4e8de21debb0a12ef (diff)
download  poky-822eabf32dd69346071bd25fc3639db252d2f346.tar.gz
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's recommendation for Python 3 use. (Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad) Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/tests')
-rw-r--r--  bitbake/lib/bs4/tests/test_builder_registry.py |  14
-rw-r--r--  bitbake/lib/bs4/tests/test_html5lib.py         |  19
-rw-r--r--  bitbake/lib/bs4/tests/test_htmlparser.py       |  13
-rw-r--r--  bitbake/lib/bs4/tests/test_lxml.py             |  19
-rw-r--r--  bitbake/lib/bs4/tests/test_soup.py             | 107
-rw-r--r--  bitbake/lib/bs4/tests/test_tree.py             | 294
6 files changed, 357 insertions, 109 deletions
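Most of the changes below are mechanical 2to3 conversions (u'' string prefixes dropped, "except ImportError, e" becoming "except ImportError as e", unicode becoming str), but one recurring manual change is that the tests now pass an explicit parser to BeautifulSoup, because 4.4 warns when the parser is left implicit (see the new NO_PARSER_SPECIFIED_WARNING tests in test_soup.py). A minimal sketch of that convention, assuming bs4 4.4.x and the stdlib parser:

import warnings
from bs4 import BeautifulSoup

# Leaving the parser implicit triggers a UserWarning in bs4 4.4+.
with warnings.catch_warnings(record=True) as w:
    BeautifulSoup("<a></a>")
    print(w[0].message)      # warning about no parser being explicitly specified

# The updated tests avoid the warning by naming a parser.
soup = BeautifulSoup("<a></a>", "html.parser")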
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py
index 92ad10fb04..90cad82933 100644
--- a/bitbake/lib/bs4/tests/test_builder_registry.py
+++ b/bitbake/lib/bs4/tests/test_builder_registry.py
@@ -1,6 +1,7 @@
1"""Tests of the builder registry.""" 1"""Tests of the builder registry."""
2 2
3import unittest 3import unittest
4import warnings
4 5
5from bs4 import BeautifulSoup 6from bs4 import BeautifulSoup
6from bs4.builder import ( 7from bs4.builder import (
@@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
67 HTMLParserTreeBuilder) 68 HTMLParserTreeBuilder)
68 69
69 def test_beautifulsoup_constructor_does_lookup(self): 70 def test_beautifulsoup_constructor_does_lookup(self):
70 # You can pass in a string. 71
71 BeautifulSoup("", features="html") 72 with warnings.catch_warnings(record=True) as w:
72 # Or a list of strings. 73 # This will create a warning about not explicitly
73 BeautifulSoup("", features=["html", "fast"]) 74 # specifying a parser, but we'll ignore it.
75
76 # You can pass in a string.
77 BeautifulSoup("", features="html")
78 # Or a list of strings.
79 BeautifulSoup("", features=["html", "fast"])
74 80
75 # You'll get an exception if BS can't find an appropriate 81 # You'll get an exception if BS can't find an appropriate
76 # builder. 82 # builder.
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py
index 594c3e1f26..a7494ca5ba 100644
--- a/bitbake/lib/bs4/tests/test_html5lib.py
+++ b/bitbake/lib/bs4/tests/test_html5lib.py
@@ -5,7 +5,7 @@ import warnings
5try: 5try:
6 from bs4.builder import HTML5TreeBuilder 6 from bs4.builder import HTML5TreeBuilder
7 HTML5LIB_PRESENT = True 7 HTML5LIB_PRESENT = True
8except ImportError, e: 8except ImportError as e:
9 HTML5LIB_PRESENT = False 9 HTML5LIB_PRESENT = False
10from bs4.element import SoupStrainer 10from bs4.element import SoupStrainer
11from bs4.testing import ( 11from bs4.testing import (
@@ -74,12 +74,25 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
74 def test_reparented_markup(self): 74 def test_reparented_markup(self):
75 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' 75 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
76 soup = self.soup(markup) 76 soup = self.soup(markup)
77 self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) 77 self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
78 self.assertEqual(2, len(soup.find_all('p'))) 78 self.assertEqual(2, len(soup.find_all('p')))
79 79
80 80
81 def test_reparented_markup_ends_with_whitespace(self): 81 def test_reparented_markup_ends_with_whitespace(self):
82 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' 82 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
83 soup = self.soup(markup) 83 soup = self.soup(markup)
84 self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) 84 self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
85 self.assertEqual(2, len(soup.find_all('p'))) 85 self.assertEqual(2, len(soup.find_all('p')))
86
87 def test_processing_instruction(self):
88 """Processing instructions become comments."""
89 markup = b"""<?PITarget PIContent?>"""
90 soup = self.soup(markup)
91 assert str(soup).startswith("<!--?PITarget PIContent?-->")
92
93 def test_cloned_multivalue_node(self):
94 markup = b"""<a class="my_class"><p></a>"""
95 soup = self.soup(markup)
96 a1, a2 = soup.find_all('a')
97 self.assertEqual(a1, a2)
98 assert a1 is not a2
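The two new html5lib tests exercise builder-specific behaviour: processing instructions are rendered as comments, and the mis-nested <a> is repaired by cloning it, so the clones compare equal but are distinct objects. A rough standalone sketch of the same checks, assuming html5lib is installed:

from bs4 import BeautifulSoup

# Processing instructions become comments under html5lib.
soup = BeautifulSoup(b"<?PITarget PIContent?>", "html5lib")
assert str(soup).startswith("<!--?PITarget PIContent?-->")

# The multi-valued-attribute node is cloned while repairing the markup:
# equal in value, but two separate objects.
soup = BeautifulSoup(b'<a class="my_class"><p></a>', "html5lib")
a1, a2 = soup.find_all("a")
assert a1 == a2 and a1 is not a2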
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py
index bcb5ed232f..b45e35f999 100644
--- a/bitbake/lib/bs4/tests/test_htmlparser.py
+++ b/bitbake/lib/bs4/tests/test_htmlparser.py
@@ -1,6 +1,8 @@
1"""Tests to ensure that the html.parser tree builder generates good 1"""Tests to ensure that the html.parser tree builder generates good
2trees.""" 2trees."""
3 3
4from pdb import set_trace
5import pickle
4from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 6from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5from bs4.builder import HTMLParserTreeBuilder 7from bs4.builder import HTMLParserTreeBuilder
6 8
@@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
17 def test_namespaced_public_doctype(self): 19 def test_namespaced_public_doctype(self):
18 # html.parser can't handle namespaced doctypes, so skip this one. 20 # html.parser can't handle namespaced doctypes, so skip this one.
19 pass 21 pass
22
23 def test_builder_is_pickled(self):
24 """Unlike most tree builders, HTMLParserTreeBuilder and will
25 be restored after pickling.
26 """
27 tree = self.soup("<a><b>foo</a>")
28 dumped = pickle.dumps(tree, 2)
29 loaded = pickle.loads(dumped)
30 self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
31
32
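The new test relies on the html.parser tree builder surviving a pickle round trip (most builders are replaced with a default builder on load, as the docstring above notes). A small sketch of the same check, assuming bs4 4.4.x:

import pickle
from bs4 import BeautifulSoup

tree = BeautifulSoup("<a><b>foo</a>", "html.parser")
loaded = pickle.loads(pickle.dumps(tree, 2))

# The restored tree keeps the same builder class and the same content.
assert isinstance(loaded.builder, type(tree.builder))
assert loaded.decode() == tree.decode()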
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py
index 2b2e9b7e78..6c2a1d73eb 100644
--- a/bitbake/lib/bs4/tests/test_lxml.py
+++ b/bitbake/lib/bs4/tests/test_lxml.py
@@ -7,7 +7,7 @@ try:
7 import lxml.etree 7 import lxml.etree
8 LXML_PRESENT = True 8 LXML_PRESENT = True
9 LXML_VERSION = lxml.etree.LXML_VERSION 9 LXML_VERSION = lxml.etree.LXML_VERSION
10except ImportError, e: 10except ImportError as e:
11 LXML_PRESENT = False 11 LXML_PRESENT = False
12 LXML_VERSION = (0,) 12 LXML_VERSION = (0,)
13 13
@@ -62,24 +62,9 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
62 # if one is installed. 62 # if one is installed.
63 with warnings.catch_warnings(record=True) as w: 63 with warnings.catch_warnings(record=True) as w:
64 soup = BeautifulStoneSoup("<b />") 64 soup = BeautifulStoneSoup("<b />")
65 self.assertEqual(u"<b/>", unicode(soup.b)) 65 self.assertEqual("<b/>", str(soup.b))
66 self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 66 self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 67
68 def test_real_xhtml_document(self):
69 """lxml strips the XML definition from an XHTML doc, which is fine."""
70 markup = b"""<?xml version="1.0" encoding="utf-8"?>
71<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
72<html xmlns="http://www.w3.org/1999/xhtml">
73<head><title>Hello.</title></head>
74<body>Goodbye.</body>
75</html>"""
76 soup = self.soup(markup)
77 self.assertEqual(
78 soup.encode("utf-8").replace(b"\n", b''),
79 markup.replace(b'\n', b'').replace(
80 b'<?xml version="1.0" encoding="utf-8"?>', b''))
81
82
83@skipIf( 68@skipIf(
84 not LXML_PRESENT, 69 not LXML_PRESENT,
85 "lxml seems not to be present, not testing its XML tree builder.") 70 "lxml seems not to be present, not testing its XML tree builder.")
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py
index 47ac245f99..f87949e3d3 100644
--- a/bitbake/lib/bs4/tests/test_soup.py
+++ b/bitbake/lib/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
1# -*- coding: utf-8 -*- 1# -*- coding: utf-8 -*-
2"""Tests of Beautiful Soup as a whole.""" 2"""Tests of Beautiful Soup as a whole."""
3 3
4from pdb import set_trace
4import logging 5import logging
5import unittest 6import unittest
6import sys 7import sys
@@ -20,6 +21,7 @@ import bs4.dammit
20from bs4.dammit import ( 21from bs4.dammit import (
21 EntitySubstitution, 22 EntitySubstitution,
22 UnicodeDammit, 23 UnicodeDammit,
24 EncodingDetector,
23) 25)
24from bs4.testing import ( 26from bs4.testing import (
25 SoupTest, 27 SoupTest,
@@ -30,7 +32,7 @@ import warnings
30try: 32try:
31 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 33 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
32 LXML_PRESENT = True 34 LXML_PRESENT = True
33except ImportError, e: 35except ImportError as e:
34 LXML_PRESENT = False 36 LXML_PRESENT = False
35 37
36PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) 38PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
@@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
39class TestConstructor(SoupTest): 41class TestConstructor(SoupTest):
40 42
41 def test_short_unicode_input(self): 43 def test_short_unicode_input(self):
42 data = u"<h1>éé</h1>" 44 data = "<h1>éé</h1>"
43 soup = self.soup(data) 45 soup = self.soup(data)
44 self.assertEqual(u"éé", soup.h1.string) 46 self.assertEqual("éé", soup.h1.string)
45 47
46 def test_embedded_null(self): 48 def test_embedded_null(self):
47 data = u"<h1>foo\0bar</h1>" 49 data = "<h1>foo\0bar</h1>"
48 soup = self.soup(data) 50 soup = self.soup(data)
49 self.assertEqual(u"foo\0bar", soup.h1.string) 51 self.assertEqual("foo\0bar", soup.h1.string)
50 52
53 def test_exclude_encodings(self):
54 utf8_data = "Räksmörgås".encode("utf-8")
55 soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
56 self.assertEqual("windows-1252", soup.original_encoding)
51 57
52class TestDeprecatedConstructorArguments(SoupTest): 58
59class TestWarnings(SoupTest):
60
61 def _no_parser_specified(self, s, is_there=True):
62 v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
63 self.assertTrue(v)
64
65 def test_warning_if_no_parser_specified(self):
66 with warnings.catch_warnings(record=True) as w:
67 soup = self.soup("<a><b></b></a>")
68 msg = str(w[0].message)
69 self._assert_no_parser_specified(msg)
70
71 def test_warning_if_parser_specified_too_vague(self):
72 with warnings.catch_warnings(record=True) as w:
73 soup = self.soup("<a><b></b></a>", "html")
74 msg = str(w[0].message)
75 self._assert_no_parser_specified(msg)
76
77 def test_no_warning_if_explicit_parser_specified(self):
78 with warnings.catch_warnings(record=True) as w:
79 soup = self.soup("<a><b></b></a>", "html.parser")
80 self.assertEqual([], w)
53 81
54 def test_parseOnlyThese_renamed_to_parse_only(self): 82 def test_parseOnlyThese_renamed_to_parse_only(self):
55 with warnings.catch_warnings(record=True) as w: 83 with warnings.catch_warnings(record=True) as w:
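exclude_encodings, exercised by test_exclude_encodings above, is a new 4.4 constructor argument that removes encodings from consideration during detection. A hedged sketch mirroring the test, assuming bs4 4.4.x:

from bs4 import BeautifulSoup

utf8_data = "Räksmörgås".encode("utf-8")

# With UTF-8 ruled out, detection falls back to Windows-1252
# for this particular byte string.
soup = BeautifulSoup(utf8_data, "html.parser", exclude_encodings=["utf-8"])
print(soup.original_encoding)   # windows-1252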
@@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase):
117 def test_simple_html_substitution(self): 145 def test_simple_html_substitution(self):
118 # Unicode characters corresponding to named HTML entites 146 # Unicode characters corresponding to named HTML entites
119 # are substituted, and no others. 147 # are substituted, and no others.
120 s = u"foo\u2200\N{SNOWMAN}\u00f5bar" 148 s = "foo\u2200\N{SNOWMAN}\u00f5bar"
121 self.assertEqual(self.sub.substitute_html(s), 149 self.assertEqual(self.sub.substitute_html(s),
122 u"foo&forall;\N{SNOWMAN}&otilde;bar") 150 "foo&forall;\N{SNOWMAN}&otilde;bar")
123 151
124 def test_smart_quote_substitution(self): 152 def test_smart_quote_substitution(self):
125 # MS smart quotes are a common source of frustration, so we 153 # MS smart quotes are a common source of frustration, so we
@@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest):
184 212
185 def setUp(self): 213 def setUp(self):
186 super(TestEncodingConversion, self).setUp() 214 super(TestEncodingConversion, self).setUp()
187 self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' 215 self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
188 self.utf8_data = self.unicode_data.encode("utf-8") 216 self.utf8_data = self.unicode_data.encode("utf-8")
189 # Just so you know what it looks like. 217 # Just so you know what it looks like.
190 self.assertEqual( 218 self.assertEqual(
@@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest):
204 ascii = b"<foo>a</foo>" 232 ascii = b"<foo>a</foo>"
205 soup_from_ascii = self.soup(ascii) 233 soup_from_ascii = self.soup(ascii)
206 unicode_output = soup_from_ascii.decode() 234 unicode_output = soup_from_ascii.decode()
207 self.assertTrue(isinstance(unicode_output, unicode)) 235 self.assertTrue(isinstance(unicode_output, str))
208 self.assertEqual(unicode_output, self.document_for(ascii.decode())) 236 self.assertEqual(unicode_output, self.document_for(ascii.decode()))
209 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 237 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
210 finally: 238 finally:
@@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest):
216 # is not set. 244 # is not set.
217 soup_from_unicode = self.soup(self.unicode_data) 245 soup_from_unicode = self.soup(self.unicode_data)
218 self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 246 self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
219 self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') 247 self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
220 self.assertEqual(soup_from_unicode.original_encoding, None) 248 self.assertEqual(soup_from_unicode.original_encoding, None)
221 249
222 def test_utf8_in_unicode_out(self): 250 def test_utf8_in_unicode_out(self):
@@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest):
224 # attribute is set. 252 # attribute is set.
225 soup_from_utf8 = self.soup(self.utf8_data) 253 soup_from_utf8 = self.soup(self.utf8_data)
226 self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 254 self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
227 self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') 255 self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
228 256
229 def test_utf8_out(self): 257 def test_utf8_out(self):
230 # The internal data structures can be encoded as UTF-8. 258 # The internal data structures can be encoded as UTF-8.
@@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest):
235 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 263 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
236 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 264 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
237 def test_attribute_name_containing_unicode_characters(self): 265 def test_attribute_name_containing_unicode_characters(self):
238 markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' 266 markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
239 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 267 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
240 268
241class TestUnicodeDammit(unittest.TestCase): 269class TestUnicodeDammit(unittest.TestCase):
242 """Standalone tests of UnicodeDammit.""" 270 """Standalone tests of UnicodeDammit."""
243 271
244 def test_unicode_input(self): 272 def test_unicode_input(self):
245 markup = u"I'm already Unicode! \N{SNOWMAN}" 273 markup = "I'm already Unicode! \N{SNOWMAN}"
246 dammit = UnicodeDammit(markup) 274 dammit = UnicodeDammit(markup)
247 self.assertEqual(dammit.unicode_markup, markup) 275 self.assertEqual(dammit.unicode_markup, markup)
248 276
@@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase):
250 markup = b"<foo>\x91\x92\x93\x94</foo>" 278 markup = b"<foo>\x91\x92\x93\x94</foo>"
251 dammit = UnicodeDammit(markup) 279 dammit = UnicodeDammit(markup)
252 self.assertEqual( 280 self.assertEqual(
253 dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") 281 dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
254 282
255 def test_smart_quotes_to_xml_entities(self): 283 def test_smart_quotes_to_xml_entities(self):
256 markup = b"<foo>\x91\x92\x93\x94</foo>" 284 markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase):
271 dammit.unicode_markup, """<foo>''""</foo>""") 299 dammit.unicode_markup, """<foo>''""</foo>""")
272 300
273 def test_detect_utf8(self): 301 def test_detect_utf8(self):
274 utf8 = b"\xc3\xa9" 302 utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
275 dammit = UnicodeDammit(utf8) 303 dammit = UnicodeDammit(utf8)
276 self.assertEqual(dammit.unicode_markup, u'\xe9')
277 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 304 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
305 self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
306
278 307
279 def test_convert_hebrew(self): 308 def test_convert_hebrew(self):
280 hebrew = b"\xed\xe5\xec\xf9" 309 hebrew = b"\xed\xe5\xec\xf9"
281 dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 310 dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
282 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 311 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
283 self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') 312 self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
284 313
285 def test_dont_see_smart_quotes_where_there_are_none(self): 314 def test_dont_see_smart_quotes_where_there_are_none(self):
286 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 315 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase):
289 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 318 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
290 319
291 def test_ignore_inappropriate_codecs(self): 320 def test_ignore_inappropriate_codecs(self):
292 utf8_data = u"Räksmörgås".encode("utf-8") 321 utf8_data = "Räksmörgås".encode("utf-8")
293 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 322 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
294 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 323 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
295 324
296 def test_ignore_invalid_codecs(self): 325 def test_ignore_invalid_codecs(self):
297 utf8_data = u"Räksmörgås".encode("utf-8") 326 utf8_data = "Räksmörgås".encode("utf-8")
298 for bad_encoding in ['.utf8', '...', 'utF---16.!']: 327 for bad_encoding in ['.utf8', '...', 'utF---16.!']:
299 dammit = UnicodeDammit(utf8_data, [bad_encoding]) 328 dammit = UnicodeDammit(utf8_data, [bad_encoding])
300 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 329 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
301 330
331 def test_exclude_encodings(self):
332 # This is UTF-8.
333 utf8_data = "Räksmörgås".encode("utf-8")
334
335 # But if we exclude UTF-8 from consideration, the guess is
336 # Windows-1252.
337 dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
338 self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
339
340 # And if we exclude that, there is no valid guess at all.
341 dammit = UnicodeDammit(
342 utf8_data, exclude_encodings=["utf-8", "windows-1252"])
343 self.assertEqual(dammit.original_encoding, None)
344
345 def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
346 detected = EncodingDetector(
347 b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
348 encodings = list(detected.encodings)
349 assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
350
302 def test_detect_html5_style_meta_tag(self): 351 def test_detect_html5_style_meta_tag(self):
303 352
304 for data in ( 353 for data in (
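UnicodeDammit gains the same exclude_encodings option, and EncodingDetector (newly imported at the top of this file) exposes the candidate encodings it will try. A rough illustration along the lines of the new tests:

from bs4.dammit import UnicodeDammit, EncodingDetector

utf8_data = "Räksmörgås".encode("utf-8")

# Excluding the real encoding pushes the guess to the next candidate;
# excluding that too leaves no valid guess at all.
print(UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]).original_encoding)
print(UnicodeDammit(utf8_data,
                    exclude_encodings=["utf-8", "windows-1252"]).original_encoding)

# EncodingDetector lists the encodings it would try; junk bytes in a
# declared encoding name are replaced with U+FFFD.
detector = EncodingDetector(b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
print(list(detector.encodings))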
@@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase):
337 bs4.dammit.chardet_dammit = noop 386 bs4.dammit.chardet_dammit = noop
338 dammit = UnicodeDammit(doc) 387 dammit = UnicodeDammit(doc)
339 self.assertEqual(True, dammit.contains_replacement_characters) 388 self.assertEqual(True, dammit.contains_replacement_characters)
340 self.assertTrue(u"\ufffd" in dammit.unicode_markup) 389 self.assertTrue("\ufffd" in dammit.unicode_markup)
341 390
342 soup = BeautifulSoup(doc, "html.parser") 391 soup = BeautifulSoup(doc, "html.parser")
343 self.assertTrue(soup.contains_replacement_characters) 392 self.assertTrue(soup.contains_replacement_characters)
@@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase):
349 # A document written in UTF-16LE will have its byte order marker stripped. 398 # A document written in UTF-16LE will have its byte order marker stripped.
350 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 399 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
351 dammit = UnicodeDammit(data) 400 dammit = UnicodeDammit(data)
352 self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) 401 self.assertEqual("<a>áé</a>", dammit.unicode_markup)
353 self.assertEqual("utf-16le", dammit.original_encoding) 402 self.assertEqual("utf-16le", dammit.original_encoding)
354 403
355 def test_detwingle(self): 404 def test_detwingle(self):
356 # Here's a UTF8 document. 405 # Here's a UTF8 document.
357 utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") 406 utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
358 407
359 # Here's a Windows-1252 document. 408 # Here's a Windows-1252 document.
360 windows_1252 = ( 409 windows_1252 = (
361 u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 410 "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
362 u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 411 "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
363 412
364 # Through some unholy alchemy, they've been stuck together. 413 # Through some unholy alchemy, they've been stuck together.
365 doc = utf8 + windows_1252 + utf8 414 doc = utf8 + windows_1252 + utf8
@@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase):
374 423
375 fixed = UnicodeDammit.detwingle(doc) 424 fixed = UnicodeDammit.detwingle(doc)
376 self.assertEqual( 425 self.assertEqual(
377 u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 426 "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
378 427
379 def test_detwingle_ignores_multibyte_characters(self): 428 def test_detwingle_ignores_multibyte_characters(self):
380 # Each of these characters has a UTF-8 representation ending 429 # Each of these characters has a UTF-8 representation ending
@@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase):
382 # Windows-1252. But our code knows to skip over multibyte 431 # Windows-1252. But our code knows to skip over multibyte
383 # UTF-8 characters, so they'll survive the process unscathed. 432 # UTF-8 characters, so they'll survive the process unscathed.
384 for tricky_unicode_char in ( 433 for tricky_unicode_char in (
385 u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' 434 "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
386 u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' 435 "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
387 u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. 436 "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
388 ): 437 ):
389 input = tricky_unicode_char.encode("utf8") 438 input = tricky_unicode_char.encode("utf8")
390 self.assertTrue(input.endswith(b'\x93')) 439 self.assertTrue(input.endswith(b'\x93'))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py
index f8515c0ea1..6d3e67f311 100644
--- a/bitbake/lib/bs4/tests/test_tree.py
+++ b/bitbake/lib/bs4/tests/test_tree.py
@@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here. 9methods tested here.
10""" 10"""
11 11
12from pdb import set_trace
12import copy 13import copy
13import pickle 14import pickle
14import re 15import re
@@ -19,8 +20,10 @@ from bs4.builder import (
19 HTMLParserTreeBuilder, 20 HTMLParserTreeBuilder,
20) 21)
21from bs4.element import ( 22from bs4.element import (
23 PY3K,
22 CData, 24 CData,
23 Comment, 25 Comment,
26 Declaration,
24 Doctype, 27 Doctype,
25 NavigableString, 28 NavigableString,
26 SoupStrainer, 29 SoupStrainer,
@@ -67,8 +70,14 @@ class TestFind(TreeTest):
67 self.assertEqual(soup.find("b").string, "2") 70 self.assertEqual(soup.find("b").string, "2")
68 71
69 def test_unicode_text_find(self): 72 def test_unicode_text_find(self):
70 soup = self.soup(u'<h1>Räksmörgås</h1>') 73 soup = self.soup('<h1>Räksmörgås</h1>')
71 self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') 74 self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
75
76 def test_unicode_attribute_find(self):
77 soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
78 str(soup)
79 self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
80
72 81
73 def test_find_everything(self): 82 def test_find_everything(self):
74 """Test an optimization that finds all tags.""" 83 """Test an optimization that finds all tags."""
@@ -87,16 +96,17 @@ class TestFindAll(TreeTest):
87 """You can search the tree for text nodes.""" 96 """You can search the tree for text nodes."""
88 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") 97 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
89 # Exact match. 98 # Exact match.
90 self.assertEqual(soup.find_all(text="bar"), [u"bar"]) 99 self.assertEqual(soup.find_all(string="bar"), ["bar"])
100 self.assertEqual(soup.find_all(text="bar"), ["bar"])
91 # Match any of a number of strings. 101 # Match any of a number of strings.
92 self.assertEqual( 102 self.assertEqual(
93 soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) 103 soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
94 # Match a regular expression. 104 # Match a regular expression.
95 self.assertEqual(soup.find_all(text=re.compile('.*')), 105 self.assertEqual(soup.find_all(text=re.compile('.*')),
96 [u"Foo", u"bar", u'\xbb']) 106 ["Foo", "bar", '\xbb'])
97 # Match anything. 107 # Match anything.
98 self.assertEqual(soup.find_all(text=True), 108 self.assertEqual(soup.find_all(text=True),
99 [u"Foo", u"bar", u'\xbb']) 109 ["Foo", "bar", '\xbb'])
100 110
101 def test_find_all_limit(self): 111 def test_find_all_limit(self):
102 """You can limit the number of items returned by find_all.""" 112 """You can limit the number of items returned by find_all."""
@@ -227,8 +237,8 @@ class TestFindAllByAttribute(TreeTest):
227 ["Matching a.", "Matching b."]) 237 ["Matching a.", "Matching b."])
228 238
229 def test_find_all_by_utf8_attribute_value(self): 239 def test_find_all_by_utf8_attribute_value(self):
230 peace = u"םולש".encode("utf8") 240 peace = "םולש".encode("utf8")
231 data = u'<a title="םולש"></a>'.encode("utf8") 241 data = '<a title="םולש"></a>'.encode("utf8")
232 soup = self.soup(data) 242 soup = self.soup(data)
233 self.assertEqual([soup.a], soup.find_all(title=peace)) 243 self.assertEqual([soup.a], soup.find_all(title=peace))
234 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) 244 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -688,7 +698,7 @@ class TestTagCreation(SoupTest):
688 698
689 def test_tag_inherits_self_closing_rules_from_builder(self): 699 def test_tag_inherits_self_closing_rules_from_builder(self):
690 if XML_BUILDER_PRESENT: 700 if XML_BUILDER_PRESENT:
691 xml_soup = BeautifulSoup("", "xml") 701 xml_soup = BeautifulSoup("", "lxml-xml")
692 xml_br = xml_soup.new_tag("br") 702 xml_br = xml_soup.new_tag("br")
693 xml_p = xml_soup.new_tag("p") 703 xml_p = xml_soup.new_tag("p")
694 704
@@ -697,7 +707,7 @@ class TestTagCreation(SoupTest):
697 self.assertEqual(b"<br/>", xml_br.encode()) 707 self.assertEqual(b"<br/>", xml_br.encode())
698 self.assertEqual(b"<p/>", xml_p.encode()) 708 self.assertEqual(b"<p/>", xml_p.encode())
699 709
700 html_soup = BeautifulSoup("", "html") 710 html_soup = BeautifulSoup("", "html.parser")
701 html_br = html_soup.new_tag("br") 711 html_br = html_soup.new_tag("br")
702 html_p = html_soup.new_tag("p") 712 html_p = html_soup.new_tag("p")
703 713
@@ -773,6 +783,14 @@ class TestTreeModification(SoupTest):
773 new_a = a.unwrap() 783 new_a = a.unwrap()
774 self.assertEqual(a, new_a) 784 self.assertEqual(a, new_a)
775 785
786 def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
787 soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
788 a = soup.a
789 a.extract()
790 self.assertEqual(None, a.parent)
791 self.assertRaises(ValueError, a.unwrap)
792 self.assertRaises(ValueError, a.replace_with, soup.c)
793
776 def test_replace_tag_with_itself(self): 794 def test_replace_tag_with_itself(self):
777 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" 795 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
778 soup = self.soup(text) 796 soup = self.soup(text)
@@ -1067,6 +1085,31 @@ class TestTreeModification(SoupTest):
1067 self.assertEqual(foo_2, soup.a.string) 1085 self.assertEqual(foo_2, soup.a.string)
1068 self.assertEqual(bar_2, soup.b.string) 1086 self.assertEqual(bar_2, soup.b.string)
1069 1087
1088 def test_extract_multiples_of_same_tag(self):
1089 soup = self.soup("""
1090<html>
1091<head>
1092<script>foo</script>
1093</head>
1094<body>
1095 <script>bar</script>
1096 <a></a>
1097</body>
1098<script>baz</script>
1099</html>""")
1100 [soup.script.extract() for i in soup.find_all("script")]
1101 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
1102
1103
1104 def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
1105 soup = self.soup(
1106 '<html>\n'
1107 '<body>hi</body>\n'
1108 '</html>')
1109 soup.find('body').extract()
1110 self.assertEqual(None, soup.find('body'))
1111
1112
1070 def test_clear(self): 1113 def test_clear(self):
1071 """Tag.clear()""" 1114 """Tag.clear()"""
1072 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") 1115 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
@@ -1287,27 +1330,72 @@ class TestPersistence(SoupTest):
1287 1330
1288 def test_unicode_pickle(self): 1331 def test_unicode_pickle(self):
1289 # A tree containing Unicode characters can be pickled. 1332 # A tree containing Unicode characters can be pickled.
1290 html = u"<b>\N{SNOWMAN}</b>" 1333 html = "<b>\N{SNOWMAN}</b>"
1291 soup = self.soup(html) 1334 soup = self.soup(html)
1292 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) 1335 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
1293 loaded = pickle.loads(dumped) 1336 loaded = pickle.loads(dumped)
1294 self.assertEqual(loaded.decode(), soup.decode()) 1337 self.assertEqual(loaded.decode(), soup.decode())
1295 1338
1339 def test_copy_navigablestring_is_not_attached_to_tree(self):
1340 html = "<b>Foo<a></a></b><b>Bar</b>"
1341 soup = self.soup(html)
1342 s1 = soup.find(string="Foo")
1343 s2 = copy.copy(s1)
1344 self.assertEqual(s1, s2)
1345 self.assertEqual(None, s2.parent)
1346 self.assertEqual(None, s2.next_element)
1347 self.assertNotEqual(None, s1.next_sibling)
1348 self.assertEqual(None, s2.next_sibling)
1349 self.assertEqual(None, s2.previous_element)
1350
1351 def test_copy_navigablestring_subclass_has_same_type(self):
1352 html = "<b><!--Foo--></b>"
1353 soup = self.soup(html)
1354 s1 = soup.string
1355 s2 = copy.copy(s1)
1356 self.assertEqual(s1, s2)
1357 self.assertTrue(isinstance(s2, Comment))
1358
1359 def test_copy_entire_soup(self):
1360 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
1361 soup = self.soup(html)
1362 soup_copy = copy.copy(soup)
1363 self.assertEqual(soup, soup_copy)
1364
1365 def test_copy_tag_copies_contents(self):
1366 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
1367 soup = self.soup(html)
1368 div = soup.div
1369 div_copy = copy.copy(div)
1370
1371 # The two tags look the same, and evaluate to equal.
1372 self.assertEqual(str(div), str(div_copy))
1373 self.assertEqual(div, div_copy)
1374
1375 # But they're not the same object.
1376 self.assertFalse(div is div_copy)
1377
1378 # And they don't have the same relation to the parse tree. The
1379 # copy is not associated with a parse tree at all.
1380 self.assertEqual(None, div_copy.parent)
1381 self.assertEqual(None, div_copy.previous_element)
1382 self.assertEqual(None, div_copy.find(string='Bar').next_element)
1383 self.assertNotEqual(None, div.find(string='Bar').next_element)
1296 1384
1297class TestSubstitutions(SoupTest): 1385class TestSubstitutions(SoupTest):
1298 1386
1299 def test_default_formatter_is_minimal(self): 1387 def test_default_formatter_is_minimal(self):
1300 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1388 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1301 soup = self.soup(markup) 1389 soup = self.soup(markup)
1302 decoded = soup.decode(formatter="minimal") 1390 decoded = soup.decode(formatter="minimal")
1303 # The < is converted back into &lt; but the e-with-acute is left alone. 1391 # The < is converted back into &lt; but the e-with-acute is left alone.
1304 self.assertEqual( 1392 self.assertEqual(
1305 decoded, 1393 decoded,
1306 self.document_for( 1394 self.document_for(
1307 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) 1395 "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1308 1396
1309 def test_formatter_html(self): 1397 def test_formatter_html(self):
1310 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1398 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1311 soup = self.soup(markup) 1399 soup = self.soup(markup)
1312 decoded = soup.decode(formatter="html") 1400 decoded = soup.decode(formatter="html")
1313 self.assertEqual( 1401 self.assertEqual(
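copy.copy() on a Tag or NavigableString now yields a detached copy: equal to the original and of the same subclass, but with no parent or sibling links into the parse tree. A minimal sketch, assuming html.parser:

import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><b>Foo<a></a></b><b>Bar</b></div>end", "html.parser")
div_copy = copy.copy(soup.div)

assert div_copy == soup.div and div_copy is not soup.div
assert div_copy.parent is None             # the copy belongs to no tree
assert div_copy.previous_element is None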
@@ -1315,49 +1403,49 @@ class TestSubstitutions(SoupTest):
1315 self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>")) 1403 self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
1316 1404
1317 def test_formatter_minimal(self): 1405 def test_formatter_minimal(self):
1318 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1406 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1319 soup = self.soup(markup) 1407 soup = self.soup(markup)
1320 decoded = soup.decode(formatter="minimal") 1408 decoded = soup.decode(formatter="minimal")
1321 # The < is converted back into &lt; but the e-with-acute is left alone. 1409 # The < is converted back into &lt; but the e-with-acute is left alone.
1322 self.assertEqual( 1410 self.assertEqual(
1323 decoded, 1411 decoded,
1324 self.document_for( 1412 self.document_for(
1325 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) 1413 "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1326 1414
1327 def test_formatter_null(self): 1415 def test_formatter_null(self):
1328 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1416 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1329 soup = self.soup(markup) 1417 soup = self.soup(markup)
1330 decoded = soup.decode(formatter=None) 1418 decoded = soup.decode(formatter=None)
1331 # Neither the angle brackets nor the e-with-acute are converted. 1419 # Neither the angle brackets nor the e-with-acute are converted.
1332 # This is not valid HTML, but it's what the user wanted. 1420 # This is not valid HTML, but it's what the user wanted.
1333 self.assertEqual(decoded, 1421 self.assertEqual(decoded,
1334 self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1422 self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
1335 1423
1336 def test_formatter_custom(self): 1424 def test_formatter_custom(self):
1337 markup = u"<b>&lt;foo&gt;</b><b>bar</b>" 1425 markup = "<b>&lt;foo&gt;</b><b>bar</b>"
1338 soup = self.soup(markup) 1426 soup = self.soup(markup)
1339 decoded = soup.decode(formatter = lambda x: x.upper()) 1427 decoded = soup.decode(formatter = lambda x: x.upper())
1340 # Instead of normal entity conversion code, the custom 1428 # Instead of normal entity conversion code, the custom
1341 # callable is called on every string. 1429 # callable is called on every string.
1342 self.assertEqual( 1430 self.assertEqual(
1343 decoded, 1431 decoded,
1344 self.document_for(u"<b><FOO></b><b>BAR</b>")) 1432 self.document_for("<b><FOO></b><b>BAR</b>"))
1345 1433
1346 def test_formatter_is_run_on_attribute_values(self): 1434 def test_formatter_is_run_on_attribute_values(self):
1347 markup = u'<a href="http://a.com?a=b&c=é">e</a>' 1435 markup = '<a href="http://a.com?a=b&c=é">e</a>'
1348 soup = self.soup(markup) 1436 soup = self.soup(markup)
1349 a = soup.a 1437 a = soup.a
1350 1438
1351 expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' 1439 expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
1352 1440
1353 self.assertEqual(expect_minimal, a.decode()) 1441 self.assertEqual(expect_minimal, a.decode())
1354 self.assertEqual(expect_minimal, a.decode(formatter="minimal")) 1442 self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
1355 1443
1356 expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' 1444 expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
1357 self.assertEqual(expect_html, a.decode(formatter="html")) 1445 self.assertEqual(expect_html, a.decode(formatter="html"))
1358 1446
1359 self.assertEqual(markup, a.decode(formatter=None)) 1447 self.assertEqual(markup, a.decode(formatter=None))
1360 expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' 1448 expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
1361 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) 1449 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
1362 1450
1363 def test_formatter_skips_script_tag_for_html_documents(self): 1451 def test_formatter_skips_script_tag_for_html_documents(self):
@@ -1366,7 +1454,7 @@ class TestSubstitutions(SoupTest):
1366 console.log("< < hey > > "); 1454 console.log("< < hey > > ");
1367 </script> 1455 </script>
1368""" 1456"""
1369 encoded = BeautifulSoup(doc).encode() 1457 encoded = BeautifulSoup(doc, 'html.parser').encode()
1370 self.assertTrue(b"< < hey > >" in encoded) 1458 self.assertTrue(b"< < hey > >" in encoded)
1371 1459
1372 def test_formatter_skips_style_tag_for_html_documents(self): 1460 def test_formatter_skips_style_tag_for_html_documents(self):
@@ -1375,7 +1463,7 @@ class TestSubstitutions(SoupTest):
1375 console.log("< < hey > > "); 1463 console.log("< < hey > > ");
1376 </style> 1464 </style>
1377""" 1465"""
1378 encoded = BeautifulSoup(doc).encode() 1466 encoded = BeautifulSoup(doc, 'html.parser').encode()
1379 self.assertTrue(b"< < hey > >" in encoded) 1467 self.assertTrue(b"< < hey > >" in encoded)
1380 1468
1381 def test_prettify_leaves_preformatted_text_alone(self): 1469 def test_prettify_leaves_preformatted_text_alone(self):
@@ -1383,24 +1471,24 @@ class TestSubstitutions(SoupTest):
1383 # Everything outside the <pre> tag is reformatted, but everything 1471 # Everything outside the <pre> tag is reformatted, but everything
1384 # inside is left alone. 1472 # inside is left alone.
1385 self.assertEqual( 1473 self.assertEqual(
1386 u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', 1474 '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
1387 soup.div.prettify()) 1475 soup.div.prettify())
1388 1476
1389 def test_prettify_accepts_formatter(self): 1477 def test_prettify_accepts_formatter(self):
1390 soup = BeautifulSoup("<html><body>foo</body></html>") 1478 soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
1391 pretty = soup.prettify(formatter = lambda x: x.upper()) 1479 pretty = soup.prettify(formatter = lambda x: x.upper())
1392 self.assertTrue("FOO" in pretty) 1480 self.assertTrue("FOO" in pretty)
1393 1481
1394 def test_prettify_outputs_unicode_by_default(self): 1482 def test_prettify_outputs_unicode_by_default(self):
1395 soup = self.soup("<a></a>") 1483 soup = self.soup("<a></a>")
1396 self.assertEqual(unicode, type(soup.prettify())) 1484 self.assertEqual(str, type(soup.prettify()))
1397 1485
1398 def test_prettify_can_encode_data(self): 1486 def test_prettify_can_encode_data(self):
1399 soup = self.soup("<a></a>") 1487 soup = self.soup("<a></a>")
1400 self.assertEqual(bytes, type(soup.prettify("utf-8"))) 1488 self.assertEqual(bytes, type(soup.prettify("utf-8")))
1401 1489
1402 def test_html_entity_substitution_off_by_default(self): 1490 def test_html_entity_substitution_off_by_default(self):
1403 markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" 1491 markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
1404 soup = self.soup(markup) 1492 soup = self.soup(markup)
1405 encoded = soup.b.encode("utf-8") 1493 encoded = soup.b.encode("utf-8")
1406 self.assertEqual(encoded, markup.encode('utf-8')) 1494 self.assertEqual(encoded, markup.encode('utf-8'))
@@ -1444,45 +1532,53 @@ class TestEncoding(SoupTest):
1444 """Test the ability to encode objects into strings.""" 1532 """Test the ability to encode objects into strings."""
1445 1533
1446 def test_unicode_string_can_be_encoded(self): 1534 def test_unicode_string_can_be_encoded(self):
1447 html = u"<b>\N{SNOWMAN}</b>" 1535 html = "<b>\N{SNOWMAN}</b>"
1448 soup = self.soup(html) 1536 soup = self.soup(html)
1449 self.assertEqual(soup.b.string.encode("utf-8"), 1537 self.assertEqual(soup.b.string.encode("utf-8"),
1450 u"\N{SNOWMAN}".encode("utf-8")) 1538 "\N{SNOWMAN}".encode("utf-8"))
1451 1539
1452 def test_tag_containing_unicode_string_can_be_encoded(self): 1540 def test_tag_containing_unicode_string_can_be_encoded(self):
1453 html = u"<b>\N{SNOWMAN}</b>" 1541 html = "<b>\N{SNOWMAN}</b>"
1454 soup = self.soup(html) 1542 soup = self.soup(html)
1455 self.assertEqual( 1543 self.assertEqual(
1456 soup.b.encode("utf-8"), html.encode("utf-8")) 1544 soup.b.encode("utf-8"), html.encode("utf-8"))
1457 1545
1458 def test_encoding_substitutes_unrecognized_characters_by_default(self): 1546 def test_encoding_substitutes_unrecognized_characters_by_default(self):
1459 html = u"<b>\N{SNOWMAN}</b>" 1547 html = "<b>\N{SNOWMAN}</b>"
1460 soup = self.soup(html) 1548 soup = self.soup(html)
1461 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>") 1549 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
1462 1550
1463 def test_encoding_can_be_made_strict(self): 1551 def test_encoding_can_be_made_strict(self):
1464 html = u"<b>\N{SNOWMAN}</b>" 1552 html = "<b>\N{SNOWMAN}</b>"
1465 soup = self.soup(html) 1553 soup = self.soup(html)
1466 self.assertRaises( 1554 self.assertRaises(
1467 UnicodeEncodeError, soup.encode, "ascii", errors="strict") 1555 UnicodeEncodeError, soup.encode, "ascii", errors="strict")
1468 1556
1469 def test_decode_contents(self): 1557 def test_decode_contents(self):
1470 html = u"<b>\N{SNOWMAN}</b>" 1558 html = "<b>\N{SNOWMAN}</b>"
1471 soup = self.soup(html) 1559 soup = self.soup(html)
1472 self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) 1560 self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
1473 1561
1474 def test_encode_contents(self): 1562 def test_encode_contents(self):
1475 html = u"<b>\N{SNOWMAN}</b>" 1563 html = "<b>\N{SNOWMAN}</b>"
1476 soup = self.soup(html) 1564 soup = self.soup(html)
1477 self.assertEqual( 1565 self.assertEqual(
1478 u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( 1566 "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
1479 encoding="utf8")) 1567 encoding="utf8"))
1480 1568
1481 def test_deprecated_renderContents(self): 1569 def test_deprecated_renderContents(self):
1482 html = u"<b>\N{SNOWMAN}</b>" 1570 html = "<b>\N{SNOWMAN}</b>"
1483 soup = self.soup(html) 1571 soup = self.soup(html)
1484 self.assertEqual( 1572 self.assertEqual(
1485 u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) 1573 "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
1574
1575 def test_repr(self):
1576 html = "<b>\N{SNOWMAN}</b>"
1577 soup = self.soup(html)
1578 if PY3K:
1579 self.assertEqual(html, repr(soup))
1580 else:
1581 self.assertEqual(b'<b>\\u2603</b>', repr(soup))
1486 1582
1487class TestNavigableStringSubclasses(SoupTest): 1583class TestNavigableStringSubclasses(SoupTest):
1488 1584
@@ -1522,6 +1618,9 @@ class TestNavigableStringSubclasses(SoupTest):
1522 soup.insert(1, doctype) 1618 soup.insert(1, doctype)
1523 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") 1619 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
1524 1620
1621 def test_declaration(self):
1622 d = Declaration("foo")
1623 self.assertEqual("<?foo?>", d.output_ready())
1525 1624
1526class TestSoupSelector(TreeTest): 1625class TestSoupSelector(TreeTest):
1527 1626
@@ -1534,7 +1633,7 @@ class TestSoupSelector(TreeTest):
1534<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> 1633<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
1535</head> 1634</head>
1536<body> 1635<body>
1537 1636<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
1538<div id="main" class="fancy"> 1637<div id="main" class="fancy">
1539<div id="inner"> 1638<div id="inner">
1540<h1 id="header1">An H1</h1> 1639<h1 id="header1">An H1</h1>
@@ -1552,8 +1651,18 @@ class TestSoupSelector(TreeTest):
1552<a href="#" id="s2a1">span2a1</a> 1651<a href="#" id="s2a1">span2a1</a>
1553</span> 1652</span>
1554<span class="span3"></span> 1653<span class="span3"></span>
1654<custom-dashed-tag class="dashed" id="dash2"/>
1655<div data-tag="dashedvalue" id="data1"/>
1555</span> 1656</span>
1556</div> 1657</div>
1658<x id="xid">
1659<z id="zida"/>
1660<z id="zidab"/>
1661<z id="zidac"/>
1662</x>
1663<y id="yid">
1664<z id="zidb"/>
1665</y>
1557<p lang="en" id="lang-en">English</p> 1666<p lang="en" id="lang-en">English</p>
1558<p lang="en-gb" id="lang-en-gb">English UK</p> 1667<p lang="en-gb" id="lang-en-gb">English UK</p>
1559<p lang="en-us" id="lang-en-us">English US</p> 1668<p lang="en-us" id="lang-en-us">English US</p>
@@ -1565,7 +1674,7 @@ class TestSoupSelector(TreeTest):
1565""" 1674"""
1566 1675
1567 def setUp(self): 1676 def setUp(self):
1568 self.soup = BeautifulSoup(self.HTML) 1677 self.soup = BeautifulSoup(self.HTML, 'html.parser')
1569 1678
1570 def assertSelects(self, selector, expected_ids): 1679 def assertSelects(self, selector, expected_ids):
1571 el_ids = [el['id'] for el in self.soup.select(selector)] 1680 el_ids = [el['id'] for el in self.soup.select(selector)]
@@ -1587,21 +1696,29 @@ class TestSoupSelector(TreeTest):
1587 els = self.soup.select('title') 1696 els = self.soup.select('title')
1588 self.assertEqual(len(els), 1) 1697 self.assertEqual(len(els), 1)
1589 self.assertEqual(els[0].name, 'title') 1698 self.assertEqual(els[0].name, 'title')
1590 self.assertEqual(els[0].contents, [u'The title']) 1699 self.assertEqual(els[0].contents, ['The title'])
1591 1700
1592 def test_one_tag_many(self): 1701 def test_one_tag_many(self):
1593 els = self.soup.select('div') 1702 els = self.soup.select('div')
1594 self.assertEqual(len(els), 3) 1703 self.assertEqual(len(els), 4)
1595 for div in els: 1704 for div in els:
1596 self.assertEqual(div.name, 'div') 1705 self.assertEqual(div.name, 'div')
1597 1706
1707 el = self.soup.select_one('div')
1708 self.assertEqual('main', el['id'])
1709
1710 def test_select_one_returns_none_if_no_match(self):
1711 match = self.soup.select_one('nonexistenttag')
1712 self.assertEqual(None, match)
1713
1714
1598 def test_tag_in_tag_one(self): 1715 def test_tag_in_tag_one(self):
1599 els = self.soup.select('div div') 1716 els = self.soup.select('div div')
1600 self.assertSelects('div div', ['inner']) 1717 self.assertSelects('div div', ['inner', 'data1'])
1601 1718
1602 def test_tag_in_tag_many(self): 1719 def test_tag_in_tag_many(self):
1603 for selector in ('html div', 'html body div', 'body div'): 1720 for selector in ('html div', 'html body div', 'body div'):
1604 self.assertSelects(selector, ['main', 'inner', 'footer']) 1721 self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
1605 1722
1606 def test_tag_no_match(self): 1723 def test_tag_no_match(self):
1607 self.assertEqual(len(self.soup.select('del')), 0) 1724 self.assertEqual(len(self.soup.select('del')), 0)
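select_one(), tested above, is another 4.4 addition: it returns the first element matching a CSS selector, or None when nothing matches. A short illustration, assuming html.parser:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="main"></div><div id="footer"></div>', "html.parser")
print(soup.select_one("div")["id"])        # main -- only the first match
print(soup.select_one("nonexistenttag"))   # None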
@@ -1609,6 +1726,20 @@ class TestSoupSelector(TreeTest):
1609 def test_invalid_tag(self): 1726 def test_invalid_tag(self):
1610 self.assertRaises(ValueError, self.soup.select, 'tag%t') 1727 self.assertRaises(ValueError, self.soup.select, 'tag%t')
1611 1728
1729 def test_select_dashed_tag_ids(self):
1730 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
1731
1732 def test_select_dashed_by_id(self):
1733 dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
1734 self.assertEqual(dashed[0].name, 'custom-dashed-tag')
1735 self.assertEqual(dashed[0]['id'], 'dash2')
1736
1737 def test_dashed_tag_text(self):
1738 self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
1739
1740 def test_select_dashed_matches_find_all(self):
1741 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
1742
1612 def test_header_tags(self): 1743 def test_header_tags(self):
1613 self.assertSelectMultiple( 1744 self.assertSelectMultiple(
1614 ('h1', ['header1']), 1745 ('h1', ['header1']),
@@ -1709,6 +1840,7 @@ class TestSoupSelector(TreeTest):
1709 ('[id^="m"]', ['me', 'main']), 1840 ('[id^="m"]', ['me', 'main']),
1710 ('div[id^="m"]', ['main']), 1841 ('div[id^="m"]', ['main']),
1711 ('a[id^="m"]', ['me']), 1842 ('a[id^="m"]', ['me']),
1843 ('div[data-tag^="dashed"]', ['data1'])
1712 ) 1844 )
1713 1845
1714 def test_attribute_endswith(self): 1846 def test_attribute_endswith(self):
@@ -1716,8 +1848,8 @@ class TestSoupSelector(TreeTest):
1716 ('[href$=".css"]', ['l1']), 1848 ('[href$=".css"]', ['l1']),
1717 ('link[href$=".css"]', ['l1']), 1849 ('link[href$=".css"]', ['l1']),
1718 ('link[id$="1"]', ['l1']), 1850 ('link[id$="1"]', ['l1']),
1719 ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), 1851 ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
1720 ('div[id$="1"]', []), 1852 ('div[id$="1"]', ['data1']),
1721 ('[id$="noending"]', []), 1853 ('[id$="noending"]', []),
1722 ) 1854 )
1723 1855
@@ -1730,7 +1862,6 @@ class TestSoupSelector(TreeTest):
1730 ('[rel*="notstyle"]', []), 1862 ('[rel*="notstyle"]', []),
1731 ('link[rel*="notstyle"]', []), 1863 ('link[rel*="notstyle"]', []),
1732 ('link[href*="bla"]', ['l1']), 1864 ('link[href*="bla"]', ['l1']),
1733 ('a[href*="http://"]', ['bob', 'me']),
1734 ('[href*="http://"]', ['bob', 'me']), 1865 ('[href*="http://"]', ['bob', 'me']),
1735 ('[id*="p"]', ['pmulti', 'p1']), 1866 ('[id*="p"]', ['pmulti', 'p1']),
1736 ('div[id*="m"]', ['main']), 1867 ('div[id*="m"]', ['main']),
@@ -1739,8 +1870,8 @@ class TestSoupSelector(TreeTest):
1739 ('[href*=".css"]', ['l1']), 1870 ('[href*=".css"]', ['l1']),
1740 ('link[href*=".css"]', ['l1']), 1871 ('link[href*=".css"]', ['l1']),
1741 ('link[id*="1"]', ['l1']), 1872 ('link[id*="1"]', ['l1']),
1742 ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), 1873 ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
1743 ('div[id*="1"]', []), 1874 ('div[id*="1"]', ['data1']),
1744 ('[id*="noending"]', []), 1875 ('[id*="noending"]', []),
1745 # New for this test 1876 # New for this test
1746 ('[href*="."]', ['bob', 'me', 'l1']), 1877 ('[href*="."]', ['bob', 'me', 'l1']),
@@ -1748,6 +1879,7 @@ class TestSoupSelector(TreeTest):
1748 ('link[href*="."]', ['l1']), 1879 ('link[href*="."]', ['l1']),
1749 ('div[id*="n"]', ['main', 'inner']), 1880 ('div[id*="n"]', ['main', 'inner']),
1750 ('div[id*="nn"]', ['inner']), 1881 ('div[id*="nn"]', ['inner']),
1882 ('div[data-tag*="edval"]', ['data1'])
1751 ) 1883 )
1752 1884
1753 def test_attribute_exact_or_hypen(self): 1885 def test_attribute_exact_or_hypen(self):
@@ -1767,18 +1899,27 @@ class TestSoupSelector(TreeTest):
1767 ('p[class]', ['p1', 'pmulti']), 1899 ('p[class]', ['p1', 'pmulti']),
1768 ('[blah]', []), 1900 ('[blah]', []),
1769 ('p[blah]', []), 1901 ('p[blah]', []),
1902 ('div[data-tag]', ['data1'])
1770 ) 1903 )
1771 1904
1905 def test_unsupported_pseudoclass(self):
1906 self.assertRaises(
1907 NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
1908
1909 self.assertRaises(
1910 NotImplementedError, self.soup.select, "a:nth-of-type(a)")
1911
1912
1772 def test_nth_of_type(self): 1913 def test_nth_of_type(self):
1773 # Try to select first paragraph 1914 # Try to select first paragraph
1774 els = self.soup.select('div#inner p:nth-of-type(1)') 1915 els = self.soup.select('div#inner p:nth-of-type(1)')
1775 self.assertEqual(len(els), 1) 1916 self.assertEqual(len(els), 1)
1776 self.assertEqual(els[0].string, u'Some text') 1917 self.assertEqual(els[0].string, 'Some text')
1777 1918
1778 # Try to select third paragraph 1919 # Try to select third paragraph
1779 els = self.soup.select('div#inner p:nth-of-type(3)') 1920 els = self.soup.select('div#inner p:nth-of-type(3)')
1780 self.assertEqual(len(els), 1) 1921 self.assertEqual(len(els), 1)
1781 self.assertEqual(els[0].string, u'Another') 1922 self.assertEqual(els[0].string, 'Another')
1782 1923
1783 # Try to select (non-existent!) fourth paragraph 1924 # Try to select (non-existent!) fourth paragraph
1784 els = self.soup.select('div#inner p:nth-of-type(4)') 1925 els = self.soup.select('div#inner p:nth-of-type(4)')
@@ -1791,7 +1932,7 @@ class TestSoupSelector(TreeTest):
1791 def test_nth_of_type_direct_descendant(self): 1932 def test_nth_of_type_direct_descendant(self):
1792 els = self.soup.select('div#inner > p:nth-of-type(1)') 1933 els = self.soup.select('div#inner > p:nth-of-type(1)')
1793 self.assertEqual(len(els), 1) 1934 self.assertEqual(len(els), 1)
1794 self.assertEqual(els[0].string, u'Some text') 1935 self.assertEqual(els[0].string, 'Some text')
1795 1936
1796 def test_id_child_selector_nth_of_type(self): 1937 def test_id_child_selector_nth_of_type(self):
1797 self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) 1938 self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@@ -1803,7 +1944,7 @@ class TestSoupSelector(TreeTest):
1803 selected = inner.select("div") 1944 selected = inner.select("div")
1804 # The <div id="inner"> tag was selected. The <div id="footer"> 1945 # The <div id="inner"> tag was selected. The <div id="footer">
1805 # tag was not. 1946 # tag was not.
1806 self.assertSelectsIDs(selected, ['inner']) 1947 self.assertSelectsIDs(selected, ['inner', 'data1'])
1807 1948
1808 def test_overspecified_child_id(self): 1949 def test_overspecified_child_id(self):
1809 self.assertSelects(".fancy #inner", ['inner']) 1950 self.assertSelects(".fancy #inner", ['inner'])
@@ -1827,3 +1968,44 @@ class TestSoupSelector(TreeTest):
1827 1968
1828 def test_sibling_combinator_wont_select_same_tag_twice(self): 1969 def test_sibling_combinator_wont_select_same_tag_twice(self):
1829 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) 1970 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
1971
1972 # Test the selector grouping operator (the comma)
1973 def test_multiple_select(self):
1974 self.assertSelects('x, y', ['xid', 'yid'])
1975
1976 def test_multiple_select_with_no_space(self):
1977 self.assertSelects('x,y', ['xid', 'yid'])
1978
1979 def test_multiple_select_with_more_space(self):
1980 self.assertSelects('x, y', ['xid', 'yid'])
1981
1982 def test_multiple_select_duplicated(self):
1983 self.assertSelects('x, x', ['xid'])
1984
1985 def test_multiple_select_sibling(self):
1986 self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
1987
1988 def test_multiple_select_tag_and_direct_descendant(self):
1989 self.assertSelects('x, y > z', ['xid', 'zidb'])
1990
1991 def test_multiple_select_direct_descendant_and_tags(self):
1992 self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
1993
1994 def test_multiple_select_indirect_descendant(self):
1995 self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
1996
1997 def test_invalid_multiple_select(self):
1998 self.assertRaises(ValueError, self.soup.select, ',x, y')
1999 self.assertRaises(ValueError, self.soup.select, 'x,,y')
2000
2001 def test_multiple_select_attrs(self):
2002 self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
2003
2004 def test_multiple_select_ids(self):
2005 self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
2006
2007 def test_multiple_select_nested(self):
2008 self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
2009
2010
2011
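The tests appended at the end cover the selector grouping operator (the comma), which select() supports as of 4.4, including in combination with child and sibling combinators; malformed groups such as ",x, y" or "x,,y" raise ValueError. A brief sketch against markup shaped like the x/y/z fixture above, assuming html.parser (result order is not guaranteed here, hence the sorting):

from bs4 import BeautifulSoup

html = '<div><x id="xid"><z id="zida"></z></x><y id="yid"><z id="zidb"></z></y></div>'
soup = BeautifulSoup(html, "html.parser")

print(sorted(el["id"] for el in soup.select("x, y")))       # ['xid', 'yid']
print(sorted(el["id"] for el in soup.select("x, y > z")))   # ['xid', 'zidb']
# soup.select(",x, y") would raise ValueError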