diff options
author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-05-06 09:06:51 +0100 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-05-16 23:32:40 +0100 |
commit | 8d49bef632a0486e0172e543a6c2622398ed7a8c (patch) | |
tree | e1df010f269ba33c3b53300bd16f030873b75363 /bitbake/lib/bs4/tests/test_soup.py | |
parent | 64182f6a89761fbdb7929da067ca1e7d4e89bbb7 (diff) | |
download | poky-8d49bef632a0486e0172e543a6c2622398ed7a8c.tar.gz |
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's
recommendation for Python 3 use.
(Bitbake rev: f06e0f8052ba44eeb9ce701192cdf19252b2646d)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/tests/test_soup.py')
-rw-r--r-- | bitbake/lib/bs4/tests/test_soup.py | 107 |
1 files changed, 78 insertions, 29 deletions
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py index 47ac245f99..f87949e3d3 100644 --- a/bitbake/lib/bs4/tests/test_soup.py +++ b/bitbake/lib/bs4/tests/test_soup.py | |||
@@ -1,6 +1,7 @@ | |||
1 | # -*- coding: utf-8 -*- | 1 | # -*- coding: utf-8 -*- |
2 | """Tests of Beautiful Soup as a whole.""" | 2 | """Tests of Beautiful Soup as a whole.""" |
3 | 3 | ||
4 | from pdb import set_trace | ||
4 | import logging | 5 | import logging |
5 | import unittest | 6 | import unittest |
6 | import sys | 7 | import sys |
@@ -20,6 +21,7 @@ import bs4.dammit | |||
20 | from bs4.dammit import ( | 21 | from bs4.dammit import ( |
21 | EntitySubstitution, | 22 | EntitySubstitution, |
22 | UnicodeDammit, | 23 | UnicodeDammit, |
24 | EncodingDetector, | ||
23 | ) | 25 | ) |
24 | from bs4.testing import ( | 26 | from bs4.testing import ( |
25 | SoupTest, | 27 | SoupTest, |
@@ -30,7 +32,7 @@ import warnings | |||
30 | try: | 32 | try: |
31 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML | 33 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML |
32 | LXML_PRESENT = True | 34 | LXML_PRESENT = True |
33 | except ImportError, e: | 35 | except ImportError as e: |
34 | LXML_PRESENT = False | 36 | LXML_PRESENT = False |
35 | 37 | ||
36 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) | 38 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) |
@@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) | |||
39 | class TestConstructor(SoupTest): | 41 | class TestConstructor(SoupTest): |
40 | 42 | ||
41 | def test_short_unicode_input(self): | 43 | def test_short_unicode_input(self): |
42 | data = u"<h1>éé</h1>" | 44 | data = "<h1>éé</h1>" |
43 | soup = self.soup(data) | 45 | soup = self.soup(data) |
44 | self.assertEqual(u"éé", soup.h1.string) | 46 | self.assertEqual("éé", soup.h1.string) |
45 | 47 | ||
46 | def test_embedded_null(self): | 48 | def test_embedded_null(self): |
47 | data = u"<h1>foo\0bar</h1>" | 49 | data = "<h1>foo\0bar</h1>" |
48 | soup = self.soup(data) | 50 | soup = self.soup(data) |
49 | self.assertEqual(u"foo\0bar", soup.h1.string) | 51 | self.assertEqual("foo\0bar", soup.h1.string) |
50 | 52 | ||
53 | def test_exclude_encodings(self): | ||
54 | utf8_data = "Räksmörgås".encode("utf-8") | ||
55 | soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) | ||
56 | self.assertEqual("windows-1252", soup.original_encoding) | ||
51 | 57 | ||
52 | class TestDeprecatedConstructorArguments(SoupTest): | 58 | |
59 | class TestWarnings(SoupTest): | ||
60 | |||
61 | def _no_parser_specified(self, s, is_there=True): | ||
62 | v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) | ||
63 | self.assertTrue(v) | ||
64 | |||
65 | def test_warning_if_no_parser_specified(self): | ||
66 | with warnings.catch_warnings(record=True) as w: | ||
67 | soup = self.soup("<a><b></b></a>") | ||
68 | msg = str(w[0].message) | ||
69 | self._assert_no_parser_specified(msg) | ||
70 | |||
71 | def test_warning_if_parser_specified_too_vague(self): | ||
72 | with warnings.catch_warnings(record=True) as w: | ||
73 | soup = self.soup("<a><b></b></a>", "html") | ||
74 | msg = str(w[0].message) | ||
75 | self._assert_no_parser_specified(msg) | ||
76 | |||
77 | def test_no_warning_if_explicit_parser_specified(self): | ||
78 | with warnings.catch_warnings(record=True) as w: | ||
79 | soup = self.soup("<a><b></b></a>", "html.parser") | ||
80 | self.assertEqual([], w) | ||
53 | 81 | ||
54 | def test_parseOnlyThese_renamed_to_parse_only(self): | 82 | def test_parseOnlyThese_renamed_to_parse_only(self): |
55 | with warnings.catch_warnings(record=True) as w: | 83 | with warnings.catch_warnings(record=True) as w: |
@@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase): | |||
117 | def test_simple_html_substitution(self): | 145 | def test_simple_html_substitution(self): |
118 | # Unicode characters corresponding to named HTML entities | 146 | # Unicode characters corresponding to named HTML entities |
119 | # are substituted, and no others. | 147 | # are substituted, and no others. |
120 | s = u"foo\u2200\N{SNOWMAN}\u00f5bar" | 148 | s = "foo\u2200\N{SNOWMAN}\u00f5bar" |
121 | self.assertEqual(self.sub.substitute_html(s), | 149 | self.assertEqual(self.sub.substitute_html(s), |
122 | u"foo∀\N{SNOWMAN}õbar") | 150 | "foo∀\N{SNOWMAN}õbar") |
123 | 151 | ||
124 | def test_smart_quote_substitution(self): | 152 | def test_smart_quote_substitution(self): |
125 | # MS smart quotes are a common source of frustration, so we | 153 | # MS smart quotes are a common source of frustration, so we |
@@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest): | |||
184 | 212 | ||
185 | def setUp(self): | 213 | def setUp(self): |
186 | super(TestEncodingConversion, self).setUp() | 214 | super(TestEncodingConversion, self).setUp() |
187 | self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' | 215 | self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' |
188 | self.utf8_data = self.unicode_data.encode("utf-8") | 216 | self.utf8_data = self.unicode_data.encode("utf-8") |
189 | # Just so you know what it looks like. | 217 | # Just so you know what it looks like. |
190 | self.assertEqual( | 218 | self.assertEqual( |
@@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest): | |||
204 | ascii = b"<foo>a</foo>" | 232 | ascii = b"<foo>a</foo>" |
205 | soup_from_ascii = self.soup(ascii) | 233 | soup_from_ascii = self.soup(ascii) |
206 | unicode_output = soup_from_ascii.decode() | 234 | unicode_output = soup_from_ascii.decode() |
207 | self.assertTrue(isinstance(unicode_output, unicode)) | 235 | self.assertTrue(isinstance(unicode_output, str)) |
208 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) | 236 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) |
209 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") | 237 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") |
210 | finally: | 238 | finally: |
@@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest): | |||
216 | # is not set. | 244 | # is not set. |
217 | soup_from_unicode = self.soup(self.unicode_data) | 245 | soup_from_unicode = self.soup(self.unicode_data) |
218 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) | 246 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) |
219 | self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') | 247 | self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') |
220 | self.assertEqual(soup_from_unicode.original_encoding, None) | 248 | self.assertEqual(soup_from_unicode.original_encoding, None) |
221 | 249 | ||
222 | def test_utf8_in_unicode_out(self): | 250 | def test_utf8_in_unicode_out(self): |
@@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest): | |||
224 | # attribute is set. | 252 | # attribute is set. |
225 | soup_from_utf8 = self.soup(self.utf8_data) | 253 | soup_from_utf8 = self.soup(self.utf8_data) |
226 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) | 254 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) |
227 | self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') | 255 | self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') |
228 | 256 | ||
229 | def test_utf8_out(self): | 257 | def test_utf8_out(self): |
230 | # The internal data structures can be encoded as UTF-8. | 258 | # The internal data structures can be encoded as UTF-8. |
@@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest): | |||
235 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, | 263 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, |
236 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") | 264 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") |
237 | def test_attribute_name_containing_unicode_characters(self): | 265 | def test_attribute_name_containing_unicode_characters(self): |
238 | markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' | 266 | markup = '<div><a \N{SNOWMAN}="snowman"></a></div>' |
239 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) | 267 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) |
240 | 268 | ||
241 | class TestUnicodeDammit(unittest.TestCase): | 269 | class TestUnicodeDammit(unittest.TestCase): |
242 | """Standalone tests of UnicodeDammit.""" | 270 | """Standalone tests of UnicodeDammit.""" |
243 | 271 | ||
244 | def test_unicode_input(self): | 272 | def test_unicode_input(self): |
245 | markup = u"I'm already Unicode! \N{SNOWMAN}" | 273 | markup = "I'm already Unicode! \N{SNOWMAN}" |
246 | dammit = UnicodeDammit(markup) | 274 | dammit = UnicodeDammit(markup) |
247 | self.assertEqual(dammit.unicode_markup, markup) | 275 | self.assertEqual(dammit.unicode_markup, markup) |
248 | 276 | ||
@@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase): | |||
250 | markup = b"<foo>\x91\x92\x93\x94</foo>" | 278 | markup = b"<foo>\x91\x92\x93\x94</foo>" |
251 | dammit = UnicodeDammit(markup) | 279 | dammit = UnicodeDammit(markup) |
252 | self.assertEqual( | 280 | self.assertEqual( |
253 | dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") | 281 | dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>") |
254 | 282 | ||
255 | def test_smart_quotes_to_xml_entities(self): | 283 | def test_smart_quotes_to_xml_entities(self): |
256 | markup = b"<foo>\x91\x92\x93\x94</foo>" | 284 | markup = b"<foo>\x91\x92\x93\x94</foo>" |
@@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase): | |||
271 | dammit.unicode_markup, """<foo>''""</foo>""") | 299 | dammit.unicode_markup, """<foo>''""</foo>""") |
272 | 300 | ||
273 | def test_detect_utf8(self): | 301 | def test_detect_utf8(self): |
274 | utf8 = b"\xc3\xa9" | 302 | utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" |
275 | dammit = UnicodeDammit(utf8) | 303 | dammit = UnicodeDammit(utf8) |
276 | self.assertEqual(dammit.unicode_markup, u'\xe9') | ||
277 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | 304 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
305 | self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') | ||
306 | |||
278 | 307 | ||
279 | def test_convert_hebrew(self): | 308 | def test_convert_hebrew(self): |
280 | hebrew = b"\xed\xe5\xec\xf9" | 309 | hebrew = b"\xed\xe5\xec\xf9" |
281 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) | 310 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) |
282 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') | 311 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') |
283 | self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') | 312 | self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') |
284 | 313 | ||
285 | def test_dont_see_smart_quotes_where_there_are_none(self): | 314 | def test_dont_see_smart_quotes_where_there_are_none(self): |
286 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" | 315 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" |
@@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase): | |||
289 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) | 318 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) |
290 | 319 | ||
291 | def test_ignore_inappropriate_codecs(self): | 320 | def test_ignore_inappropriate_codecs(self): |
292 | utf8_data = u"Räksmörgås".encode("utf-8") | 321 | utf8_data = "Räksmörgås".encode("utf-8") |
293 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) | 322 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) |
294 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | 323 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
295 | 324 | ||
296 | def test_ignore_invalid_codecs(self): | 325 | def test_ignore_invalid_codecs(self): |
297 | utf8_data = u"Räksmörgås".encode("utf-8") | 326 | utf8_data = "Räksmörgås".encode("utf-8") |
298 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: | 327 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: |
299 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) | 328 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) |
300 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | 329 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
301 | 330 | ||
331 | def test_exclude_encodings(self): | ||
332 | # This is UTF-8. | ||
333 | utf8_data = "Räksmörgås".encode("utf-8") | ||
334 | |||
335 | # But if we exclude UTF-8 from consideration, the guess is | ||
336 | # Windows-1252. | ||
337 | dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) | ||
338 | self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') | ||
339 | |||
340 | # And if we exclude that, there is no valid guess at all. | ||
341 | dammit = UnicodeDammit( | ||
342 | utf8_data, exclude_encodings=["utf-8", "windows-1252"]) | ||
343 | self.assertEqual(dammit.original_encoding, None) | ||
344 | |||
345 | def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): | ||
346 | detected = EncodingDetector( | ||
347 | b'<?xml version="1.0" encoding="UTF-\xdb" ?>') | ||
348 | encodings = list(detected.encodings) | ||
349 | assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings | ||
350 | |||
302 | def test_detect_html5_style_meta_tag(self): | 351 | def test_detect_html5_style_meta_tag(self): |
303 | 352 | ||
304 | for data in ( | 353 | for data in ( |
@@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase): | |||
337 | bs4.dammit.chardet_dammit = noop | 386 | bs4.dammit.chardet_dammit = noop |
338 | dammit = UnicodeDammit(doc) | 387 | dammit = UnicodeDammit(doc) |
339 | self.assertEqual(True, dammit.contains_replacement_characters) | 388 | self.assertEqual(True, dammit.contains_replacement_characters) |
340 | self.assertTrue(u"\ufffd" in dammit.unicode_markup) | 389 | self.assertTrue("\ufffd" in dammit.unicode_markup) |
341 | 390 | ||
342 | soup = BeautifulSoup(doc, "html.parser") | 391 | soup = BeautifulSoup(doc, "html.parser") |
343 | self.assertTrue(soup.contains_replacement_characters) | 392 | self.assertTrue(soup.contains_replacement_characters) |
@@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase): | |||
349 | # A document written in UTF-16LE will have its byte order marker stripped. | 398 | # A document written in UTF-16LE will have its byte order marker stripped. |
350 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' | 399 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' |
351 | dammit = UnicodeDammit(data) | 400 | dammit = UnicodeDammit(data) |
352 | self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) | 401 | self.assertEqual("<a>áé</a>", dammit.unicode_markup) |
353 | self.assertEqual("utf-16le", dammit.original_encoding) | 402 | self.assertEqual("utf-16le", dammit.original_encoding) |
354 | 403 | ||
355 | def test_detwingle(self): | 404 | def test_detwingle(self): |
356 | # Here's a UTF8 document. | 405 | # Here's a UTF8 document. |
357 | utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") | 406 | utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") |
358 | 407 | ||
359 | # Here's a Windows-1252 document. | 408 | # Here's a Windows-1252 document. |
360 | windows_1252 = ( | 409 | windows_1252 = ( |
361 | u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" | 410 | "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" |
362 | u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") | 411 | "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") |
363 | 412 | ||
364 | # Through some unholy alchemy, they've been stuck together. | 413 | # Through some unholy alchemy, they've been stuck together. |
365 | doc = utf8 + windows_1252 + utf8 | 414 | doc = utf8 + windows_1252 + utf8 |
@@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase): | |||
374 | 423 | ||
375 | fixed = UnicodeDammit.detwingle(doc) | 424 | fixed = UnicodeDammit.detwingle(doc) |
376 | self.assertEqual( | 425 | self.assertEqual( |
377 | u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) | 426 | "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) |
378 | 427 | ||
379 | def test_detwingle_ignores_multibyte_characters(self): | 428 | def test_detwingle_ignores_multibyte_characters(self): |
380 | # Each of these characters has a UTF-8 representation ending | 429 | # Each of these characters has a UTF-8 representation ending |
@@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase): | |||
382 | # Windows-1252. But our code knows to skip over multibyte | 431 | # Windows-1252. But our code knows to skip over multibyte |
383 | # UTF-8 characters, so they'll survive the process unscathed. | 432 | # UTF-8 characters, so they'll survive the process unscathed. |
384 | for tricky_unicode_char in ( | 433 | for tricky_unicode_char in ( |
385 | u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' | 434 | "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' |
386 | u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' | 435 | "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' |
387 | u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. | 436 | "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. |
388 | ): | 437 | ): |
389 | input = tricky_unicode_char.encode("utf8") | 438 | input = tricky_unicode_char.encode("utf8") |
390 | self.assertTrue(input.endswith(b'\x93')) | 439 | self.assertTrue(input.endswith(b'\x93')) |