summary | refs | log | tree | commit | diff | stats
path: root/bitbake/lib/bs4/tests/test_soup.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/tests/test_soup.py')
-rw-r--r-- bitbake/lib/bs4/tests/test_soup.py 107
1 files changed, 78 insertions, 29 deletions
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py
index 47ac245f99..f87949e3d3 100644
--- a/bitbake/lib/bs4/tests/test_soup.py
+++ b/bitbake/lib/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
1# -*- coding: utf-8 -*- 1# -*- coding: utf-8 -*-
2"""Tests of Beautiful Soup as a whole.""" 2"""Tests of Beautiful Soup as a whole."""
3 3
4from pdb import set_trace
4import logging 5import logging
5import unittest 6import unittest
6import sys 7import sys
@@ -20,6 +21,7 @@ import bs4.dammit
20from bs4.dammit import ( 21from bs4.dammit import (
21 EntitySubstitution, 22 EntitySubstitution,
22 UnicodeDammit, 23 UnicodeDammit,
24 EncodingDetector,
23) 25)
24from bs4.testing import ( 26from bs4.testing import (
25 SoupTest, 27 SoupTest,
@@ -30,7 +32,7 @@ import warnings
30try: 32try:
31 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 33 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
32 LXML_PRESENT = True 34 LXML_PRESENT = True
33except ImportError, e: 35except ImportError as e:
34 LXML_PRESENT = False 36 LXML_PRESENT = False
35 37
36PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) 38PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
@@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
39class TestConstructor(SoupTest): 41class TestConstructor(SoupTest):
40 42
41 def test_short_unicode_input(self): 43 def test_short_unicode_input(self):
42 data = u"<h1>éé</h1>" 44 data = "<h1>éé</h1>"
43 soup = self.soup(data) 45 soup = self.soup(data)
44 self.assertEqual(u"éé", soup.h1.string) 46 self.assertEqual("éé", soup.h1.string)
45 47
46 def test_embedded_null(self): 48 def test_embedded_null(self):
47 data = u"<h1>foo\0bar</h1>" 49 data = "<h1>foo\0bar</h1>"
48 soup = self.soup(data) 50 soup = self.soup(data)
49 self.assertEqual(u"foo\0bar", soup.h1.string) 51 self.assertEqual("foo\0bar", soup.h1.string)
50 52
53 def test_exclude_encodings(self):
54 utf8_data = "Räksmörgås".encode("utf-8")
55 soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
56 self.assertEqual("windows-1252", soup.original_encoding)
51 57
52class TestDeprecatedConstructorArguments(SoupTest): 58
59class TestWarnings(SoupTest):
60
61 def _no_parser_specified(self, s, is_there=True):
62 v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
63 self.assertTrue(v)
64
65 def test_warning_if_no_parser_specified(self):
66 with warnings.catch_warnings(record=True) as w:
67 soup = self.soup("<a><b></b></a>")
68 msg = str(w[0].message)
69 self._assert_no_parser_specified(msg)
70
71 def test_warning_if_parser_specified_too_vague(self):
72 with warnings.catch_warnings(record=True) as w:
73 soup = self.soup("<a><b></b></a>", "html")
74 msg = str(w[0].message)
75 self._assert_no_parser_specified(msg)
76
77 def test_no_warning_if_explicit_parser_specified(self):
78 with warnings.catch_warnings(record=True) as w:
79 soup = self.soup("<a><b></b></a>", "html.parser")
80 self.assertEqual([], w)
53 81
54 def test_parseOnlyThese_renamed_to_parse_only(self): 82 def test_parseOnlyThese_renamed_to_parse_only(self):
55 with warnings.catch_warnings(record=True) as w: 83 with warnings.catch_warnings(record=True) as w:
@@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase):
117 def test_simple_html_substitution(self): 145 def test_simple_html_substitution(self):
118 # Unicode characters corresponding to named HTML entites 146 # Unicode characters corresponding to named HTML entites
119 # are substituted, and no others. 147 # are substituted, and no others.
120 s = u"foo\u2200\N{SNOWMAN}\u00f5bar" 148 s = "foo\u2200\N{SNOWMAN}\u00f5bar"
121 self.assertEqual(self.sub.substitute_html(s), 149 self.assertEqual(self.sub.substitute_html(s),
122 u"foo&forall;\N{SNOWMAN}&otilde;bar") 150 "foo&forall;\N{SNOWMAN}&otilde;bar")
123 151
124 def test_smart_quote_substitution(self): 152 def test_smart_quote_substitution(self):
125 # MS smart quotes are a common source of frustration, so we 153 # MS smart quotes are a common source of frustration, so we
@@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest):
184 212
185 def setUp(self): 213 def setUp(self):
186 super(TestEncodingConversion, self).setUp() 214 super(TestEncodingConversion, self).setUp()
187 self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' 215 self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
188 self.utf8_data = self.unicode_data.encode("utf-8") 216 self.utf8_data = self.unicode_data.encode("utf-8")
189 # Just so you know what it looks like. 217 # Just so you know what it looks like.
190 self.assertEqual( 218 self.assertEqual(
@@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest):
204 ascii = b"<foo>a</foo>" 232 ascii = b"<foo>a</foo>"
205 soup_from_ascii = self.soup(ascii) 233 soup_from_ascii = self.soup(ascii)
206 unicode_output = soup_from_ascii.decode() 234 unicode_output = soup_from_ascii.decode()
207 self.assertTrue(isinstance(unicode_output, unicode)) 235 self.assertTrue(isinstance(unicode_output, str))
208 self.assertEqual(unicode_output, self.document_for(ascii.decode())) 236 self.assertEqual(unicode_output, self.document_for(ascii.decode()))
209 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 237 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
210 finally: 238 finally:
@@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest):
216 # is not set. 244 # is not set.
217 soup_from_unicode = self.soup(self.unicode_data) 245 soup_from_unicode = self.soup(self.unicode_data)
218 self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 246 self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
219 self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') 247 self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
220 self.assertEqual(soup_from_unicode.original_encoding, None) 248 self.assertEqual(soup_from_unicode.original_encoding, None)
221 249
222 def test_utf8_in_unicode_out(self): 250 def test_utf8_in_unicode_out(self):
@@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest):
224 # attribute is set. 252 # attribute is set.
225 soup_from_utf8 = self.soup(self.utf8_data) 253 soup_from_utf8 = self.soup(self.utf8_data)
226 self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 254 self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
227 self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') 255 self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
228 256
229 def test_utf8_out(self): 257 def test_utf8_out(self):
230 # The internal data structures can be encoded as UTF-8. 258 # The internal data structures can be encoded as UTF-8.
@@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest):
235 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 263 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
236 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 264 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
237 def test_attribute_name_containing_unicode_characters(self): 265 def test_attribute_name_containing_unicode_characters(self):
238 markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' 266 markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
239 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 267 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
240 268
241class TestUnicodeDammit(unittest.TestCase): 269class TestUnicodeDammit(unittest.TestCase):
242 """Standalone tests of UnicodeDammit.""" 270 """Standalone tests of UnicodeDammit."""
243 271
244 def test_unicode_input(self): 272 def test_unicode_input(self):
245 markup = u"I'm already Unicode! \N{SNOWMAN}" 273 markup = "I'm already Unicode! \N{SNOWMAN}"
246 dammit = UnicodeDammit(markup) 274 dammit = UnicodeDammit(markup)
247 self.assertEqual(dammit.unicode_markup, markup) 275 self.assertEqual(dammit.unicode_markup, markup)
248 276
@@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase):
250 markup = b"<foo>\x91\x92\x93\x94</foo>" 278 markup = b"<foo>\x91\x92\x93\x94</foo>"
251 dammit = UnicodeDammit(markup) 279 dammit = UnicodeDammit(markup)
252 self.assertEqual( 280 self.assertEqual(
253 dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") 281 dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
254 282
255 def test_smart_quotes_to_xml_entities(self): 283 def test_smart_quotes_to_xml_entities(self):
256 markup = b"<foo>\x91\x92\x93\x94</foo>" 284 markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase):
271 dammit.unicode_markup, """<foo>''""</foo>""") 299 dammit.unicode_markup, """<foo>''""</foo>""")
272 300
273 def test_detect_utf8(self): 301 def test_detect_utf8(self):
274 utf8 = b"\xc3\xa9" 302 utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
275 dammit = UnicodeDammit(utf8) 303 dammit = UnicodeDammit(utf8)
276 self.assertEqual(dammit.unicode_markup, u'\xe9')
277 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 304 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
305 self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
306
278 307
279 def test_convert_hebrew(self): 308 def test_convert_hebrew(self):
280 hebrew = b"\xed\xe5\xec\xf9" 309 hebrew = b"\xed\xe5\xec\xf9"
281 dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 310 dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
282 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 311 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
283 self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') 312 self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
284 313
285 def test_dont_see_smart_quotes_where_there_are_none(self): 314 def test_dont_see_smart_quotes_where_there_are_none(self):
286 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 315 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase):
289 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 318 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
290 319
291 def test_ignore_inappropriate_codecs(self): 320 def test_ignore_inappropriate_codecs(self):
292 utf8_data = u"Räksmörgås".encode("utf-8") 321 utf8_data = "Räksmörgås".encode("utf-8")
293 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 322 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
294 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 323 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
295 324
296 def test_ignore_invalid_codecs(self): 325 def test_ignore_invalid_codecs(self):
297 utf8_data = u"Räksmörgås".encode("utf-8") 326 utf8_data = "Räksmörgås".encode("utf-8")
298 for bad_encoding in ['.utf8', '...', 'utF---16.!']: 327 for bad_encoding in ['.utf8', '...', 'utF---16.!']:
299 dammit = UnicodeDammit(utf8_data, [bad_encoding]) 328 dammit = UnicodeDammit(utf8_data, [bad_encoding])
300 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 329 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
301 330
331 def test_exclude_encodings(self):
332 # This is UTF-8.
333 utf8_data = "Räksmörgås".encode("utf-8")
334
335 # But if we exclude UTF-8 from consideration, the guess is
336 # Windows-1252.
337 dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
338 self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
339
340 # And if we exclude that, there is no valid guess at all.
341 dammit = UnicodeDammit(
342 utf8_data, exclude_encodings=["utf-8", "windows-1252"])
343 self.assertEqual(dammit.original_encoding, None)
344
345 def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
346 detected = EncodingDetector(
347 b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
348 encodings = list(detected.encodings)
349 assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
350
302 def test_detect_html5_style_meta_tag(self): 351 def test_detect_html5_style_meta_tag(self):
303 352
304 for data in ( 353 for data in (
@@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase):
337 bs4.dammit.chardet_dammit = noop 386 bs4.dammit.chardet_dammit = noop
338 dammit = UnicodeDammit(doc) 387 dammit = UnicodeDammit(doc)
339 self.assertEqual(True, dammit.contains_replacement_characters) 388 self.assertEqual(True, dammit.contains_replacement_characters)
340 self.assertTrue(u"\ufffd" in dammit.unicode_markup) 389 self.assertTrue("\ufffd" in dammit.unicode_markup)
341 390
342 soup = BeautifulSoup(doc, "html.parser") 391 soup = BeautifulSoup(doc, "html.parser")
343 self.assertTrue(soup.contains_replacement_characters) 392 self.assertTrue(soup.contains_replacement_characters)
@@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase):
349 # A document written in UTF-16LE will have its byte order marker stripped. 398 # A document written in UTF-16LE will have its byte order marker stripped.
350 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 399 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
351 dammit = UnicodeDammit(data) 400 dammit = UnicodeDammit(data)
352 self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) 401 self.assertEqual("<a>áé</a>", dammit.unicode_markup)
353 self.assertEqual("utf-16le", dammit.original_encoding) 402 self.assertEqual("utf-16le", dammit.original_encoding)
354 403
355 def test_detwingle(self): 404 def test_detwingle(self):
356 # Here's a UTF8 document. 405 # Here's a UTF8 document.
357 utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") 406 utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
358 407
359 # Here's a Windows-1252 document. 408 # Here's a Windows-1252 document.
360 windows_1252 = ( 409 windows_1252 = (
361 u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 410 "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
362 u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 411 "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
363 412
364 # Through some unholy alchemy, they've been stuck together. 413 # Through some unholy alchemy, they've been stuck together.
365 doc = utf8 + windows_1252 + utf8 414 doc = utf8 + windows_1252 + utf8
@@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase):
374 423
375 fixed = UnicodeDammit.detwingle(doc) 424 fixed = UnicodeDammit.detwingle(doc)
376 self.assertEqual( 425 self.assertEqual(
377 u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 426 "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
378 427
379 def test_detwingle_ignores_multibyte_characters(self): 428 def test_detwingle_ignores_multibyte_characters(self):
380 # Each of these characters has a UTF-8 representation ending 429 # Each of these characters has a UTF-8 representation ending
@@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase):
382 # Windows-1252. But our code knows to skip over multibyte 431 # Windows-1252. But our code knows to skip over multibyte
383 # UTF-8 characters, so they'll survive the process unscathed. 432 # UTF-8 characters, so they'll survive the process unscathed.
384 for tricky_unicode_char in ( 433 for tricky_unicode_char in (
385 u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' 434 "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
386 u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' 435 "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
387 u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. 436 "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
388 ): 437 ):
389 input = tricky_unicode_char.encode("utf8") 438 input = tricky_unicode_char.encode("utf8")
390 self.assertTrue(input.endswith(b'\x93')) 439 self.assertTrue(input.endswith(b'\x93'))