From 8d49bef632a0486e0172e543a6c2622398ed7a8c Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Fri, 6 May 2016 09:06:51 +0100 Subject: bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version) Upgrade to 4.4.1 which has been run through 2to3 as per the maintainers recommendation for v3 use. (Bitbake rev: f06e0f8052ba44eeb9ce701192cdf19252b2646d) Signed-off-by: Richard Purdie --- bitbake/lib/bs4/tests/test_builder_registry.py | 14 +- bitbake/lib/bs4/tests/test_html5lib.py | 19 +- bitbake/lib/bs4/tests/test_htmlparser.py | 13 ++ bitbake/lib/bs4/tests/test_lxml.py | 19 +- bitbake/lib/bs4/tests/test_soup.py | 107 ++++++--- bitbake/lib/bs4/tests/test_tree.py | 294 ++++++++++++++++++++----- 6 files changed, 357 insertions(+), 109 deletions(-) (limited to 'bitbake/lib/bs4/tests') diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py index 92ad10fb04..90cad82933 100644 --- a/bitbake/lib/bs4/tests/test_builder_registry.py +++ b/bitbake/lib/bs4/tests/test_builder_registry.py @@ -1,6 +1,7 @@ """Tests of the builder registry.""" import unittest +import warnings from bs4 import BeautifulSoup from bs4.builder import ( @@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase): HTMLParserTreeBuilder) def test_beautifulsoup_constructor_does_lookup(self): - # You can pass in a string. - BeautifulSoup("", features="html") - # Or a list of strings. - BeautifulSoup("", features=["html", "fast"]) + + with warnings.catch_warnings(record=True) as w: + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. + + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) # You'll get an exception if BS can't find an appropriate # builder. diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py index 594c3e1f26..a7494ca5ba 100644 --- a/bitbake/lib/bs4/tests/test_html5lib.py +++ b/bitbake/lib/bs4/tests/test_html5lib.py @@ -5,7 +5,7 @@ import warnings try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True -except ImportError, e: +except ImportError as e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( @@ -74,12 +74,25 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): def test_reparented_markup(self): markup = '
<p><em>foo</p>\n<p>bar<a></a></em></p>
' soup = self.soup(markup) - self.assertEqual(u"
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>
", soup.body.decode()) + self.assertEqual("
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>
", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_ends_with_whitespace(self): markup = '
<p><em>foo</p>\n<p>bar<a></a></em></p>
\n' soup = self.soup(markup) - self.assertEqual(u"
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>
\n</body>", soup.body.decode()) + self.assertEqual("
<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>
\n</body>", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) + + def test_processing_instruction(self): + """Processing instructions become comments.""" + markup = b"""<?PITarget PIContent?>""" + soup = self.soup(markup) + assert str(soup).startswith("<!--?PITarget PIContent?-->") + + def test_cloned_multivalue_node(self): + markup = b"""
<a class="my_class"><p></a>
""" + soup = self.soup(markup) + a1, a2 = soup.find_all('a') + self.assertEqual(a1, a2) + assert a1 is not a2 diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py index bcb5ed232f..b45e35f999 100644 --- a/bitbake/lib/bs4/tests/test_htmlparser.py +++ b/bitbake/lib/bs4/tests/test_htmlparser.py @@ -1,6 +1,8 @@ """Tests to ensure that the html.parser tree builder generates good trees.""" +from pdb import set_trace +import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder @@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_namespaced_public_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. pass + + def test_builder_is_pickled(self): + """Unlike most tree builders, HTMLParserTreeBuilder and will + be restored after pickling. + """ + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertTrue(isinstance(loaded.builder, type(tree.builder))) + + diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py index 2b2e9b7e78..6c2a1d73eb 100644 --- a/bitbake/lib/bs4/tests/test_lxml.py +++ b/bitbake/lib/bs4/tests/test_lxml.py @@ -7,7 +7,7 @@ try: import lxml.etree LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError, e: +except ImportError as e: LXML_PRESENT = False LXML_VERSION = (0,) @@ -62,24 +62,9 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): # if one is installed. with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("") - self.assertEqual(u"", unicode(soup.b)) + self.assertEqual("", str(soup.b)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) - def test_real_xhtml_document(self): - """lxml strips the XML definition from an XHTML doc, which is fine.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8").replace(b"\n", b''), - markup.replace(b'\n', b'').replace( - b'', b'')) - - @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its XML tree builder.") diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py index 47ac245f99..f87949e3d3 100644 --- a/bitbake/lib/bs4/tests/test_soup.py +++ b/bitbake/lib/bs4/tests/test_soup.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" +from pdb import set_trace import logging import unittest import sys @@ -20,6 +21,7 @@ import bs4.dammit from bs4.dammit import ( EntitySubstitution, UnicodeDammit, + EncodingDetector, ) from bs4.testing import ( SoupTest, @@ -30,7 +32,7 @@ import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True -except ImportError, e: +except ImportError as e: LXML_PRESENT = False PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) @@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): def test_short_unicode_input(self): - data = u"
<h1>éé</h1>
" + data = "
<h1>éé</h1>
" soup = self.soup(data) - self.assertEqual(u"éé", soup.h1.string) + self.assertEqual("éé", soup.h1.string) def test_embedded_null(self): - data = u"
<h1>foo\0bar</h1>
" + data = "
<h1>foo\0bar</h1>
" soup = self.soup(data) - self.assertEqual(u"foo\0bar", soup.h1.string) + self.assertEqual("foo\0bar", soup.h1.string) + def test_exclude_encodings(self): + utf8_data = "Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) -class TestDeprecatedConstructorArguments(SoupTest): + +class TestWarnings(SoupTest): + + def _no_parser_specified(self, s, is_there=True): + v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) + self.assertTrue(v) + + def test_warning_if_no_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_warning_if_parser_specified_too_vague(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("", "html") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_no_warning_if_explicit_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("", "html.parser") + self.assertEqual([], w) def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: @@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase): def test_simple_html_substitution(self): # Unicode characters corresponding to named HTML entites # are substituted, and no others. - s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + s = "foo\u2200\N{SNOWMAN}\u00f5bar" self.assertEqual(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") + "foo∀\N{SNOWMAN}õbar") def test_smart_quote_substitution(self): # MS smart quotes are a common source of frustration, so we @@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest): def setUp(self): super(TestEncodingConversion, self).setUp() - self.unicode_data = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( @@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest): ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) + self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: @@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest): # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) - self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): @@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest): # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) - self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. @@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest): PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") def test_attribute_name_containing_unicode_characters(self): - markup = u'
' + markup = '
' self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" def test_unicode_input(self): - markup = u"I'm already Unicode! \N{SNOWMAN}" + markup = "I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup) @@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase): markup = b"\x91\x92\x93\x94" dammit = UnicodeDammit(markup) self.assertEqual( - dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") + dammit.unicode_markup, "\u2018\u2019\u201c\u201d") def test_smart_quotes_to_xml_entities(self): markup = b"\x91\x92\x93\x94" @@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase): dammit.unicode_markup, """''""""") def test_detect_utf8(self): - utf8 = b"\xc3\xa9" + utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" dammit = UnicodeDammit(utf8) - self.assertEqual(dammit.unicode_markup, u'\xe9') self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') + def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') - self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" @@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase): self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_ignore_invalid_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + def test_exclude_encodings(self): + # This is UTF-8. + utf8_data = "Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. + dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') + + # And if we exclude that, there is no valid guess at all. + dammit = UnicodeDammit( + utf8_data, exclude_encodings=["utf-8", "windows-1252"]) + self.assertEqual(dammit.original_encoding, None) + + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): + detected = EncodingDetector( + b'') + encodings = list(detected.encodings) + assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings + def test_detect_html5_style_meta_tag(self): for data in ( @@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase): bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) - self.assertTrue(u"\ufffd" in dammit.unicode_markup) + self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) @@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase): # A document written in UTF-16LE will have its byte order marker stripped. 
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) - self.assertEqual(u"áé", dammit.unicode_markup) + self.assertEqual("áé", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding) def test_detwingle(self): # Here's a UTF8 document. - utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") + utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") # Here's a Windows-1252 document. windows_1252 = ( - u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" - u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") # Through some unholy alchemy, they've been stuck together. doc = utf8 + windows_1252 + utf8 @@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase): fixed = UnicodeDammit.detwingle(doc) self.assertEqual( - u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) + "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) def test_detwingle_ignores_multibyte_characters(self): # Each of these characters has a UTF-8 representation ending @@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase): # Windows-1252. But our code knows to skip over multibyte # UTF-8 characters, so they'll survive the process unscathed. for tricky_unicode_char in ( - u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' - u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' - u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. ): input = tricky_unicode_char.encode("utf8") self.assertTrue(input.endswith(b'\x93')) diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py index f8515c0ea1..6d3e67f311 100644 --- a/bitbake/lib/bs4/tests/test_tree.py +++ b/bitbake/lib/bs4/tests/test_tree.py @@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ +from pdb import set_trace import copy import pickle import re @@ -19,8 +20,10 @@ from bs4.builder import ( HTMLParserTreeBuilder, ) from bs4.element import ( + PY3K, CData, Comment, + Declaration, Doctype, NavigableString, SoupStrainer, @@ -67,8 +70,14 @@ class TestFind(TreeTest): self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): - soup = self.soup(u'
<h1>Räksmörgås</h1>
') - self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') + soup = self.soup('
<h1>Räksmörgås</h1>
') + self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') + + def test_unicode_attribute_find(self): + soup = self.soup('
<h1 id="Räksmörgås">here it is</h1>
') + str(soup) + self.assertEqual("here it is", soup.find(id='Räksmörgås').text) + def test_find_everything(self): """Test an optimization that finds all tags.""" @@ -87,16 +96,17 @@ class TestFindAll(TreeTest): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. - self.assertEqual(soup.find_all(text="bar"), [u"bar"]) + self.assertEqual(soup.find_all(string="bar"), ["bar"]) + self.assertEqual(soup.find_all(text="bar"), ["bar"]) # Match any of a number of strings. self.assertEqual( - soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) + soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), - [u"Foo", u"bar", u'\xbb']) + ["Foo", "bar", '\xbb']) # Match anything. self.assertEqual(soup.find_all(text=True), - [u"Foo", u"bar", u'\xbb']) + ["Foo", "bar", '\xbb']) def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" @@ -227,8 +237,8 @@ class TestFindAllByAttribute(TreeTest): ["Matching a.", "Matching b."]) def test_find_all_by_utf8_attribute_value(self): - peace = u"םולש".encode("utf8") - data = u''.encode("utf8") + peace = "םולש".encode("utf8") + data = ''.encode("utf8") soup = self.soup(data) self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) @@ -688,7 +698,7 @@ class TestTagCreation(SoupTest): def test_tag_inherits_self_closing_rules_from_builder(self): if XML_BUILDER_PRESENT: - xml_soup = BeautifulSoup("", "xml") + xml_soup = BeautifulSoup("", "lxml-xml") xml_br = xml_soup.new_tag("br") xml_p = xml_soup.new_tag("p") @@ -697,7 +707,7 @@ class TestTagCreation(SoupTest): self.assertEqual(b"
", xml_br.encode()) self.assertEqual(b"

", xml_p.encode()) - html_soup = BeautifulSoup("", "html") + html_soup = BeautifulSoup("", "html.parser") html_br = html_soup.new_tag("br") html_p = html_soup.new_tag("p") @@ -773,6 +783,14 @@ class TestTreeModification(SoupTest): new_a = a.unwrap() self.assertEqual(a, new_a) + def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self): + soup = self.soup("FooBar") + a = soup.a + a.extract() + self.assertEqual(None, a.parent) + self.assertRaises(ValueError, a.unwrap) + self.assertRaises(ValueError, a.replace_with, soup.c) + def test_replace_tag_with_itself(self): text = "Foo" soup = self.soup(text) @@ -1067,6 +1085,31 @@ class TestTreeModification(SoupTest): self.assertEqual(foo_2, soup.a.string) self.assertEqual(bar_2, soup.b.string) + def test_extract_multiples_of_same_tag(self): + soup = self.soup(""" + + + + + + + + + +""") + [soup.script.extract() for i in soup.find_all("script")] + self.assertEqual("\n\n\n", str(soup.body)) + + + def test_extract_works_when_element_is_surrounded_by_identical_strings(self): + soup = self.soup( + '\n' + 'hi\n' + '') + soup.find('body').extract() + self.assertEqual(None, soup.find('body')) + + def test_clear(self): """Tag.clear()""" soup = self.soup("
<p><a>String <em>Italicized</em></a> and another</p>
") @@ -1287,27 +1330,72 @@ class TestPersistence(SoupTest): def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) loaded = pickle.loads(dumped) self.assertEqual(loaded.decode(), soup.decode()) + def test_copy_navigablestring_is_not_attached_to_tree(self): + html = "FooBar" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) + self.assertEqual(s1, s2) + self.assertEqual(None, s2.parent) + self.assertEqual(None, s2.next_element) + self.assertNotEqual(None, s1.next_sibling) + self.assertEqual(None, s2.next_sibling) + self.assertEqual(None, s2.previous_element) + + def test_copy_navigablestring_subclass_has_same_type(self): + html = "" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) + self.assertEqual(s1, s2) + self.assertTrue(isinstance(s2, Comment)) + + def test_copy_entire_soup(self): + html = "
<div><b>Foo<a></a></b><b>Bar</b></div>
end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + self.assertEqual(soup, soup_copy) + + def test_copy_tag_copies_contents(self): + html = "
<div><b>Foo<a></a></b><b>Bar</b></div>
end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. + self.assertEqual(str(div), str(div_copy)) + self.assertEqual(div, div_copy) + + # But they're not the same object. + self.assertFalse(div is div_copy) + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. + self.assertEqual(None, div_copy.parent) + self.assertEqual(None, div_copy.previous_element) + self.assertEqual(None, div_copy.find(string='Bar').next_element) + self.assertNotEqual(None, div.find(string='Bar').next_element) class TestSubstitutions(SoupTest): def test_default_formatter_is_minimal(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. self.assertEqual( decoded, self.document_for( - u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) def test_formatter_html(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="html") self.assertEqual( @@ -1315,49 +1403,49 @@ class TestSubstitutions(SoupTest): self.document_for("<<Sacré bleu!>>")) def test_formatter_minimal(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. self.assertEqual( decoded, self.document_for( - u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) def test_formatter_null(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter=None) # Neither the angle brackets nor the e-with-acute are converted. # This is not valid HTML, but it's what the user wanted. self.assertEqual(decoded, - self.document_for(u"<>")) + self.document_for("<>")) def test_formatter_custom(self): - markup = u"<foo>bar" + markup = "<foo>bar" soup = self.soup(markup) decoded = soup.decode(formatter = lambda x: x.upper()) # Instead of normal entity conversion code, the custom # callable is called on every string. 
self.assertEqual( decoded, - self.document_for(u"BAR")) + self.document_for("BAR")) def test_formatter_is_run_on_attribute_values(self): - markup = u'e' + markup = 'e' soup = self.soup(markup) a = soup.a - expect_minimal = u'e' + expect_minimal = 'e' self.assertEqual(expect_minimal, a.decode()) self.assertEqual(expect_minimal, a.decode(formatter="minimal")) - expect_html = u'e' + expect_html = 'e' self.assertEqual(expect_html, a.decode(formatter="html")) self.assertEqual(markup, a.decode(formatter=None)) - expect_upper = u'E' + expect_upper = 'E' self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) def test_formatter_skips_script_tag_for_html_documents(self): @@ -1366,7 +1454,7 @@ class TestSubstitutions(SoupTest): console.log("< < hey > > "); """ - encoded = BeautifulSoup(doc).encode() + encoded = BeautifulSoup(doc, 'html.parser').encode() self.assertTrue(b"< < hey > >" in encoded) def test_formatter_skips_style_tag_for_html_documents(self): @@ -1375,7 +1463,7 @@ class TestSubstitutions(SoupTest): console.log("< < hey > > "); """ - encoded = BeautifulSoup(doc).encode() + encoded = BeautifulSoup(doc, 'html.parser').encode() self.assertTrue(b"< < hey > >" in encoded) def test_prettify_leaves_preformatted_text_alone(self): @@ -1383,24 +1471,24 @@ class TestSubstitutions(SoupTest): # Everything outside the
<pre> tag is reformatted, but everything
         # inside is left alone.
         self.assertEqual(
-            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
+            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>
', soup.div.prettify()) def test_prettify_accepts_formatter(self): - soup = BeautifulSoup("foo") + soup = BeautifulSoup("foo", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) def test_prettify_outputs_unicode_by_default(self): soup = self.soup("") - self.assertEqual(unicode, type(soup.prettify())) + self.assertEqual(str, type(soup.prettify())) def test_prettify_can_encode_data(self): soup = self.soup("") self.assertEqual(bytes, type(soup.prettify("utf-8"))) def test_html_entity_substitution_off_by_default(self): - markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" + markup = "Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" soup = self.soup(markup) encoded = soup.b.encode("utf-8") self.assertEqual(encoded, markup.encode('utf-8')) @@ -1444,45 +1532,53 @@ class TestEncoding(SoupTest): """Test the ability to encode objects into strings.""" def test_unicode_string_can_be_encoded(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) self.assertEqual(soup.b.string.encode("utf-8"), - u"\N{SNOWMAN}".encode("utf-8")) + "\N{SNOWMAN}".encode("utf-8")) def test_tag_containing_unicode_string_can_be_encoded(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) self.assertEqual( soup.b.encode("utf-8"), html.encode("utf-8")) def test_encoding_substitutes_unrecognized_characters_by_default(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) self.assertEqual(soup.b.encode("ascii"), b"") def test_encoding_can_be_made_strict(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) self.assertRaises( UnicodeEncodeError, soup.encode, "ascii", errors="strict") def test_decode_contents(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) - self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) + self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents()) def test_encode_contents(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( + "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( encoding="utf8")) def test_deprecated_renderContents(self): - html = u"\N{SNOWMAN}" + html = "\N{SNOWMAN}" soup = self.soup(html) self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) + "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) + + def test_repr(self): + html = "\N{SNOWMAN}" + soup = self.soup(html) + if PY3K: + self.assertEqual(html, repr(soup)) + else: + self.assertEqual(b'\\u2603', repr(soup)) class TestNavigableStringSubclasses(SoupTest): @@ -1522,6 +1618,9 @@ class TestNavigableStringSubclasses(SoupTest): soup.insert(1, doctype) self.assertEqual(soup.encode(), b"\n") + def test_declaration(self): + d = Declaration("foo") + self.assertEqual("", d.output_ready()) class TestSoupSelector(TreeTest): @@ -1534,7 +1633,7 @@ class TestSoupSelector(TreeTest): - +Hello there.
 <div id="main" class="fancy">
 <div id="inner">
 <h1 id="header1">An H1</h1>
@@ -1552,8 +1651,18 @@ class TestSoupSelector(TreeTest): span2a1 + +
+<x id="xid">
+<z id="zida"/>
+<z id="zidab"/>
+<z id="zidac"/>
+</x>
+<y id="yid">
+<z id="zidb"/>
+</y>
 <p lang="en" id="lang-en">English</p>
 <p lang="en-gb" id="lang-en-gb">English UK</p>
 <p lang="en-us" id="lang-en-us">English US</p>
@@ -1565,7 +1674,7 @@ class TestSoupSelector(TreeTest): """ def setUp(self): - self.soup = BeautifulSoup(self.HTML) + self.soup = BeautifulSoup(self.HTML, 'html.parser') def assertSelects(self, selector, expected_ids): el_ids = [el['id'] for el in self.soup.select(selector)] @@ -1587,21 +1696,29 @@ class TestSoupSelector(TreeTest): els = self.soup.select('title') self.assertEqual(len(els), 1) self.assertEqual(els[0].name, 'title') - self.assertEqual(els[0].contents, [u'The title']) + self.assertEqual(els[0].contents, ['The title']) def test_one_tag_many(self): els = self.soup.select('div') - self.assertEqual(len(els), 3) + self.assertEqual(len(els), 4) for div in els: self.assertEqual(div.name, 'div') + el = self.soup.select_one('div') + self.assertEqual('main', el['id']) + + def test_select_one_returns_none_if_no_match(self): + match = self.soup.select_one('nonexistenttag') + self.assertEqual(None, match) + + def test_tag_in_tag_one(self): els = self.soup.select('div div') - self.assertSelects('div div', ['inner']) + self.assertSelects('div div', ['inner', 'data1']) def test_tag_in_tag_many(self): for selector in ('html div', 'html body div', 'body div'): - self.assertSelects(selector, ['main', 'inner', 'footer']) + self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) def test_tag_no_match(self): self.assertEqual(len(self.soup.select('del')), 0) @@ -1609,6 +1726,20 @@ class TestSoupSelector(TreeTest): def test_invalid_tag(self): self.assertRaises(ValueError, self.soup.select, 'tag%t') + def test_select_dashed_tag_ids(self): + self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) + + def test_select_dashed_by_id(self): + dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') + self.assertEqual(dashed[0].name, 'custom-dashed-tag') + self.assertEqual(dashed[0]['id'], 'dash2') + + def test_dashed_tag_text(self): + self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') + + def test_select_dashed_matches_find_all(self): + self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) + def test_header_tags(self): self.assertSelectMultiple( ('h1', ['header1']), @@ -1709,6 +1840,7 @@ class TestSoupSelector(TreeTest): ('[id^="m"]', ['me', 'main']), ('div[id^="m"]', ['main']), ('a[id^="m"]', ['me']), + ('div[data-tag^="dashed"]', ['data1']) ) def test_attribute_endswith(self): @@ -1716,8 +1848,8 @@ class TestSoupSelector(TreeTest): ('[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']), ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), - ('div[id$="1"]', []), + ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), + ('div[id$="1"]', ['data1']), ('[id$="noending"]', []), ) @@ -1730,7 +1862,6 @@ class TestSoupSelector(TreeTest): ('[rel*="notstyle"]', []), ('link[rel*="notstyle"]', []), ('link[href*="bla"]', ['l1']), - ('a[href*="http://"]', ['bob', 'me']), ('[href*="http://"]', ['bob', 'me']), ('[id*="p"]', ['pmulti', 'p1']), ('div[id*="m"]', ['main']), @@ -1739,8 +1870,8 @@ class TestSoupSelector(TreeTest): ('[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']), ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), - ('div[id*="1"]', []), + ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), + ('div[id*="1"]', ['data1']), ('[id*="noending"]', []), # New for this test ('[href*="."]', ['bob', 'me', 'l1']), @@ -1748,6 +1879,7 @@ class 
TestSoupSelector(TreeTest): ('link[href*="."]', ['l1']), ('div[id*="n"]', ['main', 'inner']), ('div[id*="nn"]', ['inner']), + ('div[data-tag*="edval"]', ['data1']) ) def test_attribute_exact_or_hypen(self): @@ -1767,18 +1899,27 @@ class TestSoupSelector(TreeTest): ('p[class]', ['p1', 'pmulti']), ('[blah]', []), ('p[blah]', []), + ('div[data-tag]', ['data1']) ) + def test_unsupported_pseudoclass(self): + self.assertRaises( + NotImplementedError, self.soup.select, "a:no-such-pseudoclass") + + self.assertRaises( + NotImplementedError, self.soup.select, "a:nth-of-type(a)") + + def test_nth_of_type(self): # Try to select first paragraph els = self.soup.select('div#inner p:nth-of-type(1)') self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') + self.assertEqual(els[0].string, 'Some text') # Try to select third paragraph els = self.soup.select('div#inner p:nth-of-type(3)') self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Another') + self.assertEqual(els[0].string, 'Another') # Try to select (non-existent!) fourth paragraph els = self.soup.select('div#inner p:nth-of-type(4)') @@ -1791,7 +1932,7 @@ class TestSoupSelector(TreeTest): def test_nth_of_type_direct_descendant(self): els = self.soup.select('div#inner > p:nth-of-type(1)') self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') + self.assertEqual(els[0].string, 'Some text') def test_id_child_selector_nth_of_type(self): self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) @@ -1803,7 +1944,7 @@ class TestSoupSelector(TreeTest): selected = inner.select("div") # The
<div> tag was selected. The