summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--bitbake/lib/bs4/__init__.py112
-rw-r--r--bitbake/lib/bs4/builder/__init__.py7
-rw-r--r--bitbake/lib/bs4/builder/_html5lib.py71
-rw-r--r--bitbake/lib/bs4/builder/_htmlparser.py56
-rw-r--r--bitbake/lib/bs4/builder/_lxml.py47
-rw-r--r--bitbake/lib/bs4/dammit.py31
-rw-r--r--bitbake/lib/bs4/diagnose.py68
-rw-r--r--bitbake/lib/bs4/element.py346
-rw-r--r--bitbake/lib/bs4/testing.py129
-rw-r--r--bitbake/lib/bs4/tests/test_builder_registry.py14
-rw-r--r--bitbake/lib/bs4/tests/test_html5lib.py19
-rw-r--r--bitbake/lib/bs4/tests/test_htmlparser.py13
-rw-r--r--bitbake/lib/bs4/tests/test_lxml.py19
-rw-r--r--bitbake/lib/bs4/tests/test_soup.py107
-rw-r--r--bitbake/lib/bs4/tests/test_tree.py294
15 files changed, 972 insertions, 361 deletions
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py
index 7ba34269af..f6fdfd50b1 100644
--- a/bitbake/lib/bs4/__init__.py
+++ b/bitbake/lib/bs4/__init__.py
@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17""" 17"""
18 18
19__author__ = "Leonard Richardson (leonardr@segfault.org)" 19__author__ = "Leonard Richardson (leonardr@segfault.org)"
20__version__ = "4.3.2" 20__version__ = "4.4.1"
21__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" 21__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
22__license__ = "MIT" 22__license__ = "MIT"
23 23
24__all__ = ['BeautifulSoup'] 24__all__ = ['BeautifulSoup']
@@ -45,7 +45,7 @@ from .element import (
45 45
46# The very first thing we do is give a useful error if someone is 46# The very first thing we do is give a useful error if someone is
47# running this code under Python 3 without converting it. 47# running this code under Python 3 without converting it.
48syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 48'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49 49
50class BeautifulSoup(Tag): 50class BeautifulSoup(Tag):
51 """ 51 """
@@ -69,7 +69,7 @@ class BeautifulSoup(Tag):
69 like HTML's <br> tag), call handle_starttag and then 69 like HTML's <br> tag), call handle_starttag and then
70 handle_endtag. 70 handle_endtag.
71 """ 71 """
72 ROOT_TAG_NAME = u'[document]' 72 ROOT_TAG_NAME = '[document]'
73 73
74 # If the end-user gives no indication which tree builder they 74 # If the end-user gives no indication which tree builder they
75 # want, look for one with these features. 75 # want, look for one with these features.
@@ -77,8 +77,11 @@ class BeautifulSoup(Tag):
77 77
78 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 78 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
79 79
80 NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
81
80 def __init__(self, markup="", features=None, builder=None, 82 def __init__(self, markup="", features=None, builder=None,
81 parse_only=None, from_encoding=None, **kwargs): 83 parse_only=None, from_encoding=None, exclude_encodings=None,
84 **kwargs):
82 """The Soup object is initialized as the 'root tag', and the 85 """The Soup object is initialized as the 'root tag', and the
83 provided markup (which can be a string or a file-like object) 86 provided markup (which can be a string or a file-like object)
84 is fed into the underlying parser.""" 87 is fed into the underlying parser."""
@@ -114,9 +117,9 @@ class BeautifulSoup(Tag):
114 del kwargs['isHTML'] 117 del kwargs['isHTML']
115 warnings.warn( 118 warnings.warn(
116 "BS4 does not respect the isHTML argument to the " 119 "BS4 does not respect the isHTML argument to the "
117 "BeautifulSoup constructor. You can pass in features='html' " 120 "BeautifulSoup constructor. Suggest you use "
118 "or features='xml' to get a builder capable of handling " 121 "features='lxml' for HTML and features='lxml-xml' for "
119 "one or the other.") 122 "XML.")
120 123
121 def deprecated_argument(old_name, new_name): 124 def deprecated_argument(old_name, new_name):
122 if old_name in kwargs: 125 if old_name in kwargs:
@@ -135,12 +138,13 @@ class BeautifulSoup(Tag):
135 "fromEncoding", "from_encoding") 138 "fromEncoding", "from_encoding")
136 139
137 if len(kwargs) > 0: 140 if len(kwargs) > 0:
138 arg = kwargs.keys().pop() 141 arg = list(kwargs.keys()).pop()
139 raise TypeError( 142 raise TypeError(
140 "__init__() got an unexpected keyword argument '%s'" % arg) 143 "__init__() got an unexpected keyword argument '%s'" % arg)
141 144
142 if builder is None: 145 if builder is None:
143 if isinstance(features, basestring): 146 original_features = features
147 if isinstance(features, str):
144 features = [features] 148 features = [features]
145 if features is None or len(features) == 0: 149 if features is None or len(features) == 0:
146 features = self.DEFAULT_BUILDER_FEATURES 150 features = self.DEFAULT_BUILDER_FEATURES
@@ -151,6 +155,16 @@ class BeautifulSoup(Tag):
151 "requested: %s. Do you need to install a parser library?" 155 "requested: %s. Do you need to install a parser library?"
152 % ",".join(features)) 156 % ",".join(features))
153 builder = builder_class() 157 builder = builder_class()
158 if not (original_features == builder.NAME or
159 original_features in builder.ALTERNATE_NAMES):
160 if builder.is_xml:
161 markup_type = "XML"
162 else:
163 markup_type = "HTML"
164 warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
165 parser=builder.NAME,
166 markup_type=markup_type))
167
154 self.builder = builder 168 self.builder = builder
155 self.is_xml = builder.is_xml 169 self.is_xml = builder.is_xml
156 self.builder.soup = self 170 self.builder.soup = self
@@ -164,7 +178,7 @@ class BeautifulSoup(Tag):
164 # involving passing non-markup to Beautiful Soup. 178 # involving passing non-markup to Beautiful Soup.
165 # Beautiful Soup will still parse the input as markup, 179 # Beautiful Soup will still parse the input as markup,
166 # just in case that's what the user really wants. 180 # just in case that's what the user really wants.
167 if (isinstance(markup, unicode) 181 if (isinstance(markup, str)
168 and not os.path.supports_unicode_filenames): 182 and not os.path.supports_unicode_filenames):
169 possible_filename = markup.encode("utf8") 183 possible_filename = markup.encode("utf8")
170 else: 184 else:
@@ -172,25 +186,30 @@ class BeautifulSoup(Tag):
172 is_file = False 186 is_file = False
173 try: 187 try:
174 is_file = os.path.exists(possible_filename) 188 is_file = os.path.exists(possible_filename)
175 except Exception, e: 189 except Exception as e:
176 # This is almost certainly a problem involving 190 # This is almost certainly a problem involving
177 # characters not valid in filenames on this 191 # characters not valid in filenames on this
178 # system. Just let it go. 192 # system. Just let it go.
179 pass 193 pass
180 if is_file: 194 if is_file:
195 if isinstance(markup, str):
196 markup = markup.encode("utf8")
181 warnings.warn( 197 warnings.warn(
182 '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) 198 '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
183 if markup[:5] == "http:" or markup[:6] == "https:": 199 if markup[:5] == "http:" or markup[:6] == "https:":
184 # TODO: This is ugly but I couldn't get it to work in 200 # TODO: This is ugly but I couldn't get it to work in
185 # Python 3 otherwise. 201 # Python 3 otherwise.
186 if ((isinstance(markup, bytes) and not b' ' in markup) 202 if ((isinstance(markup, bytes) and not b' ' in markup)
187 or (isinstance(markup, unicode) and not u' ' in markup)): 203 or (isinstance(markup, str) and not ' ' in markup)):
204 if isinstance(markup, str):
205 markup = markup.encode("utf8")
188 warnings.warn( 206 warnings.warn(
189 '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) 207 '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
190 208
191 for (self.markup, self.original_encoding, self.declared_html_encoding, 209 for (self.markup, self.original_encoding, self.declared_html_encoding,
192 self.contains_replacement_characters) in ( 210 self.contains_replacement_characters) in (
193 self.builder.prepare_markup(markup, from_encoding)): 211 self.builder.prepare_markup(
212 markup, from_encoding, exclude_encodings=exclude_encodings)):
194 self.reset() 213 self.reset()
195 try: 214 try:
196 self._feed() 215 self._feed()
@@ -203,6 +222,16 @@ class BeautifulSoup(Tag):
203 self.markup = None 222 self.markup = None
204 self.builder.soup = None 223 self.builder.soup = None
205 224
225 def __copy__(self):
226 return type(self)(self.encode(), builder=self.builder)
227
228 def __getstate__(self):
229 # Frequently a tree builder can't be pickled.
230 d = dict(self.__dict__)
231 if 'builder' in d and not self.builder.picklable:
232 del d['builder']
233 return d
234
206 def _feed(self): 235 def _feed(self):
207 # Convert the document to Unicode. 236 # Convert the document to Unicode.
208 self.builder.reset() 237 self.builder.reset()
@@ -229,9 +258,7 @@ class BeautifulSoup(Tag):
229 258
230 def new_string(self, s, subclass=NavigableString): 259 def new_string(self, s, subclass=NavigableString):
231 """Create a new NavigableString associated with this soup.""" 260 """Create a new NavigableString associated with this soup."""
232 navigable = subclass(s) 261 return subclass(s)
233 navigable.setup()
234 return navigable
235 262
236 def insert_before(self, successor): 263 def insert_before(self, successor):
237 raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 264 raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
@@ -259,7 +286,7 @@ class BeautifulSoup(Tag):
259 286
260 def endData(self, containerClass=NavigableString): 287 def endData(self, containerClass=NavigableString):
261 if self.current_data: 288 if self.current_data:
262 current_data = u''.join(self.current_data) 289 current_data = ''.join(self.current_data)
263 # If whitespace is not preserved, and this string contains 290 # If whitespace is not preserved, and this string contains
264 # nothing but ASCII spaces, replace it with a single space 291 # nothing but ASCII spaces, replace it with a single space
265 # or newline. 292 # or newline.
@@ -290,14 +317,49 @@ class BeautifulSoup(Tag):
290 def object_was_parsed(self, o, parent=None, most_recent_element=None): 317 def object_was_parsed(self, o, parent=None, most_recent_element=None):
291 """Add an object to the parse tree.""" 318 """Add an object to the parse tree."""
292 parent = parent or self.currentTag 319 parent = parent or self.currentTag
293 most_recent_element = most_recent_element or self._most_recent_element 320 previous_element = most_recent_element or self._most_recent_element
294 o.setup(parent, most_recent_element) 321
322 next_element = previous_sibling = next_sibling = None
323 if isinstance(o, Tag):
324 next_element = o.next_element
325 next_sibling = o.next_sibling
326 previous_sibling = o.previous_sibling
327 if not previous_element:
328 previous_element = o.previous_element
329
330 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
295 331
296 if most_recent_element is not None:
297 most_recent_element.next_element = o
298 self._most_recent_element = o 332 self._most_recent_element = o
299 parent.contents.append(o) 333 parent.contents.append(o)
300 334
335 if parent.next_sibling:
336 # This node is being inserted into an element that has
337 # already been parsed. Deal with any dangling references.
338 index = parent.contents.index(o)
339 if index == 0:
340 previous_element = parent
341 previous_sibling = None
342 else:
343 previous_element = previous_sibling = parent.contents[index-1]
344 if index == len(parent.contents)-1:
345 next_element = parent.next_sibling
346 next_sibling = None
347 else:
348 next_element = next_sibling = parent.contents[index+1]
349
350 o.previous_element = previous_element
351 if previous_element:
352 previous_element.next_element = o
353 o.next_element = next_element
354 if next_element:
355 next_element.previous_element = o
356 o.next_sibling = next_sibling
357 if next_sibling:
358 next_sibling.previous_sibling = o
359 o.previous_sibling = previous_sibling
360 if previous_sibling:
361 previous_sibling.next_sibling = o
362
301 def _popToTag(self, name, nsprefix=None, inclusivePop=True): 363 def _popToTag(self, name, nsprefix=None, inclusivePop=True):
302 """Pops the tag stack up to and including the most recent 364 """Pops the tag stack up to and including the most recent
303 instance of the given tag. If inclusivePop is false, pops the tag 365 instance of the given tag. If inclusivePop is false, pops the tag
@@ -367,9 +429,9 @@ class BeautifulSoup(Tag):
367 encoding_part = '' 429 encoding_part = ''
368 if eventual_encoding != None: 430 if eventual_encoding != None:
369 encoding_part = ' encoding="%s"' % eventual_encoding 431 encoding_part = ' encoding="%s"' % eventual_encoding
370 prefix = u'<?xml version="1.0"%s?>\n' % encoding_part 432 prefix = '<?xml version="1.0"%s?>\n' % encoding_part
371 else: 433 else:
372 prefix = u'' 434 prefix = ''
373 if not pretty_print: 435 if not pretty_print:
374 indent_level = None 436 indent_level = None
375 else: 437 else:
@@ -403,4 +465,4 @@ class FeatureNotFound(ValueError):
403if __name__ == '__main__': 465if __name__ == '__main__':
404 import sys 466 import sys
405 soup = BeautifulSoup(sys.stdin) 467 soup = BeautifulSoup(sys.stdin)
406 print soup.prettify() 468 print(soup.prettify())
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
index 740f5f29cd..6ccd4d23d6 100644
--- a/bitbake/lib/bs4/builder/__init__.py
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
80class TreeBuilder(object): 80class TreeBuilder(object):
81 """Turn a document into a Beautiful Soup object tree.""" 81 """Turn a document into a Beautiful Soup object tree."""
82 82
83 NAME = "[Unknown tree builder]"
84 ALTERNATE_NAMES = []
83 features = [] 85 features = []
84 86
85 is_xml = False 87 is_xml = False
88 picklable = False
86 preserve_whitespace_tags = set() 89 preserve_whitespace_tags = set()
87 empty_element_tags = None # A tag will be considered an empty-element 90 empty_element_tags = None # A tag will be considered an empty-element
88 # tag when and only when it has no contents. 91 # tag when and only when it has no contents.
@@ -153,13 +156,13 @@ class TreeBuilder(object):
153 universal = self.cdata_list_attributes.get('*', []) 156 universal = self.cdata_list_attributes.get('*', [])
154 tag_specific = self.cdata_list_attributes.get( 157 tag_specific = self.cdata_list_attributes.get(
155 tag_name.lower(), None) 158 tag_name.lower(), None)
156 for attr in attrs.keys(): 159 for attr in list(attrs.keys()):
157 if attr in universal or (tag_specific and attr in tag_specific): 160 if attr in universal or (tag_specific and attr in tag_specific):
158 # We have a "class"-type attribute whose string 161 # We have a "class"-type attribute whose string
159 # value is a whitespace-separated list of 162 # value is a whitespace-separated list of
160 # values. Split it into a list. 163 # values. Split it into a list.
161 value = attrs[attr] 164 value = attrs[attr]
162 if isinstance(value, basestring): 165 if isinstance(value, str):
163 values = whitespace_re.split(value) 166 values = whitespace_re.split(value)
164 else: 167 else:
165 # html5lib sometimes calls setAttributes twice 168 # html5lib sometimes calls setAttributes twice
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 7de36ae75e..f0e5924ebb 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -2,6 +2,7 @@ __all__ = [
2 'HTML5TreeBuilder', 2 'HTML5TreeBuilder',
3 ] 3 ]
4 4
5from pdb import set_trace
5import warnings 6import warnings
6from bs4.builder import ( 7from bs4.builder import (
7 PERMISSIVE, 8 PERMISSIVE,
@@ -9,7 +10,10 @@ from bs4.builder import (
9 HTML_5, 10 HTML_5,
10 HTMLTreeBuilder, 11 HTMLTreeBuilder,
11 ) 12 )
12from bs4.element import NamespacedAttribute 13from bs4.element import (
14 NamespacedAttribute,
15 whitespace_re,
16)
13import html5lib 17import html5lib
14from html5lib.constants import namespaces 18from html5lib.constants import namespaces
15from bs4.element import ( 19from bs4.element import (
@@ -22,11 +26,20 @@ from bs4.element import (
22class HTML5TreeBuilder(HTMLTreeBuilder): 26class HTML5TreeBuilder(HTMLTreeBuilder):
23 """Use html5lib to build a tree.""" 27 """Use html5lib to build a tree."""
24 28
25 features = ['html5lib', PERMISSIVE, HTML_5, HTML] 29 NAME = "html5lib"
30
31 features = [NAME, PERMISSIVE, HTML_5, HTML]
26 32
27 def prepare_markup(self, markup, user_specified_encoding): 33 def prepare_markup(self, markup, user_specified_encoding,
34 document_declared_encoding=None, exclude_encodings=None):
28 # Store the user-specified encoding for use later on. 35 # Store the user-specified encoding for use later on.
29 self.user_specified_encoding = user_specified_encoding 36 self.user_specified_encoding = user_specified_encoding
37
38 # document_declared_encoding and exclude_encodings aren't used
39 # ATM because the html5lib TreeBuilder doesn't use
40 # UnicodeDammit.
41 if exclude_encodings:
42 warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
30 yield (markup, None, None, False) 43 yield (markup, None, None, False)
31 44
32 # These methods are defined by Beautiful Soup. 45 # These methods are defined by Beautiful Soup.
@@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
37 doc = parser.parse(markup, encoding=self.user_specified_encoding) 50 doc = parser.parse(markup, encoding=self.user_specified_encoding)
38 51
39 # Set the character encoding detected by the tokenizer. 52 # Set the character encoding detected by the tokenizer.
40 if isinstance(markup, unicode): 53 if isinstance(markup, str):
41 # We need to special-case this because html5lib sets 54 # We need to special-case this because html5lib sets
42 # charEncoding to UTF-8 if it gets Unicode input. 55 # charEncoding to UTF-8 if it gets Unicode input.
43 doc.original_encoding = None 56 doc.original_encoding = None
@@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
51 64
52 def test_fragment_to_document(self, fragment): 65 def test_fragment_to_document(self, fragment):
53 """See `TreeBuilder`.""" 66 """See `TreeBuilder`."""
54 return u'<html><head></head><body>%s</body></html>' % fragment 67 return '<html><head></head><body>%s</body></html>' % fragment
55 68
56 69
57class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 70class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@@ -101,7 +114,16 @@ class AttrList(object):
101 def __iter__(self): 114 def __iter__(self):
102 return list(self.attrs.items()).__iter__() 115 return list(self.attrs.items()).__iter__()
103 def __setitem__(self, name, value): 116 def __setitem__(self, name, value):
104 "set attr", name, value 117 # If this attribute is a multi-valued attribute for this element,
118 # turn its value into a list.
119 list_attr = HTML5TreeBuilder.cdata_list_attributes
120 if (name in list_attr['*']
121 or (self.element.name in list_attr
122 and name in list_attr[self.element.name])):
123 # A node that is being cloned may have already undergone
124 # this procedure.
125 if not isinstance(value, list):
126 value = whitespace_re.split(value)
105 self.element[name] = value 127 self.element[name] = value
106 def items(self): 128 def items(self):
107 return list(self.attrs.items()) 129 return list(self.attrs.items())
@@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node):
124 146
125 def appendChild(self, node): 147 def appendChild(self, node):
126 string_child = child = None 148 string_child = child = None
127 if isinstance(node, basestring): 149 if isinstance(node, str):
128 # Some other piece of code decided to pass in a string 150 # Some other piece of code decided to pass in a string
129 # instead of creating a TextElement object to contain the 151 # instead of creating a TextElement object to contain the
130 # string. 152 # string.
@@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node):
139 else: 161 else:
140 child = node.element 162 child = node.element
141 163
142 if not isinstance(child, basestring) and child.parent is not None: 164 if not isinstance(child, str) and child.parent is not None:
143 node.element.extract() 165 node.element.extract()
144 166
145 if (string_child and self.element.contents 167 if (string_child and self.element.contents
@@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node):
152 old_element.replace_with(new_element) 174 old_element.replace_with(new_element)
153 self.soup._most_recent_element = new_element 175 self.soup._most_recent_element = new_element
154 else: 176 else:
155 if isinstance(node, basestring): 177 if isinstance(node, str):
156 # Create a brand new NavigableString from this string. 178 # Create a brand new NavigableString from this string.
157 child = self.soup.new_string(node) 179 child = self.soup.new_string(node)
158 180
@@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node):
161 # immediately after the parent, if it has no children.) 183 # immediately after the parent, if it has no children.)
162 if self.element.contents: 184 if self.element.contents:
163 most_recent_element = self.element._last_descendant(False) 185 most_recent_element = self.element._last_descendant(False)
186 elif self.element.next_element is not None:
187 # Something from further ahead in the parse tree is
188 # being inserted into this earlier element. This is
189 # very annoying because it means an expensive search
190 # for the last element in the tree.
191 most_recent_element = self.soup._last_descendant()
164 else: 192 else:
165 most_recent_element = self.element 193 most_recent_element = self.element
166 194
@@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node):
172 return AttrList(self.element) 200 return AttrList(self.element)
173 201
174 def setAttributes(self, attributes): 202 def setAttributes(self, attributes):
203
175 if attributes is not None and len(attributes) > 0: 204 if attributes is not None and len(attributes) > 0:
176 205
177 converted_attributes = [] 206 converted_attributes = []
@@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node):
183 212
184 self.soup.builder._replace_cdata_list_attribute_values( 213 self.soup.builder._replace_cdata_list_attribute_values(
185 self.name, attributes) 214 self.name, attributes)
186 for name, value in attributes.items(): 215 for name, value in list(attributes.items()):
187 self.element[name] = value 216 self.element[name] = value
188 217
189 # The attributes may contain variables that need substitution. 218 # The attributes may contain variables that need substitution.
@@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node):
218 247
219 def reparentChildren(self, new_parent): 248 def reparentChildren(self, new_parent):
220 """Move all of this tag's children into another tag.""" 249 """Move all of this tag's children into another tag."""
250 # print "MOVE", self.element.contents
251 # print "FROM", self.element
252 # print "TO", new_parent.element
221 element = self.element 253 element = self.element
222 new_parent_element = new_parent.element 254 new_parent_element = new_parent.element
223 # Determine what this tag's next_element will be once all the children 255 # Determine what this tag's next_element will be once all the children
@@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node):
236 new_parents_last_descendant_next_element = new_parent_element.next_element 268 new_parents_last_descendant_next_element = new_parent_element.next_element
237 269
238 to_append = element.contents 270 to_append = element.contents
239 append_after = new_parent.element.contents 271 append_after = new_parent_element.contents
240 if len(to_append) > 0: 272 if len(to_append) > 0:
241 # Set the first child's previous_element and previous_sibling 273 # Set the first child's previous_element and previous_sibling
242 # to elements within the new parent 274 # to elements within the new parent
243 first_child = to_append[0] 275 first_child = to_append[0]
244 first_child.previous_element = new_parents_last_descendant 276 if new_parents_last_descendant:
277 first_child.previous_element = new_parents_last_descendant
278 else:
279 first_child.previous_element = new_parent_element
245 first_child.previous_sibling = new_parents_last_child 280 first_child.previous_sibling = new_parents_last_child
281 if new_parents_last_descendant:
282 new_parents_last_descendant.next_element = first_child
283 else:
284 new_parent_element.next_element = first_child
285 if new_parents_last_child:
286 new_parents_last_child.next_sibling = first_child
246 287
247 # Fix the last child's next_element and next_sibling 288 # Fix the last child's next_element and next_sibling
248 last_child = to_append[-1] 289 last_child = to_append[-1]
249 last_child.next_element = new_parents_last_descendant_next_element 290 last_child.next_element = new_parents_last_descendant_next_element
291 if new_parents_last_descendant_next_element:
292 new_parents_last_descendant_next_element.previous_element = last_child
250 last_child.next_sibling = None 293 last_child.next_sibling = None
251 294
252 for child in to_append: 295 for child in to_append:
@@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node):
257 element.contents = [] 300 element.contents = []
258 element.next_element = final_next_element 301 element.next_element = final_next_element
259 302
303 # print "DONE WITH MOVE"
304 # print "FROM", self.element
305 # print "TO", new_parent_element
306
260 def cloneNode(self): 307 def cloneNode(self):
261 tag = self.soup.new_tag(self.element.name, self.namespace) 308 tag = self.soup.new_tag(self.element.name, self.namespace)
262 node = Element(tag, self.soup, self.namespace) 309 node = Element(tag, self.soup, self.namespace)
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index ca8d8b892b..bb0a63f2f3 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -4,10 +4,16 @@ __all__ = [
4 'HTMLParserTreeBuilder', 4 'HTMLParserTreeBuilder',
5 ] 5 ]
6 6
7from HTMLParser import ( 7from html.parser import HTMLParser
8 HTMLParser, 8
9 HTMLParseError, 9try:
10 ) 10 from html.parser import HTMLParseError
11except ImportError as e:
12 # HTMLParseError is removed in Python 3.5. Since it can never be
13 # thrown in 3.5, we can just define our own class as a placeholder.
14 class HTMLParseError(Exception):
15 pass
16
11import sys 17import sys
12import warnings 18import warnings
13 19
@@ -19,10 +25,10 @@ import warnings
19# At the end of this file, we monkeypatch HTMLParser so that 25# At the end of this file, we monkeypatch HTMLParser so that
20# strict=True works well on Python 3.2.2. 26# strict=True works well on Python 3.2.2.
21major, minor, release = sys.version_info[:3] 27major, minor, release = sys.version_info[:3]
22CONSTRUCTOR_TAKES_STRICT = ( 28CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
23 major > 3 29CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
24 or (major == 3 and minor > 2) 30CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
25 or (major == 3 and minor == 2 and release >= 3)) 31
26 32
27from bs4.element import ( 33from bs4.element import (
28 CData, 34 CData,
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
63 69
64 def handle_charref(self, name): 70 def handle_charref(self, name):
65 # XXX workaround for a bug in HTMLParser. Remove this once 71 # XXX workaround for a bug in HTMLParser. Remove this once
66 # it's fixed. 72 # it's fixed in all supported versions.
73 # http://bugs.python.org/issue13633
67 if name.startswith('x'): 74 if name.startswith('x'):
68 real_name = int(name.lstrip('x'), 16) 75 real_name = int(name.lstrip('x'), 16)
69 elif name.startswith('X'): 76 elif name.startswith('X'):
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
72 real_name = int(name) 79 real_name = int(name)
73 80
74 try: 81 try:
75 data = unichr(real_name) 82 data = chr(real_name)
76 except (ValueError, OverflowError), e: 83 except (ValueError, OverflowError) as e:
77 data = u"\N{REPLACEMENT CHARACTER}" 84 data = "\N{REPLACEMENT CHARACTER}"
78 85
79 self.handle_data(data) 86 self.handle_data(data)
80 87
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
113 120
114 def handle_pi(self, data): 121 def handle_pi(self, data):
115 self.soup.endData() 122 self.soup.endData()
116 if data.endswith("?") and data.lower().startswith("xml"):
117 # "An XHTML processing instruction using the trailing '?'
118 # will cause the '?' to be included in data." - HTMLParser
119 # docs.
120 #
121 # Strip the question mark so we don't end up with two
122 # question marks.
123 data = data[:-1]
124 self.soup.handle_data(data) 123 self.soup.handle_data(data)
125 self.soup.endData(ProcessingInstruction) 124 self.soup.endData(ProcessingInstruction)
126 125
@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser):
128class HTMLParserTreeBuilder(HTMLTreeBuilder): 127class HTMLParserTreeBuilder(HTMLTreeBuilder):
129 128
130 is_xml = False 129 is_xml = False
131 features = [HTML, STRICT, HTMLPARSER] 130 picklable = True
131 NAME = HTMLPARSER
132 features = [NAME, HTML, STRICT]
132 133
133 def __init__(self, *args, **kwargs): 134 def __init__(self, *args, **kwargs):
134 if CONSTRUCTOR_TAKES_STRICT: 135 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
135 kwargs['strict'] = False 136 kwargs['strict'] = False
137 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 kwargs['convert_charrefs'] = False
136 self.parser_args = (args, kwargs) 139 self.parser_args = (args, kwargs)
137 140
138 def prepare_markup(self, markup, user_specified_encoding=None, 141 def prepare_markup(self, markup, user_specified_encoding=None,
139 document_declared_encoding=None): 142 document_declared_encoding=None, exclude_encodings=None):
140 """ 143 """
141 :return: A 4-tuple (markup, original encoding, encoding 144 :return: A 4-tuple (markup, original encoding, encoding
142 declared within markup, whether any characters had to be 145 declared within markup, whether any characters had to be
143 replaced with REPLACEMENT CHARACTER). 146 replaced with REPLACEMENT CHARACTER).
144 """ 147 """
145 if isinstance(markup, unicode): 148 if isinstance(markup, str):
146 yield (markup, None, None, False) 149 yield (markup, None, None, False)
147 return 150 return
148 151
149 try_encodings = [user_specified_encoding, document_declared_encoding] 152 try_encodings = [user_specified_encoding, document_declared_encoding]
150 dammit = UnicodeDammit(markup, try_encodings, is_html=True) 153 dammit = UnicodeDammit(markup, try_encodings, is_html=True,
154 exclude_encodings=exclude_encodings)
151 yield (dammit.markup, dammit.original_encoding, 155 yield (dammit.markup, dammit.original_encoding,
152 dammit.declared_html_encoding, 156 dammit.declared_html_encoding,
153 dammit.contains_replacement_characters) 157 dammit.contains_replacement_characters)
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
158 parser.soup = self.soup 162 parser.soup = self.soup
159 try: 163 try:
160 parser.feed(markup) 164 parser.feed(markup)
161 except HTMLParseError, e: 165 except HTMLParseError as e:
162 warnings.warn(RuntimeWarning( 166 warnings.warn(RuntimeWarning(
163 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 167 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
164 raise e 168 raise e
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index fa5d49875e..9c6c14ee65 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -4,10 +4,15 @@ __all__ = [
4 ] 4 ]
5 5
6from io import BytesIO 6from io import BytesIO
7from StringIO import StringIO 7from io import StringIO
8import collections 8import collections
9from lxml import etree 9from lxml import etree
10from bs4.element import Comment, Doctype, NamespacedAttribute 10from bs4.element import (
11 Comment,
12 Doctype,
13 NamespacedAttribute,
14 ProcessingInstruction,
15)
11from bs4.builder import ( 16from bs4.builder import (
12 FAST, 17 FAST,
13 HTML, 18 HTML,
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
25 30
26 is_xml = True 31 is_xml = True
27 32
33 NAME = "lxml-xml"
34 ALTERNATE_NAMES = ["xml"]
35
28 # Well, it's permissive by XML parser standards. 36 # Well, it's permissive by XML parser standards.
29 features = [LXML, XML, FAST, PERMISSIVE] 37 features = [NAME, LXML, XML, FAST, PERMISSIVE]
30 38
31 CHUNK_SIZE = 512 39 CHUNK_SIZE = 512
32 40
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
70 return (None, tag) 78 return (None, tag)
71 79
72 def prepare_markup(self, markup, user_specified_encoding=None, 80 def prepare_markup(self, markup, user_specified_encoding=None,
81 exclude_encodings=None,
73 document_declared_encoding=None): 82 document_declared_encoding=None):
74 """ 83 """
75 :yield: A series of 4-tuples. 84 :yield: A series of 4-tuples.
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
78 87
79 Each 4-tuple represents a strategy for parsing the document. 88 Each 4-tuple represents a strategy for parsing the document.
80 """ 89 """
81 if isinstance(markup, unicode): 90 if isinstance(markup, str):
82 # We were given Unicode. Maybe lxml can parse Unicode on 91 # We were given Unicode. Maybe lxml can parse Unicode on
83 # this system? 92 # this system?
84 yield markup, None, document_declared_encoding, False 93 yield markup, None, document_declared_encoding, False
85 94
86 if isinstance(markup, unicode): 95 if isinstance(markup, str):
87 # No, apparently not. Convert the Unicode to UTF-8 and 96 # No, apparently not. Convert the Unicode to UTF-8 and
88 # tell lxml to parse it as UTF-8. 97 # tell lxml to parse it as UTF-8.
89 yield (markup.encode("utf8"), "utf8", 98 yield (markup.encode("utf8"), "utf8",
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
95 # the document as each one in turn. 104 # the document as each one in turn.
96 is_html = not self.is_xml 105 is_html = not self.is_xml
97 try_encodings = [user_specified_encoding, document_declared_encoding] 106 try_encodings = [user_specified_encoding, document_declared_encoding]
98 detector = EncodingDetector(markup, try_encodings, is_html) 107 detector = EncodingDetector(
108 markup, try_encodings, is_html, exclude_encodings)
99 for encoding in detector.encodings: 109 for encoding in detector.encodings:
100 yield (detector.markup, encoding, document_declared_encoding, False) 110 yield (detector.markup, encoding, document_declared_encoding, False)
101 111
102 def feed(self, markup): 112 def feed(self, markup):
103 if isinstance(markup, bytes): 113 if isinstance(markup, bytes):
104 markup = BytesIO(markup) 114 markup = BytesIO(markup)
105 elif isinstance(markup, unicode): 115 elif isinstance(markup, str):
106 markup = StringIO(markup) 116 markup = StringIO(markup)
107 117
108 # Call feed() at least once, even if the markup is empty, 118 # Call feed() at least once, even if the markup is empty,
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
117 if len(data) != 0: 127 if len(data) != 0:
118 self.parser.feed(data) 128 self.parser.feed(data)
119 self.parser.close() 129 self.parser.close()
120 except (UnicodeDecodeError, LookupError, etree.ParserError), e: 130 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
121 raise ParserRejectedMarkup(str(e)) 131 raise ParserRejectedMarkup(str(e))
122 132
123 def close(self): 133 def close(self):
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
135 self.nsmaps.append(None) 145 self.nsmaps.append(None)
136 elif len(nsmap) > 0: 146 elif len(nsmap) > 0:
137 # A new namespace mapping has come into play. 147 # A new namespace mapping has come into play.
138 inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 148 inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
139 self.nsmaps.append(inverted_nsmap) 149 self.nsmaps.append(inverted_nsmap)
140 # Also treat the namespace mapping as a set of attributes on the 150 # Also treat the namespace mapping as a set of attributes on the
141 # tag, so we can recreate it later. 151 # tag, so we can recreate it later.
142 attrs = attrs.copy() 152 attrs = attrs.copy()
143 for prefix, namespace in nsmap.items(): 153 for prefix, namespace in list(nsmap.items()):
144 attribute = NamespacedAttribute( 154 attribute = NamespacedAttribute(
145 "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 155 "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
146 attrs[attribute] = namespace 156 attrs[attribute] = namespace
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
149 # from lxml with namespaces attached to their names, and 159 # from lxml with namespaces attached to their names, and
150 # turn then into NamespacedAttribute objects. 160 # turn then into NamespacedAttribute objects.
151 new_attrs = {} 161 new_attrs = {}
152 for attr, value in attrs.items(): 162 for attr, value in list(attrs.items()):
153 namespace, attr = self._getNsTag(attr) 163 namespace, attr = self._getNsTag(attr)
154 if namespace is None: 164 if namespace is None:
155 new_attrs[attr] = value 165 new_attrs[attr] = value
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
189 self.nsmaps.pop() 199 self.nsmaps.pop()
190 200
191 def pi(self, target, data): 201 def pi(self, target, data):
192 pass 202 self.soup.endData()
203 self.soup.handle_data(target + ' ' + data)
204 self.soup.endData(ProcessingInstruction)
193 205
194 def data(self, content): 206 def data(self, content):
195 self.soup.handle_data(content) 207 self.soup.handle_data(content)
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
207 219
208 def test_fragment_to_document(self, fragment): 220 def test_fragment_to_document(self, fragment):
209 """See `TreeBuilder`.""" 221 """See `TreeBuilder`."""
210 return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment 222 return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
211 223
212 224
213class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 225class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
214 226
215 features = [LXML, HTML, FAST, PERMISSIVE] 227 NAME = LXML
228 ALTERNATE_NAMES = ["lxml-html"]
229
230 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
216 is_xml = False 231 is_xml = False
217 232
218 def default_parser(self, encoding): 233 def default_parser(self, encoding):
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
224 self.parser = self.parser_for(encoding) 239 self.parser = self.parser_for(encoding)
225 self.parser.feed(markup) 240 self.parser.feed(markup)
226 self.parser.close() 241 self.parser.close()
227 except (UnicodeDecodeError, LookupError, etree.ParserError), e: 242 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
228 raise ParserRejectedMarkup(str(e)) 243 raise ParserRejectedMarkup(str(e))
229 244
230 245
231 def test_fragment_to_document(self, fragment): 246 def test_fragment_to_document(self, fragment):
232 """See `TreeBuilder`.""" 247 """See `TreeBuilder`."""
233 return u'<html><body>%s</body></html>' % fragment 248 return '<html><body>%s</body></html>' % fragment
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py
index 59640b7ce3..68d419feb5 100644
--- a/bitbake/lib/bs4/dammit.py
+++ b/bitbake/lib/bs4/dammit.py
@@ -3,12 +3,14 @@
3 3
4This library converts a bytestream to Unicode through any means 4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal 5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and XML, but it does not rewrite the 6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job. 7XML or HTML to reflect a new encoding; that's the tree builder's job.
8""" 8"""
9__license__ = "MIT"
9 10
11from pdb import set_trace
10import codecs 12import codecs
11from htmlentitydefs import codepoint2name 13from html.entities import codepoint2name
12import re 14import re
13import logging 15import logging
14import string 16import string
@@ -56,7 +58,7 @@ class EntitySubstitution(object):
56 reverse_lookup = {} 58 reverse_lookup = {}
57 characters_for_re = [] 59 characters_for_re = []
58 for codepoint, name in list(codepoint2name.items()): 60 for codepoint, name in list(codepoint2name.items()):
59 character = unichr(codepoint) 61 character = chr(codepoint)
60 if codepoint != 34: 62 if codepoint != 34:
61 # There's no point in turning the quotation mark into 63 # There's no point in turning the quotation mark into
62 # &quot;, unless it happens within an attribute value, which 64 # &quot;, unless it happens within an attribute value, which
@@ -212,8 +214,11 @@ class EncodingDetector:
212 214
213 5. Windows-1252. 215 5. Windows-1252.
214 """ 216 """
215 def __init__(self, markup, override_encodings=None, is_html=False): 217 def __init__(self, markup, override_encodings=None, is_html=False,
218 exclude_encodings=None):
216 self.override_encodings = override_encodings or [] 219 self.override_encodings = override_encodings or []
220 exclude_encodings = exclude_encodings or []
221 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
217 self.chardet_encoding = None 222 self.chardet_encoding = None
218 self.is_html = is_html 223 self.is_html = is_html
219 self.declared_encoding = None 224 self.declared_encoding = None
@@ -224,6 +229,8 @@ class EncodingDetector:
224 def _usable(self, encoding, tried): 229 def _usable(self, encoding, tried):
225 if encoding is not None: 230 if encoding is not None:
226 encoding = encoding.lower() 231 encoding = encoding.lower()
232 if encoding in self.exclude_encodings:
233 return False
227 if encoding not in tried: 234 if encoding not in tried:
228 tried.add(encoding) 235 tried.add(encoding)
229 return True 236 return True
@@ -266,6 +273,9 @@ class EncodingDetector:
266 def strip_byte_order_mark(cls, data): 273 def strip_byte_order_mark(cls, data):
267 """If a byte-order mark is present, strip it and return the encoding it implies.""" 274 """If a byte-order mark is present, strip it and return the encoding it implies."""
268 encoding = None 275 encoding = None
276 if isinstance(data, str):
277 # Unicode data cannot have a byte-order mark.
278 return data, encoding
269 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ 279 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
270 and (data[2:4] != '\x00\x00'): 280 and (data[2:4] != '\x00\x00'):
271 encoding = 'utf-16be' 281 encoding = 'utf-16be'
@@ -306,7 +316,7 @@ class EncodingDetector:
306 declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) 316 declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
307 if declared_encoding_match is not None: 317 if declared_encoding_match is not None:
308 declared_encoding = declared_encoding_match.groups()[0].decode( 318 declared_encoding = declared_encoding_match.groups()[0].decode(
309 'ascii') 319 'ascii', 'replace')
310 if declared_encoding: 320 if declared_encoding:
311 return declared_encoding.lower() 321 return declared_encoding.lower()
312 return None 322 return None
@@ -331,18 +341,19 @@ class UnicodeDammit:
331 ] 341 ]
332 342
333 def __init__(self, markup, override_encodings=[], 343 def __init__(self, markup, override_encodings=[],
334 smart_quotes_to=None, is_html=False): 344 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
335 self.smart_quotes_to = smart_quotes_to 345 self.smart_quotes_to = smart_quotes_to
336 self.tried_encodings = [] 346 self.tried_encodings = []
337 self.contains_replacement_characters = False 347 self.contains_replacement_characters = False
338 self.is_html = is_html 348 self.is_html = is_html
339 349
340 self.detector = EncodingDetector(markup, override_encodings, is_html) 350 self.detector = EncodingDetector(
351 markup, override_encodings, is_html, exclude_encodings)
341 352
342 # Short-circuit if the data is in Unicode to begin with. 353 # Short-circuit if the data is in Unicode to begin with.
343 if isinstance(markup, unicode) or markup == '': 354 if isinstance(markup, str) or markup == '':
344 self.markup = markup 355 self.markup = markup
345 self.unicode_markup = unicode(markup) 356 self.unicode_markup = str(markup)
346 self.original_encoding = None 357 self.original_encoding = None
347 return 358 return
348 359
@@ -425,7 +436,7 @@ class UnicodeDammit:
425 def _to_unicode(self, data, encoding, errors="strict"): 436 def _to_unicode(self, data, encoding, errors="strict"):
426 '''Given a string and its encoding, decodes the string into Unicode. 437 '''Given a string and its encoding, decodes the string into Unicode.
427 %encoding is a string recognized by encodings.aliases''' 438 %encoding is a string recognized by encodings.aliases'''
428 return unicode(data, encoding, errors) 439 return str(data, encoding, errors)
429 440
430 @property 441 @property
431 def declared_html_encoding(self): 442 def declared_html_encoding(self):
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
index 4d0b00afad..083395fb46 100644
--- a/bitbake/lib/bs4/diagnose.py
+++ b/bitbake/lib/bs4/diagnose.py
@@ -1,7 +1,10 @@
1"""Diagnostic functions, mainly for use when doing tech support.""" 1"""Diagnostic functions, mainly for use when doing tech support."""
2
3__license__ = "MIT"
4
2import cProfile 5import cProfile
3from StringIO import StringIO 6from io import StringIO
4from HTMLParser import HTMLParser 7from html.parser import HTMLParser
5import bs4 8import bs4
6from bs4 import BeautifulSoup, __version__ 9from bs4 import BeautifulSoup, __version__
7from bs4.builder import builder_registry 10from bs4.builder import builder_registry
@@ -17,8 +20,8 @@ import cProfile
17 20
18def diagnose(data): 21def diagnose(data):
19 """Diagnostic suite for isolating common problems.""" 22 """Diagnostic suite for isolating common problems."""
20 print "Diagnostic running on Beautiful Soup %s" % __version__ 23 print("Diagnostic running on Beautiful Soup %s" % __version__)
21 print "Python version %s" % sys.version 24 print("Python version %s" % sys.version)
22 25
23 basic_parsers = ["html.parser", "html5lib", "lxml"] 26 basic_parsers = ["html.parser", "html5lib", "lxml"]
24 for name in basic_parsers: 27 for name in basic_parsers:
@@ -27,44 +30,53 @@ def diagnose(data):
27 break 30 break
28 else: 31 else:
29 basic_parsers.remove(name) 32 basic_parsers.remove(name)
30 print ( 33 print((
31 "I noticed that %s is not installed. Installing it may help." % 34 "I noticed that %s is not installed. Installing it may help." %
32 name) 35 name))
33 36
34 if 'lxml' in basic_parsers: 37 if 'lxml' in basic_parsers:
35 basic_parsers.append(["lxml", "xml"]) 38 basic_parsers.append(["lxml", "xml"])
36 from lxml import etree 39 try:
37 print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 40 from lxml import etree
41 print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
42 except ImportError as e:
43 print (
44 "lxml is not installed or couldn't be imported.")
45
38 46
39 if 'html5lib' in basic_parsers: 47 if 'html5lib' in basic_parsers:
40 import html5lib 48 try:
41 print "Found html5lib version %s" % html5lib.__version__ 49 import html5lib
50 print("Found html5lib version %s" % html5lib.__version__)
51 except ImportError as e:
52 print (
53 "html5lib is not installed or couldn't be imported.")
42 54
43 if hasattr(data, 'read'): 55 if hasattr(data, 'read'):
44 data = data.read() 56 data = data.read()
45 elif os.path.exists(data): 57 elif os.path.exists(data):
46 print '"%s" looks like a filename. Reading data from the file.' % data 58 print('"%s" looks like a filename. Reading data from the file.' % data)
47 data = open(data).read() 59 data = open(data).read()
48 elif data.startswith("http:") or data.startswith("https:"): 60 elif data.startswith("http:") or data.startswith("https:"):
49 print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 61 print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
50 print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 62 print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
51 return 63 return
52 print 64 print()
53 65
54 for parser in basic_parsers: 66 for parser in basic_parsers:
55 print "Trying to parse your markup with %s" % parser 67 print("Trying to parse your markup with %s" % parser)
56 success = False 68 success = False
57 try: 69 try:
58 soup = BeautifulSoup(data, parser) 70 soup = BeautifulSoup(data, parser)
59 success = True 71 success = True
60 except Exception, e: 72 except Exception as e:
61 print "%s could not parse the markup." % parser 73 print("%s could not parse the markup." % parser)
62 traceback.print_exc() 74 traceback.print_exc()
63 if success: 75 if success:
64 print "Here's what %s did with the markup:" % parser 76 print("Here's what %s did with the markup:" % parser)
65 print soup.prettify() 77 print(soup.prettify())
66 78
67 print "-" * 80 79 print("-" * 80)
68 80
69def lxml_trace(data, html=True, **kwargs): 81def lxml_trace(data, html=True, **kwargs):
70 """Print out the lxml events that occur during parsing. 82 """Print out the lxml events that occur during parsing.
@@ -74,7 +86,7 @@ def lxml_trace(data, html=True, **kwargs):
74 """ 86 """
75 from lxml import etree 87 from lxml import etree
76 for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 88 for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
77 print("%s, %4s, %s" % (event, element.tag, element.text)) 89 print(("%s, %4s, %s" % (event, element.tag, element.text)))
78 90
79class AnnouncingParser(HTMLParser): 91class AnnouncingParser(HTMLParser):
80 """Announces HTMLParser parse events, without doing anything else.""" 92 """Announces HTMLParser parse events, without doing anything else."""
@@ -156,9 +168,9 @@ def rdoc(num_elements=1000):
156 168
157def benchmark_parsers(num_elements=100000): 169def benchmark_parsers(num_elements=100000):
158 """Very basic head-to-head performance benchmark.""" 170 """Very basic head-to-head performance benchmark."""
159 print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 171 print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
160 data = rdoc(num_elements) 172 data = rdoc(num_elements)
161 print "Generated a large invalid HTML document (%d bytes)." % len(data) 173 print("Generated a large invalid HTML document (%d bytes)." % len(data))
162 174
163 for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 175 for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
164 success = False 176 success = False
@@ -167,24 +179,24 @@ def benchmark_parsers(num_elements=100000):
167 soup = BeautifulSoup(data, parser) 179 soup = BeautifulSoup(data, parser)
168 b = time.time() 180 b = time.time()
169 success = True 181 success = True
170 except Exception, e: 182 except Exception as e:
171 print "%s could not parse the markup." % parser 183 print("%s could not parse the markup." % parser)
172 traceback.print_exc() 184 traceback.print_exc()
173 if success: 185 if success:
174 print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) 186 print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
175 187
176 from lxml import etree 188 from lxml import etree
177 a = time.time() 189 a = time.time()
178 etree.HTML(data) 190 etree.HTML(data)
179 b = time.time() 191 b = time.time()
180 print "Raw lxml parsed the markup in %.2fs." % (b-a) 192 print("Raw lxml parsed the markup in %.2fs." % (b-a))
181 193
182 import html5lib 194 import html5lib
183 parser = html5lib.HTMLParser() 195 parser = html5lib.HTMLParser()
184 a = time.time() 196 a = time.time()
185 parser.parse(data) 197 parser.parse(data)
186 b = time.time() 198 b = time.time()
187 print "Raw html5lib parsed the markup in %.2fs." % (b-a) 199 print("Raw html5lib parsed the markup in %.2fs." % (b-a))
188 200
189def profile(num_elements=100000, parser="lxml"): 201def profile(num_elements=100000, parser="lxml"):
190 202
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
index da9afdf48e..0e62c2e100 100644
--- a/bitbake/lib/bs4/element.py
+++ b/bitbake/lib/bs4/element.py
@@ -1,3 +1,6 @@
1__license__ = "MIT"
2
3from pdb import set_trace
1import collections 4import collections
2import re 5import re
3import sys 6import sys
@@ -21,22 +24,22 @@ def _alias(attr):
21 return alias 24 return alias
22 25
23 26
24class NamespacedAttribute(unicode): 27class NamespacedAttribute(str):
25 28
26 def __new__(cls, prefix, name, namespace=None): 29 def __new__(cls, prefix, name, namespace=None):
27 if name is None: 30 if name is None:
28 obj = unicode.__new__(cls, prefix) 31 obj = str.__new__(cls, prefix)
29 elif prefix is None: 32 elif prefix is None:
30 # Not really namespaced. 33 # Not really namespaced.
31 obj = unicode.__new__(cls, name) 34 obj = str.__new__(cls, name)
32 else: 35 else:
33 obj = unicode.__new__(cls, prefix + ":" + name) 36 obj = str.__new__(cls, prefix + ":" + name)
34 obj.prefix = prefix 37 obj.prefix = prefix
35 obj.name = name 38 obj.name = name
36 obj.namespace = namespace 39 obj.namespace = namespace
37 return obj 40 return obj
38 41
39class AttributeValueWithCharsetSubstitution(unicode): 42class AttributeValueWithCharsetSubstitution(str):
40 """A stand-in object for a character encoding specified in HTML.""" 43 """A stand-in object for a character encoding specified in HTML."""
41 44
42class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 45class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -47,7 +50,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
47 """ 50 """
48 51
49 def __new__(cls, original_value): 52 def __new__(cls, original_value):
50 obj = unicode.__new__(cls, original_value) 53 obj = str.__new__(cls, original_value)
51 obj.original_value = original_value 54 obj.original_value = original_value
52 return obj 55 return obj
53 56
@@ -70,9 +73,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
70 match = cls.CHARSET_RE.search(original_value) 73 match = cls.CHARSET_RE.search(original_value)
71 if match is None: 74 if match is None:
72 # No substitution necessary. 75 # No substitution necessary.
73 return unicode.__new__(unicode, original_value) 76 return str.__new__(str, original_value)
74 77
75 obj = unicode.__new__(cls, original_value) 78 obj = str.__new__(cls, original_value)
76 obj.original_value = original_value 79 obj.original_value = original_value
77 return obj 80 return obj
78 81
@@ -152,7 +155,7 @@ class PageElement(object):
152 155
153 def format_string(self, s, formatter='minimal'): 156 def format_string(self, s, formatter='minimal'):
154 """Format the given string using the given formatter.""" 157 """Format the given string using the given formatter."""
155 if not callable(formatter): 158 if not isinstance(formatter, collections.Callable):
156 formatter = self._formatter_for_name(formatter) 159 formatter = self._formatter_for_name(formatter)
157 if formatter is None: 160 if formatter is None:
158 output = s 161 output = s
@@ -185,24 +188,40 @@ class PageElement(object):
185 return self.HTML_FORMATTERS.get( 188 return self.HTML_FORMATTERS.get(
186 name, HTMLAwareEntitySubstitution.substitute_xml) 189 name, HTMLAwareEntitySubstitution.substitute_xml)
187 190
188 def setup(self, parent=None, previous_element=None): 191 def setup(self, parent=None, previous_element=None, next_element=None,
192 previous_sibling=None, next_sibling=None):
189 """Sets up the initial relations between this element and 193 """Sets up the initial relations between this element and
190 other elements.""" 194 other elements."""
191 self.parent = parent 195 self.parent = parent
196
192 self.previous_element = previous_element 197 self.previous_element = previous_element
193 if previous_element is not None: 198 if previous_element is not None:
194 self.previous_element.next_element = self 199 self.previous_element.next_element = self
195 self.next_element = None 200
196 self.previous_sibling = None 201 self.next_element = next_element
197 self.next_sibling = None 202 if self.next_element:
198 if self.parent is not None and self.parent.contents: 203 self.next_element.previous_element = self
199 self.previous_sibling = self.parent.contents[-1] 204
205 self.next_sibling = next_sibling
206 if self.next_sibling:
207 self.next_sibling.previous_sibling = self
208
209 if (not previous_sibling
210 and self.parent is not None and self.parent.contents):
211 previous_sibling = self.parent.contents[-1]
212
213 self.previous_sibling = previous_sibling
214 if previous_sibling:
200 self.previous_sibling.next_sibling = self 215 self.previous_sibling.next_sibling = self
201 216
202 nextSibling = _alias("next_sibling") # BS3 217 nextSibling = _alias("next_sibling") # BS3
203 previousSibling = _alias("previous_sibling") # BS3 218 previousSibling = _alias("previous_sibling") # BS3
204 219
205 def replace_with(self, replace_with): 220 def replace_with(self, replace_with):
221 if not self.parent:
222 raise ValueError(
223 "Cannot replace one element with another when the"
224 "element to be replaced is not part of a tree.")
206 if replace_with is self: 225 if replace_with is self:
207 return 226 return
208 if replace_with is self.parent: 227 if replace_with is self.parent:
@@ -216,6 +235,10 @@ class PageElement(object):
216 235
217 def unwrap(self): 236 def unwrap(self):
218 my_parent = self.parent 237 my_parent = self.parent
238 if not self.parent:
239 raise ValueError(
240 "Cannot replace an element with its contents when that"
241 "element is not part of a tree.")
219 my_index = self.parent.index(self) 242 my_index = self.parent.index(self)
220 self.extract() 243 self.extract()
221 for child in reversed(self.contents[:]): 244 for child in reversed(self.contents[:]):
@@ -240,17 +263,20 @@ class PageElement(object):
240 last_child = self._last_descendant() 263 last_child = self._last_descendant()
241 next_element = last_child.next_element 264 next_element = last_child.next_element
242 265
243 if self.previous_element is not None: 266 if (self.previous_element is not None and
267 self.previous_element is not next_element):
244 self.previous_element.next_element = next_element 268 self.previous_element.next_element = next_element
245 if next_element is not None: 269 if next_element is not None and next_element is not self.previous_element:
246 next_element.previous_element = self.previous_element 270 next_element.previous_element = self.previous_element
247 self.previous_element = None 271 self.previous_element = None
248 last_child.next_element = None 272 last_child.next_element = None
249 273
250 self.parent = None 274 self.parent = None
251 if self.previous_sibling is not None: 275 if (self.previous_sibling is not None
276 and self.previous_sibling is not self.next_sibling):
252 self.previous_sibling.next_sibling = self.next_sibling 277 self.previous_sibling.next_sibling = self.next_sibling
253 if self.next_sibling is not None: 278 if (self.next_sibling is not None
279 and self.next_sibling is not self.previous_sibling):
254 self.next_sibling.previous_sibling = self.previous_sibling 280 self.next_sibling.previous_sibling = self.previous_sibling
255 self.previous_sibling = self.next_sibling = None 281 self.previous_sibling = self.next_sibling = None
256 return self 282 return self
@@ -263,16 +289,18 @@ class PageElement(object):
263 last_child = self 289 last_child = self
264 while isinstance(last_child, Tag) and last_child.contents: 290 while isinstance(last_child, Tag) and last_child.contents:
265 last_child = last_child.contents[-1] 291 last_child = last_child.contents[-1]
266 if not accept_self and last_child == self: 292 if not accept_self and last_child is self:
267 last_child = None 293 last_child = None
268 return last_child 294 return last_child
269 # BS3: Not part of the API! 295 # BS3: Not part of the API!
270 _lastRecursiveChild = _last_descendant 296 _lastRecursiveChild = _last_descendant
271 297
272 def insert(self, position, new_child): 298 def insert(self, position, new_child):
299 if new_child is None:
300 raise ValueError("Cannot insert None into a tag.")
273 if new_child is self: 301 if new_child is self:
274 raise ValueError("Cannot insert a tag into itself.") 302 raise ValueError("Cannot insert a tag into itself.")
275 if (isinstance(new_child, basestring) 303 if (isinstance(new_child, str)
276 and not isinstance(new_child, NavigableString)): 304 and not isinstance(new_child, NavigableString)):
277 new_child = NavigableString(new_child) 305 new_child = NavigableString(new_child)
278 306
@@ -478,6 +506,10 @@ class PageElement(object):
478 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 506 def _find_all(self, name, attrs, text, limit, generator, **kwargs):
479 "Iterates over a generator looking for things that match." 507 "Iterates over a generator looking for things that match."
480 508
509 if text is None and 'string' in kwargs:
510 text = kwargs['string']
511 del kwargs['string']
512
481 if isinstance(name, SoupStrainer): 513 if isinstance(name, SoupStrainer):
482 strainer = name 514 strainer = name
483 else: 515 else:
@@ -489,7 +521,7 @@ class PageElement(object):
489 result = (element for element in generator 521 result = (element for element in generator
490 if isinstance(element, Tag)) 522 if isinstance(element, Tag))
491 return ResultSet(strainer, result) 523 return ResultSet(strainer, result)
492 elif isinstance(name, basestring): 524 elif isinstance(name, str):
493 # Optimization to find all tags with a given name. 525 # Optimization to find all tags with a given name.
494 result = (element for element in generator 526 result = (element for element in generator
495 if isinstance(element, Tag) 527 if isinstance(element, Tag)
@@ -548,17 +580,17 @@ class PageElement(object):
548 580
549 # Methods for supporting CSS selectors. 581 # Methods for supporting CSS selectors.
550 582
551 tag_name_re = re.compile('^[a-z0-9]+$') 583 tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
552 584
553 # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ 585 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
554 # \---/ \---/\-------------/ \-------/ 586 # \---------------------------/ \---/\-------------/ \-------/
555 # | | | | 587 # | | | |
556 # | | | The value 588 # | | | The value
557 # | | ~,|,^,$,* or = 589 # | | ~,|,^,$,* or =
558 # | Attribute 590 # | Attribute
559 # Tag 591 # Tag
560 attribselect_re = re.compile( 592 attribselect_re = re.compile(
561 r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + 593 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
562 r'=?"?(?P<value>[^\]"]*)"?\]$' 594 r'=?"?(?P<value>[^\]"]*)"?\]$'
563 ) 595 )
564 596
@@ -640,7 +672,7 @@ class PageElement(object):
640 return self.parents 672 return self.parents
641 673
642 674
643class NavigableString(unicode, PageElement): 675class NavigableString(str, PageElement):
644 676
645 PREFIX = '' 677 PREFIX = ''
646 SUFFIX = '' 678 SUFFIX = ''
@@ -653,15 +685,21 @@ class NavigableString(unicode, PageElement):
653 passed in to the superclass's __new__ or the superclass won't know 685 passed in to the superclass's __new__ or the superclass won't know
654 how to handle non-ASCII characters. 686 how to handle non-ASCII characters.
655 """ 687 """
656 if isinstance(value, unicode): 688 if isinstance(value, str):
657 return unicode.__new__(cls, value) 689 u = str.__new__(cls, value)
658 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 690 else:
691 u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
692 u.setup()
693 return u
659 694
660 def __copy__(self): 695 def __copy__(self):
661 return self 696 """A copy of a NavigableString has the same contents and class
697 as the original, but it is not connected to the parse tree.
698 """
699 return type(self)(self)
662 700
663 def __getnewargs__(self): 701 def __getnewargs__(self):
664 return (unicode(self),) 702 return (str(self),)
665 703
666 def __getattr__(self, attr): 704 def __getattr__(self, attr):
667 """text.string gives you text. This is for backwards 705 """text.string gives you text. This is for backwards
@@ -701,23 +739,23 @@ class PreformattedString(NavigableString):
701 739
702class CData(PreformattedString): 740class CData(PreformattedString):
703 741
704 PREFIX = u'<![CDATA[' 742 PREFIX = '<![CDATA['
705 SUFFIX = u']]>' 743 SUFFIX = ']]>'
706 744
707class ProcessingInstruction(PreformattedString): 745class ProcessingInstruction(PreformattedString):
708 746
709 PREFIX = u'<?' 747 PREFIX = '<?'
710 SUFFIX = u'?>' 748 SUFFIX = '>'
711 749
712class Comment(PreformattedString): 750class Comment(PreformattedString):
713 751
714 PREFIX = u'<!--' 752 PREFIX = '<!--'
715 SUFFIX = u'-->' 753 SUFFIX = '-->'
716 754
717 755
718class Declaration(PreformattedString): 756class Declaration(PreformattedString):
719 PREFIX = u'<!' 757 PREFIX = '<?'
720 SUFFIX = u'!>' 758 SUFFIX = '?>'
721 759
722 760
723class Doctype(PreformattedString): 761class Doctype(PreformattedString):
@@ -734,8 +772,8 @@ class Doctype(PreformattedString):
734 772
735 return Doctype(value) 773 return Doctype(value)
736 774
737 PREFIX = u'<!DOCTYPE ' 775 PREFIX = '<!DOCTYPE '
738 SUFFIX = u'>\n' 776 SUFFIX = '>\n'
739 777
740 778
741class Tag(PageElement): 779class Tag(PageElement):
@@ -759,9 +797,12 @@ class Tag(PageElement):
759 self.prefix = prefix 797 self.prefix = prefix
760 if attrs is None: 798 if attrs is None:
761 attrs = {} 799 attrs = {}
762 elif attrs and builder.cdata_list_attributes: 800 elif attrs:
763 attrs = builder._replace_cdata_list_attribute_values( 801 if builder is not None and builder.cdata_list_attributes:
764 self.name, attrs) 802 attrs = builder._replace_cdata_list_attribute_values(
803 self.name, attrs)
804 else:
805 attrs = dict(attrs)
765 else: 806 else:
766 attrs = dict(attrs) 807 attrs = dict(attrs)
767 self.attrs = attrs 808 self.attrs = attrs
@@ -778,6 +819,18 @@ class Tag(PageElement):
778 819
779 parserClass = _alias("parser_class") # BS3 820 parserClass = _alias("parser_class") # BS3
780 821
822 def __copy__(self):
823 """A copy of a Tag is a new Tag, unconnected to the parse tree.
824 Its contents are a copy of the old Tag's contents.
825 """
826 clone = type(self)(None, self.builder, self.name, self.namespace,
827 self.nsprefix, self.attrs)
828 for attr in ('can_be_empty_element', 'hidden'):
829 setattr(clone, attr, getattr(self, attr))
830 for child in self.contents:
831 clone.append(child.__copy__())
832 return clone
833
781 @property 834 @property
782 def is_empty_element(self): 835 def is_empty_element(self):
783 """Is this tag an empty-element tag? (aka a self-closing tag) 836 """Is this tag an empty-element tag? (aka a self-closing tag)
@@ -843,7 +896,7 @@ class Tag(PageElement):
843 for string in self._all_strings(True): 896 for string in self._all_strings(True):
844 yield string 897 yield string
845 898
846 def get_text(self, separator=u"", strip=False, 899 def get_text(self, separator="", strip=False,
847 types=(NavigableString, CData)): 900 types=(NavigableString, CData)):
848 """ 901 """
849 Get all child strings, concatenated using the given separator. 902 Get all child strings, concatenated using the given separator.
@@ -915,7 +968,7 @@ class Tag(PageElement):
915 def __contains__(self, x): 968 def __contains__(self, x):
916 return x in self.contents 969 return x in self.contents
917 970
918 def __nonzero__(self): 971 def __bool__(self):
919 "A tag is non-None even if it has no contents." 972 "A tag is non-None even if it has no contents."
920 return True 973 return True
921 974
@@ -971,15 +1024,25 @@ class Tag(PageElement):
971 as defined in __eq__.""" 1024 as defined in __eq__."""
972 return not self == other 1025 return not self == other
973 1026
974 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 1027 def __repr__(self, encoding="unicode-escape"):
975 """Renders this tag as a string.""" 1028 """Renders this tag as a string."""
976 return self.encode(encoding) 1029 if PY3K:
1030 # "The return value must be a string object", i.e. Unicode
1031 return self.decode()
1032 else:
1033 # "The return value must be a string object", i.e. a bytestring.
1034 # By convention, the return value of __repr__ should also be
1035 # an ASCII string.
1036 return self.encode(encoding)
977 1037
978 def __unicode__(self): 1038 def __unicode__(self):
979 return self.decode() 1039 return self.decode()
980 1040
981 def __str__(self): 1041 def __str__(self):
982 return self.encode() 1042 if PY3K:
1043 return self.decode()
1044 else:
1045 return self.encode()
983 1046
984 if PY3K: 1047 if PY3K:
985 __str__ = __repr__ = __unicode__ 1048 __str__ = __repr__ = __unicode__
@@ -1014,7 +1077,7 @@ class Tag(PageElement):
1014 1077
1015 # First off, turn a string formatter into a function. This 1078 # First off, turn a string formatter into a function. This
1016 # will stop the lookup from happening over and over again. 1079 # will stop the lookup from happening over and over again.
1017 if not callable(formatter): 1080 if not isinstance(formatter, collections.Callable):
1018 formatter = self._formatter_for_name(formatter) 1081 formatter = self._formatter_for_name(formatter)
1019 1082
1020 attrs = [] 1083 attrs = []
@@ -1025,8 +1088,8 @@ class Tag(PageElement):
1025 else: 1088 else:
1026 if isinstance(val, list) or isinstance(val, tuple): 1089 if isinstance(val, list) or isinstance(val, tuple):
1027 val = ' '.join(val) 1090 val = ' '.join(val)
1028 elif not isinstance(val, basestring): 1091 elif not isinstance(val, str):
1029 val = unicode(val) 1092 val = str(val)
1030 elif ( 1093 elif (
1031 isinstance(val, AttributeValueWithCharsetSubstitution) 1094 isinstance(val, AttributeValueWithCharsetSubstitution)
1032 and eventual_encoding is not None): 1095 and eventual_encoding is not None):
@@ -1034,7 +1097,7 @@ class Tag(PageElement):
1034 1097
1035 text = self.format_string(val, formatter) 1098 text = self.format_string(val, formatter)
1036 decoded = ( 1099 decoded = (
1037 unicode(key) + '=' 1100 str(key) + '='
1038 + EntitySubstitution.quoted_attribute_value(text)) 1101 + EntitySubstitution.quoted_attribute_value(text))
1039 attrs.append(decoded) 1102 attrs.append(decoded)
1040 close = '' 1103 close = ''
@@ -1103,16 +1166,22 @@ class Tag(PageElement):
1103 formatter="minimal"): 1166 formatter="minimal"):
1104 """Renders the contents of this tag as a Unicode string. 1167 """Renders the contents of this tag as a Unicode string.
1105 1168
1169 :param indent_level: Each line of the rendering will be
1170 indented this many spaces.
1171
1106 :param eventual_encoding: The tag is destined to be 1172 :param eventual_encoding: The tag is destined to be
1107 encoded into this encoding. This method is _not_ 1173 encoded into this encoding. This method is _not_
1108 responsible for performing that encoding. This information 1174 responsible for performing that encoding. This information
1109 is passed in so that it can be substituted in if the 1175 is passed in so that it can be substituted in if the
1110 document contains a <META> tag that mentions the document's 1176 document contains a <META> tag that mentions the document's
1111 encoding. 1177 encoding.
1178
1179 :param formatter: The output formatter responsible for converting
1180 entities to Unicode characters.
1112 """ 1181 """
1113 # First off, turn a string formatter into a function. This 1182 # First off, turn a string formatter into a function. This
1114 # will stop the lookup from happening over and over again. 1183 # will stop the lookup from happening over and over again.
1115 if not callable(formatter): 1184 if not isinstance(formatter, collections.Callable):
1116 formatter = self._formatter_for_name(formatter) 1185 formatter = self._formatter_for_name(formatter)
1117 1186
1118 pretty_print = (indent_level is not None) 1187 pretty_print = (indent_level is not None)
@@ -1137,7 +1206,17 @@ class Tag(PageElement):
1137 def encode_contents( 1206 def encode_contents(
1138 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1207 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1139 formatter="minimal"): 1208 formatter="minimal"):
1140 """Renders the contents of this tag as a bytestring.""" 1209 """Renders the contents of this tag as a bytestring.
1210
1211 :param indent_level: Each line of the rendering will be
1212 indented this many spaces.
1213
1214 :param eventual_encoding: The bytestring will be in this encoding.
1215
1216 :param formatter: The output formatter responsible for converting
1217 entities to Unicode characters.
1218 """
1219
1141 contents = self.decode_contents(indent_level, encoding, formatter) 1220 contents = self.decode_contents(indent_level, encoding, formatter)
1142 return contents.encode(encoding) 1221 return contents.encode(encoding)
1143 1222
@@ -1201,26 +1280,57 @@ class Tag(PageElement):
1201 1280
1202 _selector_combinators = ['>', '+', '~'] 1281 _selector_combinators = ['>', '+', '~']
1203 _select_debug = False 1282 _select_debug = False
1204 def select(self, selector, _candidate_generator=None): 1283 def select_one(self, selector):
1284 """Perform a CSS selection operation on the current element."""
1285 value = self.select(selector, limit=1)
1286 if value:
1287 return value[0]
1288 return None
1289
1290 def select(self, selector, _candidate_generator=None, limit=None):
1205 """Perform a CSS selection operation on the current element.""" 1291 """Perform a CSS selection operation on the current element."""
1292
1293 # Handle grouping selectors if ',' exists, ie: p,a
1294 if ',' in selector:
1295 context = []
1296 for partial_selector in selector.split(','):
1297 partial_selector = partial_selector.strip()
1298 if partial_selector == '':
1299 raise ValueError('Invalid group selection syntax: %s' % selector)
1300 candidates = self.select(partial_selector, limit=limit)
1301 for candidate in candidates:
1302 if candidate not in context:
1303 context.append(candidate)
1304
1305 if limit and len(context) >= limit:
1306 break
1307 return context
1308
1206 tokens = selector.split() 1309 tokens = selector.split()
1207 current_context = [self] 1310 current_context = [self]
1208 1311
1209 if tokens[-1] in self._selector_combinators: 1312 if tokens[-1] in self._selector_combinators:
1210 raise ValueError( 1313 raise ValueError(
1211 'Final combinator "%s" is missing an argument.' % tokens[-1]) 1314 'Final combinator "%s" is missing an argument.' % tokens[-1])
1315
1212 if self._select_debug: 1316 if self._select_debug:
1213 print 'Running CSS selector "%s"' % selector 1317 print('Running CSS selector "%s"' % selector)
1318
1214 for index, token in enumerate(tokens): 1319 for index, token in enumerate(tokens):
1215 if self._select_debug: 1320 new_context = []
1216 print ' Considering token "%s"' % token 1321 new_context_ids = set([])
1217 recursive_candidate_generator = None 1322
1218 tag_name = None
1219 if tokens[index-1] in self._selector_combinators: 1323 if tokens[index-1] in self._selector_combinators:
1220 # This token was consumed by the previous combinator. Skip it. 1324 # This token was consumed by the previous combinator. Skip it.
1221 if self._select_debug: 1325 if self._select_debug:
1222 print ' Token was consumed by the previous combinator.' 1326 print(' Token was consumed by the previous combinator.')
1223 continue 1327 continue
1328
1329 if self._select_debug:
1330 print(' Considering token "%s"' % token)
1331 recursive_candidate_generator = None
1332 tag_name = None
1333
1224 # Each operation corresponds to a checker function, a rule 1334 # Each operation corresponds to a checker function, a rule
1225 # for determining whether a candidate matches the 1335 # for determining whether a candidate matches the
1226 # selector. Candidates are generated by the active 1336 # selector. Candidates are generated by the active
@@ -1256,35 +1366,38 @@ class Tag(PageElement):
1256 "A pseudo-class must be prefixed with a tag name.") 1366 "A pseudo-class must be prefixed with a tag name.")
1257 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) 1367 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1258 found = [] 1368 found = []
1259 if pseudo_attributes is not None: 1369 if pseudo_attributes is None:
1370 pseudo_type = pseudo
1371 pseudo_value = None
1372 else:
1260 pseudo_type, pseudo_value = pseudo_attributes.groups() 1373 pseudo_type, pseudo_value = pseudo_attributes.groups()
1261 if pseudo_type == 'nth-of-type': 1374 if pseudo_type == 'nth-of-type':
1262 try: 1375 try:
1263 pseudo_value = int(pseudo_value) 1376 pseudo_value = int(pseudo_value)
1264 except: 1377 except:
1265 raise NotImplementedError(
1266 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1267 if pseudo_value < 1:
1268 raise ValueError(
1269 'nth-of-type pseudo-class value must be at least 1.')
1270 class Counter(object):
1271 def __init__(self, destination):
1272 self.count = 0
1273 self.destination = destination
1274
1275 def nth_child_of_type(self, tag):
1276 self.count += 1
1277 if self.count == self.destination:
1278 return True
1279 if self.count > self.destination:
1280 # Stop the generator that's sending us
1281 # these things.
1282 raise StopIteration()
1283 return False
1284 checker = Counter(pseudo_value).nth_child_of_type
1285 else:
1286 raise NotImplementedError( 1378 raise NotImplementedError(
1287 'Only the following pseudo-classes are implemented: nth-of-type.') 1379 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1380 if pseudo_value < 1:
1381 raise ValueError(
1382 'nth-of-type pseudo-class value must be at least 1.')
1383 class Counter(object):
1384 def __init__(self, destination):
1385 self.count = 0
1386 self.destination = destination
1387
1388 def nth_child_of_type(self, tag):
1389 self.count += 1
1390 if self.count == self.destination:
1391 return True
1392 if self.count > self.destination:
1393 # Stop the generator that's sending us
1394 # these things.
1395 raise StopIteration()
1396 return False
1397 checker = Counter(pseudo_value).nth_child_of_type
1398 else:
1399 raise NotImplementedError(
1400 'Only the following pseudo-classes are implemented: nth-of-type.')
1288 1401
1289 elif token == '*': 1402 elif token == '*':
1290 # Star selector -- matches everything 1403 # Star selector -- matches everything
@@ -1311,7 +1424,6 @@ class Tag(PageElement):
1311 else: 1424 else:
1312 raise ValueError( 1425 raise ValueError(
1313 'Unsupported or invalid CSS selector: "%s"' % token) 1426 'Unsupported or invalid CSS selector: "%s"' % token)
1314
1315 if recursive_candidate_generator: 1427 if recursive_candidate_generator:
1316 # This happens when the selector looks like "> foo". 1428 # This happens when the selector looks like "> foo".
1317 # 1429 #
@@ -1325,14 +1437,14 @@ class Tag(PageElement):
1325 next_token = tokens[index+1] 1437 next_token = tokens[index+1]
1326 def recursive_select(tag): 1438 def recursive_select(tag):
1327 if self._select_debug: 1439 if self._select_debug:
1328 print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) 1440 print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
1329 print '-' * 40 1441 print('-' * 40)
1330 for i in tag.select(next_token, recursive_candidate_generator): 1442 for i in tag.select(next_token, recursive_candidate_generator):
1331 if self._select_debug: 1443 if self._select_debug:
1332 print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) 1444 print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
1333 yield i 1445 yield i
1334 if self._select_debug: 1446 if self._select_debug:
1335 print '-' * 40 1447 print('-' * 40)
1336 _use_candidate_generator = recursive_select 1448 _use_candidate_generator = recursive_select
1337 elif _candidate_generator is None: 1449 elif _candidate_generator is None:
1338 # By default, a tag's candidates are all of its 1450 # By default, a tag's candidates are all of its
@@ -1343,7 +1455,7 @@ class Tag(PageElement):
1343 check = "[any]" 1455 check = "[any]"
1344 else: 1456 else:
1345 check = tag_name 1457 check = tag_name
1346 print ' Default candidate generator, tag name="%s"' % check 1458 print(' Default candidate generator, tag name="%s"' % check)
1347 if self._select_debug: 1459 if self._select_debug:
1348 # This is redundant with later code, but it stops 1460 # This is redundant with later code, but it stops
1349 # a bunch of bogus tags from cluttering up the 1461 # a bunch of bogus tags from cluttering up the
@@ -1361,12 +1473,11 @@ class Tag(PageElement):
1361 else: 1473 else:
1362 _use_candidate_generator = _candidate_generator 1474 _use_candidate_generator = _candidate_generator
1363 1475
1364 new_context = [] 1476 count = 0
1365 new_context_ids = set([])
1366 for tag in current_context: 1477 for tag in current_context:
1367 if self._select_debug: 1478 if self._select_debug:
1368 print " Running candidate generator on %s %s" % ( 1479 print(" Running candidate generator on %s %s" % (
1369 tag.name, repr(tag.attrs)) 1480 tag.name, repr(tag.attrs)))
1370 for candidate in _use_candidate_generator(tag): 1481 for candidate in _use_candidate_generator(tag):
1371 if not isinstance(candidate, Tag): 1482 if not isinstance(candidate, Tag):
1372 continue 1483 continue
@@ -1381,21 +1492,24 @@ class Tag(PageElement):
1381 break 1492 break
1382 if checker is None or result: 1493 if checker is None or result:
1383 if self._select_debug: 1494 if self._select_debug:
1384 print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) 1495 print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
1385 if id(candidate) not in new_context_ids: 1496 if id(candidate) not in new_context_ids:
1386 # If a tag matches a selector more than once, 1497 # If a tag matches a selector more than once,
1387 # don't include it in the context more than once. 1498 # don't include it in the context more than once.
1388 new_context.append(candidate) 1499 new_context.append(candidate)
1389 new_context_ids.add(id(candidate)) 1500 new_context_ids.add(id(candidate))
1501 if limit and len(new_context) >= limit:
1502 break
1390 elif self._select_debug: 1503 elif self._select_debug:
1391 print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) 1504 print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
1505
1392 1506
1393 current_context = new_context 1507 current_context = new_context
1394 1508
1395 if self._select_debug: 1509 if self._select_debug:
1396 print "Final verdict:" 1510 print("Final verdict:")
1397 for i in current_context: 1511 for i in current_context:
1398 print " %s %s" % (i.name, i.attrs) 1512 print(" %s %s" % (i.name, i.attrs))
1399 return current_context 1513 return current_context
1400 1514
1401 # Old names for backwards compatibility 1515 # Old names for backwards compatibility
@@ -1439,7 +1553,7 @@ class SoupStrainer(object):
1439 else: 1553 else:
1440 attrs = kwargs 1554 attrs = kwargs
1441 normalized_attrs = {} 1555 normalized_attrs = {}
1442 for key, value in attrs.items(): 1556 for key, value in list(attrs.items()):
1443 normalized_attrs[key] = self._normalize_search_value(value) 1557 normalized_attrs[key] = self._normalize_search_value(value)
1444 1558
1445 self.attrs = normalized_attrs 1559 self.attrs = normalized_attrs
@@ -1448,7 +1562,7 @@ class SoupStrainer(object):
1448 def _normalize_search_value(self, value): 1562 def _normalize_search_value(self, value):
1449 # Leave it alone if it's a Unicode string, a callable, a 1563 # Leave it alone if it's a Unicode string, a callable, a
1450 # regular expression, a boolean, or None. 1564 # regular expression, a boolean, or None.
1451 if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') 1565 if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
1452 or isinstance(value, bool) or value is None): 1566 or isinstance(value, bool) or value is None):
1453 return value 1567 return value
1454 1568
@@ -1461,7 +1575,7 @@ class SoupStrainer(object):
1461 new_value = [] 1575 new_value = []
1462 for v in value: 1576 for v in value:
1463 if (hasattr(v, '__iter__') and not isinstance(v, bytes) 1577 if (hasattr(v, '__iter__') and not isinstance(v, bytes)
1464 and not isinstance(v, unicode)): 1578 and not isinstance(v, str)):
1465 # This is almost certainly the user's mistake. In the 1579 # This is almost certainly the user's mistake. In the
1466 # interests of avoiding infinite loops, we'll let 1580 # interests of avoiding infinite loops, we'll let
1467 # it through as-is rather than doing a recursive call. 1581 # it through as-is rather than doing a recursive call.
@@ -1473,7 +1587,7 @@ class SoupStrainer(object):
1473 # Otherwise, convert it into a Unicode string. 1587 # Otherwise, convert it into a Unicode string.
1474 # The unicode(str()) thing is so this will do the same thing on Python 2 1588 # The unicode(str()) thing is so this will do the same thing on Python 2
1475 # and Python 3. 1589 # and Python 3.
1476 return unicode(str(value)) 1590 return str(str(value))
1477 1591
1478 def __str__(self): 1592 def __str__(self):
1479 if self.text: 1593 if self.text:
@@ -1527,7 +1641,7 @@ class SoupStrainer(object):
1527 found = None 1641 found = None
1528 # If given a list of items, scan it for a text element that 1642 # If given a list of items, scan it for a text element that
1529 # matches. 1643 # matches.
1530 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): 1644 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
1531 for element in markup: 1645 for element in markup:
1532 if isinstance(element, NavigableString) \ 1646 if isinstance(element, NavigableString) \
1533 and self.search(element): 1647 and self.search(element):
@@ -1540,7 +1654,7 @@ class SoupStrainer(object):
1540 found = self.search_tag(markup) 1654 found = self.search_tag(markup)
1541 # If it's text, make sure the text matches. 1655 # If it's text, make sure the text matches.
1542 elif isinstance(markup, NavigableString) or \ 1656 elif isinstance(markup, NavigableString) or \
1543 isinstance(markup, basestring): 1657 isinstance(markup, str):
1544 if not self.name and not self.attrs and self._matches(markup, self.text): 1658 if not self.name and not self.attrs and self._matches(markup, self.text):
1545 found = markup 1659 found = markup
1546 else: 1660 else:
@@ -1554,7 +1668,7 @@ class SoupStrainer(object):
1554 if isinstance(markup, list) or isinstance(markup, tuple): 1668 if isinstance(markup, list) or isinstance(markup, tuple):
1555 # This should only happen when searching a multi-valued attribute 1669 # This should only happen when searching a multi-valued attribute
1556 # like 'class'. 1670 # like 'class'.
1557 if (isinstance(match_against, unicode) 1671 if (isinstance(match_against, str)
1558 and ' ' in match_against): 1672 and ' ' in match_against):
1559 # A bit of a special case. If they try to match "foo 1673 # A bit of a special case. If they try to match "foo
1560 # bar" on a multivalue attribute's value, only accept 1674 # bar" on a multivalue attribute's value, only accept
@@ -1589,7 +1703,7 @@ class SoupStrainer(object):
1589 # None matches None, False, an empty string, an empty list, and so on. 1703 # None matches None, False, an empty string, an empty list, and so on.
1590 return not match_against 1704 return not match_against
1591 1705
1592 if isinstance(match_against, unicode): 1706 if isinstance(match_against, str):
1593 # Exact string match 1707 # Exact string match
1594 return markup == match_against 1708 return markup == match_against
1595 1709
diff --git a/bitbake/lib/bs4/testing.py b/bitbake/lib/bs4/testing.py
index fd4495ac58..3a2f260e24 100644
--- a/bitbake/lib/bs4/testing.py
+++ b/bitbake/lib/bs4/testing.py
@@ -1,5 +1,8 @@
1"""Helper classes for tests.""" 1"""Helper classes for tests."""
2 2
3__license__ = "MIT"
4
5import pickle
3import copy 6import copy
4import functools 7import functools
5import unittest 8import unittest
@@ -43,6 +46,16 @@ class SoupTest(unittest.TestCase):
43 46
44 self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) 47 self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
45 48
49 def assertConnectedness(self, element):
50 """Ensure that next_element and previous_element are properly
51 set for all descendants of the given element.
52 """
53 earlier = None
54 for e in element.descendants:
55 if earlier:
56 self.assertEqual(e, earlier.next_element)
57 self.assertEqual(earlier, e.previous_element)
58 earlier = e
46 59
47class HTMLTreeBuilderSmokeTest(object): 60class HTMLTreeBuilderSmokeTest(object):
48 61
@@ -54,6 +67,15 @@ class HTMLTreeBuilderSmokeTest(object):
54 markup in these tests, there's not much room for interpretation. 67 markup in these tests, there's not much room for interpretation.
55 """ 68 """
56 69
70 def test_pickle_and_unpickle_identity(self):
71 # Pickling a tree, then unpickling it, yields a tree identical
72 # to the original.
73 tree = self.soup("<a><b>foo</a>")
74 dumped = pickle.dumps(tree, 2)
75 loaded = pickle.loads(dumped)
76 self.assertEqual(loaded.__class__, BeautifulSoup)
77 self.assertEqual(loaded.decode(), tree.decode())
78
57 def assertDoctypeHandled(self, doctype_fragment): 79 def assertDoctypeHandled(self, doctype_fragment):
58 """Assert that a given doctype string is handled correctly.""" 80 """Assert that a given doctype string is handled correctly."""
59 doctype_str, soup = self._document_with_doctype(doctype_fragment) 81 doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -114,6 +136,11 @@ class HTMLTreeBuilderSmokeTest(object):
114 soup.encode("utf-8").replace(b"\n", b""), 136 soup.encode("utf-8").replace(b"\n", b""),
115 markup.replace(b"\n", b"")) 137 markup.replace(b"\n", b""))
116 138
139 def test_processing_instruction(self):
140 markup = b"""<?PITarget PIContent?>"""
141 soup = self.soup(markup)
142 self.assertEqual(markup, soup.encode("utf8"))
143
117 def test_deepcopy(self): 144 def test_deepcopy(self):
118 """Make sure you can copy the tree builder. 145 """Make sure you can copy the tree builder.
119 146
@@ -155,6 +182,23 @@ class HTMLTreeBuilderSmokeTest(object):
155 def test_nested_formatting_elements(self): 182 def test_nested_formatting_elements(self):
156 self.assertSoupEquals("<em><em></em></em>") 183 self.assertSoupEquals("<em><em></em></em>")
157 184
185 def test_double_head(self):
186 html = '''<!DOCTYPE html>
187<html>
188<head>
189<title>Ordinary HEAD element test</title>
190</head>
191<script type="text/javascript">
192alert("Help!");
193</script>
194<body>
195Hello, world!
196</body>
197</html>
198'''
199 soup = self.soup(html)
200 self.assertEqual("text/javascript", soup.find('script')['type'])
201
158 def test_comment(self): 202 def test_comment(self):
159 # Comments are represented as Comment objects. 203 # Comments are represented as Comment objects.
160 markup = "<p>foo<!--foobar-->baz</p>" 204 markup = "<p>foo<!--foobar-->baz</p>"
@@ -221,18 +265,26 @@ class HTMLTreeBuilderSmokeTest(object):
221 soup = self.soup(markup) 265 soup = self.soup(markup)
222 self.assertEqual(["css"], soup.div.div['class']) 266 self.assertEqual(["css"], soup.div.div['class'])
223 267
268 def test_multivalued_attribute_on_html(self):
269 # html5lib uses a different API to set the attributes ot the
270 # <html> tag. This has caused problems with multivalued
271 # attributes.
272 markup = '<html class="a b"></html>'
273 soup = self.soup(markup)
274 self.assertEqual(["a", "b"], soup.html['class'])
275
224 def test_angle_brackets_in_attribute_values_are_escaped(self): 276 def test_angle_brackets_in_attribute_values_are_escaped(self):
225 self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') 277 self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
226 278
227 def test_entities_in_attributes_converted_to_unicode(self): 279 def test_entities_in_attributes_converted_to_unicode(self):
228 expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' 280 expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
229 self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) 281 self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
230 self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) 282 self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
231 self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) 283 self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
232 self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) 284 self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
233 285
234 def test_entities_in_text_converted_to_unicode(self): 286 def test_entities_in_text_converted_to_unicode(self):
235 expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' 287 expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
236 self.assertSoupEquals("<p>pi&#241;ata</p>", expect) 288 self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
237 self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) 289 self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
238 self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) 290 self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -243,7 +295,7 @@ class HTMLTreeBuilderSmokeTest(object):
243 '<p>I said "good day!"</p>') 295 '<p>I said "good day!"</p>')
244 296
245 def test_out_of_range_entity(self): 297 def test_out_of_range_entity(self):
246 expect = u"\N{REPLACEMENT CHARACTER}" 298 expect = "\N{REPLACEMENT CHARACTER}"
247 self.assertSoupEquals("&#10000000000000;", expect) 299 self.assertSoupEquals("&#10000000000000;", expect)
248 self.assertSoupEquals("&#x10000000000000;", expect) 300 self.assertSoupEquals("&#x10000000000000;", expect)
249 self.assertSoupEquals("&#1000000000;", expect) 301 self.assertSoupEquals("&#1000000000;", expect)
@@ -253,6 +305,35 @@ class HTMLTreeBuilderSmokeTest(object):
253 soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") 305 soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
254 self.assertEqual("p", soup.h2.string.next_element.name) 306 self.assertEqual("p", soup.h2.string.next_element.name)
255 self.assertEqual("p", soup.p.name) 307 self.assertEqual("p", soup.p.name)
308 self.assertConnectedness(soup)
309
310 def test_head_tag_between_head_and_body(self):
311 "Prevent recurrence of a bug in the html5lib treebuilder."
312 content = """<html><head></head>
313 <link></link>
314 <body>foo</body>
315</html>
316"""
317 soup = self.soup(content)
318 self.assertNotEqual(None, soup.html.body)
319 self.assertConnectedness(soup)
320
321 def test_multiple_copies_of_a_tag(self):
322 "Prevent recurrence of a bug in the html5lib treebuilder."
323 content = """<!DOCTYPE html>
324<html>
325 <body>
326 <article id="a" >
327 <div><a href="1"></div>
328 <footer>
329 <a href="2"></a>
330 </footer>
331 </article>
332 </body>
333</html>
334"""
335 soup = self.soup(content)
336 self.assertConnectedness(soup.article)
256 337
257 def test_basic_namespaces(self): 338 def test_basic_namespaces(self):
258 """Parsers don't need to *understand* namespaces, but at the 339 """Parsers don't need to *understand* namespaces, but at the
@@ -285,9 +366,9 @@ class HTMLTreeBuilderSmokeTest(object):
285 # A seemingly innocuous document... but it's in Unicode! And 366 # A seemingly innocuous document... but it's in Unicode! And
286 # it contains characters that can't be represented in the 367 # it contains characters that can't be represented in the
287 # encoding found in the declaration! The horror! 368 # encoding found in the declaration! The horror!
288 markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' 369 markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
289 soup = self.soup(markup) 370 soup = self.soup(markup)
290 self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) 371 self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
291 372
292 def test_soupstrainer(self): 373 def test_soupstrainer(self):
293 """Parsers should be able to work with SoupStrainers.""" 374 """Parsers should be able to work with SoupStrainers."""
@@ -327,7 +408,7 @@ class HTMLTreeBuilderSmokeTest(object):
327 # Both XML and HTML entities are converted to Unicode characters 408 # Both XML and HTML entities are converted to Unicode characters
328 # during parsing. 409 # during parsing.
329 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" 410 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
330 expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>" 411 expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
331 self.assertSoupEquals(text, expected) 412 self.assertSoupEquals(text, expected)
332 413
333 def test_smart_quotes_converted_on_the_way_in(self): 414 def test_smart_quotes_converted_on_the_way_in(self):
@@ -337,15 +418,15 @@ class HTMLTreeBuilderSmokeTest(object):
337 soup = self.soup(quote) 418 soup = self.soup(quote)
338 self.assertEqual( 419 self.assertEqual(
339 soup.p.string, 420 soup.p.string,
340 u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") 421 "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
341 422
342 def test_non_breaking_spaces_converted_on_the_way_in(self): 423 def test_non_breaking_spaces_converted_on_the_way_in(self):
343 soup = self.soup("<a>&nbsp;&nbsp;</a>") 424 soup = self.soup("<a>&nbsp;&nbsp;</a>")
344 self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) 425 self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
345 426
346 def test_entities_converted_on_the_way_out(self): 427 def test_entities_converted_on_the_way_out(self):
347 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" 428 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
348 expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8") 429 expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
349 soup = self.soup(text) 430 soup = self.soup(text)
350 self.assertEqual(soup.p.encode("utf-8"), expected) 431 self.assertEqual(soup.p.encode("utf-8"), expected)
351 432
@@ -354,7 +435,7 @@ class HTMLTreeBuilderSmokeTest(object):
354 # easy-to-understand document. 435 # easy-to-understand document.
355 436
356 # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 437 # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
357 unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' 438 unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
358 439
359 # That's because we're going to encode it into ISO-Latin-1, and use 440 # That's because we're going to encode it into ISO-Latin-1, and use
360 # that to test. 441 # that to test.
@@ -463,11 +544,25 @@ class HTMLTreeBuilderSmokeTest(object):
463 544
464class XMLTreeBuilderSmokeTest(object): 545class XMLTreeBuilderSmokeTest(object):
465 546
547 def test_pickle_and_unpickle_identity(self):
548 # Pickling a tree, then unpickling it, yields a tree identical
549 # to the original.
550 tree = self.soup("<a><b>foo</a>")
551 dumped = pickle.dumps(tree, 2)
552 loaded = pickle.loads(dumped)
553 self.assertEqual(loaded.__class__, BeautifulSoup)
554 self.assertEqual(loaded.decode(), tree.decode())
555
466 def test_docstring_generated(self): 556 def test_docstring_generated(self):
467 soup = self.soup("<root/>") 557 soup = self.soup("<root/>")
468 self.assertEqual( 558 self.assertEqual(
469 soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') 559 soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
470 560
561 def test_xml_declaration(self):
562 markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
563 soup = self.soup(markup)
564 self.assertEqual(markup, soup.encode("utf8"))
565
471 def test_real_xhtml_document(self): 566 def test_real_xhtml_document(self):
472 """A real XHTML document should come out *exactly* the same as it went in.""" 567 """A real XHTML document should come out *exactly* the same as it went in."""
473 markup = b"""<?xml version="1.0" encoding="utf-8"?> 568 markup = b"""<?xml version="1.0" encoding="utf-8"?>
@@ -485,7 +580,7 @@ class XMLTreeBuilderSmokeTest(object):
485 <script type="text/javascript"> 580 <script type="text/javascript">
486 </script> 581 </script>
487""" 582"""
488 soup = BeautifulSoup(doc, "xml") 583 soup = BeautifulSoup(doc, "lxml-xml")
489 # lxml would have stripped this while parsing, but we can add 584 # lxml would have stripped this while parsing, but we can add
490 # it later. 585 # it later.
491 soup.script.string = 'console.log("< < hey > > ");' 586 soup.script.string = 'console.log("< < hey > > ");'
@@ -493,15 +588,15 @@ class XMLTreeBuilderSmokeTest(object):
493 self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded) 588 self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
494 589
495 def test_can_parse_unicode_document(self): 590 def test_can_parse_unicode_document(self):
496 markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' 591 markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
497 soup = self.soup(markup) 592 soup = self.soup(markup)
498 self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) 593 self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
499 594
500 def test_popping_namespaced_tag(self): 595 def test_popping_namespaced_tag(self):
501 markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' 596 markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
502 soup = self.soup(markup) 597 soup = self.soup(markup)
503 self.assertEqual( 598 self.assertEqual(
504 unicode(soup.rss), markup) 599 str(soup.rss), markup)
505 600
506 def test_docstring_includes_correct_encoding(self): 601 def test_docstring_includes_correct_encoding(self):
507 soup = self.soup("<root/>") 602 soup = self.soup("<root/>")
@@ -532,17 +627,17 @@ class XMLTreeBuilderSmokeTest(object):
532 def test_closing_namespaced_tag(self): 627 def test_closing_namespaced_tag(self):
533 markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' 628 markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
534 soup = self.soup(markup) 629 soup = self.soup(markup)
535 self.assertEqual(unicode(soup.p), markup) 630 self.assertEqual(str(soup.p), markup)
536 631
537 def test_namespaced_attributes(self): 632 def test_namespaced_attributes(self):
538 markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' 633 markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
539 soup = self.soup(markup) 634 soup = self.soup(markup)
540 self.assertEqual(unicode(soup.foo), markup) 635 self.assertEqual(str(soup.foo), markup)
541 636
542 def test_namespaced_attributes_xml_namespace(self): 637 def test_namespaced_attributes_xml_namespace(self):
543 markup = '<foo xml:lang="fr">bar</foo>' 638 markup = '<foo xml:lang="fr">bar</foo>'
544 soup = self.soup(markup) 639 soup = self.soup(markup)
545 self.assertEqual(unicode(soup.foo), markup) 640 self.assertEqual(str(soup.foo), markup)
546 641
547class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): 642class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
548 """Smoke test for a tree builder that supports HTML5.""" 643 """Smoke test for a tree builder that supports HTML5."""
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py
index 92ad10fb04..90cad82933 100644
--- a/bitbake/lib/bs4/tests/test_builder_registry.py
+++ b/bitbake/lib/bs4/tests/test_builder_registry.py
@@ -1,6 +1,7 @@
1"""Tests of the builder registry.""" 1"""Tests of the builder registry."""
2 2
3import unittest 3import unittest
4import warnings
4 5
5from bs4 import BeautifulSoup 6from bs4 import BeautifulSoup
6from bs4.builder import ( 7from bs4.builder import (
@@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
67 HTMLParserTreeBuilder) 68 HTMLParserTreeBuilder)
68 69
69 def test_beautifulsoup_constructor_does_lookup(self): 70 def test_beautifulsoup_constructor_does_lookup(self):
70 # You can pass in a string. 71
71 BeautifulSoup("", features="html") 72 with warnings.catch_warnings(record=True) as w:
72 # Or a list of strings. 73 # This will create a warning about not explicitly
73 BeautifulSoup("", features=["html", "fast"]) 74 # specifying a parser, but we'll ignore it.
75
76 # You can pass in a string.
77 BeautifulSoup("", features="html")
78 # Or a list of strings.
79 BeautifulSoup("", features=["html", "fast"])
74 80
75 # You'll get an exception if BS can't find an appropriate 81 # You'll get an exception if BS can't find an appropriate
76 # builder. 82 # builder.
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py
index 594c3e1f26..a7494ca5ba 100644
--- a/bitbake/lib/bs4/tests/test_html5lib.py
+++ b/bitbake/lib/bs4/tests/test_html5lib.py
@@ -5,7 +5,7 @@ import warnings
5try: 5try:
6 from bs4.builder import HTML5TreeBuilder 6 from bs4.builder import HTML5TreeBuilder
7 HTML5LIB_PRESENT = True 7 HTML5LIB_PRESENT = True
8except ImportError, e: 8except ImportError as e:
9 HTML5LIB_PRESENT = False 9 HTML5LIB_PRESENT = False
10from bs4.element import SoupStrainer 10from bs4.element import SoupStrainer
11from bs4.testing import ( 11from bs4.testing import (
@@ -74,12 +74,25 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
74 def test_reparented_markup(self): 74 def test_reparented_markup(self):
75 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' 75 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
76 soup = self.soup(markup) 76 soup = self.soup(markup)
77 self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) 77 self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
78 self.assertEqual(2, len(soup.find_all('p'))) 78 self.assertEqual(2, len(soup.find_all('p')))
79 79
80 80
81 def test_reparented_markup_ends_with_whitespace(self): 81 def test_reparented_markup_ends_with_whitespace(self):
82 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' 82 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
83 soup = self.soup(markup) 83 soup = self.soup(markup)
84 self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) 84 self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
85 self.assertEqual(2, len(soup.find_all('p'))) 85 self.assertEqual(2, len(soup.find_all('p')))
86
87 def test_processing_instruction(self):
88 """Processing instructions become comments."""
89 markup = b"""<?PITarget PIContent?>"""
90 soup = self.soup(markup)
91 assert str(soup).startswith("<!--?PITarget PIContent?-->")
92
93 def test_cloned_multivalue_node(self):
94 markup = b"""<a class="my_class"><p></a>"""
95 soup = self.soup(markup)
96 a1, a2 = soup.find_all('a')
97 self.assertEqual(a1, a2)
98 assert a1 is not a2
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py
index bcb5ed232f..b45e35f999 100644
--- a/bitbake/lib/bs4/tests/test_htmlparser.py
+++ b/bitbake/lib/bs4/tests/test_htmlparser.py
@@ -1,6 +1,8 @@
1"""Tests to ensure that the html.parser tree builder generates good 1"""Tests to ensure that the html.parser tree builder generates good
2trees.""" 2trees."""
3 3
4from pdb import set_trace
5import pickle
4from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 6from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5from bs4.builder import HTMLParserTreeBuilder 7from bs4.builder import HTMLParserTreeBuilder
6 8
@@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
17 def test_namespaced_public_doctype(self): 19 def test_namespaced_public_doctype(self):
18 # html.parser can't handle namespaced doctypes, so skip this one. 20 # html.parser can't handle namespaced doctypes, so skip this one.
19 pass 21 pass
22
23 def test_builder_is_pickled(self):
24 """Unlike most tree builders, HTMLParserTreeBuilder can be pickled and will
25 be restored after pickling.
26 """
27 tree = self.soup("<a><b>foo</a>")
28 dumped = pickle.dumps(tree, 2)
29 loaded = pickle.loads(dumped)
30 self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
31
32
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py
index 2b2e9b7e78..6c2a1d73eb 100644
--- a/bitbake/lib/bs4/tests/test_lxml.py
+++ b/bitbake/lib/bs4/tests/test_lxml.py
@@ -7,7 +7,7 @@ try:
7 import lxml.etree 7 import lxml.etree
8 LXML_PRESENT = True 8 LXML_PRESENT = True
9 LXML_VERSION = lxml.etree.LXML_VERSION 9 LXML_VERSION = lxml.etree.LXML_VERSION
10except ImportError, e: 10except ImportError as e:
11 LXML_PRESENT = False 11 LXML_PRESENT = False
12 LXML_VERSION = (0,) 12 LXML_VERSION = (0,)
13 13
@@ -62,24 +62,9 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
62 # if one is installed. 62 # if one is installed.
63 with warnings.catch_warnings(record=True) as w: 63 with warnings.catch_warnings(record=True) as w:
64 soup = BeautifulStoneSoup("<b />") 64 soup = BeautifulStoneSoup("<b />")
65 self.assertEqual(u"<b/>", unicode(soup.b)) 65 self.assertEqual("<b/>", str(soup.b))
66 self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 66 self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 67
68 def test_real_xhtml_document(self):
69 """lxml strips the XML definition from an XHTML doc, which is fine."""
70 markup = b"""<?xml version="1.0" encoding="utf-8"?>
71<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
72<html xmlns="http://www.w3.org/1999/xhtml">
73<head><title>Hello.</title></head>
74<body>Goodbye.</body>
75</html>"""
76 soup = self.soup(markup)
77 self.assertEqual(
78 soup.encode("utf-8").replace(b"\n", b''),
79 markup.replace(b'\n', b'').replace(
80 b'<?xml version="1.0" encoding="utf-8"?>', b''))
81
82
83@skipIf( 68@skipIf(
84 not LXML_PRESENT, 69 not LXML_PRESENT,
85 "lxml seems not to be present, not testing its XML tree builder.") 70 "lxml seems not to be present, not testing its XML tree builder.")
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py
index 47ac245f99..f87949e3d3 100644
--- a/bitbake/lib/bs4/tests/test_soup.py
+++ b/bitbake/lib/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
1# -*- coding: utf-8 -*- 1# -*- coding: utf-8 -*-
2"""Tests of Beautiful Soup as a whole.""" 2"""Tests of Beautiful Soup as a whole."""
3 3
4from pdb import set_trace
4import logging 5import logging
5import unittest 6import unittest
6import sys 7import sys
@@ -20,6 +21,7 @@ import bs4.dammit
20from bs4.dammit import ( 21from bs4.dammit import (
21 EntitySubstitution, 22 EntitySubstitution,
22 UnicodeDammit, 23 UnicodeDammit,
24 EncodingDetector,
23) 25)
24from bs4.testing import ( 26from bs4.testing import (
25 SoupTest, 27 SoupTest,
@@ -30,7 +32,7 @@ import warnings
30try: 32try:
31 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 33 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
32 LXML_PRESENT = True 34 LXML_PRESENT = True
33except ImportError, e: 35except ImportError as e:
34 LXML_PRESENT = False 36 LXML_PRESENT = False
35 37
36PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) 38PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
@@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
39class TestConstructor(SoupTest): 41class TestConstructor(SoupTest):
40 42
41 def test_short_unicode_input(self): 43 def test_short_unicode_input(self):
42 data = u"<h1>éé</h1>" 44 data = "<h1>éé</h1>"
43 soup = self.soup(data) 45 soup = self.soup(data)
44 self.assertEqual(u"éé", soup.h1.string) 46 self.assertEqual("éé", soup.h1.string)
45 47
46 def test_embedded_null(self): 48 def test_embedded_null(self):
47 data = u"<h1>foo\0bar</h1>" 49 data = "<h1>foo\0bar</h1>"
48 soup = self.soup(data) 50 soup = self.soup(data)
49 self.assertEqual(u"foo\0bar", soup.h1.string) 51 self.assertEqual("foo\0bar", soup.h1.string)
50 52
53 def test_exclude_encodings(self):
54 utf8_data = "Räksmörgås".encode("utf-8")
55 soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
56 self.assertEqual("windows-1252", soup.original_encoding)
51 57
52class TestDeprecatedConstructorArguments(SoupTest): 58
59class TestWarnings(SoupTest):
60
61 def _no_parser_specified(self, s, is_there=True):
62 v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
63 self.assertTrue(v)
64
65 def test_warning_if_no_parser_specified(self):
66 with warnings.catch_warnings(record=True) as w:
67 soup = self.soup("<a><b></b></a>")
68 msg = str(w[0].message)
69 self._assert_no_parser_specified(msg)
70
71 def test_warning_if_parser_specified_too_vague(self):
72 with warnings.catch_warnings(record=True) as w:
73 soup = self.soup("<a><b></b></a>", "html")
74 msg = str(w[0].message)
75 self._assert_no_parser_specified(msg)
76
77 def test_no_warning_if_explicit_parser_specified(self):
78 with warnings.catch_warnings(record=True) as w:
79 soup = self.soup("<a><b></b></a>", "html.parser")
80 self.assertEqual([], w)
53 81
54 def test_parseOnlyThese_renamed_to_parse_only(self): 82 def test_parseOnlyThese_renamed_to_parse_only(self):
55 with warnings.catch_warnings(record=True) as w: 83 with warnings.catch_warnings(record=True) as w:
@@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase):
117 def test_simple_html_substitution(self): 145 def test_simple_html_substitution(self):
118 # Unicode characters corresponding to named HTML entities 146 # Unicode characters corresponding to named HTML entities
119 # are substituted, and no others. 147 # are substituted, and no others.
120 s = u"foo\u2200\N{SNOWMAN}\u00f5bar" 148 s = "foo\u2200\N{SNOWMAN}\u00f5bar"
121 self.assertEqual(self.sub.substitute_html(s), 149 self.assertEqual(self.sub.substitute_html(s),
122 u"foo&forall;\N{SNOWMAN}&otilde;bar") 150 "foo&forall;\N{SNOWMAN}&otilde;bar")
123 151
124 def test_smart_quote_substitution(self): 152 def test_smart_quote_substitution(self):
125 # MS smart quotes are a common source of frustration, so we 153 # MS smart quotes are a common source of frustration, so we
@@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest):
184 212
185 def setUp(self): 213 def setUp(self):
186 super(TestEncodingConversion, self).setUp() 214 super(TestEncodingConversion, self).setUp()
187 self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' 215 self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
188 self.utf8_data = self.unicode_data.encode("utf-8") 216 self.utf8_data = self.unicode_data.encode("utf-8")
189 # Just so you know what it looks like. 217 # Just so you know what it looks like.
190 self.assertEqual( 218 self.assertEqual(
@@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest):
204 ascii = b"<foo>a</foo>" 232 ascii = b"<foo>a</foo>"
205 soup_from_ascii = self.soup(ascii) 233 soup_from_ascii = self.soup(ascii)
206 unicode_output = soup_from_ascii.decode() 234 unicode_output = soup_from_ascii.decode()
207 self.assertTrue(isinstance(unicode_output, unicode)) 235 self.assertTrue(isinstance(unicode_output, str))
208 self.assertEqual(unicode_output, self.document_for(ascii.decode())) 236 self.assertEqual(unicode_output, self.document_for(ascii.decode()))
209 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") 237 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
210 finally: 238 finally:
@@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest):
216 # is not set. 244 # is not set.
217 soup_from_unicode = self.soup(self.unicode_data) 245 soup_from_unicode = self.soup(self.unicode_data)
218 self.assertEqual(soup_from_unicode.decode(), self.unicode_data) 246 self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
219 self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') 247 self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
220 self.assertEqual(soup_from_unicode.original_encoding, None) 248 self.assertEqual(soup_from_unicode.original_encoding, None)
221 249
222 def test_utf8_in_unicode_out(self): 250 def test_utf8_in_unicode_out(self):
@@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest):
224 # attribute is set. 252 # attribute is set.
225 soup_from_utf8 = self.soup(self.utf8_data) 253 soup_from_utf8 = self.soup(self.utf8_data)
226 self.assertEqual(soup_from_utf8.decode(), self.unicode_data) 254 self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
227 self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') 255 self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
228 256
229 def test_utf8_out(self): 257 def test_utf8_out(self):
230 # The internal data structures can be encoded as UTF-8. 258 # The internal data structures can be encoded as UTF-8.
@@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest):
235 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, 263 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
236 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") 264 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
237 def test_attribute_name_containing_unicode_characters(self): 265 def test_attribute_name_containing_unicode_characters(self):
238 markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' 266 markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
239 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) 267 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
240 268
241class TestUnicodeDammit(unittest.TestCase): 269class TestUnicodeDammit(unittest.TestCase):
242 """Standalone tests of UnicodeDammit.""" 270 """Standalone tests of UnicodeDammit."""
243 271
244 def test_unicode_input(self): 272 def test_unicode_input(self):
245 markup = u"I'm already Unicode! \N{SNOWMAN}" 273 markup = "I'm already Unicode! \N{SNOWMAN}"
246 dammit = UnicodeDammit(markup) 274 dammit = UnicodeDammit(markup)
247 self.assertEqual(dammit.unicode_markup, markup) 275 self.assertEqual(dammit.unicode_markup, markup)
248 276
@@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase):
250 markup = b"<foo>\x91\x92\x93\x94</foo>" 278 markup = b"<foo>\x91\x92\x93\x94</foo>"
251 dammit = UnicodeDammit(markup) 279 dammit = UnicodeDammit(markup)
252 self.assertEqual( 280 self.assertEqual(
253 dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") 281 dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
254 282
255 def test_smart_quotes_to_xml_entities(self): 283 def test_smart_quotes_to_xml_entities(self):
256 markup = b"<foo>\x91\x92\x93\x94</foo>" 284 markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase):
271 dammit.unicode_markup, """<foo>''""</foo>""") 299 dammit.unicode_markup, """<foo>''""</foo>""")
272 300
273 def test_detect_utf8(self): 301 def test_detect_utf8(self):
274 utf8 = b"\xc3\xa9" 302 utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
275 dammit = UnicodeDammit(utf8) 303 dammit = UnicodeDammit(utf8)
276 self.assertEqual(dammit.unicode_markup, u'\xe9')
277 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 304 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
305 self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
306
278 307
279 def test_convert_hebrew(self): 308 def test_convert_hebrew(self):
280 hebrew = b"\xed\xe5\xec\xf9" 309 hebrew = b"\xed\xe5\xec\xf9"
281 dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) 310 dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
282 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') 311 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
283 self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') 312 self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
284 313
285 def test_dont_see_smart_quotes_where_there_are_none(self): 314 def test_dont_see_smart_quotes_where_there_are_none(self):
286 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" 315 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase):
289 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) 318 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
290 319
291 def test_ignore_inappropriate_codecs(self): 320 def test_ignore_inappropriate_codecs(self):
292 utf8_data = u"Räksmörgås".encode("utf-8") 321 utf8_data = "Räksmörgås".encode("utf-8")
293 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) 322 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
294 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 323 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
295 324
296 def test_ignore_invalid_codecs(self): 325 def test_ignore_invalid_codecs(self):
297 utf8_data = u"Räksmörgås".encode("utf-8") 326 utf8_data = "Räksmörgås".encode("utf-8")
298 for bad_encoding in ['.utf8', '...', 'utF---16.!']: 327 for bad_encoding in ['.utf8', '...', 'utF---16.!']:
299 dammit = UnicodeDammit(utf8_data, [bad_encoding]) 328 dammit = UnicodeDammit(utf8_data, [bad_encoding])
300 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') 329 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
301 330
331 def test_exclude_encodings(self):
332 # This is UTF-8.
333 utf8_data = "Räksmörgås".encode("utf-8")
334
335 # But if we exclude UTF-8 from consideration, the guess is
336 # Windows-1252.
337 dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
338 self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
339
340 # And if we exclude that, there is no valid guess at all.
341 dammit = UnicodeDammit(
342 utf8_data, exclude_encodings=["utf-8", "windows-1252"])
343 self.assertEqual(dammit.original_encoding, None)
344
345 def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
346 detected = EncodingDetector(
347 b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
348 encodings = list(detected.encodings)
349 assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
350
302 def test_detect_html5_style_meta_tag(self): 351 def test_detect_html5_style_meta_tag(self):
303 352
304 for data in ( 353 for data in (
@@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase):
337 bs4.dammit.chardet_dammit = noop 386 bs4.dammit.chardet_dammit = noop
338 dammit = UnicodeDammit(doc) 387 dammit = UnicodeDammit(doc)
339 self.assertEqual(True, dammit.contains_replacement_characters) 388 self.assertEqual(True, dammit.contains_replacement_characters)
340 self.assertTrue(u"\ufffd" in dammit.unicode_markup) 389 self.assertTrue("\ufffd" in dammit.unicode_markup)
341 390
342 soup = BeautifulSoup(doc, "html.parser") 391 soup = BeautifulSoup(doc, "html.parser")
343 self.assertTrue(soup.contains_replacement_characters) 392 self.assertTrue(soup.contains_replacement_characters)
@@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase):
349 # A document written in UTF-16LE will have its byte order marker stripped. 398 # A document written in UTF-16LE will have its byte order marker stripped.
350 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' 399 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
351 dammit = UnicodeDammit(data) 400 dammit = UnicodeDammit(data)
352 self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) 401 self.assertEqual("<a>áé</a>", dammit.unicode_markup)
353 self.assertEqual("utf-16le", dammit.original_encoding) 402 self.assertEqual("utf-16le", dammit.original_encoding)
354 403
355 def test_detwingle(self): 404 def test_detwingle(self):
356 # Here's a UTF8 document. 405 # Here's a UTF8 document.
357 utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") 406 utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
358 407
359 # Here's a Windows-1252 document. 408 # Here's a Windows-1252 document.
360 windows_1252 = ( 409 windows_1252 = (
361 u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" 410 "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
362 u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") 411 "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
363 412
364 # Through some unholy alchemy, they've been stuck together. 413 # Through some unholy alchemy, they've been stuck together.
365 doc = utf8 + windows_1252 + utf8 414 doc = utf8 + windows_1252 + utf8
@@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase):
374 423
375 fixed = UnicodeDammit.detwingle(doc) 424 fixed = UnicodeDammit.detwingle(doc)
376 self.assertEqual( 425 self.assertEqual(
377 u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) 426 "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
378 427
379 def test_detwingle_ignores_multibyte_characters(self): 428 def test_detwingle_ignores_multibyte_characters(self):
380 # Each of these characters has a UTF-8 representation ending 429 # Each of these characters has a UTF-8 representation ending
@@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase):
382 # Windows-1252. But our code knows to skip over multibyte 431 # Windows-1252. But our code knows to skip over multibyte
383 # UTF-8 characters, so they'll survive the process unscathed. 432 # UTF-8 characters, so they'll survive the process unscathed.
384 for tricky_unicode_char in ( 433 for tricky_unicode_char in (
385 u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' 434 "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
386 u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' 435 "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
387 u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. 436 "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
388 ): 437 ):
389 input = tricky_unicode_char.encode("utf8") 438 input = tricky_unicode_char.encode("utf8")
390 self.assertTrue(input.endswith(b'\x93')) 439 self.assertTrue(input.endswith(b'\x93'))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py
index f8515c0ea1..6d3e67f311 100644
--- a/bitbake/lib/bs4/tests/test_tree.py
+++ b/bitbake/lib/bs4/tests/test_tree.py
@@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here. 9methods tested here.
10""" 10"""
11 11
12from pdb import set_trace
12import copy 13import copy
13import pickle 14import pickle
14import re 15import re
@@ -19,8 +20,10 @@ from bs4.builder import (
19 HTMLParserTreeBuilder, 20 HTMLParserTreeBuilder,
20) 21)
21from bs4.element import ( 22from bs4.element import (
23 PY3K,
22 CData, 24 CData,
23 Comment, 25 Comment,
26 Declaration,
24 Doctype, 27 Doctype,
25 NavigableString, 28 NavigableString,
26 SoupStrainer, 29 SoupStrainer,
@@ -67,8 +70,14 @@ class TestFind(TreeTest):
67 self.assertEqual(soup.find("b").string, "2") 70 self.assertEqual(soup.find("b").string, "2")
68 71
69 def test_unicode_text_find(self): 72 def test_unicode_text_find(self):
70 soup = self.soup(u'<h1>Räksmörgås</h1>') 73 soup = self.soup('<h1>Räksmörgås</h1>')
71 self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') 74 self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
75
76 def test_unicode_attribute_find(self):
77 soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
78 str(soup)
79 self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
80
72 81
73 def test_find_everything(self): 82 def test_find_everything(self):
74 """Test an optimization that finds all tags.""" 83 """Test an optimization that finds all tags."""
@@ -87,16 +96,17 @@ class TestFindAll(TreeTest):
87 """You can search the tree for text nodes.""" 96 """You can search the tree for text nodes."""
88 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") 97 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
89 # Exact match. 98 # Exact match.
90 self.assertEqual(soup.find_all(text="bar"), [u"bar"]) 99 self.assertEqual(soup.find_all(string="bar"), ["bar"])
100 self.assertEqual(soup.find_all(text="bar"), ["bar"])
91 # Match any of a number of strings. 101 # Match any of a number of strings.
92 self.assertEqual( 102 self.assertEqual(
93 soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) 103 soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
94 # Match a regular expression. 104 # Match a regular expression.
95 self.assertEqual(soup.find_all(text=re.compile('.*')), 105 self.assertEqual(soup.find_all(text=re.compile('.*')),
96 [u"Foo", u"bar", u'\xbb']) 106 ["Foo", "bar", '\xbb'])
97 # Match anything. 107 # Match anything.
98 self.assertEqual(soup.find_all(text=True), 108 self.assertEqual(soup.find_all(text=True),
99 [u"Foo", u"bar", u'\xbb']) 109 ["Foo", "bar", '\xbb'])
100 110
101 def test_find_all_limit(self): 111 def test_find_all_limit(self):
102 """You can limit the number of items returned by find_all.""" 112 """You can limit the number of items returned by find_all."""
@@ -227,8 +237,8 @@ class TestFindAllByAttribute(TreeTest):
227 ["Matching a.", "Matching b."]) 237 ["Matching a.", "Matching b."])
228 238
229 def test_find_all_by_utf8_attribute_value(self): 239 def test_find_all_by_utf8_attribute_value(self):
230 peace = u"םולש".encode("utf8") 240 peace = "םולש".encode("utf8")
231 data = u'<a title="םולש"></a>'.encode("utf8") 241 data = '<a title="םולש"></a>'.encode("utf8")
232 soup = self.soup(data) 242 soup = self.soup(data)
233 self.assertEqual([soup.a], soup.find_all(title=peace)) 243 self.assertEqual([soup.a], soup.find_all(title=peace))
234 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) 244 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -688,7 +698,7 @@ class TestTagCreation(SoupTest):
688 698
689 def test_tag_inherits_self_closing_rules_from_builder(self): 699 def test_tag_inherits_self_closing_rules_from_builder(self):
690 if XML_BUILDER_PRESENT: 700 if XML_BUILDER_PRESENT:
691 xml_soup = BeautifulSoup("", "xml") 701 xml_soup = BeautifulSoup("", "lxml-xml")
692 xml_br = xml_soup.new_tag("br") 702 xml_br = xml_soup.new_tag("br")
693 xml_p = xml_soup.new_tag("p") 703 xml_p = xml_soup.new_tag("p")
694 704
@@ -697,7 +707,7 @@ class TestTagCreation(SoupTest):
697 self.assertEqual(b"<br/>", xml_br.encode()) 707 self.assertEqual(b"<br/>", xml_br.encode())
698 self.assertEqual(b"<p/>", xml_p.encode()) 708 self.assertEqual(b"<p/>", xml_p.encode())
699 709
700 html_soup = BeautifulSoup("", "html") 710 html_soup = BeautifulSoup("", "html.parser")
701 html_br = html_soup.new_tag("br") 711 html_br = html_soup.new_tag("br")
702 html_p = html_soup.new_tag("p") 712 html_p = html_soup.new_tag("p")
703 713
@@ -773,6 +783,14 @@ class TestTreeModification(SoupTest):
773 new_a = a.unwrap() 783 new_a = a.unwrap()
774 self.assertEqual(a, new_a) 784 self.assertEqual(a, new_a)
775 785
786 def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
787 soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
788 a = soup.a
789 a.extract()
790 self.assertEqual(None, a.parent)
791 self.assertRaises(ValueError, a.unwrap)
792 self.assertRaises(ValueError, a.replace_with, soup.c)
793
776 def test_replace_tag_with_itself(self): 794 def test_replace_tag_with_itself(self):
777 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" 795 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
778 soup = self.soup(text) 796 soup = self.soup(text)
@@ -1067,6 +1085,31 @@ class TestTreeModification(SoupTest):
1067 self.assertEqual(foo_2, soup.a.string) 1085 self.assertEqual(foo_2, soup.a.string)
1068 self.assertEqual(bar_2, soup.b.string) 1086 self.assertEqual(bar_2, soup.b.string)
1069 1087
1088 def test_extract_multiples_of_same_tag(self):
1089 soup = self.soup("""
1090<html>
1091<head>
1092<script>foo</script>
1093</head>
1094<body>
1095 <script>bar</script>
1096 <a></a>
1097</body>
1098<script>baz</script>
1099</html>""")
1100 [soup.script.extract() for i in soup.find_all("script")]
1101 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
1102
1103
1104 def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
1105 soup = self.soup(
1106 '<html>\n'
1107 '<body>hi</body>\n'
1108 '</html>')
1109 soup.find('body').extract()
1110 self.assertEqual(None, soup.find('body'))
1111
1112
1070 def test_clear(self): 1113 def test_clear(self):
1071 """Tag.clear()""" 1114 """Tag.clear()"""
1072 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") 1115 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
@@ -1287,27 +1330,72 @@ class TestPersistence(SoupTest):
1287 1330
1288 def test_unicode_pickle(self): 1331 def test_unicode_pickle(self):
1289 # A tree containing Unicode characters can be pickled. 1332 # A tree containing Unicode characters can be pickled.
1290 html = u"<b>\N{SNOWMAN}</b>" 1333 html = "<b>\N{SNOWMAN}</b>"
1291 soup = self.soup(html) 1334 soup = self.soup(html)
1292 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) 1335 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
1293 loaded = pickle.loads(dumped) 1336 loaded = pickle.loads(dumped)
1294 self.assertEqual(loaded.decode(), soup.decode()) 1337 self.assertEqual(loaded.decode(), soup.decode())
1295 1338
1339 def test_copy_navigablestring_is_not_attached_to_tree(self):
1340 html = "<b>Foo<a></a></b><b>Bar</b>"
1341 soup = self.soup(html)
1342 s1 = soup.find(string="Foo")
1343 s2 = copy.copy(s1)
1344 self.assertEqual(s1, s2)
1345 self.assertEqual(None, s2.parent)
1346 self.assertEqual(None, s2.next_element)
1347 self.assertNotEqual(None, s1.next_sibling)
1348 self.assertEqual(None, s2.next_sibling)
1349 self.assertEqual(None, s2.previous_element)
1350
1351 def test_copy_navigablestring_subclass_has_same_type(self):
1352 html = "<b><!--Foo--></b>"
1353 soup = self.soup(html)
1354 s1 = soup.string
1355 s2 = copy.copy(s1)
1356 self.assertEqual(s1, s2)
1357 self.assertTrue(isinstance(s2, Comment))
1358
1359 def test_copy_entire_soup(self):
1360 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
1361 soup = self.soup(html)
1362 soup_copy = copy.copy(soup)
1363 self.assertEqual(soup, soup_copy)
1364
1365 def test_copy_tag_copies_contents(self):
1366 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
1367 soup = self.soup(html)
1368 div = soup.div
1369 div_copy = copy.copy(div)
1370
1371 # The two tags look the same, and evaluate to equal.
1372 self.assertEqual(str(div), str(div_copy))
1373 self.assertEqual(div, div_copy)
1374
1375 # But they're not the same object.
1376 self.assertFalse(div is div_copy)
1377
1378 # And they don't have the same relation to the parse tree. The
1379 # copy is not associated with a parse tree at all.
1380 self.assertEqual(None, div_copy.parent)
1381 self.assertEqual(None, div_copy.previous_element)
1382 self.assertEqual(None, div_copy.find(string='Bar').next_element)
1383 self.assertNotEqual(None, div.find(string='Bar').next_element)
1296 1384
1297class TestSubstitutions(SoupTest): 1385class TestSubstitutions(SoupTest):
1298 1386
1299 def test_default_formatter_is_minimal(self): 1387 def test_default_formatter_is_minimal(self):
1300 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1388 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1301 soup = self.soup(markup) 1389 soup = self.soup(markup)
1302 decoded = soup.decode(formatter="minimal") 1390 decoded = soup.decode(formatter="minimal")
1303 # The < is converted back into &lt; but the e-with-acute is left alone. 1391 # The < is converted back into &lt; but the e-with-acute is left alone.
1304 self.assertEqual( 1392 self.assertEqual(
1305 decoded, 1393 decoded,
1306 self.document_for( 1394 self.document_for(
1307 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) 1395 "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1308 1396
1309 def test_formatter_html(self): 1397 def test_formatter_html(self):
1310 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1398 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1311 soup = self.soup(markup) 1399 soup = self.soup(markup)
1312 decoded = soup.decode(formatter="html") 1400 decoded = soup.decode(formatter="html")
1313 self.assertEqual( 1401 self.assertEqual(
@@ -1315,49 +1403,49 @@ class TestSubstitutions(SoupTest):
1315 self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>")) 1403 self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
1316 1404
1317 def test_formatter_minimal(self): 1405 def test_formatter_minimal(self):
1318 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1406 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1319 soup = self.soup(markup) 1407 soup = self.soup(markup)
1320 decoded = soup.decode(formatter="minimal") 1408 decoded = soup.decode(formatter="minimal")
1321 # The < is converted back into &lt; but the e-with-acute is left alone. 1409 # The < is converted back into &lt; but the e-with-acute is left alone.
1322 self.assertEqual( 1410 self.assertEqual(
1323 decoded, 1411 decoded,
1324 self.document_for( 1412 self.document_for(
1325 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) 1413 "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1326 1414
1327 def test_formatter_null(self): 1415 def test_formatter_null(self):
1328 markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" 1416 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1329 soup = self.soup(markup) 1417 soup = self.soup(markup)
1330 decoded = soup.decode(formatter=None) 1418 decoded = soup.decode(formatter=None)
1331 # Neither the angle brackets nor the e-with-acute are converted. 1419 # Neither the angle brackets nor the e-with-acute are converted.
1332 # This is not valid HTML, but it's what the user wanted. 1420 # This is not valid HTML, but it's what the user wanted.
1333 self.assertEqual(decoded, 1421 self.assertEqual(decoded,
1334 self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1422 self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
1335 1423
1336 def test_formatter_custom(self): 1424 def test_formatter_custom(self):
1337 markup = u"<b>&lt;foo&gt;</b><b>bar</b>" 1425 markup = "<b>&lt;foo&gt;</b><b>bar</b>"
1338 soup = self.soup(markup) 1426 soup = self.soup(markup)
1339 decoded = soup.decode(formatter = lambda x: x.upper()) 1427 decoded = soup.decode(formatter = lambda x: x.upper())
1340 # Instead of normal entity conversion code, the custom 1428 # Instead of normal entity conversion code, the custom
1341 # callable is called on every string. 1429 # callable is called on every string.
1342 self.assertEqual( 1430 self.assertEqual(
1343 decoded, 1431 decoded,
1344 self.document_for(u"<b><FOO></b><b>BAR</b>")) 1432 self.document_for("<b><FOO></b><b>BAR</b>"))
1345 1433
1346 def test_formatter_is_run_on_attribute_values(self): 1434 def test_formatter_is_run_on_attribute_values(self):
1347 markup = u'<a href="http://a.com?a=b&c=é">e</a>' 1435 markup = '<a href="http://a.com?a=b&c=é">e</a>'
1348 soup = self.soup(markup) 1436 soup = self.soup(markup)
1349 a = soup.a 1437 a = soup.a
1350 1438
1351 expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' 1439 expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
1352 1440
1353 self.assertEqual(expect_minimal, a.decode()) 1441 self.assertEqual(expect_minimal, a.decode())
1354 self.assertEqual(expect_minimal, a.decode(formatter="minimal")) 1442 self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
1355 1443
1356 expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' 1444 expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
1357 self.assertEqual(expect_html, a.decode(formatter="html")) 1445 self.assertEqual(expect_html, a.decode(formatter="html"))
1358 1446
1359 self.assertEqual(markup, a.decode(formatter=None)) 1447 self.assertEqual(markup, a.decode(formatter=None))
1360 expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' 1448 expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
1361 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) 1449 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
1362 1450
1363 def test_formatter_skips_script_tag_for_html_documents(self): 1451 def test_formatter_skips_script_tag_for_html_documents(self):
@@ -1366,7 +1454,7 @@ class TestSubstitutions(SoupTest):
1366 console.log("< < hey > > "); 1454 console.log("< < hey > > ");
1367 </script> 1455 </script>
1368""" 1456"""
1369 encoded = BeautifulSoup(doc).encode() 1457 encoded = BeautifulSoup(doc, 'html.parser').encode()
1370 self.assertTrue(b"< < hey > >" in encoded) 1458 self.assertTrue(b"< < hey > >" in encoded)
1371 1459
1372 def test_formatter_skips_style_tag_for_html_documents(self): 1460 def test_formatter_skips_style_tag_for_html_documents(self):
@@ -1375,7 +1463,7 @@ class TestSubstitutions(SoupTest):
1375 console.log("< < hey > > "); 1463 console.log("< < hey > > ");
1376 </style> 1464 </style>
1377""" 1465"""
1378 encoded = BeautifulSoup(doc).encode() 1466 encoded = BeautifulSoup(doc, 'html.parser').encode()
1379 self.assertTrue(b"< < hey > >" in encoded) 1467 self.assertTrue(b"< < hey > >" in encoded)
1380 1468
1381 def test_prettify_leaves_preformatted_text_alone(self): 1469 def test_prettify_leaves_preformatted_text_alone(self):
@@ -1383,24 +1471,24 @@ class TestSubstitutions(SoupTest):
1383 # Everything outside the <pre> tag is reformatted, but everything 1471 # Everything outside the <pre> tag is reformatted, but everything
1384 # inside is left alone. 1472 # inside is left alone.
1385 self.assertEqual( 1473 self.assertEqual(
1386 u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', 1474 '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
1387 soup.div.prettify()) 1475 soup.div.prettify())
1388 1476
1389 def test_prettify_accepts_formatter(self): 1477 def test_prettify_accepts_formatter(self):
1390 soup = BeautifulSoup("<html><body>foo</body></html>") 1478 soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
1391 pretty = soup.prettify(formatter = lambda x: x.upper()) 1479 pretty = soup.prettify(formatter = lambda x: x.upper())
1392 self.assertTrue("FOO" in pretty) 1480 self.assertTrue("FOO" in pretty)
1393 1481
1394 def test_prettify_outputs_unicode_by_default(self): 1482 def test_prettify_outputs_unicode_by_default(self):
1395 soup = self.soup("<a></a>") 1483 soup = self.soup("<a></a>")
1396 self.assertEqual(unicode, type(soup.prettify())) 1484 self.assertEqual(str, type(soup.prettify()))
1397 1485
1398 def test_prettify_can_encode_data(self): 1486 def test_prettify_can_encode_data(self):
1399 soup = self.soup("<a></a>") 1487 soup = self.soup("<a></a>")
1400 self.assertEqual(bytes, type(soup.prettify("utf-8"))) 1488 self.assertEqual(bytes, type(soup.prettify("utf-8")))
1401 1489
1402 def test_html_entity_substitution_off_by_default(self): 1490 def test_html_entity_substitution_off_by_default(self):
1403 markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" 1491 markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
1404 soup = self.soup(markup) 1492 soup = self.soup(markup)
1405 encoded = soup.b.encode("utf-8") 1493 encoded = soup.b.encode("utf-8")
1406 self.assertEqual(encoded, markup.encode('utf-8')) 1494 self.assertEqual(encoded, markup.encode('utf-8'))
@@ -1444,45 +1532,53 @@ class TestEncoding(SoupTest):
1444 """Test the ability to encode objects into strings.""" 1532 """Test the ability to encode objects into strings."""
1445 1533
1446 def test_unicode_string_can_be_encoded(self): 1534 def test_unicode_string_can_be_encoded(self):
1447 html = u"<b>\N{SNOWMAN}</b>" 1535 html = "<b>\N{SNOWMAN}</b>"
1448 soup = self.soup(html) 1536 soup = self.soup(html)
1449 self.assertEqual(soup.b.string.encode("utf-8"), 1537 self.assertEqual(soup.b.string.encode("utf-8"),
1450 u"\N{SNOWMAN}".encode("utf-8")) 1538 "\N{SNOWMAN}".encode("utf-8"))
1451 1539
1452 def test_tag_containing_unicode_string_can_be_encoded(self): 1540 def test_tag_containing_unicode_string_can_be_encoded(self):
1453 html = u"<b>\N{SNOWMAN}</b>" 1541 html = "<b>\N{SNOWMAN}</b>"
1454 soup = self.soup(html) 1542 soup = self.soup(html)
1455 self.assertEqual( 1543 self.assertEqual(
1456 soup.b.encode("utf-8"), html.encode("utf-8")) 1544 soup.b.encode("utf-8"), html.encode("utf-8"))
1457 1545
1458 def test_encoding_substitutes_unrecognized_characters_by_default(self): 1546 def test_encoding_substitutes_unrecognized_characters_by_default(self):
1459 html = u"<b>\N{SNOWMAN}</b>" 1547 html = "<b>\N{SNOWMAN}</b>"
1460 soup = self.soup(html) 1548 soup = self.soup(html)
1461 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>") 1549 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
1462 1550
1463 def test_encoding_can_be_made_strict(self): 1551 def test_encoding_can_be_made_strict(self):
1464 html = u"<b>\N{SNOWMAN}</b>" 1552 html = "<b>\N{SNOWMAN}</b>"
1465 soup = self.soup(html) 1553 soup = self.soup(html)
1466 self.assertRaises( 1554 self.assertRaises(
1467 UnicodeEncodeError, soup.encode, "ascii", errors="strict") 1555 UnicodeEncodeError, soup.encode, "ascii", errors="strict")
1468 1556
1469 def test_decode_contents(self): 1557 def test_decode_contents(self):
1470 html = u"<b>\N{SNOWMAN}</b>" 1558 html = "<b>\N{SNOWMAN}</b>"
1471 soup = self.soup(html) 1559 soup = self.soup(html)
1472 self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) 1560 self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
1473 1561
1474 def test_encode_contents(self): 1562 def test_encode_contents(self):
1475 html = u"<b>\N{SNOWMAN}</b>" 1563 html = "<b>\N{SNOWMAN}</b>"
1476 soup = self.soup(html) 1564 soup = self.soup(html)
1477 self.assertEqual( 1565 self.assertEqual(
1478 u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( 1566 "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
1479 encoding="utf8")) 1567 encoding="utf8"))
1480 1568
1481 def test_deprecated_renderContents(self): 1569 def test_deprecated_renderContents(self):
1482 html = u"<b>\N{SNOWMAN}</b>" 1570 html = "<b>\N{SNOWMAN}</b>"
1483 soup = self.soup(html) 1571 soup = self.soup(html)
1484 self.assertEqual( 1572 self.assertEqual(
1485 u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) 1573 "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
1574
1575 def test_repr(self):
1576 html = "<b>\N{SNOWMAN}</b>"
1577 soup = self.soup(html)
1578 if PY3K:
1579 self.assertEqual(html, repr(soup))
1580 else:
1581 self.assertEqual(b'<b>\\u2603</b>', repr(soup))
1486 1582
1487class TestNavigableStringSubclasses(SoupTest): 1583class TestNavigableStringSubclasses(SoupTest):
1488 1584
@@ -1522,6 +1618,9 @@ class TestNavigableStringSubclasses(SoupTest):
1522 soup.insert(1, doctype) 1618 soup.insert(1, doctype)
1523 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") 1619 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
1524 1620
1621 def test_declaration(self):
1622 d = Declaration("foo")
1623 self.assertEqual("<?foo?>", d.output_ready())
1525 1624
1526class TestSoupSelector(TreeTest): 1625class TestSoupSelector(TreeTest):
1527 1626
@@ -1534,7 +1633,7 @@ class TestSoupSelector(TreeTest):
1534<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> 1633<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
1535</head> 1634</head>
1536<body> 1635<body>
1537 1636<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
1538<div id="main" class="fancy"> 1637<div id="main" class="fancy">
1539<div id="inner"> 1638<div id="inner">
1540<h1 id="header1">An H1</h1> 1639<h1 id="header1">An H1</h1>
@@ -1552,8 +1651,18 @@ class TestSoupSelector(TreeTest):
1552<a href="#" id="s2a1">span2a1</a> 1651<a href="#" id="s2a1">span2a1</a>
1553</span> 1652</span>
1554<span class="span3"></span> 1653<span class="span3"></span>
1654<custom-dashed-tag class="dashed" id="dash2"/>
1655<div data-tag="dashedvalue" id="data1"/>
1555</span> 1656</span>
1556</div> 1657</div>
1658<x id="xid">
1659<z id="zida"/>
1660<z id="zidab"/>
1661<z id="zidac"/>
1662</x>
1663<y id="yid">
1664<z id="zidb"/>
1665</y>
1557<p lang="en" id="lang-en">English</p> 1666<p lang="en" id="lang-en">English</p>
1558<p lang="en-gb" id="lang-en-gb">English UK</p> 1667<p lang="en-gb" id="lang-en-gb">English UK</p>
1559<p lang="en-us" id="lang-en-us">English US</p> 1668<p lang="en-us" id="lang-en-us">English US</p>
@@ -1565,7 +1674,7 @@ class TestSoupSelector(TreeTest):
1565""" 1674"""
1566 1675
1567 def setUp(self): 1676 def setUp(self):
1568 self.soup = BeautifulSoup(self.HTML) 1677 self.soup = BeautifulSoup(self.HTML, 'html.parser')
1569 1678
1570 def assertSelects(self, selector, expected_ids): 1679 def assertSelects(self, selector, expected_ids):
1571 el_ids = [el['id'] for el in self.soup.select(selector)] 1680 el_ids = [el['id'] for el in self.soup.select(selector)]
@@ -1587,21 +1696,29 @@ class TestSoupSelector(TreeTest):
1587 els = self.soup.select('title') 1696 els = self.soup.select('title')
1588 self.assertEqual(len(els), 1) 1697 self.assertEqual(len(els), 1)
1589 self.assertEqual(els[0].name, 'title') 1698 self.assertEqual(els[0].name, 'title')
1590 self.assertEqual(els[0].contents, [u'The title']) 1699 self.assertEqual(els[0].contents, ['The title'])
1591 1700
1592 def test_one_tag_many(self): 1701 def test_one_tag_many(self):
1593 els = self.soup.select('div') 1702 els = self.soup.select('div')
1594 self.assertEqual(len(els), 3) 1703 self.assertEqual(len(els), 4)
1595 for div in els: 1704 for div in els:
1596 self.assertEqual(div.name, 'div') 1705 self.assertEqual(div.name, 'div')
1597 1706
1707 el = self.soup.select_one('div')
1708 self.assertEqual('main', el['id'])
1709
1710 def test_select_one_returns_none_if_no_match(self):
1711 match = self.soup.select_one('nonexistenttag')
1712 self.assertEqual(None, match)
1713
1714
1598 def test_tag_in_tag_one(self): 1715 def test_tag_in_tag_one(self):
1599 els = self.soup.select('div div') 1716 els = self.soup.select('div div')
1600 self.assertSelects('div div', ['inner']) 1717 self.assertSelects('div div', ['inner', 'data1'])
1601 1718
1602 def test_tag_in_tag_many(self): 1719 def test_tag_in_tag_many(self):
1603 for selector in ('html div', 'html body div', 'body div'): 1720 for selector in ('html div', 'html body div', 'body div'):
1604 self.assertSelects(selector, ['main', 'inner', 'footer']) 1721 self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
1605 1722
1606 def test_tag_no_match(self): 1723 def test_tag_no_match(self):
1607 self.assertEqual(len(self.soup.select('del')), 0) 1724 self.assertEqual(len(self.soup.select('del')), 0)
@@ -1609,6 +1726,20 @@ class TestSoupSelector(TreeTest):
1609 def test_invalid_tag(self): 1726 def test_invalid_tag(self):
1610 self.assertRaises(ValueError, self.soup.select, 'tag%t') 1727 self.assertRaises(ValueError, self.soup.select, 'tag%t')
1611 1728
1729 def test_select_dashed_tag_ids(self):
1730 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
1731
1732 def test_select_dashed_by_id(self):
1733 dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
1734 self.assertEqual(dashed[0].name, 'custom-dashed-tag')
1735 self.assertEqual(dashed[0]['id'], 'dash2')
1736
1737 def test_dashed_tag_text(self):
1738 self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
1739
1740 def test_select_dashed_matches_find_all(self):
1741 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
1742
1612 def test_header_tags(self): 1743 def test_header_tags(self):
1613 self.assertSelectMultiple( 1744 self.assertSelectMultiple(
1614 ('h1', ['header1']), 1745 ('h1', ['header1']),
@@ -1709,6 +1840,7 @@ class TestSoupSelector(TreeTest):
1709 ('[id^="m"]', ['me', 'main']), 1840 ('[id^="m"]', ['me', 'main']),
1710 ('div[id^="m"]', ['main']), 1841 ('div[id^="m"]', ['main']),
1711 ('a[id^="m"]', ['me']), 1842 ('a[id^="m"]', ['me']),
1843 ('div[data-tag^="dashed"]', ['data1'])
1712 ) 1844 )
1713 1845
1714 def test_attribute_endswith(self): 1846 def test_attribute_endswith(self):
@@ -1716,8 +1848,8 @@ class TestSoupSelector(TreeTest):
1716 ('[href$=".css"]', ['l1']), 1848 ('[href$=".css"]', ['l1']),
1717 ('link[href$=".css"]', ['l1']), 1849 ('link[href$=".css"]', ['l1']),
1718 ('link[id$="1"]', ['l1']), 1850 ('link[id$="1"]', ['l1']),
1719 ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), 1851 ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
1720 ('div[id$="1"]', []), 1852 ('div[id$="1"]', ['data1']),
1721 ('[id$="noending"]', []), 1853 ('[id$="noending"]', []),
1722 ) 1854 )
1723 1855
@@ -1730,7 +1862,6 @@ class TestSoupSelector(TreeTest):
1730 ('[rel*="notstyle"]', []), 1862 ('[rel*="notstyle"]', []),
1731 ('link[rel*="notstyle"]', []), 1863 ('link[rel*="notstyle"]', []),
1732 ('link[href*="bla"]', ['l1']), 1864 ('link[href*="bla"]', ['l1']),
1733 ('a[href*="http://"]', ['bob', 'me']),
1734 ('[href*="http://"]', ['bob', 'me']), 1865 ('[href*="http://"]', ['bob', 'me']),
1735 ('[id*="p"]', ['pmulti', 'p1']), 1866 ('[id*="p"]', ['pmulti', 'p1']),
1736 ('div[id*="m"]', ['main']), 1867 ('div[id*="m"]', ['main']),
@@ -1739,8 +1870,8 @@ class TestSoupSelector(TreeTest):
1739 ('[href*=".css"]', ['l1']), 1870 ('[href*=".css"]', ['l1']),
1740 ('link[href*=".css"]', ['l1']), 1871 ('link[href*=".css"]', ['l1']),
1741 ('link[id*="1"]', ['l1']), 1872 ('link[id*="1"]', ['l1']),
1742 ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), 1873 ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
1743 ('div[id*="1"]', []), 1874 ('div[id*="1"]', ['data1']),
1744 ('[id*="noending"]', []), 1875 ('[id*="noending"]', []),
1745 # New for this test 1876 # New for this test
1746 ('[href*="."]', ['bob', 'me', 'l1']), 1877 ('[href*="."]', ['bob', 'me', 'l1']),
@@ -1748,6 +1879,7 @@ class TestSoupSelector(TreeTest):
1748 ('link[href*="."]', ['l1']), 1879 ('link[href*="."]', ['l1']),
1749 ('div[id*="n"]', ['main', 'inner']), 1880 ('div[id*="n"]', ['main', 'inner']),
1750 ('div[id*="nn"]', ['inner']), 1881 ('div[id*="nn"]', ['inner']),
1882 ('div[data-tag*="edval"]', ['data1'])
1751 ) 1883 )
1752 1884
1753 def test_attribute_exact_or_hypen(self): 1885 def test_attribute_exact_or_hypen(self):
@@ -1767,18 +1899,27 @@ class TestSoupSelector(TreeTest):
1767 ('p[class]', ['p1', 'pmulti']), 1899 ('p[class]', ['p1', 'pmulti']),
1768 ('[blah]', []), 1900 ('[blah]', []),
1769 ('p[blah]', []), 1901 ('p[blah]', []),
1902 ('div[data-tag]', ['data1'])
1770 ) 1903 )
1771 1904
1905 def test_unsupported_pseudoclass(self):
1906 self.assertRaises(
1907 NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
1908
1909 self.assertRaises(
1910 NotImplementedError, self.soup.select, "a:nth-of-type(a)")
1911
1912
1772 def test_nth_of_type(self): 1913 def test_nth_of_type(self):
1773 # Try to select first paragraph 1914 # Try to select first paragraph
1774 els = self.soup.select('div#inner p:nth-of-type(1)') 1915 els = self.soup.select('div#inner p:nth-of-type(1)')
1775 self.assertEqual(len(els), 1) 1916 self.assertEqual(len(els), 1)
1776 self.assertEqual(els[0].string, u'Some text') 1917 self.assertEqual(els[0].string, 'Some text')
1777 1918
1778 # Try to select third paragraph 1919 # Try to select third paragraph
1779 els = self.soup.select('div#inner p:nth-of-type(3)') 1920 els = self.soup.select('div#inner p:nth-of-type(3)')
1780 self.assertEqual(len(els), 1) 1921 self.assertEqual(len(els), 1)
1781 self.assertEqual(els[0].string, u'Another') 1922 self.assertEqual(els[0].string, 'Another')
1782 1923
1783 # Try to select (non-existent!) fourth paragraph 1924 # Try to select (non-existent!) fourth paragraph
1784 els = self.soup.select('div#inner p:nth-of-type(4)') 1925 els = self.soup.select('div#inner p:nth-of-type(4)')
@@ -1791,7 +1932,7 @@ class TestSoupSelector(TreeTest):
1791 def test_nth_of_type_direct_descendant(self): 1932 def test_nth_of_type_direct_descendant(self):
1792 els = self.soup.select('div#inner > p:nth-of-type(1)') 1933 els = self.soup.select('div#inner > p:nth-of-type(1)')
1793 self.assertEqual(len(els), 1) 1934 self.assertEqual(len(els), 1)
1794 self.assertEqual(els[0].string, u'Some text') 1935 self.assertEqual(els[0].string, 'Some text')
1795 1936
1796 def test_id_child_selector_nth_of_type(self): 1937 def test_id_child_selector_nth_of_type(self):
1797 self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) 1938 self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@@ -1803,7 +1944,7 @@ class TestSoupSelector(TreeTest):
1803 selected = inner.select("div") 1944 selected = inner.select("div")
1804 # The <div id="inner"> tag was selected. The <div id="footer"> 1945 # The <div id="inner"> tag was selected. The <div id="footer">
1805 # tag was not. 1946 # tag was not.
1806 self.assertSelectsIDs(selected, ['inner']) 1947 self.assertSelectsIDs(selected, ['inner', 'data1'])
1807 1948
1808 def test_overspecified_child_id(self): 1949 def test_overspecified_child_id(self):
1809 self.assertSelects(".fancy #inner", ['inner']) 1950 self.assertSelects(".fancy #inner", ['inner'])
@@ -1827,3 +1968,44 @@ class TestSoupSelector(TreeTest):
1827 1968
1828 def test_sibling_combinator_wont_select_same_tag_twice(self): 1969 def test_sibling_combinator_wont_select_same_tag_twice(self):
1829 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) 1970 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
1971
1972 # Test the selector grouping operator (the comma)
1973 def test_multiple_select(self):
1974 self.assertSelects('x, y', ['xid', 'yid'])
1975
1976 def test_multiple_select_with_no_space(self):
1977 self.assertSelects('x,y', ['xid', 'yid'])
1978
1979 def test_multiple_select_with_more_space(self):
1980 self.assertSelects('x,    y', ['xid', 'yid'])
1981
1982 def test_multiple_select_duplicated(self):
1983 self.assertSelects('x, x', ['xid'])
1984
1985 def test_multiple_select_sibling(self):
1986 self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
1987
1988 def test_multiple_select_tag_and_direct_descendant(self):
1989 self.assertSelects('x, y > z', ['xid', 'zidb'])
1990
1991 def test_multiple_select_direct_descendant_and_tags(self):
1992 self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
1993
1994 def test_multiple_select_indirect_descendant(self):
1995 self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
1996
1997 def test_invalid_multiple_select(self):
1998 self.assertRaises(ValueError, self.soup.select, ',x, y')
1999 self.assertRaises(ValueError, self.soup.select, 'x,,y')
2000
2001 def test_multiple_select_attrs(self):
2002 self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
2003
2004 def test_multiple_select_ids(self):
2005 self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
2006
2007 def test_multiple_select_nested(self):
2008 self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
2009
2010
2011