diff options
author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-05-06 09:06:51 +0100 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-06-02 08:24:02 +0100 |
commit | 822eabf32dd69346071bd25fc3639db252d2f346 (patch) | |
tree | edac6d1d0d5114a4e3c72fea5589c069453b72d2 /bitbake/lib/bs4 | |
parent | 4f8959324df3b89487973bd4e8de21debb0a12ef (diff) | |
download | poky-822eabf32dd69346071bd25fc3639db252d2f346.tar.gz |
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1 which has been run through 2to3 as per the maintainer's
recommendation for Python 3 use.
(Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4')
-rw-r--r-- | bitbake/lib/bs4/__init__.py | 112 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/__init__.py | 7 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_html5lib.py | 71 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_htmlparser.py | 56 | ||||
-rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 47 | ||||
-rw-r--r-- | bitbake/lib/bs4/dammit.py | 31 | ||||
-rw-r--r-- | bitbake/lib/bs4/diagnose.py | 68 | ||||
-rw-r--r-- | bitbake/lib/bs4/element.py | 346 | ||||
-rw-r--r-- | bitbake/lib/bs4/testing.py | 129 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_builder_registry.py | 14 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_html5lib.py | 19 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_htmlparser.py | 13 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_lxml.py | 19 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_soup.py | 107 | ||||
-rw-r--r-- | bitbake/lib/bs4/tests/test_tree.py | 294 |
15 files changed, 972 insertions, 361 deletions
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py index 7ba34269af..f6fdfd50b1 100644 --- a/bitbake/lib/bs4/__init__.py +++ b/bitbake/lib/bs4/__init__.py | |||
@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ | |||
17 | """ | 17 | """ |
18 | 18 | ||
19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" | 19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" |
20 | __version__ = "4.3.2" | 20 | __version__ = "4.4.1" |
21 | __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" | 21 | __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" |
22 | __license__ = "MIT" | 22 | __license__ = "MIT" |
23 | 23 | ||
24 | __all__ = ['BeautifulSoup'] | 24 | __all__ = ['BeautifulSoup'] |
@@ -45,7 +45,7 @@ from .element import ( | |||
45 | 45 | ||
46 | # The very first thing we do is give a useful error if someone is | 46 | # The very first thing we do is give a useful error if someone is |
47 | # running this code under Python 3 without converting it. | 47 | # running this code under Python 3 without converting it. |
48 | syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' | 48 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' |
49 | 49 | ||
50 | class BeautifulSoup(Tag): | 50 | class BeautifulSoup(Tag): |
51 | """ | 51 | """ |
@@ -69,7 +69,7 @@ class BeautifulSoup(Tag): | |||
69 | like HTML's <br> tag), call handle_starttag and then | 69 | like HTML's <br> tag), call handle_starttag and then |
70 | handle_endtag. | 70 | handle_endtag. |
71 | """ | 71 | """ |
72 | ROOT_TAG_NAME = u'[document]' | 72 | ROOT_TAG_NAME = '[document]' |
73 | 73 | ||
74 | # If the end-user gives no indication which tree builder they | 74 | # If the end-user gives no indication which tree builder they |
75 | # want, look for one with these features. | 75 | # want, look for one with these features. |
@@ -77,8 +77,11 @@ class BeautifulSoup(Tag): | |||
77 | 77 | ||
78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' | 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' |
79 | 79 | ||
80 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" | ||
81 | |||
80 | def __init__(self, markup="", features=None, builder=None, | 82 | def __init__(self, markup="", features=None, builder=None, |
81 | parse_only=None, from_encoding=None, **kwargs): | 83 | parse_only=None, from_encoding=None, exclude_encodings=None, |
84 | **kwargs): | ||
82 | """The Soup object is initialized as the 'root tag', and the | 85 | """The Soup object is initialized as the 'root tag', and the |
83 | provided markup (which can be a string or a file-like object) | 86 | provided markup (which can be a string or a file-like object) |
84 | is fed into the underlying parser.""" | 87 | is fed into the underlying parser.""" |
@@ -114,9 +117,9 @@ class BeautifulSoup(Tag): | |||
114 | del kwargs['isHTML'] | 117 | del kwargs['isHTML'] |
115 | warnings.warn( | 118 | warnings.warn( |
116 | "BS4 does not respect the isHTML argument to the " | 119 | "BS4 does not respect the isHTML argument to the " |
117 | "BeautifulSoup constructor. You can pass in features='html' " | 120 | "BeautifulSoup constructor. Suggest you use " |
118 | "or features='xml' to get a builder capable of handling " | 121 | "features='lxml' for HTML and features='lxml-xml' for " |
119 | "one or the other.") | 122 | "XML.") |
120 | 123 | ||
121 | def deprecated_argument(old_name, new_name): | 124 | def deprecated_argument(old_name, new_name): |
122 | if old_name in kwargs: | 125 | if old_name in kwargs: |
@@ -135,12 +138,13 @@ class BeautifulSoup(Tag): | |||
135 | "fromEncoding", "from_encoding") | 138 | "fromEncoding", "from_encoding") |
136 | 139 | ||
137 | if len(kwargs) > 0: | 140 | if len(kwargs) > 0: |
138 | arg = kwargs.keys().pop() | 141 | arg = list(kwargs.keys()).pop() |
139 | raise TypeError( | 142 | raise TypeError( |
140 | "__init__() got an unexpected keyword argument '%s'" % arg) | 143 | "__init__() got an unexpected keyword argument '%s'" % arg) |
141 | 144 | ||
142 | if builder is None: | 145 | if builder is None: |
143 | if isinstance(features, basestring): | 146 | original_features = features |
147 | if isinstance(features, str): | ||
144 | features = [features] | 148 | features = [features] |
145 | if features is None or len(features) == 0: | 149 | if features is None or len(features) == 0: |
146 | features = self.DEFAULT_BUILDER_FEATURES | 150 | features = self.DEFAULT_BUILDER_FEATURES |
@@ -151,6 +155,16 @@ class BeautifulSoup(Tag): | |||
151 | "requested: %s. Do you need to install a parser library?" | 155 | "requested: %s. Do you need to install a parser library?" |
152 | % ",".join(features)) | 156 | % ",".join(features)) |
153 | builder = builder_class() | 157 | builder = builder_class() |
158 | if not (original_features == builder.NAME or | ||
159 | original_features in builder.ALTERNATE_NAMES): | ||
160 | if builder.is_xml: | ||
161 | markup_type = "XML" | ||
162 | else: | ||
163 | markup_type = "HTML" | ||
164 | warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( | ||
165 | parser=builder.NAME, | ||
166 | markup_type=markup_type)) | ||
167 | |||
154 | self.builder = builder | 168 | self.builder = builder |
155 | self.is_xml = builder.is_xml | 169 | self.is_xml = builder.is_xml |
156 | self.builder.soup = self | 170 | self.builder.soup = self |
@@ -164,7 +178,7 @@ class BeautifulSoup(Tag): | |||
164 | # involving passing non-markup to Beautiful Soup. | 178 | # involving passing non-markup to Beautiful Soup. |
165 | # Beautiful Soup will still parse the input as markup, | 179 | # Beautiful Soup will still parse the input as markup, |
166 | # just in case that's what the user really wants. | 180 | # just in case that's what the user really wants. |
167 | if (isinstance(markup, unicode) | 181 | if (isinstance(markup, str) |
168 | and not os.path.supports_unicode_filenames): | 182 | and not os.path.supports_unicode_filenames): |
169 | possible_filename = markup.encode("utf8") | 183 | possible_filename = markup.encode("utf8") |
170 | else: | 184 | else: |
@@ -172,25 +186,30 @@ class BeautifulSoup(Tag): | |||
172 | is_file = False | 186 | is_file = False |
173 | try: | 187 | try: |
174 | is_file = os.path.exists(possible_filename) | 188 | is_file = os.path.exists(possible_filename) |
175 | except Exception, e: | 189 | except Exception as e: |
176 | # This is almost certainly a problem involving | 190 | # This is almost certainly a problem involving |
177 | # characters not valid in filenames on this | 191 | # characters not valid in filenames on this |
178 | # system. Just let it go. | 192 | # system. Just let it go. |
179 | pass | 193 | pass |
180 | if is_file: | 194 | if is_file: |
195 | if isinstance(markup, str): | ||
196 | markup = markup.encode("utf8") | ||
181 | warnings.warn( | 197 | warnings.warn( |
182 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) | 198 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) |
183 | if markup[:5] == "http:" or markup[:6] == "https:": | 199 | if markup[:5] == "http:" or markup[:6] == "https:": |
184 | # TODO: This is ugly but I couldn't get it to work in | 200 | # TODO: This is ugly but I couldn't get it to work in |
185 | # Python 3 otherwise. | 201 | # Python 3 otherwise. |
186 | if ((isinstance(markup, bytes) and not b' ' in markup) | 202 | if ((isinstance(markup, bytes) and not b' ' in markup) |
187 | or (isinstance(markup, unicode) and not u' ' in markup)): | 203 | or (isinstance(markup, str) and not ' ' in markup)): |
204 | if isinstance(markup, str): | ||
205 | markup = markup.encode("utf8") | ||
188 | warnings.warn( | 206 | warnings.warn( |
189 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) | 207 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) |
190 | 208 | ||
191 | for (self.markup, self.original_encoding, self.declared_html_encoding, | 209 | for (self.markup, self.original_encoding, self.declared_html_encoding, |
192 | self.contains_replacement_characters) in ( | 210 | self.contains_replacement_characters) in ( |
193 | self.builder.prepare_markup(markup, from_encoding)): | 211 | self.builder.prepare_markup( |
212 | markup, from_encoding, exclude_encodings=exclude_encodings)): | ||
194 | self.reset() | 213 | self.reset() |
195 | try: | 214 | try: |
196 | self._feed() | 215 | self._feed() |
@@ -203,6 +222,16 @@ class BeautifulSoup(Tag): | |||
203 | self.markup = None | 222 | self.markup = None |
204 | self.builder.soup = None | 223 | self.builder.soup = None |
205 | 224 | ||
225 | def __copy__(self): | ||
226 | return type(self)(self.encode(), builder=self.builder) | ||
227 | |||
228 | def __getstate__(self): | ||
229 | # Frequently a tree builder can't be pickled. | ||
230 | d = dict(self.__dict__) | ||
231 | if 'builder' in d and not self.builder.picklable: | ||
232 | del d['builder'] | ||
233 | return d | ||
234 | |||
206 | def _feed(self): | 235 | def _feed(self): |
207 | # Convert the document to Unicode. | 236 | # Convert the document to Unicode. |
208 | self.builder.reset() | 237 | self.builder.reset() |
@@ -229,9 +258,7 @@ class BeautifulSoup(Tag): | |||
229 | 258 | ||
230 | def new_string(self, s, subclass=NavigableString): | 259 | def new_string(self, s, subclass=NavigableString): |
231 | """Create a new NavigableString associated with this soup.""" | 260 | """Create a new NavigableString associated with this soup.""" |
232 | navigable = subclass(s) | 261 | return subclass(s) |
233 | navigable.setup() | ||
234 | return navigable | ||
235 | 262 | ||
236 | def insert_before(self, successor): | 263 | def insert_before(self, successor): |
237 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") | 264 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") |
@@ -259,7 +286,7 @@ class BeautifulSoup(Tag): | |||
259 | 286 | ||
260 | def endData(self, containerClass=NavigableString): | 287 | def endData(self, containerClass=NavigableString): |
261 | if self.current_data: | 288 | if self.current_data: |
262 | current_data = u''.join(self.current_data) | 289 | current_data = ''.join(self.current_data) |
263 | # If whitespace is not preserved, and this string contains | 290 | # If whitespace is not preserved, and this string contains |
264 | # nothing but ASCII spaces, replace it with a single space | 291 | # nothing but ASCII spaces, replace it with a single space |
265 | # or newline. | 292 | # or newline. |
@@ -290,14 +317,49 @@ class BeautifulSoup(Tag): | |||
290 | def object_was_parsed(self, o, parent=None, most_recent_element=None): | 317 | def object_was_parsed(self, o, parent=None, most_recent_element=None): |
291 | """Add an object to the parse tree.""" | 318 | """Add an object to the parse tree.""" |
292 | parent = parent or self.currentTag | 319 | parent = parent or self.currentTag |
293 | most_recent_element = most_recent_element or self._most_recent_element | 320 | previous_element = most_recent_element or self._most_recent_element |
294 | o.setup(parent, most_recent_element) | 321 | |
322 | next_element = previous_sibling = next_sibling = None | ||
323 | if isinstance(o, Tag): | ||
324 | next_element = o.next_element | ||
325 | next_sibling = o.next_sibling | ||
326 | previous_sibling = o.previous_sibling | ||
327 | if not previous_element: | ||
328 | previous_element = o.previous_element | ||
329 | |||
330 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) | ||
295 | 331 | ||
296 | if most_recent_element is not None: | ||
297 | most_recent_element.next_element = o | ||
298 | self._most_recent_element = o | 332 | self._most_recent_element = o |
299 | parent.contents.append(o) | 333 | parent.contents.append(o) |
300 | 334 | ||
335 | if parent.next_sibling: | ||
336 | # This node is being inserted into an element that has | ||
337 | # already been parsed. Deal with any dangling references. | ||
338 | index = parent.contents.index(o) | ||
339 | if index == 0: | ||
340 | previous_element = parent | ||
341 | previous_sibling = None | ||
342 | else: | ||
343 | previous_element = previous_sibling = parent.contents[index-1] | ||
344 | if index == len(parent.contents)-1: | ||
345 | next_element = parent.next_sibling | ||
346 | next_sibling = None | ||
347 | else: | ||
348 | next_element = next_sibling = parent.contents[index+1] | ||
349 | |||
350 | o.previous_element = previous_element | ||
351 | if previous_element: | ||
352 | previous_element.next_element = o | ||
353 | o.next_element = next_element | ||
354 | if next_element: | ||
355 | next_element.previous_element = o | ||
356 | o.next_sibling = next_sibling | ||
357 | if next_sibling: | ||
358 | next_sibling.previous_sibling = o | ||
359 | o.previous_sibling = previous_sibling | ||
360 | if previous_sibling: | ||
361 | previous_sibling.next_sibling = o | ||
362 | |||
301 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): | 363 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): |
302 | """Pops the tag stack up to and including the most recent | 364 | """Pops the tag stack up to and including the most recent |
303 | instance of the given tag. If inclusivePop is false, pops the tag | 365 | instance of the given tag. If inclusivePop is false, pops the tag |
@@ -367,9 +429,9 @@ class BeautifulSoup(Tag): | |||
367 | encoding_part = '' | 429 | encoding_part = '' |
368 | if eventual_encoding != None: | 430 | if eventual_encoding != None: |
369 | encoding_part = ' encoding="%s"' % eventual_encoding | 431 | encoding_part = ' encoding="%s"' % eventual_encoding |
370 | prefix = u'<?xml version="1.0"%s?>\n' % encoding_part | 432 | prefix = '<?xml version="1.0"%s?>\n' % encoding_part |
371 | else: | 433 | else: |
372 | prefix = u'' | 434 | prefix = '' |
373 | if not pretty_print: | 435 | if not pretty_print: |
374 | indent_level = None | 436 | indent_level = None |
375 | else: | 437 | else: |
@@ -403,4 +465,4 @@ class FeatureNotFound(ValueError): | |||
403 | if __name__ == '__main__': | 465 | if __name__ == '__main__': |
404 | import sys | 466 | import sys |
405 | soup = BeautifulSoup(sys.stdin) | 467 | soup = BeautifulSoup(sys.stdin) |
406 | print soup.prettify() | 468 | print(soup.prettify()) |
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py index 740f5f29cd..6ccd4d23d6 100644 --- a/bitbake/lib/bs4/builder/__init__.py +++ b/bitbake/lib/bs4/builder/__init__.py | |||
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry() | |||
80 | class TreeBuilder(object): | 80 | class TreeBuilder(object): |
81 | """Turn a document into a Beautiful Soup object tree.""" | 81 | """Turn a document into a Beautiful Soup object tree.""" |
82 | 82 | ||
83 | NAME = "[Unknown tree builder]" | ||
84 | ALTERNATE_NAMES = [] | ||
83 | features = [] | 85 | features = [] |
84 | 86 | ||
85 | is_xml = False | 87 | is_xml = False |
88 | picklable = False | ||
86 | preserve_whitespace_tags = set() | 89 | preserve_whitespace_tags = set() |
87 | empty_element_tags = None # A tag will be considered an empty-element | 90 | empty_element_tags = None # A tag will be considered an empty-element |
88 | # tag when and only when it has no contents. | 91 | # tag when and only when it has no contents. |
@@ -153,13 +156,13 @@ class TreeBuilder(object): | |||
153 | universal = self.cdata_list_attributes.get('*', []) | 156 | universal = self.cdata_list_attributes.get('*', []) |
154 | tag_specific = self.cdata_list_attributes.get( | 157 | tag_specific = self.cdata_list_attributes.get( |
155 | tag_name.lower(), None) | 158 | tag_name.lower(), None) |
156 | for attr in attrs.keys(): | 159 | for attr in list(attrs.keys()): |
157 | if attr in universal or (tag_specific and attr in tag_specific): | 160 | if attr in universal or (tag_specific and attr in tag_specific): |
158 | # We have a "class"-type attribute whose string | 161 | # We have a "class"-type attribute whose string |
159 | # value is a whitespace-separated list of | 162 | # value is a whitespace-separated list of |
160 | # values. Split it into a list. | 163 | # values. Split it into a list. |
161 | value = attrs[attr] | 164 | value = attrs[attr] |
162 | if isinstance(value, basestring): | 165 | if isinstance(value, str): |
163 | values = whitespace_re.split(value) | 166 | values = whitespace_re.split(value) |
164 | else: | 167 | else: |
165 | # html5lib sometimes calls setAttributes twice | 168 | # html5lib sometimes calls setAttributes twice |
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py index 7de36ae75e..f0e5924ebb 100644 --- a/bitbake/lib/bs4/builder/_html5lib.py +++ b/bitbake/lib/bs4/builder/_html5lib.py | |||
@@ -2,6 +2,7 @@ __all__ = [ | |||
2 | 'HTML5TreeBuilder', | 2 | 'HTML5TreeBuilder', |
3 | ] | 3 | ] |
4 | 4 | ||
5 | from pdb import set_trace | ||
5 | import warnings | 6 | import warnings |
6 | from bs4.builder import ( | 7 | from bs4.builder import ( |
7 | PERMISSIVE, | 8 | PERMISSIVE, |
@@ -9,7 +10,10 @@ from bs4.builder import ( | |||
9 | HTML_5, | 10 | HTML_5, |
10 | HTMLTreeBuilder, | 11 | HTMLTreeBuilder, |
11 | ) | 12 | ) |
12 | from bs4.element import NamespacedAttribute | 13 | from bs4.element import ( |
14 | NamespacedAttribute, | ||
15 | whitespace_re, | ||
16 | ) | ||
13 | import html5lib | 17 | import html5lib |
14 | from html5lib.constants import namespaces | 18 | from html5lib.constants import namespaces |
15 | from bs4.element import ( | 19 | from bs4.element import ( |
@@ -22,11 +26,20 @@ from bs4.element import ( | |||
22 | class HTML5TreeBuilder(HTMLTreeBuilder): | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): |
23 | """Use html5lib to build a tree.""" | 27 | """Use html5lib to build a tree.""" |
24 | 28 | ||
25 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] | 29 | NAME = "html5lib" |
30 | |||
31 | features = [NAME, PERMISSIVE, HTML_5, HTML] | ||
26 | 32 | ||
27 | def prepare_markup(self, markup, user_specified_encoding): | 33 | def prepare_markup(self, markup, user_specified_encoding, |
34 | document_declared_encoding=None, exclude_encodings=None): | ||
28 | # Store the user-specified encoding for use later on. | 35 | # Store the user-specified encoding for use later on. |
29 | self.user_specified_encoding = user_specified_encoding | 36 | self.user_specified_encoding = user_specified_encoding |
37 | |||
38 | # document_declared_encoding and exclude_encodings aren't used | ||
39 | # ATM because the html5lib TreeBuilder doesn't use | ||
40 | # UnicodeDammit. | ||
41 | if exclude_encodings: | ||
42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") | ||
30 | yield (markup, None, None, False) | 43 | yield (markup, None, None, False) |
31 | 44 | ||
32 | # These methods are defined by Beautiful Soup. | 45 | # These methods are defined by Beautiful Soup. |
@@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): | |||
37 | doc = parser.parse(markup, encoding=self.user_specified_encoding) | 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) |
38 | 51 | ||
39 | # Set the character encoding detected by the tokenizer. | 52 | # Set the character encoding detected by the tokenizer. |
40 | if isinstance(markup, unicode): | 53 | if isinstance(markup, str): |
41 | # We need to special-case this because html5lib sets | 54 | # We need to special-case this because html5lib sets |
42 | # charEncoding to UTF-8 if it gets Unicode input. | 55 | # charEncoding to UTF-8 if it gets Unicode input. |
43 | doc.original_encoding = None | 56 | doc.original_encoding = None |
@@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): | |||
51 | 64 | ||
52 | def test_fragment_to_document(self, fragment): | 65 | def test_fragment_to_document(self, fragment): |
53 | """See `TreeBuilder`.""" | 66 | """See `TreeBuilder`.""" |
54 | return u'<html><head></head><body>%s</body></html>' % fragment | 67 | return '<html><head></head><body>%s</body></html>' % fragment |
55 | 68 | ||
56 | 69 | ||
57 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): |
@@ -101,7 +114,16 @@ class AttrList(object): | |||
101 | def __iter__(self): | 114 | def __iter__(self): |
102 | return list(self.attrs.items()).__iter__() | 115 | return list(self.attrs.items()).__iter__() |
103 | def __setitem__(self, name, value): | 116 | def __setitem__(self, name, value): |
104 | "set attr", name, value | 117 | # If this attribute is a multi-valued attribute for this element, |
118 | # turn its value into a list. | ||
119 | list_attr = HTML5TreeBuilder.cdata_list_attributes | ||
120 | if (name in list_attr['*'] | ||
121 | or (self.element.name in list_attr | ||
122 | and name in list_attr[self.element.name])): | ||
123 | # A node that is being cloned may have already undergone | ||
124 | # this procedure. | ||
125 | if not isinstance(value, list): | ||
126 | value = whitespace_re.split(value) | ||
105 | self.element[name] = value | 127 | self.element[name] = value |
106 | def items(self): | 128 | def items(self): |
107 | return list(self.attrs.items()) | 129 | return list(self.attrs.items()) |
@@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
124 | 146 | ||
125 | def appendChild(self, node): | 147 | def appendChild(self, node): |
126 | string_child = child = None | 148 | string_child = child = None |
127 | if isinstance(node, basestring): | 149 | if isinstance(node, str): |
128 | # Some other piece of code decided to pass in a string | 150 | # Some other piece of code decided to pass in a string |
129 | # instead of creating a TextElement object to contain the | 151 | # instead of creating a TextElement object to contain the |
130 | # string. | 152 | # string. |
@@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
139 | else: | 161 | else: |
140 | child = node.element | 162 | child = node.element |
141 | 163 | ||
142 | if not isinstance(child, basestring) and child.parent is not None: | 164 | if not isinstance(child, str) and child.parent is not None: |
143 | node.element.extract() | 165 | node.element.extract() |
144 | 166 | ||
145 | if (string_child and self.element.contents | 167 | if (string_child and self.element.contents |
@@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
152 | old_element.replace_with(new_element) | 174 | old_element.replace_with(new_element) |
153 | self.soup._most_recent_element = new_element | 175 | self.soup._most_recent_element = new_element |
154 | else: | 176 | else: |
155 | if isinstance(node, basestring): | 177 | if isinstance(node, str): |
156 | # Create a brand new NavigableString from this string. | 178 | # Create a brand new NavigableString from this string. |
157 | child = self.soup.new_string(node) | 179 | child = self.soup.new_string(node) |
158 | 180 | ||
@@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node): | |||
161 | # immediately after the parent, if it has no children.) | 183 | # immediately after the parent, if it has no children.) |
162 | if self.element.contents: | 184 | if self.element.contents: |
163 | most_recent_element = self.element._last_descendant(False) | 185 | most_recent_element = self.element._last_descendant(False) |
186 | elif self.element.next_element is not None: | ||
187 | # Something from further ahead in the parse tree is | ||
188 | # being inserted into this earlier element. This is | ||
189 | # very annoying because it means an expensive search | ||
190 | # for the last element in the tree. | ||
191 | most_recent_element = self.soup._last_descendant() | ||
164 | else: | 192 | else: |
165 | most_recent_element = self.element | 193 | most_recent_element = self.element |
166 | 194 | ||
@@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
172 | return AttrList(self.element) | 200 | return AttrList(self.element) |
173 | 201 | ||
174 | def setAttributes(self, attributes): | 202 | def setAttributes(self, attributes): |
203 | |||
175 | if attributes is not None and len(attributes) > 0: | 204 | if attributes is not None and len(attributes) > 0: |
176 | 205 | ||
177 | converted_attributes = [] | 206 | converted_attributes = [] |
@@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
183 | 212 | ||
184 | self.soup.builder._replace_cdata_list_attribute_values( | 213 | self.soup.builder._replace_cdata_list_attribute_values( |
185 | self.name, attributes) | 214 | self.name, attributes) |
186 | for name, value in attributes.items(): | 215 | for name, value in list(attributes.items()): |
187 | self.element[name] = value | 216 | self.element[name] = value |
188 | 217 | ||
189 | # The attributes may contain variables that need substitution. | 218 | # The attributes may contain variables that need substitution. |
@@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node): | |||
218 | 247 | ||
219 | def reparentChildren(self, new_parent): | 248 | def reparentChildren(self, new_parent): |
220 | """Move all of this tag's children into another tag.""" | 249 | """Move all of this tag's children into another tag.""" |
250 | # print "MOVE", self.element.contents | ||
251 | # print "FROM", self.element | ||
252 | # print "TO", new_parent.element | ||
221 | element = self.element | 253 | element = self.element |
222 | new_parent_element = new_parent.element | 254 | new_parent_element = new_parent.element |
223 | # Determine what this tag's next_element will be once all the children | 255 | # Determine what this tag's next_element will be once all the children |
@@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node): | |||
236 | new_parents_last_descendant_next_element = new_parent_element.next_element | 268 | new_parents_last_descendant_next_element = new_parent_element.next_element |
237 | 269 | ||
238 | to_append = element.contents | 270 | to_append = element.contents |
239 | append_after = new_parent.element.contents | 271 | append_after = new_parent_element.contents |
240 | if len(to_append) > 0: | 272 | if len(to_append) > 0: |
241 | # Set the first child's previous_element and previous_sibling | 273 | # Set the first child's previous_element and previous_sibling |
242 | # to elements within the new parent | 274 | # to elements within the new parent |
243 | first_child = to_append[0] | 275 | first_child = to_append[0] |
244 | first_child.previous_element = new_parents_last_descendant | 276 | if new_parents_last_descendant: |
277 | first_child.previous_element = new_parents_last_descendant | ||
278 | else: | ||
279 | first_child.previous_element = new_parent_element | ||
245 | first_child.previous_sibling = new_parents_last_child | 280 | first_child.previous_sibling = new_parents_last_child |
281 | if new_parents_last_descendant: | ||
282 | new_parents_last_descendant.next_element = first_child | ||
283 | else: | ||
284 | new_parent_element.next_element = first_child | ||
285 | if new_parents_last_child: | ||
286 | new_parents_last_child.next_sibling = first_child | ||
246 | 287 | ||
247 | # Fix the last child's next_element and next_sibling | 288 | # Fix the last child's next_element and next_sibling |
248 | last_child = to_append[-1] | 289 | last_child = to_append[-1] |
249 | last_child.next_element = new_parents_last_descendant_next_element | 290 | last_child.next_element = new_parents_last_descendant_next_element |
291 | if new_parents_last_descendant_next_element: | ||
292 | new_parents_last_descendant_next_element.previous_element = last_child | ||
250 | last_child.next_sibling = None | 293 | last_child.next_sibling = None |
251 | 294 | ||
252 | for child in to_append: | 295 | for child in to_append: |
@@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node): | |||
257 | element.contents = [] | 300 | element.contents = [] |
258 | element.next_element = final_next_element | 301 | element.next_element = final_next_element |
259 | 302 | ||
303 | # print "DONE WITH MOVE" | ||
304 | # print "FROM", self.element | ||
305 | # print "TO", new_parent_element | ||
306 | |||
260 | def cloneNode(self): | 307 | def cloneNode(self): |
261 | tag = self.soup.new_tag(self.element.name, self.namespace) | 308 | tag = self.soup.new_tag(self.element.name, self.namespace) |
262 | node = Element(tag, self.soup, self.namespace) | 309 | node = Element(tag, self.soup, self.namespace) |
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py index ca8d8b892b..bb0a63f2f3 100644 --- a/bitbake/lib/bs4/builder/_htmlparser.py +++ b/bitbake/lib/bs4/builder/_htmlparser.py | |||
@@ -4,10 +4,16 @@ __all__ = [ | |||
4 | 'HTMLParserTreeBuilder', | 4 | 'HTMLParserTreeBuilder', |
5 | ] | 5 | ] |
6 | 6 | ||
7 | from HTMLParser import ( | 7 | from html.parser import HTMLParser |
8 | HTMLParser, | 8 | |
9 | HTMLParseError, | 9 | try: |
10 | ) | 10 | from html.parser import HTMLParseError |
11 | except ImportError as e: | ||
12 | # HTMLParseError is removed in Python 3.5. Since it can never be | ||
13 | # thrown in 3.5, we can just define our own class as a placeholder. | ||
14 | class HTMLParseError(Exception): | ||
15 | pass | ||
16 | |||
11 | import sys | 17 | import sys |
12 | import warnings | 18 | import warnings |
13 | 19 | ||
@@ -19,10 +25,10 @@ import warnings | |||
19 | # At the end of this file, we monkeypatch HTMLParser so that | 25 | # At the end of this file, we monkeypatch HTMLParser so that |
20 | # strict=True works well on Python 3.2.2. | 26 | # strict=True works well on Python 3.2.2. |
21 | major, minor, release = sys.version_info[:3] | 27 | major, minor, release = sys.version_info[:3] |
22 | CONSTRUCTOR_TAKES_STRICT = ( | 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 |
23 | major > 3 | 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 |
24 | or (major == 3 and minor > 2) | 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 |
25 | or (major == 3 and minor == 2 and release >= 3)) | 31 | |
26 | 32 | ||
27 | from bs4.element import ( | 33 | from bs4.element import ( |
28 | CData, | 34 | CData, |
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
63 | 69 | ||
64 | def handle_charref(self, name): | 70 | def handle_charref(self, name): |
65 | # XXX workaround for a bug in HTMLParser. Remove this once | 71 | # XXX workaround for a bug in HTMLParser. Remove this once |
66 | # it's fixed. | 72 | # it's fixed in all supported versions. |
73 | # http://bugs.python.org/issue13633 | ||
67 | if name.startswith('x'): | 74 | if name.startswith('x'): |
68 | real_name = int(name.lstrip('x'), 16) | 75 | real_name = int(name.lstrip('x'), 16) |
69 | elif name.startswith('X'): | 76 | elif name.startswith('X'): |
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
72 | real_name = int(name) | 79 | real_name = int(name) |
73 | 80 | ||
74 | try: | 81 | try: |
75 | data = unichr(real_name) | 82 | data = chr(real_name) |
76 | except (ValueError, OverflowError), e: | 83 | except (ValueError, OverflowError) as e: |
77 | data = u"\N{REPLACEMENT CHARACTER}" | 84 | data = "\N{REPLACEMENT CHARACTER}" |
78 | 85 | ||
79 | self.handle_data(data) | 86 | self.handle_data(data) |
80 | 87 | ||
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
113 | 120 | ||
114 | def handle_pi(self, data): | 121 | def handle_pi(self, data): |
115 | self.soup.endData() | 122 | self.soup.endData() |
116 | if data.endswith("?") and data.lower().startswith("xml"): | ||
117 | # "An XHTML processing instruction using the trailing '?' | ||
118 | # will cause the '?' to be included in data." - HTMLParser | ||
119 | # docs. | ||
120 | # | ||
121 | # Strip the question mark so we don't end up with two | ||
122 | # question marks. | ||
123 | data = data[:-1] | ||
124 | self.soup.handle_data(data) | 123 | self.soup.handle_data(data) |
125 | self.soup.endData(ProcessingInstruction) | 124 | self.soup.endData(ProcessingInstruction) |
126 | 125 | ||
@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
128 | class HTMLParserTreeBuilder(HTMLTreeBuilder): | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): |
129 | 128 | ||
130 | is_xml = False | 129 | is_xml = False |
131 | features = [HTML, STRICT, HTMLPARSER] | 130 | picklable = True |
131 | NAME = HTMLPARSER | ||
132 | features = [NAME, HTML, STRICT] | ||
132 | 133 | ||
133 | def __init__(self, *args, **kwargs): | 134 | def __init__(self, *args, **kwargs): |
134 | if CONSTRUCTOR_TAKES_STRICT: | 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: |
135 | kwargs['strict'] = False | 136 | kwargs['strict'] = False |
137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: | ||
138 | kwargs['convert_charrefs'] = False | ||
136 | self.parser_args = (args, kwargs) | 139 | self.parser_args = (args, kwargs) |
137 | 140 | ||
138 | def prepare_markup(self, markup, user_specified_encoding=None, | 141 | def prepare_markup(self, markup, user_specified_encoding=None, |
139 | document_declared_encoding=None): | 142 | document_declared_encoding=None, exclude_encodings=None): |
140 | """ | 143 | """ |
141 | :return: A 4-tuple (markup, original encoding, encoding | 144 | :return: A 4-tuple (markup, original encoding, encoding |
142 | declared within markup, whether any characters had to be | 145 | declared within markup, whether any characters had to be |
143 | replaced with REPLACEMENT CHARACTER). | 146 | replaced with REPLACEMENT CHARACTER). |
144 | """ | 147 | """ |
145 | if isinstance(markup, unicode): | 148 | if isinstance(markup, str): |
146 | yield (markup, None, None, False) | 149 | yield (markup, None, None, False) |
147 | return | 150 | return |
148 | 151 | ||
149 | try_encodings = [user_specified_encoding, document_declared_encoding] | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] |
150 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) | 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, |
154 | exclude_encodings=exclude_encodings) | ||
151 | yield (dammit.markup, dammit.original_encoding, | 155 | yield (dammit.markup, dammit.original_encoding, |
152 | dammit.declared_html_encoding, | 156 | dammit.declared_html_encoding, |
153 | dammit.contains_replacement_characters) | 157 | dammit.contains_replacement_characters) |
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): | |||
158 | parser.soup = self.soup | 162 | parser.soup = self.soup |
159 | try: | 163 | try: |
160 | parser.feed(markup) | 164 | parser.feed(markup) |
161 | except HTMLParseError, e: | 165 | except HTMLParseError as e: |
162 | warnings.warn(RuntimeWarning( | 166 | warnings.warn(RuntimeWarning( |
163 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) | 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) |
164 | raise e | 168 | raise e |
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index fa5d49875e..9c6c14ee65 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
@@ -4,10 +4,15 @@ __all__ = [ | |||
4 | ] | 4 | ] |
5 | 5 | ||
6 | from io import BytesIO | 6 | from io import BytesIO |
7 | from StringIO import StringIO | 7 | from io import StringIO |
8 | import collections | 8 | import collections |
9 | from lxml import etree | 9 | from lxml import etree |
10 | from bs4.element import Comment, Doctype, NamespacedAttribute | 10 | from bs4.element import ( |
11 | Comment, | ||
12 | Doctype, | ||
13 | NamespacedAttribute, | ||
14 | ProcessingInstruction, | ||
15 | ) | ||
11 | from bs4.builder import ( | 16 | from bs4.builder import ( |
12 | FAST, | 17 | FAST, |
13 | HTML, | 18 | HTML, |
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
25 | 30 | ||
26 | is_xml = True | 31 | is_xml = True |
27 | 32 | ||
33 | NAME = "lxml-xml" | ||
34 | ALTERNATE_NAMES = ["xml"] | ||
35 | |||
28 | # Well, it's permissive by XML parser standards. | 36 | # Well, it's permissive by XML parser standards. |
29 | features = [LXML, XML, FAST, PERMISSIVE] | 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] |
30 | 38 | ||
31 | CHUNK_SIZE = 512 | 39 | CHUNK_SIZE = 512 |
32 | 40 | ||
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
70 | return (None, tag) | 78 | return (None, tag) |
71 | 79 | ||
72 | def prepare_markup(self, markup, user_specified_encoding=None, | 80 | def prepare_markup(self, markup, user_specified_encoding=None, |
81 | exclude_encodings=None, | ||
73 | document_declared_encoding=None): | 82 | document_declared_encoding=None): |
74 | """ | 83 | """ |
75 | :yield: A series of 4-tuples. | 84 | :yield: A series of 4-tuples. |
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
78 | 87 | ||
79 | Each 4-tuple represents a strategy for parsing the document. | 88 | Each 4-tuple represents a strategy for parsing the document. |
80 | """ | 89 | """ |
81 | if isinstance(markup, unicode): | 90 | if isinstance(markup, str): |
82 | # We were given Unicode. Maybe lxml can parse Unicode on | 91 | # We were given Unicode. Maybe lxml can parse Unicode on |
83 | # this system? | 92 | # this system? |
84 | yield markup, None, document_declared_encoding, False | 93 | yield markup, None, document_declared_encoding, False |
85 | 94 | ||
86 | if isinstance(markup, unicode): | 95 | if isinstance(markup, str): |
87 | # No, apparently not. Convert the Unicode to UTF-8 and | 96 | # No, apparently not. Convert the Unicode to UTF-8 and |
88 | # tell lxml to parse it as UTF-8. | 97 | # tell lxml to parse it as UTF-8. |
89 | yield (markup.encode("utf8"), "utf8", | 98 | yield (markup.encode("utf8"), "utf8", |
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
95 | # the document as each one in turn. | 104 | # the document as each one in turn. |
96 | is_html = not self.is_xml | 105 | is_html = not self.is_xml |
97 | try_encodings = [user_specified_encoding, document_declared_encoding] | 106 | try_encodings = [user_specified_encoding, document_declared_encoding] |
98 | detector = EncodingDetector(markup, try_encodings, is_html) | 107 | detector = EncodingDetector( |
108 | markup, try_encodings, is_html, exclude_encodings) | ||
99 | for encoding in detector.encodings: | 109 | for encoding in detector.encodings: |
100 | yield (detector.markup, encoding, document_declared_encoding, False) | 110 | yield (detector.markup, encoding, document_declared_encoding, False) |
101 | 111 | ||
102 | def feed(self, markup): | 112 | def feed(self, markup): |
103 | if isinstance(markup, bytes): | 113 | if isinstance(markup, bytes): |
104 | markup = BytesIO(markup) | 114 | markup = BytesIO(markup) |
105 | elif isinstance(markup, unicode): | 115 | elif isinstance(markup, str): |
106 | markup = StringIO(markup) | 116 | markup = StringIO(markup) |
107 | 117 | ||
108 | # Call feed() at least once, even if the markup is empty, | 118 | # Call feed() at least once, even if the markup is empty, |
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
117 | if len(data) != 0: | 127 | if len(data) != 0: |
118 | self.parser.feed(data) | 128 | self.parser.feed(data) |
119 | self.parser.close() | 129 | self.parser.close() |
120 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
121 | raise ParserRejectedMarkup(str(e)) | 131 | raise ParserRejectedMarkup(str(e)) |
122 | 132 | ||
123 | def close(self): | 133 | def close(self): |
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
135 | self.nsmaps.append(None) | 145 | self.nsmaps.append(None) |
136 | elif len(nsmap) > 0: | 146 | elif len(nsmap) > 0: |
137 | # A new namespace mapping has come into play. | 147 | # A new namespace mapping has come into play. |
138 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) | 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) |
139 | self.nsmaps.append(inverted_nsmap) | 149 | self.nsmaps.append(inverted_nsmap) |
140 | # Also treat the namespace mapping as a set of attributes on the | 150 | # Also treat the namespace mapping as a set of attributes on the |
141 | # tag, so we can recreate it later. | 151 | # tag, so we can recreate it later. |
142 | attrs = attrs.copy() | 152 | attrs = attrs.copy() |
143 | for prefix, namespace in nsmap.items(): | 153 | for prefix, namespace in list(nsmap.items()): |
144 | attribute = NamespacedAttribute( | 154 | attribute = NamespacedAttribute( |
145 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") | 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") |
146 | attrs[attribute] = namespace | 156 | attrs[attribute] = namespace |
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
149 | # from lxml with namespaces attached to their names, and | 159 | # from lxml with namespaces attached to their names, and |
150 | # turn then into NamespacedAttribute objects. | 160 | # turn then into NamespacedAttribute objects. |
151 | new_attrs = {} | 161 | new_attrs = {} |
152 | for attr, value in attrs.items(): | 162 | for attr, value in list(attrs.items()): |
153 | namespace, attr = self._getNsTag(attr) | 163 | namespace, attr = self._getNsTag(attr) |
154 | if namespace is None: | 164 | if namespace is None: |
155 | new_attrs[attr] = value | 165 | new_attrs[attr] = value |
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
189 | self.nsmaps.pop() | 199 | self.nsmaps.pop() |
190 | 200 | ||
191 | def pi(self, target, data): | 201 | def pi(self, target, data): |
192 | pass | 202 | self.soup.endData() |
203 | self.soup.handle_data(target + ' ' + data) | ||
204 | self.soup.endData(ProcessingInstruction) | ||
193 | 205 | ||
194 | def data(self, content): | 206 | def data(self, content): |
195 | self.soup.handle_data(content) | 207 | self.soup.handle_data(content) |
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
207 | 219 | ||
208 | def test_fragment_to_document(self, fragment): | 220 | def test_fragment_to_document(self, fragment): |
209 | """See `TreeBuilder`.""" | 221 | """See `TreeBuilder`.""" |
210 | return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment | 222 | return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment |
211 | 223 | ||
212 | 224 | ||
213 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): |
214 | 226 | ||
215 | features = [LXML, HTML, FAST, PERMISSIVE] | 227 | NAME = LXML |
228 | ALTERNATE_NAMES = ["lxml-html"] | ||
229 | |||
230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] | ||
216 | is_xml = False | 231 | is_xml = False |
217 | 232 | ||
218 | def default_parser(self, encoding): | 233 | def default_parser(self, encoding): |
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
224 | self.parser = self.parser_for(encoding) | 239 | self.parser = self.parser_for(encoding) |
225 | self.parser.feed(markup) | 240 | self.parser.feed(markup) |
226 | self.parser.close() | 241 | self.parser.close() |
227 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
228 | raise ParserRejectedMarkup(str(e)) | 243 | raise ParserRejectedMarkup(str(e)) |
229 | 244 | ||
230 | 245 | ||
231 | def test_fragment_to_document(self, fragment): | 246 | def test_fragment_to_document(self, fragment): |
232 | """See `TreeBuilder`.""" | 247 | """See `TreeBuilder`.""" |
233 | return u'<html><body>%s</body></html>' % fragment | 248 | return '<html><body>%s</body></html>' % fragment |
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py index 59640b7ce3..68d419feb5 100644 --- a/bitbake/lib/bs4/dammit.py +++ b/bitbake/lib/bs4/dammit.py | |||
@@ -3,12 +3,14 @@ | |||
3 | 3 | ||
4 | This library converts a bytestream to Unicode through any means | 4 | This library converts a bytestream to Unicode through any means |
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal |
6 | Feed Parser. It works best on XML and XML, but it does not rewrite the | 6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the |
7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. |
8 | """ | 8 | """ |
9 | __license__ = "MIT" | ||
9 | 10 | ||
11 | from pdb import set_trace | ||
10 | import codecs | 12 | import codecs |
11 | from htmlentitydefs import codepoint2name | 13 | from html.entities import codepoint2name |
12 | import re | 14 | import re |
13 | import logging | 15 | import logging |
14 | import string | 16 | import string |
@@ -56,7 +58,7 @@ class EntitySubstitution(object): | |||
56 | reverse_lookup = {} | 58 | reverse_lookup = {} |
57 | characters_for_re = [] | 59 | characters_for_re = [] |
58 | for codepoint, name in list(codepoint2name.items()): | 60 | for codepoint, name in list(codepoint2name.items()): |
59 | character = unichr(codepoint) | 61 | character = chr(codepoint) |
60 | if codepoint != 34: | 62 | if codepoint != 34: |
61 | # There's no point in turning the quotation mark into | 63 | # There's no point in turning the quotation mark into |
62 | # ", unless it happens within an attribute value, which | 64 | # ", unless it happens within an attribute value, which |
@@ -212,8 +214,11 @@ class EncodingDetector: | |||
212 | 214 | ||
213 | 5. Windows-1252. | 215 | 5. Windows-1252. |
214 | """ | 216 | """ |
215 | def __init__(self, markup, override_encodings=None, is_html=False): | 217 | def __init__(self, markup, override_encodings=None, is_html=False, |
218 | exclude_encodings=None): | ||
216 | self.override_encodings = override_encodings or [] | 219 | self.override_encodings = override_encodings or [] |
220 | exclude_encodings = exclude_encodings or [] | ||
221 | self.exclude_encodings = set([x.lower() for x in exclude_encodings]) | ||
217 | self.chardet_encoding = None | 222 | self.chardet_encoding = None |
218 | self.is_html = is_html | 223 | self.is_html = is_html |
219 | self.declared_encoding = None | 224 | self.declared_encoding = None |
@@ -224,6 +229,8 @@ class EncodingDetector: | |||
224 | def _usable(self, encoding, tried): | 229 | def _usable(self, encoding, tried): |
225 | if encoding is not None: | 230 | if encoding is not None: |
226 | encoding = encoding.lower() | 231 | encoding = encoding.lower() |
232 | if encoding in self.exclude_encodings: | ||
233 | return False | ||
227 | if encoding not in tried: | 234 | if encoding not in tried: |
228 | tried.add(encoding) | 235 | tried.add(encoding) |
229 | return True | 236 | return True |
@@ -266,6 +273,9 @@ class EncodingDetector: | |||
266 | def strip_byte_order_mark(cls, data): | 273 | def strip_byte_order_mark(cls, data): |
267 | """If a byte-order mark is present, strip it and return the encoding it implies.""" | 274 | """If a byte-order mark is present, strip it and return the encoding it implies.""" |
268 | encoding = None | 275 | encoding = None |
276 | if isinstance(data, str): | ||
277 | # Unicode data cannot have a byte-order mark. | ||
278 | return data, encoding | ||
269 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ | 279 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ |
270 | and (data[2:4] != '\x00\x00'): | 280 | and (data[2:4] != '\x00\x00'): |
271 | encoding = 'utf-16be' | 281 | encoding = 'utf-16be' |
@@ -306,7 +316,7 @@ class EncodingDetector: | |||
306 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) | 316 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) |
307 | if declared_encoding_match is not None: | 317 | if declared_encoding_match is not None: |
308 | declared_encoding = declared_encoding_match.groups()[0].decode( | 318 | declared_encoding = declared_encoding_match.groups()[0].decode( |
309 | 'ascii') | 319 | 'ascii', 'replace') |
310 | if declared_encoding: | 320 | if declared_encoding: |
311 | return declared_encoding.lower() | 321 | return declared_encoding.lower() |
312 | return None | 322 | return None |
@@ -331,18 +341,19 @@ class UnicodeDammit: | |||
331 | ] | 341 | ] |
332 | 342 | ||
333 | def __init__(self, markup, override_encodings=[], | 343 | def __init__(self, markup, override_encodings=[], |
334 | smart_quotes_to=None, is_html=False): | 344 | smart_quotes_to=None, is_html=False, exclude_encodings=[]): |
335 | self.smart_quotes_to = smart_quotes_to | 345 | self.smart_quotes_to = smart_quotes_to |
336 | self.tried_encodings = [] | 346 | self.tried_encodings = [] |
337 | self.contains_replacement_characters = False | 347 | self.contains_replacement_characters = False |
338 | self.is_html = is_html | 348 | self.is_html = is_html |
339 | 349 | ||
340 | self.detector = EncodingDetector(markup, override_encodings, is_html) | 350 | self.detector = EncodingDetector( |
351 | markup, override_encodings, is_html, exclude_encodings) | ||
341 | 352 | ||
342 | # Short-circuit if the data is in Unicode to begin with. | 353 | # Short-circuit if the data is in Unicode to begin with. |
343 | if isinstance(markup, unicode) or markup == '': | 354 | if isinstance(markup, str) or markup == '': |
344 | self.markup = markup | 355 | self.markup = markup |
345 | self.unicode_markup = unicode(markup) | 356 | self.unicode_markup = str(markup) |
346 | self.original_encoding = None | 357 | self.original_encoding = None |
347 | return | 358 | return |
348 | 359 | ||
@@ -425,7 +436,7 @@ class UnicodeDammit: | |||
425 | def _to_unicode(self, data, encoding, errors="strict"): | 436 | def _to_unicode(self, data, encoding, errors="strict"): |
426 | '''Given a string and its encoding, decodes the string into Unicode. | 437 | '''Given a string and its encoding, decodes the string into Unicode. |
427 | %encoding is a string recognized by encodings.aliases''' | 438 | %encoding is a string recognized by encodings.aliases''' |
428 | return unicode(data, encoding, errors) | 439 | return str(data, encoding, errors) |
429 | 440 | ||
430 | @property | 441 | @property |
431 | def declared_html_encoding(self): | 442 | def declared_html_encoding(self): |
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py index 4d0b00afad..083395fb46 100644 --- a/bitbake/lib/bs4/diagnose.py +++ b/bitbake/lib/bs4/diagnose.py | |||
@@ -1,7 +1,10 @@ | |||
1 | """Diagnostic functions, mainly for use when doing tech support.""" | 1 | """Diagnostic functions, mainly for use when doing tech support.""" |
2 | |||
3 | __license__ = "MIT" | ||
4 | |||
2 | import cProfile | 5 | import cProfile |
3 | from StringIO import StringIO | 6 | from io import StringIO |
4 | from HTMLParser import HTMLParser | 7 | from html.parser import HTMLParser |
5 | import bs4 | 8 | import bs4 |
6 | from bs4 import BeautifulSoup, __version__ | 9 | from bs4 import BeautifulSoup, __version__ |
7 | from bs4.builder import builder_registry | 10 | from bs4.builder import builder_registry |
@@ -17,8 +20,8 @@ import cProfile | |||
17 | 20 | ||
18 | def diagnose(data): | 21 | def diagnose(data): |
19 | """Diagnostic suite for isolating common problems.""" | 22 | """Diagnostic suite for isolating common problems.""" |
20 | print "Diagnostic running on Beautiful Soup %s" % __version__ | 23 | print("Diagnostic running on Beautiful Soup %s" % __version__) |
21 | print "Python version %s" % sys.version | 24 | print("Python version %s" % sys.version) |
22 | 25 | ||
23 | basic_parsers = ["html.parser", "html5lib", "lxml"] | 26 | basic_parsers = ["html.parser", "html5lib", "lxml"] |
24 | for name in basic_parsers: | 27 | for name in basic_parsers: |
@@ -27,44 +30,53 @@ def diagnose(data): | |||
27 | break | 30 | break |
28 | else: | 31 | else: |
29 | basic_parsers.remove(name) | 32 | basic_parsers.remove(name) |
30 | print ( | 33 | print(( |
31 | "I noticed that %s is not installed. Installing it may help." % | 34 | "I noticed that %s is not installed. Installing it may help." % |
32 | name) | 35 | name)) |
33 | 36 | ||
34 | if 'lxml' in basic_parsers: | 37 | if 'lxml' in basic_parsers: |
35 | basic_parsers.append(["lxml", "xml"]) | 38 | basic_parsers.append(["lxml", "xml"]) |
36 | from lxml import etree | 39 | try: |
37 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) | 40 | from lxml import etree |
41 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) | ||
42 | except ImportError as e: | ||
43 | print ( | ||
44 | "lxml is not installed or couldn't be imported.") | ||
45 | |||
38 | 46 | ||
39 | if 'html5lib' in basic_parsers: | 47 | if 'html5lib' in basic_parsers: |
40 | import html5lib | 48 | try: |
41 | print "Found html5lib version %s" % html5lib.__version__ | 49 | import html5lib |
50 | print("Found html5lib version %s" % html5lib.__version__) | ||
51 | except ImportError as e: | ||
52 | print ( | ||
53 | "html5lib is not installed or couldn't be imported.") | ||
42 | 54 | ||
43 | if hasattr(data, 'read'): | 55 | if hasattr(data, 'read'): |
44 | data = data.read() | 56 | data = data.read() |
45 | elif os.path.exists(data): | 57 | elif os.path.exists(data): |
46 | print '"%s" looks like a filename. Reading data from the file.' % data | 58 | print('"%s" looks like a filename. Reading data from the file.' % data) |
47 | data = open(data).read() | 59 | data = open(data).read() |
48 | elif data.startswith("http:") or data.startswith("https:"): | 60 | elif data.startswith("http:") or data.startswith("https:"): |
49 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data | 61 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) |
50 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." | 62 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") |
51 | return | 63 | return |
52 | 64 | print() | |
53 | 65 | ||
54 | for parser in basic_parsers: | 66 | for parser in basic_parsers: |
55 | print "Trying to parse your markup with %s" % parser | 67 | print("Trying to parse your markup with %s" % parser) |
56 | success = False | 68 | success = False |
57 | try: | 69 | try: |
58 | soup = BeautifulSoup(data, parser) | 70 | soup = BeautifulSoup(data, parser) |
59 | success = True | 71 | success = True |
60 | except Exception, e: | 72 | except Exception as e: |
61 | print "%s could not parse the markup." % parser | 73 | print("%s could not parse the markup." % parser) |
62 | traceback.print_exc() | 74 | traceback.print_exc() |
63 | if success: | 75 | if success: |
64 | print "Here's what %s did with the markup:" % parser | 76 | print("Here's what %s did with the markup:" % parser) |
65 | print soup.prettify() | 77 | print(soup.prettify()) |
66 | 78 | ||
67 | print "-" * 80 | 79 | print("-" * 80) |
68 | 80 | ||
69 | def lxml_trace(data, html=True, **kwargs): | 81 | def lxml_trace(data, html=True, **kwargs): |
70 | """Print out the lxml events that occur during parsing. | 82 | """Print out the lxml events that occur during parsing. |
@@ -74,7 +86,7 @@ def lxml_trace(data, html=True, **kwargs): | |||
74 | """ | 86 | """ |
75 | from lxml import etree | 87 | from lxml import etree |
76 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): | 88 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): |
77 | print("%s, %4s, %s" % (event, element.tag, element.text)) | 89 | print(("%s, %4s, %s" % (event, element.tag, element.text))) |
78 | 90 | ||
79 | class AnnouncingParser(HTMLParser): | 91 | class AnnouncingParser(HTMLParser): |
80 | """Announces HTMLParser parse events, without doing anything else.""" | 92 | """Announces HTMLParser parse events, without doing anything else.""" |
@@ -156,9 +168,9 @@ def rdoc(num_elements=1000): | |||
156 | 168 | ||
157 | def benchmark_parsers(num_elements=100000): | 169 | def benchmark_parsers(num_elements=100000): |
158 | """Very basic head-to-head performance benchmark.""" | 170 | """Very basic head-to-head performance benchmark.""" |
159 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ | 171 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) |
160 | data = rdoc(num_elements) | 172 | data = rdoc(num_elements) |
161 | print "Generated a large invalid HTML document (%d bytes)." % len(data) | 173 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) |
162 | 174 | ||
163 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: | 175 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: |
164 | success = False | 176 | success = False |
@@ -167,24 +179,24 @@ def benchmark_parsers(num_elements=100000): | |||
167 | soup = BeautifulSoup(data, parser) | 179 | soup = BeautifulSoup(data, parser) |
168 | b = time.time() | 180 | b = time.time() |
169 | success = True | 181 | success = True |
170 | except Exception, e: | 182 | except Exception as e: |
171 | print "%s could not parse the markup." % parser | 183 | print("%s could not parse the markup." % parser) |
172 | traceback.print_exc() | 184 | traceback.print_exc() |
173 | if success: | 185 | if success: |
174 | print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) | 186 | print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) |
175 | 187 | ||
176 | from lxml import etree | 188 | from lxml import etree |
177 | a = time.time() | 189 | a = time.time() |
178 | etree.HTML(data) | 190 | etree.HTML(data) |
179 | b = time.time() | 191 | b = time.time() |
180 | print "Raw lxml parsed the markup in %.2fs." % (b-a) | 192 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) |
181 | 193 | ||
182 | import html5lib | 194 | import html5lib |
183 | parser = html5lib.HTMLParser() | 195 | parser = html5lib.HTMLParser() |
184 | a = time.time() | 196 | a = time.time() |
185 | parser.parse(data) | 197 | parser.parse(data) |
186 | b = time.time() | 198 | b = time.time() |
187 | print "Raw html5lib parsed the markup in %.2fs." % (b-a) | 199 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) |
188 | 200 | ||
189 | def profile(num_elements=100000, parser="lxml"): | 201 | def profile(num_elements=100000, parser="lxml"): |
190 | 202 | ||
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py index da9afdf48e..0e62c2e100 100644 --- a/bitbake/lib/bs4/element.py +++ b/bitbake/lib/bs4/element.py | |||
@@ -1,3 +1,6 @@ | |||
1 | __license__ = "MIT" | ||
2 | |||
3 | from pdb import set_trace | ||
1 | import collections | 4 | import collections |
2 | import re | 5 | import re |
3 | import sys | 6 | import sys |
@@ -21,22 +24,22 @@ def _alias(attr): | |||
21 | return alias | 24 | return alias |
22 | 25 | ||
23 | 26 | ||
24 | class NamespacedAttribute(unicode): | 27 | class NamespacedAttribute(str): |
25 | 28 | ||
26 | def __new__(cls, prefix, name, namespace=None): | 29 | def __new__(cls, prefix, name, namespace=None): |
27 | if name is None: | 30 | if name is None: |
28 | obj = unicode.__new__(cls, prefix) | 31 | obj = str.__new__(cls, prefix) |
29 | elif prefix is None: | 32 | elif prefix is None: |
30 | # Not really namespaced. | 33 | # Not really namespaced. |
31 | obj = unicode.__new__(cls, name) | 34 | obj = str.__new__(cls, name) |
32 | else: | 35 | else: |
33 | obj = unicode.__new__(cls, prefix + ":" + name) | 36 | obj = str.__new__(cls, prefix + ":" + name) |
34 | obj.prefix = prefix | 37 | obj.prefix = prefix |
35 | obj.name = name | 38 | obj.name = name |
36 | obj.namespace = namespace | 39 | obj.namespace = namespace |
37 | return obj | 40 | return obj |
38 | 41 | ||
39 | class AttributeValueWithCharsetSubstitution(unicode): | 42 | class AttributeValueWithCharsetSubstitution(str): |
40 | """A stand-in object for a character encoding specified in HTML.""" | 43 | """A stand-in object for a character encoding specified in HTML.""" |
41 | 44 | ||
42 | class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): | 45 | class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): |
@@ -47,7 +50,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): | |||
47 | """ | 50 | """ |
48 | 51 | ||
49 | def __new__(cls, original_value): | 52 | def __new__(cls, original_value): |
50 | obj = unicode.__new__(cls, original_value) | 53 | obj = str.__new__(cls, original_value) |
51 | obj.original_value = original_value | 54 | obj.original_value = original_value |
52 | return obj | 55 | return obj |
53 | 56 | ||
@@ -70,9 +73,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): | |||
70 | match = cls.CHARSET_RE.search(original_value) | 73 | match = cls.CHARSET_RE.search(original_value) |
71 | if match is None: | 74 | if match is None: |
72 | # No substitution necessary. | 75 | # No substitution necessary. |
73 | return unicode.__new__(unicode, original_value) | 76 | return str.__new__(str, original_value) |
74 | 77 | ||
75 | obj = unicode.__new__(cls, original_value) | 78 | obj = str.__new__(cls, original_value) |
76 | obj.original_value = original_value | 79 | obj.original_value = original_value |
77 | return obj | 80 | return obj |
78 | 81 | ||
@@ -152,7 +155,7 @@ class PageElement(object): | |||
152 | 155 | ||
153 | def format_string(self, s, formatter='minimal'): | 156 | def format_string(self, s, formatter='minimal'): |
154 | """Format the given string using the given formatter.""" | 157 | """Format the given string using the given formatter.""" |
155 | if not callable(formatter): | 158 | if not isinstance(formatter, collections.Callable): |
156 | formatter = self._formatter_for_name(formatter) | 159 | formatter = self._formatter_for_name(formatter) |
157 | if formatter is None: | 160 | if formatter is None: |
158 | output = s | 161 | output = s |
@@ -185,24 +188,40 @@ class PageElement(object): | |||
185 | return self.HTML_FORMATTERS.get( | 188 | return self.HTML_FORMATTERS.get( |
186 | name, HTMLAwareEntitySubstitution.substitute_xml) | 189 | name, HTMLAwareEntitySubstitution.substitute_xml) |
187 | 190 | ||
188 | def setup(self, parent=None, previous_element=None): | 191 | def setup(self, parent=None, previous_element=None, next_element=None, |
192 | previous_sibling=None, next_sibling=None): | ||
189 | """Sets up the initial relations between this element and | 193 | """Sets up the initial relations between this element and |
190 | other elements.""" | 194 | other elements.""" |
191 | self.parent = parent | 195 | self.parent = parent |
196 | |||
192 | self.previous_element = previous_element | 197 | self.previous_element = previous_element |
193 | if previous_element is not None: | 198 | if previous_element is not None: |
194 | self.previous_element.next_element = self | 199 | self.previous_element.next_element = self |
195 | self.next_element = None | 200 | |
196 | self.previous_sibling = None | 201 | self.next_element = next_element |
197 | self.next_sibling = None | 202 | if self.next_element: |
198 | if self.parent is not None and self.parent.contents: | 203 | self.next_element.previous_element = self |
199 | self.previous_sibling = self.parent.contents[-1] | 204 | |
205 | self.next_sibling = next_sibling | ||
206 | if self.next_sibling: | ||
207 | self.next_sibling.previous_sibling = self | ||
208 | |||
209 | if (not previous_sibling | ||
210 | and self.parent is not None and self.parent.contents): | ||
211 | previous_sibling = self.parent.contents[-1] | ||
212 | |||
213 | self.previous_sibling = previous_sibling | ||
214 | if previous_sibling: | ||
200 | self.previous_sibling.next_sibling = self | 215 | self.previous_sibling.next_sibling = self |
201 | 216 | ||
202 | nextSibling = _alias("next_sibling") # BS3 | 217 | nextSibling = _alias("next_sibling") # BS3 |
203 | previousSibling = _alias("previous_sibling") # BS3 | 218 | previousSibling = _alias("previous_sibling") # BS3 |
204 | 219 | ||
205 | def replace_with(self, replace_with): | 220 | def replace_with(self, replace_with): |
221 | if not self.parent: | ||
222 | raise ValueError( | ||
223 | "Cannot replace one element with another when the" | ||
224 | "element to be replaced is not part of a tree.") | ||
206 | if replace_with is self: | 225 | if replace_with is self: |
207 | return | 226 | return |
208 | if replace_with is self.parent: | 227 | if replace_with is self.parent: |
@@ -216,6 +235,10 @@ class PageElement(object): | |||
216 | 235 | ||
217 | def unwrap(self): | 236 | def unwrap(self): |
218 | my_parent = self.parent | 237 | my_parent = self.parent |
238 | if not self.parent: | ||
239 | raise ValueError( | ||
240 | "Cannot replace an element with its contents when that" | ||
241 | "element is not part of a tree.") | ||
219 | my_index = self.parent.index(self) | 242 | my_index = self.parent.index(self) |
220 | self.extract() | 243 | self.extract() |
221 | for child in reversed(self.contents[:]): | 244 | for child in reversed(self.contents[:]): |
@@ -240,17 +263,20 @@ class PageElement(object): | |||
240 | last_child = self._last_descendant() | 263 | last_child = self._last_descendant() |
241 | next_element = last_child.next_element | 264 | next_element = last_child.next_element |
242 | 265 | ||
243 | if self.previous_element is not None: | 266 | if (self.previous_element is not None and |
267 | self.previous_element is not next_element): | ||
244 | self.previous_element.next_element = next_element | 268 | self.previous_element.next_element = next_element |
245 | if next_element is not None: | 269 | if next_element is not None and next_element is not self.previous_element: |
246 | next_element.previous_element = self.previous_element | 270 | next_element.previous_element = self.previous_element |
247 | self.previous_element = None | 271 | self.previous_element = None |
248 | last_child.next_element = None | 272 | last_child.next_element = None |
249 | 273 | ||
250 | self.parent = None | 274 | self.parent = None |
251 | if self.previous_sibling is not None: | 275 | if (self.previous_sibling is not None |
276 | and self.previous_sibling is not self.next_sibling): | ||
252 | self.previous_sibling.next_sibling = self.next_sibling | 277 | self.previous_sibling.next_sibling = self.next_sibling |
253 | if self.next_sibling is not None: | 278 | if (self.next_sibling is not None |
279 | and self.next_sibling is not self.previous_sibling): | ||
254 | self.next_sibling.previous_sibling = self.previous_sibling | 280 | self.next_sibling.previous_sibling = self.previous_sibling |
255 | self.previous_sibling = self.next_sibling = None | 281 | self.previous_sibling = self.next_sibling = None |
256 | return self | 282 | return self |
@@ -263,16 +289,18 @@ class PageElement(object): | |||
263 | last_child = self | 289 | last_child = self |
264 | while isinstance(last_child, Tag) and last_child.contents: | 290 | while isinstance(last_child, Tag) and last_child.contents: |
265 | last_child = last_child.contents[-1] | 291 | last_child = last_child.contents[-1] |
266 | if not accept_self and last_child == self: | 292 | if not accept_self and last_child is self: |
267 | last_child = None | 293 | last_child = None |
268 | return last_child | 294 | return last_child |
269 | # BS3: Not part of the API! | 295 | # BS3: Not part of the API! |
270 | _lastRecursiveChild = _last_descendant | 296 | _lastRecursiveChild = _last_descendant |
271 | 297 | ||
272 | def insert(self, position, new_child): | 298 | def insert(self, position, new_child): |
299 | if new_child is None: | ||
300 | raise ValueError("Cannot insert None into a tag.") | ||
273 | if new_child is self: | 301 | if new_child is self: |
274 | raise ValueError("Cannot insert a tag into itself.") | 302 | raise ValueError("Cannot insert a tag into itself.") |
275 | if (isinstance(new_child, basestring) | 303 | if (isinstance(new_child, str) |
276 | and not isinstance(new_child, NavigableString)): | 304 | and not isinstance(new_child, NavigableString)): |
277 | new_child = NavigableString(new_child) | 305 | new_child = NavigableString(new_child) |
278 | 306 | ||
@@ -478,6 +506,10 @@ class PageElement(object): | |||
478 | def _find_all(self, name, attrs, text, limit, generator, **kwargs): | 506 | def _find_all(self, name, attrs, text, limit, generator, **kwargs): |
479 | "Iterates over a generator looking for things that match." | 507 | "Iterates over a generator looking for things that match." |
480 | 508 | ||
509 | if text is None and 'string' in kwargs: | ||
510 | text = kwargs['string'] | ||
511 | del kwargs['string'] | ||
512 | |||
481 | if isinstance(name, SoupStrainer): | 513 | if isinstance(name, SoupStrainer): |
482 | strainer = name | 514 | strainer = name |
483 | else: | 515 | else: |
@@ -489,7 +521,7 @@ class PageElement(object): | |||
489 | result = (element for element in generator | 521 | result = (element for element in generator |
490 | if isinstance(element, Tag)) | 522 | if isinstance(element, Tag)) |
491 | return ResultSet(strainer, result) | 523 | return ResultSet(strainer, result) |
492 | elif isinstance(name, basestring): | 524 | elif isinstance(name, str): |
493 | # Optimization to find all tags with a given name. | 525 | # Optimization to find all tags with a given name. |
494 | result = (element for element in generator | 526 | result = (element for element in generator |
495 | if isinstance(element, Tag) | 527 | if isinstance(element, Tag) |
@@ -548,17 +580,17 @@ class PageElement(object): | |||
548 | 580 | ||
549 | # Methods for supporting CSS selectors. | 581 | # Methods for supporting CSS selectors. |
550 | 582 | ||
551 | tag_name_re = re.compile('^[a-z0-9]+$') | 583 | tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') |
552 | 584 | ||
553 | # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ | 585 | # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ |
554 | # \---/ \---/\-------------/ \-------/ | 586 | # \---------------------------/ \---/\-------------/ \-------/ |
555 | # | | | | | 587 | # | | | | |
556 | # | | | The value | 588 | # | | | The value |
557 | # | | ~,|,^,$,* or = | 589 | # | | ~,|,^,$,* or = |
558 | # | Attribute | 590 | # | Attribute |
559 | # Tag | 591 | # Tag |
560 | attribselect_re = re.compile( | 592 | attribselect_re = re.compile( |
561 | r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + | 593 | r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + |
562 | r'=?"?(?P<value>[^\]"]*)"?\]$' | 594 | r'=?"?(?P<value>[^\]"]*)"?\]$' |
563 | ) | 595 | ) |
564 | 596 | ||
@@ -640,7 +672,7 @@ class PageElement(object): | |||
640 | return self.parents | 672 | return self.parents |
641 | 673 | ||
642 | 674 | ||
643 | class NavigableString(unicode, PageElement): | 675 | class NavigableString(str, PageElement): |
644 | 676 | ||
645 | PREFIX = '' | 677 | PREFIX = '' |
646 | SUFFIX = '' | 678 | SUFFIX = '' |
@@ -653,15 +685,21 @@ class NavigableString(unicode, PageElement): | |||
653 | passed in to the superclass's __new__ or the superclass won't know | 685 | passed in to the superclass's __new__ or the superclass won't know |
654 | how to handle non-ASCII characters. | 686 | how to handle non-ASCII characters. |
655 | """ | 687 | """ |
656 | if isinstance(value, unicode): | 688 | if isinstance(value, str): |
657 | return unicode.__new__(cls, value) | 689 | u = str.__new__(cls, value) |
658 | return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) | 690 | else: |
691 | u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) | ||
692 | u.setup() | ||
693 | return u | ||
659 | 694 | ||
660 | def __copy__(self): | 695 | def __copy__(self): |
661 | return self | 696 | """A copy of a NavigableString has the same contents and class |
697 | as the original, but it is not connected to the parse tree. | ||
698 | """ | ||
699 | return type(self)(self) | ||
662 | 700 | ||
663 | def __getnewargs__(self): | 701 | def __getnewargs__(self): |
664 | return (unicode(self),) | 702 | return (str(self),) |
665 | 703 | ||
666 | def __getattr__(self, attr): | 704 | def __getattr__(self, attr): |
667 | """text.string gives you text. This is for backwards | 705 | """text.string gives you text. This is for backwards |
@@ -701,23 +739,23 @@ class PreformattedString(NavigableString): | |||
701 | 739 | ||
702 | class CData(PreformattedString): | 740 | class CData(PreformattedString): |
703 | 741 | ||
704 | PREFIX = u'<![CDATA[' | 742 | PREFIX = '<![CDATA[' |
705 | SUFFIX = u']]>' | 743 | SUFFIX = ']]>' |
706 | 744 | ||
707 | class ProcessingInstruction(PreformattedString): | 745 | class ProcessingInstruction(PreformattedString): |
708 | 746 | ||
709 | PREFIX = u'<?' | 747 | PREFIX = '<?' |
710 | SUFFIX = u'?>' | 748 | SUFFIX = '>' |
711 | 749 | ||
712 | class Comment(PreformattedString): | 750 | class Comment(PreformattedString): |
713 | 751 | ||
714 | PREFIX = u'<!--' | 752 | PREFIX = '<!--' |
715 | SUFFIX = u'-->' | 753 | SUFFIX = '-->' |
716 | 754 | ||
717 | 755 | ||
718 | class Declaration(PreformattedString): | 756 | class Declaration(PreformattedString): |
719 | PREFIX = u'<!' | 757 | PREFIX = '<?' |
720 | SUFFIX = u'!>' | 758 | SUFFIX = '?>' |
721 | 759 | ||
722 | 760 | ||
723 | class Doctype(PreformattedString): | 761 | class Doctype(PreformattedString): |
@@ -734,8 +772,8 @@ class Doctype(PreformattedString): | |||
734 | 772 | ||
735 | return Doctype(value) | 773 | return Doctype(value) |
736 | 774 | ||
737 | PREFIX = u'<!DOCTYPE ' | 775 | PREFIX = '<!DOCTYPE ' |
738 | SUFFIX = u'>\n' | 776 | SUFFIX = '>\n' |
739 | 777 | ||
740 | 778 | ||
741 | class Tag(PageElement): | 779 | class Tag(PageElement): |
@@ -759,9 +797,12 @@ class Tag(PageElement): | |||
759 | self.prefix = prefix | 797 | self.prefix = prefix |
760 | if attrs is None: | 798 | if attrs is None: |
761 | attrs = {} | 799 | attrs = {} |
762 | elif attrs and builder.cdata_list_attributes: | 800 | elif attrs: |
763 | attrs = builder._replace_cdata_list_attribute_values( | 801 | if builder is not None and builder.cdata_list_attributes: |
764 | self.name, attrs) | 802 | attrs = builder._replace_cdata_list_attribute_values( |
803 | self.name, attrs) | ||
804 | else: | ||
805 | attrs = dict(attrs) | ||
765 | else: | 806 | else: |
766 | attrs = dict(attrs) | 807 | attrs = dict(attrs) |
767 | self.attrs = attrs | 808 | self.attrs = attrs |
@@ -778,6 +819,18 @@ class Tag(PageElement): | |||
778 | 819 | ||
779 | parserClass = _alias("parser_class") # BS3 | 820 | parserClass = _alias("parser_class") # BS3 |
780 | 821 | ||
822 | def __copy__(self): | ||
823 | """A copy of a Tag is a new Tag, unconnected to the parse tree. | ||
824 | Its contents are a copy of the old Tag's contents. | ||
825 | """ | ||
826 | clone = type(self)(None, self.builder, self.name, self.namespace, | ||
827 | self.nsprefix, self.attrs) | ||
828 | for attr in ('can_be_empty_element', 'hidden'): | ||
829 | setattr(clone, attr, getattr(self, attr)) | ||
830 | for child in self.contents: | ||
831 | clone.append(child.__copy__()) | ||
832 | return clone | ||
833 | |||
781 | @property | 834 | @property |
782 | def is_empty_element(self): | 835 | def is_empty_element(self): |
783 | """Is this tag an empty-element tag? (aka a self-closing tag) | 836 | """Is this tag an empty-element tag? (aka a self-closing tag) |
@@ -843,7 +896,7 @@ class Tag(PageElement): | |||
843 | for string in self._all_strings(True): | 896 | for string in self._all_strings(True): |
844 | yield string | 897 | yield string |
845 | 898 | ||
846 | def get_text(self, separator=u"", strip=False, | 899 | def get_text(self, separator="", strip=False, |
847 | types=(NavigableString, CData)): | 900 | types=(NavigableString, CData)): |
848 | """ | 901 | """ |
849 | Get all child strings, concatenated using the given separator. | 902 | Get all child strings, concatenated using the given separator. |
@@ -915,7 +968,7 @@ class Tag(PageElement): | |||
915 | def __contains__(self, x): | 968 | def __contains__(self, x): |
916 | return x in self.contents | 969 | return x in self.contents |
917 | 970 | ||
918 | def __nonzero__(self): | 971 | def __bool__(self): |
919 | "A tag is non-None even if it has no contents." | 972 | "A tag is non-None even if it has no contents." |
920 | return True | 973 | return True |
921 | 974 | ||
@@ -971,15 +1024,25 @@ class Tag(PageElement): | |||
971 | as defined in __eq__.""" | 1024 | as defined in __eq__.""" |
972 | return not self == other | 1025 | return not self == other |
973 | 1026 | ||
974 | def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): | 1027 | def __repr__(self, encoding="unicode-escape"): |
975 | """Renders this tag as a string.""" | 1028 | """Renders this tag as a string.""" |
976 | return self.encode(encoding) | 1029 | if PY3K: |
1030 | # "The return value must be a string object", i.e. Unicode | ||
1031 | return self.decode() | ||
1032 | else: | ||
1033 | # "The return value must be a string object", i.e. a bytestring. | ||
1034 | # By convention, the return value of __repr__ should also be | ||
1035 | # an ASCII string. | ||
1036 | return self.encode(encoding) | ||
977 | 1037 | ||
978 | def __unicode__(self): | 1038 | def __unicode__(self): |
979 | return self.decode() | 1039 | return self.decode() |
980 | 1040 | ||
981 | def __str__(self): | 1041 | def __str__(self): |
982 | return self.encode() | 1042 | if PY3K: |
1043 | return self.decode() | ||
1044 | else: | ||
1045 | return self.encode() | ||
983 | 1046 | ||
984 | if PY3K: | 1047 | if PY3K: |
985 | __str__ = __repr__ = __unicode__ | 1048 | __str__ = __repr__ = __unicode__ |
@@ -1014,7 +1077,7 @@ class Tag(PageElement): | |||
1014 | 1077 | ||
1015 | # First off, turn a string formatter into a function. This | 1078 | # First off, turn a string formatter into a function. This |
1016 | # will stop the lookup from happening over and over again. | 1079 | # will stop the lookup from happening over and over again. |
1017 | if not callable(formatter): | 1080 | if not isinstance(formatter, collections.Callable): |
1018 | formatter = self._formatter_for_name(formatter) | 1081 | formatter = self._formatter_for_name(formatter) |
1019 | 1082 | ||
1020 | attrs = [] | 1083 | attrs = [] |
@@ -1025,8 +1088,8 @@ class Tag(PageElement): | |||
1025 | else: | 1088 | else: |
1026 | if isinstance(val, list) or isinstance(val, tuple): | 1089 | if isinstance(val, list) or isinstance(val, tuple): |
1027 | val = ' '.join(val) | 1090 | val = ' '.join(val) |
1028 | elif not isinstance(val, basestring): | 1091 | elif not isinstance(val, str): |
1029 | val = unicode(val) | 1092 | val = str(val) |
1030 | elif ( | 1093 | elif ( |
1031 | isinstance(val, AttributeValueWithCharsetSubstitution) | 1094 | isinstance(val, AttributeValueWithCharsetSubstitution) |
1032 | and eventual_encoding is not None): | 1095 | and eventual_encoding is not None): |
@@ -1034,7 +1097,7 @@ class Tag(PageElement): | |||
1034 | 1097 | ||
1035 | text = self.format_string(val, formatter) | 1098 | text = self.format_string(val, formatter) |
1036 | decoded = ( | 1099 | decoded = ( |
1037 | unicode(key) + '=' | 1100 | str(key) + '=' |
1038 | + EntitySubstitution.quoted_attribute_value(text)) | 1101 | + EntitySubstitution.quoted_attribute_value(text)) |
1039 | attrs.append(decoded) | 1102 | attrs.append(decoded) |
1040 | close = '' | 1103 | close = '' |
@@ -1103,16 +1166,22 @@ class Tag(PageElement): | |||
1103 | formatter="minimal"): | 1166 | formatter="minimal"): |
1104 | """Renders the contents of this tag as a Unicode string. | 1167 | """Renders the contents of this tag as a Unicode string. |
1105 | 1168 | ||
1169 | :param indent_level: Each line of the rendering will be | ||
1170 | indented this many spaces. | ||
1171 | |||
1106 | :param eventual_encoding: The tag is destined to be | 1172 | :param eventual_encoding: The tag is destined to be |
1107 | encoded into this encoding. This method is _not_ | 1173 | encoded into this encoding. This method is _not_ |
1108 | responsible for performing that encoding. This information | 1174 | responsible for performing that encoding. This information |
1109 | is passed in so that it can be substituted in if the | 1175 | is passed in so that it can be substituted in if the |
1110 | document contains a <META> tag that mentions the document's | 1176 | document contains a <META> tag that mentions the document's |
1111 | encoding. | 1177 | encoding. |
1178 | |||
1179 | :param formatter: The output formatter responsible for converting | ||
1180 | entities to Unicode characters. | ||
1112 | """ | 1181 | """ |
1113 | # First off, turn a string formatter into a function. This | 1182 | # First off, turn a string formatter into a function. This |
1114 | # will stop the lookup from happening over and over again. | 1183 | # will stop the lookup from happening over and over again. |
1115 | if not callable(formatter): | 1184 | if not isinstance(formatter, collections.Callable): |
1116 | formatter = self._formatter_for_name(formatter) | 1185 | formatter = self._formatter_for_name(formatter) |
1117 | 1186 | ||
1118 | pretty_print = (indent_level is not None) | 1187 | pretty_print = (indent_level is not None) |
@@ -1137,7 +1206,17 @@ class Tag(PageElement): | |||
1137 | def encode_contents( | 1206 | def encode_contents( |
1138 | self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, | 1207 | self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, |
1139 | formatter="minimal"): | 1208 | formatter="minimal"): |
1140 | """Renders the contents of this tag as a bytestring.""" | 1209 | """Renders the contents of this tag as a bytestring. |
1210 | |||
1211 | :param indent_level: Each line of the rendering will be | ||
1212 | indented this many spaces. | ||
1213 | |||
1214 | :param eventual_encoding: The bytestring will be in this encoding. | ||
1215 | |||
1216 | :param formatter: The output formatter responsible for converting | ||
1217 | entities to Unicode characters. | ||
1218 | """ | ||
1219 | |||
1141 | contents = self.decode_contents(indent_level, encoding, formatter) | 1220 | contents = self.decode_contents(indent_level, encoding, formatter) |
1142 | return contents.encode(encoding) | 1221 | return contents.encode(encoding) |
1143 | 1222 | ||
@@ -1201,26 +1280,57 @@ class Tag(PageElement): | |||
1201 | 1280 | ||
1202 | _selector_combinators = ['>', '+', '~'] | 1281 | _selector_combinators = ['>', '+', '~'] |
1203 | _select_debug = False | 1282 | _select_debug = False |
1204 | def select(self, selector, _candidate_generator=None): | 1283 | def select_one(self, selector): |
1284 | """Perform a CSS selection operation on the current element.""" | ||
1285 | value = self.select(selector, limit=1) | ||
1286 | if value: | ||
1287 | return value[0] | ||
1288 | return None | ||
1289 | |||
1290 | def select(self, selector, _candidate_generator=None, limit=None): | ||
1205 | """Perform a CSS selection operation on the current element.""" | 1291 | """Perform a CSS selection operation on the current element.""" |
1292 | |||
1293 | # Handle grouping selectors if ',' exists, ie: p,a | ||
1294 | if ',' in selector: | ||
1295 | context = [] | ||
1296 | for partial_selector in selector.split(','): | ||
1297 | partial_selector = partial_selector.strip() | ||
1298 | if partial_selector == '': | ||
1299 | raise ValueError('Invalid group selection syntax: %s' % selector) | ||
1300 | candidates = self.select(partial_selector, limit=limit) | ||
1301 | for candidate in candidates: | ||
1302 | if candidate not in context: | ||
1303 | context.append(candidate) | ||
1304 | |||
1305 | if limit and len(context) >= limit: | ||
1306 | break | ||
1307 | return context | ||
1308 | |||
1206 | tokens = selector.split() | 1309 | tokens = selector.split() |
1207 | current_context = [self] | 1310 | current_context = [self] |
1208 | 1311 | ||
1209 | if tokens[-1] in self._selector_combinators: | 1312 | if tokens[-1] in self._selector_combinators: |
1210 | raise ValueError( | 1313 | raise ValueError( |
1211 | 'Final combinator "%s" is missing an argument.' % tokens[-1]) | 1314 | 'Final combinator "%s" is missing an argument.' % tokens[-1]) |
1315 | |||
1212 | if self._select_debug: | 1316 | if self._select_debug: |
1213 | print 'Running CSS selector "%s"' % selector | 1317 | print('Running CSS selector "%s"' % selector) |
1318 | |||
1214 | for index, token in enumerate(tokens): | 1319 | for index, token in enumerate(tokens): |
1215 | if self._select_debug: | 1320 | new_context = [] |
1216 | print ' Considering token "%s"' % token | 1321 | new_context_ids = set([]) |
1217 | recursive_candidate_generator = None | 1322 | |
1218 | tag_name = None | ||
1219 | if tokens[index-1] in self._selector_combinators: | 1323 | if tokens[index-1] in self._selector_combinators: |
1220 | # This token was consumed by the previous combinator. Skip it. | 1324 | # This token was consumed by the previous combinator. Skip it. |
1221 | if self._select_debug: | 1325 | if self._select_debug: |
1222 | print ' Token was consumed by the previous combinator.' | 1326 | print(' Token was consumed by the previous combinator.') |
1223 | continue | 1327 | continue |
1328 | |||
1329 | if self._select_debug: | ||
1330 | print(' Considering token "%s"' % token) | ||
1331 | recursive_candidate_generator = None | ||
1332 | tag_name = None | ||
1333 | |||
1224 | # Each operation corresponds to a checker function, a rule | 1334 | # Each operation corresponds to a checker function, a rule |
1225 | # for determining whether a candidate matches the | 1335 | # for determining whether a candidate matches the |
1226 | # selector. Candidates are generated by the active | 1336 | # selector. Candidates are generated by the active |
@@ -1256,35 +1366,38 @@ class Tag(PageElement): | |||
1256 | "A pseudo-class must be prefixed with a tag name.") | 1366 | "A pseudo-class must be prefixed with a tag name.") |
1257 | pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) | 1367 | pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) |
1258 | found = [] | 1368 | found = [] |
1259 | if pseudo_attributes is not None: | 1369 | if pseudo_attributes is None: |
1370 | pseudo_type = pseudo | ||
1371 | pseudo_value = None | ||
1372 | else: | ||
1260 | pseudo_type, pseudo_value = pseudo_attributes.groups() | 1373 | pseudo_type, pseudo_value = pseudo_attributes.groups() |
1261 | if pseudo_type == 'nth-of-type': | 1374 | if pseudo_type == 'nth-of-type': |
1262 | try: | 1375 | try: |
1263 | pseudo_value = int(pseudo_value) | 1376 | pseudo_value = int(pseudo_value) |
1264 | except: | 1377 | except: |
1265 | raise NotImplementedError( | ||
1266 | 'Only numeric values are currently supported for the nth-of-type pseudo-class.') | ||
1267 | if pseudo_value < 1: | ||
1268 | raise ValueError( | ||
1269 | 'nth-of-type pseudo-class value must be at least 1.') | ||
1270 | class Counter(object): | ||
1271 | def __init__(self, destination): | ||
1272 | self.count = 0 | ||
1273 | self.destination = destination | ||
1274 | |||
1275 | def nth_child_of_type(self, tag): | ||
1276 | self.count += 1 | ||
1277 | if self.count == self.destination: | ||
1278 | return True | ||
1279 | if self.count > self.destination: | ||
1280 | # Stop the generator that's sending us | ||
1281 | # these things. | ||
1282 | raise StopIteration() | ||
1283 | return False | ||
1284 | checker = Counter(pseudo_value).nth_child_of_type | ||
1285 | else: | ||
1286 | raise NotImplementedError( | 1378 | raise NotImplementedError( |
1287 | 'Only the following pseudo-classes are implemented: nth-of-type.') | 1379 | 'Only numeric values are currently supported for the nth-of-type pseudo-class.') |
1380 | if pseudo_value < 1: | ||
1381 | raise ValueError( | ||
1382 | 'nth-of-type pseudo-class value must be at least 1.') | ||
1383 | class Counter(object): | ||
1384 | def __init__(self, destination): | ||
1385 | self.count = 0 | ||
1386 | self.destination = destination | ||
1387 | |||
1388 | def nth_child_of_type(self, tag): | ||
1389 | self.count += 1 | ||
1390 | if self.count == self.destination: | ||
1391 | return True | ||
1392 | if self.count > self.destination: | ||
1393 | # Stop the generator that's sending us | ||
1394 | # these things. | ||
1395 | raise StopIteration() | ||
1396 | return False | ||
1397 | checker = Counter(pseudo_value).nth_child_of_type | ||
1398 | else: | ||
1399 | raise NotImplementedError( | ||
1400 | 'Only the following pseudo-classes are implemented: nth-of-type.') | ||
1288 | 1401 | ||
1289 | elif token == '*': | 1402 | elif token == '*': |
1290 | # Star selector -- matches everything | 1403 | # Star selector -- matches everything |
@@ -1311,7 +1424,6 @@ class Tag(PageElement): | |||
1311 | else: | 1424 | else: |
1312 | raise ValueError( | 1425 | raise ValueError( |
1313 | 'Unsupported or invalid CSS selector: "%s"' % token) | 1426 | 'Unsupported or invalid CSS selector: "%s"' % token) |
1314 | |||
1315 | if recursive_candidate_generator: | 1427 | if recursive_candidate_generator: |
1316 | # This happens when the selector looks like "> foo". | 1428 | # This happens when the selector looks like "> foo". |
1317 | # | 1429 | # |
@@ -1325,14 +1437,14 @@ class Tag(PageElement): | |||
1325 | next_token = tokens[index+1] | 1437 | next_token = tokens[index+1] |
1326 | def recursive_select(tag): | 1438 | def recursive_select(tag): |
1327 | if self._select_debug: | 1439 | if self._select_debug: |
1328 | print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) | 1440 | print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) |
1329 | print '-' * 40 | 1441 | print('-' * 40) |
1330 | for i in tag.select(next_token, recursive_candidate_generator): | 1442 | for i in tag.select(next_token, recursive_candidate_generator): |
1331 | if self._select_debug: | 1443 | if self._select_debug: |
1332 | print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) | 1444 | print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) |
1333 | yield i | 1445 | yield i |
1334 | if self._select_debug: | 1446 | if self._select_debug: |
1335 | print '-' * 40 | 1447 | print('-' * 40) |
1336 | _use_candidate_generator = recursive_select | 1448 | _use_candidate_generator = recursive_select |
1337 | elif _candidate_generator is None: | 1449 | elif _candidate_generator is None: |
1338 | # By default, a tag's candidates are all of its | 1450 | # By default, a tag's candidates are all of its |
@@ -1343,7 +1455,7 @@ class Tag(PageElement): | |||
1343 | check = "[any]" | 1455 | check = "[any]" |
1344 | else: | 1456 | else: |
1345 | check = tag_name | 1457 | check = tag_name |
1346 | print ' Default candidate generator, tag name="%s"' % check | 1458 | print(' Default candidate generator, tag name="%s"' % check) |
1347 | if self._select_debug: | 1459 | if self._select_debug: |
1348 | # This is redundant with later code, but it stops | 1460 | # This is redundant with later code, but it stops |
1349 | # a bunch of bogus tags from cluttering up the | 1461 | # a bunch of bogus tags from cluttering up the |
@@ -1361,12 +1473,11 @@ class Tag(PageElement): | |||
1361 | else: | 1473 | else: |
1362 | _use_candidate_generator = _candidate_generator | 1474 | _use_candidate_generator = _candidate_generator |
1363 | 1475 | ||
1364 | new_context = [] | 1476 | count = 0 |
1365 | new_context_ids = set([]) | ||
1366 | for tag in current_context: | 1477 | for tag in current_context: |
1367 | if self._select_debug: | 1478 | if self._select_debug: |
1368 | print " Running candidate generator on %s %s" % ( | 1479 | print(" Running candidate generator on %s %s" % ( |
1369 | tag.name, repr(tag.attrs)) | 1480 | tag.name, repr(tag.attrs))) |
1370 | for candidate in _use_candidate_generator(tag): | 1481 | for candidate in _use_candidate_generator(tag): |
1371 | if not isinstance(candidate, Tag): | 1482 | if not isinstance(candidate, Tag): |
1372 | continue | 1483 | continue |
@@ -1381,21 +1492,24 @@ class Tag(PageElement): | |||
1381 | break | 1492 | break |
1382 | if checker is None or result: | 1493 | if checker is None or result: |
1383 | if self._select_debug: | 1494 | if self._select_debug: |
1384 | print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) | 1495 | print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) |
1385 | if id(candidate) not in new_context_ids: | 1496 | if id(candidate) not in new_context_ids: |
1386 | # If a tag matches a selector more than once, | 1497 | # If a tag matches a selector more than once, |
1387 | # don't include it in the context more than once. | 1498 | # don't include it in the context more than once. |
1388 | new_context.append(candidate) | 1499 | new_context.append(candidate) |
1389 | new_context_ids.add(id(candidate)) | 1500 | new_context_ids.add(id(candidate)) |
1501 | if limit and len(new_context) >= limit: | ||
1502 | break | ||
1390 | elif self._select_debug: | 1503 | elif self._select_debug: |
1391 | print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) | 1504 | print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) |
1505 | |||
1392 | 1506 | ||
1393 | current_context = new_context | 1507 | current_context = new_context |
1394 | 1508 | ||
1395 | if self._select_debug: | 1509 | if self._select_debug: |
1396 | print "Final verdict:" | 1510 | print("Final verdict:") |
1397 | for i in current_context: | 1511 | for i in current_context: |
1398 | print " %s %s" % (i.name, i.attrs) | 1512 | print(" %s %s" % (i.name, i.attrs)) |
1399 | return current_context | 1513 | return current_context |
1400 | 1514 | ||
1401 | # Old names for backwards compatibility | 1515 | # Old names for backwards compatibility |
@@ -1439,7 +1553,7 @@ class SoupStrainer(object): | |||
1439 | else: | 1553 | else: |
1440 | attrs = kwargs | 1554 | attrs = kwargs |
1441 | normalized_attrs = {} | 1555 | normalized_attrs = {} |
1442 | for key, value in attrs.items(): | 1556 | for key, value in list(attrs.items()): |
1443 | normalized_attrs[key] = self._normalize_search_value(value) | 1557 | normalized_attrs[key] = self._normalize_search_value(value) |
1444 | 1558 | ||
1445 | self.attrs = normalized_attrs | 1559 | self.attrs = normalized_attrs |
@@ -1448,7 +1562,7 @@ class SoupStrainer(object): | |||
1448 | def _normalize_search_value(self, value): | 1562 | def _normalize_search_value(self, value): |
1449 | # Leave it alone if it's a Unicode string, a callable, a | 1563 | # Leave it alone if it's a Unicode string, a callable, a |
1450 | # regular expression, a boolean, or None. | 1564 | # regular expression, a boolean, or None. |
1451 | if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') | 1565 | if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match') |
1452 | or isinstance(value, bool) or value is None): | 1566 | or isinstance(value, bool) or value is None): |
1453 | return value | 1567 | return value |
1454 | 1568 | ||
@@ -1461,7 +1575,7 @@ class SoupStrainer(object): | |||
1461 | new_value = [] | 1575 | new_value = [] |
1462 | for v in value: | 1576 | for v in value: |
1463 | if (hasattr(v, '__iter__') and not isinstance(v, bytes) | 1577 | if (hasattr(v, '__iter__') and not isinstance(v, bytes) |
1464 | and not isinstance(v, unicode)): | 1578 | and not isinstance(v, str)): |
1465 | # This is almost certainly the user's mistake. In the | 1579 | # This is almost certainly the user's mistake. In the |
1466 | # interests of avoiding infinite loops, we'll let | 1580 | # interests of avoiding infinite loops, we'll let |
1467 | # it through as-is rather than doing a recursive call. | 1581 | # it through as-is rather than doing a recursive call. |
@@ -1473,7 +1587,7 @@ class SoupStrainer(object): | |||
1473 | # Otherwise, convert it into a Unicode string. | 1587 | # Otherwise, convert it into a Unicode string. |
1474 | # The unicode(str()) thing is so this will do the same thing on Python 2 | 1588 | # The unicode(str()) thing is so this will do the same thing on Python 2 |
1475 | # and Python 3. | 1589 | # and Python 3. |
1476 | return unicode(str(value)) | 1590 | return str(str(value)) |
1477 | 1591 | ||
1478 | def __str__(self): | 1592 | def __str__(self): |
1479 | if self.text: | 1593 | if self.text: |
@@ -1527,7 +1641,7 @@ class SoupStrainer(object): | |||
1527 | found = None | 1641 | found = None |
1528 | # If given a list of items, scan it for a text element that | 1642 | # If given a list of items, scan it for a text element that |
1529 | # matches. | 1643 | # matches. |
1530 | if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): | 1644 | if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): |
1531 | for element in markup: | 1645 | for element in markup: |
1532 | if isinstance(element, NavigableString) \ | 1646 | if isinstance(element, NavigableString) \ |
1533 | and self.search(element): | 1647 | and self.search(element): |
@@ -1540,7 +1654,7 @@ class SoupStrainer(object): | |||
1540 | found = self.search_tag(markup) | 1654 | found = self.search_tag(markup) |
1541 | # If it's text, make sure the text matches. | 1655 | # If it's text, make sure the text matches. |
1542 | elif isinstance(markup, NavigableString) or \ | 1656 | elif isinstance(markup, NavigableString) or \ |
1543 | isinstance(markup, basestring): | 1657 | isinstance(markup, str): |
1544 | if not self.name and not self.attrs and self._matches(markup, self.text): | 1658 | if not self.name and not self.attrs and self._matches(markup, self.text): |
1545 | found = markup | 1659 | found = markup |
1546 | else: | 1660 | else: |
@@ -1554,7 +1668,7 @@ class SoupStrainer(object): | |||
1554 | if isinstance(markup, list) or isinstance(markup, tuple): | 1668 | if isinstance(markup, list) or isinstance(markup, tuple): |
1555 | # This should only happen when searching a multi-valued attribute | 1669 | # This should only happen when searching a multi-valued attribute |
1556 | # like 'class'. | 1670 | # like 'class'. |
1557 | if (isinstance(match_against, unicode) | 1671 | if (isinstance(match_against, str) |
1558 | and ' ' in match_against): | 1672 | and ' ' in match_against): |
1559 | # A bit of a special case. If they try to match "foo | 1673 | # A bit of a special case. If they try to match "foo |
1560 | # bar" on a multivalue attribute's value, only accept | 1674 | # bar" on a multivalue attribute's value, only accept |
@@ -1589,7 +1703,7 @@ class SoupStrainer(object): | |||
1589 | # None matches None, False, an empty string, an empty list, and so on. | 1703 | # None matches None, False, an empty string, an empty list, and so on. |
1590 | return not match_against | 1704 | return not match_against |
1591 | 1705 | ||
1592 | if isinstance(match_against, unicode): | 1706 | if isinstance(match_against, str): |
1593 | # Exact string match | 1707 | # Exact string match |
1594 | return markup == match_against | 1708 | return markup == match_against |
1595 | 1709 | ||
diff --git a/bitbake/lib/bs4/testing.py b/bitbake/lib/bs4/testing.py index fd4495ac58..3a2f260e24 100644 --- a/bitbake/lib/bs4/testing.py +++ b/bitbake/lib/bs4/testing.py | |||
@@ -1,5 +1,8 @@ | |||
1 | """Helper classes for tests.""" | 1 | """Helper classes for tests.""" |
2 | 2 | ||
3 | __license__ = "MIT" | ||
4 | |||
5 | import pickle | ||
3 | import copy | 6 | import copy |
4 | import functools | 7 | import functools |
5 | import unittest | 8 | import unittest |
@@ -43,6 +46,16 @@ class SoupTest(unittest.TestCase): | |||
43 | 46 | ||
44 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) | 47 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) |
45 | 48 | ||
49 | def assertConnectedness(self, element): | ||
50 | """Ensure that next_element and previous_element are properly | ||
51 | set for all descendants of the given element. | ||
52 | """ | ||
53 | earlier = None | ||
54 | for e in element.descendants: | ||
55 | if earlier: | ||
56 | self.assertEqual(e, earlier.next_element) | ||
57 | self.assertEqual(earlier, e.previous_element) | ||
58 | earlier = e | ||
46 | 59 | ||
47 | class HTMLTreeBuilderSmokeTest(object): | 60 | class HTMLTreeBuilderSmokeTest(object): |
48 | 61 | ||
@@ -54,6 +67,15 @@ class HTMLTreeBuilderSmokeTest(object): | |||
54 | markup in these tests, there's not much room for interpretation. | 67 | markup in these tests, there's not much room for interpretation. |
55 | """ | 68 | """ |
56 | 69 | ||
70 | def test_pickle_and_unpickle_identity(self): | ||
71 | # Pickling a tree, then unpickling it, yields a tree identical | ||
72 | # to the original. | ||
73 | tree = self.soup("<a><b>foo</a>") | ||
74 | dumped = pickle.dumps(tree, 2) | ||
75 | loaded = pickle.loads(dumped) | ||
76 | self.assertEqual(loaded.__class__, BeautifulSoup) | ||
77 | self.assertEqual(loaded.decode(), tree.decode()) | ||
78 | |||
57 | def assertDoctypeHandled(self, doctype_fragment): | 79 | def assertDoctypeHandled(self, doctype_fragment): |
58 | """Assert that a given doctype string is handled correctly.""" | 80 | """Assert that a given doctype string is handled correctly.""" |
59 | doctype_str, soup = self._document_with_doctype(doctype_fragment) | 81 | doctype_str, soup = self._document_with_doctype(doctype_fragment) |
@@ -114,6 +136,11 @@ class HTMLTreeBuilderSmokeTest(object): | |||
114 | soup.encode("utf-8").replace(b"\n", b""), | 136 | soup.encode("utf-8").replace(b"\n", b""), |
115 | markup.replace(b"\n", b"")) | 137 | markup.replace(b"\n", b"")) |
116 | 138 | ||
139 | def test_processing_instruction(self): | ||
140 | markup = b"""<?PITarget PIContent?>""" | ||
141 | soup = self.soup(markup) | ||
142 | self.assertEqual(markup, soup.encode("utf8")) | ||
143 | |||
117 | def test_deepcopy(self): | 144 | def test_deepcopy(self): |
118 | """Make sure you can copy the tree builder. | 145 | """Make sure you can copy the tree builder. |
119 | 146 | ||
@@ -155,6 +182,23 @@ class HTMLTreeBuilderSmokeTest(object): | |||
155 | def test_nested_formatting_elements(self): | 182 | def test_nested_formatting_elements(self): |
156 | self.assertSoupEquals("<em><em></em></em>") | 183 | self.assertSoupEquals("<em><em></em></em>") |
157 | 184 | ||
185 | def test_double_head(self): | ||
186 | html = '''<!DOCTYPE html> | ||
187 | <html> | ||
188 | <head> | ||
189 | <title>Ordinary HEAD element test</title> | ||
190 | </head> | ||
191 | <script type="text/javascript"> | ||
192 | alert("Help!"); | ||
193 | </script> | ||
194 | <body> | ||
195 | Hello, world! | ||
196 | </body> | ||
197 | </html> | ||
198 | ''' | ||
199 | soup = self.soup(html) | ||
200 | self.assertEqual("text/javascript", soup.find('script')['type']) | ||
201 | |||
158 | def test_comment(self): | 202 | def test_comment(self): |
159 | # Comments are represented as Comment objects. | 203 | # Comments are represented as Comment objects. |
160 | markup = "<p>foo<!--foobar-->baz</p>" | 204 | markup = "<p>foo<!--foobar-->baz</p>" |
@@ -221,18 +265,26 @@ class HTMLTreeBuilderSmokeTest(object): | |||
221 | soup = self.soup(markup) | 265 | soup = self.soup(markup) |
222 | self.assertEqual(["css"], soup.div.div['class']) | 266 | self.assertEqual(["css"], soup.div.div['class']) |
223 | 267 | ||
268 | def test_multivalued_attribute_on_html(self): | ||
269 | # html5lib uses a different API to set the attributes ot the | ||
270 | # <html> tag. This has caused problems with multivalued | ||
271 | # attributes. | ||
272 | markup = '<html class="a b"></html>' | ||
273 | soup = self.soup(markup) | ||
274 | self.assertEqual(["a", "b"], soup.html['class']) | ||
275 | |||
224 | def test_angle_brackets_in_attribute_values_are_escaped(self): | 276 | def test_angle_brackets_in_attribute_values_are_escaped(self): |
225 | self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') | 277 | self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') |
226 | 278 | ||
227 | def test_entities_in_attributes_converted_to_unicode(self): | 279 | def test_entities_in_attributes_converted_to_unicode(self): |
228 | expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' | 280 | expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' |
229 | self.assertSoupEquals('<p id="piñata"></p>', expect) | 281 | self.assertSoupEquals('<p id="piñata"></p>', expect) |
230 | self.assertSoupEquals('<p id="piñata"></p>', expect) | 282 | self.assertSoupEquals('<p id="piñata"></p>', expect) |
231 | self.assertSoupEquals('<p id="piñata"></p>', expect) | 283 | self.assertSoupEquals('<p id="piñata"></p>', expect) |
232 | self.assertSoupEquals('<p id="piñata"></p>', expect) | 284 | self.assertSoupEquals('<p id="piñata"></p>', expect) |
233 | 285 | ||
234 | def test_entities_in_text_converted_to_unicode(self): | 286 | def test_entities_in_text_converted_to_unicode(self): |
235 | expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' | 287 | expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' |
236 | self.assertSoupEquals("<p>piñata</p>", expect) | 288 | self.assertSoupEquals("<p>piñata</p>", expect) |
237 | self.assertSoupEquals("<p>piñata</p>", expect) | 289 | self.assertSoupEquals("<p>piñata</p>", expect) |
238 | self.assertSoupEquals("<p>piñata</p>", expect) | 290 | self.assertSoupEquals("<p>piñata</p>", expect) |
@@ -243,7 +295,7 @@ class HTMLTreeBuilderSmokeTest(object): | |||
243 | '<p>I said "good day!"</p>') | 295 | '<p>I said "good day!"</p>') |
244 | 296 | ||
245 | def test_out_of_range_entity(self): | 297 | def test_out_of_range_entity(self): |
246 | expect = u"\N{REPLACEMENT CHARACTER}" | 298 | expect = "\N{REPLACEMENT CHARACTER}" |
247 | self.assertSoupEquals("�", expect) | 299 | self.assertSoupEquals("�", expect) |
248 | self.assertSoupEquals("�", expect) | 300 | self.assertSoupEquals("�", expect) |
249 | self.assertSoupEquals("�", expect) | 301 | self.assertSoupEquals("�", expect) |
@@ -253,6 +305,35 @@ class HTMLTreeBuilderSmokeTest(object): | |||
253 | soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") | 305 | soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") |
254 | self.assertEqual("p", soup.h2.string.next_element.name) | 306 | self.assertEqual("p", soup.h2.string.next_element.name) |
255 | self.assertEqual("p", soup.p.name) | 307 | self.assertEqual("p", soup.p.name) |
308 | self.assertConnectedness(soup) | ||
309 | |||
310 | def test_head_tag_between_head_and_body(self): | ||
311 | "Prevent recurrence of a bug in the html5lib treebuilder." | ||
312 | content = """<html><head></head> | ||
313 | <link></link> | ||
314 | <body>foo</body> | ||
315 | </html> | ||
316 | """ | ||
317 | soup = self.soup(content) | ||
318 | self.assertNotEqual(None, soup.html.body) | ||
319 | self.assertConnectedness(soup) | ||
320 | |||
321 | def test_multiple_copies_of_a_tag(self): | ||
322 | "Prevent recurrence of a bug in the html5lib treebuilder." | ||
323 | content = """<!DOCTYPE html> | ||
324 | <html> | ||
325 | <body> | ||
326 | <article id="a" > | ||
327 | <div><a href="1"></div> | ||
328 | <footer> | ||
329 | <a href="2"></a> | ||
330 | </footer> | ||
331 | </article> | ||
332 | </body> | ||
333 | </html> | ||
334 | """ | ||
335 | soup = self.soup(content) | ||
336 | self.assertConnectedness(soup.article) | ||
256 | 337 | ||
257 | def test_basic_namespaces(self): | 338 | def test_basic_namespaces(self): |
258 | """Parsers don't need to *understand* namespaces, but at the | 339 | """Parsers don't need to *understand* namespaces, but at the |
@@ -285,9 +366,9 @@ class HTMLTreeBuilderSmokeTest(object): | |||
285 | # A seemingly innocuous document... but it's in Unicode! And | 366 | # A seemingly innocuous document... but it's in Unicode! And |
286 | # it contains characters that can't be represented in the | 367 | # it contains characters that can't be represented in the |
287 | # encoding found in the declaration! The horror! | 368 | # encoding found in the declaration! The horror! |
288 | markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' | 369 | markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' |
289 | soup = self.soup(markup) | 370 | soup = self.soup(markup) |
290 | self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) | 371 | self.assertEqual('Sacr\xe9 bleu!', soup.body.string) |
291 | 372 | ||
292 | def test_soupstrainer(self): | 373 | def test_soupstrainer(self): |
293 | """Parsers should be able to work with SoupStrainers.""" | 374 | """Parsers should be able to work with SoupStrainers.""" |
@@ -327,7 +408,7 @@ class HTMLTreeBuilderSmokeTest(object): | |||
327 | # Both XML and HTML entities are converted to Unicode characters | 408 | # Both XML and HTML entities are converted to Unicode characters |
328 | # during parsing. | 409 | # during parsing. |
329 | text = "<p><<sacré bleu!>></p>" | 410 | text = "<p><<sacré bleu!>></p>" |
330 | expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" | 411 | expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" |
331 | self.assertSoupEquals(text, expected) | 412 | self.assertSoupEquals(text, expected) |
332 | 413 | ||
333 | def test_smart_quotes_converted_on_the_way_in(self): | 414 | def test_smart_quotes_converted_on_the_way_in(self): |
@@ -337,15 +418,15 @@ class HTMLTreeBuilderSmokeTest(object): | |||
337 | soup = self.soup(quote) | 418 | soup = self.soup(quote) |
338 | self.assertEqual( | 419 | self.assertEqual( |
339 | soup.p.string, | 420 | soup.p.string, |
340 | u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") | 421 | "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") |
341 | 422 | ||
342 | def test_non_breaking_spaces_converted_on_the_way_in(self): | 423 | def test_non_breaking_spaces_converted_on_the_way_in(self): |
343 | soup = self.soup("<a> </a>") | 424 | soup = self.soup("<a> </a>") |
344 | self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) | 425 | self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) |
345 | 426 | ||
346 | def test_entities_converted_on_the_way_out(self): | 427 | def test_entities_converted_on_the_way_out(self): |
347 | text = "<p><<sacré bleu!>></p>" | 428 | text = "<p><<sacré bleu!>></p>" |
348 | expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") | 429 | expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") |
349 | soup = self.soup(text) | 430 | soup = self.soup(text) |
350 | self.assertEqual(soup.p.encode("utf-8"), expected) | 431 | self.assertEqual(soup.p.encode("utf-8"), expected) |
351 | 432 | ||
@@ -354,7 +435,7 @@ class HTMLTreeBuilderSmokeTest(object): | |||
354 | # easy-to-understand document. | 435 | # easy-to-understand document. |
355 | 436 | ||
356 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. | 437 | # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. |
357 | unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' | 438 | unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' |
358 | 439 | ||
359 | # That's because we're going to encode it into ISO-Latin-1, and use | 440 | # That's because we're going to encode it into ISO-Latin-1, and use |
360 | # that to test. | 441 | # that to test. |
@@ -463,11 +544,25 @@ class HTMLTreeBuilderSmokeTest(object): | |||
463 | 544 | ||
464 | class XMLTreeBuilderSmokeTest(object): | 545 | class XMLTreeBuilderSmokeTest(object): |
465 | 546 | ||
547 | def test_pickle_and_unpickle_identity(self): | ||
548 | # Pickling a tree, then unpickling it, yields a tree identical | ||
549 | # to the original. | ||
550 | tree = self.soup("<a><b>foo</a>") | ||
551 | dumped = pickle.dumps(tree, 2) | ||
552 | loaded = pickle.loads(dumped) | ||
553 | self.assertEqual(loaded.__class__, BeautifulSoup) | ||
554 | self.assertEqual(loaded.decode(), tree.decode()) | ||
555 | |||
466 | def test_docstring_generated(self): | 556 | def test_docstring_generated(self): |
467 | soup = self.soup("<root/>") | 557 | soup = self.soup("<root/>") |
468 | self.assertEqual( | 558 | self.assertEqual( |
469 | soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') | 559 | soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') |
470 | 560 | ||
561 | def test_xml_declaration(self): | ||
562 | markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>""" | ||
563 | soup = self.soup(markup) | ||
564 | self.assertEqual(markup, soup.encode("utf8")) | ||
565 | |||
471 | def test_real_xhtml_document(self): | 566 | def test_real_xhtml_document(self): |
472 | """A real XHTML document should come out *exactly* the same as it went in.""" | 567 | """A real XHTML document should come out *exactly* the same as it went in.""" |
473 | markup = b"""<?xml version="1.0" encoding="utf-8"?> | 568 | markup = b"""<?xml version="1.0" encoding="utf-8"?> |
@@ -485,7 +580,7 @@ class XMLTreeBuilderSmokeTest(object): | |||
485 | <script type="text/javascript"> | 580 | <script type="text/javascript"> |
486 | </script> | 581 | </script> |
487 | """ | 582 | """ |
488 | soup = BeautifulSoup(doc, "xml") | 583 | soup = BeautifulSoup(doc, "lxml-xml") |
489 | # lxml would have stripped this while parsing, but we can add | 584 | # lxml would have stripped this while parsing, but we can add |
490 | # it later. | 585 | # it later. |
491 | soup.script.string = 'console.log("< < hey > > ");' | 586 | soup.script.string = 'console.log("< < hey > > ");' |
@@ -493,15 +588,15 @@ class XMLTreeBuilderSmokeTest(object): | |||
493 | self.assertTrue(b"< < hey > >" in encoded) | 588 | self.assertTrue(b"< < hey > >" in encoded) |
494 | 589 | ||
495 | def test_can_parse_unicode_document(self): | 590 | def test_can_parse_unicode_document(self): |
496 | markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' | 591 | markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' |
497 | soup = self.soup(markup) | 592 | soup = self.soup(markup) |
498 | self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) | 593 | self.assertEqual('Sacr\xe9 bleu!', soup.root.string) |
499 | 594 | ||
500 | def test_popping_namespaced_tag(self): | 595 | def test_popping_namespaced_tag(self): |
501 | markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' | 596 | markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' |
502 | soup = self.soup(markup) | 597 | soup = self.soup(markup) |
503 | self.assertEqual( | 598 | self.assertEqual( |
504 | unicode(soup.rss), markup) | 599 | str(soup.rss), markup) |
505 | 600 | ||
506 | def test_docstring_includes_correct_encoding(self): | 601 | def test_docstring_includes_correct_encoding(self): |
507 | soup = self.soup("<root/>") | 602 | soup = self.soup("<root/>") |
@@ -532,17 +627,17 @@ class XMLTreeBuilderSmokeTest(object): | |||
532 | def test_closing_namespaced_tag(self): | 627 | def test_closing_namespaced_tag(self): |
533 | markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' | 628 | markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' |
534 | soup = self.soup(markup) | 629 | soup = self.soup(markup) |
535 | self.assertEqual(unicode(soup.p), markup) | 630 | self.assertEqual(str(soup.p), markup) |
536 | 631 | ||
537 | def test_namespaced_attributes(self): | 632 | def test_namespaced_attributes(self): |
538 | markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' | 633 | markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' |
539 | soup = self.soup(markup) | 634 | soup = self.soup(markup) |
540 | self.assertEqual(unicode(soup.foo), markup) | 635 | self.assertEqual(str(soup.foo), markup) |
541 | 636 | ||
542 | def test_namespaced_attributes_xml_namespace(self): | 637 | def test_namespaced_attributes_xml_namespace(self): |
543 | markup = '<foo xml:lang="fr">bar</foo>' | 638 | markup = '<foo xml:lang="fr">bar</foo>' |
544 | soup = self.soup(markup) | 639 | soup = self.soup(markup) |
545 | self.assertEqual(unicode(soup.foo), markup) | 640 | self.assertEqual(str(soup.foo), markup) |
546 | 641 | ||
547 | class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): | 642 | class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): |
548 | """Smoke test for a tree builder that supports HTML5.""" | 643 | """Smoke test for a tree builder that supports HTML5.""" |
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py index 92ad10fb04..90cad82933 100644 --- a/bitbake/lib/bs4/tests/test_builder_registry.py +++ b/bitbake/lib/bs4/tests/test_builder_registry.py | |||
@@ -1,6 +1,7 @@ | |||
1 | """Tests of the builder registry.""" | 1 | """Tests of the builder registry.""" |
2 | 2 | ||
3 | import unittest | 3 | import unittest |
4 | import warnings | ||
4 | 5 | ||
5 | from bs4 import BeautifulSoup | 6 | from bs4 import BeautifulSoup |
6 | from bs4.builder import ( | 7 | from bs4.builder import ( |
@@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase): | |||
67 | HTMLParserTreeBuilder) | 68 | HTMLParserTreeBuilder) |
68 | 69 | ||
69 | def test_beautifulsoup_constructor_does_lookup(self): | 70 | def test_beautifulsoup_constructor_does_lookup(self): |
70 | # You can pass in a string. | 71 | |
71 | BeautifulSoup("", features="html") | 72 | with warnings.catch_warnings(record=True) as w: |
72 | # Or a list of strings. | 73 | # This will create a warning about not explicitly |
73 | BeautifulSoup("", features=["html", "fast"]) | 74 | # specifying a parser, but we'll ignore it. |
75 | |||
76 | # You can pass in a string. | ||
77 | BeautifulSoup("", features="html") | ||
78 | # Or a list of strings. | ||
79 | BeautifulSoup("", features=["html", "fast"]) | ||
74 | 80 | ||
75 | # You'll get an exception if BS can't find an appropriate | 81 | # You'll get an exception if BS can't find an appropriate |
76 | # builder. | 82 | # builder. |
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py index 594c3e1f26..a7494ca5ba 100644 --- a/bitbake/lib/bs4/tests/test_html5lib.py +++ b/bitbake/lib/bs4/tests/test_html5lib.py | |||
@@ -5,7 +5,7 @@ import warnings | |||
5 | try: | 5 | try: |
6 | from bs4.builder import HTML5TreeBuilder | 6 | from bs4.builder import HTML5TreeBuilder |
7 | HTML5LIB_PRESENT = True | 7 | HTML5LIB_PRESENT = True |
8 | except ImportError, e: | 8 | except ImportError as e: |
9 | HTML5LIB_PRESENT = False | 9 | HTML5LIB_PRESENT = False |
10 | from bs4.element import SoupStrainer | 10 | from bs4.element import SoupStrainer |
11 | from bs4.testing import ( | 11 | from bs4.testing import ( |
@@ -74,12 +74,25 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): | |||
74 | def test_reparented_markup(self): | 74 | def test_reparented_markup(self): |
75 | markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' | 75 | markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' |
76 | soup = self.soup(markup) | 76 | soup = self.soup(markup) |
77 | self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) | 77 | self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) |
78 | self.assertEqual(2, len(soup.find_all('p'))) | 78 | self.assertEqual(2, len(soup.find_all('p'))) |
79 | 79 | ||
80 | 80 | ||
81 | def test_reparented_markup_ends_with_whitespace(self): | 81 | def test_reparented_markup_ends_with_whitespace(self): |
82 | markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' | 82 | markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' |
83 | soup = self.soup(markup) | 83 | soup = self.soup(markup) |
84 | self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) | 84 | self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) |
85 | self.assertEqual(2, len(soup.find_all('p'))) | 85 | self.assertEqual(2, len(soup.find_all('p'))) |
86 | |||
87 | def test_processing_instruction(self): | ||
88 | """Processing instructions become comments.""" | ||
89 | markup = b"""<?PITarget PIContent?>""" | ||
90 | soup = self.soup(markup) | ||
91 | assert str(soup).startswith("<!--?PITarget PIContent?-->") | ||
92 | |||
93 | def test_cloned_multivalue_node(self): | ||
94 | markup = b"""<a class="my_class"><p></a>""" | ||
95 | soup = self.soup(markup) | ||
96 | a1, a2 = soup.find_all('a') | ||
97 | self.assertEqual(a1, a2) | ||
98 | assert a1 is not a2 | ||
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py index bcb5ed232f..b45e35f999 100644 --- a/bitbake/lib/bs4/tests/test_htmlparser.py +++ b/bitbake/lib/bs4/tests/test_htmlparser.py | |||
@@ -1,6 +1,8 @@ | |||
1 | """Tests to ensure that the html.parser tree builder generates good | 1 | """Tests to ensure that the html.parser tree builder generates good |
2 | trees.""" | 2 | trees.""" |
3 | 3 | ||
4 | from pdb import set_trace | ||
5 | import pickle | ||
4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest | 6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest |
5 | from bs4.builder import HTMLParserTreeBuilder | 7 | from bs4.builder import HTMLParserTreeBuilder |
6 | 8 | ||
@@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): | |||
17 | def test_namespaced_public_doctype(self): | 19 | def test_namespaced_public_doctype(self): |
18 | # html.parser can't handle namespaced doctypes, so skip this one. | 20 | # html.parser can't handle namespaced doctypes, so skip this one. |
19 | pass | 21 | pass |
22 | |||
23 | def test_builder_is_pickled(self): | ||
24 | """Unlike most tree builders, HTMLParserTreeBuilder and will | ||
25 | be restored after pickling. | ||
26 | """ | ||
27 | tree = self.soup("<a><b>foo</a>") | ||
28 | dumped = pickle.dumps(tree, 2) | ||
29 | loaded = pickle.loads(dumped) | ||
30 | self.assertTrue(isinstance(loaded.builder, type(tree.builder))) | ||
31 | |||
32 | |||
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py index 2b2e9b7e78..6c2a1d73eb 100644 --- a/bitbake/lib/bs4/tests/test_lxml.py +++ b/bitbake/lib/bs4/tests/test_lxml.py | |||
@@ -7,7 +7,7 @@ try: | |||
7 | import lxml.etree | 7 | import lxml.etree |
8 | LXML_PRESENT = True | 8 | LXML_PRESENT = True |
9 | LXML_VERSION = lxml.etree.LXML_VERSION | 9 | LXML_VERSION = lxml.etree.LXML_VERSION |
10 | except ImportError, e: | 10 | except ImportError as e: |
11 | LXML_PRESENT = False | 11 | LXML_PRESENT = False |
12 | LXML_VERSION = (0,) | 12 | LXML_VERSION = (0,) |
13 | 13 | ||
@@ -62,24 +62,9 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): | |||
62 | # if one is installed. | 62 | # if one is installed. |
63 | with warnings.catch_warnings(record=True) as w: | 63 | with warnings.catch_warnings(record=True) as w: |
64 | soup = BeautifulStoneSoup("<b />") | 64 | soup = BeautifulStoneSoup("<b />") |
65 | self.assertEqual(u"<b/>", unicode(soup.b)) | 65 | self.assertEqual("<b/>", str(soup.b)) |
66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) | 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) |
67 | 67 | ||
68 | def test_real_xhtml_document(self): | ||
69 | """lxml strips the XML definition from an XHTML doc, which is fine.""" | ||
70 | markup = b"""<?xml version="1.0" encoding="utf-8"?> | ||
71 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> | ||
72 | <html xmlns="http://www.w3.org/1999/xhtml"> | ||
73 | <head><title>Hello.</title></head> | ||
74 | <body>Goodbye.</body> | ||
75 | </html>""" | ||
76 | soup = self.soup(markup) | ||
77 | self.assertEqual( | ||
78 | soup.encode("utf-8").replace(b"\n", b''), | ||
79 | markup.replace(b'\n', b'').replace( | ||
80 | b'<?xml version="1.0" encoding="utf-8"?>', b'')) | ||
81 | |||
82 | |||
83 | @skipIf( | 68 | @skipIf( |
84 | not LXML_PRESENT, | 69 | not LXML_PRESENT, |
85 | "lxml seems not to be present, not testing its XML tree builder.") | 70 | "lxml seems not to be present, not testing its XML tree builder.") |
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py index 47ac245f99..f87949e3d3 100644 --- a/bitbake/lib/bs4/tests/test_soup.py +++ b/bitbake/lib/bs4/tests/test_soup.py | |||
@@ -1,6 +1,7 @@ | |||
1 | # -*- coding: utf-8 -*- | 1 | # -*- coding: utf-8 -*- |
2 | """Tests of Beautiful Soup as a whole.""" | 2 | """Tests of Beautiful Soup as a whole.""" |
3 | 3 | ||
4 | from pdb import set_trace | ||
4 | import logging | 5 | import logging |
5 | import unittest | 6 | import unittest |
6 | import sys | 7 | import sys |
@@ -20,6 +21,7 @@ import bs4.dammit | |||
20 | from bs4.dammit import ( | 21 | from bs4.dammit import ( |
21 | EntitySubstitution, | 22 | EntitySubstitution, |
22 | UnicodeDammit, | 23 | UnicodeDammit, |
24 | EncodingDetector, | ||
23 | ) | 25 | ) |
24 | from bs4.testing import ( | 26 | from bs4.testing import ( |
25 | SoupTest, | 27 | SoupTest, |
@@ -30,7 +32,7 @@ import warnings | |||
30 | try: | 32 | try: |
31 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML | 33 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML |
32 | LXML_PRESENT = True | 34 | LXML_PRESENT = True |
33 | except ImportError, e: | 35 | except ImportError as e: |
34 | LXML_PRESENT = False | 36 | LXML_PRESENT = False |
35 | 37 | ||
36 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) | 38 | PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) |
@@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) | |||
39 | class TestConstructor(SoupTest): | 41 | class TestConstructor(SoupTest): |
40 | 42 | ||
41 | def test_short_unicode_input(self): | 43 | def test_short_unicode_input(self): |
42 | data = u"<h1>éé</h1>" | 44 | data = "<h1>éé</h1>" |
43 | soup = self.soup(data) | 45 | soup = self.soup(data) |
44 | self.assertEqual(u"éé", soup.h1.string) | 46 | self.assertEqual("éé", soup.h1.string) |
45 | 47 | ||
46 | def test_embedded_null(self): | 48 | def test_embedded_null(self): |
47 | data = u"<h1>foo\0bar</h1>" | 49 | data = "<h1>foo\0bar</h1>" |
48 | soup = self.soup(data) | 50 | soup = self.soup(data) |
49 | self.assertEqual(u"foo\0bar", soup.h1.string) | 51 | self.assertEqual("foo\0bar", soup.h1.string) |
50 | 52 | ||
53 | def test_exclude_encodings(self): | ||
54 | utf8_data = "Räksmörgås".encode("utf-8") | ||
55 | soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) | ||
56 | self.assertEqual("windows-1252", soup.original_encoding) | ||
51 | 57 | ||
52 | class TestDeprecatedConstructorArguments(SoupTest): | 58 | |
59 | class TestWarnings(SoupTest): | ||
60 | |||
61 | def _no_parser_specified(self, s, is_there=True): | ||
62 | v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) | ||
63 | self.assertTrue(v) | ||
64 | |||
65 | def test_warning_if_no_parser_specified(self): | ||
66 | with warnings.catch_warnings(record=True) as w: | ||
67 | soup = self.soup("<a><b></b></a>") | ||
68 | msg = str(w[0].message) | ||
69 | self._assert_no_parser_specified(msg) | ||
70 | |||
71 | def test_warning_if_parser_specified_too_vague(self): | ||
72 | with warnings.catch_warnings(record=True) as w: | ||
73 | soup = self.soup("<a><b></b></a>", "html") | ||
74 | msg = str(w[0].message) | ||
75 | self._assert_no_parser_specified(msg) | ||
76 | |||
77 | def test_no_warning_if_explicit_parser_specified(self): | ||
78 | with warnings.catch_warnings(record=True) as w: | ||
79 | soup = self.soup("<a><b></b></a>", "html.parser") | ||
80 | self.assertEqual([], w) | ||
53 | 81 | ||
54 | def test_parseOnlyThese_renamed_to_parse_only(self): | 82 | def test_parseOnlyThese_renamed_to_parse_only(self): |
55 | with warnings.catch_warnings(record=True) as w: | 83 | with warnings.catch_warnings(record=True) as w: |
@@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase): | |||
117 | def test_simple_html_substitution(self): | 145 | def test_simple_html_substitution(self): |
118 | # Unicode characters corresponding to named HTML entities | 146 | # Unicode characters corresponding to named HTML entities |
119 | # are substituted, and no others. | 147 | # are substituted, and no others. |
120 | s = u"foo\u2200\N{SNOWMAN}\u00f5bar" | 148 | s = "foo\u2200\N{SNOWMAN}\u00f5bar" |
121 | self.assertEqual(self.sub.substitute_html(s), | 149 | self.assertEqual(self.sub.substitute_html(s), |
122 | u"foo∀\N{SNOWMAN}õbar") | 150 | "foo∀\N{SNOWMAN}õbar") |
123 | 151 | ||
124 | def test_smart_quote_substitution(self): | 152 | def test_smart_quote_substitution(self): |
125 | # MS smart quotes are a common source of frustration, so we | 153 | # MS smart quotes are a common source of frustration, so we |
@@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest): | |||
184 | 212 | ||
185 | def setUp(self): | 213 | def setUp(self): |
186 | super(TestEncodingConversion, self).setUp() | 214 | super(TestEncodingConversion, self).setUp() |
187 | self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' | 215 | self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' |
188 | self.utf8_data = self.unicode_data.encode("utf-8") | 216 | self.utf8_data = self.unicode_data.encode("utf-8") |
189 | # Just so you know what it looks like. | 217 | # Just so you know what it looks like. |
190 | self.assertEqual( | 218 | self.assertEqual( |
@@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest): | |||
204 | ascii = b"<foo>a</foo>" | 232 | ascii = b"<foo>a</foo>" |
205 | soup_from_ascii = self.soup(ascii) | 233 | soup_from_ascii = self.soup(ascii) |
206 | unicode_output = soup_from_ascii.decode() | 234 | unicode_output = soup_from_ascii.decode() |
207 | self.assertTrue(isinstance(unicode_output, unicode)) | 235 | self.assertTrue(isinstance(unicode_output, str)) |
208 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) | 236 | self.assertEqual(unicode_output, self.document_for(ascii.decode())) |
209 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") | 237 | self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") |
210 | finally: | 238 | finally: |
@@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest): | |||
216 | # is not set. | 244 | # is not set. |
217 | soup_from_unicode = self.soup(self.unicode_data) | 245 | soup_from_unicode = self.soup(self.unicode_data) |
218 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) | 246 | self.assertEqual(soup_from_unicode.decode(), self.unicode_data) |
219 | self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') | 247 | self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') |
220 | self.assertEqual(soup_from_unicode.original_encoding, None) | 248 | self.assertEqual(soup_from_unicode.original_encoding, None) |
221 | 249 | ||
222 | def test_utf8_in_unicode_out(self): | 250 | def test_utf8_in_unicode_out(self): |
@@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest): | |||
224 | # attribute is set. | 252 | # attribute is set. |
225 | soup_from_utf8 = self.soup(self.utf8_data) | 253 | soup_from_utf8 = self.soup(self.utf8_data) |
226 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) | 254 | self.assertEqual(soup_from_utf8.decode(), self.unicode_data) |
227 | self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') | 255 | self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') |
228 | 256 | ||
229 | def test_utf8_out(self): | 257 | def test_utf8_out(self): |
230 | # The internal data structures can be encoded as UTF-8. | 258 | # The internal data structures can be encoded as UTF-8. |
@@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest): | |||
235 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, | 263 | PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, |
236 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") | 264 | "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") |
237 | def test_attribute_name_containing_unicode_characters(self): | 265 | def test_attribute_name_containing_unicode_characters(self): |
238 | markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' | 266 | markup = '<div><a \N{SNOWMAN}="snowman"></a></div>' |
239 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) | 267 | self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) |
240 | 268 | ||
241 | class TestUnicodeDammit(unittest.TestCase): | 269 | class TestUnicodeDammit(unittest.TestCase): |
242 | """Standalone tests of UnicodeDammit.""" | 270 | """Standalone tests of UnicodeDammit.""" |
243 | 271 | ||
244 | def test_unicode_input(self): | 272 | def test_unicode_input(self): |
245 | markup = u"I'm already Unicode! \N{SNOWMAN}" | 273 | markup = "I'm already Unicode! \N{SNOWMAN}" |
246 | dammit = UnicodeDammit(markup) | 274 | dammit = UnicodeDammit(markup) |
247 | self.assertEqual(dammit.unicode_markup, markup) | 275 | self.assertEqual(dammit.unicode_markup, markup) |
248 | 276 | ||
@@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase): | |||
250 | markup = b"<foo>\x91\x92\x93\x94</foo>" | 278 | markup = b"<foo>\x91\x92\x93\x94</foo>" |
251 | dammit = UnicodeDammit(markup) | 279 | dammit = UnicodeDammit(markup) |
252 | self.assertEqual( | 280 | self.assertEqual( |
253 | dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") | 281 | dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>") |
254 | 282 | ||
255 | def test_smart_quotes_to_xml_entities(self): | 283 | def test_smart_quotes_to_xml_entities(self): |
256 | markup = b"<foo>\x91\x92\x93\x94</foo>" | 284 | markup = b"<foo>\x91\x92\x93\x94</foo>" |
@@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase): | |||
271 | dammit.unicode_markup, """<foo>''""</foo>""") | 299 | dammit.unicode_markup, """<foo>''""</foo>""") |
272 | 300 | ||
273 | def test_detect_utf8(self): | 301 | def test_detect_utf8(self): |
274 | utf8 = b"\xc3\xa9" | 302 | utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" |
275 | dammit = UnicodeDammit(utf8) | 303 | dammit = UnicodeDammit(utf8) |
276 | self.assertEqual(dammit.unicode_markup, u'\xe9') | ||
277 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | 304 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
305 | self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') | ||
306 | |||
278 | 307 | ||
279 | def test_convert_hebrew(self): | 308 | def test_convert_hebrew(self): |
280 | hebrew = b"\xed\xe5\xec\xf9" | 309 | hebrew = b"\xed\xe5\xec\xf9" |
281 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) | 310 | dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) |
282 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') | 311 | self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') |
283 | self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') | 312 | self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') |
284 | 313 | ||
285 | def test_dont_see_smart_quotes_where_there_are_none(self): | 314 | def test_dont_see_smart_quotes_where_there_are_none(self): |
286 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" | 315 | utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" |
@@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase): | |||
289 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) | 318 | self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) |
290 | 319 | ||
291 | def test_ignore_inappropriate_codecs(self): | 320 | def test_ignore_inappropriate_codecs(self): |
292 | utf8_data = u"Räksmörgås".encode("utf-8") | 321 | utf8_data = "Räksmörgås".encode("utf-8") |
293 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) | 322 | dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) |
294 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | 323 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
295 | 324 | ||
296 | def test_ignore_invalid_codecs(self): | 325 | def test_ignore_invalid_codecs(self): |
297 | utf8_data = u"Räksmörgås".encode("utf-8") | 326 | utf8_data = "Räksmörgås".encode("utf-8") |
298 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: | 327 | for bad_encoding in ['.utf8', '...', 'utF---16.!']: |
299 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) | 328 | dammit = UnicodeDammit(utf8_data, [bad_encoding]) |
300 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | 329 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
301 | 330 | ||
331 | def test_exclude_encodings(self): | ||
332 | # This is UTF-8. | ||
333 | utf8_data = "Räksmörgås".encode("utf-8") | ||
334 | |||
335 | # But if we exclude UTF-8 from consideration, the guess is | ||
336 | # Windows-1252. | ||
337 | dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) | ||
338 | self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') | ||
339 | |||
340 | # And if we exclude that, there is no valid guess at all. | ||
341 | dammit = UnicodeDammit( | ||
342 | utf8_data, exclude_encodings=["utf-8", "windows-1252"]) | ||
343 | self.assertEqual(dammit.original_encoding, None) | ||
344 | |||
345 | def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): | ||
346 | detected = EncodingDetector( | ||
347 | b'<?xml version="1.0" encoding="UTF-\xdb" ?>') | ||
348 | encodings = list(detected.encodings) | ||
349 | assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings | ||
350 | |||
302 | def test_detect_html5_style_meta_tag(self): | 351 | def test_detect_html5_style_meta_tag(self): |
303 | 352 | ||
304 | for data in ( | 353 | for data in ( |
@@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase): | |||
337 | bs4.dammit.chardet_dammit = noop | 386 | bs4.dammit.chardet_dammit = noop |
338 | dammit = UnicodeDammit(doc) | 387 | dammit = UnicodeDammit(doc) |
339 | self.assertEqual(True, dammit.contains_replacement_characters) | 388 | self.assertEqual(True, dammit.contains_replacement_characters) |
340 | self.assertTrue(u"\ufffd" in dammit.unicode_markup) | 389 | self.assertTrue("\ufffd" in dammit.unicode_markup) |
341 | 390 | ||
342 | soup = BeautifulSoup(doc, "html.parser") | 391 | soup = BeautifulSoup(doc, "html.parser") |
343 | self.assertTrue(soup.contains_replacement_characters) | 392 | self.assertTrue(soup.contains_replacement_characters) |
@@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase): | |||
349 | # A document written in UTF-16LE will have its byte order marker stripped. | 398 | # A document written in UTF-16LE will have its byte order marker stripped. |
350 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' | 399 | data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' |
351 | dammit = UnicodeDammit(data) | 400 | dammit = UnicodeDammit(data) |
352 | self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) | 401 | self.assertEqual("<a>áé</a>", dammit.unicode_markup) |
353 | self.assertEqual("utf-16le", dammit.original_encoding) | 402 | self.assertEqual("utf-16le", dammit.original_encoding) |
354 | 403 | ||
355 | def test_detwingle(self): | 404 | def test_detwingle(self): |
356 | # Here's a UTF8 document. | 405 | # Here's a UTF8 document. |
357 | utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") | 406 | utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") |
358 | 407 | ||
359 | # Here's a Windows-1252 document. | 408 | # Here's a Windows-1252 document. |
360 | windows_1252 = ( | 409 | windows_1252 = ( |
361 | u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" | 410 | "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" |
362 | u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") | 411 | "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") |
363 | 412 | ||
364 | # Through some unholy alchemy, they've been stuck together. | 413 | # Through some unholy alchemy, they've been stuck together. |
365 | doc = utf8 + windows_1252 + utf8 | 414 | doc = utf8 + windows_1252 + utf8 |
@@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase): | |||
374 | 423 | ||
375 | fixed = UnicodeDammit.detwingle(doc) | 424 | fixed = UnicodeDammit.detwingle(doc) |
376 | self.assertEqual( | 425 | self.assertEqual( |
377 | u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) | 426 | "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) |
378 | 427 | ||
379 | def test_detwingle_ignores_multibyte_characters(self): | 428 | def test_detwingle_ignores_multibyte_characters(self): |
380 | # Each of these characters has a UTF-8 representation ending | 429 | # Each of these characters has a UTF-8 representation ending |
@@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase): | |||
382 | # Windows-1252. But our code knows to skip over multibyte | 431 | # Windows-1252. But our code knows to skip over multibyte |
383 | # UTF-8 characters, so they'll survive the process unscathed. | 432 | # UTF-8 characters, so they'll survive the process unscathed. |
384 | for tricky_unicode_char in ( | 433 | for tricky_unicode_char in ( |
385 | u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' | 434 | "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' |
386 | u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' | 435 | "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' |
387 | u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. | 436 | "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. |
388 | ): | 437 | ): |
389 | input = tricky_unicode_char.encode("utf8") | 438 | input = tricky_unicode_char.encode("utf8") |
390 | self.assertTrue(input.endswith(b'\x93')) | 439 | self.assertTrue(input.endswith(b'\x93')) |
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py index f8515c0ea1..6d3e67f311 100644 --- a/bitbake/lib/bs4/tests/test_tree.py +++ b/bitbake/lib/bs4/tests/test_tree.py | |||
@@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the | |||
9 | methods tested here. | 9 | methods tested here. |
10 | """ | 10 | """ |
11 | 11 | ||
12 | from pdb import set_trace | ||
12 | import copy | 13 | import copy |
13 | import pickle | 14 | import pickle |
14 | import re | 15 | import re |
@@ -19,8 +20,10 @@ from bs4.builder import ( | |||
19 | HTMLParserTreeBuilder, | 20 | HTMLParserTreeBuilder, |
20 | ) | 21 | ) |
21 | from bs4.element import ( | 22 | from bs4.element import ( |
23 | PY3K, | ||
22 | CData, | 24 | CData, |
23 | Comment, | 25 | Comment, |
26 | Declaration, | ||
24 | Doctype, | 27 | Doctype, |
25 | NavigableString, | 28 | NavigableString, |
26 | SoupStrainer, | 29 | SoupStrainer, |
@@ -67,8 +70,14 @@ class TestFind(TreeTest): | |||
67 | self.assertEqual(soup.find("b").string, "2") | 70 | self.assertEqual(soup.find("b").string, "2") |
68 | 71 | ||
69 | def test_unicode_text_find(self): | 72 | def test_unicode_text_find(self): |
70 | soup = self.soup(u'<h1>Räksmörgås</h1>') | 73 | soup = self.soup('<h1>Räksmörgås</h1>') |
71 | self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') | 74 | self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') |
75 | |||
76 | def test_unicode_attribute_find(self): | ||
77 | soup = self.soup('<h1 id="Räksmörgås">here it is</h1>') | ||
78 | str(soup) | ||
79 | self.assertEqual("here it is", soup.find(id='Räksmörgås').text) | ||
80 | |||
72 | 81 | ||
73 | def test_find_everything(self): | 82 | def test_find_everything(self): |
74 | """Test an optimization that finds all tags.""" | 83 | """Test an optimization that finds all tags.""" |
@@ -87,16 +96,17 @@ class TestFindAll(TreeTest): | |||
87 | """You can search the tree for text nodes.""" | 96 | """You can search the tree for text nodes.""" |
88 | soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") | 97 | soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") |
89 | # Exact match. | 98 | # Exact match. |
90 | self.assertEqual(soup.find_all(text="bar"), [u"bar"]) | 99 | self.assertEqual(soup.find_all(string="bar"), ["bar"]) |
100 | self.assertEqual(soup.find_all(text="bar"), ["bar"]) | ||
91 | # Match any of a number of strings. | 101 | # Match any of a number of strings. |
92 | self.assertEqual( | 102 | self.assertEqual( |
93 | soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) | 103 | soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) |
94 | # Match a regular expression. | 104 | # Match a regular expression. |
95 | self.assertEqual(soup.find_all(text=re.compile('.*')), | 105 | self.assertEqual(soup.find_all(text=re.compile('.*')), |
96 | [u"Foo", u"bar", u'\xbb']) | 106 | ["Foo", "bar", '\xbb']) |
97 | # Match anything. | 107 | # Match anything. |
98 | self.assertEqual(soup.find_all(text=True), | 108 | self.assertEqual(soup.find_all(text=True), |
99 | [u"Foo", u"bar", u'\xbb']) | 109 | ["Foo", "bar", '\xbb']) |
100 | 110 | ||
101 | def test_find_all_limit(self): | 111 | def test_find_all_limit(self): |
102 | """You can limit the number of items returned by find_all.""" | 112 | """You can limit the number of items returned by find_all.""" |
@@ -227,8 +237,8 @@ class TestFindAllByAttribute(TreeTest): | |||
227 | ["Matching a.", "Matching b."]) | 237 | ["Matching a.", "Matching b."]) |
228 | 238 | ||
229 | def test_find_all_by_utf8_attribute_value(self): | 239 | def test_find_all_by_utf8_attribute_value(self): |
230 | peace = u"םולש".encode("utf8") | 240 | peace = "םולש".encode("utf8") |
231 | data = u'<a title="םולש"></a>'.encode("utf8") | 241 | data = '<a title="םולש"></a>'.encode("utf8") |
232 | soup = self.soup(data) | 242 | soup = self.soup(data) |
233 | self.assertEqual([soup.a], soup.find_all(title=peace)) | 243 | self.assertEqual([soup.a], soup.find_all(title=peace)) |
234 | self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) | 244 | self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) |
@@ -688,7 +698,7 @@ class TestTagCreation(SoupTest): | |||
688 | 698 | ||
689 | def test_tag_inherits_self_closing_rules_from_builder(self): | 699 | def test_tag_inherits_self_closing_rules_from_builder(self): |
690 | if XML_BUILDER_PRESENT: | 700 | if XML_BUILDER_PRESENT: |
691 | xml_soup = BeautifulSoup("", "xml") | 701 | xml_soup = BeautifulSoup("", "lxml-xml") |
692 | xml_br = xml_soup.new_tag("br") | 702 | xml_br = xml_soup.new_tag("br") |
693 | xml_p = xml_soup.new_tag("p") | 703 | xml_p = xml_soup.new_tag("p") |
694 | 704 | ||
@@ -697,7 +707,7 @@ class TestTagCreation(SoupTest): | |||
697 | self.assertEqual(b"<br/>", xml_br.encode()) | 707 | self.assertEqual(b"<br/>", xml_br.encode()) |
698 | self.assertEqual(b"<p/>", xml_p.encode()) | 708 | self.assertEqual(b"<p/>", xml_p.encode()) |
699 | 709 | ||
700 | html_soup = BeautifulSoup("", "html") | 710 | html_soup = BeautifulSoup("", "html.parser") |
701 | html_br = html_soup.new_tag("br") | 711 | html_br = html_soup.new_tag("br") |
702 | html_p = html_soup.new_tag("p") | 712 | html_p = html_soup.new_tag("p") |
703 | 713 | ||
@@ -773,6 +783,14 @@ class TestTreeModification(SoupTest): | |||
773 | new_a = a.unwrap() | 783 | new_a = a.unwrap() |
774 | self.assertEqual(a, new_a) | 784 | self.assertEqual(a, new_a) |
775 | 785 | ||
786 | def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self): | ||
787 | soup = self.soup("<a><b>Foo</b></a><c>Bar</c>") | ||
788 | a = soup.a | ||
789 | a.extract() | ||
790 | self.assertEqual(None, a.parent) | ||
791 | self.assertRaises(ValueError, a.unwrap) | ||
792 | self.assertRaises(ValueError, a.replace_with, soup.c) | ||
793 | |||
776 | def test_replace_tag_with_itself(self): | 794 | def test_replace_tag_with_itself(self): |
777 | text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" | 795 | text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" |
778 | soup = self.soup(text) | 796 | soup = self.soup(text) |
@@ -1067,6 +1085,31 @@ class TestTreeModification(SoupTest): | |||
1067 | self.assertEqual(foo_2, soup.a.string) | 1085 | self.assertEqual(foo_2, soup.a.string) |
1068 | self.assertEqual(bar_2, soup.b.string) | 1086 | self.assertEqual(bar_2, soup.b.string) |
1069 | 1087 | ||
1088 | def test_extract_multiples_of_same_tag(self): | ||
1089 | soup = self.soup(""" | ||
1090 | <html> | ||
1091 | <head> | ||
1092 | <script>foo</script> | ||
1093 | </head> | ||
1094 | <body> | ||
1095 | <script>bar</script> | ||
1096 | <a></a> | ||
1097 | </body> | ||
1098 | <script>baz</script> | ||
1099 | </html>""") | ||
1100 | [soup.script.extract() for i in soup.find_all("script")] | ||
1101 | self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body)) | ||
1102 | |||
1103 | |||
1104 | def test_extract_works_when_element_is_surrounded_by_identical_strings(self): | ||
1105 | soup = self.soup( | ||
1106 | '<html>\n' | ||
1107 | '<body>hi</body>\n' | ||
1108 | '</html>') | ||
1109 | soup.find('body').extract() | ||
1110 | self.assertEqual(None, soup.find('body')) | ||
1111 | |||
1112 | |||
1070 | def test_clear(self): | 1113 | def test_clear(self): |
1071 | """Tag.clear()""" | 1114 | """Tag.clear()""" |
1072 | soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") | 1115 | soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") |
@@ -1287,27 +1330,72 @@ class TestPersistence(SoupTest): | |||
1287 | 1330 | ||
1288 | def test_unicode_pickle(self): | 1331 | def test_unicode_pickle(self): |
1289 | # A tree containing Unicode characters can be pickled. | 1332 | # A tree containing Unicode characters can be pickled. |
1290 | html = u"<b>\N{SNOWMAN}</b>" | 1333 | html = "<b>\N{SNOWMAN}</b>" |
1291 | soup = self.soup(html) | 1334 | soup = self.soup(html) |
1292 | dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) | 1335 | dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) |
1293 | loaded = pickle.loads(dumped) | 1336 | loaded = pickle.loads(dumped) |
1294 | self.assertEqual(loaded.decode(), soup.decode()) | 1337 | self.assertEqual(loaded.decode(), soup.decode()) |
1295 | 1338 | ||
1339 | def test_copy_navigablestring_is_not_attached_to_tree(self): | ||
1340 | html = "<b>Foo<a></a></b><b>Bar</b>" | ||
1341 | soup = self.soup(html) | ||
1342 | s1 = soup.find(string="Foo") | ||
1343 | s2 = copy.copy(s1) | ||
1344 | self.assertEqual(s1, s2) | ||
1345 | self.assertEqual(None, s2.parent) | ||
1346 | self.assertEqual(None, s2.next_element) | ||
1347 | self.assertNotEqual(None, s1.next_sibling) | ||
1348 | self.assertEqual(None, s2.next_sibling) | ||
1349 | self.assertEqual(None, s2.previous_element) | ||
1350 | |||
1351 | def test_copy_navigablestring_subclass_has_same_type(self): | ||
1352 | html = "<b><!--Foo--></b>" | ||
1353 | soup = self.soup(html) | ||
1354 | s1 = soup.string | ||
1355 | s2 = copy.copy(s1) | ||
1356 | self.assertEqual(s1, s2) | ||
1357 | self.assertTrue(isinstance(s2, Comment)) | ||
1358 | |||
1359 | def test_copy_entire_soup(self): | ||
1360 | html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" | ||
1361 | soup = self.soup(html) | ||
1362 | soup_copy = copy.copy(soup) | ||
1363 | self.assertEqual(soup, soup_copy) | ||
1364 | |||
1365 | def test_copy_tag_copies_contents(self): | ||
1366 | html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" | ||
1367 | soup = self.soup(html) | ||
1368 | div = soup.div | ||
1369 | div_copy = copy.copy(div) | ||
1370 | |||
1371 | # The two tags look the same, and evaluate to equal. | ||
1372 | self.assertEqual(str(div), str(div_copy)) | ||
1373 | self.assertEqual(div, div_copy) | ||
1374 | |||
1375 | # But they're not the same object. | ||
1376 | self.assertFalse(div is div_copy) | ||
1377 | |||
1378 | # And they don't have the same relation to the parse tree. The | ||
1379 | # copy is not associated with a parse tree at all. | ||
1380 | self.assertEqual(None, div_copy.parent) | ||
1381 | self.assertEqual(None, div_copy.previous_element) | ||
1382 | self.assertEqual(None, div_copy.find(string='Bar').next_element) | ||
1383 | self.assertNotEqual(None, div.find(string='Bar').next_element) | ||
1296 | 1384 | ||
1297 | class TestSubstitutions(SoupTest): | 1385 | class TestSubstitutions(SoupTest): |
1298 | 1386 | ||
1299 | def test_default_formatter_is_minimal(self): | 1387 | def test_default_formatter_is_minimal(self): |
1300 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 1388 | markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" |
1301 | soup = self.soup(markup) | 1389 | soup = self.soup(markup) |
1302 | decoded = soup.decode(formatter="minimal") | 1390 | decoded = soup.decode(formatter="minimal") |
1303 | # The < is converted back into &lt; but the e-with-acute is left alone. | 1391 | # The < is converted back into &lt; but the e-with-acute is left alone. |
1304 | self.assertEqual( | 1392 | self.assertEqual( |
1305 | decoded, | 1393 | decoded, |
1306 | self.document_for( | 1394 | self.document_for( |
1307 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | 1395 | "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) |
1308 | 1396 | ||
1309 | def test_formatter_html(self): | 1397 | def test_formatter_html(self): |
1310 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 1398 | markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" |
1311 | soup = self.soup(markup) | 1399 | soup = self.soup(markup) |
1312 | decoded = soup.decode(formatter="html") | 1400 | decoded = soup.decode(formatter="html") |
1313 | self.assertEqual( | 1401 | self.assertEqual( |
@@ -1315,49 +1403,49 @@ class TestSubstitutions(SoupTest): | |||
1315 | self.document_for("<b><<Sacré bleu!>></b>")) | 1403 | self.document_for("<b><<Sacré bleu!>></b>")) |
1316 | 1404 | ||
1317 | def test_formatter_minimal(self): | 1405 | def test_formatter_minimal(self): |
1318 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 1406 | markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" |
1319 | soup = self.soup(markup) | 1407 | soup = self.soup(markup) |
1320 | decoded = soup.decode(formatter="minimal") | 1408 | decoded = soup.decode(formatter="minimal") |
1321 | # The < is converted back into &lt; but the e-with-acute is left alone. | 1409 | # The < is converted back into &lt; but the e-with-acute is left alone. |
1322 | self.assertEqual( | 1410 | self.assertEqual( |
1323 | decoded, | 1411 | decoded, |
1324 | self.document_for( | 1412 | self.document_for( |
1325 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | 1413 | "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) |
1326 | 1414 | ||
1327 | def test_formatter_null(self): | 1415 | def test_formatter_null(self): |
1328 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 1416 | markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" |
1329 | soup = self.soup(markup) | 1417 | soup = self.soup(markup) |
1330 | decoded = soup.decode(formatter=None) | 1418 | decoded = soup.decode(formatter=None) |
1331 | # Neither the angle brackets nor the e-with-acute are converted. | 1419 | # Neither the angle brackets nor the e-with-acute are converted. |
1332 | # This is not valid HTML, but it's what the user wanted. | 1420 | # This is not valid HTML, but it's what the user wanted. |
1333 | self.assertEqual(decoded, | 1421 | self.assertEqual(decoded, |
1334 | self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | 1422 | self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) |
1335 | 1423 | ||
1336 | def test_formatter_custom(self): | 1424 | def test_formatter_custom(self): |
1337 | markup = u"<b><foo></b><b>bar</b>" | 1425 | markup = "<b><foo></b><b>bar</b>" |
1338 | soup = self.soup(markup) | 1426 | soup = self.soup(markup) |
1339 | decoded = soup.decode(formatter = lambda x: x.upper()) | 1427 | decoded = soup.decode(formatter = lambda x: x.upper()) |
1340 | # Instead of normal entity conversion code, the custom | 1428 | # Instead of normal entity conversion code, the custom |
1341 | # callable is called on every string. | 1429 | # callable is called on every string. |
1342 | self.assertEqual( | 1430 | self.assertEqual( |
1343 | decoded, | 1431 | decoded, |
1344 | self.document_for(u"<b><FOO></b><b>BAR</b>")) | 1432 | self.document_for("<b><FOO></b><b>BAR</b>")) |
1345 | 1433 | ||
1346 | def test_formatter_is_run_on_attribute_values(self): | 1434 | def test_formatter_is_run_on_attribute_values(self): |
1347 | markup = u'<a href="http://a.com?a=b&c=é">e</a>' | 1435 | markup = '<a href="http://a.com?a=b&c=é">e</a>' |
1348 | soup = self.soup(markup) | 1436 | soup = self.soup(markup) |
1349 | a = soup.a | 1437 | a = soup.a |
1350 | 1438 | ||
1351 | expect_minimal = u'<a href="http://a.com?a=b&c=é">e</a>' | 1439 | expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>' |
1352 | 1440 | ||
1353 | self.assertEqual(expect_minimal, a.decode()) | 1441 | self.assertEqual(expect_minimal, a.decode()) |
1354 | self.assertEqual(expect_minimal, a.decode(formatter="minimal")) | 1442 | self.assertEqual(expect_minimal, a.decode(formatter="minimal")) |
1355 | 1443 | ||
1356 | expect_html = u'<a href="http://a.com?a=b&c=é">e</a>' | 1444 | expect_html = '<a href="http://a.com?a=b&c=é">e</a>' |
1357 | self.assertEqual(expect_html, a.decode(formatter="html")) | 1445 | self.assertEqual(expect_html, a.decode(formatter="html")) |
1358 | 1446 | ||
1359 | self.assertEqual(markup, a.decode(formatter=None)) | 1447 | self.assertEqual(markup, a.decode(formatter=None)) |
1360 | expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' | 1448 | expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>' |
1361 | self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) | 1449 | self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) |
1362 | 1450 | ||
1363 | def test_formatter_skips_script_tag_for_html_documents(self): | 1451 | def test_formatter_skips_script_tag_for_html_documents(self): |
@@ -1366,7 +1454,7 @@ class TestSubstitutions(SoupTest): | |||
1366 | console.log("< < hey > > "); | 1454 | console.log("< < hey > > "); |
1367 | </script> | 1455 | </script> |
1368 | """ | 1456 | """ |
1369 | encoded = BeautifulSoup(doc).encode() | 1457 | encoded = BeautifulSoup(doc, 'html.parser').encode() |
1370 | self.assertTrue(b"< < hey > >" in encoded) | 1458 | self.assertTrue(b"< < hey > >" in encoded) |
1371 | 1459 | ||
1372 | def test_formatter_skips_style_tag_for_html_documents(self): | 1460 | def test_formatter_skips_style_tag_for_html_documents(self): |
@@ -1375,7 +1463,7 @@ class TestSubstitutions(SoupTest): | |||
1375 | console.log("< < hey > > "); | 1463 | console.log("< < hey > > "); |
1376 | </style> | 1464 | </style> |
1377 | """ | 1465 | """ |
1378 | encoded = BeautifulSoup(doc).encode() | 1466 | encoded = BeautifulSoup(doc, 'html.parser').encode() |
1379 | self.assertTrue(b"< < hey > >" in encoded) | 1467 | self.assertTrue(b"< < hey > >" in encoded) |
1380 | 1468 | ||
1381 | def test_prettify_leaves_preformatted_text_alone(self): | 1469 | def test_prettify_leaves_preformatted_text_alone(self): |
@@ -1383,24 +1471,24 @@ class TestSubstitutions(SoupTest): | |||
1383 | # Everything outside the <pre> tag is reformatted, but everything | 1471 | # Everything outside the <pre> tag is reformatted, but everything |
1384 | # inside is left alone. | 1472 | # inside is left alone. |
1385 | self.assertEqual( | 1473 | self.assertEqual( |
1386 | u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', | 1474 | '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', |
1387 | soup.div.prettify()) | 1475 | soup.div.prettify()) |
1388 | 1476 | ||
1389 | def test_prettify_accepts_formatter(self): | 1477 | def test_prettify_accepts_formatter(self): |
1390 | soup = BeautifulSoup("<html><body>foo</body></html>") | 1478 | soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') |
1391 | pretty = soup.prettify(formatter = lambda x: x.upper()) | 1479 | pretty = soup.prettify(formatter = lambda x: x.upper()) |
1392 | self.assertTrue("FOO" in pretty) | 1480 | self.assertTrue("FOO" in pretty) |
1393 | 1481 | ||
1394 | def test_prettify_outputs_unicode_by_default(self): | 1482 | def test_prettify_outputs_unicode_by_default(self): |
1395 | soup = self.soup("<a></a>") | 1483 | soup = self.soup("<a></a>") |
1396 | self.assertEqual(unicode, type(soup.prettify())) | 1484 | self.assertEqual(str, type(soup.prettify())) |
1397 | 1485 | ||
1398 | def test_prettify_can_encode_data(self): | 1486 | def test_prettify_can_encode_data(self): |
1399 | soup = self.soup("<a></a>") | 1487 | soup = self.soup("<a></a>") |
1400 | self.assertEqual(bytes, type(soup.prettify("utf-8"))) | 1488 | self.assertEqual(bytes, type(soup.prettify("utf-8"))) |
1401 | 1489 | ||
1402 | def test_html_entity_substitution_off_by_default(self): | 1490 | def test_html_entity_substitution_off_by_default(self): |
1403 | markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" | 1491 | markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" |
1404 | soup = self.soup(markup) | 1492 | soup = self.soup(markup) |
1405 | encoded = soup.b.encode("utf-8") | 1493 | encoded = soup.b.encode("utf-8") |
1406 | self.assertEqual(encoded, markup.encode('utf-8')) | 1494 | self.assertEqual(encoded, markup.encode('utf-8')) |
@@ -1444,45 +1532,53 @@ class TestEncoding(SoupTest): | |||
1444 | """Test the ability to encode objects into strings.""" | 1532 | """Test the ability to encode objects into strings.""" |
1445 | 1533 | ||
1446 | def test_unicode_string_can_be_encoded(self): | 1534 | def test_unicode_string_can_be_encoded(self): |
1447 | html = u"<b>\N{SNOWMAN}</b>" | 1535 | html = "<b>\N{SNOWMAN}</b>" |
1448 | soup = self.soup(html) | 1536 | soup = self.soup(html) |
1449 | self.assertEqual(soup.b.string.encode("utf-8"), | 1537 | self.assertEqual(soup.b.string.encode("utf-8"), |
1450 | u"\N{SNOWMAN}".encode("utf-8")) | 1538 | "\N{SNOWMAN}".encode("utf-8")) |
1451 | 1539 | ||
1452 | def test_tag_containing_unicode_string_can_be_encoded(self): | 1540 | def test_tag_containing_unicode_string_can_be_encoded(self): |
1453 | html = u"<b>\N{SNOWMAN}</b>" | 1541 | html = "<b>\N{SNOWMAN}</b>" |
1454 | soup = self.soup(html) | 1542 | soup = self.soup(html) |
1455 | self.assertEqual( | 1543 | self.assertEqual( |
1456 | soup.b.encode("utf-8"), html.encode("utf-8")) | 1544 | soup.b.encode("utf-8"), html.encode("utf-8")) |
1457 | 1545 | ||
1458 | def test_encoding_substitutes_unrecognized_characters_by_default(self): | 1546 | def test_encoding_substitutes_unrecognized_characters_by_default(self): |
1459 | html = u"<b>\N{SNOWMAN}</b>" | 1547 | html = "<b>\N{SNOWMAN}</b>" |
1460 | soup = self.soup(html) | 1548 | soup = self.soup(html) |
1461 | self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") | 1549 | self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") |
1462 | 1550 | ||
1463 | def test_encoding_can_be_made_strict(self): | 1551 | def test_encoding_can_be_made_strict(self): |
1464 | html = u"<b>\N{SNOWMAN}</b>" | 1552 | html = "<b>\N{SNOWMAN}</b>" |
1465 | soup = self.soup(html) | 1553 | soup = self.soup(html) |
1466 | self.assertRaises( | 1554 | self.assertRaises( |
1467 | UnicodeEncodeError, soup.encode, "ascii", errors="strict") | 1555 | UnicodeEncodeError, soup.encode, "ascii", errors="strict") |
1468 | 1556 | ||
1469 | def test_decode_contents(self): | 1557 | def test_decode_contents(self): |
1470 | html = u"<b>\N{SNOWMAN}</b>" | 1558 | html = "<b>\N{SNOWMAN}</b>" |
1471 | soup = self.soup(html) | 1559 | soup = self.soup(html) |
1472 | self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) | 1560 | self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents()) |
1473 | 1561 | ||
1474 | def test_encode_contents(self): | 1562 | def test_encode_contents(self): |
1475 | html = u"<b>\N{SNOWMAN}</b>" | 1563 | html = "<b>\N{SNOWMAN}</b>" |
1476 | soup = self.soup(html) | 1564 | soup = self.soup(html) |
1477 | self.assertEqual( | 1565 | self.assertEqual( |
1478 | u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( | 1566 | "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( |
1479 | encoding="utf8")) | 1567 | encoding="utf8")) |
1480 | 1568 | ||
1481 | def test_deprecated_renderContents(self): | 1569 | def test_deprecated_renderContents(self): |
1482 | html = u"<b>\N{SNOWMAN}</b>" | 1570 | html = "<b>\N{SNOWMAN}</b>" |
1483 | soup = self.soup(html) | 1571 | soup = self.soup(html) |
1484 | self.assertEqual( | 1572 | self.assertEqual( |
1485 | u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) | 1573 | "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) |
1574 | |||
1575 | def test_repr(self): | ||
1576 | html = "<b>\N{SNOWMAN}</b>" | ||
1577 | soup = self.soup(html) | ||
1578 | if PY3K: | ||
1579 | self.assertEqual(html, repr(soup)) | ||
1580 | else: | ||
1581 | self.assertEqual(b'<b>\\u2603</b>', repr(soup)) | ||
1486 | 1582 | ||
1487 | class TestNavigableStringSubclasses(SoupTest): | 1583 | class TestNavigableStringSubclasses(SoupTest): |
1488 | 1584 | ||
@@ -1522,6 +1618,9 @@ class TestNavigableStringSubclasses(SoupTest): | |||
1522 | soup.insert(1, doctype) | 1618 | soup.insert(1, doctype) |
1523 | self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") | 1619 | self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") |
1524 | 1620 | ||
1621 | def test_declaration(self): | ||
1622 | d = Declaration("foo") | ||
1623 | self.assertEqual("<?foo?>", d.output_ready()) | ||
1525 | 1624 | ||
1526 | class TestSoupSelector(TreeTest): | 1625 | class TestSoupSelector(TreeTest): |
1527 | 1626 | ||
@@ -1534,7 +1633,7 @@ class TestSoupSelector(TreeTest): | |||
1534 | <link rel="stylesheet" href="blah.css" type="text/css" id="l1"> | 1633 | <link rel="stylesheet" href="blah.css" type="text/css" id="l1"> |
1535 | </head> | 1634 | </head> |
1536 | <body> | 1635 | <body> |
1537 | 1636 | <custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> | |
1538 | <div id="main" class="fancy"> | 1637 | <div id="main" class="fancy"> |
1539 | <div id="inner"> | 1638 | <div id="inner"> |
1540 | <h1 id="header1">An H1</h1> | 1639 | <h1 id="header1">An H1</h1> |
@@ -1552,8 +1651,18 @@ class TestSoupSelector(TreeTest): | |||
1552 | <a href="#" id="s2a1">span2a1</a> | 1651 | <a href="#" id="s2a1">span2a1</a> |
1553 | </span> | 1652 | </span> |
1554 | <span class="span3"></span> | 1653 | <span class="span3"></span> |
1654 | <custom-dashed-tag class="dashed" id="dash2"/> | ||
1655 | <div data-tag="dashedvalue" id="data1"/> | ||
1555 | </span> | 1656 | </span> |
1556 | </div> | 1657 | </div> |
1658 | <x id="xid"> | ||
1659 | <z id="zida"/> | ||
1660 | <z id="zidab"/> | ||
1661 | <z id="zidac"/> | ||
1662 | </x> | ||
1663 | <y id="yid"> | ||
1664 | <z id="zidb"/> | ||
1665 | </y> | ||
1557 | <p lang="en" id="lang-en">English</p> | 1666 | <p lang="en" id="lang-en">English</p> |
1558 | <p lang="en-gb" id="lang-en-gb">English UK</p> | 1667 | <p lang="en-gb" id="lang-en-gb">English UK</p> |
1559 | <p lang="en-us" id="lang-en-us">English US</p> | 1668 | <p lang="en-us" id="lang-en-us">English US</p> |
@@ -1565,7 +1674,7 @@ class TestSoupSelector(TreeTest): | |||
1565 | """ | 1674 | """ |
1566 | 1675 | ||
1567 | def setUp(self): | 1676 | def setUp(self): |
1568 | self.soup = BeautifulSoup(self.HTML) | 1677 | self.soup = BeautifulSoup(self.HTML, 'html.parser') |
1569 | 1678 | ||
1570 | def assertSelects(self, selector, expected_ids): | 1679 | def assertSelects(self, selector, expected_ids): |
1571 | el_ids = [el['id'] for el in self.soup.select(selector)] | 1680 | el_ids = [el['id'] for el in self.soup.select(selector)] |
@@ -1587,21 +1696,29 @@ class TestSoupSelector(TreeTest): | |||
1587 | els = self.soup.select('title') | 1696 | els = self.soup.select('title') |
1588 | self.assertEqual(len(els), 1) | 1697 | self.assertEqual(len(els), 1) |
1589 | self.assertEqual(els[0].name, 'title') | 1698 | self.assertEqual(els[0].name, 'title') |
1590 | self.assertEqual(els[0].contents, [u'The title']) | 1699 | self.assertEqual(els[0].contents, ['The title']) |
1591 | 1700 | ||
1592 | def test_one_tag_many(self): | 1701 | def test_one_tag_many(self): |
1593 | els = self.soup.select('div') | 1702 | els = self.soup.select('div') |
1594 | self.assertEqual(len(els), 3) | 1703 | self.assertEqual(len(els), 4) |
1595 | for div in els: | 1704 | for div in els: |
1596 | self.assertEqual(div.name, 'div') | 1705 | self.assertEqual(div.name, 'div') |
1597 | 1706 | ||
1707 | el = self.soup.select_one('div') | ||
1708 | self.assertEqual('main', el['id']) | ||
1709 | |||
1710 | def test_select_one_returns_none_if_no_match(self): | ||
1711 | match = self.soup.select_one('nonexistenttag') | ||
1712 | self.assertEqual(None, match) | ||
1713 | |||
1714 | |||
1598 | def test_tag_in_tag_one(self): | 1715 | def test_tag_in_tag_one(self): |
1599 | els = self.soup.select('div div') | 1716 | els = self.soup.select('div div') |
1600 | self.assertSelects('div div', ['inner']) | 1717 | self.assertSelects('div div', ['inner', 'data1']) |
1601 | 1718 | ||
1602 | def test_tag_in_tag_many(self): | 1719 | def test_tag_in_tag_many(self): |
1603 | for selector in ('html div', 'html body div', 'body div'): | 1720 | for selector in ('html div', 'html body div', 'body div'): |
1604 | self.assertSelects(selector, ['main', 'inner', 'footer']) | 1721 | self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) |
1605 | 1722 | ||
1606 | def test_tag_no_match(self): | 1723 | def test_tag_no_match(self): |
1607 | self.assertEqual(len(self.soup.select('del')), 0) | 1724 | self.assertEqual(len(self.soup.select('del')), 0) |
@@ -1609,6 +1726,20 @@ class TestSoupSelector(TreeTest): | |||
1609 | def test_invalid_tag(self): | 1726 | def test_invalid_tag(self): |
1610 | self.assertRaises(ValueError, self.soup.select, 'tag%t') | 1727 | self.assertRaises(ValueError, self.soup.select, 'tag%t') |
1611 | 1728 | ||
1729 | def test_select_dashed_tag_ids(self): | ||
1730 | self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) | ||
1731 | |||
1732 | def test_select_dashed_by_id(self): | ||
1733 | dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') | ||
1734 | self.assertEqual(dashed[0].name, 'custom-dashed-tag') | ||
1735 | self.assertEqual(dashed[0]['id'], 'dash2') | ||
1736 | |||
1737 | def test_dashed_tag_text(self): | ||
1738 | self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') | ||
1739 | |||
1740 | def test_select_dashed_matches_find_all(self): | ||
1741 | self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) | ||
1742 | |||
1612 | def test_header_tags(self): | 1743 | def test_header_tags(self): |
1613 | self.assertSelectMultiple( | 1744 | self.assertSelectMultiple( |
1614 | ('h1', ['header1']), | 1745 | ('h1', ['header1']), |
@@ -1709,6 +1840,7 @@ class TestSoupSelector(TreeTest): | |||
1709 | ('[id^="m"]', ['me', 'main']), | 1840 | ('[id^="m"]', ['me', 'main']), |
1710 | ('div[id^="m"]', ['main']), | 1841 | ('div[id^="m"]', ['main']), |
1711 | ('a[id^="m"]', ['me']), | 1842 | ('a[id^="m"]', ['me']), |
1843 | ('div[data-tag^="dashed"]', ['data1']) | ||
1712 | ) | 1844 | ) |
1713 | 1845 | ||
1714 | def test_attribute_endswith(self): | 1846 | def test_attribute_endswith(self): |
@@ -1716,8 +1848,8 @@ class TestSoupSelector(TreeTest): | |||
1716 | ('[href$=".css"]', ['l1']), | 1848 | ('[href$=".css"]', ['l1']), |
1717 | ('link[href$=".css"]', ['l1']), | 1849 | ('link[href$=".css"]', ['l1']), |
1718 | ('link[id$="1"]', ['l1']), | 1850 | ('link[id$="1"]', ['l1']), |
1719 | ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), | 1851 | ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), |
1720 | ('div[id$="1"]', []), | 1852 | ('div[id$="1"]', ['data1']), |
1721 | ('[id$="noending"]', []), | 1853 | ('[id$="noending"]', []), |
1722 | ) | 1854 | ) |
1723 | 1855 | ||
@@ -1730,7 +1862,6 @@ class TestSoupSelector(TreeTest): | |||
1730 | ('[rel*="notstyle"]', []), | 1862 | ('[rel*="notstyle"]', []), |
1731 | ('link[rel*="notstyle"]', []), | 1863 | ('link[rel*="notstyle"]', []), |
1732 | ('link[href*="bla"]', ['l1']), | 1864 | ('link[href*="bla"]', ['l1']), |
1733 | ('a[href*="http://"]', ['bob', 'me']), | ||
1734 | ('[href*="http://"]', ['bob', 'me']), | 1865 | ('[href*="http://"]', ['bob', 'me']), |
1735 | ('[id*="p"]', ['pmulti', 'p1']), | 1866 | ('[id*="p"]', ['pmulti', 'p1']), |
1736 | ('div[id*="m"]', ['main']), | 1867 | ('div[id*="m"]', ['main']), |
@@ -1739,8 +1870,8 @@ class TestSoupSelector(TreeTest): | |||
1739 | ('[href*=".css"]', ['l1']), | 1870 | ('[href*=".css"]', ['l1']), |
1740 | ('link[href*=".css"]', ['l1']), | 1871 | ('link[href*=".css"]', ['l1']), |
1741 | ('link[id*="1"]', ['l1']), | 1872 | ('link[id*="1"]', ['l1']), |
1742 | ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), | 1873 | ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), |
1743 | ('div[id*="1"]', []), | 1874 | ('div[id*="1"]', ['data1']), |
1744 | ('[id*="noending"]', []), | 1875 | ('[id*="noending"]', []), |
1745 | # New for this test | 1876 | # New for this test |
1746 | ('[href*="."]', ['bob', 'me', 'l1']), | 1877 | ('[href*="."]', ['bob', 'me', 'l1']), |
@@ -1748,6 +1879,7 @@ class TestSoupSelector(TreeTest): | |||
1748 | ('link[href*="."]', ['l1']), | 1879 | ('link[href*="."]', ['l1']), |
1749 | ('div[id*="n"]', ['main', 'inner']), | 1880 | ('div[id*="n"]', ['main', 'inner']), |
1750 | ('div[id*="nn"]', ['inner']), | 1881 | ('div[id*="nn"]', ['inner']), |
1882 | ('div[data-tag*="edval"]', ['data1']) | ||
1751 | ) | 1883 | ) |
1752 | 1884 | ||
1753 | def test_attribute_exact_or_hypen(self): | 1885 | def test_attribute_exact_or_hypen(self): |
@@ -1767,18 +1899,27 @@ class TestSoupSelector(TreeTest): | |||
1767 | ('p[class]', ['p1', 'pmulti']), | 1899 | ('p[class]', ['p1', 'pmulti']), |
1768 | ('[blah]', []), | 1900 | ('[blah]', []), |
1769 | ('p[blah]', []), | 1901 | ('p[blah]', []), |
1902 | ('div[data-tag]', ['data1']) | ||
1770 | ) | 1903 | ) |
1771 | 1904 | ||
1905 | def test_unsupported_pseudoclass(self): | ||
1906 | self.assertRaises( | ||
1907 | NotImplementedError, self.soup.select, "a:no-such-pseudoclass") | ||
1908 | |||
1909 | self.assertRaises( | ||
1910 | NotImplementedError, self.soup.select, "a:nth-of-type(a)") | ||
1911 | |||
1912 | |||
1772 | def test_nth_of_type(self): | 1913 | def test_nth_of_type(self): |
1773 | # Try to select first paragraph | 1914 | # Try to select first paragraph |
1774 | els = self.soup.select('div#inner p:nth-of-type(1)') | 1915 | els = self.soup.select('div#inner p:nth-of-type(1)') |
1775 | self.assertEqual(len(els), 1) | 1916 | self.assertEqual(len(els), 1) |
1776 | self.assertEqual(els[0].string, u'Some text') | 1917 | self.assertEqual(els[0].string, 'Some text') |
1777 | 1918 | ||
1778 | # Try to select third paragraph | 1919 | # Try to select third paragraph |
1779 | els = self.soup.select('div#inner p:nth-of-type(3)') | 1920 | els = self.soup.select('div#inner p:nth-of-type(3)') |
1780 | self.assertEqual(len(els), 1) | 1921 | self.assertEqual(len(els), 1) |
1781 | self.assertEqual(els[0].string, u'Another') | 1922 | self.assertEqual(els[0].string, 'Another') |
1782 | 1923 | ||
1783 | # Try to select (non-existent!) fourth paragraph | 1924 | # Try to select (non-existent!) fourth paragraph |
1784 | els = self.soup.select('div#inner p:nth-of-type(4)') | 1925 | els = self.soup.select('div#inner p:nth-of-type(4)') |
@@ -1791,7 +1932,7 @@ class TestSoupSelector(TreeTest): | |||
1791 | def test_nth_of_type_direct_descendant(self): | 1932 | def test_nth_of_type_direct_descendant(self): |
1792 | els = self.soup.select('div#inner > p:nth-of-type(1)') | 1933 | els = self.soup.select('div#inner > p:nth-of-type(1)') |
1793 | self.assertEqual(len(els), 1) | 1934 | self.assertEqual(len(els), 1) |
1794 | self.assertEqual(els[0].string, u'Some text') | 1935 | self.assertEqual(els[0].string, 'Some text') |
1795 | 1936 | ||
1796 | def test_id_child_selector_nth_of_type(self): | 1937 | def test_id_child_selector_nth_of_type(self): |
1797 | self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) | 1938 | self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) |
@@ -1803,7 +1944,7 @@ class TestSoupSelector(TreeTest): | |||
1803 | selected = inner.select("div") | 1944 | selected = inner.select("div") |
1804 | # The <div id="inner"> tag was selected. The <div id="footer"> | 1945 | # The <div id="inner"> tag was selected. The <div id="footer"> |
1805 | # tag was not. | 1946 | # tag was not. |
1806 | self.assertSelectsIDs(selected, ['inner']) | 1947 | self.assertSelectsIDs(selected, ['inner', 'data1']) |
1807 | 1948 | ||
1808 | def test_overspecified_child_id(self): | 1949 | def test_overspecified_child_id(self): |
1809 | self.assertSelects(".fancy #inner", ['inner']) | 1950 | self.assertSelects(".fancy #inner", ['inner']) |
@@ -1827,3 +1968,44 @@ class TestSoupSelector(TreeTest): | |||
1827 | 1968 | ||
1828 | def test_sibling_combinator_wont_select_same_tag_twice(self): | 1969 | def test_sibling_combinator_wont_select_same_tag_twice(self): |
1829 | self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) | 1970 | self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) |
1971 | |||
1972 | # Test the selector grouping operator (the comma) | ||
1973 | def test_multiple_select(self): | ||
1974 | self.assertSelects('x, y', ['xid', 'yid']) | ||
1975 | |||
1976 | def test_multiple_select_with_no_space(self): | ||
1977 | self.assertSelects('x,y', ['xid', 'yid']) | ||
1978 | |||
1979 | def test_multiple_select_with_more_space(self): | ||
1980 | self.assertSelects('x, y', ['xid', 'yid']) | ||
1981 | |||
1982 | def test_multiple_select_duplicated(self): | ||
1983 | self.assertSelects('x, x', ['xid']) | ||
1984 | |||
1985 | def test_multiple_select_sibling(self): | ||
1986 | self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr']) | ||
1987 | |||
1988 | def test_multiple_select_tag_and_direct_descendant(self): | ||
1989 | self.assertSelects('x, y > z', ['xid', 'zidb']) | ||
1990 | |||
1991 | def test_multiple_select_direct_descendant_and_tags(self): | ||
1992 | self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) | ||
1993 | |||
1994 | def test_multiple_select_indirect_descendant(self): | ||
1995 | self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) | ||
1996 | |||
1997 | def test_invalid_multiple_select(self): | ||
1998 | self.assertRaises(ValueError, self.soup.select, ',x, y') | ||
1999 | self.assertRaises(ValueError, self.soup.select, 'x,,y') | ||
2000 | |||
2001 | def test_multiple_select_attrs(self): | ||
2002 | self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) | ||
2003 | |||
2004 | def test_multiple_select_ids(self): | ||
2005 | self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab']) | ||
2006 | |||
2007 | def test_multiple_select_nested(self): | ||
2008 | self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) | ||
2009 | |||
2010 | |||
2011 | |||