diff options
| author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-05-06 09:06:51 +0100 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-06-02 08:24:02 +0100 |
| commit | 822eabf32dd69346071bd25fc3639db252d2f346 (patch) | |
| tree | edac6d1d0d5114a4e3c72fea5589c069453b72d2 /bitbake/lib/bs4/builder | |
| parent | 4f8959324df3b89487973bd4e8de21debb0a12ef (diff) | |
| download | poky-822eabf32dd69346071bd25fc3639db252d2f346.tar.gz | |
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's
recommendation for Python 3 use.
(Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/builder')
| -rw-r--r-- | bitbake/lib/bs4/builder/__init__.py | 7 | ||||
| -rw-r--r-- | bitbake/lib/bs4/builder/_html5lib.py | 71 | ||||
| -rw-r--r-- | bitbake/lib/bs4/builder/_htmlparser.py | 56 | ||||
| -rw-r--r-- | bitbake/lib/bs4/builder/_lxml.py | 47 |
4 files changed, 125 insertions, 56 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py index 740f5f29cd..6ccd4d23d6 100644 --- a/bitbake/lib/bs4/builder/__init__.py +++ b/bitbake/lib/bs4/builder/__init__.py | |||
| @@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry() | |||
| 80 | class TreeBuilder(object): | 80 | class TreeBuilder(object): |
| 81 | """Turn a document into a Beautiful Soup object tree.""" | 81 | """Turn a document into a Beautiful Soup object tree.""" |
| 82 | 82 | ||
| 83 | NAME = "[Unknown tree builder]" | ||
| 84 | ALTERNATE_NAMES = [] | ||
| 83 | features = [] | 85 | features = [] |
| 84 | 86 | ||
| 85 | is_xml = False | 87 | is_xml = False |
| 88 | picklable = False | ||
| 86 | preserve_whitespace_tags = set() | 89 | preserve_whitespace_tags = set() |
| 87 | empty_element_tags = None # A tag will be considered an empty-element | 90 | empty_element_tags = None # A tag will be considered an empty-element |
| 88 | # tag when and only when it has no contents. | 91 | # tag when and only when it has no contents. |
| @@ -153,13 +156,13 @@ class TreeBuilder(object): | |||
| 153 | universal = self.cdata_list_attributes.get('*', []) | 156 | universal = self.cdata_list_attributes.get('*', []) |
| 154 | tag_specific = self.cdata_list_attributes.get( | 157 | tag_specific = self.cdata_list_attributes.get( |
| 155 | tag_name.lower(), None) | 158 | tag_name.lower(), None) |
| 156 | for attr in attrs.keys(): | 159 | for attr in list(attrs.keys()): |
| 157 | if attr in universal or (tag_specific and attr in tag_specific): | 160 | if attr in universal or (tag_specific and attr in tag_specific): |
| 158 | # We have a "class"-type attribute whose string | 161 | # We have a "class"-type attribute whose string |
| 159 | # value is a whitespace-separated list of | 162 | # value is a whitespace-separated list of |
| 160 | # values. Split it into a list. | 163 | # values. Split it into a list. |
| 161 | value = attrs[attr] | 164 | value = attrs[attr] |
| 162 | if isinstance(value, basestring): | 165 | if isinstance(value, str): |
| 163 | values = whitespace_re.split(value) | 166 | values = whitespace_re.split(value) |
| 164 | else: | 167 | else: |
| 165 | # html5lib sometimes calls setAttributes twice | 168 | # html5lib sometimes calls setAttributes twice |
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py index 7de36ae75e..f0e5924ebb 100644 --- a/bitbake/lib/bs4/builder/_html5lib.py +++ b/bitbake/lib/bs4/builder/_html5lib.py | |||
| @@ -2,6 +2,7 @@ __all__ = [ | |||
| 2 | 'HTML5TreeBuilder', | 2 | 'HTML5TreeBuilder', |
| 3 | ] | 3 | ] |
| 4 | 4 | ||
| 5 | from pdb import set_trace | ||
| 5 | import warnings | 6 | import warnings |
| 6 | from bs4.builder import ( | 7 | from bs4.builder import ( |
| 7 | PERMISSIVE, | 8 | PERMISSIVE, |
| @@ -9,7 +10,10 @@ from bs4.builder import ( | |||
| 9 | HTML_5, | 10 | HTML_5, |
| 10 | HTMLTreeBuilder, | 11 | HTMLTreeBuilder, |
| 11 | ) | 12 | ) |
| 12 | from bs4.element import NamespacedAttribute | 13 | from bs4.element import ( |
| 14 | NamespacedAttribute, | ||
| 15 | whitespace_re, | ||
| 16 | ) | ||
| 13 | import html5lib | 17 | import html5lib |
| 14 | from html5lib.constants import namespaces | 18 | from html5lib.constants import namespaces |
| 15 | from bs4.element import ( | 19 | from bs4.element import ( |
| @@ -22,11 +26,20 @@ from bs4.element import ( | |||
| 22 | class HTML5TreeBuilder(HTMLTreeBuilder): | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): |
| 23 | """Use html5lib to build a tree.""" | 27 | """Use html5lib to build a tree.""" |
| 24 | 28 | ||
| 25 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] | 29 | NAME = "html5lib" |
| 30 | |||
| 31 | features = [NAME, PERMISSIVE, HTML_5, HTML] | ||
| 26 | 32 | ||
| 27 | def prepare_markup(self, markup, user_specified_encoding): | 33 | def prepare_markup(self, markup, user_specified_encoding, |
| 34 | document_declared_encoding=None, exclude_encodings=None): | ||
| 28 | # Store the user-specified encoding for use later on. | 35 | # Store the user-specified encoding for use later on. |
| 29 | self.user_specified_encoding = user_specified_encoding | 36 | self.user_specified_encoding = user_specified_encoding |
| 37 | |||
| 38 | # document_declared_encoding and exclude_encodings aren't used | ||
| 39 | # ATM because the html5lib TreeBuilder doesn't use | ||
| 40 | # UnicodeDammit. | ||
| 41 | if exclude_encodings: | ||
| 42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") | ||
| 30 | yield (markup, None, None, False) | 43 | yield (markup, None, None, False) |
| 31 | 44 | ||
| 32 | # These methods are defined by Beautiful Soup. | 45 | # These methods are defined by Beautiful Soup. |
| @@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): | |||
| 37 | doc = parser.parse(markup, encoding=self.user_specified_encoding) | 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) |
| 38 | 51 | ||
| 39 | # Set the character encoding detected by the tokenizer. | 52 | # Set the character encoding detected by the tokenizer. |
| 40 | if isinstance(markup, unicode): | 53 | if isinstance(markup, str): |
| 41 | # We need to special-case this because html5lib sets | 54 | # We need to special-case this because html5lib sets |
| 42 | # charEncoding to UTF-8 if it gets Unicode input. | 55 | # charEncoding to UTF-8 if it gets Unicode input. |
| 43 | doc.original_encoding = None | 56 | doc.original_encoding = None |
| @@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): | |||
| 51 | 64 | ||
| 52 | def test_fragment_to_document(self, fragment): | 65 | def test_fragment_to_document(self, fragment): |
| 53 | """See `TreeBuilder`.""" | 66 | """See `TreeBuilder`.""" |
| 54 | return u'<html><head></head><body>%s</body></html>' % fragment | 67 | return '<html><head></head><body>%s</body></html>' % fragment |
| 55 | 68 | ||
| 56 | 69 | ||
| 57 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): |
| @@ -101,7 +114,16 @@ class AttrList(object): | |||
| 101 | def __iter__(self): | 114 | def __iter__(self): |
| 102 | return list(self.attrs.items()).__iter__() | 115 | return list(self.attrs.items()).__iter__() |
| 103 | def __setitem__(self, name, value): | 116 | def __setitem__(self, name, value): |
| 104 | "set attr", name, value | 117 | # If this attribute is a multi-valued attribute for this element, |
| 118 | # turn its value into a list. | ||
| 119 | list_attr = HTML5TreeBuilder.cdata_list_attributes | ||
| 120 | if (name in list_attr['*'] | ||
| 121 | or (self.element.name in list_attr | ||
| 122 | and name in list_attr[self.element.name])): | ||
| 123 | # A node that is being cloned may have already undergone | ||
| 124 | # this procedure. | ||
| 125 | if not isinstance(value, list): | ||
| 126 | value = whitespace_re.split(value) | ||
| 105 | self.element[name] = value | 127 | self.element[name] = value |
| 106 | def items(self): | 128 | def items(self): |
| 107 | return list(self.attrs.items()) | 129 | return list(self.attrs.items()) |
| @@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 124 | 146 | ||
| 125 | def appendChild(self, node): | 147 | def appendChild(self, node): |
| 126 | string_child = child = None | 148 | string_child = child = None |
| 127 | if isinstance(node, basestring): | 149 | if isinstance(node, str): |
| 128 | # Some other piece of code decided to pass in a string | 150 | # Some other piece of code decided to pass in a string |
| 129 | # instead of creating a TextElement object to contain the | 151 | # instead of creating a TextElement object to contain the |
| 130 | # string. | 152 | # string. |
| @@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 139 | else: | 161 | else: |
| 140 | child = node.element | 162 | child = node.element |
| 141 | 163 | ||
| 142 | if not isinstance(child, basestring) and child.parent is not None: | 164 | if not isinstance(child, str) and child.parent is not None: |
| 143 | node.element.extract() | 165 | node.element.extract() |
| 144 | 166 | ||
| 145 | if (string_child and self.element.contents | 167 | if (string_child and self.element.contents |
| @@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 152 | old_element.replace_with(new_element) | 174 | old_element.replace_with(new_element) |
| 153 | self.soup._most_recent_element = new_element | 175 | self.soup._most_recent_element = new_element |
| 154 | else: | 176 | else: |
| 155 | if isinstance(node, basestring): | 177 | if isinstance(node, str): |
| 156 | # Create a brand new NavigableString from this string. | 178 | # Create a brand new NavigableString from this string. |
| 157 | child = self.soup.new_string(node) | 179 | child = self.soup.new_string(node) |
| 158 | 180 | ||
| @@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 161 | # immediately after the parent, if it has no children.) | 183 | # immediately after the parent, if it has no children.) |
| 162 | if self.element.contents: | 184 | if self.element.contents: |
| 163 | most_recent_element = self.element._last_descendant(False) | 185 | most_recent_element = self.element._last_descendant(False) |
| 186 | elif self.element.next_element is not None: | ||
| 187 | # Something from further ahead in the parse tree is | ||
| 188 | # being inserted into this earlier element. This is | ||
| 189 | # very annoying because it means an expensive search | ||
| 190 | # for the last element in the tree. | ||
| 191 | most_recent_element = self.soup._last_descendant() | ||
| 164 | else: | 192 | else: |
| 165 | most_recent_element = self.element | 193 | most_recent_element = self.element |
| 166 | 194 | ||
| @@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 172 | return AttrList(self.element) | 200 | return AttrList(self.element) |
| 173 | 201 | ||
| 174 | def setAttributes(self, attributes): | 202 | def setAttributes(self, attributes): |
| 203 | |||
| 175 | if attributes is not None and len(attributes) > 0: | 204 | if attributes is not None and len(attributes) > 0: |
| 176 | 205 | ||
| 177 | converted_attributes = [] | 206 | converted_attributes = [] |
| @@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 183 | 212 | ||
| 184 | self.soup.builder._replace_cdata_list_attribute_values( | 213 | self.soup.builder._replace_cdata_list_attribute_values( |
| 185 | self.name, attributes) | 214 | self.name, attributes) |
| 186 | for name, value in attributes.items(): | 215 | for name, value in list(attributes.items()): |
| 187 | self.element[name] = value | 216 | self.element[name] = value |
| 188 | 217 | ||
| 189 | # The attributes may contain variables that need substitution. | 218 | # The attributes may contain variables that need substitution. |
| @@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 218 | 247 | ||
| 219 | def reparentChildren(self, new_parent): | 248 | def reparentChildren(self, new_parent): |
| 220 | """Move all of this tag's children into another tag.""" | 249 | """Move all of this tag's children into another tag.""" |
| 250 | # print "MOVE", self.element.contents | ||
| 251 | # print "FROM", self.element | ||
| 252 | # print "TO", new_parent.element | ||
| 221 | element = self.element | 253 | element = self.element |
| 222 | new_parent_element = new_parent.element | 254 | new_parent_element = new_parent.element |
| 223 | # Determine what this tag's next_element will be once all the children | 255 | # Determine what this tag's next_element will be once all the children |
| @@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 236 | new_parents_last_descendant_next_element = new_parent_element.next_element | 268 | new_parents_last_descendant_next_element = new_parent_element.next_element |
| 237 | 269 | ||
| 238 | to_append = element.contents | 270 | to_append = element.contents |
| 239 | append_after = new_parent.element.contents | 271 | append_after = new_parent_element.contents |
| 240 | if len(to_append) > 0: | 272 | if len(to_append) > 0: |
| 241 | # Set the first child's previous_element and previous_sibling | 273 | # Set the first child's previous_element and previous_sibling |
| 242 | # to elements within the new parent | 274 | # to elements within the new parent |
| 243 | first_child = to_append[0] | 275 | first_child = to_append[0] |
| 244 | first_child.previous_element = new_parents_last_descendant | 276 | if new_parents_last_descendant: |
| 277 | first_child.previous_element = new_parents_last_descendant | ||
| 278 | else: | ||
| 279 | first_child.previous_element = new_parent_element | ||
| 245 | first_child.previous_sibling = new_parents_last_child | 280 | first_child.previous_sibling = new_parents_last_child |
| 281 | if new_parents_last_descendant: | ||
| 282 | new_parents_last_descendant.next_element = first_child | ||
| 283 | else: | ||
| 284 | new_parent_element.next_element = first_child | ||
| 285 | if new_parents_last_child: | ||
| 286 | new_parents_last_child.next_sibling = first_child | ||
| 246 | 287 | ||
| 247 | # Fix the last child's next_element and next_sibling | 288 | # Fix the last child's next_element and next_sibling |
| 248 | last_child = to_append[-1] | 289 | last_child = to_append[-1] |
| 249 | last_child.next_element = new_parents_last_descendant_next_element | 290 | last_child.next_element = new_parents_last_descendant_next_element |
| 291 | if new_parents_last_descendant_next_element: | ||
| 292 | new_parents_last_descendant_next_element.previous_element = last_child | ||
| 250 | last_child.next_sibling = None | 293 | last_child.next_sibling = None |
| 251 | 294 | ||
| 252 | for child in to_append: | 295 | for child in to_append: |
| @@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node): | |||
| 257 | element.contents = [] | 300 | element.contents = [] |
| 258 | element.next_element = final_next_element | 301 | element.next_element = final_next_element |
| 259 | 302 | ||
| 303 | # print "DONE WITH MOVE" | ||
| 304 | # print "FROM", self.element | ||
| 305 | # print "TO", new_parent_element | ||
| 306 | |||
| 260 | def cloneNode(self): | 307 | def cloneNode(self): |
| 261 | tag = self.soup.new_tag(self.element.name, self.namespace) | 308 | tag = self.soup.new_tag(self.element.name, self.namespace) |
| 262 | node = Element(tag, self.soup, self.namespace) | 309 | node = Element(tag, self.soup, self.namespace) |
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py index ca8d8b892b..bb0a63f2f3 100644 --- a/bitbake/lib/bs4/builder/_htmlparser.py +++ b/bitbake/lib/bs4/builder/_htmlparser.py | |||
| @@ -4,10 +4,16 @@ __all__ = [ | |||
| 4 | 'HTMLParserTreeBuilder', | 4 | 'HTMLParserTreeBuilder', |
| 5 | ] | 5 | ] |
| 6 | 6 | ||
| 7 | from HTMLParser import ( | 7 | from html.parser import HTMLParser |
| 8 | HTMLParser, | 8 | |
| 9 | HTMLParseError, | 9 | try: |
| 10 | ) | 10 | from html.parser import HTMLParseError |
| 11 | except ImportError as e: | ||
| 12 | # HTMLParseError is removed in Python 3.5. Since it can never be | ||
| 13 | # thrown in 3.5, we can just define our own class as a placeholder. | ||
| 14 | class HTMLParseError(Exception): | ||
| 15 | pass | ||
| 16 | |||
| 11 | import sys | 17 | import sys |
| 12 | import warnings | 18 | import warnings |
| 13 | 19 | ||
| @@ -19,10 +25,10 @@ import warnings | |||
| 19 | # At the end of this file, we monkeypatch HTMLParser so that | 25 | # At the end of this file, we monkeypatch HTMLParser so that |
| 20 | # strict=True works well on Python 3.2.2. | 26 | # strict=True works well on Python 3.2.2. |
| 21 | major, minor, release = sys.version_info[:3] | 27 | major, minor, release = sys.version_info[:3] |
| 22 | CONSTRUCTOR_TAKES_STRICT = ( | 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 |
| 23 | major > 3 | 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 |
| 24 | or (major == 3 and minor > 2) | 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 |
| 25 | or (major == 3 and minor == 2 and release >= 3)) | 31 | |
| 26 | 32 | ||
| 27 | from bs4.element import ( | 33 | from bs4.element import ( |
| 28 | CData, | 34 | CData, |
| @@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
| 63 | 69 | ||
| 64 | def handle_charref(self, name): | 70 | def handle_charref(self, name): |
| 65 | # XXX workaround for a bug in HTMLParser. Remove this once | 71 | # XXX workaround for a bug in HTMLParser. Remove this once |
| 66 | # it's fixed. | 72 | # it's fixed in all supported versions. |
| 73 | # http://bugs.python.org/issue13633 | ||
| 67 | if name.startswith('x'): | 74 | if name.startswith('x'): |
| 68 | real_name = int(name.lstrip('x'), 16) | 75 | real_name = int(name.lstrip('x'), 16) |
| 69 | elif name.startswith('X'): | 76 | elif name.startswith('X'): |
| @@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
| 72 | real_name = int(name) | 79 | real_name = int(name) |
| 73 | 80 | ||
| 74 | try: | 81 | try: |
| 75 | data = unichr(real_name) | 82 | data = chr(real_name) |
| 76 | except (ValueError, OverflowError), e: | 83 | except (ValueError, OverflowError) as e: |
| 77 | data = u"\N{REPLACEMENT CHARACTER}" | 84 | data = "\N{REPLACEMENT CHARACTER}" |
| 78 | 85 | ||
| 79 | self.handle_data(data) | 86 | self.handle_data(data) |
| 80 | 87 | ||
| @@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
| 113 | 120 | ||
| 114 | def handle_pi(self, data): | 121 | def handle_pi(self, data): |
| 115 | self.soup.endData() | 122 | self.soup.endData() |
| 116 | if data.endswith("?") and data.lower().startswith("xml"): | ||
| 117 | # "An XHTML processing instruction using the trailing '?' | ||
| 118 | # will cause the '?' to be included in data." - HTMLParser | ||
| 119 | # docs. | ||
| 120 | # | ||
| 121 | # Strip the question mark so we don't end up with two | ||
| 122 | # question marks. | ||
| 123 | data = data[:-1] | ||
| 124 | self.soup.handle_data(data) | 123 | self.soup.handle_data(data) |
| 125 | self.soup.endData(ProcessingInstruction) | 124 | self.soup.endData(ProcessingInstruction) |
| 126 | 125 | ||
| @@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
| 128 | class HTMLParserTreeBuilder(HTMLTreeBuilder): | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): |
| 129 | 128 | ||
| 130 | is_xml = False | 129 | is_xml = False |
| 131 | features = [HTML, STRICT, HTMLPARSER] | 130 | picklable = True |
| 131 | NAME = HTMLPARSER | ||
| 132 | features = [NAME, HTML, STRICT] | ||
| 132 | 133 | ||
| 133 | def __init__(self, *args, **kwargs): | 134 | def __init__(self, *args, **kwargs): |
| 134 | if CONSTRUCTOR_TAKES_STRICT: | 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: |
| 135 | kwargs['strict'] = False | 136 | kwargs['strict'] = False |
| 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: | ||
| 138 | kwargs['convert_charrefs'] = False | ||
| 136 | self.parser_args = (args, kwargs) | 139 | self.parser_args = (args, kwargs) |
| 137 | 140 | ||
| 138 | def prepare_markup(self, markup, user_specified_encoding=None, | 141 | def prepare_markup(self, markup, user_specified_encoding=None, |
| 139 | document_declared_encoding=None): | 142 | document_declared_encoding=None, exclude_encodings=None): |
| 140 | """ | 143 | """ |
| 141 | :return: A 4-tuple (markup, original encoding, encoding | 144 | :return: A 4-tuple (markup, original encoding, encoding |
| 142 | declared within markup, whether any characters had to be | 145 | declared within markup, whether any characters had to be |
| 143 | replaced with REPLACEMENT CHARACTER). | 146 | replaced with REPLACEMENT CHARACTER). |
| 144 | """ | 147 | """ |
| 145 | if isinstance(markup, unicode): | 148 | if isinstance(markup, str): |
| 146 | yield (markup, None, None, False) | 149 | yield (markup, None, None, False) |
| 147 | return | 150 | return |
| 148 | 151 | ||
| 149 | try_encodings = [user_specified_encoding, document_declared_encoding] | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] |
| 150 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) | 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, |
| 154 | exclude_encodings=exclude_encodings) | ||
| 151 | yield (dammit.markup, dammit.original_encoding, | 155 | yield (dammit.markup, dammit.original_encoding, |
| 152 | dammit.declared_html_encoding, | 156 | dammit.declared_html_encoding, |
| 153 | dammit.contains_replacement_characters) | 157 | dammit.contains_replacement_characters) |
| @@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): | |||
| 158 | parser.soup = self.soup | 162 | parser.soup = self.soup |
| 159 | try: | 163 | try: |
| 160 | parser.feed(markup) | 164 | parser.feed(markup) |
| 161 | except HTMLParseError, e: | 165 | except HTMLParseError as e: |
| 162 | warnings.warn(RuntimeWarning( | 166 | warnings.warn(RuntimeWarning( |
| 163 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) | 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) |
| 164 | raise e | 168 | raise e |
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py index fa5d49875e..9c6c14ee65 100644 --- a/bitbake/lib/bs4/builder/_lxml.py +++ b/bitbake/lib/bs4/builder/_lxml.py | |||
| @@ -4,10 +4,15 @@ __all__ = [ | |||
| 4 | ] | 4 | ] |
| 5 | 5 | ||
| 6 | from io import BytesIO | 6 | from io import BytesIO |
| 7 | from StringIO import StringIO | 7 | from io import StringIO |
| 8 | import collections | 8 | import collections |
| 9 | from lxml import etree | 9 | from lxml import etree |
| 10 | from bs4.element import Comment, Doctype, NamespacedAttribute | 10 | from bs4.element import ( |
| 11 | Comment, | ||
| 12 | Doctype, | ||
| 13 | NamespacedAttribute, | ||
| 14 | ProcessingInstruction, | ||
| 15 | ) | ||
| 11 | from bs4.builder import ( | 16 | from bs4.builder import ( |
| 12 | FAST, | 17 | FAST, |
| 13 | HTML, | 18 | HTML, |
| @@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 25 | 30 | ||
| 26 | is_xml = True | 31 | is_xml = True |
| 27 | 32 | ||
| 33 | NAME = "lxml-xml" | ||
| 34 | ALTERNATE_NAMES = ["xml"] | ||
| 35 | |||
| 28 | # Well, it's permissive by XML parser standards. | 36 | # Well, it's permissive by XML parser standards. |
| 29 | features = [LXML, XML, FAST, PERMISSIVE] | 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] |
| 30 | 38 | ||
| 31 | CHUNK_SIZE = 512 | 39 | CHUNK_SIZE = 512 |
| 32 | 40 | ||
| @@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 70 | return (None, tag) | 78 | return (None, tag) |
| 71 | 79 | ||
| 72 | def prepare_markup(self, markup, user_specified_encoding=None, | 80 | def prepare_markup(self, markup, user_specified_encoding=None, |
| 81 | exclude_encodings=None, | ||
| 73 | document_declared_encoding=None): | 82 | document_declared_encoding=None): |
| 74 | """ | 83 | """ |
| 75 | :yield: A series of 4-tuples. | 84 | :yield: A series of 4-tuples. |
| @@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 78 | 87 | ||
| 79 | Each 4-tuple represents a strategy for parsing the document. | 88 | Each 4-tuple represents a strategy for parsing the document. |
| 80 | """ | 89 | """ |
| 81 | if isinstance(markup, unicode): | 90 | if isinstance(markup, str): |
| 82 | # We were given Unicode. Maybe lxml can parse Unicode on | 91 | # We were given Unicode. Maybe lxml can parse Unicode on |
| 83 | # this system? | 92 | # this system? |
| 84 | yield markup, None, document_declared_encoding, False | 93 | yield markup, None, document_declared_encoding, False |
| 85 | 94 | ||
| 86 | if isinstance(markup, unicode): | 95 | if isinstance(markup, str): |
| 87 | # No, apparently not. Convert the Unicode to UTF-8 and | 96 | # No, apparently not. Convert the Unicode to UTF-8 and |
| 88 | # tell lxml to parse it as UTF-8. | 97 | # tell lxml to parse it as UTF-8. |
| 89 | yield (markup.encode("utf8"), "utf8", | 98 | yield (markup.encode("utf8"), "utf8", |
| @@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 95 | # the document as each one in turn. | 104 | # the document as each one in turn. |
| 96 | is_html = not self.is_xml | 105 | is_html = not self.is_xml |
| 97 | try_encodings = [user_specified_encoding, document_declared_encoding] | 106 | try_encodings = [user_specified_encoding, document_declared_encoding] |
| 98 | detector = EncodingDetector(markup, try_encodings, is_html) | 107 | detector = EncodingDetector( |
| 108 | markup, try_encodings, is_html, exclude_encodings) | ||
| 99 | for encoding in detector.encodings: | 109 | for encoding in detector.encodings: |
| 100 | yield (detector.markup, encoding, document_declared_encoding, False) | 110 | yield (detector.markup, encoding, document_declared_encoding, False) |
| 101 | 111 | ||
| 102 | def feed(self, markup): | 112 | def feed(self, markup): |
| 103 | if isinstance(markup, bytes): | 113 | if isinstance(markup, bytes): |
| 104 | markup = BytesIO(markup) | 114 | markup = BytesIO(markup) |
| 105 | elif isinstance(markup, unicode): | 115 | elif isinstance(markup, str): |
| 106 | markup = StringIO(markup) | 116 | markup = StringIO(markup) |
| 107 | 117 | ||
| 108 | # Call feed() at least once, even if the markup is empty, | 118 | # Call feed() at least once, even if the markup is empty, |
| @@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 117 | if len(data) != 0: | 127 | if len(data) != 0: |
| 118 | self.parser.feed(data) | 128 | self.parser.feed(data) |
| 119 | self.parser.close() | 129 | self.parser.close() |
| 120 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
| 121 | raise ParserRejectedMarkup(str(e)) | 131 | raise ParserRejectedMarkup(str(e)) |
| 122 | 132 | ||
| 123 | def close(self): | 133 | def close(self): |
| @@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 135 | self.nsmaps.append(None) | 145 | self.nsmaps.append(None) |
| 136 | elif len(nsmap) > 0: | 146 | elif len(nsmap) > 0: |
| 137 | # A new namespace mapping has come into play. | 147 | # A new namespace mapping has come into play. |
| 138 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) | 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) |
| 139 | self.nsmaps.append(inverted_nsmap) | 149 | self.nsmaps.append(inverted_nsmap) |
| 140 | # Also treat the namespace mapping as a set of attributes on the | 150 | # Also treat the namespace mapping as a set of attributes on the |
| 141 | # tag, so we can recreate it later. | 151 | # tag, so we can recreate it later. |
| 142 | attrs = attrs.copy() | 152 | attrs = attrs.copy() |
| 143 | for prefix, namespace in nsmap.items(): | 153 | for prefix, namespace in list(nsmap.items()): |
| 144 | attribute = NamespacedAttribute( | 154 | attribute = NamespacedAttribute( |
| 145 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") | 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") |
| 146 | attrs[attribute] = namespace | 156 | attrs[attribute] = namespace |
| @@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 149 | # from lxml with namespaces attached to their names, and | 159 | # from lxml with namespaces attached to their names, and |
| 150 | # turn then into NamespacedAttribute objects. | 160 | # turn then into NamespacedAttribute objects. |
| 151 | new_attrs = {} | 161 | new_attrs = {} |
| 152 | for attr, value in attrs.items(): | 162 | for attr, value in list(attrs.items()): |
| 153 | namespace, attr = self._getNsTag(attr) | 163 | namespace, attr = self._getNsTag(attr) |
| 154 | if namespace is None: | 164 | if namespace is None: |
| 155 | new_attrs[attr] = value | 165 | new_attrs[attr] = value |
| @@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 189 | self.nsmaps.pop() | 199 | self.nsmaps.pop() |
| 190 | 200 | ||
| 191 | def pi(self, target, data): | 201 | def pi(self, target, data): |
| 192 | pass | 202 | self.soup.endData() |
| 203 | self.soup.handle_data(target + ' ' + data) | ||
| 204 | self.soup.endData(ProcessingInstruction) | ||
| 193 | 205 | ||
| 194 | def data(self, content): | 206 | def data(self, content): |
| 195 | self.soup.handle_data(content) | 207 | self.soup.handle_data(content) |
| @@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): | |||
| 207 | 219 | ||
| 208 | def test_fragment_to_document(self, fragment): | 220 | def test_fragment_to_document(self, fragment): |
| 209 | """See `TreeBuilder`.""" | 221 | """See `TreeBuilder`.""" |
| 210 | return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment | 222 | return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment |
| 211 | 223 | ||
| 212 | 224 | ||
| 213 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): |
| 214 | 226 | ||
| 215 | features = [LXML, HTML, FAST, PERMISSIVE] | 227 | NAME = LXML |
| 228 | ALTERNATE_NAMES = ["lxml-html"] | ||
| 229 | |||
| 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] | ||
| 216 | is_xml = False | 231 | is_xml = False |
| 217 | 232 | ||
| 218 | def default_parser(self, encoding): | 233 | def default_parser(self, encoding): |
| @@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): | |||
| 224 | self.parser = self.parser_for(encoding) | 239 | self.parser = self.parser_for(encoding) |
| 225 | self.parser.feed(markup) | 240 | self.parser.feed(markup) |
| 226 | self.parser.close() | 241 | self.parser.close() |
| 227 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: | 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: |
| 228 | raise ParserRejectedMarkup(str(e)) | 243 | raise ParserRejectedMarkup(str(e)) |
| 229 | 244 | ||
| 230 | 245 | ||
| 231 | def test_fragment_to_document(self, fragment): | 246 | def test_fragment_to_document(self, fragment): |
| 232 | """See `TreeBuilder`.""" | 247 | """See `TreeBuilder`.""" |
| 233 | return u'<html><body>%s</body></html>' % fragment | 248 | return '<html><body>%s</body></html>' % fragment |
