diff options
| author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-05-06 09:06:51 +0100 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2016-06-02 08:24:02 +0100 |
| commit | 822eabf32dd69346071bd25fc3639db252d2f346 (patch) | |
| tree | edac6d1d0d5114a4e3c72fea5589c069453b72d2 /bitbake/lib/bs4/dammit.py | |
| parent | 4f8959324df3b89487973bd4e8de21debb0a12ef (diff) | |
| download | poky-822eabf32dd69346071bd25fc3639db252d2f346.tar.gz | |
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's
recommendation for Python 3 use.
(Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
| -rw-r--r-- | bitbake/lib/bs4/dammit.py | 31 |
1 files changed, 21 insertions, 10 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py index 59640b7ce3..68d419feb5 100644 --- a/bitbake/lib/bs4/dammit.py +++ b/bitbake/lib/bs4/dammit.py | |||
| @@ -3,12 +3,14 @@ | |||
| 3 | 3 | ||
| 4 | This library converts a bytestream to Unicode through any means | 4 | This library converts a bytestream to Unicode through any means |
| 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal |
| 6 | Feed Parser. It works best on XML and XML, but it does not rewrite the | 6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the |
| 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. |
| 8 | """ | 8 | """ |
| 9 | __license__ = "MIT" | ||
| 9 | 10 | ||
| 11 | from pdb import set_trace | ||
| 10 | import codecs | 12 | import codecs |
| 11 | from htmlentitydefs import codepoint2name | 13 | from html.entities import codepoint2name |
| 12 | import re | 14 | import re |
| 13 | import logging | 15 | import logging |
| 14 | import string | 16 | import string |
| @@ -56,7 +58,7 @@ class EntitySubstitution(object): | |||
| 56 | reverse_lookup = {} | 58 | reverse_lookup = {} |
| 57 | characters_for_re = [] | 59 | characters_for_re = [] |
| 58 | for codepoint, name in list(codepoint2name.items()): | 60 | for codepoint, name in list(codepoint2name.items()): |
| 59 | character = unichr(codepoint) | 61 | character = chr(codepoint) |
| 60 | if codepoint != 34: | 62 | if codepoint != 34: |
| 61 | # There's no point in turning the quotation mark into | 63 | # There's no point in turning the quotation mark into |
| 62 | # ", unless it happens within an attribute value, which | 64 | # ", unless it happens within an attribute value, which |
| @@ -212,8 +214,11 @@ class EncodingDetector: | |||
| 212 | 214 | ||
| 213 | 5. Windows-1252. | 215 | 5. Windows-1252. |
| 214 | """ | 216 | """ |
| 215 | def __init__(self, markup, override_encodings=None, is_html=False): | 217 | def __init__(self, markup, override_encodings=None, is_html=False, |
| 218 | exclude_encodings=None): | ||
| 216 | self.override_encodings = override_encodings or [] | 219 | self.override_encodings = override_encodings or [] |
| 220 | exclude_encodings = exclude_encodings or [] | ||
| 221 | self.exclude_encodings = set([x.lower() for x in exclude_encodings]) | ||
| 217 | self.chardet_encoding = None | 222 | self.chardet_encoding = None |
| 218 | self.is_html = is_html | 223 | self.is_html = is_html |
| 219 | self.declared_encoding = None | 224 | self.declared_encoding = None |
| @@ -224,6 +229,8 @@ class EncodingDetector: | |||
| 224 | def _usable(self, encoding, tried): | 229 | def _usable(self, encoding, tried): |
| 225 | if encoding is not None: | 230 | if encoding is not None: |
| 226 | encoding = encoding.lower() | 231 | encoding = encoding.lower() |
| 232 | if encoding in self.exclude_encodings: | ||
| 233 | return False | ||
| 227 | if encoding not in tried: | 234 | if encoding not in tried: |
| 228 | tried.add(encoding) | 235 | tried.add(encoding) |
| 229 | return True | 236 | return True |
| @@ -266,6 +273,9 @@ class EncodingDetector: | |||
| 266 | def strip_byte_order_mark(cls, data): | 273 | def strip_byte_order_mark(cls, data): |
| 267 | """If a byte-order mark is present, strip it and return the encoding it implies.""" | 274 | """If a byte-order mark is present, strip it and return the encoding it implies.""" |
| 268 | encoding = None | 275 | encoding = None |
| 276 | if isinstance(data, str): | ||
| 277 | # Unicode data cannot have a byte-order mark. | ||
| 278 | return data, encoding | ||
| 269 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ | 279 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ |
| 270 | and (data[2:4] != '\x00\x00'): | 280 | and (data[2:4] != '\x00\x00'): |
| 271 | encoding = 'utf-16be' | 281 | encoding = 'utf-16be' |
| @@ -306,7 +316,7 @@ class EncodingDetector: | |||
| 306 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) | 316 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) |
| 307 | if declared_encoding_match is not None: | 317 | if declared_encoding_match is not None: |
| 308 | declared_encoding = declared_encoding_match.groups()[0].decode( | 318 | declared_encoding = declared_encoding_match.groups()[0].decode( |
| 309 | 'ascii') | 319 | 'ascii', 'replace') |
| 310 | if declared_encoding: | 320 | if declared_encoding: |
| 311 | return declared_encoding.lower() | 321 | return declared_encoding.lower() |
| 312 | return None | 322 | return None |
| @@ -331,18 +341,19 @@ class UnicodeDammit: | |||
| 331 | ] | 341 | ] |
| 332 | 342 | ||
| 333 | def __init__(self, markup, override_encodings=[], | 343 | def __init__(self, markup, override_encodings=[], |
| 334 | smart_quotes_to=None, is_html=False): | 344 | smart_quotes_to=None, is_html=False, exclude_encodings=[]): |
| 335 | self.smart_quotes_to = smart_quotes_to | 345 | self.smart_quotes_to = smart_quotes_to |
| 336 | self.tried_encodings = [] | 346 | self.tried_encodings = [] |
| 337 | self.contains_replacement_characters = False | 347 | self.contains_replacement_characters = False |
| 338 | self.is_html = is_html | 348 | self.is_html = is_html |
| 339 | 349 | ||
| 340 | self.detector = EncodingDetector(markup, override_encodings, is_html) | 350 | self.detector = EncodingDetector( |
| 351 | markup, override_encodings, is_html, exclude_encodings) | ||
| 341 | 352 | ||
| 342 | # Short-circuit if the data is in Unicode to begin with. | 353 | # Short-circuit if the data is in Unicode to begin with. |
| 343 | if isinstance(markup, unicode) or markup == '': | 354 | if isinstance(markup, str) or markup == '': |
| 344 | self.markup = markup | 355 | self.markup = markup |
| 345 | self.unicode_markup = unicode(markup) | 356 | self.unicode_markup = str(markup) |
| 346 | self.original_encoding = None | 357 | self.original_encoding = None |
| 347 | return | 358 | return |
| 348 | 359 | ||
| @@ -425,7 +436,7 @@ class UnicodeDammit: | |||
| 425 | def _to_unicode(self, data, encoding, errors="strict"): | 436 | def _to_unicode(self, data, encoding, errors="strict"): |
| 426 | '''Given a string and its encoding, decodes the string into Unicode. | 437 | '''Given a string and its encoding, decodes the string into Unicode. |
| 427 | %encoding is a string recognized by encodings.aliases''' | 438 | %encoding is a string recognized by encodings.aliases''' |
| 428 | return unicode(data, encoding, errors) | 439 | return str(data, encoding, errors) |
| 429 | 440 | ||
| 430 | @property | 441 | @property |
| 431 | def declared_html_encoding(self): | 442 | def declared_html_encoding(self): |
