diff options
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
-rw-r--r-- | bitbake/lib/bs4/dammit.py | 31 |
1 files changed, 21 insertions, 10 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py index 59640b7ce3..68d419feb5 100644 --- a/bitbake/lib/bs4/dammit.py +++ b/bitbake/lib/bs4/dammit.py | |||
@@ -3,12 +3,14 @@ | |||
3 | 3 | ||
4 | This library converts a bytestream to Unicode through any means | 4 | This library converts a bytestream to Unicode through any means |
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal |
6 | Feed Parser. It works best on XML and XML, but it does not rewrite the | 6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the |
7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. |
8 | """ | 8 | """ |
9 | __license__ = "MIT" | ||
9 | 10 | ||
11 | from pdb import set_trace | ||
10 | import codecs | 12 | import codecs |
11 | from htmlentitydefs import codepoint2name | 13 | from html.entities import codepoint2name |
12 | import re | 14 | import re |
13 | import logging | 15 | import logging |
14 | import string | 16 | import string |
@@ -56,7 +58,7 @@ class EntitySubstitution(object): | |||
56 | reverse_lookup = {} | 58 | reverse_lookup = {} |
57 | characters_for_re = [] | 59 | characters_for_re = [] |
58 | for codepoint, name in list(codepoint2name.items()): | 60 | for codepoint, name in list(codepoint2name.items()): |
59 | character = unichr(codepoint) | 61 | character = chr(codepoint) |
60 | if codepoint != 34: | 62 | if codepoint != 34: |
61 | # There's no point in turning the quotation mark into | 63 | # There's no point in turning the quotation mark into |
62 | # &quot;, unless it happens within an attribute value, which | 64 | # &quot;, unless it happens within an attribute value, which |
@@ -212,8 +214,11 @@ class EncodingDetector: | |||
212 | 214 | ||
213 | 5. Windows-1252. | 215 | 5. Windows-1252. |
214 | """ | 216 | """ |
215 | def __init__(self, markup, override_encodings=None, is_html=False): | 217 | def __init__(self, markup, override_encodings=None, is_html=False, |
218 | exclude_encodings=None): | ||
216 | self.override_encodings = override_encodings or [] | 219 | self.override_encodings = override_encodings or [] |
220 | exclude_encodings = exclude_encodings or [] | ||
221 | self.exclude_encodings = set([x.lower() for x in exclude_encodings]) | ||
217 | self.chardet_encoding = None | 222 | self.chardet_encoding = None |
218 | self.is_html = is_html | 223 | self.is_html = is_html |
219 | self.declared_encoding = None | 224 | self.declared_encoding = None |
@@ -224,6 +229,8 @@ class EncodingDetector: | |||
224 | def _usable(self, encoding, tried): | 229 | def _usable(self, encoding, tried): |
225 | if encoding is not None: | 230 | if encoding is not None: |
226 | encoding = encoding.lower() | 231 | encoding = encoding.lower() |
232 | if encoding in self.exclude_encodings: | ||
233 | return False | ||
227 | if encoding not in tried: | 234 | if encoding not in tried: |
228 | tried.add(encoding) | 235 | tried.add(encoding) |
229 | return True | 236 | return True |
@@ -266,6 +273,9 @@ class EncodingDetector: | |||
266 | def strip_byte_order_mark(cls, data): | 273 | def strip_byte_order_mark(cls, data): |
267 | """If a byte-order mark is present, strip it and return the encoding it implies.""" | 274 | """If a byte-order mark is present, strip it and return the encoding it implies.""" |
268 | encoding = None | 275 | encoding = None |
276 | if isinstance(data, str): | ||
277 | # Unicode data cannot have a byte-order mark. | ||
278 | return data, encoding | ||
269 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ | 279 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ |
270 | and (data[2:4] != '\x00\x00'): | 280 | and (data[2:4] != '\x00\x00'): |
271 | encoding = 'utf-16be' | 281 | encoding = 'utf-16be' |
@@ -306,7 +316,7 @@ class EncodingDetector: | |||
306 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) | 316 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) |
307 | if declared_encoding_match is not None: | 317 | if declared_encoding_match is not None: |
308 | declared_encoding = declared_encoding_match.groups()[0].decode( | 318 | declared_encoding = declared_encoding_match.groups()[0].decode( |
309 | 'ascii') | 319 | 'ascii', 'replace') |
310 | if declared_encoding: | 320 | if declared_encoding: |
311 | return declared_encoding.lower() | 321 | return declared_encoding.lower() |
312 | return None | 322 | return None |
@@ -331,18 +341,19 @@ class UnicodeDammit: | |||
331 | ] | 341 | ] |
332 | 342 | ||
333 | def __init__(self, markup, override_encodings=[], | 343 | def __init__(self, markup, override_encodings=[], |
334 | smart_quotes_to=None, is_html=False): | 344 | smart_quotes_to=None, is_html=False, exclude_encodings=[]): |
335 | self.smart_quotes_to = smart_quotes_to | 345 | self.smart_quotes_to = smart_quotes_to |
336 | self.tried_encodings = [] | 346 | self.tried_encodings = [] |
337 | self.contains_replacement_characters = False | 347 | self.contains_replacement_characters = False |
338 | self.is_html = is_html | 348 | self.is_html = is_html |
339 | 349 | ||
340 | self.detector = EncodingDetector(markup, override_encodings, is_html) | 350 | self.detector = EncodingDetector( |
351 | markup, override_encodings, is_html, exclude_encodings) | ||
341 | 352 | ||
342 | # Short-circuit if the data is in Unicode to begin with. | 353 | # Short-circuit if the data is in Unicode to begin with. |
343 | if isinstance(markup, unicode) or markup == '': | 354 | if isinstance(markup, str) or markup == '': |
344 | self.markup = markup | 355 | self.markup = markup |
345 | self.unicode_markup = unicode(markup) | 356 | self.unicode_markup = str(markup) |
346 | self.original_encoding = None | 357 | self.original_encoding = None |
347 | return | 358 | return |
348 | 359 | ||
@@ -425,7 +436,7 @@ class UnicodeDammit: | |||
425 | def _to_unicode(self, data, encoding, errors="strict"): | 436 | def _to_unicode(self, data, encoding, errors="strict"): |
426 | '''Given a string and its encoding, decodes the string into Unicode. | 437 | '''Given a string and its encoding, decodes the string into Unicode. |
427 | %encoding is a string recognized by encodings.aliases''' | 438 | %encoding is a string recognized by encodings.aliases''' |
428 | return unicode(data, encoding, errors) | 439 | return str(data, encoding, errors) |
429 | 440 | ||
430 | @property | 441 | @property |
431 | def declared_html_encoding(self): | 442 | def declared_html_encoding(self): |