summary | refs | log | tree | commit | diff | stats
path: root/bitbake/lib/bs4/dammit.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
-rw-r--r-- bitbake/lib/bs4/dammit.py | 31
1 files changed, 21 insertions, 10 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py
index 59640b7ce3..68d419feb5 100644
--- a/bitbake/lib/bs4/dammit.py
+++ b/bitbake/lib/bs4/dammit.py
@@ -3,12 +3,14 @@
3 3
4This library converts a bytestream to Unicode through any means 4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal 5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and XML, but it does not rewrite the 6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job. 7XML or HTML to reflect a new encoding; that's the tree builder's job.
8""" 8"""
9__license__ = "MIT"
9 10
11from pdb import set_trace
10import codecs 12import codecs
11from htmlentitydefs import codepoint2name 13from html.entities import codepoint2name
12import re 14import re
13import logging 15import logging
14import string 16import string
@@ -56,7 +58,7 @@ class EntitySubstitution(object):
56 reverse_lookup = {} 58 reverse_lookup = {}
57 characters_for_re = [] 59 characters_for_re = []
58 for codepoint, name in list(codepoint2name.items()): 60 for codepoint, name in list(codepoint2name.items()):
59 character = unichr(codepoint) 61 character = chr(codepoint)
60 if codepoint != 34: 62 if codepoint != 34:
61 # There's no point in turning the quotation mark into 63 # There's no point in turning the quotation mark into
62 # ", unless it happens within an attribute value, which 64 # ", unless it happens within an attribute value, which
@@ -212,8 +214,11 @@ class EncodingDetector:
212 214
213 5. Windows-1252. 215 5. Windows-1252.
214 """ 216 """
215 def __init__(self, markup, override_encodings=None, is_html=False): 217 def __init__(self, markup, override_encodings=None, is_html=False,
218 exclude_encodings=None):
216 self.override_encodings = override_encodings or [] 219 self.override_encodings = override_encodings or []
220 exclude_encodings = exclude_encodings or []
221 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
217 self.chardet_encoding = None 222 self.chardet_encoding = None
218 self.is_html = is_html 223 self.is_html = is_html
219 self.declared_encoding = None 224 self.declared_encoding = None
@@ -224,6 +229,8 @@ class EncodingDetector:
224 def _usable(self, encoding, tried): 229 def _usable(self, encoding, tried):
225 if encoding is not None: 230 if encoding is not None:
226 encoding = encoding.lower() 231 encoding = encoding.lower()
232 if encoding in self.exclude_encodings:
233 return False
227 if encoding not in tried: 234 if encoding not in tried:
228 tried.add(encoding) 235 tried.add(encoding)
229 return True 236 return True
@@ -266,6 +273,9 @@ class EncodingDetector:
266 def strip_byte_order_mark(cls, data): 273 def strip_byte_order_mark(cls, data):
267 """If a byte-order mark is present, strip it and return the encoding it implies.""" 274 """If a byte-order mark is present, strip it and return the encoding it implies."""
268 encoding = None 275 encoding = None
276 if isinstance(data, str):
277 # Unicode data cannot have a byte-order mark.
278 return data, encoding
269 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ 279 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
270 and (data[2:4] != '\x00\x00'): 280 and (data[2:4] != '\x00\x00'):
271 encoding = 'utf-16be' 281 encoding = 'utf-16be'
@@ -306,7 +316,7 @@ class EncodingDetector:
306 declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) 316 declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
307 if declared_encoding_match is not None: 317 if declared_encoding_match is not None:
308 declared_encoding = declared_encoding_match.groups()[0].decode( 318 declared_encoding = declared_encoding_match.groups()[0].decode(
309 'ascii') 319 'ascii', 'replace')
310 if declared_encoding: 320 if declared_encoding:
311 return declared_encoding.lower() 321 return declared_encoding.lower()
312 return None 322 return None
@@ -331,18 +341,19 @@ class UnicodeDammit:
331 ] 341 ]
332 342
333 def __init__(self, markup, override_encodings=[], 343 def __init__(self, markup, override_encodings=[],
334 smart_quotes_to=None, is_html=False): 344 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
335 self.smart_quotes_to = smart_quotes_to 345 self.smart_quotes_to = smart_quotes_to
336 self.tried_encodings = [] 346 self.tried_encodings = []
337 self.contains_replacement_characters = False 347 self.contains_replacement_characters = False
338 self.is_html = is_html 348 self.is_html = is_html
339 349
340 self.detector = EncodingDetector(markup, override_encodings, is_html) 350 self.detector = EncodingDetector(
351 markup, override_encodings, is_html, exclude_encodings)
341 352
342 # Short-circuit if the data is in Unicode to begin with. 353 # Short-circuit if the data is in Unicode to begin with.
343 if isinstance(markup, unicode) or markup == '': 354 if isinstance(markup, str) or markup == '':
344 self.markup = markup 355 self.markup = markup
345 self.unicode_markup = unicode(markup) 356 self.unicode_markup = str(markup)
346 self.original_encoding = None 357 self.original_encoding = None
347 return 358 return
348 359
@@ -425,7 +436,7 @@ class UnicodeDammit:
425 def _to_unicode(self, data, encoding, errors="strict"): 436 def _to_unicode(self, data, encoding, errors="strict"):
426 '''Given a string and its encoding, decodes the string into Unicode. 437 '''Given a string and its encoding, decodes the string into Unicode.
427 %encoding is a string recognized by encodings.aliases''' 438 %encoding is a string recognized by encodings.aliases'''
428 return unicode(data, encoding, errors) 439 return str(data, encoding, errors)
429 440
430 @property 441 @property
431 def declared_html_encoding(self): 442 def declared_html_encoding(self):