1 files changed, 21 insertions, 10 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py
index 59640b7ce3..68d419feb5 100644
--- a/bitbake/lib/bs4/dammit.py
+++ b/bitbake/lib/bs4/dammit.py
@@ -3,12 +3,14 @@
 This library converts a bytestream to Unicode through any means
 necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+__license__ = "MIT"
+from pdb import set_trace
 import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
 import re
 import logging
 import string
@@ -56,7 +58,7 @@ class EntitySubstitution(object):
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
-            character = unichr(codepoint)
+            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
@@ -212,8 +214,11 @@ class EncodingDetector:
    5. Windows-1252.
    """
-    def __init__(self, markup, override_encodings=None, is_html=False):
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
        self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None
@@ -224,6 +229,8 @@ class EncodingDetector:
    def _usable(self, encoding, tried):
        if encoding is not None:
            encoding = encoding.lower()
+            if encoding in self.exclude_encodings:
+                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
@@ -266,6 +273,9 @@ class EncodingDetector:
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies."""
        encoding = None
+        if isinstance(data, str):
+            # Unicode data cannot have a byte-order mark.
+            return data, encoding
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
@@ -306,7 +316,7 @@ class EncodingDetector:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii')
+                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
@@ -331,18 +341,19 @@ class UnicodeDammit:
        ]
    def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False):
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
-        self.detector = EncodingDetector(markup, override_encodings, is_html)
+        self.detector = EncodingDetector(
+            markup, override_encodings, is_html, exclude_encodings)
        # Short-circuit if the data is in Unicode to begin with.
-        if isinstance(markup, unicode) or markup == '':
+        if isinstance(markup, str) or markup == '':
            self.markup = markup
-            self.unicode_markup = unicode(markup)
+            self.unicode_markup = str(markup)
            self.original_encoding = None
            return
@@ -425,7 +436,7 @@ class UnicodeDammit:
    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
-        return unicode(data, encoding, errors)
+        return str(data, encoding, errors)
    @property
    def declared_html_encoding(self):