summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/dammit.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
-rw-r--r--bitbake/lib/bs4/dammit.py411
1 files changed, 337 insertions, 74 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py
index 7ad9e0dd1e..692433c57a 100644
--- a/bitbake/lib/bs4/dammit.py
+++ b/bitbake/lib/bs4/dammit.py
@@ -6,61 +6,185 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the 6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job. 7XML or HTML to reflect a new encoding; that's the tree builder's job.
8""" 8"""
9# Use of this source code is governed by the MIT license.
9__license__ = "MIT" 10__license__ = "MIT"
10 11
11import codecs
12from html.entities import codepoint2name 12from html.entities import codepoint2name
13from collections import defaultdict
14import codecs
13import re 15import re
14import logging 16import logging
15 17import string
16# Import a library to autodetect character encodings. 18
17chardet_type = None 19# Import a library to autodetect character encodings. We'll support
20# any of a number of libraries that all support the same API:
21#
22# * cchardet
23# * chardet
24# * charset-normalizer
25chardet_module = None
18try: 26try:
19 # First try the fast C implementation.
20 # PyPI package: cchardet 27 # PyPI package: cchardet
21 import cchardet 28 import cchardet as chardet_module
22 def chardet_dammit(s):
23 return cchardet.detect(s)['encoding']
24except ImportError: 29except ImportError:
25 try: 30 try:
26 # Fall back to the pure Python implementation
27 # Debian package: python-chardet 31 # Debian package: python-chardet
28 # PyPI package: chardet 32 # PyPI package: chardet
29 import chardet 33 import chardet as chardet_module
30 def chardet_dammit(s):
31 return chardet.detect(s)['encoding']
32 #import chardet.constants
33 #chardet.constants._debug = 1
34 except ImportError: 34 except ImportError:
35 # No chardet available. 35 try:
36 def chardet_dammit(s): 36 # PyPI package: charset-normalizer
37 import charset_normalizer as chardet_module
38 except ImportError:
39 # No chardet available.
40 chardet_module = None
41
42if chardet_module:
43 def chardet_dammit(s):
44 if isinstance(s, str):
37 return None 45 return None
46 return chardet_module.detect(s)['encoding']
47else:
48 def chardet_dammit(s):
49 return None
38 50
39xml_encoding_re = re.compile( 51# Build bytestring and Unicode versions of regular expressions for finding
40 r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) 52# a declared encoding inside an XML or HTML document.
41html_meta_re = re.compile( 53xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
42 r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) 54html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
55encoding_res = dict()
56encoding_res[bytes] = {
57 'html' : re.compile(html_meta.encode("ascii"), re.I),
58 'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
59}
60encoding_res[str] = {
61 'html' : re.compile(html_meta, re.I),
62 'xml' : re.compile(xml_encoding, re.I)
63}
64
65from html.entities import html5
43 66
44class EntitySubstitution(object): 67class EntitySubstitution(object):
45 68 """The ability to substitute XML or HTML entities for certain characters."""
46 """Substitute XML or HTML entities for the corresponding characters."""
47 69
48 def _populate_class_variables(): 70 def _populate_class_variables():
49 lookup = {} 71 """Initialize variables used by this class to manage the plethora of
50 reverse_lookup = {} 72 HTML5 named entities.
51 characters_for_re = [] 73
74 This function returns a 3-tuple containing two dictionaries
75 and a regular expression:
76
77 unicode_to_name - A mapping of Unicode strings like "⦨" to
78 entity names like "angmsdaa". When a single Unicode string has
79 multiple entity names, we try to choose the most commonly-used
80 name.
81
82 name_to_unicode: A mapping of entity names like "angmsdaa" to
83 Unicode strings like "⦨".
84
85 named_entity_re: A regular expression matching (almost) any
86 Unicode string that corresponds to an HTML5 named entity.
87 """
88 unicode_to_name = {}
89 name_to_unicode = {}
90
91 short_entities = set()
92 long_entities_by_first_character = defaultdict(set)
93
94 for name_with_semicolon, character in sorted(html5.items()):
95 # "It is intentional, for legacy compatibility, that many
96 # code points have multiple character reference names. For
97 # example, some appear both with and without the trailing
98 # semicolon, or with different capitalizations."
99 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
100 #
101 # The parsers are in charge of handling (or not) character
102 # references with no trailing semicolon, so we remove the
103 # semicolon whenever it appears.
104 if name_with_semicolon.endswith(';'):
105 name = name_with_semicolon[:-1]
106 else:
107 name = name_with_semicolon
108
109 # When parsing HTML, we want to recognize any known named
110 # entity and convert it to a sequence of Unicode
111 # characters.
112 if name not in name_to_unicode:
113 name_to_unicode[name] = character
114
115 # When _generating_ HTML, we want to recognize special
116 # character sequences that _could_ be converted to named
117 # entities.
118 unicode_to_name[character] = name
119
120 # We also need to build a regular expression that lets us
121 # _find_ those characters in output strings so we can
122 # replace them.
123 #
124 # This is tricky, for two reasons.
125
126 if (len(character) == 1 and ord(character) < 128
127 and character not in '<>&'):
128 # First, it would be annoying to turn single ASCII
129 # characters like | into named entities like
130 # &verbar;. The exceptions are <>&, which we _must_
131 # turn into named entities to produce valid HTML.
132 continue
133
134 if len(character) > 1 and all(ord(x) < 128 for x in character):
135 # We also do not want to turn _combinations_ of ASCII
136 # characters like 'fj' into named entities like '&fjlig;',
137 # though that's more debatable.
138 continue
139
140 # Second, some named entities have a Unicode value that's
141 # a subset of the Unicode value for some _other_ named
142 # entity. As an example, '\u2267' is &GreaterFullEqual;,
143 # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
144 # expression needs to match the first two characters of
145 # "\u2267\u0338foo", but only the first character of
146 # "\u2267foo".
147 #
148 # In this step, we build two sets of characters that
149 # _eventually_ need to go into the regular expression. But
150 # we won't know exactly what the regular expression needs
151 # to look like until we've gone through the entire list of
152 # named entities.
153 if len(character) == 1:
154 short_entities.add(character)
155 else:
156 long_entities_by_first_character[character[0]].add(character)
157
158 # Now that we've been through the entire list of entities, we
159 # can create a regular expression that matches any of them.
160 particles = set()
161 for short in short_entities:
162 long_versions = long_entities_by_first_character[short]
163 if not long_versions:
164 particles.add(short)
165 else:
166 ignore = "".join([x[1] for x in long_versions])
167 # This finds, e.g. \u2267 but only if it is _not_
168 # followed by \u0338.
169 particles.add("%s(?![%s])" % (short, ignore))
170
171 for long_entities in list(long_entities_by_first_character.values()):
172 for long_entity in long_entities:
173 particles.add(long_entity)
174
175 re_definition = "(%s)" % "|".join(particles)
176
177 # If an entity shows up in both html5 and codepoint2name, it's
178 # likely that HTML5 gives it several different names, such as
179 # 'rsquo' and 'rsquor'. When converting Unicode characters to
180 # named entities, the codepoint2name name should take
181 # precedence where possible, since that's the more easily
182 # recognizable one.
52 for codepoint, name in list(codepoint2name.items()): 183 for codepoint, name in list(codepoint2name.items()):
53 character = chr(codepoint) 184 character = chr(codepoint)
54 if codepoint != 34: 185 unicode_to_name[character] = name
55 # There's no point in turning the quotation mark into 186
56 # &quot;, unless it happens within an attribute value, which 187 return unicode_to_name, name_to_unicode, re.compile(re_definition)
57 # is handled elsewhere.
58 characters_for_re.append(character)
59 lookup[character] = name
60 # But we do want to turn &quot; into the quotation mark.
61 reverse_lookup[name] = character
62 re_definition = "[%s]" % "".join(characters_for_re)
63 return lookup, reverse_lookup, re.compile(re_definition)
64 (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, 188 (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
65 CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() 189 CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
66 190
@@ -72,21 +196,23 @@ class EntitySubstitution(object):
72 ">": "gt", 196 ">": "gt",
73 } 197 }
74 198
75 BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|" 199 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
76 r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 200 "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
77 r")") 201 ")")
78 202
79 AMPERSAND_OR_BRACKET = re.compile(r"([<>&])") 203 AMPERSAND_OR_BRACKET = re.compile("([<>&])")
80 204
81 @classmethod 205 @classmethod
82 def _substitute_html_entity(cls, matchobj): 206 def _substitute_html_entity(cls, matchobj):
207 """Used with a regular expression to substitute the
208 appropriate HTML entity for a special character string."""
83 entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) 209 entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
84 return "&%s;" % entity 210 return "&%s;" % entity
85 211
86 @classmethod 212 @classmethod
87 def _substitute_xml_entity(cls, matchobj): 213 def _substitute_xml_entity(cls, matchobj):
88 """Used with a regular expression to substitute the 214 """Used with a regular expression to substitute the
89 appropriate XML entity for an XML special character.""" 215 appropriate XML entity for a special character string."""
90 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] 216 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
91 return "&%s;" % entity 217 return "&%s;" % entity
92 218
@@ -181,6 +307,8 @@ class EntitySubstitution(object):
181 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that 307 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
182 character with "&eacute;" will make it more readable to some 308 character with "&eacute;" will make it more readable to some
183 people. 309 people.
310
311 :param s: A Unicode string.
184 """ 312 """
185 return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( 313 return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
186 cls._substitute_html_entity, s) 314 cls._substitute_html_entity, s)
@@ -192,23 +320,65 @@ class EncodingDetector:
192 Order of precedence: 320 Order of precedence:
193 321
194 1. Encodings you specifically tell EncodingDetector to try first 322 1. Encodings you specifically tell EncodingDetector to try first
195 (the override_encodings argument to the constructor). 323 (the known_definite_encodings argument to the constructor).
324
325 2. An encoding determined by sniffing the document's byte-order mark.
326
327 3. Encodings you specifically tell EncodingDetector to try if
328 byte-order mark sniffing fails (the user_encodings argument to the
329 constructor).
196 330
197 2. An encoding declared within the bytestring itself, either in an 331 4. An encoding declared within the bytestring itself, either in an
198 XML declaration (if the bytestring is to be interpreted as an XML 332 XML declaration (if the bytestring is to be interpreted as an XML
199 document), or in a <meta> tag (if the bytestring is to be 333 document), or in a <meta> tag (if the bytestring is to be
200 interpreted as an HTML document.) 334 interpreted as an HTML document.)
201 335
202 3. An encoding detected through textual analysis by chardet, 336 5. An encoding detected through textual analysis by chardet,
203 cchardet, or a similar external library. 337 cchardet, or a similar external library.
204 338
205 4. UTF-8. 339 6. UTF-8.
206 340
207 5. Windows-1252. 341 7. Windows-1252.
342
208 """ 343 """
209 def __init__(self, markup, override_encodings=None, is_html=False, 344 def __init__(self, markup, known_definite_encodings=None,
210 exclude_encodings=None): 345 is_html=False, exclude_encodings=None,
211 self.override_encodings = override_encodings or [] 346 user_encodings=None, override_encodings=None):
347 """Constructor.
348
349 :param markup: Some markup in an unknown encoding.
350
351 :param known_definite_encodings: When determining the encoding
352 of `markup`, these encodings will be tried first, in
353 order. In HTML terms, this corresponds to the "known
354 definite encoding" step defined here:
355 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
356
357 :param user_encodings: These encodings will be tried after the
358 `known_definite_encodings` have been tried and failed, and
359 after an attempt to sniff the encoding by looking at a
360 byte order mark has failed. In HTML terms, this
361 corresponds to the step "user has explicitly instructed
362 the user agent to override the document's character
363 encoding", defined here:
364 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
365
366 :param override_encodings: A deprecated alias for
367 known_definite_encodings. Any encodings here will be tried
368 immediately after the encodings in
369 known_definite_encodings.
370
371 :param is_html: If True, this markup is considered to be
372 HTML. Otherwise it's assumed to be XML.
373
374 :param exclude_encodings: These encodings will not be tried,
375 even if they otherwise would be.
376
377 """
378 self.known_definite_encodings = list(known_definite_encodings or [])
379 if override_encodings:
380 self.known_definite_encodings += override_encodings
381 self.user_encodings = user_encodings or []
212 exclude_encodings = exclude_encodings or [] 382 exclude_encodings = exclude_encodings or []
213 self.exclude_encodings = set([x.lower() for x in exclude_encodings]) 383 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
214 self.chardet_encoding = None 384 self.chardet_encoding = None
@@ -219,6 +389,12 @@ class EncodingDetector:
219 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) 389 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
220 390
221 def _usable(self, encoding, tried): 391 def _usable(self, encoding, tried):
392 """Should we even bother to try this encoding?
393
394 :param encoding: Name of an encoding.
395 :param tried: Encodings that have already been tried. This will be modified
396 as a side effect.
397 """
222 if encoding is not None: 398 if encoding is not None:
223 encoding = encoding.lower() 399 encoding = encoding.lower()
224 if encoding in self.exclude_encodings: 400 if encoding in self.exclude_encodings:
@@ -230,9 +406,14 @@ class EncodingDetector:
230 406
231 @property 407 @property
232 def encodings(self): 408 def encodings(self):
233 """Yield a number of encodings that might work for this markup.""" 409 """Yield a number of encodings that might work for this markup.
410
411 :yield: A sequence of strings.
412 """
234 tried = set() 413 tried = set()
235 for e in self.override_encodings: 414
415 # First, try the known definite encodings
416 for e in self.known_definite_encodings:
236 if self._usable(e, tried): 417 if self._usable(e, tried):
237 yield e 418 yield e
238 419
@@ -241,6 +422,12 @@ class EncodingDetector:
241 if self._usable(self.sniffed_encoding, tried): 422 if self._usable(self.sniffed_encoding, tried):
242 yield self.sniffed_encoding 423 yield self.sniffed_encoding
243 424
425 # Sniffing the byte-order mark did nothing; try the user
426 # encodings.
427 for e in self.user_encodings:
428 if self._usable(e, tried):
429 yield e
430
244 # Look within the document for an XML or HTML encoding 431 # Look within the document for an XML or HTML encoding
245 # declaration. 432 # declaration.
246 if self.declared_encoding is None: 433 if self.declared_encoding is None:
@@ -263,7 +450,11 @@ class EncodingDetector:
263 450
264 @classmethod 451 @classmethod
265 def strip_byte_order_mark(cls, data): 452 def strip_byte_order_mark(cls, data):
266 """If a byte-order mark is present, strip it and return the encoding it implies.""" 453 """If a byte-order mark is present, strip it and return the encoding it implies.
454
455 :param data: Some markup.
456 :return: A 2-tuple (modified data, implied encoding)
457 """
267 encoding = None 458 encoding = None
268 if isinstance(data, str): 459 if isinstance(data, str):
269 # Unicode data cannot have a byte-order mark. 460 # Unicode data cannot have a byte-order mark.
@@ -295,21 +486,36 @@ class EncodingDetector:
295 486
296 An HTML encoding is declared in a <meta> tag, hopefully near the 487 An HTML encoding is declared in a <meta> tag, hopefully near the
297 beginning of the document. 488 beginning of the document.
489
490 :param markup: Some markup.
491 :param is_html: If True, this markup is considered to be HTML. Otherwise
492 it's assumed to be XML.
493 :param search_entire_document: Since an encoding is supposed to be declared near the beginning
494 of the document, most of the time it's only necessary to search a few kilobytes of data.
495 Set this to True to force this method to search the entire document.
298 """ 496 """
299 if search_entire_document: 497 if search_entire_document:
300 xml_endpos = html_endpos = len(markup) 498 xml_endpos = html_endpos = len(markup)
301 else: 499 else:
302 xml_endpos = 1024 500 xml_endpos = 1024
303 html_endpos = max(2048, int(len(markup) * 0.05)) 501 html_endpos = max(2048, int(len(markup) * 0.05))
304 502
503 if isinstance(markup, bytes):
504 res = encoding_res[bytes]
505 else:
506 res = encoding_res[str]
507
508 xml_re = res['xml']
509 html_re = res['html']
305 declared_encoding = None 510 declared_encoding = None
306 declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) 511 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
307 if not declared_encoding_match and is_html: 512 if not declared_encoding_match and is_html:
308 declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) 513 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
309 if declared_encoding_match is not None: 514 if declared_encoding_match is not None:
310 declared_encoding = declared_encoding_match.groups()[0].decode( 515 declared_encoding = declared_encoding_match.groups()[0]
311 'ascii', 'replace')
312 if declared_encoding: 516 if declared_encoding:
517 if isinstance(declared_encoding, bytes):
518 declared_encoding = declared_encoding.decode('ascii', 'replace')
313 return declared_encoding.lower() 519 return declared_encoding.lower()
314 return None 520 return None
315 521
@@ -332,15 +538,53 @@ class UnicodeDammit:
332 "iso-8859-2", 538 "iso-8859-2",
333 ] 539 ]
334 540
335 def __init__(self, markup, override_encodings=[], 541 def __init__(self, markup, known_definite_encodings=[],
336 smart_quotes_to=None, is_html=False, exclude_encodings=[]): 542 smart_quotes_to=None, is_html=False, exclude_encodings=[],
543 user_encodings=None, override_encodings=None
544 ):
545 """Constructor.
546
547 :param markup: A bytestring representing markup in an unknown encoding.
548
549 :param known_definite_encodings: When determining the encoding
550 of `markup`, these encodings will be tried first, in
551 order. In HTML terms, this corresponds to the "known
552 definite encoding" step defined here:
553 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
554
555 :param user_encodings: These encodings will be tried after the
556 `known_definite_encodings` have been tried and failed, and
557 after an attempt to sniff the encoding by looking at a
558 byte order mark has failed. In HTML terms, this
559 corresponds to the step "user has explicitly instructed
560 the user agent to override the document's character
561 encoding", defined here:
562 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
563
564 :param override_encodings: A deprecated alias for
565 known_definite_encodings. Any encodings here will be tried
566 immediately after the encodings in
567 known_definite_encodings.
568
569 :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
570 to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
571 Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
572 will convert them to HTML entity references.
573 :param is_html: If True, this markup is considered to be HTML. Otherwise
574 it's assumed to be XML.
575 :param exclude_encodings: These encodings will not be considered, even
576 if the sniffing code thinks they might make sense.
577
578 """
337 self.smart_quotes_to = smart_quotes_to 579 self.smart_quotes_to = smart_quotes_to
338 self.tried_encodings = [] 580 self.tried_encodings = []
339 self.contains_replacement_characters = False 581 self.contains_replacement_characters = False
340 self.is_html = is_html 582 self.is_html = is_html
341 583 self.log = logging.getLogger(__name__)
342 self.detector = EncodingDetector( 584 self.detector = EncodingDetector(
343 markup, override_encodings, is_html, exclude_encodings) 585 markup, known_definite_encodings, is_html, exclude_encodings,
586 user_encodings, override_encodings
587 )
344 588
345 # Short-circuit if the data is in Unicode to begin with. 589 # Short-circuit if the data is in Unicode to begin with.
346 if isinstance(markup, str) or markup == '': 590 if isinstance(markup, str) or markup == '':
@@ -368,9 +612,10 @@ class UnicodeDammit:
368 if encoding != "ascii": 612 if encoding != "ascii":
369 u = self._convert_from(encoding, "replace") 613 u = self._convert_from(encoding, "replace")
370 if u is not None: 614 if u is not None:
371 logging.warning( 615 self.log.warning(
372 "Some characters could not be decoded, and were " 616 "Some characters could not be decoded, and were "
373 "replaced with REPLACEMENT CHARACTER.") 617 "replaced with REPLACEMENT CHARACTER."
618 )
374 self.contains_replacement_characters = True 619 self.contains_replacement_characters = True
375 break 620 break
376 621
@@ -399,6 +644,10 @@ class UnicodeDammit:
399 return sub 644 return sub
400 645
401 def _convert_from(self, proposed, errors="strict"): 646 def _convert_from(self, proposed, errors="strict"):
647 """Attempt to convert the markup to the proposed encoding.
648
649 :param proposed: The name of a character encoding.
650 """
402 proposed = self.find_codec(proposed) 651 proposed = self.find_codec(proposed)
403 if not proposed or (proposed, errors) in self.tried_encodings: 652 if not proposed or (proposed, errors) in self.tried_encodings:
404 return None 653 return None
@@ -413,30 +662,40 @@ class UnicodeDammit:
413 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 662 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
414 663
415 try: 664 try:
416 #print "Trying to convert document to %s (errors=%s)" % ( 665 #print("Trying to convert document to %s (errors=%s)" % (
417 # proposed, errors) 666 # proposed, errors))
418 u = self._to_unicode(markup, proposed, errors) 667 u = self._to_unicode(markup, proposed, errors)
419 self.markup = u 668 self.markup = u
420 self.original_encoding = proposed 669 self.original_encoding = proposed
421 except Exception as e: 670 except Exception as e:
422 #print "That didn't work!" 671 #print("That didn't work!")
423 #print e 672 #print(e)
424 return None 673 return None
425 #print "Correct encoding: %s" % proposed 674 #print("Correct encoding: %s" % proposed)
426 return self.markup 675 return self.markup
427 676
428 def _to_unicode(self, data, encoding, errors="strict"): 677 def _to_unicode(self, data, encoding, errors="strict"):
429 '''Given a string and its encoding, decodes the string into Unicode. 678 """Given a string and its encoding, decodes the string into Unicode.
430 %encoding is a string recognized by encodings.aliases''' 679
680 :param encoding: The name of an encoding.
681 """
431 return str(data, encoding, errors) 682 return str(data, encoding, errors)
432 683
433 @property 684 @property
434 def declared_html_encoding(self): 685 def declared_html_encoding(self):
686 """If the markup is an HTML document, returns the encoding declared _within_
687 the document.
688 """
435 if not self.is_html: 689 if not self.is_html:
436 return None 690 return None
437 return self.detector.declared_encoding 691 return self.detector.declared_encoding
438 692
439 def find_codec(self, charset): 693 def find_codec(self, charset):
694 """Convert the name of a character set to a codec name.
695
696 :param charset: The name of a character set.
697 :return: The name of a codec.
698 """
440 value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) 699 value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
441 or (charset and self._codec(charset.replace("-", ""))) 700 or (charset and self._codec(charset.replace("-", "")))
442 or (charset and self._codec(charset.replace("-", "_"))) 701 or (charset and self._codec(charset.replace("-", "_")))
@@ -726,7 +985,7 @@ class UnicodeDammit:
726 0xde : b'\xc3\x9e', # Þ 985 0xde : b'\xc3\x9e', # Þ
727 0xdf : b'\xc3\x9f', # ß 986 0xdf : b'\xc3\x9f', # ß
728 0xe0 : b'\xc3\xa0', # à 987 0xe0 : b'\xc3\xa0', # à
729 0xe1 : b'\xa1', # á 988 0xe1 : b'\xa1', # á
730 0xe2 : b'\xc3\xa2', # â 989 0xe2 : b'\xc3\xa2', # â
731 0xe3 : b'\xc3\xa3', # ã 990 0xe3 : b'\xc3\xa3', # ã
732 0xe4 : b'\xc3\xa4', # ä 991 0xe4 : b'\xc3\xa4', # ä
@@ -775,12 +1034,16 @@ class UnicodeDammit:
775 Currently the only situation supported is Windows-1252 (or its 1034 Currently the only situation supported is Windows-1252 (or its
776 subset ISO-8859-1), embedded in UTF-8. 1035 subset ISO-8859-1), embedded in UTF-8.
777 1036
778 The input must be a bytestring. If you've already converted 1037 :param in_bytes: A bytestring that you suspect contains
779 the document to Unicode, you're too late. 1038 characters from multiple encodings. Note that this _must_
780 1039 be a bytestring. If you've already converted the document
781 The output is a bytestring in which `embedded_encoding` 1040 to Unicode, you're too late.
782 characters have been converted to their `main_encoding` 1041 :param main_encoding: The primary encoding of `in_bytes`.
783 equivalents. 1042 :param embedded_encoding: The encoding that was used to embed characters
1043 in the main document.
1044 :return: A bytestring in which `embedded_encoding`
1045 characters have been converted to their `main_encoding`
1046 equivalents.
784 """ 1047 """
785 if embedded_encoding.replace('_', '-').lower() not in ( 1048 if embedded_encoding.replace('_', '-').lower() not in (
786 'windows-1252', 'windows_1252'): 1049 'windows-1252', 'windows_1252'):