diff options
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
-rw-r--r-- | bitbake/lib/bs4/dammit.py | 411 |
1 files changed, 337 insertions, 74 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py index 7ad9e0dd1e..692433c57a 100644 --- a/bitbake/lib/bs4/dammit.py +++ b/bitbake/lib/bs4/dammit.py | |||
@@ -6,61 +6,185 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal | |||
6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the | 6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the |
7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. |
8 | """ | 8 | """ |
9 | # Use of this source code is governed by the MIT license. | ||
9 | __license__ = "MIT" | 10 | __license__ = "MIT" |
10 | 11 | ||
11 | import codecs | ||
12 | from html.entities import codepoint2name | 12 | from html.entities import codepoint2name |
13 | from collections import defaultdict | ||
14 | import codecs | ||
13 | import re | 15 | import re |
14 | import logging | 16 | import logging |
15 | 17 | import string | |
16 | # Import a library to autodetect character encodings. | 18 | |
17 | chardet_type = None | 19 | # Import a library to autodetect character encodings. We'll support |
20 | # any of a number of libraries that all support the same API: | ||
21 | # | ||
22 | # * cchardet | ||
23 | # * chardet | ||
24 | # * charset-normalizer | ||
25 | chardet_module = None | ||
18 | try: | 26 | try: |
19 | # First try the fast C implementation. | ||
20 | # PyPI package: cchardet | 27 | # PyPI package: cchardet |
21 | import cchardet | 28 | import cchardet as chardet_module |
22 | def chardet_dammit(s): | ||
23 | return cchardet.detect(s)['encoding'] | ||
24 | except ImportError: | 29 | except ImportError: |
25 | try: | 30 | try: |
26 | # Fall back to the pure Python implementation | ||
27 | # Debian package: python-chardet | 31 | # Debian package: python-chardet |
28 | # PyPI package: chardet | 32 | # PyPI package: chardet |
29 | import chardet | 33 | import chardet as chardet_module |
30 | def chardet_dammit(s): | ||
31 | return chardet.detect(s)['encoding'] | ||
32 | #import chardet.constants | ||
33 | #chardet.constants._debug = 1 | ||
34 | except ImportError: | 34 | except ImportError: |
35 | # No chardet available. | 35 | try: |
36 | def chardet_dammit(s): | 36 | # PyPI package: charset-normalizer |
37 | import charset_normalizer as chardet_module | ||
38 | except ImportError: | ||
39 | # No chardet available. | ||
40 | chardet_module = None | ||
41 | |||
42 | if chardet_module: | ||
43 | def chardet_dammit(s): | ||
44 | if isinstance(s, str): | ||
37 | return None | 45 | return None |
46 | return chardet_module.detect(s)['encoding'] | ||
47 | else: | ||
48 | def chardet_dammit(s): | ||
49 | return None | ||
38 | 50 | ||
39 | xml_encoding_re = re.compile( | 51 | # Build bytestring and Unicode versions of regular expressions for finding |
40 | r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) | 52 | # a declared encoding inside an XML or HTML document. |
41 | html_meta_re = re.compile( | 53 | xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' |
42 | r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) | 54 | html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' |
55 | encoding_res = dict() | ||
56 | encoding_res[bytes] = { | ||
57 | 'html' : re.compile(html_meta.encode("ascii"), re.I), | ||
58 | 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), | ||
59 | } | ||
60 | encoding_res[str] = { | ||
61 | 'html' : re.compile(html_meta, re.I), | ||
62 | 'xml' : re.compile(xml_encoding, re.I) | ||
63 | } | ||
64 | |||
65 | from html.entities import html5 | ||
43 | 66 | ||
44 | class EntitySubstitution(object): | 67 | class EntitySubstitution(object): |
45 | 68 | """The ability to substitute XML or HTML entities for certain characters.""" | |
46 | """Substitute XML or HTML entities for the corresponding characters.""" | ||
47 | 69 | ||
48 | def _populate_class_variables(): | 70 | def _populate_class_variables(): |
49 | lookup = {} | 71 | """Initialize variables used by this class to manage the plethora of |
50 | reverse_lookup = {} | 72 | HTML5 named entities. |
51 | characters_for_re = [] | 73 | |
74 | This function returns a 3-tuple containing two dictionaries | ||
75 | and a regular expression: | ||
76 | |||
77 | unicode_to_name - A mapping of Unicode strings like "⦨" to | ||
78 | entity names like "angmsdaa". When a single Unicode string has | ||
79 | multiple entity names, we try to choose the most commonly-used | ||
80 | name. | ||
81 | |||
82 | name_to_unicode: A mapping of entity names like "angmsdaa" to | ||
83 | Unicode strings like "⦨". | ||
84 | |||
85 | named_entity_re: A regular expression matching (almost) any | ||
86 | Unicode string that corresponds to an HTML5 named entity. | ||
87 | """ | ||
88 | unicode_to_name = {} | ||
89 | name_to_unicode = {} | ||
90 | |||
91 | short_entities = set() | ||
92 | long_entities_by_first_character = defaultdict(set) | ||
93 | |||
94 | for name_with_semicolon, character in sorted(html5.items()): | ||
95 | # "It is intentional, for legacy compatibility, that many | ||
96 | # code points have multiple character reference names. For | ||
97 | # example, some appear both with and without the trailing | ||
98 | # semicolon, or with different capitalizations." | ||
99 | # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references | ||
100 | # | ||
101 | # The parsers are in charge of handling (or not) character | ||
102 | # references with no trailing semicolon, so we remove the | ||
103 | # semicolon whenever it appears. | ||
104 | if name_with_semicolon.endswith(';'): | ||
105 | name = name_with_semicolon[:-1] | ||
106 | else: | ||
107 | name = name_with_semicolon | ||
108 | |||
109 | # When parsing HTML, we want to recognize any known named | ||
110 | # entity and convert it to a sequence of Unicode | ||
111 | # characters. | ||
112 | if name not in name_to_unicode: | ||
113 | name_to_unicode[name] = character | ||
114 | |||
115 | # When _generating_ HTML, we want to recognize special | ||
116 | # character sequences that _could_ be converted to named | ||
117 | # entities. | ||
118 | unicode_to_name[character] = name | ||
119 | |||
120 | # We also need to build a regular expression that lets us | ||
121 | # _find_ those characters in output strings so we can | ||
122 | # replace them. | ||
123 | # | ||
124 | # This is tricky, for two reasons. | ||
125 | |||
126 | if (len(character) == 1 and ord(character) < 128 | ||
127 | and character not in '<>&'): | ||
128 | # First, it would be annoying to turn single ASCII | ||
129 | # characters like | into named entities like | ||
130 | # &verbar;. The exceptions are <>&, which we _must_ | ||
131 | # turn into named entities to produce valid HTML. | ||
132 | continue | ||
133 | |||
134 | if len(character) > 1 and all(ord(x) < 128 for x in character): | ||
135 | # We also do not want to turn _combinations_ of ASCII | ||
136 | # characters like 'fj' into named entities like '&fjlig;', | ||
137 | # though that's more debateable. | ||
138 | continue | ||
139 | |||
140 | # Second, some named entities have a Unicode value that's | ||
141 | # a subset of the Unicode value for some _other_ named | ||
142 | # entity. As an example, \u2267' is &GreaterFullEqual;, | ||
143 | # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular | ||
144 | # expression needs to match the first two characters of | ||
145 | # "\u2267\u0338foo", but only the first character of | ||
146 | # "\u2267foo". | ||
147 | # | ||
148 | # In this step, we build two sets of characters that | ||
149 | # _eventually_ need to go into the regular expression. But | ||
150 | # we won't know exactly what the regular expression needs | ||
151 | # to look like until we've gone through the entire list of | ||
152 | # named entities. | ||
153 | if len(character) == 1: | ||
154 | short_entities.add(character) | ||
155 | else: | ||
156 | long_entities_by_first_character[character[0]].add(character) | ||
157 | |||
158 | # Now that we've been through the entire list of entities, we | ||
159 | # can create a regular expression that matches any of them. | ||
160 | particles = set() | ||
161 | for short in short_entities: | ||
162 | long_versions = long_entities_by_first_character[short] | ||
163 | if not long_versions: | ||
164 | particles.add(short) | ||
165 | else: | ||
166 | ignore = "".join([x[1] for x in long_versions]) | ||
167 | # This finds, e.g. \u2267 but only if it is _not_ | ||
168 | # followed by \u0338. | ||
169 | particles.add("%s(?![%s])" % (short, ignore)) | ||
170 | |||
171 | for long_entities in list(long_entities_by_first_character.values()): | ||
172 | for long_entity in long_entities: | ||
173 | particles.add(long_entity) | ||
174 | |||
175 | re_definition = "(%s)" % "|".join(particles) | ||
176 | |||
177 | # If an entity shows up in both html5 and codepoint2name, it's | ||
178 | # likely that HTML5 gives it several different names, such as | ||
179 | # 'rsquo' and 'rsquor'. When converting Unicode characters to | ||
180 | # named entities, the codepoint2name name should take | ||
181 | # precedence where possible, since that's the more easily | ||
182 | # recognizable one. | ||
52 | for codepoint, name in list(codepoint2name.items()): | 183 | for codepoint, name in list(codepoint2name.items()): |
53 | character = chr(codepoint) | 184 | character = chr(codepoint) |
54 | if codepoint != 34: | 185 | unicode_to_name[character] = name |
55 | # There's no point in turning the quotation mark into | 186 | |
56 | # &quot;, unless it happens within an attribute value, which | 187 | return unicode_to_name, name_to_unicode, re.compile(re_definition) |
57 | # is handled elsewhere. | ||
58 | characters_for_re.append(character) | ||
59 | lookup[character] = name | ||
61 | # But we do want to turn &quot; into the quotation mark. | ||
61 | reverse_lookup[name] = character | ||
62 | re_definition = "[%s]" % "".join(characters_for_re) | ||
63 | return lookup, reverse_lookup, re.compile(re_definition) | ||
64 | (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, | 188 | (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, |
65 | CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() | 189 | CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() |
66 | 190 | ||
@@ -72,21 +196,23 @@ class EntitySubstitution(object): | |||
72 | ">": "gt", | 196 | ">": "gt", |
73 | } | 197 | } |
74 | 198 | ||
75 | BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|" | 199 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" |
76 | r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" | 200 | "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" |
77 | r")") | 201 | ")") |
78 | 202 | ||
79 | AMPERSAND_OR_BRACKET = re.compile(r"([<>&])") | 203 | AMPERSAND_OR_BRACKET = re.compile("([<>&])") |
80 | 204 | ||
81 | @classmethod | 205 | @classmethod |
82 | def _substitute_html_entity(cls, matchobj): | 206 | def _substitute_html_entity(cls, matchobj): |
207 | """Used with a regular expression to substitute the | ||
208 | appropriate HTML entity for a special character string.""" | ||
83 | entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) | 209 | entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) |
84 | return "&%s;" % entity | 210 | return "&%s;" % entity |
85 | 211 | ||
86 | @classmethod | 212 | @classmethod |
87 | def _substitute_xml_entity(cls, matchobj): | 213 | def _substitute_xml_entity(cls, matchobj): |
88 | """Used with a regular expression to substitute the | 214 | """Used with a regular expression to substitute the |
89 | appropriate XML entity for an XML special character.""" | 215 | appropriate XML entity for a special character string.""" |
90 | entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] | 216 | entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] |
91 | return "&%s;" % entity | 217 | return "&%s;" % entity |
92 | 218 | ||
@@ -181,6 +307,8 @@ class EntitySubstitution(object): | |||
181 | containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that | 307 | containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that |
182 | character with "&eacute;" will make it more readable to some | 308 | character with "&eacute;" will make it more readable to some |
183 | people. | 309 | people. |
310 | |||
311 | :param s: A Unicode string. | ||
184 | """ | 312 | """ |
185 | return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( | 313 | return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( |
186 | cls._substitute_html_entity, s) | 314 | cls._substitute_html_entity, s) |
@@ -192,23 +320,65 @@ class EncodingDetector: | |||
192 | Order of precedence: | 320 | Order of precedence: |
193 | 321 | ||
194 | 1. Encodings you specifically tell EncodingDetector to try first | 322 | 1. Encodings you specifically tell EncodingDetector to try first |
195 | (the override_encodings argument to the constructor). | 323 | (the known_definite_encodings argument to the constructor). |
324 | |||
325 | 2. An encoding determined by sniffing the document's byte-order mark. | ||
326 | |||
327 | 3. Encodings you specifically tell EncodingDetector to try if | ||
328 | byte-order mark sniffing fails (the user_encodings argument to the | ||
329 | constructor). | ||
196 | 330 | ||
197 | 2. An encoding declared within the bytestring itself, either in an | 331 | 4. An encoding declared within the bytestring itself, either in an |
198 | XML declaration (if the bytestring is to be interpreted as an XML | 332 | XML declaration (if the bytestring is to be interpreted as an XML |
199 | document), or in a <meta> tag (if the bytestring is to be | 333 | document), or in a <meta> tag (if the bytestring is to be |
200 | interpreted as an HTML document.) | 334 | interpreted as an HTML document.) |
201 | 335 | ||
202 | 3. An encoding detected through textual analysis by chardet, | 336 | 5. An encoding detected through textual analysis by chardet, |
203 | cchardet, or a similar external library. | 337 | cchardet, or a similar external library. |
204 | 338 | ||
205 | 4. UTF-8. | 339 | 4. UTF-8. |
206 | 340 | ||
207 | 5. Windows-1252. | 341 | 5. Windows-1252. |
342 | |||
208 | """ | 343 | """ |
209 | def __init__(self, markup, override_encodings=None, is_html=False, | 344 | def __init__(self, markup, known_definite_encodings=None, |
210 | exclude_encodings=None): | 345 | is_html=False, exclude_encodings=None, |
211 | self.override_encodings = override_encodings or [] | 346 | user_encodings=None, override_encodings=None): |
347 | """Constructor. | ||
348 | |||
349 | :param markup: Some markup in an unknown encoding. | ||
350 | |||
351 | :param known_definite_encodings: When determining the encoding | ||
352 | of `markup`, these encodings will be tried first, in | ||
353 | order. In HTML terms, this corresponds to the "known | ||
354 | definite encoding" step defined here: | ||
355 | https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding | ||
356 | |||
357 | :param user_encodings: These encodings will be tried after the | ||
358 | `known_definite_encodings` have been tried and failed, and | ||
359 | after an attempt to sniff the encoding by looking at a | ||
360 | byte order mark has failed. In HTML terms, this | ||
361 | corresponds to the step "user has explicitly instructed | ||
362 | the user agent to override the document's character | ||
363 | encoding", defined here: | ||
364 | https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding | ||
365 | |||
366 | :param override_encodings: A deprecated alias for | ||
367 | known_definite_encodings. Any encodings here will be tried | ||
368 | immediately after the encodings in | ||
369 | known_definite_encodings. | ||
370 | |||
371 | :param is_html: If True, this markup is considered to be | ||
372 | HTML. Otherwise it's assumed to be XML. | ||
373 | |||
374 | :param exclude_encodings: These encodings will not be tried, | ||
375 | even if they otherwise would be. | ||
376 | |||
377 | """ | ||
378 | self.known_definite_encodings = list(known_definite_encodings or []) | ||
379 | if override_encodings: | ||
380 | self.known_definite_encodings += override_encodings | ||
381 | self.user_encodings = user_encodings or [] | ||
212 | exclude_encodings = exclude_encodings or [] | 382 | exclude_encodings = exclude_encodings or [] |
213 | self.exclude_encodings = set([x.lower() for x in exclude_encodings]) | 383 | self.exclude_encodings = set([x.lower() for x in exclude_encodings]) |
214 | self.chardet_encoding = None | 384 | self.chardet_encoding = None |
@@ -219,6 +389,12 @@ class EncodingDetector: | |||
219 | self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) | 389 | self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) |
220 | 390 | ||
221 | def _usable(self, encoding, tried): | 391 | def _usable(self, encoding, tried): |
392 | """Should we even bother to try this encoding? | ||
393 | |||
394 | :param encoding: Name of an encoding. | ||
395 | :param tried: Encodings that have already been tried. This will be modified | ||
396 | as a side effect. | ||
397 | """ | ||
222 | if encoding is not None: | 398 | if encoding is not None: |
223 | encoding = encoding.lower() | 399 | encoding = encoding.lower() |
224 | if encoding in self.exclude_encodings: | 400 | if encoding in self.exclude_encodings: |
@@ -230,9 +406,14 @@ class EncodingDetector: | |||
230 | 406 | ||
231 | @property | 407 | @property |
232 | def encodings(self): | 408 | def encodings(self): |
233 | """Yield a number of encodings that might work for this markup.""" | 409 | """Yield a number of encodings that might work for this markup. |
410 | |||
411 | :yield: A sequence of strings. | ||
412 | """ | ||
234 | tried = set() | 413 | tried = set() |
235 | for e in self.override_encodings: | 414 | |
415 | # First, try the known definite encodings | ||
416 | for e in self.known_definite_encodings: | ||
236 | if self._usable(e, tried): | 417 | if self._usable(e, tried): |
237 | yield e | 418 | yield e |
238 | 419 | ||
@@ -241,6 +422,12 @@ class EncodingDetector: | |||
241 | if self._usable(self.sniffed_encoding, tried): | 422 | if self._usable(self.sniffed_encoding, tried): |
242 | yield self.sniffed_encoding | 423 | yield self.sniffed_encoding |
243 | 424 | ||
425 | # Sniffing the byte-order mark did nothing; try the user | ||
426 | # encodings. | ||
427 | for e in self.user_encodings: | ||
428 | if self._usable(e, tried): | ||
429 | yield e | ||
430 | |||
244 | # Look within the document for an XML or HTML encoding | 431 | # Look within the document for an XML or HTML encoding |
245 | # declaration. | 432 | # declaration. |
246 | if self.declared_encoding is None: | 433 | if self.declared_encoding is None: |
@@ -263,7 +450,11 @@ class EncodingDetector: | |||
263 | 450 | ||
264 | @classmethod | 451 | @classmethod |
265 | def strip_byte_order_mark(cls, data): | 452 | def strip_byte_order_mark(cls, data): |
266 | """If a byte-order mark is present, strip it and return the encoding it implies.""" | 453 | """If a byte-order mark is present, strip it and return the encoding it implies. |
454 | |||
455 | :param data: Some markup. | ||
456 | :return: A 2-tuple (modified data, implied encoding) | ||
457 | """ | ||
267 | encoding = None | 458 | encoding = None |
268 | if isinstance(data, str): | 459 | if isinstance(data, str): |
269 | # Unicode data cannot have a byte-order mark. | 460 | # Unicode data cannot have a byte-order mark. |
@@ -295,21 +486,36 @@ class EncodingDetector: | |||
295 | 486 | ||
296 | An HTML encoding is declared in a <meta> tag, hopefully near the | 487 | An HTML encoding is declared in a <meta> tag, hopefully near the |
297 | beginning of the document. | 488 | beginning of the document. |
489 | |||
490 | :param markup: Some markup. | ||
491 | :param is_html: If True, this markup is considered to be HTML. Otherwise | ||
492 | it's assumed to be XML. | ||
493 | :param search_entire_document: Since an encoding is supposed to be declared near the beginning | ||
494 | of the document, most of the time it's only necessary to search a few kilobytes of data. | ||
495 | Set this to True to force this method to search the entire document. | ||
298 | """ | 496 | """ |
299 | if search_entire_document: | 497 | if search_entire_document: |
300 | xml_endpos = html_endpos = len(markup) | 498 | xml_endpos = html_endpos = len(markup) |
301 | else: | 499 | else: |
302 | xml_endpos = 1024 | 500 | xml_endpos = 1024 |
303 | html_endpos = max(2048, int(len(markup) * 0.05)) | 501 | html_endpos = max(2048, int(len(markup) * 0.05)) |
304 | 502 | ||
503 | if isinstance(markup, bytes): | ||
504 | res = encoding_res[bytes] | ||
505 | else: | ||
506 | res = encoding_res[str] | ||
507 | |||
508 | xml_re = res['xml'] | ||
509 | html_re = res['html'] | ||
305 | declared_encoding = None | 510 | declared_encoding = None |
306 | declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) | 511 | declared_encoding_match = xml_re.search(markup, endpos=xml_endpos) |
307 | if not declared_encoding_match and is_html: | 512 | if not declared_encoding_match and is_html: |
308 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) | 513 | declared_encoding_match = html_re.search(markup, endpos=html_endpos) |
309 | if declared_encoding_match is not None: | 514 | if declared_encoding_match is not None: |
310 | declared_encoding = declared_encoding_match.groups()[0].decode( | 515 | declared_encoding = declared_encoding_match.groups()[0] |
311 | 'ascii', 'replace') | ||
312 | if declared_encoding: | 516 | if declared_encoding: |
517 | if isinstance(declared_encoding, bytes): | ||
518 | declared_encoding = declared_encoding.decode('ascii', 'replace') | ||
313 | return declared_encoding.lower() | 519 | return declared_encoding.lower() |
314 | return None | 520 | return None |
315 | 521 | ||
@@ -332,15 +538,53 @@ class UnicodeDammit: | |||
332 | "iso-8859-2", | 538 | "iso-8859-2", |
333 | ] | 539 | ] |
334 | 540 | ||
335 | def __init__(self, markup, override_encodings=[], | 541 | def __init__(self, markup, known_definite_encodings=[], |
336 | smart_quotes_to=None, is_html=False, exclude_encodings=[]): | 542 | smart_quotes_to=None, is_html=False, exclude_encodings=[], |
543 | user_encodings=None, override_encodings=None | ||
544 | ): | ||
545 | """Constructor. | ||
546 | |||
547 | :param markup: A bytestring representing markup in an unknown encoding. | ||
548 | |||
549 | :param known_definite_encodings: When determining the encoding | ||
550 | of `markup`, these encodings will be tried first, in | ||
551 | order. In HTML terms, this corresponds to the "known | ||
552 | definite encoding" step defined here: | ||
553 | https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding | ||
554 | |||
555 | :param user_encodings: These encodings will be tried after the | ||
556 | `known_definite_encodings` have been tried and failed, and | ||
557 | after an attempt to sniff the encoding by looking at a | ||
558 | byte order mark has failed. In HTML terms, this | ||
559 | corresponds to the step "user has explicitly instructed | ||
560 | the user agent to override the document's character | ||
561 | encoding", defined here: | ||
562 | https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding | ||
563 | |||
564 | :param override_encodings: A deprecated alias for | ||
565 | known_definite_encodings. Any encodings here will be tried | ||
566 | immediately after the encodings in | ||
567 | known_definite_encodings. | ||
568 | |||
569 | :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted | ||
570 | to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. | ||
571 | Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' | ||
572 | will convert them to HTML entity references. | ||
573 | :param is_html: If True, this markup is considered to be HTML. Otherwise | ||
574 | it's assumed to be XML. | ||
575 | :param exclude_encodings: These encodings will not be considered, even | ||
576 | if the sniffing code thinks they might make sense. | ||
577 | |||
578 | """ | ||
337 | self.smart_quotes_to = smart_quotes_to | 579 | self.smart_quotes_to = smart_quotes_to |
338 | self.tried_encodings = [] | 580 | self.tried_encodings = [] |
339 | self.contains_replacement_characters = False | 581 | self.contains_replacement_characters = False |
340 | self.is_html = is_html | 582 | self.is_html = is_html |
341 | 583 | self.log = logging.getLogger(__name__) | |
342 | self.detector = EncodingDetector( | 584 | self.detector = EncodingDetector( |
343 | markup, override_encodings, is_html, exclude_encodings) | 585 | markup, known_definite_encodings, is_html, exclude_encodings, |
586 | user_encodings, override_encodings | ||
587 | ) | ||
344 | 588 | ||
345 | # Short-circuit if the data is in Unicode to begin with. | 589 | # Short-circuit if the data is in Unicode to begin with. |
346 | if isinstance(markup, str) or markup == '': | 590 | if isinstance(markup, str) or markup == '': |
@@ -368,9 +612,10 @@ class UnicodeDammit: | |||
368 | if encoding != "ascii": | 612 | if encoding != "ascii": |
369 | u = self._convert_from(encoding, "replace") | 613 | u = self._convert_from(encoding, "replace") |
370 | if u is not None: | 614 | if u is not None: |
371 | logging.warning( | 615 | self.log.warning( |
372 | "Some characters could not be decoded, and were " | 616 | "Some characters could not be decoded, and were " |
373 | "replaced with REPLACEMENT CHARACTER.") | 617 | "replaced with REPLACEMENT CHARACTER." |
618 | ) | ||
374 | self.contains_replacement_characters = True | 619 | self.contains_replacement_characters = True |
375 | break | 620 | break |
376 | 621 | ||
@@ -399,6 +644,10 @@ class UnicodeDammit: | |||
399 | return sub | 644 | return sub |
400 | 645 | ||
401 | def _convert_from(self, proposed, errors="strict"): | 646 | def _convert_from(self, proposed, errors="strict"): |
647 | """Attempt to convert the markup to the proposed encoding. | ||
648 | |||
649 | :param proposed: The name of a character encoding. | ||
650 | """ | ||
402 | proposed = self.find_codec(proposed) | 651 | proposed = self.find_codec(proposed) |
403 | if not proposed or (proposed, errors) in self.tried_encodings: | 652 | if not proposed or (proposed, errors) in self.tried_encodings: |
404 | return None | 653 | return None |
@@ -413,30 +662,40 @@ class UnicodeDammit: | |||
413 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) | 662 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) |
414 | 663 | ||
415 | try: | 664 | try: |
416 | #print "Trying to convert document to %s (errors=%s)" % ( | 665 | #print("Trying to convert document to %s (errors=%s)" % ( |
417 | # proposed, errors) | 666 | # proposed, errors)) |
418 | u = self._to_unicode(markup, proposed, errors) | 667 | u = self._to_unicode(markup, proposed, errors) |
419 | self.markup = u | 668 | self.markup = u |
420 | self.original_encoding = proposed | 669 | self.original_encoding = proposed |
421 | except Exception as e: | 670 | except Exception as e: |
422 | #print "That didn't work!" | 671 | #print("That didn't work!") |
423 | #print e | 672 | #print(e) |
424 | return None | 673 | return None |
425 | #print "Correct encoding: %s" % proposed | 674 | #print("Correct encoding: %s" % proposed) |
426 | return self.markup | 675 | return self.markup |
427 | 676 | ||
428 | def _to_unicode(self, data, encoding, errors="strict"): | 677 | def _to_unicode(self, data, encoding, errors="strict"): |
429 | '''Given a string and its encoding, decodes the string into Unicode. | 678 | """Given a string and its encoding, decodes the string into Unicode. |
430 | %encoding is a string recognized by encodings.aliases''' | 679 | |
680 | :param encoding: The name of an encoding. | ||
681 | """ | ||
431 | return str(data, encoding, errors) | 682 | return str(data, encoding, errors) |
432 | 683 | ||
433 | @property | 684 | @property |
434 | def declared_html_encoding(self): | 685 | def declared_html_encoding(self): |
686 | """If the markup is an HTML document, returns the encoding declared _within_ | ||
687 | the document. | ||
688 | """ | ||
435 | if not self.is_html: | 689 | if not self.is_html: |
436 | return None | 690 | return None |
437 | return self.detector.declared_encoding | 691 | return self.detector.declared_encoding |
438 | 692 | ||
439 | def find_codec(self, charset): | 693 | def find_codec(self, charset): |
694 | """Convert the name of a character set to a codec name. | ||
695 | |||
696 | :param charset: The name of a character set. | ||
697 | :return: The name of a codec. | ||
698 | """ | ||
440 | value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) | 699 | value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) |
441 | or (charset and self._codec(charset.replace("-", ""))) | 700 | or (charset and self._codec(charset.replace("-", ""))) |
442 | or (charset and self._codec(charset.replace("-", "_"))) | 701 | or (charset and self._codec(charset.replace("-", "_"))) |
@@ -726,7 +985,7 @@ class UnicodeDammit: | |||
726 | 0xde : b'\xc3\x9e', # Þ | 985 | 0xde : b'\xc3\x9e', # Þ |
727 | 0xdf : b'\xc3\x9f', # ß | 986 | 0xdf : b'\xc3\x9f', # ß |
728 | 0xe0 : b'\xc3\xa0', # à | 987 | 0xe0 : b'\xc3\xa0', # à |
729 | 0xe1 : b'\xa1', # á | 988 | 0xe1 : b'\xa1', # á |
730 | 0xe2 : b'\xc3\xa2', # â | 989 | 0xe2 : b'\xc3\xa2', # â |
731 | 0xe3 : b'\xc3\xa3', # ã | 990 | 0xe3 : b'\xc3\xa3', # ã |
732 | 0xe4 : b'\xc3\xa4', # ä | 991 | 0xe4 : b'\xc3\xa4', # ä |
@@ -775,12 +1034,16 @@ class UnicodeDammit: | |||
775 | Currently the only situation supported is Windows-1252 (or its | 1034 | Currently the only situation supported is Windows-1252 (or its |
776 | subset ISO-8859-1), embedded in UTF-8. | 1035 | subset ISO-8859-1), embedded in UTF-8. |
777 | 1036 | ||
778 | The input must be a bytestring. If you've already converted | 1037 | :param in_bytes: A bytestring that you suspect contains |
779 | the document to Unicode, you're too late. | 1038 | characters from multiple encodings. Note that this _must_ |
780 | 1039 | be a bytestring. If you've already converted the document | |
781 | The output is a bytestring in which `embedded_encoding` | 1040 | to Unicode, you're too late. |
782 | characters have been converted to their `main_encoding` | 1041 | :param main_encoding: The primary encoding of `in_bytes`. |
783 | equivalents. | 1042 | :param embedded_encoding: The encoding that was used to embed characters |
1043 | in the main document. | ||
1044 | :return: A bytestring in which `embedded_encoding` | ||
1045 | characters have been converted to their `main_encoding` | ||
1046 | equivalents. | ||
784 | """ | 1047 | """ |
785 | if embedded_encoding.replace('_', '-').lower() not in ( | 1048 | if embedded_encoding.replace('_', '-').lower() not in ( |
786 | 'windows-1252', 'windows_1252'): | 1049 | 'windows-1252', 'windows_1252'): |