diff options
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
-rw-r--r-- | bitbake/lib/bs4/dammit.py | 829 |
1 files changed, 829 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py new file mode 100644 index 0000000000..59640b7ce3 --- /dev/null +++ b/bitbake/lib/bs4/dammit.py | |||
@@ -0,0 +1,829 @@ | |||
1 | # -*- coding: utf-8 -*- | ||
2 | """Beautiful Soup bonus library: Unicode, Dammit | ||
3 | |||
4 | This library converts a bytestream to Unicode through any means | ||
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | ||
6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the | ||
7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | ||
8 | """ | ||
9 | |||
10 | import codecs | ||
11 | from htmlentitydefs import codepoint2name | ||
12 | import re | ||
13 | import logging | ||
14 | import string | ||
15 | |||
# Pick a character-encoding autodetection backend, in order of preference:
#   1. cchardet (PyPI package: cchardet), the fast C implementation;
#   2. chardet (Debian package: python-chardet, PyPI package: chardet),
#      the pure Python implementation;
#   3. nothing at all, in which case detection always reports None.
# Whichever branch is taken, it defines chardet_dammit(s), which returns
# the detected encoding name for the bytestring s (or None).
chardet_type = None
try:
    import cchardet
except ImportError:
    try:
        import chardet
    except ImportError:
        # No detection library is installed.
        def chardet_dammit(s):
            return None
    else:
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
38 | |||
39 | # Available from http://cjkpython.i18n.org/. | ||
40 | try: | ||
41 | import iconv_codec | ||
42 | except ImportError: | ||
43 | pass | ||
44 | |||
# Matches an XML declaration at the very start of a bytestring and captures
# the value of its encoding="..." pseudo-attribute in group 1.
# Raw string literals are used so that regex escapes such as \? and \s are
# not parsed as (invalid) Python string escapes.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)

# Matches a <meta ... charset=...> tag anywhere in a bytestring and captures
# the charset value (optionally quoted) in group 1.
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
49 | |||
class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity tables from ``codepoint2name``.

        Called once while the class body executes.

        :return: A 3-tuple of (character -> entity name dict,
         entity name -> character dict, compiled regular expression
         matching any single character that has a named entity, except
         the double quotation mark).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # An angle bracket, or an ampersand that does not start something
    # that looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the named HTML
        entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value to quote.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has a named HTML
         entity (other than the double quotation mark) replaced by it.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
193 | |||
194 | |||
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """
        :param markup: The document to examine.
        :param override_encodings: Encoding names to suggest before
         any others.
        :param is_html: If True, a <meta> tag is also consulted when
         looking for a declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Report whether `encoding` is worth suggesting.

        An encoding is usable if it is not None and its lowercased
        name is not already in the `tried` set. A usable encoding is
        added to `tried` as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as a bytestring.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name or None if no byte-order mark was found).
        """
        encoding = None
        # The UTF-16 checks come first, so they must reject a mark that is
        # followed by two null bytes: FF FE 00 00 is the UTF-32LE mark.
        # The comparison uses a bytes literal so that it also works where
        # bytes and text never compare equal.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True and no XML declaration is found, also
         look for a <meta> tag.
        :param search_entire_document: If True, search all of `markup`
         instead of only its beginning.
        :return: The lowercased declared encoding name, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # The captured name may contain arbitrary bytes; decode it
            # leniently so that a garbage declaration cannot raise
            # UnicodeDecodeError here.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
313 | |||
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Try to convert `markup` to Unicode.

        :param markup: The document, as a bytestring or a Unicode string.
        :param override_encodings: Encoding names to try before any
         others.
        :param smart_quotes_to: If not None, bytes in the range
         \\x80-\\x9f are replaced before decoding from one of
         ENCODINGS_WITH_SMART_QUOTES: 'ascii' selects ASCII
         approximations, 'xml' numeric character references, and any
         other value named HTML entities.
        :param is_html: Whether the document is to be treated as HTML.

        On return, `unicode_markup` holds the converted document (or
        None if every encoding failed) and `original_encoding` the
        codec name that worked (or None).
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings or [], is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None rather than truthiness: an empty string is a
        # successful conversion (for example, a document that consisted
        # only of a byte-order mark) and must not trigger the fallback.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regular expression match whose group 1 is the
         single byte to replace.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        Each (codec, errors) combination is attempted at most once per
        instance. On success, self.markup is replaced by the decoded
        text and self.original_encoding is set to the codec name.

        :return: The decoded document, or None if the codec could not
         be resolved, was already tried, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this encoding
            # is not the right one, and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding found by the detector's declaration search, or
        None if this document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lowercased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name itself),
        the name with hyphens removed, and the name with hyphens turned
        into underscores, keeping the first that Python's codec registry
        recognizes. If none is recognized, the lowercased name is
        returned anyway.

        :return: A lowercased name, or None if `charset` is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # A tuple value is (HTML entity name, hexadecimal code point); a plain
    # string value is used as the replacement as-is.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # LATIN CAPITAL LETTER Y WITH DIAERESIS is U+0178.
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # NOTE(review): this is the only tuple value in the table. The
        # smart-quote regex in _convert_from only matches \x80-\x9f, so
        # _sub_ms_char never calls .encode() on it -- confirm before
        # widening that range.
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    #
    # NOTE(review): there is no entry for 0xff, so detwingle() leaves
    # that byte untouched -- confirm whether the omission is intended.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (non-breaking space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: If `embedded_encoding` is not a
         spelling of windows-1252, or `main_encoding` is not a spelling
         of utf-8.
        """
        # NOTE(review): the message below mentions ISO-8859-1, but this
        # check only accepts spellings of windows-1252 -- confirm intent.
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
829 | |||