diff options
| author | Aníbal Limón <anibal.limon@linux.intel.com> | 2014-11-05 12:10:27 -0600 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2014-11-06 16:45:23 +0000 |
| commit | 25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch) | |
| tree | 7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/dammit.py | |
| parent | bc6330cb7f288e76209410b0812aff1dbfa90950 (diff) | |
| download | poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz | |
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added Beautifulsoup module because fetch/wget latest_versionstring
method depends on it.
This provides support to fetch/wget.py module for search new package
versions in upstream sites.
(Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a)
Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
| -rw-r--r-- | bitbake/lib/bs4/dammit.py | 829 |
1 files changed, 829 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py new file mode 100644 index 0000000000..59640b7ce3 --- /dev/null +++ b/bitbake/lib/bs4/dammit.py | |||
| @@ -0,0 +1,829 @@ | |||
| 1 | # -*- coding: utf-8 -*- | ||
| 2 | """Beautiful Soup bonus library: Unicode, Dammit | ||
| 3 | |||
| 4 | This library converts a bytestream to Unicode through any means | ||
| 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | ||
| 6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the | ||
| 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | ||
| 8 | """ | ||
| 9 | |||
| 10 | import codecs | ||
| 11 | from htmlentitydefs import codepoint2name | ||
| 12 | import re | ||
| 13 | import logging | ||
| 14 | import string | ||
| 15 | |||
| 16 | # Import a library to autodetect character encodings. | ||
# Pick a character-encoding autodetection backend. Whichever branch
# runs, the module ends up with a chardet_dammit(s) callable that
# returns the guessed encoding name, or None when nothing is available.
chardet_type = None
try:
    # Preferred: the fast C implementation.
    # PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        # Neither library is installed; detection always comes up empty.
        def chardet_dammit(s):
            return None
    else:
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
| 38 | |||
| 39 | # Available from http://cjkpython.i18n.org/. | ||
| 40 | try: | ||
| 41 | import iconv_codec | ||
| 42 | except ImportError: | ||
| 43 | pass | ||
| 44 | |||
# Byte patterns used to sniff a declared encoding out of raw markup.
# They are written as raw bytes literals so that regex escapes such as
# \? and \s are not (mis)read as string escapes; the compiled patterns
# are byte-for-byte the same as before.

# Matches an XML declaration at the very start of the document and
# captures the value of its encoding="..." pseudo-attribute.
xml_encoding_re = re.compile(
    br'''^<\?.*encoding=['"](.*?)['"].*\?>''', re.I)

# Matches a <meta ...charset=...> tag and captures the charset value,
# whether or not the value is quoted.
html_meta_re = re.compile(
    br'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]''', re.I)
| 49 | |||
class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters.

    All public entry points are classmethods; the lookup tables and
    regular expressions they rely on are built once, when the class
    body is executed.
    """

    def _populate_class_variables():
        """Build the HTML entity tables from ``codepoint2name``.

        Returns a 3-tuple ``(lookup, reverse_lookup, regex)``:
        ``lookup`` maps a character to its entity name,
        ``reverse_lookup`` maps an entity name to its character, and
        ``regex`` matches any single character present in ``lookup``.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # An angle bracket, or an ampersand that does not already start a
    # numeric (&#123; / &#x1F;) or named (&name;) entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    # Any angle bracket or ampersand, with no lookahead.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and optionally quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and optionally quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has a named HTML
         entity (other than the double quote) replaced by that entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
| 193 | |||
| 194 | |||
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Remember the markup and the caller's hints.

        :param markup: The document to examine, as a bytestring.
        :param override_encodings: Encoding names to suggest before
         any others; None is treated as an empty list.
        :param is_html: If True, a <meta> charset declaration is also
         consulted when looking for a declared encoding.
        """
        self.override_encodings = override_encodings or []
        # Both of these are filled in lazily by the `encodings` property.
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True if `encoding` is not None and has not been tried.

        As a side effect, the lower-cased name is added to `tried`, so
        the same encoding is never suggested twice.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as a bytestring.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name). The encoding is None when no byte-order mark is found.
        """
        encoding = None
        # The UTF-16 checks come first, so they must explicitly reject
        # data whose next two bytes are NUL: FF FE 00 00 is the
        # UTF-32LE mark, not UTF-16LE. The comparison has to be against
        # a bytes literal, otherwise it is always unequal on Python 3.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta>
         charset when no XML declaration is found.
        :param search_entire_document: If True, search all of `markup`
         rather than only its beginning.
        :return: The declared encoding, lower-cased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only look at the start of the document: the first 1024
            # bytes for an XML declaration, and the first 2048 bytes or
            # 5% of the document (whichever is larger) for a <meta> tag.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # A malformed declaration may contain non-ASCII bytes; it
            # should yield a name that later fails codec lookup, not an
            # exception out of the detector.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
| 313 | |||
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Codec names for which smart-quote substitution is attempted.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    # Matches a single byte in the range where Windows-1252 keeps its
    # smart quotes and other extra punctuation.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=[],
                 smart_quotes_to=None, is_html=False):
        """Convert `markup` to Unicode, trying likely encodings in turn.

        :param markup: The document, as a bytestring (a Unicode string
         is accepted and passed through unchanged).
        :param override_encodings: Encoding names to try first. The
         default list is never mutated.
        :param smart_quotes_to: None to leave Microsoft smart quotes
         alone, or 'ascii', 'xml' or 'html' (any other non-None value
         behaves like 'html') to replace them before decoding.
        :param is_html: If True, a <meta> charset declaration is
         honoured when looking for the declared encoding.

        On return `unicode_markup` holds the converted document (None
        or empty if every attempt failed) and `original_encoding` the
        codec that worked (None if none did).
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match object whose group 1 is a single byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = ('&#x%s;' % sub[1]).encode()
                else:
                    sub = ('&%s;' % sub[0]).encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        :param proposed: An encoding name; it is normalized through
         find_codec() first.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded document, or None if the encoding is
         unknown, was already tried with the same `errors`, or fails.

        On success self.markup is replaced by the decoded document and
        self.original_encoding is set to the codec name.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this encoding
            # is not the right one, and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document,
        or None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lower-cased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores, keeping the first that Python
        recognizes. If none is recognized the lower-cased name is
        returned anyway. Returns None for an empty or None charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else
        None. A false `charset` is returned unchanged."""
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point);
    # plain strings are used as the replacement in every mode.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code point here produced "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # NOTE(review): the only tuple in this table; every other value
        # is a plain string. Not reachable from _sub_ms_char, which
        # only sees bytes 0x80-0x9f. Confirm whether "'" was intended.
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    # (first lead byte, last lead byte, total sequence length)
    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: If `embedded_encoding` is not
         Windows-1252 or `main_encoding` is not UTF-8.
        """
        # NOTE(review): the message below names ISO-8859-1 as supported,
        # but only a 'windows-1252' / 'windows_1252' spelling passes
        # this check. Confirm which of the two is intended.
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
| 829 | |||
