diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/_htmlparser.py')
-rw-r--r-- | bitbake/lib/bs4/builder/_htmlparser.py | 56 |
1 file changed, 30 insertions, 26 deletions
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index ca8d8b892b..bb0a63f2f3 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -4,10 +4,16 @@ __all__ = [ | |||
4 | 'HTMLParserTreeBuilder', | 4 | 'HTMLParserTreeBuilder', |
5 | ] | 5 | ] |
6 | 6 | ||
7 | from HTMLParser import ( | 7 | from html.parser import HTMLParser |
8 | HTMLParser, | 8 | |
9 | HTMLParseError, | 9 | try: |
10 | ) | 10 | from html.parser import HTMLParseError |
11 | except ImportError as e: | ||
12 | # HTMLParseError is removed in Python 3.5. Since it can never be | ||
13 | # thrown in 3.5, we can just define our own class as a placeholder. | ||
14 | class HTMLParseError(Exception): | ||
15 | pass | ||
16 | |||
11 | import sys | 17 | import sys |
12 | import warnings | 18 | import warnings |
13 | 19 | ||
@@ -19,10 +25,10 @@ import warnings | |||
19 | # At the end of this file, we monkeypatch HTMLParser so that | 25 | # At the end of this file, we monkeypatch HTMLParser so that |
20 | # strict=True works well on Python 3.2.2. | 26 | # strict=True works well on Python 3.2.2. |
21 | major, minor, release = sys.version_info[:3] | 27 | major, minor, release = sys.version_info[:3] |
22 | CONSTRUCTOR_TAKES_STRICT = ( | 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 |
23 | major > 3 | 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 |
24 | or (major == 3 and minor > 2) | 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 |
25 | or (major == 3 and minor == 2 and release >= 3)) | 31 | |
26 | 32 | ||
27 | from bs4.element import ( | 33 | from bs4.element import ( |
28 | CData, | 34 | CData, |
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
63 | 69 | ||
64 | def handle_charref(self, name): | 70 | def handle_charref(self, name): |
65 | # XXX workaround for a bug in HTMLParser. Remove this once | 71 | # XXX workaround for a bug in HTMLParser. Remove this once |
66 | # it's fixed. | 72 | # it's fixed in all supported versions. |
73 | # http://bugs.python.org/issue13633 | ||
67 | if name.startswith('x'): | 74 | if name.startswith('x'): |
68 | real_name = int(name.lstrip('x'), 16) | 75 | real_name = int(name.lstrip('x'), 16) |
69 | elif name.startswith('X'): | 76 | elif name.startswith('X'): |
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
72 | real_name = int(name) | 79 | real_name = int(name) |
73 | 80 | ||
74 | try: | 81 | try: |
75 | data = unichr(real_name) | 82 | data = chr(real_name) |
76 | except (ValueError, OverflowError), e: | 83 | except (ValueError, OverflowError) as e: |
77 | data = u"\N{REPLACEMENT CHARACTER}" | 84 | data = "\N{REPLACEMENT CHARACTER}" |
78 | 85 | ||
79 | self.handle_data(data) | 86 | self.handle_data(data) |
80 | 87 | ||
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
113 | 120 | ||
114 | def handle_pi(self, data): | 121 | def handle_pi(self, data): |
115 | self.soup.endData() | 122 | self.soup.endData() |
116 | if data.endswith("?") and data.lower().startswith("xml"): | ||
117 | # "An XHTML processing instruction using the trailing '?' | ||
118 | # will cause the '?' to be included in data." - HTMLParser | ||
119 | # docs. | ||
120 | # | ||
121 | # Strip the question mark so we don't end up with two | ||
122 | # question marks. | ||
123 | data = data[:-1] | ||
124 | self.soup.handle_data(data) | 123 | self.soup.handle_data(data) |
125 | self.soup.endData(ProcessingInstruction) | 124 | self.soup.endData(ProcessingInstruction) |
126 | 125 | ||
@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
128 | class HTMLParserTreeBuilder(HTMLTreeBuilder): | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): |
129 | 128 | ||
130 | is_xml = False | 129 | is_xml = False |
131 | features = [HTML, STRICT, HTMLPARSER] | 130 | picklable = True |
131 | NAME = HTMLPARSER | ||
132 | features = [NAME, HTML, STRICT] | ||
132 | 133 | ||
133 | def __init__(self, *args, **kwargs): | 134 | def __init__(self, *args, **kwargs): |
134 | if CONSTRUCTOR_TAKES_STRICT: | 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: |
135 | kwargs['strict'] = False | 136 | kwargs['strict'] = False |
137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: | ||
138 | kwargs['convert_charrefs'] = False | ||
136 | self.parser_args = (args, kwargs) | 139 | self.parser_args = (args, kwargs) |
137 | 140 | ||
138 | def prepare_markup(self, markup, user_specified_encoding=None, | 141 | def prepare_markup(self, markup, user_specified_encoding=None, |
139 | document_declared_encoding=None): | 142 | document_declared_encoding=None, exclude_encodings=None): |
140 | """ | 143 | """ |
141 | :return: A 4-tuple (markup, original encoding, encoding | 144 | :return: A 4-tuple (markup, original encoding, encoding |
142 | declared within markup, whether any characters had to be | 145 | declared within markup, whether any characters had to be |
143 | replaced with REPLACEMENT CHARACTER). | 146 | replaced with REPLACEMENT CHARACTER). |
144 | """ | 147 | """ |
145 | if isinstance(markup, unicode): | 148 | if isinstance(markup, str): |
146 | yield (markup, None, None, False) | 149 | yield (markup, None, None, False) |
147 | return | 150 | return |
148 | 151 | ||
149 | try_encodings = [user_specified_encoding, document_declared_encoding] | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] |
150 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) | 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, |
154 | exclude_encodings=exclude_encodings) | ||
151 | yield (dammit.markup, dammit.original_encoding, | 155 | yield (dammit.markup, dammit.original_encoding, |
152 | dammit.declared_html_encoding, | 156 | dammit.declared_html_encoding, |
153 | dammit.contains_replacement_characters) | 157 | dammit.contains_replacement_characters) |
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): | |||
158 | parser.soup = self.soup | 162 | parser.soup = self.soup |
159 | try: | 163 | try: |
160 | parser.feed(markup) | 164 | parser.feed(markup) |
161 | except HTMLParseError, e: | 165 | except HTMLParseError as e: |
162 | warnings.warn(RuntimeWarning( | 166 | warnings.warn(RuntimeWarning( |
163 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) | 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) |
164 | raise e | 168 | raise e |