Diffstat (limited to 'bitbake/lib/bs4/builder/_htmlparser.py')
-rw-r--r--	bitbake/lib/bs4/builder/_htmlparser.py	258
1 file changed, 258 insertions(+), 0 deletions(-)
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000000..ca8d8b892b
--- /dev/null
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -0,0 +1,258 @@
1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" | ||
2 | |||
3 | __all__ = [ | ||
4 | 'HTMLParserTreeBuilder', | ||
5 | ] | ||
6 | |||
7 | from HTMLParser import ( | ||
8 | HTMLParser, | ||
9 | HTMLParseError, | ||
10 | ) | ||
11 | import sys | ||
12 | import warnings | ||
13 | |||
14 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' | ||
15 | # argument, which we'd like to set to False. Unfortunately, | ||
16 | # http://bugs.python.org/issue13273 makes strict=True a better bet | ||
17 | # before Python 3.2.3. | ||
18 | # | ||
19 | # At the end of this file, we monkeypatch HTMLParser so that | ||
20 | # strict=True works well on Python 3.2.2. | ||
21 | major, minor, release = sys.version_info[:3] | ||
22 | CONSTRUCTOR_TAKES_STRICT = ( | ||
23 | major > 3 | ||
24 | or (major == 3 and minor > 2) | ||
25 | or (major == 3 and minor == 2 and release >= 3)) | ||
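# For example, on Python 3.2.2 sys.version_info[:3] is (3, 2, 2), so
# CONSTRUCTOR_TAKES_STRICT is False and the compatibility patch at the
# bottom of this file kicks in; on 2.x all three tests fail and the
# plain HTMLParser() constructor is used.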

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser):
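    # HTMLParser reports a start tag's attributes as a list of
    # (name, value) tuples, using None for valueless attributes:
    # '<input type="text" disabled>' arrives as
    # [('type', 'text'), ('disabled', None)]. The handler below
    # normalizes that list into a dict for the soup object.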
    def handle_starttag(self, name, attrs):
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
            attrvalue = '""'  # dead assignment inherited from upstream bs4; never read
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

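    # Numeric character references arrive without the '&#' and ';'
    # delimiters: '&#65;' comes in as '65' and '&#x41;' as 'x41',
    # and both decode to u'A' below.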
    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

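    # Named entities likewise arrive bare: '&amp;' comes in as 'amp'
    # and is looked up in bs4's entity table, while an unknown name
    # such as 'bogus' is re-emitted as the literal text '&bogus;'.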
    def handle_entityref(self, name):
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

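    # handle_decl() receives the text between '<!' and '>', so
    # '<!DOCTYPE html>' arrives as 'DOCTYPE html' and ends up as a
    # Doctype node containing just 'html'.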
    def handle_decl(self, data):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

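    # Marked sections HTMLParser can't classify land here. A CDATA
    # section like '<![CDATA[x]]>' arrives as 'CDATA[x' and becomes
    # a CData node containing 'x'; anything else becomes a
    # Declaration node.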
    def unknown_decl(self, data):
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

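    # Processing instructions arrive stripped of '<?' and '>':
    # '<?xml version="1.0"?>' comes in as 'xml version="1.0"?', and
    # the check below drops the trailing '?' so the re-serialized
    # PI doesn't end up with two question marks.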
    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

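    # Rough illustration of the generator's behavior (assuming bytes
    # input with no declared encoding): UnicodeDammit sniffs the
    # encoding, so b'<p>caf\xc3\xa9</p>' would be yielded back as the
    # Unicode string u'<p>caf\xe9</p>' with original_encoding 'utf-8'.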
    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        Yield a 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

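# Usage sketch (not part of the upstream module): BeautifulSoup picks
# this builder when the 'html.parser' feature is requested, e.g.:
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup('<p class=intro>Hello</p>', 'html.parser')
#   soup.p['class']     # -> ['intro']
#   soup.p.string       # -> u'Hello'
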
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
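    # attrfind_tolerant is a looser version of HTMLParser's stock
    # attrfind: the '=+' accepts doubled equals signs (e.g.
    # 'href=="x"') and the lookbehind requires the attribute name to
    # follow a quote or whitespace, which helps it resynchronize in
    # sloppy markup.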

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

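    # With set_cdata_mode() below, text inside a CDATA content
    # element is only terminated by its own closing tag: inside
    # <script>, a stray '</div>' is treated as script text and only
    # '</script>' ends the element.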
    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True
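
# End-to-end sketch (hypothetical driver code, not part of this file):
#
#   from bs4 import BeautifulSoup
#   markup = b'<!DOCTYPE html><p>caf\xc3\xa9 &amp; more</p>'
#   soup = BeautifulSoup(markup, 'html.parser')
#   soup.p.get_text()          # -> u'caf\xe9 & more'
#   soup.original_encoding     # -> 'utf-8'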