summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder/_htmlparser.py')
-rw-r--r--bitbake/lib/bs4/builder/_htmlparser.py433
1 files changed, 279 insertions, 154 deletions
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index bb0a63f2f3..3cc187f892 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -1,35 +1,18 @@
1# encoding: utf-8
1"""Use the HTMLParser library to parse HTML files that aren't too bad.""" 2"""Use the HTMLParser library to parse HTML files that aren't too bad."""
2 3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
3__all__ = [ 7__all__ = [
4 'HTMLParserTreeBuilder', 8 'HTMLParserTreeBuilder',
5 ] 9 ]
6 10
7from html.parser import HTMLParser 11from html.parser import HTMLParser
8 12
9try:
10 from html.parser import HTMLParseError
11except ImportError as e:
12 # HTMLParseError is removed in Python 3.5. Since it can never be
13 # thrown in 3.5, we can just define our own class as a placeholder.
14 class HTMLParseError(Exception):
15 pass
16
17import sys 13import sys
18import warnings 14import warnings
19 15
20# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
21# argument, which we'd like to set to False. Unfortunately,
22# http://bugs.python.org/issue13273 makes strict=True a better bet
23# before Python 3.2.3.
24#
25# At the end of this file, we monkeypatch HTMLParser so that
26# strict=True works well on Python 3.2.2.
27major, minor, release = sys.version_info[:3]
28CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
29CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
30CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
31
32
33from bs4.element import ( 16from bs4.element import (
34 CData, 17 CData,
35 Comment, 18 Comment,
@@ -40,6 +23,8 @@ from bs4.element import (
40from bs4.dammit import EntitySubstitution, UnicodeDammit 23from bs4.dammit import EntitySubstitution, UnicodeDammit
41 24
42from bs4.builder import ( 25from bs4.builder import (
26 DetectsXMLParsedAsHTML,
27 ParserRejectedMarkup,
43 HTML, 28 HTML,
44 HTMLTreeBuilder, 29 HTMLTreeBuilder,
45 STRICT, 30 STRICT,
@@ -48,8 +33,84 @@ from bs4.builder import (
48 33
49HTMLPARSER = 'html.parser' 34HTMLPARSER = 'html.parser'
50 35
51class BeautifulSoupHTMLParser(HTMLParser): 36class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
52 def handle_starttag(self, name, attrs): 37 """A subclass of the Python standard library's HTMLParser class, which
38 listens for HTMLParser events and translates them into calls
39 to Beautiful Soup's tree construction API.
40 """
41
42 # Strategies for handling duplicate attributes
43 IGNORE = 'ignore'
44 REPLACE = 'replace'
45
46 def __init__(self, *args, **kwargs):
47 """Constructor.
48
49 :param on_duplicate_attribute: A strategy for what to do if a
50 tag includes the same attribute more than once. Accepted
51 values are: REPLACE (replace earlier values with later
52 ones, the default), IGNORE (keep the earliest value
53 encountered), or a callable. A callable must take three
54 arguments: the dictionary of attributes already processed,
55 the name of the duplicate attribute, and the most recent value
56 encountered.
57 """
58 self.on_duplicate_attribute = kwargs.pop(
59 'on_duplicate_attribute', self.REPLACE
60 )
61 HTMLParser.__init__(self, *args, **kwargs)
62
63 # Keep a list of empty-element tags that were encountered
64 # without an explicit closing tag. If we encounter a closing tag
65 # of this type, we'll associate it with one of those entries.
66 #
67 # This isn't a stack because we don't care about the
68 # order. It's a list of closing tags we've already handled and
69 # will ignore, assuming they ever show up.
70 self.already_closed_empty_element = []
71
72 self._initialize_xml_detector()
73
74 def error(self, message):
75 # NOTE: This method is required so long as Python 3.9 is
76 # supported. The corresponding code is removed from HTMLParser
77 # in 3.5, but not removed from ParserBase until 3.10.
78 # https://github.com/python/cpython/issues/76025
79 #
80 # The original implementation turned the error into a warning,
81 # but in every case I discovered, this made HTMLParser
82 # immediately crash with an error message that was less
83 # helpful than the warning. The new implementation makes it
84 # more clear that html.parser just can't parse this
85 # markup. The 3.10 implementation does the same, though it
86 # raises AssertionError rather than calling a method. (We
87 # catch this error and wrap it in a ParserRejectedMarkup.)
88 raise ParserRejectedMarkup(message)
89
90 def handle_startendtag(self, name, attrs):
91 """Handle an incoming empty-element tag.
92
93 This is only called when the markup looks like <tag/>.
94
95 :param name: Name of the tag.
96 :param attrs: List of (name, value) pairs for the tag's attributes.
97 """
98 # handle_empty_element=False tells handle_starttag not to close the tag
99 # just because its name matches a known empty-element tag. We
100 # know that this is an empty-element tag and we want to call
101 # handle_endtag ourselves.
102 tag = self.handle_starttag(name, attrs, handle_empty_element=False)
103 self.handle_endtag(name)
104
105 def handle_starttag(self, name, attrs, handle_empty_element=True):
106 """Handle an opening tag, e.g. '<tag>'
107
108 :param name: Name of the tag.
109 :param attrs: List of (name, value) pairs for the tag's attributes.
110 :param handle_empty_element: If True (the default) and this
111 turns out to be an empty-element tag, call handle_endtag()
112 for it immediately; handle_startendtag() passes False.
113 """
53 # XXX namespace 114 # XXX namespace
54 attr_dict = {} 115 attr_dict = {}
55 for key, value in attrs: 116 for key, value in attrs:
@@ -57,20 +118,78 @@ class BeautifulSoupHTMLParser(HTMLParser):
57 # for consistency with the other tree builders. 118 # for consistency with the other tree builders.
58 if value is None: 119 if value is None:
59 value = '' 120 value = ''
60 attr_dict[key] = value 121 if key in attr_dict:
122 # A single attribute shows up multiple times in this
123 # tag. How to handle it depends on the
124 # on_duplicate_attribute setting.
125 on_dupe = self.on_duplicate_attribute
126 if on_dupe == self.IGNORE:
127 pass
128 elif on_dupe in (None, self.REPLACE):
129 attr_dict[key] = value
130 else:
131 on_dupe(attr_dict, key, value)
132 else:
133 attr_dict[key] = value
61 attrvalue = '""' 134 attrvalue = '""'
62 self.soup.handle_starttag(name, None, None, attr_dict) 135 #print("START", name)
63 136 sourceline, sourcepos = self.getpos()
64 def handle_endtag(self, name): 137 tag = self.soup.handle_starttag(
65 self.soup.handle_endtag(name) 138 name, None, None, attr_dict, sourceline=sourceline,
66 139 sourcepos=sourcepos
140 )
141 if tag and tag.is_empty_element and handle_empty_element:
142 # Unlike other parsers, html.parser doesn't send separate end tag
143 # events for empty-element tags. (It's handled in
144 # handle_startendtag, but only if the original markup looked like
145 # <tag/>.)
146 #
147 # So we need to call handle_endtag() ourselves. Since we
148 # know the start event is identical to the end event, we
149 # don't want handle_endtag() to cross off any previous end
150 # events for tags of this name.
151 self.handle_endtag(name, check_already_closed=False)
152
153 # But we might encounter an explicit closing tag for this tag
154 # later on. If so, we want to ignore it.
155 self.already_closed_empty_element.append(name)
156
157 if self._root_tag is None:
158 self._root_tag_encountered(name)
159
160 def handle_endtag(self, name, check_already_closed=True):
161 """Handle a closing tag, e.g. '</tag>'
162
163 :param name: A tag name.
164 :param check_already_closed: If True (the default), an end tag
165 whose name is in already_closed_empty_element is treated as
166 redundant and is only crossed off that list.
167 """
168 #print("END", name)
169 if check_already_closed and name in self.already_closed_empty_element:
170 # This is a redundant end tag for an empty-element tag.
171 # We've already called handle_endtag() for it, so just
172 # check it off the list.
173 #print("ALREADY CLOSED", name)
174 self.already_closed_empty_element.remove(name)
175 else:
176 self.soup.handle_endtag(name)
177
67 def handle_data(self, data): 178 def handle_data(self, data):
179 """Handle some textual data that shows up between tags."""
68 self.soup.handle_data(data) 180 self.soup.handle_data(data)
69 181
70 def handle_charref(self, name): 182 def handle_charref(self, name):
71 # XXX workaround for a bug in HTMLParser. Remove this once 183 """Handle a numeric character reference by converting it to the
72 # it's fixed in all supported versions. 184 corresponding Unicode character and treating it as textual
73 # http://bugs.python.org/issue13633 185 data.
186
187 :param name: Character number, possibly in hexadecimal.
188 """
189 # TODO: This was originally a workaround for a bug in
190 # HTMLParser. (http://bugs.python.org/issue13633) The bug has
191 # been fixed, but removing this code still makes some
192 # Beautiful Soup tests fail. This needs investigation.
74 if name.startswith('x'): 193 if name.startswith('x'):
75 real_name = int(name.lstrip('x'), 16) 194 real_name = int(name.lstrip('x'), 16)
76 elif name.startswith('X'): 195 elif name.startswith('X'):
@@ -78,37 +197,71 @@ class BeautifulSoupHTMLParser(HTMLParser):
78 else: 197 else:
79 real_name = int(name) 198 real_name = int(name)
80 199
81 try: 200 data = None
82 data = chr(real_name) 201 if real_name < 256:
83 except (ValueError, OverflowError) as e: 202 # HTML numeric entities are supposed to reference Unicode
84 data = "\N{REPLACEMENT CHARACTER}" 203 # code points, but sometimes they reference code points in
85 204 # some other encoding (ahem, Windows-1252). E.g. &#147;
205 # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
206 # code tries to detect this situation and compensate.
207 for encoding in (self.soup.original_encoding, 'windows-1252'):
208 if not encoding:
209 continue
210 try:
211 data = bytearray([real_name]).decode(encoding)
212 except UnicodeDecodeError as e:
213 pass
214 if not data:
215 try:
216 data = chr(real_name)
217 except (ValueError, OverflowError) as e:
218 pass
219 data = data or "\N{REPLACEMENT CHARACTER}"
86 self.handle_data(data) 220 self.handle_data(data)
87 221
88 def handle_entityref(self, name): 222 def handle_entityref(self, name):
223 """Handle a named entity reference by converting it to the
224 corresponding Unicode character(s) and treating it as textual
225 data.
226
227 :param name: Name of the entity reference.
228 """
89 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 229 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
90 if character is not None: 230 if character is not None:
91 data = character 231 data = character
92 else: 232 else:
93 data = "&%s;" % name 233 # If this were XML, it would be ambiguous whether "&foo"
234 # was a character entity reference with a missing
235 # semicolon or the literal string "&foo". Since this is
236 # HTML, we have a complete list of all character entity references,
237 # and this one wasn't found, so assume it's the literal string "&foo".
238 data = "&%s" % name
94 self.handle_data(data) 239 self.handle_data(data)
95 240
96 def handle_comment(self, data): 241 def handle_comment(self, data):
242 """Handle an HTML comment.
243
244 :param data: The text of the comment.
245 """
97 self.soup.endData() 246 self.soup.endData()
98 self.soup.handle_data(data) 247 self.soup.handle_data(data)
99 self.soup.endData(Comment) 248 self.soup.endData(Comment)
100 249
101 def handle_decl(self, data): 250 def handle_decl(self, data):
251 """Handle a DOCTYPE declaration.
252
253 :param data: The text of the declaration.
254 """
102 self.soup.endData() 255 self.soup.endData()
103 if data.startswith("DOCTYPE "): 256 data = data[len("DOCTYPE "):]
104 data = data[len("DOCTYPE "):]
105 elif data == 'DOCTYPE':
106 # i.e. "<!DOCTYPE>"
107 data = ''
108 self.soup.handle_data(data) 257 self.soup.handle_data(data)
109 self.soup.endData(Doctype) 258 self.soup.endData(Doctype)
110 259
111 def unknown_decl(self, data): 260 def unknown_decl(self, data):
261 """Handle a declaration of unknown type -- probably a CDATA block.
262
263 :param data: The text of the declaration.
264 """
112 if data.upper().startswith('CDATA['): 265 if data.upper().startswith('CDATA['):
113 cls = CData 266 cls = CData
114 data = data[len('CDATA['):] 267 data = data[len('CDATA['):]
@@ -119,144 +272,116 @@ class BeautifulSoupHTMLParser(HTMLParser):
119 self.soup.endData(cls) 272 self.soup.endData(cls)
120 273
121 def handle_pi(self, data): 274 def handle_pi(self, data):
275 """Handle a processing instruction.
276
277 :param data: The text of the instruction.
278 """
122 self.soup.endData() 279 self.soup.endData()
123 self.soup.handle_data(data) 280 self.soup.handle_data(data)
281 self._document_might_be_xml(data)
124 self.soup.endData(ProcessingInstruction) 282 self.soup.endData(ProcessingInstruction)
125 283
126 284
127class HTMLParserTreeBuilder(HTMLTreeBuilder): 285class HTMLParserTreeBuilder(HTMLTreeBuilder):
128 286 """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
287 found in the Python standard library.
288 """
129 is_xml = False 289 is_xml = False
130 picklable = True 290 picklable = True
131 NAME = HTMLPARSER 291 NAME = HTMLPARSER
132 features = [NAME, HTML, STRICT] 292 features = [NAME, HTML, STRICT]
133 293
134 def __init__(self, *args, **kwargs): 294 # The html.parser knows which line number and position in the
135 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 295 # original file is the source of an element.
136 kwargs['strict'] = False 296 TRACKS_LINE_NUMBERS = True
137 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 kwargs['convert_charrefs'] = False
139 self.parser_args = (args, kwargs)
140 297
298 def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
299 """Constructor.
300
301 :param parser_args: Positional arguments to pass into
302 the BeautifulSoupHTMLParser constructor, once it's
303 invoked.
304 :param parser_kwargs: Keyword arguments to pass into
305 the BeautifulSoupHTMLParser constructor, once it's
306 invoked.
307 :param kwargs: Keyword arguments for the superclass constructor.
308 """
309 # Some keyword arguments will be pulled out of kwargs and placed
310 # into parser_kwargs.
311 extra_parser_kwargs = dict()
312 for arg in ('on_duplicate_attribute',):
313 if arg in kwargs:
314 value = kwargs.pop(arg)
315 extra_parser_kwargs[arg] = value
316 super(HTMLParserTreeBuilder, self).__init__(**kwargs)
317 parser_args = parser_args or []
318 parser_kwargs = parser_kwargs or {}
319 parser_kwargs.update(extra_parser_kwargs)
320 parser_kwargs['convert_charrefs'] = False
321 self.parser_args = (parser_args, parser_kwargs)
322
141 def prepare_markup(self, markup, user_specified_encoding=None, 323 def prepare_markup(self, markup, user_specified_encoding=None,
142 document_declared_encoding=None, exclude_encodings=None): 324 document_declared_encoding=None, exclude_encodings=None):
143 """ 325
144 :return: A 4-tuple (markup, original encoding, encoding 326 """Run any preliminary steps necessary to make incoming markup
145 declared within markup, whether any characters had to be 327 acceptable to the parser.
146 replaced with REPLACEMENT CHARACTER). 328
329 :param markup: Some markup -- probably a bytestring.
330 :param user_specified_encoding: The user asked to try this encoding.
331 :param document_declared_encoding: The markup itself claims to be
332 in this encoding.
333 :param exclude_encodings: The user asked _not_ to try any of
334 these encodings.
335
336 :yield: A series of 4-tuples:
337 (markup, encoding, declared encoding,
338 has undergone character replacement)
339
340 Each 4-tuple represents a strategy for converting the
341 document to Unicode and parsing it. Each strategy will be tried
342 in turn.
147 """ 343 """
148 if isinstance(markup, str): 344 if isinstance(markup, str):
345 # Parse Unicode as-is.
149 yield (markup, None, None, False) 346 yield (markup, None, None, False)
150 return 347 return
151 348
349 # Ask UnicodeDammit to sniff the most likely encoding.
350
351 # This was provided by the end-user; treat it as a known
352 # definite encoding per the algorithm laid out in the HTML5
353 # spec. (See the EncodingDetector class for details.)
354 known_definite_encodings = [user_specified_encoding]
355
356 # This was found in the document; treat it as a slightly lower-priority
357 # user encoding.
358 user_encodings = [document_declared_encoding]
359
152 try_encodings = [user_specified_encoding, document_declared_encoding] 360 try_encodings = [user_specified_encoding, document_declared_encoding]
153 dammit = UnicodeDammit(markup, try_encodings, is_html=True, 361 dammit = UnicodeDammit(
154 exclude_encodings=exclude_encodings) 362 markup,
363 known_definite_encodings=known_definite_encodings,
364 user_encodings=user_encodings,
365 is_html=True,
366 exclude_encodings=exclude_encodings
367 )
155 yield (dammit.markup, dammit.original_encoding, 368 yield (dammit.markup, dammit.original_encoding,
156 dammit.declared_html_encoding, 369 dammit.declared_html_encoding,
157 dammit.contains_replacement_characters) 370 dammit.contains_replacement_characters)
158 371
159 def feed(self, markup): 372 def feed(self, markup):
373 """Run some incoming markup through some parsing process,
374 populating the `BeautifulSoup` object in self.soup.
375 """
160 args, kwargs = self.parser_args 376 args, kwargs = self.parser_args
161 parser = BeautifulSoupHTMLParser(*args, **kwargs) 377 parser = BeautifulSoupHTMLParser(*args, **kwargs)
162 parser.soup = self.soup 378 parser.soup = self.soup
163 try: 379 try:
164 parser.feed(markup) 380 parser.feed(markup)
165 except HTMLParseError as e: 381 parser.close()
166 warnings.warn(RuntimeWarning( 382 except AssertionError as e:
167 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 383 # html.parser raises AssertionError in rare cases to
168 raise e 384 # indicate a fatal problem with the markup, especially
169 385 # when there's an error in the doctype declaration.
170# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 386 raise ParserRejectedMarkup(e)
171# 3.2.3 code. This ensures they don't treat markup like <p></p> as a 387 parser.already_closed_empty_element = []
172# string.
173#
174# XXX This code can be removed once most Python 3 users are on 3.2.3.
175if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
176 import re
177 attrfind_tolerant = re.compile(
178 r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
179 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
180 HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
181
182 locatestarttagend = re.compile(r"""
183 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
184 (?:\s+ # whitespace before attribute name
185 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
186 (?:\s*=\s* # value indicator
187 (?:'[^']*' # LITA-enclosed value
188 |\"[^\"]*\" # LIT-enclosed value
189 |[^'\">\s]+ # bare value
190 )
191 )?
192 )
193 )*
194 \s* # trailing whitespace
195""", re.VERBOSE)
196 BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
197
198 from html.parser import tagfind, attrfind
199
200 def parse_starttag(self, i):
201 self.__starttag_text = None
202 endpos = self.check_for_whole_start_tag(i)
203 if endpos < 0:
204 return endpos
205 rawdata = self.rawdata
206 self.__starttag_text = rawdata[i:endpos]
207
208 # Now parse the data between i+1 and j into a tag and attrs
209 attrs = []
210 match = tagfind.match(rawdata, i+1)
211 assert match, 'unexpected call to parse_starttag()'
212 k = match.end()
213 self.lasttag = tag = rawdata[i+1:k].lower()
214 while k < endpos:
215 if self.strict:
216 m = attrfind.match(rawdata, k)
217 else:
218 m = attrfind_tolerant.match(rawdata, k)
219 if not m:
220 break
221 attrname, rest, attrvalue = m.group(1, 2, 3)
222 if not rest:
223 attrvalue = None
224 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
225 attrvalue[:1] == '"' == attrvalue[-1:]:
226 attrvalue = attrvalue[1:-1]
227 if attrvalue:
228 attrvalue = self.unescape(attrvalue)
229 attrs.append((attrname.lower(), attrvalue))
230 k = m.end()
231
232 end = rawdata[k:endpos].strip()
233 if end not in (">", "/>"):
234 lineno, offset = self.getpos()
235 if "\n" in self.__starttag_text:
236 lineno = lineno + self.__starttag_text.count("\n")
237 offset = len(self.__starttag_text) \
238 - self.__starttag_text.rfind("\n")
239 else:
240 offset = offset + len(self.__starttag_text)
241 if self.strict:
242 self.error("junk characters in start tag: %r"
243 % (rawdata[k:endpos][:20],))
244 self.handle_data(rawdata[i:endpos])
245 return endpos
246 if end.endswith('/>'):
247 # XHTML-style empty tag: <span attr="value" />
248 self.handle_startendtag(tag, attrs)
249 else:
250 self.handle_starttag(tag, attrs)
251 if tag in self.CDATA_CONTENT_ELEMENTS:
252 self.set_cdata_mode(tag)
253 return endpos
254
255 def set_cdata_mode(self, elem):
256 self.cdata_elem = elem.lower()
257 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
258
259 BeautifulSoupHTMLParser.parse_starttag = parse_starttag
260 BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
261
262 CONSTRUCTOR_TAKES_STRICT = True