diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/_htmlparser.py')
-rw-r--r-- | bitbake/lib/bs4/builder/_htmlparser.py | 433 |
1 files changed, 279 insertions, 154 deletions
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py index bb0a63f2f3..3cc187f892 100644 --- a/bitbake/lib/bs4/builder/_htmlparser.py +++ b/bitbake/lib/bs4/builder/_htmlparser.py | |||
@@ -1,35 +1,18 @@ | |||
1 | # encoding: utf-8 | ||
1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" | 2 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" |
2 | 3 | ||
4 | # Use of this source code is governed by the MIT license. | ||
5 | __license__ = "MIT" | ||
6 | |||
3 | __all__ = [ | 7 | __all__ = [ |
4 | 'HTMLParserTreeBuilder', | 8 | 'HTMLParserTreeBuilder', |
5 | ] | 9 | ] |
6 | 10 | ||
7 | from html.parser import HTMLParser | 11 | from html.parser import HTMLParser |
8 | 12 | ||
9 | try: | ||
10 | from html.parser import HTMLParseError | ||
11 | except ImportError as e: | ||
12 | # HTMLParseError is removed in Python 3.5. Since it can never be | ||
13 | # thrown in 3.5, we can just define our own class as a placeholder. | ||
14 | class HTMLParseError(Exception): | ||
15 | pass | ||
16 | |||
17 | import sys | 13 | import sys |
18 | import warnings | 14 | import warnings |
19 | 15 | ||
20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' | ||
21 | # argument, which we'd like to set to False. Unfortunately, | ||
22 | # http://bugs.python.org/issue13273 makes strict=True a better bet | ||
23 | # before Python 3.2.3. | ||
24 | # | ||
25 | # At the end of this file, we monkeypatch HTMLParser so that | ||
26 | # strict=True works well on Python 3.2.2. | ||
27 | major, minor, release = sys.version_info[:3] | ||
28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 | ||
29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 | ||
30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 | ||
31 | |||
32 | |||
33 | from bs4.element import ( | 16 | from bs4.element import ( |
34 | CData, | 17 | CData, |
35 | Comment, | 18 | Comment, |
@@ -40,6 +23,8 @@ from bs4.element import ( | |||
40 | from bs4.dammit import EntitySubstitution, UnicodeDammit | 23 | from bs4.dammit import EntitySubstitution, UnicodeDammit |
41 | 24 | ||
42 | from bs4.builder import ( | 25 | from bs4.builder import ( |
26 | DetectsXMLParsedAsHTML, | ||
27 | ParserRejectedMarkup, | ||
43 | HTML, | 28 | HTML, |
44 | HTMLTreeBuilder, | 29 | HTMLTreeBuilder, |
45 | STRICT, | 30 | STRICT, |
@@ -48,8 +33,84 @@ from bs4.builder import ( | |||
48 | 33 | ||
49 | HTMLPARSER = 'html.parser' | 34 | HTMLPARSER = 'html.parser' |
50 | 35 | ||
51 | class BeautifulSoupHTMLParser(HTMLParser): | 36 | class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): |
52 | def handle_starttag(self, name, attrs): | 37 | """A subclass of the Python standard library's HTMLParser class, which |
38 | listens for HTMLParser events and translates them into calls | ||
39 | to Beautiful Soup's tree construction API. | ||
40 | """ | ||
41 | |||
42 | # Strategies for handling duplicate attributes | ||
43 | IGNORE = 'ignore' | ||
44 | REPLACE = 'replace' | ||
45 | |||
46 | def __init__(self, *args, **kwargs): | ||
47 | """Constructor. | ||
48 | |||
49 | :param on_duplicate_attribute: A strategy for what to do if a | ||
50 | tag includes the same attribute more than once. Accepted | ||
51 | values are: REPLACE (replace earlier values with later | ||
52 | ones, the default), IGNORE (keep the earliest value | ||
53 | encountered), or a callable. A callable must take three | ||
54 | arguments: the dictionary of attributes already processed, | ||
55 | the name of the duplicate attribute, and the most recent value | ||
56 | encountered. | ||
57 | """ | ||
58 | self.on_duplicate_attribute = kwargs.pop( | ||
59 | 'on_duplicate_attribute', self.REPLACE | ||
60 | ) | ||
61 | HTMLParser.__init__(self, *args, **kwargs) | ||
62 | |||
63 | # Keep a list of empty-element tags that were encountered | ||
64 | # without an explicit closing tag. If we encounter a closing tag | ||
65 | # of this type, we'll associate it with one of those entries. | ||
66 | # | ||
67 | # This isn't a stack because we don't care about the | ||
68 | # order. It's a list of closing tags we've already handled and | ||
69 | # will ignore, assuming they ever show up. | ||
70 | self.already_closed_empty_element = [] | ||
71 | |||
72 | self._initialize_xml_detector() | ||
73 | |||
74 | def error(self, message): | ||
75 | # NOTE: This method is required so long as Python 3.9 is | ||
76 | # supported. The corresponding code is removed from HTMLParser | ||
77 | # in 3.5, but not removed from ParserBase until 3.10. | ||
78 | # https://github.com/python/cpython/issues/76025 | ||
79 | # | ||
80 | # The original implementation turned the error into a warning, | ||
81 | # but in every case I discovered, this made HTMLParser | ||
82 | # immediately crash with an error message that was less | ||
83 | # helpful than the warning. The new implementation makes it | ||
84 | # more clear that html.parser just can't parse this | ||
85 | # markup. The 3.10 implementation does the same, though it | ||
86 | # raises AssertionError rather than calling a method. (We | ||
87 | # catch this error and wrap it in a ParserRejectedMarkup.) | ||
88 | raise ParserRejectedMarkup(message) | ||
89 | |||
90 | def handle_startendtag(self, name, attrs): | ||
91 | """Handle an incoming empty-element tag. | ||
92 | |||
93 | This is only called when the markup looks like <tag/>. | ||
94 | |||
95 | :param name: Name of the tag. | ||
96 | :param attrs: Dictionary of the tag's attributes. | ||
97 | """ | ||
98 | # handle_empty_element=False tells handle_starttag not to close the tag | ||
99 | # just because its name matches a known empty-element tag. We | ||
100 | # know that this is an empty-element tag and we want to call | ||
101 | # handle_endtag ourselves. | ||
102 | tag = self.handle_starttag(name, attrs, handle_empty_element=False) | ||
103 | self.handle_endtag(name) | ||
104 | |||
105 | def handle_starttag(self, name, attrs, handle_empty_element=True): | ||
106 | """Handle an opening tag, e.g. '<tag>' | ||
107 | |||
108 | :param name: Name of the tag. | ||
109 | :param attrs: Dictionary of the tag's attributes. | ||
110 | :param handle_empty_element: True if this tag is known to be | ||
111 | an empty-element tag (i.e. there is not expected to be any | ||
112 | closing tag). | ||
113 | """ | ||
53 | # XXX namespace | 114 | # XXX namespace |
54 | attr_dict = {} | 115 | attr_dict = {} |
55 | for key, value in attrs: | 116 | for key, value in attrs: |
@@ -57,20 +118,78 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
57 | # for consistency with the other tree builders. | 118 | # for consistency with the other tree builders. |
58 | if value is None: | 119 | if value is None: |
59 | value = '' | 120 | value = '' |
60 | attr_dict[key] = value | 121 | if key in attr_dict: |
122 | # A single attribute shows up multiple times in this | ||
123 | # tag. How to handle it depends on the | ||
124 | # on_duplicate_attribute setting. | ||
125 | on_dupe = self.on_duplicate_attribute | ||
126 | if on_dupe == self.IGNORE: | ||
127 | pass | ||
128 | elif on_dupe in (None, self.REPLACE): | ||
129 | attr_dict[key] = value | ||
130 | else: | ||
131 | on_dupe(attr_dict, key, value) | ||
132 | else: | ||
133 | attr_dict[key] = value | ||
61 | attrvalue = '""' | 134 | attrvalue = '""' |
62 | self.soup.handle_starttag(name, None, None, attr_dict) | 135 | #print("START", name) |
63 | 136 | sourceline, sourcepos = self.getpos() | |
64 | def handle_endtag(self, name): | 137 | tag = self.soup.handle_starttag( |
65 | self.soup.handle_endtag(name) | 138 | name, None, None, attr_dict, sourceline=sourceline, |
66 | 139 | sourcepos=sourcepos | |
140 | ) | ||
141 | if tag and tag.is_empty_element and handle_empty_element: | ||
142 | # Unlike other parsers, html.parser doesn't send separate end tag | ||
143 | # events for empty-element tags. (It's handled in | ||
144 | # handle_startendtag, but only if the original markup looked like | ||
145 | # <tag/>.) | ||
146 | # | ||
147 | # So we need to call handle_endtag() ourselves. Since we | ||
148 | # know the start event is identical to the end event, we | ||
149 | # don't want handle_endtag() to cross off any previous end | ||
150 | # events for tags of this name. | ||
151 | self.handle_endtag(name, check_already_closed=False) | ||
152 | |||
153 | # But we might encounter an explicit closing tag for this tag | ||
154 | # later on. If so, we want to ignore it. | ||
155 | self.already_closed_empty_element.append(name) | ||
156 | |||
157 | if self._root_tag is None: | ||
158 | self._root_tag_encountered(name) | ||
159 | |||
160 | def handle_endtag(self, name, check_already_closed=True): | ||
161 | """Handle a closing tag, e.g. '</tag>' | ||
162 | |||
163 | :param name: A tag name. | ||
164 | :param check_already_closed: True if this tag is expected to | ||
165 | be the closing portion of an empty-element tag, | ||
166 | e.g. '<tag></tag>'. | ||
167 | """ | ||
168 | #print("END", name) | ||
169 | if check_already_closed and name in self.already_closed_empty_element: | ||
170 | # This is a redundant end tag for an empty-element tag. | ||
171 | # We've already called handle_endtag() for it, so just | ||
172 | # check it off the list. | ||
173 | #print("ALREADY CLOSED", name) | ||
174 | self.already_closed_empty_element.remove(name) | ||
175 | else: | ||
176 | self.soup.handle_endtag(name) | ||
177 | |||
67 | def handle_data(self, data): | 178 | def handle_data(self, data): |
179 | """Handle some textual data that shows up between tags.""" | ||
68 | self.soup.handle_data(data) | 180 | self.soup.handle_data(data) |
69 | 181 | ||
70 | def handle_charref(self, name): | 182 | def handle_charref(self, name): |
71 | # XXX workaround for a bug in HTMLParser. Remove this once | 183 | """Handle a numeric character reference by converting it to the |
72 | # it's fixed in all supported versions. | 184 | corresponding Unicode character and treating it as textual |
73 | # http://bugs.python.org/issue13633 | 185 | data. |
186 | |||
187 | :param name: Character number, possibly in hexadecimal. | ||
188 | """ | ||
189 | # TODO: This was originally a workaround for a bug in | ||
190 | # HTMLParser. (http://bugs.python.org/issue13633) The bug has | ||
191 | # been fixed, but removing this code still makes some | ||
192 | # Beautiful Soup tests fail. This needs investigation. | ||
74 | if name.startswith('x'): | 193 | if name.startswith('x'): |
75 | real_name = int(name.lstrip('x'), 16) | 194 | real_name = int(name.lstrip('x'), 16) |
76 | elif name.startswith('X'): | 195 | elif name.startswith('X'): |
@@ -78,37 +197,71 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
78 | else: | 197 | else: |
79 | real_name = int(name) | 198 | real_name = int(name) |
80 | 199 | ||
81 | try: | 200 | data = None |
82 | data = chr(real_name) | 201 | if real_name < 256: |
83 | except (ValueError, OverflowError) as e: | 202 | # HTML numeric entities are supposed to reference Unicode |
84 | data = "\N{REPLACEMENT CHARACTER}" | 203 | # code points, but sometimes they reference code points in |
85 | 204 | # some other encoding (ahem, Windows-1252). E.g. &#147; | |
205 | # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This | ||
206 | # code tries to detect this situation and compensate. | ||
207 | for encoding in (self.soup.original_encoding, 'windows-1252'): | ||
208 | if not encoding: | ||
209 | continue | ||
210 | try: | ||
211 | data = bytearray([real_name]).decode(encoding) | ||
212 | except UnicodeDecodeError as e: | ||
213 | pass | ||
214 | if not data: | ||
215 | try: | ||
216 | data = chr(real_name) | ||
217 | except (ValueError, OverflowError) as e: | ||
218 | pass | ||
219 | data = data or "\N{REPLACEMENT CHARACTER}" | ||
86 | self.handle_data(data) | 220 | self.handle_data(data) |
87 | 221 | ||
88 | def handle_entityref(self, name): | 222 | def handle_entityref(self, name): |
223 | """Handle a named entity reference by converting it to the | ||
224 | corresponding Unicode character(s) and treating it as textual | ||
225 | data. | ||
226 | |||
227 | :param name: Name of the entity reference. | ||
228 | """ | ||
89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) | 229 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) |
90 | if character is not None: | 230 | if character is not None: |
91 | data = character | 231 | data = character |
92 | else: | 232 | else: |
93 | data = "&%s;" % name | 233 | # If this were XML, it would be ambiguous whether "&foo" |
234 | # was a character entity reference with a missing | ||
235 | # semicolon or the literal string "&foo". Since this is | ||
236 | # HTML, we have a complete list of all character entity references, | ||
237 | # and this one wasn't found, so assume it's the literal string "&foo". | ||
238 | data = "&%s" % name | ||
94 | self.handle_data(data) | 239 | self.handle_data(data) |
95 | 240 | ||
96 | def handle_comment(self, data): | 241 | def handle_comment(self, data): |
242 | """Handle an HTML comment. | ||
243 | |||
244 | :param data: The text of the comment. | ||
245 | """ | ||
97 | self.soup.endData() | 246 | self.soup.endData() |
98 | self.soup.handle_data(data) | 247 | self.soup.handle_data(data) |
99 | self.soup.endData(Comment) | 248 | self.soup.endData(Comment) |
100 | 249 | ||
101 | def handle_decl(self, data): | 250 | def handle_decl(self, data): |
251 | """Handle a DOCTYPE declaration. | ||
252 | |||
253 | :param data: The text of the declaration. | ||
254 | """ | ||
102 | self.soup.endData() | 255 | self.soup.endData() |
103 | if data.startswith("DOCTYPE "): | 256 | data = data[len("DOCTYPE "):] |
104 | data = data[len("DOCTYPE "):] | ||
105 | elif data == 'DOCTYPE': | ||
106 | # i.e. "<!DOCTYPE>" | ||
107 | data = '' | ||
108 | self.soup.handle_data(data) | 257 | self.soup.handle_data(data) |
109 | self.soup.endData(Doctype) | 258 | self.soup.endData(Doctype) |
110 | 259 | ||
111 | def unknown_decl(self, data): | 260 | def unknown_decl(self, data): |
261 | """Handle a declaration of unknown type -- probably a CDATA block. | ||
262 | |||
263 | :param data: The text of the declaration. | ||
264 | """ | ||
112 | if data.upper().startswith('CDATA['): | 265 | if data.upper().startswith('CDATA['): |
113 | cls = CData | 266 | cls = CData |
114 | data = data[len('CDATA['):] | 267 | data = data[len('CDATA['):] |
@@ -119,144 +272,116 @@ class BeautifulSoupHTMLParser(HTMLParser): | |||
119 | self.soup.endData(cls) | 272 | self.soup.endData(cls) |
120 | 273 | ||
121 | def handle_pi(self, data): | 274 | def handle_pi(self, data): |
275 | """Handle a processing instruction. | ||
276 | |||
277 | :param data: The text of the instruction. | ||
278 | """ | ||
122 | self.soup.endData() | 279 | self.soup.endData() |
123 | self.soup.handle_data(data) | 280 | self.soup.handle_data(data) |
281 | self._document_might_be_xml(data) | ||
124 | self.soup.endData(ProcessingInstruction) | 282 | self.soup.endData(ProcessingInstruction) |
125 | 283 | ||
126 | 284 | ||
127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): | 285 | class HTMLParserTreeBuilder(HTMLTreeBuilder): |
128 | 286 | """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser, | |
287 | found in the Python standard library. | ||
288 | """ | ||
129 | is_xml = False | 289 | is_xml = False |
130 | picklable = True | 290 | picklable = True |
131 | NAME = HTMLPARSER | 291 | NAME = HTMLPARSER |
132 | features = [NAME, HTML, STRICT] | 292 | features = [NAME, HTML, STRICT] |
133 | 293 | ||
134 | def __init__(self, *args, **kwargs): | 294 | # The html.parser knows which line number and position in the |
135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: | 295 | # original file is the source of an element. |
136 | kwargs['strict'] = False | 296 | TRACKS_LINE_NUMBERS = True |
137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: | ||
138 | kwargs['convert_charrefs'] = False | ||
139 | self.parser_args = (args, kwargs) | ||
140 | 297 | ||
298 | def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): | ||
299 | """Constructor. | ||
300 | |||
301 | :param parser_args: Positional arguments to pass into | ||
302 | the BeautifulSoupHTMLParser constructor, once it's | ||
303 | invoked. | ||
304 | :param parser_kwargs: Keyword arguments to pass into | ||
305 | the BeautifulSoupHTMLParser constructor, once it's | ||
306 | invoked. | ||
307 | :param kwargs: Keyword arguments for the superclass constructor. | ||
308 | """ | ||
309 | # Some keyword arguments will be pulled out of kwargs and placed | ||
310 | # into parser_kwargs. | ||
311 | extra_parser_kwargs = dict() | ||
312 | for arg in ('on_duplicate_attribute',): | ||
313 | if arg in kwargs: | ||
314 | value = kwargs.pop(arg) | ||
315 | extra_parser_kwargs[arg] = value | ||
316 | super(HTMLParserTreeBuilder, self).__init__(**kwargs) | ||
317 | parser_args = parser_args or [] | ||
318 | parser_kwargs = parser_kwargs or {} | ||
319 | parser_kwargs.update(extra_parser_kwargs) | ||
320 | parser_kwargs['convert_charrefs'] = False | ||
321 | self.parser_args = (parser_args, parser_kwargs) | ||
322 | |||
141 | def prepare_markup(self, markup, user_specified_encoding=None, | 323 | def prepare_markup(self, markup, user_specified_encoding=None, |
142 | document_declared_encoding=None, exclude_encodings=None): | 324 | document_declared_encoding=None, exclude_encodings=None): |
143 | """ | 325 | |
144 | :return: A 4-tuple (markup, original encoding, encoding | 326 | """Run any preliminary steps necessary to make incoming markup |
145 | declared within markup, whether any characters had to be | 327 | acceptable to the parser. |
146 | replaced with REPLACEMENT CHARACTER). | 328 | |
329 | :param markup: Some markup -- probably a bytestring. | ||
330 | :param user_specified_encoding: The user asked to try this encoding. | ||
331 | :param document_declared_encoding: The markup itself claims to be | ||
332 | in this encoding. | ||
333 | :param exclude_encodings: The user asked _not_ to try any of | ||
334 | these encodings. | ||
335 | |||
336 | :yield: A series of 4-tuples: | ||
337 | (markup, encoding, declared encoding, | ||
338 | has undergone character replacement) | ||
339 | |||
340 | Each 4-tuple represents a strategy for converting the | ||
341 | document to Unicode and parsing it. Each strategy will be tried | ||
342 | in turn. | ||
147 | """ | 343 | """ |
148 | if isinstance(markup, str): | 344 | if isinstance(markup, str): |
345 | # Parse Unicode as-is. | ||
149 | yield (markup, None, None, False) | 346 | yield (markup, None, None, False) |
150 | return | 347 | return |
151 | 348 | ||
349 | # Ask UnicodeDammit to sniff the most likely encoding. | ||
350 | |||
351 | # This was provided by the end-user; treat it as a known | ||
352 | # definite encoding per the algorithm laid out in the HTML5 | ||
353 | # spec. (See the EncodingDetector class for details.) | ||
354 | known_definite_encodings = [user_specified_encoding] | ||
355 | |||
356 | # This was found in the document; treat it as a slightly lower-priority | ||
357 | # user encoding. | ||
358 | user_encodings = [document_declared_encoding] | ||
359 | |||
152 | try_encodings = [user_specified_encoding, document_declared_encoding] | 360 | try_encodings = [user_specified_encoding, document_declared_encoding] |
153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, | 361 | dammit = UnicodeDammit( |
154 | exclude_encodings=exclude_encodings) | 362 | markup, |
363 | known_definite_encodings=known_definite_encodings, | ||
364 | user_encodings=user_encodings, | ||
365 | is_html=True, | ||
366 | exclude_encodings=exclude_encodings | ||
367 | ) | ||
155 | yield (dammit.markup, dammit.original_encoding, | 368 | yield (dammit.markup, dammit.original_encoding, |
156 | dammit.declared_html_encoding, | 369 | dammit.declared_html_encoding, |
157 | dammit.contains_replacement_characters) | 370 | dammit.contains_replacement_characters) |
158 | 371 | ||
159 | def feed(self, markup): | 372 | def feed(self, markup): |
373 | """Run some incoming markup through some parsing process, | ||
374 | populating the `BeautifulSoup` object in self.soup. | ||
375 | """ | ||
160 | args, kwargs = self.parser_args | 376 | args, kwargs = self.parser_args |
161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) | 377 | parser = BeautifulSoupHTMLParser(*args, **kwargs) |
162 | parser.soup = self.soup | 378 | parser.soup = self.soup |
163 | try: | 379 | try: |
164 | parser.feed(markup) | 380 | parser.feed(markup) |
165 | except HTMLParseError as e: | 381 | parser.close() |
166 | warnings.warn(RuntimeWarning( | 382 | except AssertionError as e: |
167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) | 383 | # html.parser raises AssertionError in rare cases to |
168 | raise e | 384 | # indicate a fatal problem with the markup, especially |
169 | 385 | # when there's an error in the doctype declaration. | |
170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some | 386 | raise ParserRejectedMarkup(e) |
171 | # 3.2.3 code. This ensures they don't treat markup like <p></p> as a | 387 | parser.already_closed_empty_element = [] |
172 | # string. | ||
173 | # | ||
174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. | ||
175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: | ||
176 | import re | ||
177 | attrfind_tolerant = re.compile( | ||
178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' | ||
179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') | ||
180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant | ||
181 | |||
182 | locatestarttagend = re.compile(r""" | ||
183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name | ||
184 | (?:\s+ # whitespace before attribute name | ||
185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name | ||
186 | (?:\s*=\s* # value indicator | ||
187 | (?:'[^']*' # LITA-enclosed value | ||
188 | |\"[^\"]*\" # LIT-enclosed value | ||
189 | |[^'\">\s]+ # bare value | ||
190 | ) | ||
191 | )? | ||
192 | ) | ||
193 | )* | ||
194 | \s* # trailing whitespace | ||
195 | """, re.VERBOSE) | ||
196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend | ||
197 | |||
198 | from html.parser import tagfind, attrfind | ||
199 | |||
200 | def parse_starttag(self, i): | ||
201 | self.__starttag_text = None | ||
202 | endpos = self.check_for_whole_start_tag(i) | ||
203 | if endpos < 0: | ||
204 | return endpos | ||
205 | rawdata = self.rawdata | ||
206 | self.__starttag_text = rawdata[i:endpos] | ||
207 | |||
208 | # Now parse the data between i+1 and j into a tag and attrs | ||
209 | attrs = [] | ||
210 | match = tagfind.match(rawdata, i+1) | ||
211 | assert match, 'unexpected call to parse_starttag()' | ||
212 | k = match.end() | ||
213 | self.lasttag = tag = rawdata[i+1:k].lower() | ||
214 | while k < endpos: | ||
215 | if self.strict: | ||
216 | m = attrfind.match(rawdata, k) | ||
217 | else: | ||
218 | m = attrfind_tolerant.match(rawdata, k) | ||
219 | if not m: | ||
220 | break | ||
221 | attrname, rest, attrvalue = m.group(1, 2, 3) | ||
222 | if not rest: | ||
223 | attrvalue = None | ||
224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ | ||
225 | attrvalue[:1] == '"' == attrvalue[-1:]: | ||
226 | attrvalue = attrvalue[1:-1] | ||
227 | if attrvalue: | ||
228 | attrvalue = self.unescape(attrvalue) | ||
229 | attrs.append((attrname.lower(), attrvalue)) | ||
230 | k = m.end() | ||
231 | |||
232 | end = rawdata[k:endpos].strip() | ||
233 | if end not in (">", "/>"): | ||
234 | lineno, offset = self.getpos() | ||
235 | if "\n" in self.__starttag_text: | ||
236 | lineno = lineno + self.__starttag_text.count("\n") | ||
237 | offset = len(self.__starttag_text) \ | ||
238 | - self.__starttag_text.rfind("\n") | ||
239 | else: | ||
240 | offset = offset + len(self.__starttag_text) | ||
241 | if self.strict: | ||
242 | self.error("junk characters in start tag: %r" | ||
243 | % (rawdata[k:endpos][:20],)) | ||
244 | self.handle_data(rawdata[i:endpos]) | ||
245 | return endpos | ||
246 | if end.endswith('/>'): | ||
247 | # XHTML-style empty tag: <span attr="value" /> | ||
248 | self.handle_startendtag(tag, attrs) | ||
249 | else: | ||
250 | self.handle_starttag(tag, attrs) | ||
251 | if tag in self.CDATA_CONTENT_ELEMENTS: | ||
252 | self.set_cdata_mode(tag) | ||
253 | return endpos | ||
254 | |||
255 | def set_cdata_mode(self, elem): | ||
256 | self.cdata_elem = elem.lower() | ||
257 | self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | ||
258 | |||
259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag | ||
260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode | ||
261 | |||
262 | CONSTRUCTOR_TAKES_STRICT = True | ||