From 12fa81e8d67f0d9755decde5c5b766f56b2af8db Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Fri, 31 May 2024 12:04:03 +0100 Subject: bs4: Update to 4.12.3 from 4.4.1 It makes sense to switch to a more recent version and keep up to date with upstream changes and things like new python version support. (Bitbake rev: f5462156036e71911c66d07dbf3303cde862785b) Signed-off-by: Richard Purdie --- bitbake/lib/bs4/builder/_htmlparser.py | 433 +++++++++++++++++++++------------ 1 file changed, 279 insertions(+), 154 deletions(-) (limited to 'bitbake/lib/bs4/builder/_htmlparser.py') diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py index bb0a63f2f3..3cc187f892 100644 --- a/bitbake/lib/bs4/builder/_htmlparser.py +++ b/bitbake/lib/bs4/builder/_htmlparser.py @@ -1,35 +1,18 @@ +# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + __all__ = [ 'HTMLParserTreeBuilder', ] from html.parser import HTMLParser -try: - from html.parser import HTMLParseError -except ImportError as e: - # HTMLParseError is removed in Python 3.5. Since it can never be - # thrown in 3.5, we can just define our own class as a placeholder. - class HTMLParseError(Exception): - pass - import sys import warnings -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. -# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. -major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 -CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 -CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 - - from bs4.element import ( CData, Comment, @@ -40,6 +23,8 @@ from bs4.element import ( from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.builder import ( + DetectsXMLParsedAsHTML, + ParserRejectedMarkup, HTML, HTMLTreeBuilder, STRICT, @@ -48,8 +33,84 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' -class BeautifulSoupHTMLParser(HTMLParser): - def handle_starttag(self, name, attrs): +class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): + """A subclass of the Python standard library's HTMLParser class, which + listens for HTMLParser events and translates them into calls + to Beautiful Soup's tree construction API. + """ + + # Strategies for handling duplicate attributes + IGNORE = 'ignore' + REPLACE = 'replace' + + def __init__(self, *args, **kwargs): + """Constructor. + + :param on_duplicate_attribute: A strategy for what to do if a + tag includes the same attribute more than once. Accepted + values are: REPLACE (replace earlier values with later + ones, the default), IGNORE (keep the earliest value + encountered), or a callable. A callable must take three + arguments: the dictionary of attributes already processed, + the name of the duplicate attribute, and the most recent value + encountered. + """ + self.on_duplicate_attribute = kwargs.pop( + 'on_duplicate_attribute', self.REPLACE + ) + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + self._initialize_xml_detector() + + def error(self, message): + # NOTE: This method is required so long as Python 3.9 is + # supported. The corresponding code is removed from HTMLParser + # in 3.5, but not removed from ParserBase until 3.10. + # https://github.com/python/cpython/issues/76025 + # + # The original implementation turned the error into a warning, + # but in every case I discovered, this made HTMLParser + # immediately crash with an error message that was less + # helpful than the warning. The new implementation makes it + # more clear that html.parser just can't parse this + # markup. The 3.10 implementation does the same, though it + # raises AssertionError rather than calling a method. (We + # catch this error and wrap it in a ParserRejectedMarkup.) + raise ParserRejectedMarkup(message) + + def handle_startendtag(self, name, attrs): + """Handle an incoming empty-element tag. + + This is only called when the markup looks like . + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + """ + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): + """Handle an opening tag, e.g. '' + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + :param handle_empty_element: True if this tag is known to be + an empty-element tag (i.e. there is not expected to be any + closing tag). + """ # XXX namespace attr_dict = {} for key, value in attrs: @@ -57,20 +118,78 @@ class BeautifulSoupHTMLParser(HTMLParser): # for consistency with the other tree builders. if value is None: value = '' - attr_dict[key] = value + if key in attr_dict: + # A single attribute shows up multiple times in this + # tag. How to handle it depends on the + # on_duplicate_attribute setting. + on_dupe = self.on_duplicate_attribute + if on_dupe == self.IGNORE: + pass + elif on_dupe in (None, self.REPLACE): + attr_dict[key] = value + else: + on_dupe(attr_dict, key, value) + else: + attr_dict[key] = value attrvalue = '""' - self.soup.handle_starttag(name, None, None, attr_dict) - - def handle_endtag(self, name): - self.soup.handle_endtag(name) - + #print("START", name) + sourceline, sourcepos = self.getpos() + tag = self.soup.handle_starttag( + name, None, None, attr_dict, sourceline=sourceline, + sourcepos=sourcepos + ) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # .) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + if self._root_tag is None: + self._root_tag_encountered(name) + + def handle_endtag(self, name, check_already_closed=True): + """Handle a closing tag, e.g. '' + + :param name: A tag name. + :param check_already_closed: True if this tag is expected to + be the closing portion of an empty-element tag, + e.g. ''. + """ + #print("END", name) + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + #print("ALREADY CLOSED", name) + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) + def handle_data(self, data): + """Handle some textual data that shows up between tags.""" self.soup.handle_data(data) def handle_charref(self, name): - # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed in all supported versions. - # http://bugs.python.org/issue13633 + """Handle a numeric character reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Character number, possibly in hexadecimal. + """ + # TODO: This was originally a workaround for a bug in + # HTMLParser. (http://bugs.python.org/issue13633) The bug has + # been fixed, but removing this code still makes some + # Beautiful Soup tests fail. This needs investigation. if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -78,37 +197,71 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - try: - data = chr(real_name) - except (ValueError, OverflowError) as e: - data = "\N{REPLACEMENT CHARACTER}" - + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError as e: + pass + if not data: + try: + data = chr(real_name) + except (ValueError, OverflowError) as e: + pass + data = data or "\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): + """Handle a named entity reference by converting it to the + corresponding Unicode character(s) and treating it as textual + data. + + :param name: Name of the entity reference. + """ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) if character is not None: data = character else: - data = "&%s;" % name + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name self.handle_data(data) def handle_comment(self, data): + """Handle an HTML comment. + + :param data: The text of the comment. + """ self.soup.endData() self.soup.handle_data(data) self.soup.endData(Comment) def handle_decl(self, data): + """Handle a DOCTYPE declaration. + + :param data: The text of the declaration. + """ self.soup.endData() - if data.startswith("DOCTYPE "): - data = data[len("DOCTYPE "):] - elif data == 'DOCTYPE': - # i.e. "" - data = '' + data = data[len("DOCTYPE "):] self.soup.handle_data(data) self.soup.endData(Doctype) def unknown_decl(self, data): + """Handle a declaration of unknown type -- probably a CDATA block. + + :param data: The text of the declaration. + """ if data.upper().startswith('CDATA['): cls = CData data = data[len('CDATA['):] @@ -119,144 +272,116 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData(cls) def handle_pi(self, data): + """Handle a processing instruction. + + :param data: The text of the instruction. + """ self.soup.endData() self.soup.handle_data(data) + self._document_might_be_xml(data) self.soup.endData(ProcessingInstruction) class HTMLParserTreeBuilder(HTMLTreeBuilder): - + """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, + found in the Python standard library. + """ is_xml = False picklable = True NAME = HTMLPARSER features = [NAME, HTML, STRICT] - def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - kwargs['strict'] = False - if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - kwargs['convert_charrefs'] = False - self.parser_args = (args, kwargs) + # The html.parser knows which line number and position in the + # original file is the source of an element. + TRACKS_LINE_NUMBERS = True + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + """Constructor. + + :param parser_args: Positional arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param parser_kwargs: Keyword arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param kwargs: Keyword arguments for the superclass constructor. + """ + # Some keyword arguments will be pulled out of kwargs and placed + # into parser_kwargs. + extra_parser_kwargs = dict() + for arg in ('on_duplicate_attribute',): + if arg in kwargs: + value = kwargs.pop(arg) + extra_parser_kwargs[arg] = value + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + parser_kwargs.update(extra_parser_kwargs) + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) + def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). + + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. """ if isinstance(markup, str): + # Parse Unicode as-is. yield (markup, None, None, False) return + # Ask UnicodeDammit to sniff the most likely encoding. + + # This was provided by the end-user; treat it as a known + # definite encoding per the algorithm laid out in the HTML5 + # spec. (See the EncodingDetector class for details.) + known_definite_encodings = [user_specified_encoding] + + # This was found in the document; treat it as a slightly lower-priority + # user encoding. + user_encodings = [document_declared_encoding] + try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True, - exclude_encodings=exclude_encodings) + dammit = UnicodeDammit( + markup, + known_definite_encodings=known_definite_encodings, + user_encodings=user_encodings, + is_html=True, + exclude_encodings=exclude_encodings + ) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + """ args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError as e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like

as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True + parser.close() + except AssertionError as e: + # html.parser raises AssertionError in rare cases to + # indicate a fatal problem with the markup, especially + # when there's an error in the doctype declaration. + raise ParserRejectedMarkup(e) + parser.already_closed_empty_element = [] -- cgit v1.2.3-54-g00ecf