summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/builder/_htmlparser.py')
-rw-r--r--bitbake/lib/bs4/builder/_htmlparser.py56
1 files changed, 30 insertions, 26 deletions
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index ca8d8b892b..bb0a63f2f3 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -4,10 +4,16 @@ __all__ = [
4 'HTMLParserTreeBuilder', 4 'HTMLParserTreeBuilder',
5 ] 5 ]
6 6
7from HTMLParser import ( 7from html.parser import HTMLParser
8 HTMLParser, 8
9 HTMLParseError, 9try:
10 ) 10 from html.parser import HTMLParseError
11except ImportError as e:
12 # HTMLParseError is removed in Python 3.5. Since it can never be
13 # thrown in 3.5, we can just define our own class as a placeholder.
14 class HTMLParseError(Exception):
15 pass
16
11import sys 17import sys
12import warnings 18import warnings
13 19
@@ -19,10 +25,10 @@ import warnings
19# At the end of this file, we monkeypatch HTMLParser so that 25# At the end of this file, we monkeypatch HTMLParser so that
20# strict=True works well on Python 3.2.2. 26# strict=True works well on Python 3.2.2.
21major, minor, release = sys.version_info[:3] 27major, minor, release = sys.version_info[:3]
22CONSTRUCTOR_TAKES_STRICT = ( 28CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
23 major > 3 29CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
24 or (major == 3 and minor > 2) 30CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
25 or (major == 3 and minor == 2 and release >= 3)) 31
26 32
27from bs4.element import ( 33from bs4.element import (
28 CData, 34 CData,
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
63 69
64 def handle_charref(self, name): 70 def handle_charref(self, name):
65 # XXX workaround for a bug in HTMLParser. Remove this once 71 # XXX workaround for a bug in HTMLParser. Remove this once
66 # it's fixed. 72 # it's fixed in all supported versions.
73 # http://bugs.python.org/issue13633
67 if name.startswith('x'): 74 if name.startswith('x'):
68 real_name = int(name.lstrip('x'), 16) 75 real_name = int(name.lstrip('x'), 16)
69 elif name.startswith('X'): 76 elif name.startswith('X'):
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
72 real_name = int(name) 79 real_name = int(name)
73 80
74 try: 81 try:
75 data = unichr(real_name) 82 data = chr(real_name)
76 except (ValueError, OverflowError), e: 83 except (ValueError, OverflowError) as e:
77 data = u"\N{REPLACEMENT CHARACTER}" 84 data = "\N{REPLACEMENT CHARACTER}"
78 85
79 self.handle_data(data) 86 self.handle_data(data)
80 87
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
113 120
114 def handle_pi(self, data): 121 def handle_pi(self, data):
115 self.soup.endData() 122 self.soup.endData()
116 if data.endswith("?") and data.lower().startswith("xml"):
117 # "An XHTML processing instruction using the trailing '?'
118 # will cause the '?' to be included in data." - HTMLParser
119 # docs.
120 #
121 # Strip the question mark so we don't end up with two
122 # question marks.
123 data = data[:-1]
124 self.soup.handle_data(data) 123 self.soup.handle_data(data)
125 self.soup.endData(ProcessingInstruction) 124 self.soup.endData(ProcessingInstruction)
126 125
@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser):
128class HTMLParserTreeBuilder(HTMLTreeBuilder): 127class HTMLParserTreeBuilder(HTMLTreeBuilder):
129 128
130 is_xml = False 129 is_xml = False
131 features = [HTML, STRICT, HTMLPARSER] 130 picklable = True
131 NAME = HTMLPARSER
132 features = [NAME, HTML, STRICT]
132 133
133 def __init__(self, *args, **kwargs): 134 def __init__(self, *args, **kwargs):
134 if CONSTRUCTOR_TAKES_STRICT: 135 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
135 kwargs['strict'] = False 136 kwargs['strict'] = False
137 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 kwargs['convert_charrefs'] = False
136 self.parser_args = (args, kwargs) 139 self.parser_args = (args, kwargs)
137 140
138 def prepare_markup(self, markup, user_specified_encoding=None, 141 def prepare_markup(self, markup, user_specified_encoding=None,
139 document_declared_encoding=None): 142 document_declared_encoding=None, exclude_encodings=None):
140 """ 143 """
141 :return: A 4-tuple (markup, original encoding, encoding 144 :return: A 4-tuple (markup, original encoding, encoding
142 declared within markup, whether any characters had to be 145 declared within markup, whether any characters had to be
143 replaced with REPLACEMENT CHARACTER). 146 replaced with REPLACEMENT CHARACTER).
144 """ 147 """
145 if isinstance(markup, unicode): 148 if isinstance(markup, str):
146 yield (markup, None, None, False) 149 yield (markup, None, None, False)
147 return 150 return
148 151
149 try_encodings = [user_specified_encoding, document_declared_encoding] 152 try_encodings = [user_specified_encoding, document_declared_encoding]
150 dammit = UnicodeDammit(markup, try_encodings, is_html=True) 153 dammit = UnicodeDammit(markup, try_encodings, is_html=True,
154 exclude_encodings=exclude_encodings)
151 yield (dammit.markup, dammit.original_encoding, 155 yield (dammit.markup, dammit.original_encoding,
152 dammit.declared_html_encoding, 156 dammit.declared_html_encoding,
153 dammit.contains_replacement_characters) 157 dammit.contains_replacement_characters)
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
158 parser.soup = self.soup 162 parser.soup = self.soup
159 try: 163 try:
160 parser.feed(markup) 164 parser.feed(markup)
161 except HTMLParseError, e: 165 except HTMLParseError as e:
162 warnings.warn(RuntimeWarning( 166 warnings.warn(RuntimeWarning(
163 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 167 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
164 raise e 168 raise e