summary refs log tree commit diff stats
path: root/bitbake/lib/bs4/diagnose.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/diagnose.py')
-rw-r--r-- bitbake/lib/bs4/diagnose.py 83
1 files changed, 50 insertions, 33 deletions
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
index 083395fb46..e079772e69 100644
--- a/bitbake/lib/bs4/diagnose.py
+++ b/bitbake/lib/bs4/diagnose.py
@@ -1,9 +1,10 @@
1"""Diagnostic functions, mainly for use when doing tech support.""" 1"""Diagnostic functions, mainly for use when doing tech support."""
2 2
3# Use of this source code is governed by the MIT license.
3__license__ = "MIT" 4__license__ = "MIT"
4 5
5import cProfile 6import cProfile
6from io import StringIO 7from io import BytesIO
7from html.parser import HTMLParser 8from html.parser import HTMLParser
8import bs4 9import bs4
9from bs4 import BeautifulSoup, __version__ 10from bs4 import BeautifulSoup, __version__
@@ -19,9 +20,13 @@ import sys
19import cProfile 20import cProfile
20 21
21def diagnose(data): 22def diagnose(data):
22 """Diagnostic suite for isolating common problems.""" 23 """Diagnostic suite for isolating common problems.
23 print("Diagnostic running on Beautiful Soup %s" % __version__) 24
24 print("Python version %s" % sys.version) 25 :param data: A string containing markup that needs to be explained.
26 :return: None; diagnostics are printed to standard output.
27 """
28 print(("Diagnostic running on Beautiful Soup %s" % __version__))
29 print(("Python version %s" % sys.version))
25 30
26 basic_parsers = ["html.parser", "html5lib", "lxml"] 31 basic_parsers = ["html.parser", "html5lib", "lxml"]
27 for name in basic_parsers: 32 for name in basic_parsers:
@@ -35,61 +40,70 @@ def diagnose(data):
35 name)) 40 name))
36 41
37 if 'lxml' in basic_parsers: 42 if 'lxml' in basic_parsers:
38 basic_parsers.append(["lxml", "xml"]) 43 basic_parsers.append("lxml-xml")
39 try: 44 try:
40 from lxml import etree 45 from lxml import etree
41 print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) 46 print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
42 except ImportError as e: 47 except ImportError as e:
43 print ( 48 print(
44 "lxml is not installed or couldn't be imported.") 49 "lxml is not installed or couldn't be imported.")
45 50
46 51
47 if 'html5lib' in basic_parsers: 52 if 'html5lib' in basic_parsers:
48 try: 53 try:
49 import html5lib 54 import html5lib
50 print("Found html5lib version %s" % html5lib.__version__) 55 print(("Found html5lib version %s" % html5lib.__version__))
51 except ImportError as e: 56 except ImportError as e:
52 print ( 57 print(
53 "html5lib is not installed or couldn't be imported.") 58 "html5lib is not installed or couldn't be imported.")
54 59
55 if hasattr(data, 'read'): 60 if hasattr(data, 'read'):
56 data = data.read() 61 data = data.read()
57 elif os.path.exists(data):
58 print('"%s" looks like a filename. Reading data from the file.' % data)
59 data = open(data).read()
60 elif data.startswith("http:") or data.startswith("https:"):
61 print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
62 print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
63 return
64 print()
65 62
66 for parser in basic_parsers: 63 for parser in basic_parsers:
67 print("Trying to parse your markup with %s" % parser) 64 print(("Trying to parse your markup with %s" % parser))
68 success = False 65 success = False
69 try: 66 try:
70 soup = BeautifulSoup(data, parser) 67 soup = BeautifulSoup(data, features=parser)
71 success = True 68 success = True
72 except Exception as e: 69 except Exception as e:
73 print("%s could not parse the markup." % parser) 70 print(("%s could not parse the markup." % parser))
74 traceback.print_exc() 71 traceback.print_exc()
75 if success: 72 if success:
76 print("Here's what %s did with the markup:" % parser) 73 print(("Here's what %s did with the markup:" % parser))
77 print(soup.prettify()) 74 print((soup.prettify()))
78 75
79 print("-" * 80) 76 print(("-" * 80))
80 77
81def lxml_trace(data, html=True, **kwargs): 78def lxml_trace(data, html=True, **kwargs):
82 """Print out the lxml events that occur during parsing. 79 """Print out the lxml events that occur during parsing.
83 80
84 This lets you see how lxml parses a document when no Beautiful 81 This lets you see how lxml parses a document when no Beautiful
85 Soup code is running. 82 Soup code is running. You can use this to determine whether
83 an lxml-specific problem is in Beautiful Soup's lxml tree builders
84 or in lxml itself.
85
86 :param data: Some markup.
87 :param html: If True, markup will be parsed with lxml's HTML parser.
88 if False, lxml's XML parser will be used.
86 """ 89 """
87 from lxml import etree 90 from lxml import etree
88 for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 91 recover = kwargs.pop('recover', True)
92 if isinstance(data, str):
93 data = data.encode("utf8")
94 reader = BytesIO(data)
95 for event, element in etree.iterparse(
96 reader, html=html, recover=recover, **kwargs
97 ):
89 print(("%s, %4s, %s" % (event, element.tag, element.text))) 98 print(("%s, %4s, %s" % (event, element.tag, element.text)))
90 99
91class AnnouncingParser(HTMLParser): 100class AnnouncingParser(HTMLParser):
92 """Announces HTMLParser parse events, without doing anything else.""" 101 """Subclass of HTMLParser that announces parse events, without doing
102 anything else.
103
104 You can use this to get a picture of how html.parser sees a given
105 document. The easiest way to do this is to call `htmlparser_trace`.
106 """
93 107
94 def _p(self, s): 108 def _p(self, s):
95 print(s) 109 print(s)
@@ -126,6 +140,8 @@ def htmlparser_trace(data):
126 140
127 This lets you see how HTMLParser parses a document when no 141 This lets you see how HTMLParser parses a document when no
128 Beautiful Soup code is running. 142 Beautiful Soup code is running.
143
144 :param data: Some markup.
129 """ 145 """
130 parser = AnnouncingParser() 146 parser = AnnouncingParser()
131 parser.feed(data) 147 parser.feed(data)
@@ -168,9 +184,9 @@ def rdoc(num_elements=1000):
168 184
169def benchmark_parsers(num_elements=100000): 185def benchmark_parsers(num_elements=100000):
170 """Very basic head-to-head performance benchmark.""" 186 """Very basic head-to-head performance benchmark."""
171 print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 187 print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
172 data = rdoc(num_elements) 188 data = rdoc(num_elements)
173 print("Generated a large invalid HTML document (%d bytes)." % len(data)) 189 print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
174 190
175 for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 191 for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
176 success = False 192 success = False
@@ -180,26 +196,26 @@ def benchmark_parsers(num_elements=100000):
180 b = time.time() 196 b = time.time()
181 success = True 197 success = True
182 except Exception as e: 198 except Exception as e:
183 print("%s could not parse the markup." % parser) 199 print(("%s could not parse the markup." % parser))
184 traceback.print_exc() 200 traceback.print_exc()
185 if success: 201 if success:
186 print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) 202 print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
187 203
188 from lxml import etree 204 from lxml import etree
189 a = time.time() 205 a = time.time()
190 etree.HTML(data) 206 etree.HTML(data)
191 b = time.time() 207 b = time.time()
192 print("Raw lxml parsed the markup in %.2fs." % (b-a)) 208 print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
193 209
194 import html5lib 210 import html5lib
195 parser = html5lib.HTMLParser() 211 parser = html5lib.HTMLParser()
196 a = time.time() 212 a = time.time()
197 parser.parse(data) 213 parser.parse(data)
198 b = time.time() 214 b = time.time()
199 print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 215 print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
200 216
201def profile(num_elements=100000, parser="lxml"): 217def profile(num_elements=100000, parser="lxml"):
202 218 """Use Python's profiler on a randomly generated document."""
203 filehandle = tempfile.NamedTemporaryFile() 219 filehandle = tempfile.NamedTemporaryFile()
204 filename = filehandle.name 220 filename = filehandle.name
205 221
@@ -212,5 +228,6 @@ def profile(num_elements=100000, parser="lxml"):
212 stats.sort_stats("cumulative") 228 stats.sort_stats("cumulative")
213 stats.print_stats('_html5lib|bs4', 50) 229 stats.print_stats('_html5lib|bs4', 50)
214 230
231# If this file is run as a script, standard input is diagnosed.
215if __name__ == '__main__': 232if __name__ == '__main__':
216 diagnose(sys.stdin.read()) 233 diagnose(sys.stdin.read())