path: root/bitbake/lib/bs4/diagnose.py
Diffstat (limited to 'bitbake/lib/bs4/diagnose.py')
-rw-r--r--  bitbake/lib/bs4/diagnose.py  84
1 file changed, 50 insertions(+), 34 deletions(-)
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
index 083395fb46..4692795340 100644
--- a/bitbake/lib/bs4/diagnose.py
+++ b/bitbake/lib/bs4/diagnose.py
@@ -1,9 +1,10 @@
1"""Diagnostic functions, mainly for use when doing tech support.""" 1"""Diagnostic functions, mainly for use when doing tech support."""
2 2
3# Use of this source code is governed by the MIT license.
3__license__ = "MIT" 4__license__ = "MIT"
4 5
5import cProfile 6import cProfile
6from io import StringIO 7from io import BytesIO
7from html.parser import HTMLParser 8from html.parser import HTMLParser
8import bs4 9import bs4
9from bs4 import BeautifulSoup, __version__ 10from bs4 import BeautifulSoup, __version__
@@ -16,12 +17,15 @@ import tempfile
 import time
 import traceback
 import sys
-import cProfile
 
 def diagnose(data):
-    """Diagnostic suite for isolating common problems."""
-    print("Diagnostic running on Beautiful Soup %s" % __version__)
-    print("Python version %s" % sys.version)
+    """Diagnostic suite for isolating common problems.
+
+    :param data: A string containing markup that needs to be explained.
+    :return: None; diagnostics are printed to standard output.
+    """
+    print(("Diagnostic running on Beautiful Soup %s" % __version__))
+    print(("Python version %s" % sys.version))
 
     basic_parsers = ["html.parser", "html5lib", "lxml"]
     for name in basic_parsers:
@@ -35,61 +39,70 @@ def diagnose(data):
                 name))
 
     if 'lxml' in basic_parsers:
-        basic_parsers.append(["lxml", "xml"])
+        basic_parsers.append("lxml-xml")
         try:
             from lxml import etree
-            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+            print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
         except ImportError as e:
-            print (
+            print(
                 "lxml is not installed or couldn't be imported.")
 
 
     if 'html5lib' in basic_parsers:
         try:
             import html5lib
-            print("Found html5lib version %s" % html5lib.__version__)
+            print(("Found html5lib version %s" % html5lib.__version__))
         except ImportError as e:
-            print (
+            print(
                 "html5lib is not installed or couldn't be imported.")
 
     if hasattr(data, 'read'):
         data = data.read()
-    elif os.path.exists(data):
-        print('"%s" looks like a filename. Reading data from the file.' % data)
-        data = open(data).read()
-    elif data.startswith("http:") or data.startswith("https:"):
-        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
-        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
-        return
-    print()
 
     for parser in basic_parsers:
-        print("Trying to parse your markup with %s" % parser)
+        print(("Trying to parse your markup with %s" % parser))
         success = False
         try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
             success = True
         except Exception as e:
-            print("%s could not parse the markup." % parser)
+            print(("%s could not parse the markup." % parser))
             traceback.print_exc()
         if success:
-            print("Here's what %s did with the markup:" % parser)
-            print(soup.prettify())
+            print(("Here's what %s did with the markup:" % parser))
+            print((soup.prettify()))
 
-        print("-" * 80)
+        print(("-" * 80))
 
 def lxml_trace(data, html=True, **kwargs):
     """Print out the lxml events that occur during parsing.
 
     This lets you see how lxml parses a document when no Beautiful
-    Soup code is running.
+    Soup code is running. You can use this to determine whether
+    an lxml-specific problem is in Beautiful Soup's lxml tree builders
+    or in lxml itself.
+
+    :param data: Some markup.
+    :param html: If True, markup will be parsed with lxml's HTML parser.
+       if False, lxml's XML parser will be used.
     """
     from lxml import etree
-    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+    recover = kwargs.pop('recover', True)
+    if isinstance(data, str):
+        data = data.encode("utf8")
+    reader = BytesIO(data)
+    for event, element in etree.iterparse(
+        reader, html=html, recover=recover, **kwargs
+    ):
         print(("%s, %4s, %s" % (event, element.tag, element.text)))
 
 class AnnouncingParser(HTMLParser):
-    """Announces HTMLParser parse events, without doing anything else."""
+    """Subclass of HTMLParser that announces parse events, without doing
+    anything else.
+
+    You can use this to get a picture of how html.parser sees a given
+    document. The easiest way to do this is to call `htmlparser_trace`.
+    """
 
     def _p(self, s):
         print(s)
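For reference, the reworked lxml_trace() in the hunk above now accepts either str or bytes markup: strings are encoded to UTF-8 and wrapped in a BytesIO before being handed to etree.iterparse, and a recover argument (defaulting to True) is forwarded to lxml. A minimal usage sketch, assuming bitbake/lib (or an installed beautifulsoup4) and lxml are importable; the sample markup is an arbitrary illustration, not part of the patch:

# Sketch only: exercises bs4.diagnose.lxml_trace as reworked above.
from bs4.diagnose import lxml_trace

markup = "<p>Unclosed paragraph<b>bold text"

# Prints one "event, tag, text" line per lxml parse event; str input is
# accepted because lxml_trace now encodes it to UTF-8 itself.
lxml_trace(markup, html=True)

# recover is popped from **kwargs (default True); passing recover=False
# forwards it to etree.iterparse so lxml stops on hard parse errors
# instead of recovering:
# lxml_trace(markup, html=True, recover=False)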
@@ -126,6 +139,8 @@ def htmlparser_trace(data):
 
     This lets you see how HTMLParser parses a document when no
     Beautiful Soup code is running.
+
+    :param data: Some markup.
     """
     parser = AnnouncingParser()
     parser.feed(data)
@@ -168,9 +183,9 @@ def rdoc(num_elements=1000):
 
 def benchmark_parsers(num_elements=100000):
     """Very basic head-to-head performance benchmark."""
-    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
+    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
     data = rdoc(num_elements)
-    print("Generated a large invalid HTML document (%d bytes)." % len(data))
+    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
 
     for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
         success = False
@@ -180,26 +195,26 @@ def benchmark_parsers(num_elements=100000):
             b = time.time()
             success = True
         except Exception as e:
-            print("%s could not parse the markup." % parser)
+            print(("%s could not parse the markup." % parser))
             traceback.print_exc()
         if success:
-            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
+            print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
 
     from lxml import etree
     a = time.time()
     etree.HTML(data)
     b = time.time()
-    print("Raw lxml parsed the markup in %.2fs." % (b-a))
+    print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
 
     import html5lib
     parser = html5lib.HTMLParser()
     a = time.time()
     parser.parse(data)
     b = time.time()
-    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
+    print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
 
 def profile(num_elements=100000, parser="lxml"):
-
+    """Use Python's profiler on a randomly generated document."""
     filehandle = tempfile.NamedTemporaryFile()
     filename = filehandle.name
 
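The two remaining entry points touched above, benchmark_parsers() and profile(), can be exercised directly. A hedged sketch with deliberately small element counts (arbitrary values; the defaults of 100000 take much longer), assuming lxml and html5lib are both installed alongside this bs4 copy:

# Sketch only: small-scale runs of the benchmark and profiler helpers.
from bs4.diagnose import benchmark_parsers, profile

# Parses a random invalid document (from rdoc()) with each parser via
# Beautiful Soup, then with raw lxml and raw html5lib, printing timings.
benchmark_parsers(num_elements=2000)

# Profiles one parse with cProfile and prints cumulative stats filtered
# to bs4/_html5lib frames.
profile(num_elements=2000, parser="lxml")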
@@ -212,5 +227,6 @@ def profile(num_elements=100000, parser="lxml"):
     stats.sort_stats("cumulative")
     stats.print_stats('_html5lib|bs4', 50)
 
+# If this file is run as a script, standard input is diagnosed.
 if __name__ == '__main__':
     diagnose(sys.stdin.read())
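As the new comment notes, running the module as a script diagnoses standard input, e.g. PYTHONPATH=bitbake/lib python3 bitbake/lib/bs4/diagnose.py < suspect.html. The same check can be driven from Python; a small sketch, where the file name and sys.path entry are hypothetical and chosen for a bitbake checkout:

# Sketch only: calling diagnose() on a local document.
import sys
sys.path.insert(0, "bitbake/lib")  # assumption: run from the top of a bitbake checkout

from bs4.diagnose import diagnose

# diagnose() still accepts anything with a .read() method, so an open
# file handle works; the filename/URL guessing branches were removed above.
with open("suspect.html", "rb") as handle:
    diagnose(handle)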