diff options
Diffstat (limited to 'bitbake/lib/bs4/diagnose.py')
-rw-r--r-- | bitbake/lib/bs4/diagnose.py | 84 |
1 files changed, 50 insertions, 34 deletions
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py index 083395fb46..4692795340 100644 --- a/bitbake/lib/bs4/diagnose.py +++ b/bitbake/lib/bs4/diagnose.py | |||
@@ -1,9 +1,10 @@ | |||
1 | """Diagnostic functions, mainly for use when doing tech support.""" | 1 | """Diagnostic functions, mainly for use when doing tech support.""" |
2 | 2 | ||
3 | # Use of this source code is governed by the MIT license. | ||
3 | __license__ = "MIT" | 4 | __license__ = "MIT" |
4 | 5 | ||
5 | import cProfile | 6 | import cProfile |
6 | from io import StringIO | 7 | from io import BytesIO |
7 | from html.parser import HTMLParser | 8 | from html.parser import HTMLParser |
8 | import bs4 | 9 | import bs4 |
9 | from bs4 import BeautifulSoup, __version__ | 10 | from bs4 import BeautifulSoup, __version__ |
@@ -16,12 +17,15 @@ import tempfile | |||
16 | import time | 17 | import time |
17 | import traceback | 18 | import traceback |
18 | import sys | 19 | import sys |
19 | import cProfile | ||
20 | 20 | ||
21 | def diagnose(data): | 21 | def diagnose(data): |
22 | """Diagnostic suite for isolating common problems.""" | 22 | """Diagnostic suite for isolating common problems. |
23 | print("Diagnostic running on Beautiful Soup %s" % __version__) | 23 | |
24 | print("Python version %s" % sys.version) | 24 | :param data: A string containing markup that needs to be explained. |
25 | :return: None; diagnostics are printed to standard output. | ||
26 | """ | ||
27 | print(("Diagnostic running on Beautiful Soup %s" % __version__)) | ||
28 | print(("Python version %s" % sys.version)) | ||
25 | 29 | ||
26 | basic_parsers = ["html.parser", "html5lib", "lxml"] | 30 | basic_parsers = ["html.parser", "html5lib", "lxml"] |
27 | for name in basic_parsers: | 31 | for name in basic_parsers: |
@@ -35,61 +39,70 @@ def diagnose(data): | |||
35 | name)) | 39 | name)) |
36 | 40 | ||
37 | if 'lxml' in basic_parsers: | 41 | if 'lxml' in basic_parsers: |
38 | basic_parsers.append(["lxml", "xml"]) | 42 | basic_parsers.append("lxml-xml") |
39 | try: | 43 | try: |
40 | from lxml import etree | 44 | from lxml import etree |
41 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) | 45 | print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) |
42 | except ImportError as e: | 46 | except ImportError as e: |
43 | print ( | 47 | print( |
44 | "lxml is not installed or couldn't be imported.") | 48 | "lxml is not installed or couldn't be imported.") |
45 | 49 | ||
46 | 50 | ||
47 | if 'html5lib' in basic_parsers: | 51 | if 'html5lib' in basic_parsers: |
48 | try: | 52 | try: |
49 | import html5lib | 53 | import html5lib |
50 | print("Found html5lib version %s" % html5lib.__version__) | 54 | print(("Found html5lib version %s" % html5lib.__version__)) |
51 | except ImportError as e: | 55 | except ImportError as e: |
52 | print ( | 56 | print( |
53 | "html5lib is not installed or couldn't be imported.") | 57 | "html5lib is not installed or couldn't be imported.") |
54 | 58 | ||
55 | if hasattr(data, 'read'): | 59 | if hasattr(data, 'read'): |
56 | data = data.read() | 60 | data = data.read() |
57 | elif os.path.exists(data): | ||
58 | print('"%s" looks like a filename. Reading data from the file.' % data) | ||
59 | data = open(data).read() | ||
60 | elif data.startswith("http:") or data.startswith("https:"): | ||
61 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) | ||
62 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") | ||
63 | return | ||
64 | print() | ||
65 | 61 | ||
66 | for parser in basic_parsers: | 62 | for parser in basic_parsers: |
67 | print("Trying to parse your markup with %s" % parser) | 63 | print(("Trying to parse your markup with %s" % parser)) |
68 | success = False | 64 | success = False |
69 | try: | 65 | try: |
70 | soup = BeautifulSoup(data, parser) | 66 | soup = BeautifulSoup(data, features=parser) |
71 | success = True | 67 | success = True |
72 | except Exception as e: | 68 | except Exception as e: |
73 | print("%s could not parse the markup." % parser) | 69 | print(("%s could not parse the markup." % parser)) |
74 | traceback.print_exc() | 70 | traceback.print_exc() |
75 | if success: | 71 | if success: |
76 | print("Here's what %s did with the markup:" % parser) | 72 | print(("Here's what %s did with the markup:" % parser)) |
77 | print(soup.prettify()) | 73 | print((soup.prettify())) |
78 | 74 | ||
79 | print("-" * 80) | 75 | print(("-" * 80)) |
80 | 76 | ||
81 | def lxml_trace(data, html=True, **kwargs): | 77 | def lxml_trace(data, html=True, **kwargs): |
82 | """Print out the lxml events that occur during parsing. | 78 | """Print out the lxml events that occur during parsing. |
83 | 79 | ||
84 | This lets you see how lxml parses a document when no Beautiful | 80 | This lets you see how lxml parses a document when no Beautiful |
85 | Soup code is running. | 81 | Soup code is running. You can use this to determine whether |
82 | an lxml-specific problem is in Beautiful Soup's lxml tree builders | ||
83 | or in lxml itself. | ||
84 | |||
85 | :param data: Some markup. | ||
86 | :param html: If True, markup will be parsed with lxml's HTML parser. | ||
87 | if False, lxml's XML parser will be used. | ||
86 | """ | 88 | """ |
87 | from lxml import etree | 89 | from lxml import etree |
88 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): | 90 | recover = kwargs.pop('recover', True) |
91 | if isinstance(data, str): | ||
92 | data = data.encode("utf8") | ||
93 | reader = BytesIO(data) | ||
94 | for event, element in etree.iterparse( | ||
95 | reader, html=html, recover=recover, **kwargs | ||
96 | ): | ||
89 | print(("%s, %4s, %s" % (event, element.tag, element.text))) | 97 | print(("%s, %4s, %s" % (event, element.tag, element.text))) |
90 | 98 | ||
91 | class AnnouncingParser(HTMLParser): | 99 | class AnnouncingParser(HTMLParser): |
92 | """Announces HTMLParser parse events, without doing anything else.""" | 100 | """Subclass of HTMLParser that announces parse events, without doing |
101 | anything else. | ||
102 | |||
103 | You can use this to get a picture of how html.parser sees a given | ||
104 | document. The easiest way to do this is to call `htmlparser_trace`. | ||
105 | """ | ||
93 | 106 | ||
94 | def _p(self, s): | 107 | def _p(self, s): |
95 | print(s) | 108 | print(s) |
@@ -126,6 +139,8 @@ def htmlparser_trace(data): | |||
126 | 139 | ||
127 | This lets you see how HTMLParser parses a document when no | 140 | This lets you see how HTMLParser parses a document when no |
128 | Beautiful Soup code is running. | 141 | Beautiful Soup code is running. |
142 | |||
143 | :param data: Some markup. | ||
129 | """ | 144 | """ |
130 | parser = AnnouncingParser() | 145 | parser = AnnouncingParser() |
131 | parser.feed(data) | 146 | parser.feed(data) |
@@ -168,9 +183,9 @@ def rdoc(num_elements=1000): | |||
168 | 183 | ||
169 | def benchmark_parsers(num_elements=100000): | 184 | def benchmark_parsers(num_elements=100000): |
170 | """Very basic head-to-head performance benchmark.""" | 185 | """Very basic head-to-head performance benchmark.""" |
171 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) | 186 | print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) |
172 | data = rdoc(num_elements) | 187 | data = rdoc(num_elements) |
173 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) | 188 | print(("Generated a large invalid HTML document (%d bytes)." % len(data))) |
174 | 189 | ||
175 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: | 190 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: |
176 | success = False | 191 | success = False |
@@ -180,26 +195,26 @@ def benchmark_parsers(num_elements=100000): | |||
180 | b = time.time() | 195 | b = time.time() |
181 | success = True | 196 | success = True |
182 | except Exception as e: | 197 | except Exception as e: |
183 | print("%s could not parse the markup." % parser) | 198 | print(("%s could not parse the markup." % parser)) |
184 | traceback.print_exc() | 199 | traceback.print_exc() |
185 | if success: | 200 | if success: |
186 | print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) | 201 | print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) |
187 | 202 | ||
188 | from lxml import etree | 203 | from lxml import etree |
189 | a = time.time() | 204 | a = time.time() |
190 | etree.HTML(data) | 205 | etree.HTML(data) |
191 | b = time.time() | 206 | b = time.time() |
192 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) | 207 | print(("Raw lxml parsed the markup in %.2fs." % (b-a))) |
193 | 208 | ||
194 | import html5lib | 209 | import html5lib |
195 | parser = html5lib.HTMLParser() | 210 | parser = html5lib.HTMLParser() |
196 | a = time.time() | 211 | a = time.time() |
197 | parser.parse(data) | 212 | parser.parse(data) |
198 | b = time.time() | 213 | b = time.time() |
199 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) | 214 | print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) |
200 | 215 | ||
201 | def profile(num_elements=100000, parser="lxml"): | 216 | def profile(num_elements=100000, parser="lxml"): |
202 | 217 | """Use Python's profiler on a randomly generated document.""" | |
203 | filehandle = tempfile.NamedTemporaryFile() | 218 | filehandle = tempfile.NamedTemporaryFile() |
204 | filename = filehandle.name | 219 | filename = filehandle.name |
205 | 220 | ||
@@ -212,5 +227,6 @@ def profile(num_elements=100000, parser="lxml"): | |||
212 | stats.sort_stats("cumulative") | 227 | stats.sort_stats("cumulative") |
213 | stats.print_stats('_html5lib|bs4', 50) | 228 | stats.print_stats('_html5lib|bs4', 50) |
214 | 229 | ||
230 | # If this file is run as a script, standard input is diagnosed. | ||
215 | if __name__ == '__main__': | 231 | if __name__ == '__main__': |
216 | diagnose(sys.stdin.read()) | 232 | diagnose(sys.stdin.read()) |