diff options
Diffstat (limited to 'bitbake/lib/bs4/diagnose.py')
-rw-r--r-- | bitbake/lib/bs4/diagnose.py | 83 |
1 file changed, 50 insertions, 33 deletions
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
index 083395fb46..e079772e69 100644
--- a/bitbake/lib/bs4/diagnose.py
+++ b/bitbake/lib/bs4/diagnose.py
@@ -1,9 +1,10 @@ | |||
1 | """Diagnostic functions, mainly for use when doing tech support.""" | 1 | """Diagnostic functions, mainly for use when doing tech support.""" |
2 | 2 | ||
3 | # Use of this source code is governed by the MIT license. | ||
3 | __license__ = "MIT" | 4 | __license__ = "MIT" |
4 | 5 | ||
5 | import cProfile | 6 | import cProfile |
6 | from io import StringIO | 7 | from io import BytesIO |
7 | from html.parser import HTMLParser | 8 | from html.parser import HTMLParser |
8 | import bs4 | 9 | import bs4 |
9 | from bs4 import BeautifulSoup, __version__ | 10 | from bs4 import BeautifulSoup, __version__ |
@@ -19,9 +20,13 @@ import sys | |||
19 | import cProfile | 20 | import cProfile |
20 | 21 | ||
21 | def diagnose(data): | 22 | def diagnose(data): |
22 | """Diagnostic suite for isolating common problems.""" | 23 | """Diagnostic suite for isolating common problems. |
23 | print("Diagnostic running on Beautiful Soup %s" % __version__) | 24 | |
24 | print("Python version %s" % sys.version) | 25 | :param data: A string containing markup that needs to be explained. |
26 | :return: None; diagnostics are printed to standard output. | ||
27 | """ | ||
28 | print(("Diagnostic running on Beautiful Soup %s" % __version__)) | ||
29 | print(("Python version %s" % sys.version)) | ||
25 | 30 | ||
26 | basic_parsers = ["html.parser", "html5lib", "lxml"] | 31 | basic_parsers = ["html.parser", "html5lib", "lxml"] |
27 | for name in basic_parsers: | 32 | for name in basic_parsers: |
@@ -35,61 +40,70 @@ def diagnose(data): | |||
35 | name)) | 40 | name)) |
36 | 41 | ||
37 | if 'lxml' in basic_parsers: | 42 | if 'lxml' in basic_parsers: |
38 | basic_parsers.append(["lxml", "xml"]) | 43 | basic_parsers.append("lxml-xml") |
39 | try: | 44 | try: |
40 | from lxml import etree | 45 | from lxml import etree |
41 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) | 46 | print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) |
42 | except ImportError as e: | 47 | except ImportError as e: |
43 | print ( | 48 | print( |
44 | "lxml is not installed or couldn't be imported.") | 49 | "lxml is not installed or couldn't be imported.") |
45 | 50 | ||
46 | 51 | ||
47 | if 'html5lib' in basic_parsers: | 52 | if 'html5lib' in basic_parsers: |
48 | try: | 53 | try: |
49 | import html5lib | 54 | import html5lib |
50 | print("Found html5lib version %s" % html5lib.__version__) | 55 | print(("Found html5lib version %s" % html5lib.__version__)) |
51 | except ImportError as e: | 56 | except ImportError as e: |
52 | print ( | 57 | print( |
53 | "html5lib is not installed or couldn't be imported.") | 58 | "html5lib is not installed or couldn't be imported.") |
54 | 59 | ||
55 | if hasattr(data, 'read'): | 60 | if hasattr(data, 'read'): |
56 | data = data.read() | 61 | data = data.read() |
57 | elif os.path.exists(data): | ||
58 | print('"%s" looks like a filename. Reading data from the file.' % data) | ||
59 | data = open(data).read() | ||
60 | elif data.startswith("http:") or data.startswith("https:"): | ||
61 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) | ||
62 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") | ||
63 | return | ||
64 | print() | ||
65 | 62 | ||
66 | for parser in basic_parsers: | 63 | for parser in basic_parsers: |
67 | print("Trying to parse your markup with %s" % parser) | 64 | print(("Trying to parse your markup with %s" % parser)) |
68 | success = False | 65 | success = False |
69 | try: | 66 | try: |
70 | soup = BeautifulSoup(data, parser) | 67 | soup = BeautifulSoup(data, features=parser) |
71 | success = True | 68 | success = True |
72 | except Exception as e: | 69 | except Exception as e: |
73 | print("%s could not parse the markup." % parser) | 70 | print(("%s could not parse the markup." % parser)) |
74 | traceback.print_exc() | 71 | traceback.print_exc() |
75 | if success: | 72 | if success: |
76 | print("Here's what %s did with the markup:" % parser) | 73 | print(("Here's what %s did with the markup:" % parser)) |
77 | print(soup.prettify()) | 74 | print((soup.prettify())) |
78 | 75 | ||
79 | print("-" * 80) | 76 | print(("-" * 80)) |
80 | 77 | ||
81 | def lxml_trace(data, html=True, **kwargs): | 78 | def lxml_trace(data, html=True, **kwargs): |
82 | """Print out the lxml events that occur during parsing. | 79 | """Print out the lxml events that occur during parsing. |
83 | 80 | ||
84 | This lets you see how lxml parses a document when no Beautiful | 81 | This lets you see how lxml parses a document when no Beautiful |
85 | Soup code is running. | 82 | Soup code is running. You can use this to determine whether |
83 | an lxml-specific problem is in Beautiful Soup's lxml tree builders | ||
84 | or in lxml itself. | ||
85 | |||
86 | :param data: Some markup. | ||
87 | :param html: If True, markup will be parsed with lxml's HTML parser. | ||
88 | if False, lxml's XML parser will be used. | ||
86 | """ | 89 | """ |
87 | from lxml import etree | 90 | from lxml import etree |
88 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): | 91 | recover = kwargs.pop('recover', True) |
92 | if isinstance(data, str): | ||
93 | data = data.encode("utf8") | ||
94 | reader = BytesIO(data) | ||
95 | for event, element in etree.iterparse( | ||
96 | reader, html=html, recover=recover, **kwargs | ||
97 | ): | ||
89 | print(("%s, %4s, %s" % (event, element.tag, element.text))) | 98 | print(("%s, %4s, %s" % (event, element.tag, element.text))) |
90 | 99 | ||
91 | class AnnouncingParser(HTMLParser): | 100 | class AnnouncingParser(HTMLParser): |
92 | """Announces HTMLParser parse events, without doing anything else.""" | 101 | """Subclass of HTMLParser that announces parse events, without doing |
102 | anything else. | ||
103 | |||
104 | You can use this to get a picture of how html.parser sees a given | ||
105 | document. The easiest way to do this is to call `htmlparser_trace`. | ||
106 | """ | ||
93 | 107 | ||
94 | def _p(self, s): | 108 | def _p(self, s): |
95 | print(s) | 109 | print(s) |
@@ -126,6 +140,8 @@ def htmlparser_trace(data): | |||
126 | 140 | ||
127 | This lets you see how HTMLParser parses a document when no | 141 | This lets you see how HTMLParser parses a document when no |
128 | Beautiful Soup code is running. | 142 | Beautiful Soup code is running. |
143 | |||
144 | :param data: Some markup. | ||
129 | """ | 145 | """ |
130 | parser = AnnouncingParser() | 146 | parser = AnnouncingParser() |
131 | parser.feed(data) | 147 | parser.feed(data) |
@@ -168,9 +184,9 @@ def rdoc(num_elements=1000): | |||
168 | 184 | ||
169 | def benchmark_parsers(num_elements=100000): | 185 | def benchmark_parsers(num_elements=100000): |
170 | """Very basic head-to-head performance benchmark.""" | 186 | """Very basic head-to-head performance benchmark.""" |
171 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) | 187 | print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) |
172 | data = rdoc(num_elements) | 188 | data = rdoc(num_elements) |
173 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) | 189 | print(("Generated a large invalid HTML document (%d bytes)." % len(data))) |
174 | 190 | ||
175 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: | 191 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: |
176 | success = False | 192 | success = False |
@@ -180,26 +196,26 @@ def benchmark_parsers(num_elements=100000): | |||
180 | b = time.time() | 196 | b = time.time() |
181 | success = True | 197 | success = True |
182 | except Exception as e: | 198 | except Exception as e: |
183 | print("%s could not parse the markup." % parser) | 199 | print(("%s could not parse the markup." % parser)) |
184 | traceback.print_exc() | 200 | traceback.print_exc() |
185 | if success: | 201 | if success: |
186 | print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) | 202 | print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) |
187 | 203 | ||
188 | from lxml import etree | 204 | from lxml import etree |
189 | a = time.time() | 205 | a = time.time() |
190 | etree.HTML(data) | 206 | etree.HTML(data) |
191 | b = time.time() | 207 | b = time.time() |
192 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) | 208 | print(("Raw lxml parsed the markup in %.2fs." % (b-a))) |
193 | 209 | ||
194 | import html5lib | 210 | import html5lib |
195 | parser = html5lib.HTMLParser() | 211 | parser = html5lib.HTMLParser() |
196 | a = time.time() | 212 | a = time.time() |
197 | parser.parse(data) | 213 | parser.parse(data) |
198 | b = time.time() | 214 | b = time.time() |
199 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) | 215 | print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) |
200 | 216 | ||
201 | def profile(num_elements=100000, parser="lxml"): | 217 | def profile(num_elements=100000, parser="lxml"): |
202 | 218 | """Use Python's profiler on a randomly generated document.""" | |
203 | filehandle = tempfile.NamedTemporaryFile() | 219 | filehandle = tempfile.NamedTemporaryFile() |
204 | filename = filehandle.name | 220 | filename = filehandle.name |
205 | 221 | ||
@@ -212,5 +228,6 @@ def profile(num_elements=100000, parser="lxml"): | |||
212 | stats.sort_stats("cumulative") | 228 | stats.sort_stats("cumulative") |
213 | stats.print_stats('_html5lib|bs4', 50) | 229 | stats.print_stats('_html5lib|bs4', 50) |
214 | 230 | ||
231 | # If this file is run as a script, standard input is diagnosed. | ||
215 | if __name__ == '__main__': | 232 | if __name__ == '__main__': |
216 | diagnose(sys.stdin.read()) | 233 | diagnose(sys.stdin.read()) |