1 files changed, 50 insertions, 34 deletions
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
index 083395fb46..4692795340 100644
--- a/bitbake/lib/bs4/diagnose.py
+++ b/bitbake/lib/bs4/diagnose.py
@@ -1,9 +1,10 @@
 """Diagnostic functions, mainly for use when doing tech support."""
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"
 import cProfile
-from io import StringIO
+from io import BytesIO
 from html.parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
@@ -16,12 +17,15 @@ import tempfile
 import time
 import traceback
 import sys
-import cProfile
 def diagnose(data):
-    """Diagnostic suite for isolating common problems."""
+    """Diagnostic suite for isolating common problems.
-    print("Diagnostic running on Beautiful Soup %s" % __version__)
-    print("Python version %s" % sys.version)
+    :param data: A string containing markup that needs to be explained.
+    :return: None; diagnostics are printed to standard output.
+    """
+    print(("Diagnostic running on Beautiful Soup %s" % __version__))
+    print(("Python version %s" % sys.version))
    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in basic_parsers:
@@ -35,61 +39,70 @@ def diagnose(data):
                name))
    if 'lxml' in basic_parsers:
-        basic_parsers.append(["lxml", "xml"])
+        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
-            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+            print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
        except ImportError as e:
-            print (
+            print(
                "lxml is not installed or couldn't be imported.")
    if 'html5lib' in basic_parsers:
        try:
            import html5lib
-            print("Found html5lib version %s" % html5lib.__version__)
+            print(("Found html5lib version %s" % html5lib.__version__))
        except ImportError as e:
-            print (
+            print(
                "html5lib is not installed or couldn't be imported.")
    if hasattr(data, 'read'):
        data = data.read()
-    elif os.path.exists(data):
-        print('"%s" looks like a filename. Reading data from the file.' % data)
-        data = open(data).read()
-    elif data.startswith("http:") or data.startswith("https:"):
-        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
-        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
-        return
-    print()
    for parser in basic_parsers:
-        print("Trying to parse your markup with %s" % parser)
+        print(("Trying to parse your markup with %s" % parser))
        success = False
        try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
            success = True
        except Exception as e:
-            print("%s could not parse the markup." % parser)
+            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        if success:
-            print("Here's what %s did with the markup:" % parser)
+            print(("Here's what %s did with the markup:" % parser))
-            print(soup.prettify())
+            print((soup.prettify()))
-        print("-" * 80)
+        print(("-" * 80))
 def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.
    This lets you see how lxml parses a document when no Beautiful
-    Soup code is running.
+    Soup code is running. You can use this to determine whether
+    an lxml-specific problem is in Beautiful Soup's lxml tree builders
+    or in lxml itself.
+    :param data: Some markup.
+    :param html: If True, markup will be parsed with lxml's HTML parser.
+       if False, lxml's XML parser will be used.
    """
    from lxml import etree
-    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+    recover = kwargs.pop('recover', True)
+    if isinstance(data, str):
+        data = data.encode("utf8")
+    reader = BytesIO(data)
+    for event, element in etree.iterparse(
+        reader, html=html, recover=recover, **kwargs
+    ):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))
 class AnnouncingParser(HTMLParser):
-    """Announces HTMLParser parse events, without doing anything else."""
+    """Subclass of HTMLParser that announces parse events, without doing
+    anything else.
+    You can use this to get a picture of how html.parser sees a given
+    document. The easiest way to do this is to call `htmlparser_trace`.
+    """
    def _p(self, s):
        print(s)
@@ -126,6 +139,8 @@ def htmlparser_trace(data):
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
+    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data)
@@ -168,9 +183,9 @@ def rdoc(num_elements=1000):
 def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
-    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
+    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
    data = rdoc(num_elements)
-    print("Generated a large invalid HTML document (%d bytes)." % len(data))
+    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
@@ -180,26 +195,26 @@ def benchmark_parsers(num_elements=100000):
            b = time.time()
            success = True
        except Exception as e:
-            print("%s could not parse the markup." % parser)
+            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        if success:
-            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
+            print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
-    print("Raw lxml parsed the markup in %.2fs." % (b-a))
+    print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
-    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
+    print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
 def profile(num_elements=100000, parser="lxml"):
+    """Use Python's profiler on a randomly generated document."""
    filehandle = tempfile.NamedTemporaryFile()
    filename = filehandle.name
@@ -212,5 +227,6 @@ def profile(num_elements=100000, parser="lxml"):
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)
+# If this file is run as a script, standard input is diagnosed.
 if __name__ == '__main__':
    diagnose(sys.stdin.read())

diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py index 083395fb46..4692795340 100644 --- a/bitbake/lib/bs4/diagnose.py +++ b/bitbake/lib/bs4/diagnose.py
@@ -1,9 +1,10 @@
1	"""Diagnostic functions, mainly for use when doing tech support."""	1	"""Diagnostic functions, mainly for use when doing tech support."""
2		2
		3	# Use of this source code is governed by the MIT license.
3	__license__ = "MIT"	4	__license__ = "MIT"
4		5
5	import cProfile	6	import cProfile
6	from io import StringIO	7	from io import BytesIO
7	from html.parser import HTMLParser	8	from html.parser import HTMLParser
8	import bs4	9	import bs4
9	from bs4 import BeautifulSoup, __version__	10	from bs4 import BeautifulSoup, __version__
@@ -16,12 +17,15 @@ import tempfile
16	import time	17	import time
17	import traceback	18	import traceback
18	import sys	19	import sys
19	import cProfile
20		20
21	def diagnose(data):	21	def diagnose(data):
22	"""Diagnostic suite for isolating common problems."""	22	"""Diagnostic suite for isolating common problems.
23	print("Diagnostic running on Beautiful Soup %s" % __version__)	23
24	print("Python version %s" % sys.version)	24	:param data: A string containing markup that needs to be explained.
		25	:return: None; diagnostics are printed to standard output.
		26	"""
		27	print(("Diagnostic running on Beautiful Soup %s" % __version__))
		28	print(("Python version %s" % sys.version))
25		29
26	basic_parsers = ["html.parser", "html5lib", "lxml"]	30	basic_parsers = ["html.parser", "html5lib", "lxml"]
27	for name in basic_parsers:	31	for name in basic_parsers:
@@ -35,61 +39,70 @@ def diagnose(data):
35	name))	39	name))
36		40
37	if 'lxml' in basic_parsers:	41	if 'lxml' in basic_parsers:
38	basic_parsers.append(["lxml", "xml"])	42	basic_parsers.append("lxml-xml")
39	try:	43	try:
40	from lxml import etree	44	from lxml import etree
41	print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))	45	print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
42	except ImportError as e:	46	except ImportError as e:
43	print (	47	print(
44	"lxml is not installed or couldn't be imported.")	48	"lxml is not installed or couldn't be imported.")
45		49
46		50
47	if 'html5lib' in basic_parsers:	51	if 'html5lib' in basic_parsers:
48	try:	52	try:
49	import html5lib	53	import html5lib
50	print("Found html5lib version %s" % html5lib.__version__)	54	print(("Found html5lib version %s" % html5lib.__version__))
51	except ImportError as e:	55	except ImportError as e:
52	print (	56	print(
53	"html5lib is not installed or couldn't be imported.")	57	"html5lib is not installed or couldn't be imported.")
54		58
55	if hasattr(data, 'read'):	59	if hasattr(data, 'read'):
56	data = data.read()	60	data = data.read()
57	elif os.path.exists(data):
58	print('"%s" looks like a filename. Reading data from the file.' % data)
59	data = open(data).read()
60	elif data.startswith("http:") or data.startswith("https:"):
61	print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
62	print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
63	return
64	print()
65		61
66	for parser in basic_parsers:	62	for parser in basic_parsers:
67	print("Trying to parse your markup with %s" % parser)	63	print(("Trying to parse your markup with %s" % parser))
68	success = False	64	success = False
69	try:	65	try:
70	soup = BeautifulSoup(data, parser)	66	soup = BeautifulSoup(data, features=parser)
71	success = True	67	success = True
72	except Exception as e:	68	except Exception as e:
73	print("%s could not parse the markup." % parser)	69	print(("%s could not parse the markup." % parser))
74	traceback.print_exc()	70	traceback.print_exc()
75	if success:	71	if success:
76	print("Here's what %s did with the markup:" % parser)	72	print(("Here's what %s did with the markup:" % parser))
77	print(soup.prettify())	73	print((soup.prettify()))
78		74
79	print("-" * 80)	75	print(("-" * 80))
80		76
81	def lxml_trace(data, html=True, **kwargs):	77	def lxml_trace(data, html=True, **kwargs):
82	"""Print out the lxml events that occur during parsing.	78	"""Print out the lxml events that occur during parsing.
83		79
84	This lets you see how lxml parses a document when no Beautiful	80	This lets you see how lxml parses a document when no Beautiful
85	Soup code is running.	81	Soup code is running. You can use this to determine whether
		82	an lxml-specific problem is in Beautiful Soup's lxml tree builders
		83	or in lxml itself.
		84
		85	:param data: Some markup.
		86	:param html: If True, markup will be parsed with lxml's HTML parser.
		87	if False, lxml's XML parser will be used.
86	"""	88	"""
87	from lxml import etree	89	from lxml import etree
88	for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):	90	recover = kwargs.pop('recover', True)
		91	if isinstance(data, str):
		92	data = data.encode("utf8")
		93	reader = BytesIO(data)
		94	for event, element in etree.iterparse(
		95	reader, html=html, recover=recover, **kwargs
		96	):
89	print(("%s, %4s, %s" % (event, element.tag, element.text)))	97	print(("%s, %4s, %s" % (event, element.tag, element.text)))
90		98
91	class AnnouncingParser(HTMLParser):	99	class AnnouncingParser(HTMLParser):
92	"""Announces HTMLParser parse events, without doing anything else."""	100	"""Subclass of HTMLParser that announces parse events, without doing
		101	anything else.
		102
		103	You can use this to get a picture of how html.parser sees a given
		104	document. The easiest way to do this is to call `htmlparser_trace`.
		105	"""
93		106
94	def _p(self, s):	107	def _p(self, s):
95	print(s)	108	print(s)
@@ -126,6 +139,8 @@ def htmlparser_trace(data):
126		139
127	This lets you see how HTMLParser parses a document when no	140	This lets you see how HTMLParser parses a document when no
128	Beautiful Soup code is running.	141	Beautiful Soup code is running.
		142
		143	:param data: Some markup.
129	"""	144	"""
130	parser = AnnouncingParser()	145	parser = AnnouncingParser()
131	parser.feed(data)	146	parser.feed(data)
@@ -168,9 +183,9 @@ def rdoc(num_elements=1000):
168		183
169	def benchmark_parsers(num_elements=100000):	184	def benchmark_parsers(num_elements=100000):
170	"""Very basic head-to-head performance benchmark."""	185	"""Very basic head-to-head performance benchmark."""
171	print("Comparative parser benchmark on Beautiful Soup %s" % __version__)	186	print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
172	data = rdoc(num_elements)	187	data = rdoc(num_elements)
173	print("Generated a large invalid HTML document (%d bytes)." % len(data))	188	print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
174		189
175	for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:	190	for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
176	success = False	191	success = False
@@ -180,26 +195,26 @@ def benchmark_parsers(num_elements=100000):
180	b = time.time()	195	b = time.time()
181	success = True	196	success = True
182	except Exception as e:	197	except Exception as e:
183	print("%s could not parse the markup." % parser)	198	print(("%s could not parse the markup." % parser))
184	traceback.print_exc()	199	traceback.print_exc()
185	if success:	200	if success:
186	print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))	201	print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
187		202
188	from lxml import etree	203	from lxml import etree
189	a = time.time()	204	a = time.time()
190	etree.HTML(data)	205	etree.HTML(data)
191	b = time.time()	206	b = time.time()
192	print("Raw lxml parsed the markup in %.2fs." % (b-a))	207	print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
193		208
194	import html5lib	209	import html5lib
195	parser = html5lib.HTMLParser()	210	parser = html5lib.HTMLParser()
196	a = time.time()	211	a = time.time()
197	parser.parse(data)	212	parser.parse(data)
198	b = time.time()	213	b = time.time()
199	print("Raw html5lib parsed the markup in %.2fs." % (b-a))	214	print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
200		215
201	def profile(num_elements=100000, parser="lxml"):	216	def profile(num_elements=100000, parser="lxml"):
202		217	"""Use Python's profiler on a randomly generated document."""
203	filehandle = tempfile.NamedTemporaryFile()	218	filehandle = tempfile.NamedTemporaryFile()
204	filename = filehandle.name	219	filename = filehandle.name
205		220
@@ -212,5 +227,6 @@ def profile(num_elements=100000, parser="lxml"):
212	stats.sort_stats("cumulative")	227	stats.sort_stats("cumulative")
213	stats.print_stats('_html5lib\|bs4', 50)	228	stats.print_stats('_html5lib\|bs4', 50)
214		229
		230	# If this file is run as a script, standard input is diagnosed.
215	if __name__ == '__main__':	231	if __name__ == '__main__':
216	diagnose(sys.stdin.read())	232	diagnose(sys.stdin.read())