summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/diagnose.py
diff options
context:
space:
mode:
authorAníbal Limón <anibal.limon@linux.intel.com>2014-11-05 12:10:27 -0600
committerRichard Purdie <richard.purdie@linuxfoundation.org>2014-11-06 16:45:23 +0000
commit25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch)
tree7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/diagnose.py
parentbc6330cb7f288e76209410b0812aff1dbfa90950 (diff)
downloadpoky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added Beautifulsoup module because fetch/wget latest_versionstring method depends on it. This provides support to fetch/wget.py module for search new package versions in upstream sites. (Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a) Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/diagnose.py')
-rw-r--r--bitbake/lib/bs4/diagnose.py204
1 file changed, 204 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
new file mode 100644
index 0000000000..4d0b00afad
--- /dev/null
+++ b/bitbake/lib/bs4/diagnose.py
@@ -0,0 +1,204 @@
1"""Diagnostic functions, mainly for use when doing tech support."""
2import cProfile
3from StringIO import StringIO
4from HTMLParser import HTMLParser
5import bs4
6from bs4 import BeautifulSoup, __version__
7from bs4.builder import builder_registry
8
9import os
10import pstats
11import random
12import tempfile
13import time
14import traceback
15import sys
16import cProfile
17
18def diagnose(data):
19 """Diagnostic suite for isolating common problems."""
20 print "Diagnostic running on Beautiful Soup %s" % __version__
21 print "Python version %s" % sys.version
22
23 basic_parsers = ["html.parser", "html5lib", "lxml"]
24 for name in basic_parsers:
25 for builder in builder_registry.builders:
26 if name in builder.features:
27 break
28 else:
29 basic_parsers.remove(name)
30 print (
31 "I noticed that %s is not installed. Installing it may help." %
32 name)
33
34 if 'lxml' in basic_parsers:
35 basic_parsers.append(["lxml", "xml"])
36 from lxml import etree
37 print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
38
39 if 'html5lib' in basic_parsers:
40 import html5lib
41 print "Found html5lib version %s" % html5lib.__version__
42
43 if hasattr(data, 'read'):
44 data = data.read()
45 elif os.path.exists(data):
46 print '"%s" looks like a filename. Reading data from the file.' % data
47 data = open(data).read()
48 elif data.startswith("http:") or data.startswith("https:"):
49 print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
50 print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
51 return
52 print
53
54 for parser in basic_parsers:
55 print "Trying to parse your markup with %s" % parser
56 success = False
57 try:
58 soup = BeautifulSoup(data, parser)
59 success = True
60 except Exception, e:
61 print "%s could not parse the markup." % parser
62 traceback.print_exc()
63 if success:
64 print "Here's what %s did with the markup:" % parser
65 print soup.prettify()
66
67 print "-" * 80
68
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: Markup, as a string; it is wrapped in a StringIO
        before being handed to lxml.
    :param html: If true, parse as HTML; otherwise as XML.
    :param kwargs: Passed straight through to etree.iterparse.
    """
    from lxml import etree
    # iterparse yields (event, element) pairs as the tree is built.
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
        print("%s, %4s, %s" % (event, element.tag, element.text))
78
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else.

    Every handler prints a one-line announcement of the form
    "<payload> <EVENT>" and performs no other work.
    """

    def _p(self, s):
        # Single output funnel, so the reporting strategy can be
        # swapped out in one place.
        print(s)

    def _announce(self, event, payload):
        # Shared formatting for all handlers: payload first, then the
        # event name.
        self._p("%s %s" % (payload, event))

    def handle_starttag(self, name, attrs):
        self._announce("START", name)

    def handle_endtag(self, name):
        self._announce("END", name)

    def handle_data(self, data):
        self._announce("DATA", data)

    def handle_charref(self, name):
        self._announce("CHARREF", name)

    def handle_entityref(self, name):
        self._announce("ENTITYREF", name)

    def handle_comment(self, data):
        self._announce("COMMENT", data)

    def handle_decl(self, data):
        self._announce("DECL", data)

    def unknown_decl(self, data):
        self._announce("UNKNOWN-DECL", data)

    def handle_pi(self, data):
        self._announce("PI", data)
111
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Markup, as a string.
    """
    # AnnouncingParser prints one line per parse event as a side
    # effect of feeding it the markup.
    AnnouncingParser().feed(data)
120
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string.

    Consonants and vowels strictly alternate, starting with a
    consonant, which makes the result pronounceable.
    """
    return "".join(
        random.choice(_consonants if pos % 2 == 0 else _vowels)
        for pos in range(length))
134
def rsentence(length=4):
    """Generate a random sentence-like string.

    The sentence is `length` space-separated random words of 4-9
    characters each.
    """
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
138
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each element is, with equal probability, an opening tag, some
    random text, a closing tag, or nothing at all; open and close tags
    are chosen independently, so the markup is almost never
    well-formed.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        roll = random.randint(0,3)
        if roll == 0:
            # New tag.
            elements.append("<%s>" % random.choice(tag_names))
        elif roll == 1:
            # Random text.
            elements.append(rsentence(random.randint(1,4)))
        elif roll == 2:
            # Close a tag (not necessarily one that is open).
            elements.append("</%s>" % random.choice(tag_names))
        # roll == 3: emit nothing this round.
    return "<html>" + "\n".join(elements) + "</html>"
156
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Builds one large random invalid document with rdoc(), then times
    how long each parser takes to process it: first via Beautiful
    Soup, then raw lxml and raw html5lib as baselines.  Results are
    printed; nothing is returned.

    :param num_elements: Number of random elements in the generated
        document.
    """
    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    data = rdoc(num_elements)
    print "Generated a large invalid HTML document (%d bytes)." % len(data)

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            # b-a is only read when success is True, i.e. after both
            # timestamps were taken.
            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)

    # Baseline: raw lxml with no Beautiful Soup overhead.
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print "Raw lxml parsed the markup in %.2fs." % (b-a)

    # Baseline: raw html5lib with no Beautiful Soup overhead.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
188
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a large random invalid document.

    Builds the document with rdoc(), parses it under cProfile, and
    prints the 50 most expensive bs4/html5lib entries sorted by
    cumulative time.

    :param num_elements: Number of random elements in the document.
    :param parser: Parser name handed to BeautifulSoup.
    """
    # The temp file is only a scratch destination for the profiler
    # stats; the context manager guarantees it is cleaned up.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name

        data = rdoc(num_elements)
        # Named dict rather than `vars`, which shadows the builtin.
        namespace = dict(bs4=bs4, data=data, parser=parser)
        cProfile.runctx('bs4.BeautifulSoup(data, parser)', namespace,
                        namespace, filename)

        stats = pstats.Stats(filename)
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
202
# Command-line entry point: read markup from stdin and run the full
# diagnostic suite on it.
if __name__ == '__main__':
    diagnose(sys.stdin.read())