bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher

Add the Beautiful Soup module because the fetch/wget latest_versionstring method depends on it. This lets the fetch/wget.py module search upstream sites for new package versions. (Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a) Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
author: Aníbal Limón <anibal.limon@linux.intel.com> 2014-11-05 12:10:27 -0600
committer: Richard Purdie <richard.purdie@linuxfoundation.org> 2014-11-06 16:45:23 +0000
commit: 25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch)
tree: 7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/builder/__init__.py
parent: bc6330cb7f288e76209410b0812aff1dbfa90950 (diff)
download: poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz
1 files changed, 321 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
new file mode 100644
index 0000000000..740f5f29cd
--- /dev/null
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -0,0 +1,321 @@
+from collections import defaultdict
+import itertools
+import sys
+from bs4.element import (
+    CharsetMetaAttributeValue,
+    ContentMetaAttributeValue,
+    whitespace_re
+    )
# Names exported by a star-import of this module.  This has to remain a
# mutable list: register_treebuilders_from() appends to it.
__all__ = [
    "HTMLTreeBuilder",
    "SAXTreeBuilder",
    "TreeBuilder",
    "TreeBuilderRegistry",
]

# Feature names that a TreeBuilder can list in its ``features`` attribute
# and that can be passed to TreeBuilderRegistry.lookup().
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"
class TreeBuilderRegistry(object):
    """Keep track of treebuilder classes and the features they advertise.

    Builders registered later take priority over builders registered
    earlier, both overall and within each individual feature.
    """

    def __init__(self):
        # Feature name -> builder classes advertising it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: a class whose ``features`` attribute is
            an iterable of feature names.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the preferred builder class offering the given features.

        With no features, the most recently registered builder is
        returned.  Otherwise the result is the first builder, in the
        priority order of the first recognised feature, that advertises
        every recognised feature.  Features that no registered builder
        advertises are skipped rather than treated as a mismatch.

        :param features: feature names, most important first.
        :return: a builder class, or None when nothing is registered,
            none of the features is recognised, or no single builder
            offers all the recognised features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            # .get() rather than indexing, so that asking about an unknown
            # feature does not add an empty entry to the defaultdict.
            we_have_the_feature = self.builders_for_feature.get(feature)
            if not we_have_the_feature:
                continue
            if candidates is None:
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set.intersection_update(we_have_the_feature)

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None


# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    # Feature names this builder advertises; TreeBuilderRegistry.register()
    # reads this list.
    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    # None means "no opinion": a tag will be considered an empty-element
    # tag when and only when it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Do nothing; subclasses may override."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse ``markup``; the base class always raises NotImplementedError."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return ``(markup, None, None, False)`` without converting anything.

        Both encoding arguments are ignored by this base implementation.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Return False; the base builder never substitutes anything."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.

        :param tag_name: name of the tag owning the attributes; it is
            lower-cased before the lookup in ``cdata_list_attributes``.
        :param attrs: mapping of attribute name to attribute value.
        :return: the same ``attrs`` object that was passed in.
        """
        if not attrs or not self.cdata_list_attributes:
            return attrs

        # ``basestring`` only exists on Python 2.  Fall back to ``str`` so
        # that the type check below does not raise NameError on Python 3.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        universal = self.cdata_list_attributes.get('*', [])
        tag_specific = self.cdata_list_attributes.get(tag_name.lower()) or []
        for attr in list(attrs.keys()):
            if attr not in universal and attr not in tag_specific:
                continue
            # We have a "class"-type attribute whose string
            # value is a whitespace-separated list of
            # values. Split it into a list.
            value = attrs[attr]
            if isinstance(value, string_types):
                attrs[attr] = whitespace_re.split(value)
            # Otherwise leave the value alone: html5lib sometimes calls
            # setAttributes twice for the same tag when rearranging the
            # parse tree, and on the second call the value is already a
            # list that must not be split again.
        return attrs
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward a start tag and its attributes to the soup object.

        Namespace-aware SAX keys attributes by (namespace, local name)
        tuples, while plain SAX keys them by name string.  Either way
        only the attribute name is kept.
        """
        attrs = dict(
            (key[1] if isinstance(key, tuple) else key, value)
            for key, value in attrs.items())
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Forward an end tag to the soup object."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    def set_up_substitutions(self, tag):
        """Swap a <meta> tag's encoding attribute for a stand-in object.

        We are interested in <meta> tags that say what encoding the
        document was originally in. This means HTML 5-style <meta>
        tags that provide the "charset" attribute. It also means
        HTML 4-style <meta> tags that provide the "content"
        attribute and have "http-equiv" set to "content-type".

        In both cases the value of the appropriate attribute is
        replaced with a standin object that can take on any encoding.

        :param tag: object exposing ``name``, ``get()`` and item
            assignment for its attributes.
        :return: True only when the tag carries a ``charset``
            attribute.  The HTML 4 form is substituted as well, but
            this method still returns False for it.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
        return False
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a TreeBuilder
    subclass is bound in ``bs4.builder``, added to that module's
    ``__all__`` and registered with its ``builder_registry``.  Any other
    exported name is ignored.

    :param module: a module object defining ``__all__``.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)
        # issubclass() raises TypeError when handed something that is not
        # a class, so make sure we have a class before asking.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue
        setattr(this_module, name, obj)
        # Avoid duplicate entries if the same module is processed twice.
        if name not in this_module.__all__:
            this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
    """Exception type for markup that a parser rejected.

    NOTE(review): no raise site is visible in this module; presumably the
    individual builders raise it -- confirm against them.
    """
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)

# For the optional builders only the import itself is allowed to fail;
# registration runs in the ``else`` branch so that an ImportError raised
# while registering is not silently swallowed.
try:
    from . import _html5lib
except ImportError:
    # They don't have html5lib installed.
    pass
else:
    register_treebuilders_from(_html5lib)

try:
    from . import _lxml
except ImportError:
    # They don't have lxml installed.
    pass
else:
    register_treebuilders_from(_lxml)
author	Aníbal Limón <anibal.limon@linux.intel.com>	2014-11-05 12:10:27 -0600
committer	Richard Purdie <richard.purdie@linuxfoundation.org>	2014-11-06 16:45:23 +0000
commit	25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch)
tree	7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/builder/__init__.py
parent	bc6330cb7f288e76209410b0812aff1dbfa90950 (diff)
download	poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz

diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py new file mode 100644 index 0000000000..740f5f29cd --- /dev/null +++ b/bitbake/lib/bs4/builder/__init__.py
@@ -0,0 +1,321 @@
	1	from collections import defaultdict
	2	import itertools
	3	import sys
	4	from bs4.element import (
	5	CharsetMetaAttributeValue,
	6	ContentMetaAttributeValue,
	7	whitespace_re
	8	)
	9
	10	__all__ = [
	11	'HTMLTreeBuilder',
	12	'SAXTreeBuilder',
	13	'TreeBuilder',
	14	'TreeBuilderRegistry',
	15	]
	16
	17	# Some useful features for a TreeBuilder to have.
	18	FAST = 'fast'
	19	PERMISSIVE = 'permissive'
	20	STRICT = 'strict'
	21	XML = 'xml'
	22	HTML = 'html'
	23	HTML_5 = 'html5'
	24
	25
	26	class TreeBuilderRegistry(object):
	27
	28	def __init__(self):
	29	self.builders_for_feature = defaultdict(list)
	30	self.builders = []
	31
	32	def register(self, treebuilder_class):
	33	"""Register a treebuilder based on its advertised features."""
	34	for feature in treebuilder_class.features:
	35	self.builders_for_feature[feature].insert(0, treebuilder_class)
	36	self.builders.insert(0, treebuilder_class)
	37
	38	def lookup(self, *features):
	39	if len(self.builders) == 0:
	40	# There are no builders at all.
	41	return None
	42
	43	if len(features) == 0:
	44	# They didn't ask for any features. Give them the most
	45	# recently registered builder.
	46	return self.builders[0]
	47
	48	# Go down the list of features in order, and eliminate any builders
	49	# that don't match every feature.
	50	features = list(features)
	51	features.reverse()
	52	candidates = None
	53	candidate_set = None
	54	while len(features) > 0:
	55	feature = features.pop()
	56	we_have_the_feature = self.builders_for_feature.get(feature, [])
	57	if len(we_have_the_feature) > 0:
	58	if candidates is None:
	59	candidates = we_have_the_feature
	60	candidate_set = set(candidates)
	61	else:
	62	# Eliminate any candidates that don't have this feature.
	63	candidate_set = candidate_set.intersection(
	64	set(we_have_the_feature))
	65
	66	# The only valid candidates are the ones in candidate_set.
	67	# Go through the original list of candidates and pick the first one
	68	# that's in candidate_set.
	69	if candidate_set is None:
	70	return None
	71	for candidate in candidates:
	72	if candidate in candidate_set:
	73	return candidate
	74	return None
	75
	76	# The BeautifulSoup class will take feature lists from developers and use them
	77	# to look up builders in this registry.
	78	builder_registry = TreeBuilderRegistry()
	79
	80	class TreeBuilder(object):
	81	"""Turn a document into a Beautiful Soup object tree."""
	82
	83	features = []
	84
	85	is_xml = False
	86	preserve_whitespace_tags = set()
	87	empty_element_tags = None # A tag will be considered an empty-element
	88	# tag when and only when it has no contents.
	89
	90	# A value for these tag/attribute combinations is a space- or
	91	# comma-separated list of CDATA, rather than a single CDATA.
	92	cdata_list_attributes = {}
	93
	94
	95	def __init__(self):
	96	self.soup = None
	97
	98	def reset(self):
	99	pass
	100
	101	def can_be_empty_element(self, tag_name):
	102	"""Might a tag with this name be an empty-element tag?
	103
	104	The final markup may or may not actually present this tag as
	105	self-closing.
	106
	107	For instance: an HTMLBuilder does not consider a <p> tag to be
	108	an empty-element tag (it's not in
	109	HTMLBuilder.empty_element_tags). This means an empty <p> tag
	110	will be presented as "<p></p>", not "<p />".
	111
	112	The default implementation has no opinion about which tags are
	113	empty-element tags, so a tag will be presented as an
	114	empty-element tag if and only if it has no contents.
	115	"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
	116	be left alone.
	117	"""
	118	if self.empty_element_tags is None:
	119	return True
	120	return tag_name in self.empty_element_tags
	121
	122	def feed(self, markup):
	123	raise NotImplementedError()
	124
	125	def prepare_markup(self, markup, user_specified_encoding=None,
	126	document_declared_encoding=None):
	127	return markup, None, None, False
	128
	129	def test_fragment_to_document(self, fragment):
	130	"""Wrap an HTML fragment to make it look like a document.
	131
	132	Different parsers do this differently. For instance, lxml
	133	introduces an empty <head> tag, and html5lib
	134	doesn't. Abstracting this away lets us write simple tests
	135	which run HTML fragments through the parser and compare the
	136	results against other HTML fragments.
	137
	138	This method should not be used outside of tests.
	139	"""
	140	return fragment
	141
	142	def set_up_substitutions(self, tag):
	143	return False
	144
	145	def _replace_cdata_list_attribute_values(self, tag_name, attrs):
	146	"""Replaces class="foo bar" with class=["foo", "bar"]
	147
	148	Modifies its input in place.
	149	"""
	150	if not attrs:
	151	return attrs
	152	if self.cdata_list_attributes:
	153	universal = self.cdata_list_attributes.get('*', [])
	154	tag_specific = self.cdata_list_attributes.get(
	155	tag_name.lower(), None)
	156	for attr in attrs.keys():
	157	if attr in universal or (tag_specific and attr in tag_specific):
	158	# We have a "class"-type attribute whose string
	159	# value is a whitespace-separated list of
	160	# values. Split it into a list.
	161	value = attrs[attr]
	162	if isinstance(value, basestring):
	163	values = whitespace_re.split(value)
	164	else:
	165	# html5lib sometimes calls setAttributes twice
	166	# for the same tag when rearranging the parse
	167	# tree. On the second call the attribute value
	168	# here is already a list. If this happens,
	169	# leave the value alone rather than trying to
	170	# split it again.
	171	values = value
	172	attrs[attr] = values
	173	return attrs
	174
	175	class SAXTreeBuilder(TreeBuilder):
	176	"""A Beautiful Soup treebuilder that listens for SAX events."""
	177
	178	def feed(self, markup):
	179	raise NotImplementedError()
	180
	181	def close(self):
	182	pass
	183
	184	def startElement(self, name, attrs):
	185	attrs = dict((key[1], value) for key, value in list(attrs.items()))
	186	#print "Start %s, %r" % (name, attrs)
	187	self.soup.handle_starttag(name, attrs)
	188
	189	def endElement(self, name):
	190	#print "End %s" % name
	191	self.soup.handle_endtag(name)
	192
	193	def startElementNS(self, nsTuple, nodeName, attrs):
	194	# Throw away (ns, nodeName) for now.
	195	self.startElement(nodeName, attrs)
	196
	197	def endElementNS(self, nsTuple, nodeName):
	198	# Throw away (ns, nodeName) for now.
	199	self.endElement(nodeName)
	200	#handler.endElementNS((ns, node.nodeName), node.nodeName)
	201
	202	def startPrefixMapping(self, prefix, nodeValue):
	203	# Ignore the prefix for now.
	204	pass
	205
	206	def endPrefixMapping(self, prefix):
	207	# Ignore the prefix for now.
	208	# handler.endPrefixMapping(prefix)
	209	pass
	210
	211	def characters(self, content):
	212	self.soup.handle_data(content)
	213
	214	def startDocument(self):
	215	pass
	216
	217	def endDocument(self):
	218	pass
	219
	220
	221	class HTMLTreeBuilder(TreeBuilder):
	222	"""This TreeBuilder knows facts about HTML.
	223
	224	Such as which tags are empty-element tags.
	225	"""
	226
	227	preserve_whitespace_tags = set(['pre', 'textarea'])
	228	empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
	229	'spacer', 'link', 'frame', 'base'])
	230
	231	# The HTML standard defines these attributes as containing a
	232	# space-separated list of values, not a single value. That is,
	233	# class="foo bar" means that the 'class' attribute has two values,
	234	# 'foo' and 'bar', not the single value 'foo bar'. When we
	235	# encounter one of these attributes, we will parse its value into
	236	# a list of values if possible. Upon output, the list will be
	237	# converted back into a string.
	238	cdata_list_attributes = {
	239	"*" : ['class', 'accesskey', 'dropzone'],
	240	"a" : ['rel', 'rev'],
	241	"link" : ['rel', 'rev'],
	242	"td" : ["headers"],
	243	"th" : ["headers"],
	244	"td" : ["headers"],
	245	"form" : ["accept-charset"],
	246	"object" : ["archive"],
	247
	248	# These are HTML5 specific, as are .accesskey and .dropzone above.
	249	"area" : ["rel"],
	250	"icon" : ["sizes"],
	251	"iframe" : ["sandbox"],
	252	"output" : ["for"],
	253	}
	254
	255	def set_up_substitutions(self, tag):
	256	# We are only interested in <meta> tags
	257	if tag.name != 'meta':
	258	return False
	259
	260	http_equiv = tag.get('http-equiv')
	261	content = tag.get('content')
	262	charset = tag.get('charset')
	263
	264	# We are interested in <meta> tags that say what encoding the
	265	# document was originally in. This means HTML 5-style <meta>
	266	# tags that provide the "charset" attribute. It also means
	267	# HTML 4-style <meta> tags that provide the "content"
	268	# attribute and have "http-equiv" set to "content-type".
	269	#
	270	# In both cases we will replace the value of the appropriate
	271	# attribute with a standin object that can take on any
	272	# encoding.
	273	meta_encoding = None
	274	if charset is not None:
	275	# HTML 5 style:
	276	# <meta charset="utf8">
	277	meta_encoding = charset
	278	tag['charset'] = CharsetMetaAttributeValue(charset)
	279
	280	elif (content is not None and http_equiv is not None
	281	and http_equiv.lower() == 'content-type'):
	282	# HTML 4 style:
	283	# <meta http-equiv="content-type" content="text/html; charset=utf8">
	284	tag['content'] = ContentMetaAttributeValue(content)
	285
	286	return (meta_encoding is not None)
	287
	288	def register_treebuilders_from(module):
	289	"""Copy TreeBuilders from the given module into this module."""
	290	# I'm fairly sure this is not the best way to do this.
	291	this_module = sys.modules['bs4.builder']
	292	for name in module.__all__:
	293	obj = getattr(module, name)
	294
	295	if issubclass(obj, TreeBuilder):
	296	setattr(this_module, name, obj)
	297	this_module.__all__.append(name)
	298	# Register the builder while we're at it.
	299	this_module.builder_registry.register(obj)
	300
	301	class ParserRejectedMarkup(Exception):
	302	pass
	303
	304	# Builders are registered in reverse order of priority, so that custom
	305	# builder registrations will take precedence. In general, we want lxml
	306	# to take precedence over html5lib, because it's faster. And we only
	307	# want to use HTMLParser as a last result.
	308	from . import _htmlparser
	309	register_treebuilders_from(_htmlparser)
	310	try:
	311	from . import _html5lib
	312	register_treebuilders_from(_html5lib)
	313	except ImportError:
	314	# They don't have html5lib installed.
	315	pass
	316	try:
	317	from . import _lxml
	318	register_treebuilders_from(_lxml)
	319	except ImportError:
	320	# They don't have lxml installed.
	321	pass