summary | refs | log | tree | commit | diff | stats
path: root/bitbake/lib/bs4/element.py
diff options
context:
space:
mode:
author: Richard Purdie <richard.purdie@linuxfoundation.org> 2016-05-06 09:06:51 +0100
committer: Richard Purdie <richard.purdie@linuxfoundation.org> 2016-05-16 23:32:40 +0100
commit: 8d49bef632a0486e0172e543a6c2622398ed7a8c (patch)
tree: e1df010f269ba33c3b53300bd16f030873b75363 /bitbake/lib/bs4/element.py
parent: 64182f6a89761fbdb7929da067ca1e7d4e89bbb7 (diff)
download: poky-8d49bef632a0486e0172e543a6c2622398ed7a8c.tar.gz
bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)
Upgrade to 4.4.1, which has been run through 2to3 as per the maintainer's recommendation for Python 3 use. (Bitbake rev: f06e0f8052ba44eeb9ce701192cdf19252b2646d) Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/element.py')
-rw-r--r-- bitbake/lib/bs4/element.py 346
1 files changed, 230 insertions, 116 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
index da9afdf48e..0e62c2e100 100644
--- a/bitbake/lib/bs4/element.py
+++ b/bitbake/lib/bs4/element.py
@@ -1,3 +1,6 @@
1__license__ = "MIT"
2
3from pdb import set_trace
1import collections 4import collections
2import re 5import re
3import sys 6import sys
@@ -21,22 +24,22 @@ def _alias(attr):
21 return alias 24 return alias
22 25
23 26
24class NamespacedAttribute(unicode): 27class NamespacedAttribute(str):
25 28
26 def __new__(cls, prefix, name, namespace=None): 29 def __new__(cls, prefix, name, namespace=None):
27 if name is None: 30 if name is None:
28 obj = unicode.__new__(cls, prefix) 31 obj = str.__new__(cls, prefix)
29 elif prefix is None: 32 elif prefix is None:
30 # Not really namespaced. 33 # Not really namespaced.
31 obj = unicode.__new__(cls, name) 34 obj = str.__new__(cls, name)
32 else: 35 else:
33 obj = unicode.__new__(cls, prefix + ":" + name) 36 obj = str.__new__(cls, prefix + ":" + name)
34 obj.prefix = prefix 37 obj.prefix = prefix
35 obj.name = name 38 obj.name = name
36 obj.namespace = namespace 39 obj.namespace = namespace
37 return obj 40 return obj
38 41
39class AttributeValueWithCharsetSubstitution(unicode): 42class AttributeValueWithCharsetSubstitution(str):
40 """A stand-in object for a character encoding specified in HTML.""" 43 """A stand-in object for a character encoding specified in HTML."""
41 44
42class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 45class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -47,7 +50,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
47 """ 50 """
48 51
49 def __new__(cls, original_value): 52 def __new__(cls, original_value):
50 obj = unicode.__new__(cls, original_value) 53 obj = str.__new__(cls, original_value)
51 obj.original_value = original_value 54 obj.original_value = original_value
52 return obj 55 return obj
53 56
@@ -70,9 +73,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
70 match = cls.CHARSET_RE.search(original_value) 73 match = cls.CHARSET_RE.search(original_value)
71 if match is None: 74 if match is None:
72 # No substitution necessary. 75 # No substitution necessary.
73 return unicode.__new__(unicode, original_value) 76 return str.__new__(str, original_value)
74 77
75 obj = unicode.__new__(cls, original_value) 78 obj = str.__new__(cls, original_value)
76 obj.original_value = original_value 79 obj.original_value = original_value
77 return obj 80 return obj
78 81
@@ -152,7 +155,7 @@ class PageElement(object):
152 155
153 def format_string(self, s, formatter='minimal'): 156 def format_string(self, s, formatter='minimal'):
154 """Format the given string using the given formatter.""" 157 """Format the given string using the given formatter."""
155 if not callable(formatter): 158 if not isinstance(formatter, collections.Callable):
156 formatter = self._formatter_for_name(formatter) 159 formatter = self._formatter_for_name(formatter)
157 if formatter is None: 160 if formatter is None:
158 output = s 161 output = s
@@ -185,24 +188,40 @@ class PageElement(object):
185 return self.HTML_FORMATTERS.get( 188 return self.HTML_FORMATTERS.get(
186 name, HTMLAwareEntitySubstitution.substitute_xml) 189 name, HTMLAwareEntitySubstitution.substitute_xml)
187 190
188 def setup(self, parent=None, previous_element=None): 191 def setup(self, parent=None, previous_element=None, next_element=None,
192 previous_sibling=None, next_sibling=None):
189 """Sets up the initial relations between this element and 193 """Sets up the initial relations between this element and
190 other elements.""" 194 other elements."""
191 self.parent = parent 195 self.parent = parent
196
192 self.previous_element = previous_element 197 self.previous_element = previous_element
193 if previous_element is not None: 198 if previous_element is not None:
194 self.previous_element.next_element = self 199 self.previous_element.next_element = self
195 self.next_element = None 200
196 self.previous_sibling = None 201 self.next_element = next_element
197 self.next_sibling = None 202 if self.next_element:
198 if self.parent is not None and self.parent.contents: 203 self.next_element.previous_element = self
199 self.previous_sibling = self.parent.contents[-1] 204
205 self.next_sibling = next_sibling
206 if self.next_sibling:
207 self.next_sibling.previous_sibling = self
208
209 if (not previous_sibling
210 and self.parent is not None and self.parent.contents):
211 previous_sibling = self.parent.contents[-1]
212
213 self.previous_sibling = previous_sibling
214 if previous_sibling:
200 self.previous_sibling.next_sibling = self 215 self.previous_sibling.next_sibling = self
201 216
202 nextSibling = _alias("next_sibling") # BS3 217 nextSibling = _alias("next_sibling") # BS3
203 previousSibling = _alias("previous_sibling") # BS3 218 previousSibling = _alias("previous_sibling") # BS3
204 219
205 def replace_with(self, replace_with): 220 def replace_with(self, replace_with):
221 if not self.parent:
222 raise ValueError(
223 "Cannot replace one element with another when the"
224 "element to be replaced is not part of a tree.")
206 if replace_with is self: 225 if replace_with is self:
207 return 226 return
208 if replace_with is self.parent: 227 if replace_with is self.parent:
@@ -216,6 +235,10 @@ class PageElement(object):
216 235
217 def unwrap(self): 236 def unwrap(self):
218 my_parent = self.parent 237 my_parent = self.parent
238 if not self.parent:
239 raise ValueError(
240 "Cannot replace an element with its contents when that"
241 "element is not part of a tree.")
219 my_index = self.parent.index(self) 242 my_index = self.parent.index(self)
220 self.extract() 243 self.extract()
221 for child in reversed(self.contents[:]): 244 for child in reversed(self.contents[:]):
@@ -240,17 +263,20 @@ class PageElement(object):
240 last_child = self._last_descendant() 263 last_child = self._last_descendant()
241 next_element = last_child.next_element 264 next_element = last_child.next_element
242 265
243 if self.previous_element is not None: 266 if (self.previous_element is not None and
267 self.previous_element is not next_element):
244 self.previous_element.next_element = next_element 268 self.previous_element.next_element = next_element
245 if next_element is not None: 269 if next_element is not None and next_element is not self.previous_element:
246 next_element.previous_element = self.previous_element 270 next_element.previous_element = self.previous_element
247 self.previous_element = None 271 self.previous_element = None
248 last_child.next_element = None 272 last_child.next_element = None
249 273
250 self.parent = None 274 self.parent = None
251 if self.previous_sibling is not None: 275 if (self.previous_sibling is not None
276 and self.previous_sibling is not self.next_sibling):
252 self.previous_sibling.next_sibling = self.next_sibling 277 self.previous_sibling.next_sibling = self.next_sibling
253 if self.next_sibling is not None: 278 if (self.next_sibling is not None
279 and self.next_sibling is not self.previous_sibling):
254 self.next_sibling.previous_sibling = self.previous_sibling 280 self.next_sibling.previous_sibling = self.previous_sibling
255 self.previous_sibling = self.next_sibling = None 281 self.previous_sibling = self.next_sibling = None
256 return self 282 return self
@@ -263,16 +289,18 @@ class PageElement(object):
263 last_child = self 289 last_child = self
264 while isinstance(last_child, Tag) and last_child.contents: 290 while isinstance(last_child, Tag) and last_child.contents:
265 last_child = last_child.contents[-1] 291 last_child = last_child.contents[-1]
266 if not accept_self and last_child == self: 292 if not accept_self and last_child is self:
267 last_child = None 293 last_child = None
268 return last_child 294 return last_child
269 # BS3: Not part of the API! 295 # BS3: Not part of the API!
270 _lastRecursiveChild = _last_descendant 296 _lastRecursiveChild = _last_descendant
271 297
272 def insert(self, position, new_child): 298 def insert(self, position, new_child):
299 if new_child is None:
300 raise ValueError("Cannot insert None into a tag.")
273 if new_child is self: 301 if new_child is self:
274 raise ValueError("Cannot insert a tag into itself.") 302 raise ValueError("Cannot insert a tag into itself.")
275 if (isinstance(new_child, basestring) 303 if (isinstance(new_child, str)
276 and not isinstance(new_child, NavigableString)): 304 and not isinstance(new_child, NavigableString)):
277 new_child = NavigableString(new_child) 305 new_child = NavigableString(new_child)
278 306
@@ -478,6 +506,10 @@ class PageElement(object):
478 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 506 def _find_all(self, name, attrs, text, limit, generator, **kwargs):
479 "Iterates over a generator looking for things that match." 507 "Iterates over a generator looking for things that match."
480 508
509 if text is None and 'string' in kwargs:
510 text = kwargs['string']
511 del kwargs['string']
512
481 if isinstance(name, SoupStrainer): 513 if isinstance(name, SoupStrainer):
482 strainer = name 514 strainer = name
483 else: 515 else:
@@ -489,7 +521,7 @@ class PageElement(object):
489 result = (element for element in generator 521 result = (element for element in generator
490 if isinstance(element, Tag)) 522 if isinstance(element, Tag))
491 return ResultSet(strainer, result) 523 return ResultSet(strainer, result)
492 elif isinstance(name, basestring): 524 elif isinstance(name, str):
493 # Optimization to find all tags with a given name. 525 # Optimization to find all tags with a given name.
494 result = (element for element in generator 526 result = (element for element in generator
495 if isinstance(element, Tag) 527 if isinstance(element, Tag)
@@ -548,17 +580,17 @@ class PageElement(object):
548 580
549 # Methods for supporting CSS selectors. 581 # Methods for supporting CSS selectors.
550 582
551 tag_name_re = re.compile('^[a-z0-9]+$') 583 tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
552 584
553 # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ 585 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
554 # \---/ \---/\-------------/ \-------/ 586 # \---------------------------/ \---/\-------------/ \-------/
555 # | | | | 587 # | | | |
556 # | | | The value 588 # | | | The value
557 # | | ~,|,^,$,* or = 589 # | | ~,|,^,$,* or =
558 # | Attribute 590 # | Attribute
559 # Tag 591 # Tag
560 attribselect_re = re.compile( 592 attribselect_re = re.compile(
561 r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + 593 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
562 r'=?"?(?P<value>[^\]"]*)"?\]$' 594 r'=?"?(?P<value>[^\]"]*)"?\]$'
563 ) 595 )
564 596
@@ -640,7 +672,7 @@ class PageElement(object):
640 return self.parents 672 return self.parents
641 673
642 674
643class NavigableString(unicode, PageElement): 675class NavigableString(str, PageElement):
644 676
645 PREFIX = '' 677 PREFIX = ''
646 SUFFIX = '' 678 SUFFIX = ''
@@ -653,15 +685,21 @@ class NavigableString(unicode, PageElement):
653 passed in to the superclass's __new__ or the superclass won't know 685 passed in to the superclass's __new__ or the superclass won't know
654 how to handle non-ASCII characters. 686 how to handle non-ASCII characters.
655 """ 687 """
656 if isinstance(value, unicode): 688 if isinstance(value, str):
657 return unicode.__new__(cls, value) 689 u = str.__new__(cls, value)
658 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 690 else:
691 u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
692 u.setup()
693 return u
659 694
660 def __copy__(self): 695 def __copy__(self):
661 return self 696 """A copy of a NavigableString has the same contents and class
697 as the original, but it is not connected to the parse tree.
698 """
699 return type(self)(self)
662 700
663 def __getnewargs__(self): 701 def __getnewargs__(self):
664 return (unicode(self),) 702 return (str(self),)
665 703
666 def __getattr__(self, attr): 704 def __getattr__(self, attr):
667 """text.string gives you text. This is for backwards 705 """text.string gives you text. This is for backwards
@@ -701,23 +739,23 @@ class PreformattedString(NavigableString):
701 739
702class CData(PreformattedString): 740class CData(PreformattedString):
703 741
704 PREFIX = u'<![CDATA[' 742 PREFIX = '<![CDATA['
705 SUFFIX = u']]>' 743 SUFFIX = ']]>'
706 744
707class ProcessingInstruction(PreformattedString): 745class ProcessingInstruction(PreformattedString):
708 746
709 PREFIX = u'<?' 747 PREFIX = '<?'
710 SUFFIX = u'?>' 748 SUFFIX = '>'
711 749
712class Comment(PreformattedString): 750class Comment(PreformattedString):
713 751
714 PREFIX = u'<!--' 752 PREFIX = '<!--'
715 SUFFIX = u'-->' 753 SUFFIX = '-->'
716 754
717 755
718class Declaration(PreformattedString): 756class Declaration(PreformattedString):
719 PREFIX = u'<!' 757 PREFIX = '<?'
720 SUFFIX = u'!>' 758 SUFFIX = '?>'
721 759
722 760
723class Doctype(PreformattedString): 761class Doctype(PreformattedString):
@@ -734,8 +772,8 @@ class Doctype(PreformattedString):
734 772
735 return Doctype(value) 773 return Doctype(value)
736 774
737 PREFIX = u'<!DOCTYPE ' 775 PREFIX = '<!DOCTYPE '
738 SUFFIX = u'>\n' 776 SUFFIX = '>\n'
739 777
740 778
741class Tag(PageElement): 779class Tag(PageElement):
@@ -759,9 +797,12 @@ class Tag(PageElement):
759 self.prefix = prefix 797 self.prefix = prefix
760 if attrs is None: 798 if attrs is None:
761 attrs = {} 799 attrs = {}
762 elif attrs and builder.cdata_list_attributes: 800 elif attrs:
763 attrs = builder._replace_cdata_list_attribute_values( 801 if builder is not None and builder.cdata_list_attributes:
764 self.name, attrs) 802 attrs = builder._replace_cdata_list_attribute_values(
803 self.name, attrs)
804 else:
805 attrs = dict(attrs)
765 else: 806 else:
766 attrs = dict(attrs) 807 attrs = dict(attrs)
767 self.attrs = attrs 808 self.attrs = attrs
@@ -778,6 +819,18 @@ class Tag(PageElement):
778 819
779 parserClass = _alias("parser_class") # BS3 820 parserClass = _alias("parser_class") # BS3
780 821
822 def __copy__(self):
823 """A copy of a Tag is a new Tag, unconnected to the parse tree.
824 Its contents are a copy of the old Tag's contents.
825 """
826 clone = type(self)(None, self.builder, self.name, self.namespace,
827 self.nsprefix, self.attrs)
828 for attr in ('can_be_empty_element', 'hidden'):
829 setattr(clone, attr, getattr(self, attr))
830 for child in self.contents:
831 clone.append(child.__copy__())
832 return clone
833
781 @property 834 @property
782 def is_empty_element(self): 835 def is_empty_element(self):
783 """Is this tag an empty-element tag? (aka a self-closing tag) 836 """Is this tag an empty-element tag? (aka a self-closing tag)
@@ -843,7 +896,7 @@ class Tag(PageElement):
843 for string in self._all_strings(True): 896 for string in self._all_strings(True):
844 yield string 897 yield string
845 898
846 def get_text(self, separator=u"", strip=False, 899 def get_text(self, separator="", strip=False,
847 types=(NavigableString, CData)): 900 types=(NavigableString, CData)):
848 """ 901 """
849 Get all child strings, concatenated using the given separator. 902 Get all child strings, concatenated using the given separator.
@@ -915,7 +968,7 @@ class Tag(PageElement):
915 def __contains__(self, x): 968 def __contains__(self, x):
916 return x in self.contents 969 return x in self.contents
917 970
918 def __nonzero__(self): 971 def __bool__(self):
919 "A tag is non-None even if it has no contents." 972 "A tag is non-None even if it has no contents."
920 return True 973 return True
921 974
@@ -971,15 +1024,25 @@ class Tag(PageElement):
971 as defined in __eq__.""" 1024 as defined in __eq__."""
972 return not self == other 1025 return not self == other
973 1026
974 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 1027 def __repr__(self, encoding="unicode-escape"):
975 """Renders this tag as a string.""" 1028 """Renders this tag as a string."""
976 return self.encode(encoding) 1029 if PY3K:
1030 # "The return value must be a string object", i.e. Unicode
1031 return self.decode()
1032 else:
1033 # "The return value must be a string object", i.e. a bytestring.
1034 # By convention, the return value of __repr__ should also be
1035 # an ASCII string.
1036 return self.encode(encoding)
977 1037
978 def __unicode__(self): 1038 def __unicode__(self):
979 return self.decode() 1039 return self.decode()
980 1040
981 def __str__(self): 1041 def __str__(self):
982 return self.encode() 1042 if PY3K:
1043 return self.decode()
1044 else:
1045 return self.encode()
983 1046
984 if PY3K: 1047 if PY3K:
985 __str__ = __repr__ = __unicode__ 1048 __str__ = __repr__ = __unicode__
@@ -1014,7 +1077,7 @@ class Tag(PageElement):
1014 1077
1015 # First off, turn a string formatter into a function. This 1078 # First off, turn a string formatter into a function. This
1016 # will stop the lookup from happening over and over again. 1079 # will stop the lookup from happening over and over again.
1017 if not callable(formatter): 1080 if not isinstance(formatter, collections.Callable):
1018 formatter = self._formatter_for_name(formatter) 1081 formatter = self._formatter_for_name(formatter)
1019 1082
1020 attrs = [] 1083 attrs = []
@@ -1025,8 +1088,8 @@ class Tag(PageElement):
1025 else: 1088 else:
1026 if isinstance(val, list) or isinstance(val, tuple): 1089 if isinstance(val, list) or isinstance(val, tuple):
1027 val = ' '.join(val) 1090 val = ' '.join(val)
1028 elif not isinstance(val, basestring): 1091 elif not isinstance(val, str):
1029 val = unicode(val) 1092 val = str(val)
1030 elif ( 1093 elif (
1031 isinstance(val, AttributeValueWithCharsetSubstitution) 1094 isinstance(val, AttributeValueWithCharsetSubstitution)
1032 and eventual_encoding is not None): 1095 and eventual_encoding is not None):
@@ -1034,7 +1097,7 @@ class Tag(PageElement):
1034 1097
1035 text = self.format_string(val, formatter) 1098 text = self.format_string(val, formatter)
1036 decoded = ( 1099 decoded = (
1037 unicode(key) + '=' 1100 str(key) + '='
1038 + EntitySubstitution.quoted_attribute_value(text)) 1101 + EntitySubstitution.quoted_attribute_value(text))
1039 attrs.append(decoded) 1102 attrs.append(decoded)
1040 close = '' 1103 close = ''
@@ -1103,16 +1166,22 @@ class Tag(PageElement):
1103 formatter="minimal"): 1166 formatter="minimal"):
1104 """Renders the contents of this tag as a Unicode string. 1167 """Renders the contents of this tag as a Unicode string.
1105 1168
1169 :param indent_level: Each line of the rendering will be
1170 indented this many spaces.
1171
1106 :param eventual_encoding: The tag is destined to be 1172 :param eventual_encoding: The tag is destined to be
1107 encoded into this encoding. This method is _not_ 1173 encoded into this encoding. This method is _not_
1108 responsible for performing that encoding. This information 1174 responsible for performing that encoding. This information
1109 is passed in so that it can be substituted in if the 1175 is passed in so that it can be substituted in if the
1110 document contains a <META> tag that mentions the document's 1176 document contains a <META> tag that mentions the document's
1111 encoding. 1177 encoding.
1178
1179 :param formatter: The output formatter responsible for converting
1180 entities to Unicode characters.
1112 """ 1181 """
1113 # First off, turn a string formatter into a function. This 1182 # First off, turn a string formatter into a function. This
1114 # will stop the lookup from happening over and over again. 1183 # will stop the lookup from happening over and over again.
1115 if not callable(formatter): 1184 if not isinstance(formatter, collections.Callable):
1116 formatter = self._formatter_for_name(formatter) 1185 formatter = self._formatter_for_name(formatter)
1117 1186
1118 pretty_print = (indent_level is not None) 1187 pretty_print = (indent_level is not None)
@@ -1137,7 +1206,17 @@ class Tag(PageElement):
1137 def encode_contents( 1206 def encode_contents(
1138 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1207 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1139 formatter="minimal"): 1208 formatter="minimal"):
1140 """Renders the contents of this tag as a bytestring.""" 1209 """Renders the contents of this tag as a bytestring.
1210
1211 :param indent_level: Each line of the rendering will be
1212 indented this many spaces.
1213
1214 :param eventual_encoding: The bytestring will be in this encoding.
1215
1216 :param formatter: The output formatter responsible for converting
1217 entities to Unicode characters.
1218 """
1219
1141 contents = self.decode_contents(indent_level, encoding, formatter) 1220 contents = self.decode_contents(indent_level, encoding, formatter)
1142 return contents.encode(encoding) 1221 return contents.encode(encoding)
1143 1222
@@ -1201,26 +1280,57 @@ class Tag(PageElement):
1201 1280
1202 _selector_combinators = ['>', '+', '~'] 1281 _selector_combinators = ['>', '+', '~']
1203 _select_debug = False 1282 _select_debug = False
1204 def select(self, selector, _candidate_generator=None): 1283 def select_one(self, selector):
1284 """Perform a CSS selection operation on the current element."""
1285 value = self.select(selector, limit=1)
1286 if value:
1287 return value[0]
1288 return None
1289
1290 def select(self, selector, _candidate_generator=None, limit=None):
1205 """Perform a CSS selection operation on the current element.""" 1291 """Perform a CSS selection operation on the current element."""
1292
1293 # Handle grouping selectors if ',' exists, ie: p,a
1294 if ',' in selector:
1295 context = []
1296 for partial_selector in selector.split(','):
1297 partial_selector = partial_selector.strip()
1298 if partial_selector == '':
1299 raise ValueError('Invalid group selection syntax: %s' % selector)
1300 candidates = self.select(partial_selector, limit=limit)
1301 for candidate in candidates:
1302 if candidate not in context:
1303 context.append(candidate)
1304
1305 if limit and len(context) >= limit:
1306 break
1307 return context
1308
1206 tokens = selector.split() 1309 tokens = selector.split()
1207 current_context = [self] 1310 current_context = [self]
1208 1311
1209 if tokens[-1] in self._selector_combinators: 1312 if tokens[-1] in self._selector_combinators:
1210 raise ValueError( 1313 raise ValueError(
1211 'Final combinator "%s" is missing an argument.' % tokens[-1]) 1314 'Final combinator "%s" is missing an argument.' % tokens[-1])
1315
1212 if self._select_debug: 1316 if self._select_debug:
1213 print 'Running CSS selector "%s"' % selector 1317 print('Running CSS selector "%s"' % selector)
1318
1214 for index, token in enumerate(tokens): 1319 for index, token in enumerate(tokens):
1215 if self._select_debug: 1320 new_context = []
1216 print ' Considering token "%s"' % token 1321 new_context_ids = set([])
1217 recursive_candidate_generator = None 1322
1218 tag_name = None
1219 if tokens[index-1] in self._selector_combinators: 1323 if tokens[index-1] in self._selector_combinators:
1220 # This token was consumed by the previous combinator. Skip it. 1324 # This token was consumed by the previous combinator. Skip it.
1221 if self._select_debug: 1325 if self._select_debug:
1222 print ' Token was consumed by the previous combinator.' 1326 print(' Token was consumed by the previous combinator.')
1223 continue 1327 continue
1328
1329 if self._select_debug:
1330 print(' Considering token "%s"' % token)
1331 recursive_candidate_generator = None
1332 tag_name = None
1333
1224 # Each operation corresponds to a checker function, a rule 1334 # Each operation corresponds to a checker function, a rule
1225 # for determining whether a candidate matches the 1335 # for determining whether a candidate matches the
1226 # selector. Candidates are generated by the active 1336 # selector. Candidates are generated by the active
@@ -1256,35 +1366,38 @@ class Tag(PageElement):
1256 "A pseudo-class must be prefixed with a tag name.") 1366 "A pseudo-class must be prefixed with a tag name.")
1257 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) 1367 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1258 found = [] 1368 found = []
1259 if pseudo_attributes is not None: 1369 if pseudo_attributes is None:
1370 pseudo_type = pseudo
1371 pseudo_value = None
1372 else:
1260 pseudo_type, pseudo_value = pseudo_attributes.groups() 1373 pseudo_type, pseudo_value = pseudo_attributes.groups()
1261 if pseudo_type == 'nth-of-type': 1374 if pseudo_type == 'nth-of-type':
1262 try: 1375 try:
1263 pseudo_value = int(pseudo_value) 1376 pseudo_value = int(pseudo_value)
1264 except: 1377 except:
1265 raise NotImplementedError(
1266 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1267 if pseudo_value < 1:
1268 raise ValueError(
1269 'nth-of-type pseudo-class value must be at least 1.')
1270 class Counter(object):
1271 def __init__(self, destination):
1272 self.count = 0
1273 self.destination = destination
1274
1275 def nth_child_of_type(self, tag):
1276 self.count += 1
1277 if self.count == self.destination:
1278 return True
1279 if self.count > self.destination:
1280 # Stop the generator that's sending us
1281 # these things.
1282 raise StopIteration()
1283 return False
1284 checker = Counter(pseudo_value).nth_child_of_type
1285 else:
1286 raise NotImplementedError( 1378 raise NotImplementedError(
1287 'Only the following pseudo-classes are implemented: nth-of-type.') 1379 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1380 if pseudo_value < 1:
1381 raise ValueError(
1382 'nth-of-type pseudo-class value must be at least 1.')
1383 class Counter(object):
1384 def __init__(self, destination):
1385 self.count = 0
1386 self.destination = destination
1387
1388 def nth_child_of_type(self, tag):
1389 self.count += 1
1390 if self.count == self.destination:
1391 return True
1392 if self.count > self.destination:
1393 # Stop the generator that's sending us
1394 # these things.
1395 raise StopIteration()
1396 return False
1397 checker = Counter(pseudo_value).nth_child_of_type
1398 else:
1399 raise NotImplementedError(
1400 'Only the following pseudo-classes are implemented: nth-of-type.')
1288 1401
1289 elif token == '*': 1402 elif token == '*':
1290 # Star selector -- matches everything 1403 # Star selector -- matches everything
@@ -1311,7 +1424,6 @@ class Tag(PageElement):
1311 else: 1424 else:
1312 raise ValueError( 1425 raise ValueError(
1313 'Unsupported or invalid CSS selector: "%s"' % token) 1426 'Unsupported or invalid CSS selector: "%s"' % token)
1314
1315 if recursive_candidate_generator: 1427 if recursive_candidate_generator:
1316 # This happens when the selector looks like "> foo". 1428 # This happens when the selector looks like "> foo".
1317 # 1429 #
@@ -1325,14 +1437,14 @@ class Tag(PageElement):
1325 next_token = tokens[index+1] 1437 next_token = tokens[index+1]
1326 def recursive_select(tag): 1438 def recursive_select(tag):
1327 if self._select_debug: 1439 if self._select_debug:
1328 print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) 1440 print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
1329 print '-' * 40 1441 print('-' * 40)
1330 for i in tag.select(next_token, recursive_candidate_generator): 1442 for i in tag.select(next_token, recursive_candidate_generator):
1331 if self._select_debug: 1443 if self._select_debug:
1332 print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) 1444 print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
1333 yield i 1445 yield i
1334 if self._select_debug: 1446 if self._select_debug:
1335 print '-' * 40 1447 print('-' * 40)
1336 _use_candidate_generator = recursive_select 1448 _use_candidate_generator = recursive_select
1337 elif _candidate_generator is None: 1449 elif _candidate_generator is None:
1338 # By default, a tag's candidates are all of its 1450 # By default, a tag's candidates are all of its
@@ -1343,7 +1455,7 @@ class Tag(PageElement):
1343 check = "[any]" 1455 check = "[any]"
1344 else: 1456 else:
1345 check = tag_name 1457 check = tag_name
1346 print ' Default candidate generator, tag name="%s"' % check 1458 print(' Default candidate generator, tag name="%s"' % check)
1347 if self._select_debug: 1459 if self._select_debug:
1348 # This is redundant with later code, but it stops 1460 # This is redundant with later code, but it stops
1349 # a bunch of bogus tags from cluttering up the 1461 # a bunch of bogus tags from cluttering up the
@@ -1361,12 +1473,11 @@ class Tag(PageElement):
1361 else: 1473 else:
1362 _use_candidate_generator = _candidate_generator 1474 _use_candidate_generator = _candidate_generator
1363 1475
1364 new_context = [] 1476 count = 0
1365 new_context_ids = set([])
1366 for tag in current_context: 1477 for tag in current_context:
1367 if self._select_debug: 1478 if self._select_debug:
1368 print " Running candidate generator on %s %s" % ( 1479 print(" Running candidate generator on %s %s" % (
1369 tag.name, repr(tag.attrs)) 1480 tag.name, repr(tag.attrs)))
1370 for candidate in _use_candidate_generator(tag): 1481 for candidate in _use_candidate_generator(tag):
1371 if not isinstance(candidate, Tag): 1482 if not isinstance(candidate, Tag):
1372 continue 1483 continue
@@ -1381,21 +1492,24 @@ class Tag(PageElement):
1381 break 1492 break
1382 if checker is None or result: 1493 if checker is None or result:
1383 if self._select_debug: 1494 if self._select_debug:
1384 print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) 1495 print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
1385 if id(candidate) not in new_context_ids: 1496 if id(candidate) not in new_context_ids:
1386 # If a tag matches a selector more than once, 1497 # If a tag matches a selector more than once,
1387 # don't include it in the context more than once. 1498 # don't include it in the context more than once.
1388 new_context.append(candidate) 1499 new_context.append(candidate)
1389 new_context_ids.add(id(candidate)) 1500 new_context_ids.add(id(candidate))
1501 if limit and len(new_context) >= limit:
1502 break
1390 elif self._select_debug: 1503 elif self._select_debug:
1391 print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) 1504 print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
1505
1392 1506
1393 current_context = new_context 1507 current_context = new_context
1394 1508
1395 if self._select_debug: 1509 if self._select_debug:
1396 print "Final verdict:" 1510 print("Final verdict:")
1397 for i in current_context: 1511 for i in current_context:
1398 print " %s %s" % (i.name, i.attrs) 1512 print(" %s %s" % (i.name, i.attrs))
1399 return current_context 1513 return current_context
1400 1514
1401 # Old names for backwards compatibility 1515 # Old names for backwards compatibility
@@ -1439,7 +1553,7 @@ class SoupStrainer(object):
1439 else: 1553 else:
1440 attrs = kwargs 1554 attrs = kwargs
1441 normalized_attrs = {} 1555 normalized_attrs = {}
1442 for key, value in attrs.items(): 1556 for key, value in list(attrs.items()):
1443 normalized_attrs[key] = self._normalize_search_value(value) 1557 normalized_attrs[key] = self._normalize_search_value(value)
1444 1558
1445 self.attrs = normalized_attrs 1559 self.attrs = normalized_attrs
@@ -1448,7 +1562,7 @@ class SoupStrainer(object):
1448 def _normalize_search_value(self, value): 1562 def _normalize_search_value(self, value):
1449 # Leave it alone if it's a Unicode string, a callable, a 1563 # Leave it alone if it's a Unicode string, a callable, a
1450 # regular expression, a boolean, or None. 1564 # regular expression, a boolean, or None.
1451 if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') 1565 if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
1452 or isinstance(value, bool) or value is None): 1566 or isinstance(value, bool) or value is None):
1453 return value 1567 return value
1454 1568
@@ -1461,7 +1575,7 @@ class SoupStrainer(object):
1461 new_value = [] 1575 new_value = []
1462 for v in value: 1576 for v in value:
1463 if (hasattr(v, '__iter__') and not isinstance(v, bytes) 1577 if (hasattr(v, '__iter__') and not isinstance(v, bytes)
1464 and not isinstance(v, unicode)): 1578 and not isinstance(v, str)):
1465 # This is almost certainly the user's mistake. In the 1579 # This is almost certainly the user's mistake. In the
1466 # interests of avoiding infinite loops, we'll let 1580 # interests of avoiding infinite loops, we'll let
1467 # it through as-is rather than doing a recursive call. 1581 # it through as-is rather than doing a recursive call.
@@ -1473,7 +1587,7 @@ class SoupStrainer(object):
1473 # Otherwise, convert it into a Unicode string. 1587 # Otherwise, convert it into a Unicode string.
1474 # The unicode(str()) thing is so this will do the same thing on Python 2 1588 # The unicode(str()) thing is so this will do the same thing on Python 2
1475 # and Python 3. 1589 # and Python 3.
1476 return unicode(str(value)) 1590 return str(str(value))
1477 1591
1478 def __str__(self): 1592 def __str__(self):
1479 if self.text: 1593 if self.text:
@@ -1527,7 +1641,7 @@ class SoupStrainer(object):
1527 found = None 1641 found = None
1528 # If given a list of items, scan it for a text element that 1642 # If given a list of items, scan it for a text element that
1529 # matches. 1643 # matches.
1530 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): 1644 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
1531 for element in markup: 1645 for element in markup:
1532 if isinstance(element, NavigableString) \ 1646 if isinstance(element, NavigableString) \
1533 and self.search(element): 1647 and self.search(element):
@@ -1540,7 +1654,7 @@ class SoupStrainer(object):
1540 found = self.search_tag(markup) 1654 found = self.search_tag(markup)
1541 # If it's text, make sure the text matches. 1655 # If it's text, make sure the text matches.
1542 elif isinstance(markup, NavigableString) or \ 1656 elif isinstance(markup, NavigableString) or \
1543 isinstance(markup, basestring): 1657 isinstance(markup, str):
1544 if not self.name and not self.attrs and self._matches(markup, self.text): 1658 if not self.name and not self.attrs and self._matches(markup, self.text):
1545 found = markup 1659 found = markup
1546 else: 1660 else:
@@ -1554,7 +1668,7 @@ class SoupStrainer(object):
1554 if isinstance(markup, list) or isinstance(markup, tuple): 1668 if isinstance(markup, list) or isinstance(markup, tuple):
1555 # This should only happen when searching a multi-valued attribute 1669 # This should only happen when searching a multi-valued attribute
1556 # like 'class'. 1670 # like 'class'.
1557 if (isinstance(match_against, unicode) 1671 if (isinstance(match_against, str)
1558 and ' ' in match_against): 1672 and ' ' in match_against):
1559 # A bit of a special case. If they try to match "foo 1673 # A bit of a special case. If they try to match "foo
1560 # bar" on a multivalue attribute's value, only accept 1674 # bar" on a multivalue attribute's value, only accept
@@ -1589,7 +1703,7 @@ class SoupStrainer(object):
1589 # None matches None, False, an empty string, an empty list, and so on. 1703 # None matches None, False, an empty string, an empty list, and so on.
1590 return not match_against 1704 return not match_against
1591 1705
1592 if isinstance(match_against, unicode): 1706 if isinstance(match_against, str):
1593 # Exact string match 1707 # Exact string match
1594 return markup == match_against 1708 return markup == match_against
1595 1709