diff options
Diffstat (limited to 'bitbake/lib/bs4/element.py')
-rw-r--r-- | bitbake/lib/bs4/element.py | 346 |
1 files changed, 230 insertions, 116 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py index da9afdf48e..0e62c2e100 100644 --- a/bitbake/lib/bs4/element.py +++ b/bitbake/lib/bs4/element.py | |||
@@ -1,3 +1,6 @@ | |||
1 | __license__ = "MIT" | ||
2 | |||
3 | from pdb import set_trace | ||
1 | import collections | 4 | import collections |
2 | import re | 5 | import re |
3 | import sys | 6 | import sys |
@@ -21,22 +24,22 @@ def _alias(attr): | |||
21 | return alias | 24 | return alias |
22 | 25 | ||
23 | 26 | ||
24 | class NamespacedAttribute(unicode): | 27 | class NamespacedAttribute(str): |
25 | 28 | ||
26 | def __new__(cls, prefix, name, namespace=None): | 29 | def __new__(cls, prefix, name, namespace=None): |
27 | if name is None: | 30 | if name is None: |
28 | obj = unicode.__new__(cls, prefix) | 31 | obj = str.__new__(cls, prefix) |
29 | elif prefix is None: | 32 | elif prefix is None: |
30 | # Not really namespaced. | 33 | # Not really namespaced. |
31 | obj = unicode.__new__(cls, name) | 34 | obj = str.__new__(cls, name) |
32 | else: | 35 | else: |
33 | obj = unicode.__new__(cls, prefix + ":" + name) | 36 | obj = str.__new__(cls, prefix + ":" + name) |
34 | obj.prefix = prefix | 37 | obj.prefix = prefix |
35 | obj.name = name | 38 | obj.name = name |
36 | obj.namespace = namespace | 39 | obj.namespace = namespace |
37 | return obj | 40 | return obj |
38 | 41 | ||
39 | class AttributeValueWithCharsetSubstitution(unicode): | 42 | class AttributeValueWithCharsetSubstitution(str): |
40 | """A stand-in object for a character encoding specified in HTML.""" | 43 | """A stand-in object for a character encoding specified in HTML.""" |
41 | 44 | ||
42 | class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): | 45 | class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): |
@@ -47,7 +50,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): | |||
47 | """ | 50 | """ |
48 | 51 | ||
49 | def __new__(cls, original_value): | 52 | def __new__(cls, original_value): |
50 | obj = unicode.__new__(cls, original_value) | 53 | obj = str.__new__(cls, original_value) |
51 | obj.original_value = original_value | 54 | obj.original_value = original_value |
52 | return obj | 55 | return obj |
53 | 56 | ||
@@ -70,9 +73,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): | |||
70 | match = cls.CHARSET_RE.search(original_value) | 73 | match = cls.CHARSET_RE.search(original_value) |
71 | if match is None: | 74 | if match is None: |
72 | # No substitution necessary. | 75 | # No substitution necessary. |
73 | return unicode.__new__(unicode, original_value) | 76 | return str.__new__(str, original_value) |
74 | 77 | ||
75 | obj = unicode.__new__(cls, original_value) | 78 | obj = str.__new__(cls, original_value) |
76 | obj.original_value = original_value | 79 | obj.original_value = original_value |
77 | return obj | 80 | return obj |
78 | 81 | ||
@@ -152,7 +155,7 @@ class PageElement(object): | |||
152 | 155 | ||
153 | def format_string(self, s, formatter='minimal'): | 156 | def format_string(self, s, formatter='minimal'): |
154 | """Format the given string using the given formatter.""" | 157 | """Format the given string using the given formatter.""" |
155 | if not callable(formatter): | 158 | if not isinstance(formatter, collections.Callable): |
156 | formatter = self._formatter_for_name(formatter) | 159 | formatter = self._formatter_for_name(formatter) |
157 | if formatter is None: | 160 | if formatter is None: |
158 | output = s | 161 | output = s |
@@ -185,24 +188,40 @@ class PageElement(object): | |||
185 | return self.HTML_FORMATTERS.get( | 188 | return self.HTML_FORMATTERS.get( |
186 | name, HTMLAwareEntitySubstitution.substitute_xml) | 189 | name, HTMLAwareEntitySubstitution.substitute_xml) |
187 | 190 | ||
188 | def setup(self, parent=None, previous_element=None): | 191 | def setup(self, parent=None, previous_element=None, next_element=None, |
192 | previous_sibling=None, next_sibling=None): | ||
189 | """Sets up the initial relations between this element and | 193 | """Sets up the initial relations between this element and |
190 | other elements.""" | 194 | other elements.""" |
191 | self.parent = parent | 195 | self.parent = parent |
196 | |||
192 | self.previous_element = previous_element | 197 | self.previous_element = previous_element |
193 | if previous_element is not None: | 198 | if previous_element is not None: |
194 | self.previous_element.next_element = self | 199 | self.previous_element.next_element = self |
195 | self.next_element = None | 200 | |
196 | self.previous_sibling = None | 201 | self.next_element = next_element |
197 | self.next_sibling = None | 202 | if self.next_element: |
198 | if self.parent is not None and self.parent.contents: | 203 | self.next_element.previous_element = self |
199 | self.previous_sibling = self.parent.contents[-1] | 204 | |
205 | self.next_sibling = next_sibling | ||
206 | if self.next_sibling: | ||
207 | self.next_sibling.previous_sibling = self | ||
208 | |||
209 | if (not previous_sibling | ||
210 | and self.parent is not None and self.parent.contents): | ||
211 | previous_sibling = self.parent.contents[-1] | ||
212 | |||
213 | self.previous_sibling = previous_sibling | ||
214 | if previous_sibling: | ||
200 | self.previous_sibling.next_sibling = self | 215 | self.previous_sibling.next_sibling = self |
201 | 216 | ||
202 | nextSibling = _alias("next_sibling") # BS3 | 217 | nextSibling = _alias("next_sibling") # BS3 |
203 | previousSibling = _alias("previous_sibling") # BS3 | 218 | previousSibling = _alias("previous_sibling") # BS3 |
204 | 219 | ||
205 | def replace_with(self, replace_with): | 220 | def replace_with(self, replace_with): |
221 | if not self.parent: | ||
222 | raise ValueError( | ||
223 | "Cannot replace one element with another when the" | ||
224 | "element to be replaced is not part of a tree.") | ||
206 | if replace_with is self: | 225 | if replace_with is self: |
207 | return | 226 | return |
208 | if replace_with is self.parent: | 227 | if replace_with is self.parent: |
@@ -216,6 +235,10 @@ class PageElement(object): | |||
216 | 235 | ||
217 | def unwrap(self): | 236 | def unwrap(self): |
218 | my_parent = self.parent | 237 | my_parent = self.parent |
238 | if not self.parent: | ||
239 | raise ValueError( | ||
240 | "Cannot replace an element with its contents when that" | ||
241 | "element is not part of a tree.") | ||
219 | my_index = self.parent.index(self) | 242 | my_index = self.parent.index(self) |
220 | self.extract() | 243 | self.extract() |
221 | for child in reversed(self.contents[:]): | 244 | for child in reversed(self.contents[:]): |
@@ -240,17 +263,20 @@ class PageElement(object): | |||
240 | last_child = self._last_descendant() | 263 | last_child = self._last_descendant() |
241 | next_element = last_child.next_element | 264 | next_element = last_child.next_element |
242 | 265 | ||
243 | if self.previous_element is not None: | 266 | if (self.previous_element is not None and |
267 | self.previous_element is not next_element): | ||
244 | self.previous_element.next_element = next_element | 268 | self.previous_element.next_element = next_element |
245 | if next_element is not None: | 269 | if next_element is not None and next_element is not self.previous_element: |
246 | next_element.previous_element = self.previous_element | 270 | next_element.previous_element = self.previous_element |
247 | self.previous_element = None | 271 | self.previous_element = None |
248 | last_child.next_element = None | 272 | last_child.next_element = None |
249 | 273 | ||
250 | self.parent = None | 274 | self.parent = None |
251 | if self.previous_sibling is not None: | 275 | if (self.previous_sibling is not None |
276 | and self.previous_sibling is not self.next_sibling): | ||
252 | self.previous_sibling.next_sibling = self.next_sibling | 277 | self.previous_sibling.next_sibling = self.next_sibling |
253 | if self.next_sibling is not None: | 278 | if (self.next_sibling is not None |
279 | and self.next_sibling is not self.previous_sibling): | ||
254 | self.next_sibling.previous_sibling = self.previous_sibling | 280 | self.next_sibling.previous_sibling = self.previous_sibling |
255 | self.previous_sibling = self.next_sibling = None | 281 | self.previous_sibling = self.next_sibling = None |
256 | return self | 282 | return self |
@@ -263,16 +289,18 @@ class PageElement(object): | |||
263 | last_child = self | 289 | last_child = self |
264 | while isinstance(last_child, Tag) and last_child.contents: | 290 | while isinstance(last_child, Tag) and last_child.contents: |
265 | last_child = last_child.contents[-1] | 291 | last_child = last_child.contents[-1] |
266 | if not accept_self and last_child == self: | 292 | if not accept_self and last_child is self: |
267 | last_child = None | 293 | last_child = None |
268 | return last_child | 294 | return last_child |
269 | # BS3: Not part of the API! | 295 | # BS3: Not part of the API! |
270 | _lastRecursiveChild = _last_descendant | 296 | _lastRecursiveChild = _last_descendant |
271 | 297 | ||
272 | def insert(self, position, new_child): | 298 | def insert(self, position, new_child): |
299 | if new_child is None: | ||
300 | raise ValueError("Cannot insert None into a tag.") | ||
273 | if new_child is self: | 301 | if new_child is self: |
274 | raise ValueError("Cannot insert a tag into itself.") | 302 | raise ValueError("Cannot insert a tag into itself.") |
275 | if (isinstance(new_child, basestring) | 303 | if (isinstance(new_child, str) |
276 | and not isinstance(new_child, NavigableString)): | 304 | and not isinstance(new_child, NavigableString)): |
277 | new_child = NavigableString(new_child) | 305 | new_child = NavigableString(new_child) |
278 | 306 | ||
@@ -478,6 +506,10 @@ class PageElement(object): | |||
478 | def _find_all(self, name, attrs, text, limit, generator, **kwargs): | 506 | def _find_all(self, name, attrs, text, limit, generator, **kwargs): |
479 | "Iterates over a generator looking for things that match." | 507 | "Iterates over a generator looking for things that match." |
480 | 508 | ||
509 | if text is None and 'string' in kwargs: | ||
510 | text = kwargs['string'] | ||
511 | del kwargs['string'] | ||
512 | |||
481 | if isinstance(name, SoupStrainer): | 513 | if isinstance(name, SoupStrainer): |
482 | strainer = name | 514 | strainer = name |
483 | else: | 515 | else: |
@@ -489,7 +521,7 @@ class PageElement(object): | |||
489 | result = (element for element in generator | 521 | result = (element for element in generator |
490 | if isinstance(element, Tag)) | 522 | if isinstance(element, Tag)) |
491 | return ResultSet(strainer, result) | 523 | return ResultSet(strainer, result) |
492 | elif isinstance(name, basestring): | 524 | elif isinstance(name, str): |
493 | # Optimization to find all tags with a given name. | 525 | # Optimization to find all tags with a given name. |
494 | result = (element for element in generator | 526 | result = (element for element in generator |
495 | if isinstance(element, Tag) | 527 | if isinstance(element, Tag) |
@@ -548,17 +580,17 @@ class PageElement(object): | |||
548 | 580 | ||
549 | # Methods for supporting CSS selectors. | 581 | # Methods for supporting CSS selectors. |
550 | 582 | ||
551 | tag_name_re = re.compile('^[a-z0-9]+$') | 583 | tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') |
552 | 584 | ||
553 | # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ | 585 | # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ |
554 | # \---/ \---/\-------------/ \-------/ | 586 | # \---------------------------/ \---/\-------------/ \-------/ |
555 | # | | | | | 587 | # | | | | |
556 | # | | | The value | 588 | # | | | The value |
557 | # | | ~,|,^,$,* or = | 589 | # | | ~,|,^,$,* or = |
558 | # | Attribute | 590 | # | Attribute |
559 | # Tag | 591 | # Tag |
560 | attribselect_re = re.compile( | 592 | attribselect_re = re.compile( |
561 | r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + | 593 | r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + |
562 | r'=?"?(?P<value>[^\]"]*)"?\]$' | 594 | r'=?"?(?P<value>[^\]"]*)"?\]$' |
563 | ) | 595 | ) |
564 | 596 | ||
@@ -640,7 +672,7 @@ class PageElement(object): | |||
640 | return self.parents | 672 | return self.parents |
641 | 673 | ||
642 | 674 | ||
643 | class NavigableString(unicode, PageElement): | 675 | class NavigableString(str, PageElement): |
644 | 676 | ||
645 | PREFIX = '' | 677 | PREFIX = '' |
646 | SUFFIX = '' | 678 | SUFFIX = '' |
@@ -653,15 +685,21 @@ class NavigableString(unicode, PageElement): | |||
653 | passed in to the superclass's __new__ or the superclass won't know | 685 | passed in to the superclass's __new__ or the superclass won't know |
654 | how to handle non-ASCII characters. | 686 | how to handle non-ASCII characters. |
655 | """ | 687 | """ |
656 | if isinstance(value, unicode): | 688 | if isinstance(value, str): |
657 | return unicode.__new__(cls, value) | 689 | u = str.__new__(cls, value) |
658 | return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) | 690 | else: |
691 | u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) | ||
692 | u.setup() | ||
693 | return u | ||
659 | 694 | ||
660 | def __copy__(self): | 695 | def __copy__(self): |
661 | return self | 696 | """A copy of a NavigableString has the same contents and class |
697 | as the original, but it is not connected to the parse tree. | ||
698 | """ | ||
699 | return type(self)(self) | ||
662 | 700 | ||
663 | def __getnewargs__(self): | 701 | def __getnewargs__(self): |
664 | return (unicode(self),) | 702 | return (str(self),) |
665 | 703 | ||
666 | def __getattr__(self, attr): | 704 | def __getattr__(self, attr): |
667 | """text.string gives you text. This is for backwards | 705 | """text.string gives you text. This is for backwards |
@@ -701,23 +739,23 @@ class PreformattedString(NavigableString): | |||
701 | 739 | ||
702 | class CData(PreformattedString): | 740 | class CData(PreformattedString): |
703 | 741 | ||
704 | PREFIX = u'<![CDATA[' | 742 | PREFIX = '<![CDATA[' |
705 | SUFFIX = u']]>' | 743 | SUFFIX = ']]>' |
706 | 744 | ||
707 | class ProcessingInstruction(PreformattedString): | 745 | class ProcessingInstruction(PreformattedString): |
708 | 746 | ||
709 | PREFIX = u'<?' | 747 | PREFIX = '<?' |
710 | SUFFIX = u'?>' | 748 | SUFFIX = '>' |
711 | 749 | ||
712 | class Comment(PreformattedString): | 750 | class Comment(PreformattedString): |
713 | 751 | ||
714 | PREFIX = u'<!--' | 752 | PREFIX = '<!--' |
715 | SUFFIX = u'-->' | 753 | SUFFIX = '-->' |
716 | 754 | ||
717 | 755 | ||
718 | class Declaration(PreformattedString): | 756 | class Declaration(PreformattedString): |
719 | PREFIX = u'<!' | 757 | PREFIX = '<?' |
720 | SUFFIX = u'!>' | 758 | SUFFIX = '?>' |
721 | 759 | ||
722 | 760 | ||
723 | class Doctype(PreformattedString): | 761 | class Doctype(PreformattedString): |
@@ -734,8 +772,8 @@ class Doctype(PreformattedString): | |||
734 | 772 | ||
735 | return Doctype(value) | 773 | return Doctype(value) |
736 | 774 | ||
737 | PREFIX = u'<!DOCTYPE ' | 775 | PREFIX = '<!DOCTYPE ' |
738 | SUFFIX = u'>\n' | 776 | SUFFIX = '>\n' |
739 | 777 | ||
740 | 778 | ||
741 | class Tag(PageElement): | 779 | class Tag(PageElement): |
@@ -759,9 +797,12 @@ class Tag(PageElement): | |||
759 | self.prefix = prefix | 797 | self.prefix = prefix |
760 | if attrs is None: | 798 | if attrs is None: |
761 | attrs = {} | 799 | attrs = {} |
762 | elif attrs and builder.cdata_list_attributes: | 800 | elif attrs: |
763 | attrs = builder._replace_cdata_list_attribute_values( | 801 | if builder is not None and builder.cdata_list_attributes: |
764 | self.name, attrs) | 802 | attrs = builder._replace_cdata_list_attribute_values( |
803 | self.name, attrs) | ||
804 | else: | ||
805 | attrs = dict(attrs) | ||
765 | else: | 806 | else: |
766 | attrs = dict(attrs) | 807 | attrs = dict(attrs) |
767 | self.attrs = attrs | 808 | self.attrs = attrs |
@@ -778,6 +819,18 @@ class Tag(PageElement): | |||
778 | 819 | ||
779 | parserClass = _alias("parser_class") # BS3 | 820 | parserClass = _alias("parser_class") # BS3 |
780 | 821 | ||
822 | def __copy__(self): | ||
823 | """A copy of a Tag is a new Tag, unconnected to the parse tree. | ||
824 | Its contents are a copy of the old Tag's contents. | ||
825 | """ | ||
826 | clone = type(self)(None, self.builder, self.name, self.namespace, | ||
827 | self.nsprefix, self.attrs) | ||
828 | for attr in ('can_be_empty_element', 'hidden'): | ||
829 | setattr(clone, attr, getattr(self, attr)) | ||
830 | for child in self.contents: | ||
831 | clone.append(child.__copy__()) | ||
832 | return clone | ||
833 | |||
781 | @property | 834 | @property |
782 | def is_empty_element(self): | 835 | def is_empty_element(self): |
783 | """Is this tag an empty-element tag? (aka a self-closing tag) | 836 | """Is this tag an empty-element tag? (aka a self-closing tag) |
@@ -843,7 +896,7 @@ class Tag(PageElement): | |||
843 | for string in self._all_strings(True): | 896 | for string in self._all_strings(True): |
844 | yield string | 897 | yield string |
845 | 898 | ||
846 | def get_text(self, separator=u"", strip=False, | 899 | def get_text(self, separator="", strip=False, |
847 | types=(NavigableString, CData)): | 900 | types=(NavigableString, CData)): |
848 | """ | 901 | """ |
849 | Get all child strings, concatenated using the given separator. | 902 | Get all child strings, concatenated using the given separator. |
@@ -915,7 +968,7 @@ class Tag(PageElement): | |||
915 | def __contains__(self, x): | 968 | def __contains__(self, x): |
916 | return x in self.contents | 969 | return x in self.contents |
917 | 970 | ||
918 | def __nonzero__(self): | 971 | def __bool__(self): |
919 | "A tag is non-None even if it has no contents." | 972 | "A tag is non-None even if it has no contents." |
920 | return True | 973 | return True |
921 | 974 | ||
@@ -971,15 +1024,25 @@ class Tag(PageElement): | |||
971 | as defined in __eq__.""" | 1024 | as defined in __eq__.""" |
972 | return not self == other | 1025 | return not self == other |
973 | 1026 | ||
974 | def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): | 1027 | def __repr__(self, encoding="unicode-escape"): |
975 | """Renders this tag as a string.""" | 1028 | """Renders this tag as a string.""" |
976 | return self.encode(encoding) | 1029 | if PY3K: |
1030 | # "The return value must be a string object", i.e. Unicode | ||
1031 | return self.decode() | ||
1032 | else: | ||
1033 | # "The return value must be a string object", i.e. a bytestring. | ||
1034 | # By convention, the return value of __repr__ should also be | ||
1035 | # an ASCII string. | ||
1036 | return self.encode(encoding) | ||
977 | 1037 | ||
978 | def __unicode__(self): | 1038 | def __unicode__(self): |
979 | return self.decode() | 1039 | return self.decode() |
980 | 1040 | ||
981 | def __str__(self): | 1041 | def __str__(self): |
982 | return self.encode() | 1042 | if PY3K: |
1043 | return self.decode() | ||
1044 | else: | ||
1045 | return self.encode() | ||
983 | 1046 | ||
984 | if PY3K: | 1047 | if PY3K: |
985 | __str__ = __repr__ = __unicode__ | 1048 | __str__ = __repr__ = __unicode__ |
@@ -1014,7 +1077,7 @@ class Tag(PageElement): | |||
1014 | 1077 | ||
1015 | # First off, turn a string formatter into a function. This | 1078 | # First off, turn a string formatter into a function. This |
1016 | # will stop the lookup from happening over and over again. | 1079 | # will stop the lookup from happening over and over again. |
1017 | if not callable(formatter): | 1080 | if not isinstance(formatter, collections.Callable): |
1018 | formatter = self._formatter_for_name(formatter) | 1081 | formatter = self._formatter_for_name(formatter) |
1019 | 1082 | ||
1020 | attrs = [] | 1083 | attrs = [] |
@@ -1025,8 +1088,8 @@ class Tag(PageElement): | |||
1025 | else: | 1088 | else: |
1026 | if isinstance(val, list) or isinstance(val, tuple): | 1089 | if isinstance(val, list) or isinstance(val, tuple): |
1027 | val = ' '.join(val) | 1090 | val = ' '.join(val) |
1028 | elif not isinstance(val, basestring): | 1091 | elif not isinstance(val, str): |
1029 | val = unicode(val) | 1092 | val = str(val) |
1030 | elif ( | 1093 | elif ( |
1031 | isinstance(val, AttributeValueWithCharsetSubstitution) | 1094 | isinstance(val, AttributeValueWithCharsetSubstitution) |
1032 | and eventual_encoding is not None): | 1095 | and eventual_encoding is not None): |
@@ -1034,7 +1097,7 @@ class Tag(PageElement): | |||
1034 | 1097 | ||
1035 | text = self.format_string(val, formatter) | 1098 | text = self.format_string(val, formatter) |
1036 | decoded = ( | 1099 | decoded = ( |
1037 | unicode(key) + '=' | 1100 | str(key) + '=' |
1038 | + EntitySubstitution.quoted_attribute_value(text)) | 1101 | + EntitySubstitution.quoted_attribute_value(text)) |
1039 | attrs.append(decoded) | 1102 | attrs.append(decoded) |
1040 | close = '' | 1103 | close = '' |
@@ -1103,16 +1166,22 @@ class Tag(PageElement): | |||
1103 | formatter="minimal"): | 1166 | formatter="minimal"): |
1104 | """Renders the contents of this tag as a Unicode string. | 1167 | """Renders the contents of this tag as a Unicode string. |
1105 | 1168 | ||
1169 | :param indent_level: Each line of the rendering will be | ||
1170 | indented this many spaces. | ||
1171 | |||
1106 | :param eventual_encoding: The tag is destined to be | 1172 | :param eventual_encoding: The tag is destined to be |
1107 | encoded into this encoding. This method is _not_ | 1173 | encoded into this encoding. This method is _not_ |
1108 | responsible for performing that encoding. This information | 1174 | responsible for performing that encoding. This information |
1109 | is passed in so that it can be substituted in if the | 1175 | is passed in so that it can be substituted in if the |
1110 | document contains a <META> tag that mentions the document's | 1176 | document contains a <META> tag that mentions the document's |
1111 | encoding. | 1177 | encoding. |
1178 | |||
1179 | :param formatter: The output formatter responsible for converting | ||
1180 | entities to Unicode characters. | ||
1112 | """ | 1181 | """ |
1113 | # First off, turn a string formatter into a function. This | 1182 | # First off, turn a string formatter into a function. This |
1114 | # will stop the lookup from happening over and over again. | 1183 | # will stop the lookup from happening over and over again. |
1115 | if not callable(formatter): | 1184 | if not isinstance(formatter, collections.Callable): |
1116 | formatter = self._formatter_for_name(formatter) | 1185 | formatter = self._formatter_for_name(formatter) |
1117 | 1186 | ||
1118 | pretty_print = (indent_level is not None) | 1187 | pretty_print = (indent_level is not None) |
@@ -1137,7 +1206,17 @@ class Tag(PageElement): | |||
1137 | def encode_contents( | 1206 | def encode_contents( |
1138 | self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, | 1207 | self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, |
1139 | formatter="minimal"): | 1208 | formatter="minimal"): |
1140 | """Renders the contents of this tag as a bytestring.""" | 1209 | """Renders the contents of this tag as a bytestring. |
1210 | |||
1211 | :param indent_level: Each line of the rendering will be | ||
1212 | indented this many spaces. | ||
1213 | |||
1214 | :param eventual_encoding: The bytestring will be in this encoding. | ||
1215 | |||
1216 | :param formatter: The output formatter responsible for converting | ||
1217 | entities to Unicode characters. | ||
1218 | """ | ||
1219 | |||
1141 | contents = self.decode_contents(indent_level, encoding, formatter) | 1220 | contents = self.decode_contents(indent_level, encoding, formatter) |
1142 | return contents.encode(encoding) | 1221 | return contents.encode(encoding) |
1143 | 1222 | ||
@@ -1201,26 +1280,57 @@ class Tag(PageElement): | |||
1201 | 1280 | ||
1202 | _selector_combinators = ['>', '+', '~'] | 1281 | _selector_combinators = ['>', '+', '~'] |
1203 | _select_debug = False | 1282 | _select_debug = False |
1204 | def select(self, selector, _candidate_generator=None): | 1283 | def select_one(self, selector): |
1284 | """Perform a CSS selection operation on the current element.""" | ||
1285 | value = self.select(selector, limit=1) | ||
1286 | if value: | ||
1287 | return value[0] | ||
1288 | return None | ||
1289 | |||
1290 | def select(self, selector, _candidate_generator=None, limit=None): | ||
1205 | """Perform a CSS selection operation on the current element.""" | 1291 | """Perform a CSS selection operation on the current element.""" |
1292 | |||
1293 | # Handle grouping selectors if ',' exists, ie: p,a | ||
1294 | if ',' in selector: | ||
1295 | context = [] | ||
1296 | for partial_selector in selector.split(','): | ||
1297 | partial_selector = partial_selector.strip() | ||
1298 | if partial_selector == '': | ||
1299 | raise ValueError('Invalid group selection syntax: %s' % selector) | ||
1300 | candidates = self.select(partial_selector, limit=limit) | ||
1301 | for candidate in candidates: | ||
1302 | if candidate not in context: | ||
1303 | context.append(candidate) | ||
1304 | |||
1305 | if limit and len(context) >= limit: | ||
1306 | break | ||
1307 | return context | ||
1308 | |||
1206 | tokens = selector.split() | 1309 | tokens = selector.split() |
1207 | current_context = [self] | 1310 | current_context = [self] |
1208 | 1311 | ||
1209 | if tokens[-1] in self._selector_combinators: | 1312 | if tokens[-1] in self._selector_combinators: |
1210 | raise ValueError( | 1313 | raise ValueError( |
1211 | 'Final combinator "%s" is missing an argument.' % tokens[-1]) | 1314 | 'Final combinator "%s" is missing an argument.' % tokens[-1]) |
1315 | |||
1212 | if self._select_debug: | 1316 | if self._select_debug: |
1213 | print 'Running CSS selector "%s"' % selector | 1317 | print('Running CSS selector "%s"' % selector) |
1318 | |||
1214 | for index, token in enumerate(tokens): | 1319 | for index, token in enumerate(tokens): |
1215 | if self._select_debug: | 1320 | new_context = [] |
1216 | print ' Considering token "%s"' % token | 1321 | new_context_ids = set([]) |
1217 | recursive_candidate_generator = None | 1322 | |
1218 | tag_name = None | ||
1219 | if tokens[index-1] in self._selector_combinators: | 1323 | if tokens[index-1] in self._selector_combinators: |
1220 | # This token was consumed by the previous combinator. Skip it. | 1324 | # This token was consumed by the previous combinator. Skip it. |
1221 | if self._select_debug: | 1325 | if self._select_debug: |
1222 | print ' Token was consumed by the previous combinator.' | 1326 | print(' Token was consumed by the previous combinator.') |
1223 | continue | 1327 | continue |
1328 | |||
1329 | if self._select_debug: | ||
1330 | print(' Considering token "%s"' % token) | ||
1331 | recursive_candidate_generator = None | ||
1332 | tag_name = None | ||
1333 | |||
1224 | # Each operation corresponds to a checker function, a rule | 1334 | # Each operation corresponds to a checker function, a rule |
1225 | # for determining whether a candidate matches the | 1335 | # for determining whether a candidate matches the |
1226 | # selector. Candidates are generated by the active | 1336 | # selector. Candidates are generated by the active |
@@ -1256,35 +1366,38 @@ class Tag(PageElement): | |||
1256 | "A pseudo-class must be prefixed with a tag name.") | 1366 | "A pseudo-class must be prefixed with a tag name.") |
1257 | pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) | 1367 | pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) |
1258 | found = [] | 1368 | found = [] |
1259 | if pseudo_attributes is not None: | 1369 | if pseudo_attributes is None: |
1370 | pseudo_type = pseudo | ||
1371 | pseudo_value = None | ||
1372 | else: | ||
1260 | pseudo_type, pseudo_value = pseudo_attributes.groups() | 1373 | pseudo_type, pseudo_value = pseudo_attributes.groups() |
1261 | if pseudo_type == 'nth-of-type': | 1374 | if pseudo_type == 'nth-of-type': |
1262 | try: | 1375 | try: |
1263 | pseudo_value = int(pseudo_value) | 1376 | pseudo_value = int(pseudo_value) |
1264 | except: | 1377 | except: |
1265 | raise NotImplementedError( | ||
1266 | 'Only numeric values are currently supported for the nth-of-type pseudo-class.') | ||
1267 | if pseudo_value < 1: | ||
1268 | raise ValueError( | ||
1269 | 'nth-of-type pseudo-class value must be at least 1.') | ||
1270 | class Counter(object): | ||
1271 | def __init__(self, destination): | ||
1272 | self.count = 0 | ||
1273 | self.destination = destination | ||
1274 | |||
1275 | def nth_child_of_type(self, tag): | ||
1276 | self.count += 1 | ||
1277 | if self.count == self.destination: | ||
1278 | return True | ||
1279 | if self.count > self.destination: | ||
1280 | # Stop the generator that's sending us | ||
1281 | # these things. | ||
1282 | raise StopIteration() | ||
1283 | return False | ||
1284 | checker = Counter(pseudo_value).nth_child_of_type | ||
1285 | else: | ||
1286 | raise NotImplementedError( | 1378 | raise NotImplementedError( |
1287 | 'Only the following pseudo-classes are implemented: nth-of-type.') | 1379 | 'Only numeric values are currently supported for the nth-of-type pseudo-class.') |
1380 | if pseudo_value < 1: | ||
1381 | raise ValueError( | ||
1382 | 'nth-of-type pseudo-class value must be at least 1.') | ||
1383 | class Counter(object): | ||
1384 | def __init__(self, destination): | ||
1385 | self.count = 0 | ||
1386 | self.destination = destination | ||
1387 | |||
1388 | def nth_child_of_type(self, tag): | ||
1389 | self.count += 1 | ||
1390 | if self.count == self.destination: | ||
1391 | return True | ||
1392 | if self.count > self.destination: | ||
1393 | # Stop the generator that's sending us | ||
1394 | # these things. | ||
1395 | raise StopIteration() | ||
1396 | return False | ||
1397 | checker = Counter(pseudo_value).nth_child_of_type | ||
1398 | else: | ||
1399 | raise NotImplementedError( | ||
1400 | 'Only the following pseudo-classes are implemented: nth-of-type.') | ||
1288 | 1401 | ||
1289 | elif token == '*': | 1402 | elif token == '*': |
1290 | # Star selector -- matches everything | 1403 | # Star selector -- matches everything |
@@ -1311,7 +1424,6 @@ class Tag(PageElement): | |||
1311 | else: | 1424 | else: |
1312 | raise ValueError( | 1425 | raise ValueError( |
1313 | 'Unsupported or invalid CSS selector: "%s"' % token) | 1426 | 'Unsupported or invalid CSS selector: "%s"' % token) |
1314 | |||
1315 | if recursive_candidate_generator: | 1427 | if recursive_candidate_generator: |
1316 | # This happens when the selector looks like "> foo". | 1428 | # This happens when the selector looks like "> foo". |
1317 | # | 1429 | # |
@@ -1325,14 +1437,14 @@ class Tag(PageElement): | |||
1325 | next_token = tokens[index+1] | 1437 | next_token = tokens[index+1] |
1326 | def recursive_select(tag): | 1438 | def recursive_select(tag): |
1327 | if self._select_debug: | 1439 | if self._select_debug: |
1328 | print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) | 1440 | print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) |
1329 | print '-' * 40 | 1441 | print('-' * 40) |
1330 | for i in tag.select(next_token, recursive_candidate_generator): | 1442 | for i in tag.select(next_token, recursive_candidate_generator): |
1331 | if self._select_debug: | 1443 | if self._select_debug: |
1332 | print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) | 1444 | print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) |
1333 | yield i | 1445 | yield i |
1334 | if self._select_debug: | 1446 | if self._select_debug: |
1335 | print '-' * 40 | 1447 | print('-' * 40) |
1336 | _use_candidate_generator = recursive_select | 1448 | _use_candidate_generator = recursive_select |
1337 | elif _candidate_generator is None: | 1449 | elif _candidate_generator is None: |
1338 | # By default, a tag's candidates are all of its | 1450 | # By default, a tag's candidates are all of its |
@@ -1343,7 +1455,7 @@ class Tag(PageElement): | |||
1343 | check = "[any]" | 1455 | check = "[any]" |
1344 | else: | 1456 | else: |
1345 | check = tag_name | 1457 | check = tag_name |
1346 | print ' Default candidate generator, tag name="%s"' % check | 1458 | print(' Default candidate generator, tag name="%s"' % check) |
1347 | if self._select_debug: | 1459 | if self._select_debug: |
1348 | # This is redundant with later code, but it stops | 1460 | # This is redundant with later code, but it stops |
1349 | # a bunch of bogus tags from cluttering up the | 1461 | # a bunch of bogus tags from cluttering up the |
@@ -1361,12 +1473,11 @@ class Tag(PageElement): | |||
1361 | else: | 1473 | else: |
1362 | _use_candidate_generator = _candidate_generator | 1474 | _use_candidate_generator = _candidate_generator |
1363 | 1475 | ||
1364 | new_context = [] | 1476 | count = 0 |
1365 | new_context_ids = set([]) | ||
1366 | for tag in current_context: | 1477 | for tag in current_context: |
1367 | if self._select_debug: | 1478 | if self._select_debug: |
1368 | print " Running candidate generator on %s %s" % ( | 1479 | print(" Running candidate generator on %s %s" % ( |
1369 | tag.name, repr(tag.attrs)) | 1480 | tag.name, repr(tag.attrs))) |
1370 | for candidate in _use_candidate_generator(tag): | 1481 | for candidate in _use_candidate_generator(tag): |
1371 | if not isinstance(candidate, Tag): | 1482 | if not isinstance(candidate, Tag): |
1372 | continue | 1483 | continue |
@@ -1381,21 +1492,24 @@ class Tag(PageElement): | |||
1381 | break | 1492 | break |
1382 | if checker is None or result: | 1493 | if checker is None or result: |
1383 | if self._select_debug: | 1494 | if self._select_debug: |
1384 | print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) | 1495 | print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) |
1385 | if id(candidate) not in new_context_ids: | 1496 | if id(candidate) not in new_context_ids: |
1386 | # If a tag matches a selector more than once, | 1497 | # If a tag matches a selector more than once, |
1387 | # don't include it in the context more than once. | 1498 | # don't include it in the context more than once. |
1388 | new_context.append(candidate) | 1499 | new_context.append(candidate) |
1389 | new_context_ids.add(id(candidate)) | 1500 | new_context_ids.add(id(candidate)) |
1501 | if limit and len(new_context) >= limit: | ||
1502 | break | ||
1390 | elif self._select_debug: | 1503 | elif self._select_debug: |
1391 | print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) | 1504 | print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) |
1505 | |||
1392 | 1506 | ||
1393 | current_context = new_context | 1507 | current_context = new_context |
1394 | 1508 | ||
1395 | if self._select_debug: | 1509 | if self._select_debug: |
1396 | print "Final verdict:" | 1510 | print("Final verdict:") |
1397 | for i in current_context: | 1511 | for i in current_context: |
1398 | print " %s %s" % (i.name, i.attrs) | 1512 | print(" %s %s" % (i.name, i.attrs)) |
1399 | return current_context | 1513 | return current_context |
1400 | 1514 | ||
1401 | # Old names for backwards compatibility | 1515 | # Old names for backwards compatibility |
@@ -1439,7 +1553,7 @@ class SoupStrainer(object): | |||
1439 | else: | 1553 | else: |
1440 | attrs = kwargs | 1554 | attrs = kwargs |
1441 | normalized_attrs = {} | 1555 | normalized_attrs = {} |
1442 | for key, value in attrs.items(): | 1556 | for key, value in list(attrs.items()): |
1443 | normalized_attrs[key] = self._normalize_search_value(value) | 1557 | normalized_attrs[key] = self._normalize_search_value(value) |
1444 | 1558 | ||
1445 | self.attrs = normalized_attrs | 1559 | self.attrs = normalized_attrs |
@@ -1448,7 +1562,7 @@ class SoupStrainer(object): | |||
1448 | def _normalize_search_value(self, value): | 1562 | def _normalize_search_value(self, value): |
1449 | # Leave it alone if it's a Unicode string, a callable, a | 1563 | # Leave it alone if it's a Unicode string, a callable, a |
1450 | # regular expression, a boolean, or None. | 1564 | # regular expression, a boolean, or None. |
1451 | if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') | 1565 | if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match') |
1452 | or isinstance(value, bool) or value is None): | 1566 | or isinstance(value, bool) or value is None): |
1453 | return value | 1567 | return value |
1454 | 1568 | ||
@@ -1461,7 +1575,7 @@ class SoupStrainer(object): | |||
1461 | new_value = [] | 1575 | new_value = [] |
1462 | for v in value: | 1576 | for v in value: |
1463 | if (hasattr(v, '__iter__') and not isinstance(v, bytes) | 1577 | if (hasattr(v, '__iter__') and not isinstance(v, bytes) |
1464 | and not isinstance(v, unicode)): | 1578 | and not isinstance(v, str)): |
1465 | # This is almost certainly the user's mistake. In the | 1579 | # This is almost certainly the user's mistake. In the |
1466 | # interests of avoiding infinite loops, we'll let | 1580 | # interests of avoiding infinite loops, we'll let |
1467 | # it through as-is rather than doing a recursive call. | 1581 | # it through as-is rather than doing a recursive call. |
@@ -1473,7 +1587,7 @@ class SoupStrainer(object): | |||
1473 | # Otherwise, convert it into a Unicode string. | 1587 | # Otherwise, convert it into a Unicode string. |
1474 | # The unicode(str()) thing is so this will do the same thing on Python 2 | 1588 | # The unicode(str()) thing is so this will do the same thing on Python 2 |
1475 | # and Python 3. | 1589 | # and Python 3. |
1476 | return unicode(str(value)) | 1590 | return str(str(value)) |
1477 | 1591 | ||
1478 | def __str__(self): | 1592 | def __str__(self): |
1479 | if self.text: | 1593 | if self.text: |
@@ -1527,7 +1641,7 @@ class SoupStrainer(object): | |||
1527 | found = None | 1641 | found = None |
1528 | # If given a list of items, scan it for a text element that | 1642 | # If given a list of items, scan it for a text element that |
1529 | # matches. | 1643 | # matches. |
1530 | if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): | 1644 | if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): |
1531 | for element in markup: | 1645 | for element in markup: |
1532 | if isinstance(element, NavigableString) \ | 1646 | if isinstance(element, NavigableString) \ |
1533 | and self.search(element): | 1647 | and self.search(element): |
@@ -1540,7 +1654,7 @@ class SoupStrainer(object): | |||
1540 | found = self.search_tag(markup) | 1654 | found = self.search_tag(markup) |
1541 | # If it's text, make sure the text matches. | 1655 | # If it's text, make sure the text matches. |
1542 | elif isinstance(markup, NavigableString) or \ | 1656 | elif isinstance(markup, NavigableString) or \ |
1543 | isinstance(markup, basestring): | 1657 | isinstance(markup, str): |
1544 | if not self.name and not self.attrs and self._matches(markup, self.text): | 1658 | if not self.name and not self.attrs and self._matches(markup, self.text): |
1545 | found = markup | 1659 | found = markup |
1546 | else: | 1660 | else: |
@@ -1554,7 +1668,7 @@ class SoupStrainer(object): | |||
1554 | if isinstance(markup, list) or isinstance(markup, tuple): | 1668 | if isinstance(markup, list) or isinstance(markup, tuple): |
1555 | # This should only happen when searching a multi-valued attribute | 1669 | # This should only happen when searching a multi-valued attribute |
1556 | # like 'class'. | 1670 | # like 'class'. |
1557 | if (isinstance(match_against, unicode) | 1671 | if (isinstance(match_against, str) |
1558 | and ' ' in match_against): | 1672 | and ' ' in match_against): |
1559 | # A bit of a special case. If they try to match "foo | 1673 | # A bit of a special case. If they try to match "foo |
1560 | # bar" on a multivalue attribute's value, only accept | 1674 | # bar" on a multivalue attribute's value, only accept |
@@ -1589,7 +1703,7 @@ class SoupStrainer(object): | |||
1589 | # None matches None, False, an empty string, an empty list, and so on. | 1703 | # None matches None, False, an empty string, an empty list, and so on. |
1590 | return not match_against | 1704 | return not match_against |
1591 | 1705 | ||
1592 | if isinstance(match_against, unicode): | 1706 | if isinstance(match_against, str): |
1593 | # Exact string match | 1707 | # Exact string match |
1594 | return markup == match_against | 1708 | return markup == match_against |
1595 | 1709 | ||