diff options
Diffstat (limited to 'bitbake/lib/bs4/element.py')
-rw-r--r-- | bitbake/lib/bs4/element.py | 2219 |
1 files changed, 1465 insertions, 754 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py index 68be42d138..0aefe734b2 100644 --- a/bitbake/lib/bs4/element.py +++ b/bitbake/lib/bs4/element.py | |||
@@ -1,14 +1,27 @@ | |||
1 | # Use of this source code is governed by the MIT license. | ||
1 | __license__ = "MIT" | 2 | __license__ = "MIT" |
2 | 3 | ||
3 | import collections.abc | 4 | try: |
5 | from collections.abc import Callable # Python 3.6 | ||
6 | except ImportError as e: | ||
7 | from collections import Callable | ||
4 | import re | 8 | import re |
5 | import sys | 9 | import sys |
6 | import warnings | 10 | import warnings |
7 | from bs4.dammit import EntitySubstitution | 11 | |
12 | from bs4.css import CSS | ||
13 | from bs4.formatter import ( | ||
14 | Formatter, | ||
15 | HTMLFormatter, | ||
16 | XMLFormatter, | ||
17 | ) | ||
8 | 18 | ||
9 | DEFAULT_OUTPUT_ENCODING = "utf-8" | 19 | DEFAULT_OUTPUT_ENCODING = "utf-8" |
10 | PY3K = (sys.version_info[0] > 2) | ||
11 | 20 | ||
21 | nonwhitespace_re = re.compile(r"\S+") | ||
22 | |||
23 | # NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on | ||
24 | # the off chance someone imported it for their own use. | ||
12 | whitespace_re = re.compile(r"\s+") | 25 | whitespace_re = re.compile(r"\s+") |
13 | 26 | ||
14 | def _alias(attr): | 27 | def _alias(attr): |
@@ -23,12 +36,49 @@ def _alias(attr): | |||
23 | return alias | 36 | return alias |
24 | 37 | ||
25 | 38 | ||
39 | # These encodings are recognized by Python (so PageElement.encode | ||
40 | # could theoretically support them) but XML and HTML don't recognize | ||
41 | # them (so they should not show up in an XML or HTML document as that | ||
42 | # document's encoding). | ||
43 | # | ||
44 | # If an XML document is encoded in one of these encodings, no encoding | ||
45 | # will be mentioned in the XML declaration. If an HTML document is | ||
46 | # encoded in one of these encodings, and the HTML document has a | ||
47 | # <meta> tag that mentions an encoding, the encoding will be given as | ||
48 | # the empty string. | ||
49 | # | ||
50 | # Source: | ||
51 | # https://docs.python.org/3/library/codecs.html#python-specific-encodings | ||
52 | PYTHON_SPECIFIC_ENCODINGS = set([ | ||
53 | "idna", | ||
54 | "mbcs", | ||
55 | "oem", | ||
56 | "palmos", | ||
57 | "punycode", | ||
58 | "raw_unicode_escape", | ||
59 | "undefined", | ||
60 | "unicode_escape", | ||
61 | "raw-unicode-escape", | ||
62 | "unicode-escape", | ||
63 | "string-escape", | ||
64 | "string_escape", | ||
65 | ]) | ||
66 | |||
67 | |||
26 | class NamespacedAttribute(str): | 68 | class NamespacedAttribute(str): |
69 | """A namespaced string (e.g. 'xml:lang') that remembers the namespace | ||
70 | ('xml') and the name ('lang') that were used to create it. | ||
71 | """ | ||
27 | 72 | ||
28 | def __new__(cls, prefix, name, namespace=None): | 73 | def __new__(cls, prefix, name=None, namespace=None): |
29 | if name is None: | 74 | if not name: |
75 | # This is the default namespace. Its name "has no value" | ||
76 | # per https://www.w3.org/TR/xml-names/#defaulting | ||
77 | name = None | ||
78 | |||
79 | if not name: | ||
30 | obj = str.__new__(cls, prefix) | 80 | obj = str.__new__(cls, prefix) |
31 | elif prefix is None: | 81 | elif not prefix: |
32 | # Not really namespaced. | 82 | # Not really namespaced. |
33 | obj = str.__new__(cls, name) | 83 | obj = str.__new__(cls, name) |
34 | else: | 84 | else: |
@@ -54,6 +104,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): | |||
54 | return obj | 104 | return obj |
55 | 105 | ||
56 | def encode(self, encoding): | 106 | def encode(self, encoding): |
107 | """When an HTML document is being encoded to a given encoding, the | ||
108 | value of a meta tag's 'charset' is the name of the encoding. | ||
109 | """ | ||
110 | if encoding in PYTHON_SPECIFIC_ENCODINGS: | ||
111 | return '' | ||
57 | return encoding | 112 | return encoding |
58 | 113 | ||
59 | 114 | ||
@@ -79,118 +134,44 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): | |||
79 | return obj | 134 | return obj |
80 | 135 | ||
81 | def encode(self, encoding): | 136 | def encode(self, encoding): |
137 | if encoding in PYTHON_SPECIFIC_ENCODINGS: | ||
138 | return '' | ||
82 | def rewrite(match): | 139 | def rewrite(match): |
83 | return match.group(1) + encoding | 140 | return match.group(1) + encoding |
84 | return self.CHARSET_RE.sub(rewrite, self.original_value) | 141 | return self.CHARSET_RE.sub(rewrite, self.original_value) |
85 | 142 | ||
86 | class HTMLAwareEntitySubstitution(EntitySubstitution): | ||
87 | |||
88 | """Entity substitution rules that are aware of some HTML quirks. | ||
89 | 143 | ||
90 | Specifically, the contents of <script> and <style> tags should not | 144 | class PageElement(object): |
91 | undergo entity substitution. | 145 | """Contains the navigational information for some part of the page: |
146 | that is, its current location in the parse tree. | ||
92 | 147 | ||
93 | Incoming NavigableString objects are checked to see if they're the | 148 | NavigableString, Tag, etc. are all subclasses of PageElement. |
94 | direct children of a <script> or <style> tag. | ||
95 | """ | 149 | """ |
96 | 150 | ||
97 | cdata_containing_tags = set(["script", "style"]) | 151 | # In general, we can't tell just by looking at an element whether |
152 | # it's contained in an XML document or an HTML document. But for | ||
153 | # Tags (q.v.) we can store this information at parse time. | ||
154 | known_xml = None | ||
98 | 155 | ||
99 | preformatted_tags = set(["pre"]) | 156 | def setup(self, parent=None, previous_element=None, next_element=None, |
100 | 157 | previous_sibling=None, next_sibling=None): | |
101 | @classmethod | 158 | """Sets up the initial relations between this element and |
102 | def _substitute_if_appropriate(cls, ns, f): | 159 | other elements. |
103 | if (isinstance(ns, NavigableString) | ||
104 | and ns.parent is not None | ||
105 | and ns.parent.name in cls.cdata_containing_tags): | ||
106 | # Do nothing. | ||
107 | return ns | ||
108 | # Substitute. | ||
109 | return f(ns) | ||
110 | 160 | ||
111 | @classmethod | 161 | :param parent: The parent of this element. |
112 | def substitute_html(cls, ns): | ||
113 | return cls._substitute_if_appropriate( | ||
114 | ns, EntitySubstitution.substitute_html) | ||
115 | 162 | ||
116 | @classmethod | 163 | :param previous_element: The element parsed immediately before |
117 | def substitute_xml(cls, ns): | 164 | this one. |
118 | return cls._substitute_if_appropriate( | ||
119 | ns, EntitySubstitution.substitute_xml) | ||
120 | 165 | ||
121 | class PageElement(object): | 166 | :param next_element: The element parsed immediately before |
122 | """Contains the navigational information for some part of the page | 167 | this one. |
123 | (either a tag or a piece of text)""" | ||
124 | |||
125 | # There are five possible values for the "formatter" argument passed in | ||
126 | # to methods like encode() and prettify(): | ||
127 | # | ||
128 | # "html" - All Unicode characters with corresponding HTML entities | ||
129 | # are converted to those entities on output. | ||
130 | # "minimal" - Bare ampersands and angle brackets are converted to | ||
131 | # XML entities: & < > | ||
132 | # None - The null formatter. Unicode characters are never | ||
133 | # converted to entities. This is not recommended, but it's | ||
134 | # faster than "minimal". | ||
135 | # A function - This function will be called on every string that | ||
136 | # needs to undergo entity substitution. | ||
137 | # | ||
138 | |||
139 | # In an HTML document, the default "html" and "minimal" functions | ||
140 | # will leave the contents of <script> and <style> tags alone. For | ||
141 | # an XML document, all tags will be given the same treatment. | ||
142 | |||
143 | HTML_FORMATTERS = { | ||
144 | "html" : HTMLAwareEntitySubstitution.substitute_html, | ||
145 | "minimal" : HTMLAwareEntitySubstitution.substitute_xml, | ||
146 | None : None | ||
147 | } | ||
148 | |||
149 | XML_FORMATTERS = { | ||
150 | "html" : EntitySubstitution.substitute_html, | ||
151 | "minimal" : EntitySubstitution.substitute_xml, | ||
152 | None : None | ||
153 | } | ||
154 | |||
155 | def format_string(self, s, formatter='minimal'): | ||
156 | """Format the given string using the given formatter.""" | ||
157 | if not isinstance(formatter, collections.abc.Callable): | ||
158 | formatter = self._formatter_for_name(formatter) | ||
159 | if formatter is None: | ||
160 | output = s | ||
161 | else: | ||
162 | output = formatter(s) | ||
163 | return output | ||
164 | 168 | ||
165 | @property | 169 | :param previous_sibling: The most recently encountered element |
166 | def _is_xml(self): | 170 | on the same level of the parse tree as this one. |
167 | """Is this element part of an XML tree or an HTML tree? | ||
168 | 171 | ||
169 | This is used when mapping a formatter name ("minimal") to an | 172 | :param previous_sibling: The next element to be encountered |
170 | appropriate function (one that performs entity-substitution on | 173 | on the same level of the parse tree as this one. |
171 | the contents of <script> and <style> tags, or not). It's | ||
172 | inefficient, but it should be called very rarely. | ||
173 | """ | 174 | """ |
174 | if self.parent is None: | ||
175 | # This is the top-level object. It should have .is_xml set | ||
176 | # from tree creation. If not, take a guess--BS is usually | ||
177 | # used on HTML markup. | ||
178 | return getattr(self, 'is_xml', False) | ||
179 | return self.parent._is_xml | ||
180 | |||
181 | def _formatter_for_name(self, name): | ||
182 | "Look up a formatter function based on its name and the tree." | ||
183 | if self._is_xml: | ||
184 | return self.XML_FORMATTERS.get( | ||
185 | name, EntitySubstitution.substitute_xml) | ||
186 | else: | ||
187 | return self.HTML_FORMATTERS.get( | ||
188 | name, HTMLAwareEntitySubstitution.substitute_xml) | ||
189 | |||
190 | def setup(self, parent=None, previous_element=None, next_element=None, | ||
191 | previous_sibling=None, next_sibling=None): | ||
192 | """Sets up the initial relations between this element and | ||
193 | other elements.""" | ||
194 | self.parent = parent | 175 | self.parent = parent |
195 | 176 | ||
196 | self.previous_element = previous_element | 177 | self.previous_element = previous_element |
@@ -198,48 +179,156 @@ class PageElement(object): | |||
198 | self.previous_element.next_element = self | 179 | self.previous_element.next_element = self |
199 | 180 | ||
200 | self.next_element = next_element | 181 | self.next_element = next_element |
201 | if self.next_element: | 182 | if self.next_element is not None: |
202 | self.next_element.previous_element = self | 183 | self.next_element.previous_element = self |
203 | 184 | ||
204 | self.next_sibling = next_sibling | 185 | self.next_sibling = next_sibling |
205 | if self.next_sibling: | 186 | if self.next_sibling is not None: |
206 | self.next_sibling.previous_sibling = self | 187 | self.next_sibling.previous_sibling = self |
207 | 188 | ||
208 | if (not previous_sibling | 189 | if (previous_sibling is None |
209 | and self.parent is not None and self.parent.contents): | 190 | and self.parent is not None and self.parent.contents): |
210 | previous_sibling = self.parent.contents[-1] | 191 | previous_sibling = self.parent.contents[-1] |
211 | 192 | ||
212 | self.previous_sibling = previous_sibling | 193 | self.previous_sibling = previous_sibling |
213 | if previous_sibling: | 194 | if previous_sibling is not None: |
214 | self.previous_sibling.next_sibling = self | 195 | self.previous_sibling.next_sibling = self |
215 | 196 | ||
197 | def format_string(self, s, formatter): | ||
198 | """Format the given string using the given formatter. | ||
199 | |||
200 | :param s: A string. | ||
201 | :param formatter: A Formatter object, or a string naming one of the standard formatters. | ||
202 | """ | ||
203 | if formatter is None: | ||
204 | return s | ||
205 | if not isinstance(formatter, Formatter): | ||
206 | formatter = self.formatter_for_name(formatter) | ||
207 | output = formatter.substitute(s) | ||
208 | return output | ||
209 | |||
210 | def formatter_for_name(self, formatter): | ||
211 | """Look up or create a Formatter for the given identifier, | ||
212 | if necessary. | ||
213 | |||
214 | :param formatter: Can be a Formatter object (used as-is), a | ||
215 | function (used as the entity substitution hook for an | ||
216 | XMLFormatter or HTMLFormatter), or a string (used to look | ||
217 | up an XMLFormatter or HTMLFormatter in the appropriate | ||
218 | registry). | ||
219 | """ | ||
220 | if isinstance(formatter, Formatter): | ||
221 | return formatter | ||
222 | if self._is_xml: | ||
223 | c = XMLFormatter | ||
224 | else: | ||
225 | c = HTMLFormatter | ||
226 | if isinstance(formatter, Callable): | ||
227 | return c(entity_substitution=formatter) | ||
228 | return c.REGISTRY[formatter] | ||
229 | |||
230 | @property | ||
231 | def _is_xml(self): | ||
232 | """Is this element part of an XML tree or an HTML tree? | ||
233 | |||
234 | This is used in formatter_for_name, when deciding whether an | ||
235 | XMLFormatter or HTMLFormatter is more appropriate. It can be | ||
236 | inefficient, but it should be called very rarely. | ||
237 | """ | ||
238 | if self.known_xml is not None: | ||
239 | # Most of the time we will have determined this when the | ||
240 | # document is parsed. | ||
241 | return self.known_xml | ||
242 | |||
243 | # Otherwise, it's likely that this element was created by | ||
244 | # direct invocation of the constructor from within the user's | ||
245 | # Python code. | ||
246 | if self.parent is None: | ||
247 | # This is the top-level object. It should have .known_xml set | ||
248 | # from tree creation. If not, take a guess--BS is usually | ||
249 | # used on HTML markup. | ||
250 | return getattr(self, 'is_xml', False) | ||
251 | return self.parent._is_xml | ||
252 | |||
216 | nextSibling = _alias("next_sibling") # BS3 | 253 | nextSibling = _alias("next_sibling") # BS3 |
217 | previousSibling = _alias("previous_sibling") # BS3 | 254 | previousSibling = _alias("previous_sibling") # BS3 |
218 | 255 | ||
219 | def replace_with(self, replace_with): | 256 | default = object() |
220 | if not self.parent: | 257 | def _all_strings(self, strip=False, types=default): |
258 | """Yield all strings of certain classes, possibly stripping them. | ||
259 | |||
260 | This is implemented differently in Tag and NavigableString. | ||
261 | """ | ||
262 | raise NotImplementedError() | ||
263 | |||
264 | @property | ||
265 | def stripped_strings(self): | ||
266 | """Yield all strings in this PageElement, stripping them first. | ||
267 | |||
268 | :yield: A sequence of stripped strings. | ||
269 | """ | ||
270 | for string in self._all_strings(True): | ||
271 | yield string | ||
272 | |||
273 | def get_text(self, separator="", strip=False, | ||
274 | types=default): | ||
275 | """Get all child strings of this PageElement, concatenated using the | ||
276 | given separator. | ||
277 | |||
278 | :param separator: Strings will be concatenated using this separator. | ||
279 | |||
280 | :param strip: If True, strings will be stripped before being | ||
281 | concatenated. | ||
282 | |||
283 | :param types: A tuple of NavigableString subclasses. Any | ||
284 | strings of a subclass not found in this list will be | ||
285 | ignored. Although there are exceptions, the default | ||
286 | behavior in most cases is to consider only NavigableString | ||
287 | and CData objects. That means no comments, processing | ||
288 | instructions, etc. | ||
289 | |||
290 | :return: A string. | ||
291 | """ | ||
292 | return separator.join([s for s in self._all_strings( | ||
293 | strip, types=types)]) | ||
294 | getText = get_text | ||
295 | text = property(get_text) | ||
296 | |||
297 | def replace_with(self, *args): | ||
298 | """Replace this PageElement with one or more PageElements, keeping the | ||
299 | rest of the tree the same. | ||
300 | |||
301 | :param args: One or more PageElements. | ||
302 | :return: `self`, no longer part of the tree. | ||
303 | """ | ||
304 | if self.parent is None: | ||
221 | raise ValueError( | 305 | raise ValueError( |
222 | "Cannot replace one element with another when the" | 306 | "Cannot replace one element with another when the " |
223 | "element to be replaced is not part of a tree.") | 307 | "element to be replaced is not part of a tree.") |
224 | if replace_with is self: | 308 | if len(args) == 1 and args[0] is self: |
225 | return | 309 | return |
226 | if replace_with is self.parent: | 310 | if any(x is self.parent for x in args): |
227 | raise ValueError("Cannot replace a Tag with its parent.") | 311 | raise ValueError("Cannot replace a Tag with its parent.") |
228 | old_parent = self.parent | 312 | old_parent = self.parent |
229 | my_index = self.parent.index(self) | 313 | my_index = self.parent.index(self) |
230 | self.extract() | 314 | self.extract(_self_index=my_index) |
231 | old_parent.insert(my_index, replace_with) | 315 | for idx, replace_with in enumerate(args, start=my_index): |
316 | old_parent.insert(idx, replace_with) | ||
232 | return self | 317 | return self |
233 | replaceWith = replace_with # BS3 | 318 | replaceWith = replace_with # BS3 |
234 | 319 | ||
235 | def unwrap(self): | 320 | def unwrap(self): |
321 | """Replace this PageElement with its contents. | ||
322 | |||
323 | :return: `self`, no longer part of the tree. | ||
324 | """ | ||
236 | my_parent = self.parent | 325 | my_parent = self.parent |
237 | if not self.parent: | 326 | if self.parent is None: |
238 | raise ValueError( | 327 | raise ValueError( |
239 | "Cannot replace an element with its contents when that" | 328 | "Cannot replace an element with its contents when that" |
240 | "element is not part of a tree.") | 329 | "element is not part of a tree.") |
241 | my_index = self.parent.index(self) | 330 | my_index = self.parent.index(self) |
242 | self.extract() | 331 | self.extract(_self_index=my_index) |
243 | for child in reversed(self.contents[:]): | 332 | for child in reversed(self.contents[:]): |
244 | my_parent.insert(my_index, child) | 333 | my_parent.insert(my_index, child) |
245 | return self | 334 | return self |
@@ -247,14 +336,29 @@ class PageElement(object): | |||
247 | replaceWithChildren = unwrap # BS3 | 336 | replaceWithChildren = unwrap # BS3 |
248 | 337 | ||
249 | def wrap(self, wrap_inside): | 338 | def wrap(self, wrap_inside): |
339 | """Wrap this PageElement inside another one. | ||
340 | |||
341 | :param wrap_inside: A PageElement. | ||
342 | :return: `wrap_inside`, occupying the position in the tree that used | ||
343 | to be occupied by `self`, and with `self` inside it. | ||
344 | """ | ||
250 | me = self.replace_with(wrap_inside) | 345 | me = self.replace_with(wrap_inside) |
251 | wrap_inside.append(me) | 346 | wrap_inside.append(me) |
252 | return wrap_inside | 347 | return wrap_inside |
253 | 348 | ||
254 | def extract(self): | 349 | def extract(self, _self_index=None): |
255 | """Destructively rips this element out of the tree.""" | 350 | """Destructively rips this element out of the tree. |
351 | |||
352 | :param _self_index: The location of this element in its parent's | ||
353 | .contents, if known. Passing this in allows for a performance | ||
354 | optimization. | ||
355 | |||
356 | :return: `self`, no longer part of the tree. | ||
357 | """ | ||
256 | if self.parent is not None: | 358 | if self.parent is not None: |
257 | del self.parent.contents[self.parent.index(self)] | 359 | if _self_index is None: |
360 | _self_index = self.parent.index(self) | ||
361 | del self.parent.contents[_self_index] | ||
258 | 362 | ||
259 | #Find the two elements that would be next to each other if | 363 | #Find the two elements that would be next to each other if |
260 | #this element (and any children) hadn't been parsed. Connect | 364 | #this element (and any children) hadn't been parsed. Connect |
@@ -281,8 +385,13 @@ class PageElement(object): | |||
281 | return self | 385 | return self |
282 | 386 | ||
283 | def _last_descendant(self, is_initialized=True, accept_self=True): | 387 | def _last_descendant(self, is_initialized=True, accept_self=True): |
284 | "Finds the last element beneath this object to be parsed." | 388 | """Finds the last element beneath this object to be parsed. |
285 | if is_initialized and self.next_sibling: | 389 | |
390 | :param is_initialized: Has `setup` been called on this PageElement | ||
391 | yet? | ||
392 | :param accept_self: Is `self` an acceptable answer to the question? | ||
393 | """ | ||
394 | if is_initialized and self.next_sibling is not None: | ||
286 | last_child = self.next_sibling.previous_element | 395 | last_child = self.next_sibling.previous_element |
287 | else: | 396 | else: |
288 | last_child = self | 397 | last_child = self |
@@ -295,6 +404,14 @@ class PageElement(object): | |||
295 | _lastRecursiveChild = _last_descendant | 404 | _lastRecursiveChild = _last_descendant |
296 | 405 | ||
297 | def insert(self, position, new_child): | 406 | def insert(self, position, new_child): |
407 | """Insert a new PageElement in the list of this PageElement's children. | ||
408 | |||
409 | This works the same way as `list.insert`. | ||
410 | |||
411 | :param position: The numeric position that should be occupied | ||
412 | in `self.contents` by the new PageElement. | ||
413 | :param new_child: A PageElement. | ||
414 | """ | ||
298 | if new_child is None: | 415 | if new_child is None: |
299 | raise ValueError("Cannot insert None into a tag.") | 416 | raise ValueError("Cannot insert None into a tag.") |
300 | if new_child is self: | 417 | if new_child is self: |
@@ -303,6 +420,14 @@ class PageElement(object): | |||
303 | and not isinstance(new_child, NavigableString)): | 420 | and not isinstance(new_child, NavigableString)): |
304 | new_child = NavigableString(new_child) | 421 | new_child = NavigableString(new_child) |
305 | 422 | ||
423 | from bs4 import BeautifulSoup | ||
424 | if isinstance(new_child, BeautifulSoup): | ||
425 | # We don't want to end up with a situation where one BeautifulSoup | ||
426 | # object contains another. Insert the children one at a time. | ||
427 | for subchild in list(new_child.contents): | ||
428 | self.insert(position, subchild) | ||
429 | position += 1 | ||
430 | return | ||
306 | position = min(position, len(self.contents)) | 431 | position = min(position, len(self.contents)) |
307 | if hasattr(new_child, 'parent') and new_child.parent is not None: | 432 | if hasattr(new_child, 'parent') and new_child.parent is not None: |
308 | # We're 'inserting' an element that's already one | 433 | # We're 'inserting' an element that's already one |
@@ -361,160 +486,326 @@ class PageElement(object): | |||
361 | self.contents.insert(position, new_child) | 486 | self.contents.insert(position, new_child) |
362 | 487 | ||
363 | def append(self, tag): | 488 | def append(self, tag): |
364 | """Appends the given tag to the contents of this tag.""" | 489 | """Appends the given PageElement to the contents of this one. |
490 | |||
491 | :param tag: A PageElement. | ||
492 | """ | ||
365 | self.insert(len(self.contents), tag) | 493 | self.insert(len(self.contents), tag) |
366 | 494 | ||
367 | def insert_before(self, predecessor): | 495 | def extend(self, tags): |
368 | """Makes the given element the immediate predecessor of this one. | 496 | """Appends the given PageElements to this one's contents. |
369 | 497 | ||
370 | The two elements will have the same parent, and the given element | 498 | :param tags: A list of PageElements. If a single Tag is |
499 | provided instead, this PageElement's contents will be extended | ||
500 | with that Tag's contents. | ||
501 | """ | ||
502 | if isinstance(tags, Tag): | ||
503 | tags = tags.contents | ||
504 | if isinstance(tags, list): | ||
505 | # Moving items around the tree may change their position in | ||
506 | # the original list. Make a list that won't change. | ||
507 | tags = list(tags) | ||
508 | for tag in tags: | ||
509 | self.append(tag) | ||
510 | |||
511 | def insert_before(self, *args): | ||
512 | """Makes the given element(s) the immediate predecessor of this one. | ||
513 | |||
514 | All the elements will have the same parent, and the given elements | ||
371 | will be immediately before this one. | 515 | will be immediately before this one. |
516 | |||
517 | :param args: One or more PageElements. | ||
372 | """ | 518 | """ |
373 | if self is predecessor: | ||
374 | raise ValueError("Can't insert an element before itself.") | ||
375 | parent = self.parent | 519 | parent = self.parent |
376 | if parent is None: | 520 | if parent is None: |
377 | raise ValueError( | 521 | raise ValueError( |
378 | "Element has no parent, so 'before' has no meaning.") | 522 | "Element has no parent, so 'before' has no meaning.") |
379 | # Extract first so that the index won't be screwed up if they | 523 | if any(x is self for x in args): |
380 | # are siblings. | 524 | raise ValueError("Can't insert an element before itself.") |
381 | if isinstance(predecessor, PageElement): | 525 | for predecessor in args: |
382 | predecessor.extract() | 526 | # Extract first so that the index won't be screwed up if they |
383 | index = parent.index(self) | 527 | # are siblings. |
384 | parent.insert(index, predecessor) | 528 | if isinstance(predecessor, PageElement): |
385 | 529 | predecessor.extract() | |
386 | def insert_after(self, successor): | 530 | index = parent.index(self) |
387 | """Makes the given element the immediate successor of this one. | 531 | parent.insert(index, predecessor) |
388 | 532 | ||
389 | The two elements will have the same parent, and the given element | 533 | def insert_after(self, *args): |
534 | """Makes the given element(s) the immediate successor of this one. | ||
535 | |||
536 | The elements will have the same parent, and the given elements | ||
390 | will be immediately after this one. | 537 | will be immediately after this one. |
538 | |||
539 | :param args: One or more PageElements. | ||
391 | """ | 540 | """ |
392 | if self is successor: | 541 | # Do all error checking before modifying the tree. |
393 | raise ValueError("Can't insert an element after itself.") | ||
394 | parent = self.parent | 542 | parent = self.parent |
395 | if parent is None: | 543 | if parent is None: |
396 | raise ValueError( | 544 | raise ValueError( |
397 | "Element has no parent, so 'after' has no meaning.") | 545 | "Element has no parent, so 'after' has no meaning.") |
398 | # Extract first so that the index won't be screwed up if they | 546 | if any(x is self for x in args): |
399 | # are siblings. | 547 | raise ValueError("Can't insert an element after itself.") |
400 | if isinstance(successor, PageElement): | 548 | |
401 | successor.extract() | 549 | offset = 0 |
402 | index = parent.index(self) | 550 | for successor in args: |
403 | parent.insert(index+1, successor) | 551 | # Extract first so that the index won't be screwed up if they |
404 | 552 | # are siblings. | |
405 | def find_next(self, name=None, attrs={}, text=None, **kwargs): | 553 | if isinstance(successor, PageElement): |
406 | """Returns the first item that matches the given criteria and | 554 | successor.extract() |
407 | appears after this Tag in the document.""" | 555 | index = parent.index(self) |
408 | return self._find_one(self.find_all_next, name, attrs, text, **kwargs) | 556 | parent.insert(index+1+offset, successor) |
557 | offset += 1 | ||
558 | |||
559 | def find_next(self, name=None, attrs={}, string=None, **kwargs): | ||
560 | """Find the first PageElement that matches the given criteria and | ||
561 | appears later in the document than this PageElement. | ||
562 | |||
563 | All find_* methods take a common set of arguments. See the online | ||
564 | documentation for detailed explanations. | ||
565 | |||
566 | :param name: A filter on tag name. | ||
567 | :param attrs: A dictionary of filters on attribute values. | ||
568 | :param string: A filter for a NavigableString with specific text. | ||
569 | :kwargs: A dictionary of filters on attribute values. | ||
570 | :return: A PageElement. | ||
571 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
572 | """ | ||
573 | return self._find_one(self.find_all_next, name, attrs, string, **kwargs) | ||
409 | findNext = find_next # BS3 | 574 | findNext = find_next # BS3 |
410 | 575 | ||
411 | def find_all_next(self, name=None, attrs={}, text=None, limit=None, | 576 | def find_all_next(self, name=None, attrs={}, string=None, limit=None, |
412 | **kwargs): | 577 | **kwargs): |
413 | """Returns all items that match the given criteria and appear | 578 | """Find all PageElements that match the given criteria and appear |
414 | after this Tag in the document.""" | 579 | later in the document than this PageElement. |
415 | return self._find_all(name, attrs, text, limit, self.next_elements, | 580 | |
416 | **kwargs) | 581 | All find_* methods take a common set of arguments. See the online |
582 | documentation for detailed explanations. | ||
583 | |||
584 | :param name: A filter on tag name. | ||
585 | :param attrs: A dictionary of filters on attribute values. | ||
586 | :param string: A filter for a NavigableString with specific text. | ||
587 | :param limit: Stop looking after finding this many results. | ||
588 | :kwargs: A dictionary of filters on attribute values. | ||
589 | :return: A ResultSet containing PageElements. | ||
590 | """ | ||
591 | _stacklevel = kwargs.pop('_stacklevel', 2) | ||
592 | return self._find_all(name, attrs, string, limit, self.next_elements, | ||
593 | _stacklevel=_stacklevel+1, **kwargs) | ||
417 | findAllNext = find_all_next # BS3 | 594 | findAllNext = find_all_next # BS3 |
418 | 595 | ||
419 | def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): | 596 | def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): |
420 | """Returns the closest sibling to this Tag that matches the | 597 | """Find the closest sibling to this PageElement that matches the |
421 | given criteria and appears after this Tag in the document.""" | 598 | given criteria and appears later in the document. |
422 | return self._find_one(self.find_next_siblings, name, attrs, text, | 599 | |
600 | All find_* methods take a common set of arguments. See the | ||
601 | online documentation for detailed explanations. | ||
602 | |||
603 | :param name: A filter on tag name. | ||
604 | :param attrs: A dictionary of filters on attribute values. | ||
605 | :param string: A filter for a NavigableString with specific text. | ||
606 | :kwargs: A dictionary of filters on attribute values. | ||
607 | :return: A PageElement. | ||
608 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
609 | """ | ||
610 | return self._find_one(self.find_next_siblings, name, attrs, string, | ||
423 | **kwargs) | 611 | **kwargs) |
424 | findNextSibling = find_next_sibling # BS3 | 612 | findNextSibling = find_next_sibling # BS3 |
425 | 613 | ||
426 | def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, | 614 | def find_next_siblings(self, name=None, attrs={}, string=None, limit=None, |
427 | **kwargs): | 615 | **kwargs): |
428 | """Returns the siblings of this Tag that match the given | 616 | """Find all siblings of this PageElement that match the given criteria |
429 | criteria and appear after this Tag in the document.""" | 617 | and appear later in the document. |
430 | return self._find_all(name, attrs, text, limit, | 618 | |
431 | self.next_siblings, **kwargs) | 619 | All find_* methods take a common set of arguments. See the online |
620 | documentation for detailed explanations. | ||
621 | |||
622 | :param name: A filter on tag name. | ||
623 | :param attrs: A dictionary of filters on attribute values. | ||
624 | :param string: A filter for a NavigableString with specific text. | ||
625 | :param limit: Stop looking after finding this many results. | ||
626 | :kwargs: A dictionary of filters on attribute values. | ||
627 | :return: A ResultSet of PageElements. | ||
628 | :rtype: bs4.element.ResultSet | ||
629 | """ | ||
630 | _stacklevel = kwargs.pop('_stacklevel', 2) | ||
631 | return self._find_all( | ||
632 | name, attrs, string, limit, | ||
633 | self.next_siblings, _stacklevel=_stacklevel+1, **kwargs | ||
634 | ) | ||
432 | findNextSiblings = find_next_siblings # BS3 | 635 | findNextSiblings = find_next_siblings # BS3 |
433 | fetchNextSiblings = find_next_siblings # BS2 | 636 | fetchNextSiblings = find_next_siblings # BS2 |
434 | 637 | ||
435 | def find_previous(self, name=None, attrs={}, text=None, **kwargs): | 638 | def find_previous(self, name=None, attrs={}, string=None, **kwargs): |
436 | """Returns the first item that matches the given criteria and | 639 | """Look backwards in the document from this PageElement and find the |
437 | appears before this Tag in the document.""" | 640 | first PageElement that matches the given criteria. |
641 | |||
642 | All find_* methods take a common set of arguments. See the online | ||
643 | documentation for detailed explanations. | ||
644 | |||
645 | :param name: A filter on tag name. | ||
646 | :param attrs: A dictionary of filters on attribute values. | ||
647 | :param string: A filter for a NavigableString with specific text. | ||
648 | :kwargs: A dictionary of filters on attribute values. | ||
649 | :return: A PageElement. | ||
650 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
651 | """ | ||
438 | return self._find_one( | 652 | return self._find_one( |
439 | self.find_all_previous, name, attrs, text, **kwargs) | 653 | self.find_all_previous, name, attrs, string, **kwargs) |
440 | findPrevious = find_previous # BS3 | 654 | findPrevious = find_previous # BS3 |
441 | 655 | ||
442 | def find_all_previous(self, name=None, attrs={}, text=None, limit=None, | 656 | def find_all_previous(self, name=None, attrs={}, string=None, limit=None, |
443 | **kwargs): | 657 | **kwargs): |
444 | """Returns all items that match the given criteria and appear | 658 | """Look backwards in the document from this PageElement and find all |
445 | before this Tag in the document.""" | 659 | PageElements that match the given criteria. |
446 | return self._find_all(name, attrs, text, limit, self.previous_elements, | 660 | |
447 | **kwargs) | 661 | All find_* methods take a common set of arguments. See the online |
662 | documentation for detailed explanations. | ||
663 | |||
664 | :param name: A filter on tag name. | ||
665 | :param attrs: A dictionary of filters on attribute values. | ||
666 | :param string: A filter for a NavigableString with specific text. | ||
667 | :param limit: Stop looking after finding this many results. | ||
668 | :kwargs: A dictionary of filters on attribute values. | ||
669 | :return: A ResultSet of PageElements. | ||
670 | :rtype: bs4.element.ResultSet | ||
671 | """ | ||
672 | _stacklevel = kwargs.pop('_stacklevel', 2) | ||
673 | return self._find_all( | ||
674 | name, attrs, string, limit, self.previous_elements, | ||
675 | _stacklevel=_stacklevel+1, **kwargs | ||
676 | ) | ||
448 | findAllPrevious = find_all_previous # BS3 | 677 | findAllPrevious = find_all_previous # BS3 |
449 | fetchPrevious = find_all_previous # BS2 | 678 | fetchPrevious = find_all_previous # BS2 |
450 | 679 | ||
451 | def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): | 680 | def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs): |
452 | """Returns the closest sibling to this Tag that matches the | 681 | """Returns the closest sibling to this PageElement that matches the |
453 | given criteria and appears before this Tag in the document.""" | 682 | given criteria and appears earlier in the document. |
454 | return self._find_one(self.find_previous_siblings, name, attrs, text, | 683 | |
684 | All find_* methods take a common set of arguments. See the online | ||
685 | documentation for detailed explanations. | ||
686 | |||
687 | :param name: A filter on tag name. | ||
688 | :param attrs: A dictionary of filters on attribute values. | ||
689 | :param string: A filter for a NavigableString with specific text. | ||
690 | :kwargs: A dictionary of filters on attribute values. | ||
691 | :return: A PageElement. | ||
692 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
693 | """ | ||
694 | return self._find_one(self.find_previous_siblings, name, attrs, string, | ||
455 | **kwargs) | 695 | **kwargs) |
456 | findPreviousSibling = find_previous_sibling # BS3 | 696 | findPreviousSibling = find_previous_sibling # BS3 |
457 | 697 | ||
458 | def find_previous_siblings(self, name=None, attrs={}, text=None, | 698 | def find_previous_siblings(self, name=None, attrs={}, string=None, |
459 | limit=None, **kwargs): | 699 | limit=None, **kwargs): |
460 | """Returns the siblings of this Tag that match the given | 700 | """Returns all siblings to this PageElement that match the |
461 | criteria and appear before this Tag in the document.""" | 701 | given criteria and appear earlier in the document. |
462 | return self._find_all(name, attrs, text, limit, | 702 | |
463 | self.previous_siblings, **kwargs) | 703 | All find_* methods take a common set of arguments. See the online |
704 | documentation for detailed explanations. | ||
705 | |||
706 | :param name: A filter on tag name. | ||
707 | :param attrs: A dictionary of filters on attribute values. | ||
708 | :param string: A filter for a NavigableString with specific text. | ||
709 | :param limit: Stop looking after finding this many results. | ||
710 | :kwargs: A dictionary of filters on attribute values. | ||
711 | :return: A ResultSet of PageElements. | ||
712 | :rtype: bs4.element.ResultSet | ||
713 | """ | ||
714 | _stacklevel = kwargs.pop('_stacklevel', 2) | ||
715 | return self._find_all( | ||
716 | name, attrs, string, limit, | ||
717 | self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs | ||
718 | ) | ||
464 | findPreviousSiblings = find_previous_siblings # BS3 | 719 | findPreviousSiblings = find_previous_siblings # BS3 |
465 | fetchPreviousSiblings = find_previous_siblings # BS2 | 720 | fetchPreviousSiblings = find_previous_siblings # BS2 |
466 | 721 | ||
467 | def find_parent(self, name=None, attrs={}, **kwargs): | 722 | def find_parent(self, name=None, attrs={}, **kwargs): |
468 | """Returns the closest parent of this Tag that matches the given | 723 | """Find the closest parent of this PageElement that matches the given |
469 | criteria.""" | 724 | criteria. |
725 | |||
726 | All find_* methods take a common set of arguments. See the online | ||
727 | documentation for detailed explanations. | ||
728 | |||
729 | :param name: A filter on tag name. | ||
730 | :param attrs: A dictionary of filters on attribute values. | ||
731 | :kwargs: A dictionary of filters on attribute values. | ||
732 | |||
733 | :return: A PageElement. | ||
734 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
735 | """ | ||
470 | # NOTE: We can't use _find_one because findParents takes a different | 736 | # NOTE: We can't use _find_one because findParents takes a different |
471 | # set of arguments. | 737 | # set of arguments. |
472 | r = None | 738 | r = None |
473 | l = self.find_parents(name, attrs, 1, **kwargs) | 739 | l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs) |
474 | if l: | 740 | if l: |
475 | r = l[0] | 741 | r = l[0] |
476 | return r | 742 | return r |
477 | findParent = find_parent # BS3 | 743 | findParent = find_parent # BS3 |
478 | 744 | ||
479 | def find_parents(self, name=None, attrs={}, limit=None, **kwargs): | 745 | def find_parents(self, name=None, attrs={}, limit=None, **kwargs): |
480 | """Returns the parents of this Tag that match the given | 746 | """Find all parents of this PageElement that match the given criteria. |
481 | criteria.""" | 747 | |
748 | All find_* methods take a common set of arguments. See the online | ||
749 | documentation for detailed explanations. | ||
482 | 750 | ||
751 | :param name: A filter on tag name. | ||
752 | :param attrs: A dictionary of filters on attribute values. | ||
753 | :param limit: Stop looking after finding this many results. | ||
754 | :kwargs: A dictionary of filters on attribute values. | ||
755 | |||
756 | :return: A ResultSet of PageElements. | ||
757 | :rtype: bs4.element.ResultSet | ||
758 | """ | ||
759 | _stacklevel = kwargs.pop('_stacklevel', 2) | ||
483 | return self._find_all(name, attrs, None, limit, self.parents, | 760 | return self._find_all(name, attrs, None, limit, self.parents, |
484 | **kwargs) | 761 | _stacklevel=_stacklevel+1, **kwargs) |
485 | findParents = find_parents # BS3 | 762 | findParents = find_parents # BS3 |
486 | fetchParents = find_parents # BS2 | 763 | fetchParents = find_parents # BS2 |
487 | 764 | ||
488 | @property | 765 | @property |
489 | def next(self): | 766 | def next(self): |
767 | """The PageElement, if any, that was parsed just after this one. | ||
768 | |||
769 | :return: A PageElement. | ||
770 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
771 | """ | ||
490 | return self.next_element | 772 | return self.next_element |
491 | 773 | ||
492 | @property | 774 | @property |
493 | def previous(self): | 775 | def previous(self): |
776 | """The PageElement, if any, that was parsed just before this one. | ||
777 | |||
778 | :return: A PageElement. | ||
779 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
780 | """ | ||
494 | return self.previous_element | 781 | return self.previous_element |
495 | 782 | ||
496 | #These methods do the real heavy lifting. | 783 | #These methods do the real heavy lifting. |
497 | 784 | ||
498 | def _find_one(self, method, name, attrs, text, **kwargs): | 785 | def _find_one(self, method, name, attrs, string, **kwargs): |
499 | r = None | 786 | r = None |
500 | l = method(name, attrs, text, 1, **kwargs) | 787 | l = method(name, attrs, string, 1, _stacklevel=4, **kwargs) |
501 | if l: | 788 | if l: |
502 | r = l[0] | 789 | r = l[0] |
503 | return r | 790 | return r |
504 | 791 | ||
505 | def _find_all(self, name, attrs, text, limit, generator, **kwargs): | 792 | def _find_all(self, name, attrs, string, limit, generator, **kwargs): |
506 | "Iterates over a generator looking for things that match." | 793 | "Iterates over a generator looking for things that match." |
794 | _stacklevel = kwargs.pop('_stacklevel', 3) | ||
507 | 795 | ||
508 | if text is None and 'string' in kwargs: | 796 | if string is None and 'text' in kwargs: |
509 | text = kwargs['string'] | 797 | string = kwargs.pop('text') |
510 | del kwargs['string'] | 798 | warnings.warn( |
799 | "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", | ||
800 | DeprecationWarning, stacklevel=_stacklevel | ||
801 | ) | ||
511 | 802 | ||
512 | if isinstance(name, SoupStrainer): | 803 | if isinstance(name, SoupStrainer): |
513 | strainer = name | 804 | strainer = name |
514 | else: | 805 | else: |
515 | strainer = SoupStrainer(name, attrs, text, **kwargs) | 806 | strainer = SoupStrainer(name, attrs, string, **kwargs) |
516 | 807 | ||
517 | if text is None and not limit and not attrs and not kwargs: | 808 | if string is None and not limit and not attrs and not kwargs: |
518 | if name is True or name is None: | 809 | if name is True or name is None: |
519 | # Optimization to find all tags. | 810 | # Optimization to find all tags. |
520 | result = (element for element in generator | 811 | result = (element for element in generator |
@@ -522,9 +813,23 @@ class PageElement(object): | |||
522 | return ResultSet(strainer, result) | 813 | return ResultSet(strainer, result) |
523 | elif isinstance(name, str): | 814 | elif isinstance(name, str): |
524 | # Optimization to find all tags with a given name. | 815 | # Optimization to find all tags with a given name. |
816 | if name.count(':') == 1: | ||
817 | # This is a name with a prefix. If this is a namespace-aware document, | ||
818 | # we need to match the local name against tag.name. If not, | ||
819 | # we need to match the fully-qualified name against tag.name. | ||
820 | prefix, local_name = name.split(':', 1) | ||
821 | else: | ||
822 | prefix = None | ||
823 | local_name = name | ||
525 | result = (element for element in generator | 824 | result = (element for element in generator |
526 | if isinstance(element, Tag) | 825 | if isinstance(element, Tag) |
527 | and element.name == name) | 826 | and ( |
827 | element.name == name | ||
828 | ) or ( | ||
829 | element.name == local_name | ||
830 | and (prefix is None or element.prefix == prefix) | ||
831 | ) | ||
832 | ) | ||
528 | return ResultSet(strainer, result) | 833 | return ResultSet(strainer, result) |
529 | results = ResultSet(strainer) | 834 | results = ResultSet(strainer) |
530 | while True: | 835 | while True: |
@@ -544,6 +849,10 @@ class PageElement(object): | |||
544 | #NavigableStrings and Tags. | 849 | #NavigableStrings and Tags. |
545 | @property | 850 | @property |
546 | def next_elements(self): | 851 | def next_elements(self): |
852 | """All PageElements that were parsed after this one. | ||
853 | |||
854 | :yield: A sequence of PageElements. | ||
855 | """ | ||
547 | i = self.next_element | 856 | i = self.next_element |
548 | while i is not None: | 857 | while i is not None: |
549 | yield i | 858 | yield i |
@@ -551,6 +860,11 @@ class PageElement(object): | |||
551 | 860 | ||
552 | @property | 861 | @property |
553 | def next_siblings(self): | 862 | def next_siblings(self): |
863 | """All PageElements that are siblings of this one but were parsed | ||
864 | later. | ||
865 | |||
866 | :yield: A sequence of PageElements. | ||
867 | """ | ||
554 | i = self.next_sibling | 868 | i = self.next_sibling |
555 | while i is not None: | 869 | while i is not None: |
556 | yield i | 870 | yield i |
@@ -558,6 +872,10 @@ class PageElement(object): | |||
558 | 872 | ||
559 | @property | 873 | @property |
560 | def previous_elements(self): | 874 | def previous_elements(self): |
875 | """All PageElements that were parsed before this one. | ||
876 | |||
877 | :yield: A sequence of PageElements. | ||
878 | """ | ||
561 | i = self.previous_element | 879 | i = self.previous_element |
562 | while i is not None: | 880 | while i is not None: |
563 | yield i | 881 | yield i |
@@ -565,6 +883,11 @@ class PageElement(object): | |||
565 | 883 | ||
566 | @property | 884 | @property |
567 | def previous_siblings(self): | 885 | def previous_siblings(self): |
886 | """All PageElements that are siblings of this one but were parsed | ||
887 | earlier. | ||
888 | |||
889 | :yield: A sequence of PageElements. | ||
890 | """ | ||
568 | i = self.previous_sibling | 891 | i = self.previous_sibling |
569 | while i is not None: | 892 | while i is not None: |
570 | yield i | 893 | yield i |
@@ -572,87 +895,23 @@ class PageElement(object): | |||
572 | 895 | ||
573 | @property | 896 | @property |
574 | def parents(self): | 897 | def parents(self): |
898 | """All PageElements that are parents of this PageElement. | ||
899 | |||
900 | :yield: A sequence of PageElements. | ||
901 | """ | ||
575 | i = self.parent | 902 | i = self.parent |
576 | while i is not None: | 903 | while i is not None: |
577 | yield i | 904 | yield i |
578 | i = i.parent | 905 | i = i.parent |
579 | 906 | ||
580 | # Methods for supporting CSS selectors. | 907 | @property |
581 | 908 | def decomposed(self): | |
582 | tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') | 909 | """Check whether a PageElement has been decomposed. |
583 | |||
584 | # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ | ||
585 | # \---------------------------/ \---/\-------------/ \-------/ | ||
586 | # | | | | | ||
587 | # | | | The value | ||
588 | # | | ~,|,^,$,* or = | ||
589 | # | Attribute | ||
590 | # Tag | ||
591 | attribselect_re = re.compile( | ||
592 | r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + | ||
593 | r'=?"?(?P<value>[^\]"]*)"?\]$' | ||
594 | ) | ||
595 | |||
596 | def _attr_value_as_string(self, value, default=None): | ||
597 | """Force an attribute value into a string representation. | ||
598 | 910 | ||
599 | A multi-valued attribute will be converted into a | 911 | :rtype: bool |
600 | space-separated stirng. | ||
601 | """ | 912 | """ |
602 | value = self.get(value, default) | 913 | return getattr(self, '_decomposed', False) or False |
603 | if isinstance(value, list) or isinstance(value, tuple): | 914 | |
604 | value =" ".join(value) | ||
605 | return value | ||
606 | |||
607 | def _tag_name_matches_and(self, function, tag_name): | ||
608 | if not tag_name: | ||
609 | return function | ||
610 | else: | ||
611 | def _match(tag): | ||
612 | return tag.name == tag_name and function(tag) | ||
613 | return _match | ||
614 | |||
615 | def _attribute_checker(self, operator, attribute, value=''): | ||
616 | """Create a function that performs a CSS selector operation. | ||
617 | |||
618 | Takes an operator, attribute and optional value. Returns a | ||
619 | function that will return True for elements that match that | ||
620 | combination. | ||
621 | """ | ||
622 | if operator == '=': | ||
623 | # string representation of `attribute` is equal to `value` | ||
624 | return lambda el: el._attr_value_as_string(attribute) == value | ||
625 | elif operator == '~': | ||
626 | # space-separated list representation of `attribute` | ||
627 | # contains `value` | ||
628 | def _includes_value(element): | ||
629 | attribute_value = element.get(attribute, []) | ||
630 | if not isinstance(attribute_value, list): | ||
631 | attribute_value = attribute_value.split() | ||
632 | return value in attribute_value | ||
633 | return _includes_value | ||
634 | elif operator == '^': | ||
635 | # string representation of `attribute` starts with `value` | ||
636 | return lambda el: el._attr_value_as_string( | ||
637 | attribute, '').startswith(value) | ||
638 | elif operator == '$': | ||
639 | # string represenation of `attribute` ends with `value` | ||
640 | return lambda el: el._attr_value_as_string( | ||
641 | attribute, '').endswith(value) | ||
642 | elif operator == '*': | ||
643 | # string representation of `attribute` contains `value` | ||
644 | return lambda el: value in el._attr_value_as_string(attribute, '') | ||
645 | elif operator == '|': | ||
646 | # string representation of `attribute` is either exactly | ||
647 | # `value` or starts with `value` and then a dash. | ||
648 | def _is_or_starts_with_dash(element): | ||
649 | attribute_value = element._attr_value_as_string(attribute, '') | ||
650 | return (attribute_value == value or attribute_value.startswith( | ||
651 | value + '-')) | ||
652 | return _is_or_starts_with_dash | ||
653 | else: | ||
654 | return lambda el: el.has_attr(attribute) | ||
655 | |||
656 | # Old non-property versions of the generators, for backwards | 915 | # Old non-property versions of the generators, for backwards |
657 | # compatibility with BS3. | 916 | # compatibility with BS3. |
658 | def nextGenerator(self): | 917 | def nextGenerator(self): |
@@ -672,6 +931,11 @@ class PageElement(object): | |||
672 | 931 | ||
673 | 932 | ||
674 | class NavigableString(str, PageElement): | 933 | class NavigableString(str, PageElement): |
934 | """A Python Unicode string that is part of a parse tree. | ||
935 | |||
936 | When Beautiful Soup parses the markup <b>penguin</b>, it will | ||
937 | create a NavigableString for the string "penguin". | ||
938 | """ | ||
675 | 939 | ||
676 | PREFIX = '' | 940 | PREFIX = '' |
677 | SUFFIX = '' | 941 | SUFFIX = '' |
@@ -691,12 +955,22 @@ class NavigableString(str, PageElement): | |||
691 | u.setup() | 955 | u.setup() |
692 | return u | 956 | return u |
693 | 957 | ||
694 | def __copy__(self): | 958 | def __deepcopy__(self, memo, recursive=False): |
695 | """A copy of a NavigableString has the same contents and class | 959 | """A copy of a NavigableString has the same contents and class |
696 | as the original, but it is not connected to the parse tree. | 960 | as the original, but it is not connected to the parse tree. |
961 | |||
962 | :param recursive: This parameter is ignored; it's only defined | ||
963 | so that NavigableString.__deepcopy__ implements the same | ||
964 | signature as Tag.__deepcopy__. | ||
697 | """ | 965 | """ |
698 | return type(self)(self) | 966 | return type(self)(self) |
699 | 967 | ||
968 | def __copy__(self): | ||
969 | """A copy of a NavigableString can only be a deep copy, because | ||
970 | only one PageElement can occupy a given place in a parse tree. | ||
971 | """ | ||
972 | return self.__deepcopy__({}) | ||
973 | |||
700 | def __getnewargs__(self): | 974 | def __getnewargs__(self): |
701 | return (str(self),) | 975 | return (str(self),) |
702 | 976 | ||
@@ -712,55 +986,146 @@ class NavigableString(str, PageElement): | |||
712 | self.__class__.__name__, attr)) | 986 | self.__class__.__name__, attr)) |
713 | 987 | ||
714 | def output_ready(self, formatter="minimal"): | 988 | def output_ready(self, formatter="minimal"): |
989 | """Run the string through the provided formatter. | ||
990 | |||
991 | :param formatter: A Formatter object, or a string naming one of the standard formatters. | ||
992 | """ | ||
715 | output = self.format_string(self, formatter) | 993 | output = self.format_string(self, formatter) |
716 | return self.PREFIX + output + self.SUFFIX | 994 | return self.PREFIX + output + self.SUFFIX |
717 | 995 | ||
718 | @property | 996 | @property |
719 | def name(self): | 997 | def name(self): |
998 | """Since a NavigableString is not a Tag, it has no .name. | ||
999 | |||
1000 | This property is implemented so that code like this doesn't crash | ||
1001 | when run on a mixture of Tag and NavigableString objects: | ||
1002 | [x.name for x in tag.children] | ||
1003 | """ | ||
720 | return None | 1004 | return None |
721 | 1005 | ||
722 | @name.setter | 1006 | @name.setter |
723 | def name(self, name): | 1007 | def name(self, name): |
1008 | """Prevent NavigableString.name from ever being set.""" | ||
724 | raise AttributeError("A NavigableString cannot be given a name.") | 1009 | raise AttributeError("A NavigableString cannot be given a name.") |
725 | 1010 | ||
1011 | def _all_strings(self, strip=False, types=PageElement.default): | ||
1012 | """Yield all strings of certain classes, possibly stripping them. | ||
1013 | |||
1014 | This makes it easy for NavigableString to implement methods | ||
1015 | like get_text() as conveniences, creating a consistent | ||
1016 | text-extraction API across all PageElements. | ||
1017 | |||
1018 | :param strip: If True, all strings will be stripped before being | ||
1019 | yielded. | ||
1020 | |||
1021 | :param types: A tuple of NavigableString subclasses. If this | ||
1022 | NavigableString isn't one of those subclasses, the | ||
1023 | sequence will be empty. By default, the subclasses | ||
1024 | considered are NavigableString and CData objects. That | ||
1025 | means no comments, processing instructions, etc. | ||
1026 | |||
1027 | :yield: A sequence that either contains this string, or is empty. | ||
1028 | |||
1029 | """ | ||
1030 | if types is self.default: | ||
1031 | # This is kept in Tag because it's full of subclasses of | ||
1032 | # this class, which aren't defined until later in the file. | ||
1033 | types = Tag.DEFAULT_INTERESTING_STRING_TYPES | ||
1034 | |||
1035 | # Do nothing if the caller is looking for specific types of | ||
1036 | # string, and we're of a different type. | ||
1037 | # | ||
1038 | # We check specific types instead of using isinstance(self, | ||
1039 | # types) because all of these classes subclass | ||
1040 | # NavigableString. Anyone who's using this feature probably | ||
1041 | # wants generic NavigableStrings but not other stuff. | ||
1042 | my_type = type(self) | ||
1043 | if types is not None: | ||
1044 | if isinstance(types, type): | ||
1045 | # Looking for a single type. | ||
1046 | if my_type is not types: | ||
1047 | return | ||
1048 | elif my_type not in types: | ||
1049 | # Looking for one of a list of types. | ||
1050 | return | ||
1051 | |||
1052 | value = self | ||
1053 | if strip: | ||
1054 | value = value.strip() | ||
1055 | if len(value) > 0: | ||
1056 | yield value | ||
1057 | strings = property(_all_strings) | ||
1058 | |||
726 | class PreformattedString(NavigableString): | 1059 | class PreformattedString(NavigableString): |
727 | """A NavigableString not subject to the normal formatting rules. | 1060 | """A NavigableString not subject to the normal formatting rules. |
728 | 1061 | ||
729 | The string will be passed into the formatter (to trigger side effects), | 1062 | This is an abstract class used for special kinds of strings such |
730 | but the return value will be ignored. | 1063 | as comments (the Comment class) and CDATA blocks (the CData |
1064 | class). | ||
731 | """ | 1065 | """ |
732 | 1066 | ||
733 | def output_ready(self, formatter="minimal"): | 1067 | PREFIX = '' |
734 | """CData strings are passed into the formatter. | 1068 | SUFFIX = '' |
735 | But the return value is ignored.""" | 1069 | |
736 | self.format_string(self, formatter) | 1070 | def output_ready(self, formatter=None): |
1071 | """Make this string ready for output by adding any subclass-specific | ||
1072 | prefix or suffix. | ||
1073 | |||
1074 | :param formatter: A Formatter object, or a string naming one | ||
1075 | of the standard formatters. The string will be passed into the | ||
1076 | Formatter, but only to trigger any side effects: the return | ||
1077 | value is ignored. | ||
1078 | |||
1079 | :return: The string, with any subclass-specific prefix and | ||
1080 | suffix added on. | ||
1081 | """ | ||
1082 | if formatter is not None: | ||
1083 | ignore = self.format_string(self, formatter) | ||
737 | return self.PREFIX + self + self.SUFFIX | 1084 | return self.PREFIX + self + self.SUFFIX |
738 | 1085 | ||
739 | class CData(PreformattedString): | 1086 | class CData(PreformattedString): |
740 | 1087 | """A CDATA block.""" | |
741 | PREFIX = '<![CDATA[' | 1088 | PREFIX = '<![CDATA[' |
742 | SUFFIX = ']]>' | 1089 | SUFFIX = ']]>' |
743 | 1090 | ||
744 | class ProcessingInstruction(PreformattedString): | 1091 | class ProcessingInstruction(PreformattedString): |
1092 | """An SGML processing instruction.""" | ||
745 | 1093 | ||
746 | PREFIX = '<?' | 1094 | PREFIX = '<?' |
747 | SUFFIX = '>' | 1095 | SUFFIX = '>' |
748 | 1096 | ||
749 | class Comment(PreformattedString): | 1097 | class XMLProcessingInstruction(ProcessingInstruction): |
1098 | """An XML processing instruction.""" | ||
1099 | PREFIX = '<?' | ||
1100 | SUFFIX = '?>' | ||
750 | 1101 | ||
1102 | class Comment(PreformattedString): | ||
1103 | """An HTML or XML comment.""" | ||
751 | PREFIX = '<!--' | 1104 | PREFIX = '<!--' |
752 | SUFFIX = '-->' | 1105 | SUFFIX = '-->' |
753 | 1106 | ||
754 | 1107 | ||
755 | class Declaration(PreformattedString): | 1108 | class Declaration(PreformattedString): |
1109 | """An XML declaration.""" | ||
756 | PREFIX = '<?' | 1110 | PREFIX = '<?' |
757 | SUFFIX = '?>' | 1111 | SUFFIX = '?>' |
758 | 1112 | ||
759 | 1113 | ||
760 | class Doctype(PreformattedString): | 1114 | class Doctype(PreformattedString): |
761 | 1115 | """A document type declaration.""" | |
762 | @classmethod | 1116 | @classmethod |
763 | def for_name_and_ids(cls, name, pub_id, system_id): | 1117 | def for_name_and_ids(cls, name, pub_id, system_id): |
1118 | """Generate an appropriate document type declaration for a given | ||
1119 | public ID and system ID. | ||
1120 | |||
1121 | :param name: The name of the document's root element, e.g. 'html'. | ||
1122 | :param pub_id: The Formal Public Identifier for this document type, | ||
1123 | e.g. '-//W3C//DTD XHTML 1.1//EN' | ||
1124 | :param system_id: The system identifier for this document type, | ||
1125 | e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' | ||
1126 | |||
1127 | :return: A Doctype. | ||
1128 | """ | ||
764 | value = name or '' | 1129 | value = name or '' |
765 | if pub_id is not None: | 1130 | if pub_id is not None: |
766 | value += ' PUBLIC "%s"' % pub_id | 1131 | value += ' PUBLIC "%s"' % pub_id |
@@ -775,14 +1140,105 @@ class Doctype(PreformattedString): | |||
775 | SUFFIX = '>\n' | 1140 | SUFFIX = '>\n' |
776 | 1141 | ||
777 | 1142 | ||
1143 | class Stylesheet(NavigableString): | ||
1144 | """A NavigableString representing a stylesheet (probably | ||
1145 | CSS). | ||
1146 | |||
1147 | Used to distinguish embedded stylesheets from textual content. | ||
1148 | """ | ||
1149 | pass | ||
1150 | |||
1151 | |||
1152 | class Script(NavigableString): | ||
1153 | """A NavigableString representing an executable script (probably | ||
1154 | Javascript). | ||
1155 | |||
1156 | Used to distinguish executable code from textual content. | ||
1157 | """ | ||
1158 | pass | ||
1159 | |||
1160 | |||
1161 | class TemplateString(NavigableString): | ||
1162 | """A NavigableString representing a string found inside an HTML | ||
1163 | template embedded in a larger document. | ||
1164 | |||
1165 | Used to distinguish such strings from the main body of the document. | ||
1166 | """ | ||
1167 | pass | ||
1168 | |||
1169 | |||
1170 | class RubyTextString(NavigableString): | ||
1171 | """A NavigableString representing the contents of the <rt> HTML | ||
1172 | element. | ||
1173 | |||
1174 | https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element | ||
1175 | |||
1176 | Can be used to distinguish such strings from the strings they're | ||
1177 | annotating. | ||
1178 | """ | ||
1179 | pass | ||
1180 | |||
1181 | |||
1182 | class RubyParenthesisString(NavigableString): | ||
1183 | """A NavigableString representing the contents of the <rp> HTML | ||
1184 | element. | ||
1185 | |||
1186 | https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element | ||
1187 | """ | ||
1188 | pass | ||
1189 | |||
1190 | |||
778 | class Tag(PageElement): | 1191 | class Tag(PageElement): |
1192 | """Represents an HTML or XML tag that is part of a parse tree, along | ||
1193 | with its attributes and contents. | ||
779 | 1194 | ||
780 | """Represents a found HTML tag with its attributes and contents.""" | 1195 | When Beautiful Soup parses the markup <b>penguin</b>, it will |
1196 | create a Tag object representing the <b> tag. | ||
1197 | """ | ||
781 | 1198 | ||
782 | def __init__(self, parser=None, builder=None, name=None, namespace=None, | 1199 | def __init__(self, parser=None, builder=None, name=None, namespace=None, |
783 | prefix=None, attrs=None, parent=None, previous=None): | 1200 | prefix=None, attrs=None, parent=None, previous=None, |
784 | "Basic constructor." | 1201 | is_xml=None, sourceline=None, sourcepos=None, |
785 | 1202 | can_be_empty_element=None, cdata_list_attributes=None, | |
1203 | preserve_whitespace_tags=None, | ||
1204 | interesting_string_types=None, | ||
1205 | namespaces=None | ||
1206 | ): | ||
1207 | """Basic constructor. | ||
1208 | |||
1209 | :param parser: A BeautifulSoup object. | ||
1210 | :param builder: A TreeBuilder. | ||
1211 | :param name: The name of the tag. | ||
1212 | :param namespace: The URI of this Tag's XML namespace, if any. | ||
1213 | :param prefix: The prefix for this Tag's XML namespace, if any. | ||
1214 | :param attrs: A dictionary of this Tag's attribute values. | ||
1215 | :param parent: The PageElement to use as this Tag's parent. | ||
1216 | :param previous: The PageElement that was parsed immediately before | ||
1217 | this tag. | ||
1218 | :param is_xml: If True, this is an XML tag. Otherwise, this is an | ||
1219 | HTML tag. | ||
1220 | :param sourceline: The line number where this tag was found in its | ||
1221 | source document. | ||
1222 | :param sourcepos: The character position within `sourceline` where this | ||
1223 | tag was found. | ||
1224 | :param can_be_empty_element: If True, this tag should be | ||
1225 | represented as <tag/>. If False, this tag should be represented | ||
1226 | as <tag></tag>. | ||
1227 | :param cdata_list_attributes: A list of attributes whose values should | ||
1228 | be treated as CDATA if they ever show up on this tag. | ||
1229 | :param preserve_whitespace_tags: A list of tag names whose contents | ||
1230 | should have their whitespace preserved. | ||
1231 | :param interesting_string_types: This is a NavigableString | ||
1232 | subclass or a tuple of them. When iterating over this | ||
1233 | Tag's strings in methods like Tag.strings or Tag.get_text, | ||
1234 | these are the types of strings that are interesting enough | ||
1235 | to be considered. The default is to consider | ||
1236 | NavigableString and CData the only interesting string | ||
1237 | subtypes. | ||
1238 | :param namespaces: A dictionary mapping currently active | ||
1239 | namespace prefixes to URIs. This can be used later to | ||
1240 | construct CSS selectors. | ||
1241 | """ | ||
786 | if parser is None: | 1242 | if parser is None: |
787 | self.parser_class = None | 1243 | self.parser_class = None |
788 | else: | 1244 | else: |
@@ -793,7 +1249,12 @@ class Tag(PageElement): | |||
793 | raise ValueError("No value provided for new tag's name.") | 1249 | raise ValueError("No value provided for new tag's name.") |
794 | self.name = name | 1250 | self.name = name |
795 | self.namespace = namespace | 1251 | self.namespace = namespace |
1252 | self._namespaces = namespaces or {} | ||
796 | self.prefix = prefix | 1253 | self.prefix = prefix |
1254 | if ((not builder or builder.store_line_numbers) | ||
1255 | and (sourceline is not None or sourcepos is not None)): | ||
1256 | self.sourceline = sourceline | ||
1257 | self.sourcepos = sourcepos | ||
797 | if attrs is None: | 1258 | if attrs is None: |
798 | attrs = {} | 1259 | attrs = {} |
799 | elif attrs: | 1260 | elif attrs: |
@@ -804,32 +1265,109 @@ class Tag(PageElement): | |||
804 | attrs = dict(attrs) | 1265 | attrs = dict(attrs) |
805 | else: | 1266 | else: |
806 | attrs = dict(attrs) | 1267 | attrs = dict(attrs) |
1268 | |||
1269 | # If possible, determine ahead of time whether this tag is an | ||
1270 | # XML tag. | ||
1271 | if builder: | ||
1272 | self.known_xml = builder.is_xml | ||
1273 | else: | ||
1274 | self.known_xml = is_xml | ||
807 | self.attrs = attrs | 1275 | self.attrs = attrs |
808 | self.contents = [] | 1276 | self.contents = [] |
809 | self.setup(parent, previous) | 1277 | self.setup(parent, previous) |
810 | self.hidden = False | 1278 | self.hidden = False |
811 | 1279 | ||
812 | # Set up any substitutions, such as the charset in a META tag. | 1280 | if builder is None: |
813 | if builder is not None: | 1281 | # In the absence of a TreeBuilder, use whatever values were |
1282 | # passed in here. They're probably None, unless this is a copy of some | ||
1283 | # other tag. | ||
1284 | self.can_be_empty_element = can_be_empty_element | ||
1285 | self.cdata_list_attributes = cdata_list_attributes | ||
1286 | self.preserve_whitespace_tags = preserve_whitespace_tags | ||
1287 | self.interesting_string_types = interesting_string_types | ||
1288 | else: | ||
1289 | # Set up any substitutions for this tag, such as the charset in a META tag. | ||
814 | builder.set_up_substitutions(self) | 1290 | builder.set_up_substitutions(self) |
1291 | |||
1292 | # Ask the TreeBuilder whether this tag might be an empty-element tag. | ||
815 | self.can_be_empty_element = builder.can_be_empty_element(name) | 1293 | self.can_be_empty_element = builder.can_be_empty_element(name) |
816 | else: | 1294 | |
817 | self.can_be_empty_element = False | 1295 | # Keep track of the list of attributes of this tag that |
1296 | # might need to be treated as a list. | ||
1297 | # | ||
1298 | # For performance reasons, we store the whole data structure | ||
1299 | # rather than asking the question of every tag. Asking would | ||
1300 | # require building a new data structure every time, and | ||
1301 | # (unlike can_be_empty_element), we almost never need | ||
1302 | # to check this. | ||
1303 | self.cdata_list_attributes = builder.cdata_list_attributes | ||
1304 | |||
1305 | # Keep track of the names that might cause this tag to be treated as a | ||
1306 | # whitespace-preserved tag. | ||
1307 | self.preserve_whitespace_tags = builder.preserve_whitespace_tags | ||
1308 | |||
1309 | if self.name in builder.string_containers: | ||
1310 | # This sort of tag uses a special string container | ||
1311 | # subclass for most of its strings; record that subclass as this tag's interesting string type. | |
1312 | self.interesting_string_types = builder.string_containers[self.name] | ||
1313 | else: | ||
1314 | self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES | ||
818 | 1315 | ||
819 | parserClass = _alias("parser_class") # BS3 | 1316 | parserClass = _alias("parser_class") # BS3 |
820 | 1317 | ||
821 | def __copy__(self): | 1318 | def __deepcopy__(self, memo, recursive=True): |
822 | """A copy of a Tag is a new Tag, unconnected to the parse tree. | 1319 | """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. |
823 | Its contents are a copy of the old Tag's contents. | 1320 | Its contents are a copy of the old Tag's contents. |
824 | """ | 1321 | """ |
825 | clone = type(self)(None, self.builder, self.name, self.namespace, | 1322 | clone = self._clone() |
826 | self.nsprefix, self.attrs) | 1323 | |
1324 | if recursive: | ||
1325 | # Clone this tag's descendants recursively, but without | ||
1326 | # making any recursive function calls. | ||
1327 | tag_stack = [clone] | ||
1328 | for event, element in self._event_stream(self.descendants): | ||
1329 | if event is Tag.END_ELEMENT_EVENT: | ||
1330 | # Stop appending incoming Tags to the Tag that was | ||
1331 | # just closed. | ||
1332 | tag_stack.pop() | ||
1333 | else: | ||
1334 | descendant_clone = element.__deepcopy__( | ||
1335 | memo, recursive=False | ||
1336 | ) | ||
1337 | # Add to its parent's .contents | ||
1338 | tag_stack[-1].append(descendant_clone) | ||
1339 | |||
1340 | if event is Tag.START_ELEMENT_EVENT: | ||
1341 | # Add the Tag itself to the stack so that its | ||
1342 | # children will be .appended to it. | ||
1343 | tag_stack.append(descendant_clone) | ||
1344 | return clone | ||
1345 | |||
1346 | def __copy__(self): | ||
1347 | """A copy of a Tag must always be a deep copy, because a Tag's | ||
1348 | children can only have one parent at a time. | ||
1349 | """ | ||
1350 | return self.__deepcopy__({}) | ||
1351 | |||
1352 | def _clone(self): | ||
1353 | """Create a new Tag just like this one, but with no | ||
1354 | contents and unattached to any parse tree. | ||
1355 | |||
1356 | This is the first step in the deepcopy process. | ||
1357 | """ | ||
1358 | clone = type(self)( | ||
1359 | None, None, self.name, self.namespace, | ||
1360 | self.prefix, self.attrs, is_xml=self._is_xml, | ||
1361 | sourceline=self.sourceline, sourcepos=self.sourcepos, | ||
1362 | can_be_empty_element=self.can_be_empty_element, | ||
1363 | cdata_list_attributes=self.cdata_list_attributes, | ||
1364 | preserve_whitespace_tags=self.preserve_whitespace_tags, | ||
1365 | interesting_string_types=self.interesting_string_types | ||
1366 | ) | ||
827 | for attr in ('can_be_empty_element', 'hidden'): | 1367 | for attr in ('can_be_empty_element', 'hidden'): |
828 | setattr(clone, attr, getattr(self, attr)) | 1368 | setattr(clone, attr, getattr(self, attr)) |
829 | for child in self.contents: | ||
830 | clone.append(child.__copy__()) | ||
831 | return clone | 1369 | return clone |
832 | 1370 | ||
833 | @property | 1371 | @property |
834 | def is_empty_element(self): | 1372 | def is_empty_element(self): |
835 | """Is this tag an empty-element tag? (aka a self-closing tag) | 1373 | """Is this tag an empty-element tag? (aka a self-closing tag) |
@@ -850,13 +1388,17 @@ class Tag(PageElement): | |||
850 | 1388 | ||
851 | @property | 1389 | @property |
852 | def string(self): | 1390 | def string(self): |
853 | """Convenience property to get the single string within this tag. | 1391 | """Convenience property to get the single string within this |
1392 | PageElement. | ||
854 | 1393 | ||
855 | :Return: If this tag has a single string child, return value | 1394 | TODO It might make sense to have NavigableString.string return |
856 | is that string. If this tag has no children, or more than one | 1395 | itself. |
857 | child, return value is None. If this tag has one child tag, | 1396 | |
1397 | :return: If this element has a single string child, return | ||
1398 | value is that string. If this element has one child tag, | ||
858 | return value is the 'string' attribute of the child tag, | 1399 | return value is the 'string' attribute of the child tag, |
859 | recursively. | 1400 | recursively. If this element is itself a string, has no |
1401 | children, or has more than one child, return value is None. | ||
860 | """ | 1402 | """ |
861 | if len(self.contents) != 1: | 1403 | if len(self.contents) != 1: |
862 | return None | 1404 | return None |
@@ -867,57 +1409,75 @@ class Tag(PageElement): | |||
867 | 1409 | ||
868 | @string.setter | 1410 | @string.setter |
869 | def string(self, string): | 1411 | def string(self, string): |
1412 | """Replace this PageElement's contents with `string`.""" | ||
870 | self.clear() | 1413 | self.clear() |
871 | self.append(string.__class__(string)) | 1414 | self.append(string.__class__(string)) |
872 | 1415 | ||
873 | def _all_strings(self, strip=False, types=(NavigableString, CData)): | 1416 | DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) |
1417 | def _all_strings(self, strip=False, types=PageElement.default): | ||
874 | """Yield all strings of certain classes, possibly stripping them. | 1418 | """Yield all strings of certain classes, possibly stripping them. |
875 | 1419 | ||
876 | By default, yields only NavigableString and CData objects. So | 1420 | :param strip: If True, all strings will be stripped before being |
877 | no comments, processing instructions, etc. | 1421 | yielded. |
1422 | |||
1423 | :param types: A tuple of NavigableString subclasses. Any strings of | ||
1424 | a subclass not found in this list will be ignored. By | ||
1425 | default, the subclasses considered are the ones found in | ||
1426 | self.interesting_string_types. If that's not specified, | ||
1427 | only NavigableString and CData objects will be | ||
1428 | considered. That means no comments, processing | ||
1429 | instructions, etc. | ||
1430 | |||
1431 | :yield: A sequence of strings. | ||
1432 | |||
878 | """ | 1433 | """ |
1434 | if types is self.default: | ||
1435 | types = self.interesting_string_types | ||
1436 | |||
879 | for descendant in self.descendants: | 1437 | for descendant in self.descendants: |
880 | if ( | 1438 | if (types is None and not isinstance(descendant, NavigableString)): |
881 | (types is None and not isinstance(descendant, NavigableString)) | 1439 | continue |
882 | or | 1440 | descendant_type = type(descendant) |
883 | (types is not None and type(descendant) not in types)): | 1441 | if isinstance(types, type): |
1442 | if descendant_type is not types: | ||
1443 | # We're not interested in strings of this type. | ||
1444 | continue | ||
1445 | elif types is not None and descendant_type not in types: | ||
1446 | # We're not interested in strings of this type. | ||
884 | continue | 1447 | continue |
885 | if strip: | 1448 | if strip: |
886 | descendant = descendant.strip() | 1449 | descendant = descendant.strip() |
887 | if len(descendant) == 0: | 1450 | if len(descendant) == 0: |
888 | continue | 1451 | continue |
889 | yield descendant | 1452 | yield descendant |
890 | |||
891 | strings = property(_all_strings) | 1453 | strings = property(_all_strings) |
892 | 1454 | ||
893 | @property | 1455 | def decompose(self): |
894 | def stripped_strings(self): | 1456 | """Recursively destroys this PageElement and its children. |
895 | for string in self._all_strings(True): | ||
896 | yield string | ||
897 | 1457 | ||
898 | def get_text(self, separator="", strip=False, | 1458 | This element will be removed from the tree and wiped out; so |
899 | types=(NavigableString, CData)): | 1459 | will everything beneath it. |
900 | """ | ||
901 | Get all child strings, concatenated using the given separator. | ||
902 | """ | ||
903 | return separator.join([s for s in self._all_strings( | ||
904 | strip, types=types)]) | ||
905 | getText = get_text | ||
906 | text = property(get_text) | ||
907 | 1460 | ||
908 | def decompose(self): | 1461 | The behavior of a decomposed PageElement is undefined and you |
909 | """Recursively destroys the contents of this tree.""" | 1462 | should never use one for anything, but if you need to _check_ |
1463 | whether an element has been decomposed, you can use the | ||
1464 | `decomposed` property. | ||
1465 | """ | ||
910 | self.extract() | 1466 | self.extract() |
911 | i = self | 1467 | i = self |
912 | while i is not None: | 1468 | while i is not None: |
913 | next = i.next_element | 1469 | n = i.next_element |
914 | i.__dict__.clear() | 1470 | i.__dict__.clear() |
915 | i.contents = [] | 1471 | i.contents = [] |
916 | i = next | 1472 | i._decomposed = True |
1473 | i = n | ||
917 | 1474 | ||
918 | def clear(self, decompose=False): | 1475 | def clear(self, decompose=False): |
919 | """ | 1476 | """Wipe out all children of this PageElement by calling extract() |
920 | Extract all children. If decompose is True, decompose instead. | 1477 | on them. |
1478 | |||
1479 | :param decompose: If this is True, decompose() (a more | ||
1480 | destructive method) will be called instead of extract(). | ||
921 | """ | 1481 | """ |
922 | if decompose: | 1482 | if decompose: |
923 | for element in self.contents[:]: | 1483 | for element in self.contents[:]: |
@@ -929,10 +1489,51 @@ class Tag(PageElement): | |||
929 | for element in self.contents[:]: | 1489 | for element in self.contents[:]: |
930 | element.extract() | 1490 | element.extract() |
931 | 1491 | ||
932 | def index(self, element): | 1492 | def smooth(self): |
1493 | """Smooth out this element's children by consolidating consecutive | ||
1494 | strings. | ||
1495 | |||
1496 | This makes pretty-printed output look more natural following a | ||
1497 | lot of operations that modified the tree. | ||
933 | """ | 1498 | """ |
934 | Find the index of a child by identity, not value. Avoids issues with | 1499 | # Mark the first position of every pair of children that need |
935 | tag.contents.index(element) getting the index of equal elements. | 1500 | # to be consolidated. Do this rather than making a copy of |
1501 | # self.contents, since in most cases very few strings will be | ||
1502 | # affected. | ||
1503 | marked = [] | ||
1504 | for i, a in enumerate(self.contents): | ||
1505 | if isinstance(a, Tag): | ||
1506 | # Recursively smooth children. | ||
1507 | a.smooth() | ||
1508 | if i == len(self.contents)-1: | ||
1509 | # This is the last item in .contents, and it's not a | ||
1510 | # tag. There's no chance it needs any work. | ||
1511 | continue | ||
1512 | b = self.contents[i+1] | ||
1513 | if (isinstance(a, NavigableString) | ||
1514 | and isinstance(b, NavigableString) | ||
1515 | and not isinstance(a, PreformattedString) | ||
1516 | and not isinstance(b, PreformattedString) | ||
1517 | ): | ||
1518 | marked.append(i) | ||
1519 | |||
1520 | # Go over the marked positions in reverse order, so that | ||
1521 | # removing items from .contents won't affect the remaining | ||
1522 | # positions. | ||
1523 | for i in reversed(marked): | ||
1524 | a = self.contents[i] | ||
1525 | b = self.contents[i+1] | ||
1526 | b.extract() | ||
1527 | n = NavigableString(a+b) | ||
1528 | a.replace_with(n) | ||
1529 | |||
1530 | def index(self, element): | ||
1531 | """Find the index of a child by identity, not value. | ||
1532 | |||
1533 | Avoids issues with tag.contents.index(element) getting the | ||
1534 | index of equal elements. | ||
1535 | |||
1536 | :param element: Look for this PageElement in `self.contents`. | ||
936 | """ | 1537 | """ |
937 | for i, child in enumerate(self.contents): | 1538 | for i, child in enumerate(self.contents): |
938 | if child is element: | 1539 | if child is element: |
@@ -945,23 +1546,38 @@ class Tag(PageElement): | |||
945 | attribute.""" | 1546 | attribute.""" |
946 | return self.attrs.get(key, default) | 1547 | return self.attrs.get(key, default) |
947 | 1548 | ||
1549 | def get_attribute_list(self, key, default=None): | ||
1550 | """The same as get(), but always returns a list. | ||
1551 | |||
1552 | :param key: The attribute to look for. | ||
1553 | :param default: Use this value if the attribute is not present | ||
1554 | on this PageElement. | ||
1555 | :return: A list of values, probably containing only a single | ||
1556 | value. | ||
1557 | """ | ||
1558 | value = self.get(key, default) | ||
1559 | if not isinstance(value, list): | ||
1560 | value = [value] | ||
1561 | return value | ||
1562 | |||
948 | def has_attr(self, key): | 1563 | def has_attr(self, key): |
1564 | """Does this PageElement have an attribute with the given name?""" | ||
949 | return key in self.attrs | 1565 | return key in self.attrs |
950 | 1566 | ||
951 | def __hash__(self): | 1567 | def __hash__(self): |
952 | return str(self).__hash__() | 1568 | return str(self).__hash__() |
953 | 1569 | ||
954 | def __getitem__(self, key): | 1570 | def __getitem__(self, key): |
955 | """tag[key] returns the value of the 'key' attribute for the tag, | 1571 | """tag[key] returns the value of the 'key' attribute for the Tag, |
956 | and throws an exception if it's not there.""" | 1572 | and throws an exception if it's not there.""" |
957 | return self.attrs[key] | 1573 | return self.attrs[key] |
958 | 1574 | ||
959 | def __iter__(self): | 1575 | def __iter__(self): |
960 | "Iterating over a tag iterates over its contents." | 1576 | "Iterating over a Tag iterates over its contents." |
961 | return iter(self.contents) | 1577 | return iter(self.contents) |
962 | 1578 | ||
963 | def __len__(self): | 1579 | def __len__(self): |
964 | "The length of a tag is the length of its list of contents." | 1580 | "The length of a Tag is the length of its list of contents." |
965 | return len(self.contents) | 1581 | return len(self.contents) |
966 | 1582 | ||
967 | def __contains__(self, x): | 1583 | def __contains__(self, x): |
@@ -981,29 +1597,33 @@ class Tag(PageElement): | |||
981 | self.attrs.pop(key, None) | 1597 | self.attrs.pop(key, None) |
982 | 1598 | ||
983 | def __call__(self, *args, **kwargs): | 1599 | def __call__(self, *args, **kwargs): |
984 | """Calling a tag like a function is the same as calling its | 1600 | """Calling a Tag like a function is the same as calling its |
985 | find_all() method. Eg. tag('a') returns a list of all the A tags | 1601 | find_all() method. Eg. tag('a') returns a list of all the A tags |
986 | found within this tag.""" | 1602 | found within this tag.""" |
987 | return self.find_all(*args, **kwargs) | 1603 | return self.find_all(*args, **kwargs) |
988 | 1604 | ||
989 | def __getattr__(self, tag): | 1605 | def __getattr__(self, tag): |
990 | #print "Getattr %s.%s" % (self.__class__, tag) | 1606 | """Calling tag.subtag is the same as calling tag.find(name="subtag")""" |
1607 | #print("Getattr %s.%s" % (self.__class__, tag)) | ||
991 | if len(tag) > 3 and tag.endswith('Tag'): | 1608 | if len(tag) > 3 and tag.endswith('Tag'): |
992 | # BS3: soup.aTag -> soup.find("a") | 1609 | # BS3: soup.aTag -> soup.find("a") |
993 | tag_name = tag[:-3] | 1610 | tag_name = tag[:-3] |
994 | warnings.warn( | 1611 | warnings.warn( |
995 | '.%sTag is deprecated, use .find("%s") instead.' % ( | 1612 | '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( |
996 | tag_name, tag_name)) | 1613 | name=tag_name |
1614 | ), | ||
1615 | DeprecationWarning, stacklevel=2 | ||
1616 | ) | ||
997 | return self.find(tag_name) | 1617 | return self.find(tag_name) |
998 | # We special case contents to avoid recursion. | 1618 | # We special case contents to avoid recursion. |
999 | elif not tag.startswith("__") and not tag=="contents": | 1619 | elif not tag.startswith("__") and not tag == "contents": |
1000 | return self.find(tag) | 1620 | return self.find(tag) |
1001 | raise AttributeError( | 1621 | raise AttributeError( |
1002 | "'%s' object has no attribute '%s'" % (self.__class__, tag)) | 1622 | "'%s' object has no attribute '%s'" % (self.__class__, tag)) |
1003 | 1623 | ||
1004 | def __eq__(self, other): | 1624 | def __eq__(self, other): |
1005 | """Returns true iff this tag has the same name, the same attributes, | 1625 | """Returns true iff this Tag has the same name, the same attributes, |
1006 | and the same contents (recursively) as the given tag.""" | 1626 | and the same contents (recursively) as `other`.""" |
1007 | if self is other: | 1627 | if self is other: |
1008 | return True | 1628 | return True |
1009 | if (not hasattr(other, 'name') or | 1629 | if (not hasattr(other, 'name') or |
@@ -1019,69 +1639,235 @@ class Tag(PageElement): | |||
1019 | return True | 1639 | return True |
1020 | 1640 | ||
1021 | def __ne__(self, other): | 1641 | def __ne__(self, other): |
1022 | """Returns true iff this tag is not identical to the other tag, | 1642 | """Returns true iff this Tag is not identical to `other`, |
1023 | as defined in __eq__.""" | 1643 | as defined in __eq__.""" |
1024 | return not self == other | 1644 | return not self == other |
1025 | 1645 | ||
1026 | def __repr__(self, encoding="unicode-escape"): | 1646 | def __repr__(self, encoding="unicode-escape"): |
1027 | """Renders this tag as a string.""" | 1647 | """Renders this PageElement as a string. |
1028 | if PY3K: | ||
1029 | # "The return value must be a string object", i.e. Unicode | ||
1030 | return self.decode() | ||
1031 | else: | ||
1032 | # "The return value must be a string object", i.e. a bytestring. | ||
1033 | # By convention, the return value of __repr__ should also be | ||
1034 | # an ASCII string. | ||
1035 | return self.encode(encoding) | ||
1036 | 1648 | ||
1037 | def __unicode__(self): | 1649 | :param encoding: The encoding to use (Python 2 only). |
1650 | TODO: This is now ignored and a warning should be issued | ||
1651 | if a value is provided. | ||
1652 | :return: A (Unicode) string. | ||
1653 | """ | ||
1654 | # "The return value must be a string object", i.e. Unicode | ||
1038 | return self.decode() | 1655 | return self.decode() |
1039 | 1656 | ||
1040 | def __str__(self): | 1657 | def __unicode__(self): |
1041 | if PY3K: | 1658 | """Renders this PageElement as a Unicode string.""" |
1042 | return self.decode() | 1659 | return self.decode() |
1043 | else: | ||
1044 | return self.encode() | ||
1045 | 1660 | ||
1046 | if PY3K: | 1661 | __str__ = __repr__ = __unicode__ |
1047 | __str__ = __repr__ = __unicode__ | ||
1048 | 1662 | ||
1049 | def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, | 1663 | def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, |
1050 | indent_level=None, formatter="minimal", | 1664 | indent_level=None, formatter="minimal", |
1051 | errors="xmlcharrefreplace"): | 1665 | errors="xmlcharrefreplace"): |
1666 | """Render a bytestring representation of this PageElement and its | ||
1667 | contents. | ||
1668 | |||
1669 | :param encoding: The destination encoding. | ||
1670 | :param indent_level: Each line of the rendering will be | ||
1671 | indented this many levels. (The formatter decides what a | ||
1672 | 'level' means in terms of spaces or other characters | ||
1673 | output.) Used internally in recursive calls while | ||
1674 | pretty-printing. | ||
1675 | :param formatter: A Formatter object, or a string naming one of | ||
1676 | the standard formatters. | ||
1677 | :param errors: An error handling strategy such as | ||
1678 | 'xmlcharrefreplace'. This value is passed along into | ||
1679 | encode() and its value should be one of the constants | ||
1680 | defined by Python. | ||
1681 | :return: A bytestring. | ||
1682 | |||
1683 | """ | ||
1052 | # Turn the data structure into Unicode, then encode the | 1684 | # Turn the data structure into Unicode, then encode the |
1053 | # Unicode. | 1685 | # Unicode. |
1054 | u = self.decode(indent_level, encoding, formatter) | 1686 | u = self.decode(indent_level, encoding, formatter) |
1055 | return u.encode(encoding, errors) | 1687 | return u.encode(encoding, errors) |
1056 | 1688 | ||
1057 | def _should_pretty_print(self, indent_level): | ||
1058 | """Should this tag be pretty-printed?""" | ||
1059 | return ( | ||
1060 | indent_level is not None and | ||
1061 | (self.name not in HTMLAwareEntitySubstitution.preformatted_tags | ||
1062 | or self._is_xml)) | ||
1063 | |||
1064 | def decode(self, indent_level=None, | 1689 | def decode(self, indent_level=None, |
1065 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, | 1690 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, |
1066 | formatter="minimal"): | 1691 | formatter="minimal", |
1067 | """Returns a Unicode representation of this tag and its contents. | 1692 | iterator=None): |
1693 | pieces = [] | ||
1694 | # First off, turn a non-Formatter `formatter` into a Formatter | ||
1695 | # object. This will stop the lookup from happening over and | ||
1696 | # over again. | ||
1697 | if not isinstance(formatter, Formatter): | ||
1698 | formatter = self.formatter_for_name(formatter) | ||
1699 | |||
1700 | if indent_level is True: | ||
1701 | indent_level = 0 | ||
1702 | |||
1703 | # The currently active tag that put us into string literal | ||
1704 | # mode. Until this element is closed, children will be treated | ||
1705 | # as string literals and not pretty-printed. String literal | ||
1706 | # mode is turned on immediately after this tag begins, and | ||
1707 | # turned off immediately before it's closed. This means there | ||
1708 | # will be whitespace before and after the tag itself. | ||
1709 | string_literal_tag = None | ||
1710 | |||
1711 | for event, element in self._event_stream(iterator): | ||
1712 | if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): | ||
1713 | piece = element._format_tag( | ||
1714 | eventual_encoding, formatter, opening=True | ||
1715 | ) | ||
1716 | elif event is Tag.END_ELEMENT_EVENT: | ||
1717 | piece = element._format_tag( | ||
1718 | eventual_encoding, formatter, opening=False | ||
1719 | ) | ||
1720 | if indent_level is not None: | ||
1721 | indent_level -= 1 | ||
1722 | else: | ||
1723 | piece = element.output_ready(formatter) | ||
1724 | |||
1725 | # Now we need to apply the 'prettiness' -- extra | ||
1726 | # whitespace before and/or after this tag. This can get | ||
1727 | # complicated because certain tags, like <pre> and | ||
1728 | # <script>, can't be prettified, since adding whitespace would | ||
1729 | # change the meaning of the content. | ||
1730 | |||
1731 | # The default behavior is to add whitespace before and | ||
1732 | # after an element when string literal mode is off, and to | ||
1733 | # leave things as they are when string literal mode is on. | ||
1734 | if string_literal_tag: | ||
1735 | indent_before = indent_after = False | ||
1736 | else: | ||
1737 | indent_before = indent_after = True | ||
1738 | |||
1739 | # The only time the behavior is more complex than that is | ||
1740 | # when we encounter an opening or closing tag that might | ||
1741 | # put us into or out of string literal mode. | ||
1742 | if (event is Tag.START_ELEMENT_EVENT | ||
1743 | and not string_literal_tag | ||
1744 | and not element._should_pretty_print()): | ||
1745 | # We are about to enter string literal mode. Add | ||
1746 | # whitespace before this tag, but not after. We | ||
1747 | # will stay in string literal mode until this tag | ||
1748 | # is closed. | ||
1749 | indent_before = True | ||
1750 | indent_after = False | ||
1751 | string_literal_tag = element | ||
1752 | elif (event is Tag.END_ELEMENT_EVENT | ||
1753 | and element is string_literal_tag): | ||
1754 | # We are about to exit string literal mode by closing | ||
1755 | # the tag that sent us into that mode. Add whitespace | ||
1756 | # after this tag, but not before. | ||
1757 | indent_before = False | ||
1758 | indent_after = True | ||
1759 | string_literal_tag = None | ||
1760 | |||
1761 | # Now we know whether to add whitespace before and/or | ||
1762 | # after this element. | ||
1763 | if indent_level is not None: | ||
1764 | if (indent_before or indent_after): | ||
1765 | if isinstance(element, NavigableString): | ||
1766 | piece = piece.strip() | ||
1767 | if piece: | ||
1768 | piece = self._indent_string( | ||
1769 | piece, indent_level, formatter, | ||
1770 | indent_before, indent_after | ||
1771 | ) | ||
1772 | if event == Tag.START_ELEMENT_EVENT: | ||
1773 | indent_level += 1 | ||
1774 | pieces.append(piece) | ||
1775 | return "".join(pieces) | ||
1776 | |||
1777 | # Names for the different events yielded by _event_stream | ||
1778 | START_ELEMENT_EVENT = object() | ||
1779 | END_ELEMENT_EVENT = object() | ||
1780 | EMPTY_ELEMENT_EVENT = object() | ||
1781 | STRING_ELEMENT_EVENT = object() | ||
1782 | |||
1783 | def _event_stream(self, iterator=None): | ||
1784 | """Yield a sequence of events that can be used to reconstruct the DOM | ||
1785 | for this element. | ||
1786 | |||
1787 | This lets us recreate the nested structure of this element | ||
1788 | (e.g. when formatting it as a string) without using recursive | ||
1789 | method calls. | ||
1790 | |||
1791 | This is similar in concept to the SAX API, but it's a simpler | ||
1792 | interface designed for internal use. The events are different | ||
1793 | from SAX and the arguments associated with the events are Tags | ||
1794 | and other Beautiful Soup objects. | ||
1795 | |||
1796 | :param iterator: An alternate iterator to use when traversing | ||
1797 | the tree. | ||
1798 | """ | ||
1799 | tag_stack = [] | ||
1068 | 1800 | ||
1069 | :param eventual_encoding: The tag is destined to be | 1801 | iterator = iterator or self.self_and_descendants |
1070 | encoded into this encoding. This method is _not_ | 1802 | |
1071 | responsible for performing that encoding. This information | 1803 | for c in iterator: |
1072 | is passed in so that it can be substituted in if the | 1804 | # If the parent of the element we're about to yield is not |
1073 | document contains a <META> tag that mentions the document's | 1805 | # the tag currently on the stack, it means that the tag on |
1074 | encoding. | 1806 | # the stack closed before this element appeared. |
1807 | while tag_stack and c.parent != tag_stack[-1]: | ||
1808 | now_closed_tag = tag_stack.pop() | ||
1809 | yield Tag.END_ELEMENT_EVENT, now_closed_tag | ||
1810 | |||
1811 | if isinstance(c, Tag): | ||
1812 | if c.is_empty_element: | ||
1813 | yield Tag.EMPTY_ELEMENT_EVENT, c | ||
1814 | else: | ||
1815 | yield Tag.START_ELEMENT_EVENT, c | ||
1816 | tag_stack.append(c) | ||
1817 | continue | ||
1818 | else: | ||
1819 | yield Tag.STRING_ELEMENT_EVENT, c | ||
1820 | |||
1821 | while tag_stack: | ||
1822 | now_closed_tag = tag_stack.pop() | ||
1823 | yield Tag.END_ELEMENT_EVENT, now_closed_tag | ||
1824 | |||
1825 | def _indent_string(self, s, indent_level, formatter, | ||
1826 | indent_before, indent_after): | ||
1827 | """Add indentation whitespace before and/or after a string. | ||
1828 | |||
1829 | :param s: The string to amend with whitespace. | ||
1830 | :param indent_level: The indentation level; affects how much | ||
1831 | whitespace goes before the string. | ||
1832 | :param indent_before: Whether or not to add whitespace | ||
1833 | before the string. | ||
1834 | :param indent_after: Whether or not to add whitespace | ||
1835 | (a newline) after the string. | ||
1075 | """ | 1836 | """ |
1837 | space_before = '' | ||
1838 | if indent_before and indent_level: | ||
1839 | space_before = (formatter.indent * indent_level) | ||
1076 | 1840 | ||
1077 | # First off, turn a string formatter into a function. This | 1841 | space_after = '' |
1078 | # will stop the lookup from happening over and over again. | 1842 | if indent_after: |
1079 | if not isinstance(formatter, collections.abc.Callable): | 1843 | space_after = "\n" |
1080 | formatter = self._formatter_for_name(formatter) | ||
1081 | 1844 | ||
1082 | attrs = [] | 1845 | return space_before + s + space_after |
1083 | if self.attrs: | 1846 | |
1084 | for key, val in sorted(self.attrs.items()): | 1847 | def _format_tag(self, eventual_encoding, formatter, opening): |
1848 | if self.hidden: | ||
1849 | # A hidden tag is invisible, although its contents | ||
1850 | # are visible. | ||
1851 | return '' | ||
1852 | |||
1853 | # A tag starts with the < character (see below). | ||
1854 | |||
1855 | # Then the / character, if this is a closing tag. | ||
1856 | closing_slash = '' | ||
1857 | if not opening: | ||
1858 | closing_slash = '/' | ||
1859 | |||
1860 | # Then an optional namespace prefix. | ||
1861 | prefix = '' | ||
1862 | if self.prefix: | ||
1863 | prefix = self.prefix + ":" | ||
1864 | |||
1865 | # Then a list of attribute values, if this is an opening tag. | ||
1866 | attribute_string = '' | ||
1867 | if opening: | ||
1868 | attributes = formatter.attributes(self) | ||
1869 | attrs = [] | ||
1870 | for key, val in attributes: | ||
1085 | if val is None: | 1871 | if val is None: |
1086 | decoded = key | 1872 | decoded = key |
1087 | else: | 1873 | else: |
@@ -1090,71 +1876,52 @@ class Tag(PageElement): | |||
1090 | elif not isinstance(val, str): | 1876 | elif not isinstance(val, str): |
1091 | val = str(val) | 1877 | val = str(val) |
1092 | elif ( | 1878 | elif ( |
1093 | isinstance(val, AttributeValueWithCharsetSubstitution) | 1879 | isinstance(val, AttributeValueWithCharsetSubstitution) |
1094 | and eventual_encoding is not None): | 1880 | and eventual_encoding is not None |
1881 | ): | ||
1095 | val = val.encode(eventual_encoding) | 1882 | val = val.encode(eventual_encoding) |
1096 | 1883 | ||
1097 | text = self.format_string(val, formatter) | 1884 | text = formatter.attribute_value(val) |
1098 | decoded = ( | 1885 | decoded = ( |
1099 | str(key) + '=' | 1886 | str(key) + '=' |
1100 | + EntitySubstitution.quoted_attribute_value(text)) | 1887 | + formatter.quoted_attribute_value(text)) |
1101 | attrs.append(decoded) | 1888 | attrs.append(decoded) |
1102 | close = '' | 1889 | if attrs: |
1103 | closeTag = '' | 1890 | attribute_string = ' ' + ' '.join(attrs) |
1104 | |||
1105 | prefix = '' | ||
1106 | if self.prefix: | ||
1107 | prefix = self.prefix + ":" | ||
1108 | 1891 | ||
1892 | # Then an optional closing slash (for a void element in an | ||
1893 | # XML document). | ||
1894 | void_element_closing_slash = '' | ||
1109 | if self.is_empty_element: | 1895 | if self.is_empty_element: |
1110 | close = '/' | 1896 | void_element_closing_slash = formatter.void_element_close_prefix or '' |
1111 | else: | ||
1112 | closeTag = '</%s%s>' % (prefix, self.name) | ||
1113 | |||
1114 | pretty_print = self._should_pretty_print(indent_level) | ||
1115 | space = '' | ||
1116 | indent_space = '' | ||
1117 | if indent_level is not None: | ||
1118 | indent_space = (' ' * (indent_level - 1)) | ||
1119 | if pretty_print: | ||
1120 | space = indent_space | ||
1121 | indent_contents = indent_level + 1 | ||
1122 | else: | ||
1123 | indent_contents = None | ||
1124 | contents = self.decode_contents( | ||
1125 | indent_contents, eventual_encoding, formatter) | ||
1126 | 1897 | ||
1127 | if self.hidden: | 1898 | # Put it all together. |
1128 | # This is the 'document root' object. | 1899 | return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>' |
1129 | s = contents | 1900 | |
1130 | else: | 1901 | def _should_pretty_print(self, indent_level=1): |
1131 | s = [] | 1902 | """Should this tag be pretty-printed? |
1132 | attribute_string = '' | 1903 | |
1133 | if attrs: | 1904 | Most of them should, but some (such as <pre> in HTML |
1134 | attribute_string = ' ' + ' '.join(attrs) | 1905 | documents) should not. |
1135 | if indent_level is not None: | 1906 | """ |
1136 | # Even if this particular tag is not pretty-printed, | 1907 | return ( |
1137 | # we should indent up to the start of the tag. | 1908 | indent_level is not None |
1138 | s.append(indent_space) | 1909 | and ( |
1139 | s.append('<%s%s%s%s>' % ( | 1910 | not self.preserve_whitespace_tags |
1140 | prefix, self.name, attribute_string, close)) | 1911 | or self.name not in self.preserve_whitespace_tags |
1141 | if pretty_print: | 1912 | ) |
1142 | s.append("\n") | 1913 | ) |
1143 | s.append(contents) | ||
1144 | if pretty_print and contents and contents[-1] != "\n": | ||
1145 | s.append("\n") | ||
1146 | if pretty_print and closeTag: | ||
1147 | s.append(space) | ||
1148 | s.append(closeTag) | ||
1149 | if indent_level is not None and closeTag and self.next_sibling: | ||
1150 | # Even if this particular tag is not pretty-printed, | ||
1151 | # we're now done with the tag, and we should add a | ||
1152 | # newline if appropriate. | ||
1153 | s.append("\n") | ||
1154 | s = ''.join(s) | ||
1155 | return s | ||
1156 | 1914 | ||
1157 | def prettify(self, encoding=None, formatter="minimal"): | 1915 | def prettify(self, encoding=None, formatter="minimal"): |
1916 | """Pretty-print this PageElement as a string. | ||
1917 | |||
1918 | :param encoding: The eventual encoding of the string. If this is None, | ||
1919 | a Unicode string will be returned. | ||
1920 | :param formatter: A Formatter object, or a string naming one of | ||
1921 | the standard formatters. | ||
1922 | :return: A Unicode string (if encoding==None) or a bytestring | ||
1923 | (otherwise). | ||
1924 | """ | ||
1158 | if encoding is None: | 1925 | if encoding is None: |
1159 | return self.decode(True, formatter=formatter) | 1926 | return self.decode(True, formatter=formatter) |
1160 | else: | 1927 | else: |
@@ -1166,62 +1933,50 @@ class Tag(PageElement): | |||
1166 | """Renders the contents of this tag as a Unicode string. | 1933 | """Renders the contents of this tag as a Unicode string. |
1167 | 1934 | ||
1168 | :param indent_level: Each line of the rendering will be | 1935 | :param indent_level: Each line of the rendering will be |
1169 | indented this many spaces. | 1936 | indented this many levels. (The formatter decides what a |
1937 | 'level' means in terms of spaces or other characters | ||
1938 | output.) Used internally in recursive calls while | ||
1939 | pretty-printing. | ||
1170 | 1940 | ||
1171 | :param eventual_encoding: The tag is destined to be | 1941 | :param eventual_encoding: The tag is destined to be |
1172 | encoded into this encoding. This method is _not_ | 1942 | encoded into this encoding. decode_contents() is _not_ |
1173 | responsible for performing that encoding. This information | 1943 | responsible for performing that encoding. This information |
1174 | is passed in so that it can be substituted in if the | 1944 | is passed in so that it can be substituted in if the |
1175 | document contains a <META> tag that mentions the document's | 1945 | document contains a <META> tag that mentions the document's |
1176 | encoding. | 1946 | encoding. |
1177 | 1947 | ||
1178 | :param formatter: The output formatter responsible for converting | 1948 | :param formatter: A Formatter object, or a string naming one of |
1179 | entities to Unicode characters. | 1949 | the standard Formatters. |
1180 | """ | 1950 | |
1181 | # First off, turn a string formatter into a function. This | 1951 | """ |
1182 | # will stop the lookup from happening over and over again. | 1952 | return self.decode(indent_level, eventual_encoding, formatter, |
1183 | if not isinstance(formatter, collections.abc.Callable): | 1953 | iterator=self.descendants) |
1184 | formatter = self._formatter_for_name(formatter) | ||
1185 | |||
1186 | pretty_print = (indent_level is not None) | ||
1187 | s = [] | ||
1188 | for c in self: | ||
1189 | text = None | ||
1190 | if isinstance(c, NavigableString): | ||
1191 | text = c.output_ready(formatter) | ||
1192 | elif isinstance(c, Tag): | ||
1193 | s.append(c.decode(indent_level, eventual_encoding, | ||
1194 | formatter)) | ||
1195 | if text and indent_level and not self.name == 'pre': | ||
1196 | text = text.strip() | ||
1197 | if text: | ||
1198 | if pretty_print and not self.name == 'pre': | ||
1199 | s.append(" " * (indent_level - 1)) | ||
1200 | s.append(text) | ||
1201 | if pretty_print and not self.name == 'pre': | ||
1202 | s.append("\n") | ||
1203 | return ''.join(s) | ||
1204 | 1954 | ||
1205 | def encode_contents( | 1955 | def encode_contents( |
1206 | self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, | 1956 | self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, |
1207 | formatter="minimal"): | 1957 | formatter="minimal"): |
1208 | """Renders the contents of this tag as a bytestring. | 1958 | """Renders the contents of this PageElement as a bytestring. |
1209 | 1959 | ||
1210 | :param indent_level: Each line of the rendering will be | 1960 | :param indent_level: Each line of the rendering will be |
1211 | indented this many spaces. | 1961 | indented this many levels. (The formatter decides what a |
1962 | 'level' means in terms of spaces or other characters | ||
1963 | output.) Used internally in recursive calls while | ||
1964 | pretty-printing. | ||
1212 | 1965 | ||
1213 | :param eventual_encoding: The bytestring will be in this encoding. | 1966 | :param eventual_encoding: The bytestring will be in this encoding. |
1214 | 1967 | ||
1215 | :param formatter: The output formatter responsible for converting | 1968 | :param formatter: A Formatter object, or a string naming one of |
1216 | entities to Unicode characters. | 1969 | the standard Formatters. |
1217 | """ | ||
1218 | 1970 | ||
1971 | :return: A bytestring. | ||
1972 | """ | ||
1219 | contents = self.decode_contents(indent_level, encoding, formatter) | 1973 | contents = self.decode_contents(indent_level, encoding, formatter) |
1220 | return contents.encode(encoding) | 1974 | return contents.encode(encoding) |
1221 | 1975 | ||
1222 | # Old method for BS3 compatibility | 1976 | # Old method for BS3 compatibility |
1223 | def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, | 1977 | def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, |
1224 | prettyPrint=False, indentLevel=0): | 1978 | prettyPrint=False, indentLevel=0): |
1979 | """Deprecated method for BS3 compatibility.""" | ||
1225 | if not prettyPrint: | 1980 | if not prettyPrint: |
1226 | indentLevel = None | 1981 | indentLevel = None |
1227 | return self.encode_contents( | 1982 | return self.encode_contents( |
@@ -1229,44 +1984,88 @@ class Tag(PageElement): | |||
1229 | 1984 | ||
1230 | #Soup methods | 1985 | #Soup methods |
1231 | 1986 | ||
1232 | def find(self, name=None, attrs={}, recursive=True, text=None, | 1987 | def find(self, name=None, attrs={}, recursive=True, string=None, |
1233 | **kwargs): | 1988 | **kwargs): |
1234 | """Return only the first child of this Tag matching the given | 1989 | """Look in the children of this PageElement and find the first |
1235 | criteria.""" | 1990 | PageElement that matches the given criteria. |
1991 | |||
1992 | All find_* methods take a common set of arguments. See the online | ||
1993 | documentation for detailed explanations. | ||
1994 | |||
1995 | :param name: A filter on tag name. | ||
1996 | :param attrs: A dictionary of filters on attribute values. | ||
1997 | :param recursive: If this is True, find() will perform a | ||
1998 | recursive search of this PageElement's children. Otherwise, | ||
1999 | only the direct children will be considered. | ||
2000 | :param limit: Stop looking after finding this many results. | ||
2001 | :kwargs: A dictionary of filters on attribute values. | ||
2002 | :return: A PageElement. | ||
2003 | :rtype: bs4.element.Tag | bs4.element.NavigableString | ||
2004 | """ | ||
1236 | r = None | 2005 | r = None |
1237 | l = self.find_all(name, attrs, recursive, text, 1, **kwargs) | 2006 | l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, |
2007 | **kwargs) | ||
1238 | if l: | 2008 | if l: |
1239 | r = l[0] | 2009 | r = l[0] |
1240 | return r | 2010 | return r |
1241 | findChild = find | 2011 | findChild = find #BS2 |
1242 | 2012 | ||
1243 | def find_all(self, name=None, attrs={}, recursive=True, text=None, | 2013 | def find_all(self, name=None, attrs={}, recursive=True, string=None, |
1244 | limit=None, **kwargs): | 2014 | limit=None, **kwargs): |
1245 | """Extracts a list of Tag objects that match the given | 2015 | """Look in the children of this PageElement and find all |
1246 | criteria. You can specify the name of the Tag and any | 2016 | PageElements that match the given criteria. |
1247 | attributes you want the Tag to have. | 2017 | |
1248 | 2018 | All find_* methods take a common set of arguments. See the online | |
1249 | The value of a key-value pair in the 'attrs' map can be a | 2019 | documentation for detailed explanations. |
1250 | string, a list of strings, a regular expression object, or a | 2020 | |
1251 | callable that takes a string and returns whether or not the | 2021 | :param name: A filter on tag name. |
1252 | string matches for some custom definition of 'matches'. The | 2022 | :param attrs: A dictionary of filters on attribute values. |
1253 | same is true of the tag name.""" | 2023 | :param recursive: If this is True, find_all() will perform a |
1254 | 2024 | recursive search of this PageElement's children. Otherwise, | |
2025 | only the direct children will be considered. | ||
2026 | :param limit: Stop looking after finding this many results. | ||
2027 | :kwargs: A dictionary of filters on attribute values. | ||
2028 | :return: A ResultSet of PageElements. | ||
2029 | :rtype: bs4.element.ResultSet | ||
2030 | """ | ||
1255 | generator = self.descendants | 2031 | generator = self.descendants |
1256 | if not recursive: | 2032 | if not recursive: |
1257 | generator = self.children | 2033 | generator = self.children |
1258 | return self._find_all(name, attrs, text, limit, generator, **kwargs) | 2034 | _stacklevel = kwargs.pop('_stacklevel', 2) |
2035 | return self._find_all(name, attrs, string, limit, generator, | ||
2036 | _stacklevel=_stacklevel+1, **kwargs) | ||
1259 | findAll = find_all # BS3 | 2037 | findAll = find_all # BS3 |
1260 | findChildren = find_all # BS2 | 2038 | findChildren = find_all # BS2 |
1261 | 2039 | ||
1262 | #Generator methods | 2040 | #Generator methods |
1263 | @property | 2041 | @property |
1264 | def children(self): | 2042 | def children(self): |
2043 | """Iterate over all direct children of this PageElement. | ||
2044 | |||
2045 | :yield: A sequence of PageElements. | ||
2046 | """ | ||
1265 | # return iter() to make the purpose of the method clear | 2047 | # return iter() to make the purpose of the method clear |
1266 | return iter(self.contents) # XXX This seems to be untested. | 2048 | return iter(self.contents) # XXX This seems to be untested. |
1267 | 2049 | ||
1268 | @property | 2050 | @property |
2051 | def self_and_descendants(self): | ||
2052 | """Iterate over this PageElement and its children in a | ||
2053 | breadth-first sequence. | ||
2054 | |||
2055 | :yield: A sequence of PageElements. | ||
2056 | """ | ||
2057 | if not self.hidden: | ||
2058 | yield self | ||
2059 | for i in self.descendants: | ||
2060 | yield i | ||
2061 | |||
2062 | @property | ||
1269 | def descendants(self): | 2063 | def descendants(self): |
2064 | """Iterate over all children of this PageElement in a | ||
2065 | breadth-first sequence. | ||
2066 | |||
2067 | :yield: A sequence of PageElements. | ||
2068 | """ | ||
1270 | if not len(self.contents): | 2069 | if not len(self.contents): |
1271 | return | 2070 | return |
1272 | stopNode = self._last_descendant().next_element | 2071 | stopNode = self._last_descendant().next_element |
@@ -1276,262 +2075,102 @@ class Tag(PageElement): | |||
1276 | current = current.next_element | 2075 | current = current.next_element |
1277 | 2076 | ||
1278 | # CSS selector code | 2077 | # CSS selector code |
2078 | def select_one(self, selector, namespaces=None, **kwargs): | ||
2079 | """Perform a CSS selection operation on the current element. | ||
1279 | 2080 | ||
1280 | _selector_combinators = ['>', '+', '~'] | 2081 | :param selector: A CSS selector. |
1281 | _select_debug = False | ||
1282 | def select_one(self, selector): | ||
1283 | """Perform a CSS selection operation on the current element.""" | ||
1284 | value = self.select(selector, limit=1) | ||
1285 | if value: | ||
1286 | return value[0] | ||
1287 | return None | ||
1288 | 2082 | ||
1289 | def select(self, selector, _candidate_generator=None, limit=None): | 2083 | :param namespaces: A dictionary mapping namespace prefixes |
1290 | """Perform a CSS selection operation on the current element.""" | 2084 | used in the CSS selector to namespace URIs. By default, |
1291 | 2085 | Beautiful Soup will use the prefixes it encountered while | |
1292 | # Handle grouping selectors if ',' exists, ie: p,a | 2086 | parsing the document. |
1293 | if ',' in selector: | ||
1294 | context = [] | ||
1295 | for partial_selector in selector.split(','): | ||
1296 | partial_selector = partial_selector.strip() | ||
1297 | if partial_selector == '': | ||
1298 | raise ValueError('Invalid group selection syntax: %s' % selector) | ||
1299 | candidates = self.select(partial_selector, limit=limit) | ||
1300 | for candidate in candidates: | ||
1301 | if candidate not in context: | ||
1302 | context.append(candidate) | ||
1303 | |||
1304 | if limit and len(context) >= limit: | ||
1305 | break | ||
1306 | return context | ||
1307 | 2087 | ||
1308 | tokens = selector.split() | 2088 | :param kwargs: Keyword arguments to be passed into Soup Sieve's |
1309 | current_context = [self] | 2089 | soupsieve.select() method. |
1310 | 2090 | ||
1311 | if tokens[-1] in self._selector_combinators: | 2091 | :return: A Tag. |
1312 | raise ValueError( | 2092 | :rtype: bs4.element.Tag |
1313 | 'Final combinator "%s" is missing an argument.' % tokens[-1]) | 2093 | """ |
2094 | return self.css.select_one(selector, namespaces, **kwargs) | ||
1314 | 2095 | ||
1315 | if self._select_debug: | 2096 | def select(self, selector, namespaces=None, limit=None, **kwargs): |
1316 | print('Running CSS selector "%s"' % selector) | 2097 | """Perform a CSS selection operation on the current element. |
1317 | 2098 | ||
1318 | for index, token in enumerate(tokens): | 2099 | This uses the SoupSieve library. |
1319 | new_context = [] | ||
1320 | new_context_ids = set([]) | ||
1321 | 2100 | ||
1322 | if tokens[index-1] in self._selector_combinators: | 2101 | :param selector: A string containing a CSS selector. |
1323 | # This token was consumed by the previous combinator. Skip it. | ||
1324 | if self._select_debug: | ||
1325 | print(' Token was consumed by the previous combinator.') | ||
1326 | continue | ||
1327 | 2102 | ||
1328 | if self._select_debug: | 2103 | :param namespaces: A dictionary mapping namespace prefixes |
1329 | print(' Considering token "%s"' % token) | 2104 | used in the CSS selector to namespace URIs. By default, |
1330 | recursive_candidate_generator = None | 2105 | Beautiful Soup will use the prefixes it encountered while |
1331 | tag_name = None | 2106 | parsing the document. |
1332 | 2107 | ||
1333 | # Each operation corresponds to a checker function, a rule | 2108 | :param limit: After finding this number of results, stop looking. |
1334 | # for determining whether a candidate matches the | 2109 | |
1335 | # selector. Candidates are generated by the active | 2110 | :param kwargs: Keyword arguments to be passed into SoupSieve's |
1336 | # iterator. | 2111 | soupsieve.select() method. |
1337 | checker = None | 2112 | |
1338 | 2113 | :return: A ResultSet of Tags. | |
1339 | m = self.attribselect_re.match(token) | 2114 | :rtype: bs4.element.ResultSet |
1340 | if m is not None: | 2115 | """ |
1341 | # Attribute selector | 2116 | return self.css.select(selector, namespaces, limit, **kwargs) |
1342 | tag_name, attribute, operator, value = m.groups() | 2117 | |
1343 | checker = self._attribute_checker(operator, attribute, value) | 2118 | @property |
1344 | 2119 | def css(self): | |
1345 | elif '#' in token: | 2120 | """Return an interface to the CSS selector API.""" |
1346 | # ID selector | 2121 | return CSS(self) |
1347 | tag_name, tag_id = token.split('#', 1) | ||
1348 | def id_matches(tag): | ||
1349 | return tag.get('id', None) == tag_id | ||
1350 | checker = id_matches | ||
1351 | |||
1352 | elif '.' in token: | ||
1353 | # Class selector | ||
1354 | tag_name, klass = token.split('.', 1) | ||
1355 | classes = set(klass.split('.')) | ||
1356 | def classes_match(candidate): | ||
1357 | return classes.issubset(candidate.get('class', [])) | ||
1358 | checker = classes_match | ||
1359 | |||
1360 | elif ':' in token: | ||
1361 | # Pseudo-class | ||
1362 | tag_name, pseudo = token.split(':', 1) | ||
1363 | if tag_name == '': | ||
1364 | raise ValueError( | ||
1365 | "A pseudo-class must be prefixed with a tag name.") | ||
1366 | pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) | ||
1367 | found = [] | ||
1368 | if pseudo_attributes is None: | ||
1369 | pseudo_type = pseudo | ||
1370 | pseudo_value = None | ||
1371 | else: | ||
1372 | pseudo_type, pseudo_value = pseudo_attributes.groups() | ||
1373 | if pseudo_type == 'nth-of-type': | ||
1374 | try: | ||
1375 | pseudo_value = int(pseudo_value) | ||
1376 | except: | ||
1377 | raise NotImplementedError( | ||
1378 | 'Only numeric values are currently supported for the nth-of-type pseudo-class.') | ||
1379 | if pseudo_value < 1: | ||
1380 | raise ValueError( | ||
1381 | 'nth-of-type pseudo-class value must be at least 1.') | ||
1382 | class Counter(object): | ||
1383 | def __init__(self, destination): | ||
1384 | self.count = 0 | ||
1385 | self.destination = destination | ||
1386 | |||
1387 | def nth_child_of_type(self, tag): | ||
1388 | self.count += 1 | ||
1389 | if self.count == self.destination: | ||
1390 | return True | ||
1391 | if self.count > self.destination: | ||
1392 | # Stop the generator that's sending us | ||
1393 | # these things. | ||
1394 | raise StopIteration() | ||
1395 | return False | ||
1396 | checker = Counter(pseudo_value).nth_child_of_type | ||
1397 | else: | ||
1398 | raise NotImplementedError( | ||
1399 | 'Only the following pseudo-classes are implemented: nth-of-type.') | ||
1400 | |||
1401 | elif token == '*': | ||
1402 | # Star selector -- matches everything | ||
1403 | pass | ||
1404 | elif token == '>': | ||
1405 | # Run the next token as a CSS selector against the | ||
1406 | # direct children of each tag in the current context. | ||
1407 | recursive_candidate_generator = lambda tag: tag.children | ||
1408 | elif token == '~': | ||
1409 | # Run the next token as a CSS selector against the | ||
1410 | # siblings of each tag in the current context. | ||
1411 | recursive_candidate_generator = lambda tag: tag.next_siblings | ||
1412 | elif token == '+': | ||
1413 | # For each tag in the current context, run the next | ||
1414 | # token as a CSS selector against the tag's next | ||
1415 | # sibling that's a tag. | ||
1416 | def next_tag_sibling(tag): | ||
1417 | yield tag.find_next_sibling(True) | ||
1418 | recursive_candidate_generator = next_tag_sibling | ||
1419 | |||
1420 | elif self.tag_name_re.match(token): | ||
1421 | # Just a tag name. | ||
1422 | tag_name = token | ||
1423 | else: | ||
1424 | raise ValueError( | ||
1425 | 'Unsupported or invalid CSS selector: "%s"' % token) | ||
1426 | if recursive_candidate_generator: | ||
1427 | # This happens when the selector looks like "> foo". | ||
1428 | # | ||
1429 | # The generator calls select() recursively on every | ||
1430 | # member of the current context, passing in a different | ||
1431 | # candidate generator and a different selector. | ||
1432 | # | ||
1433 | # In the case of "> foo", the candidate generator is | ||
1434 | # one that yields a tag's direct children (">"), and | ||
1435 | # the selector is "foo". | ||
1436 | next_token = tokens[index+1] | ||
1437 | def recursive_select(tag): | ||
1438 | if self._select_debug: | ||
1439 | print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) | ||
1440 | print('-' * 40) | ||
1441 | for i in tag.select(next_token, recursive_candidate_generator): | ||
1442 | if self._select_debug: | ||
1443 | print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) | ||
1444 | yield i | ||
1445 | if self._select_debug: | ||
1446 | print('-' * 40) | ||
1447 | _use_candidate_generator = recursive_select | ||
1448 | elif _candidate_generator is None: | ||
1449 | # By default, a tag's candidates are all of its | ||
1450 | # children. If tag_name is defined, only yield tags | ||
1451 | # with that name. | ||
1452 | if self._select_debug: | ||
1453 | if tag_name: | ||
1454 | check = "[any]" | ||
1455 | else: | ||
1456 | check = tag_name | ||
1457 | print(' Default candidate generator, tag name="%s"' % check) | ||
1458 | if self._select_debug: | ||
1459 | # This is redundant with later code, but it stops | ||
1460 | # a bunch of bogus tags from cluttering up the | ||
1461 | # debug log. | ||
1462 | def default_candidate_generator(tag): | ||
1463 | for child in tag.descendants: | ||
1464 | if not isinstance(child, Tag): | ||
1465 | continue | ||
1466 | if tag_name and not child.name == tag_name: | ||
1467 | continue | ||
1468 | yield child | ||
1469 | _use_candidate_generator = default_candidate_generator | ||
1470 | else: | ||
1471 | _use_candidate_generator = lambda tag: tag.descendants | ||
1472 | else: | ||
1473 | _use_candidate_generator = _candidate_generator | ||
1474 | |||
1475 | count = 0 | ||
1476 | for tag in current_context: | ||
1477 | if self._select_debug: | ||
1478 | print(" Running candidate generator on %s %s" % ( | ||
1479 | tag.name, repr(tag.attrs))) | ||
1480 | for candidate in _use_candidate_generator(tag): | ||
1481 | if not isinstance(candidate, Tag): | ||
1482 | continue | ||
1483 | if tag_name and candidate.name != tag_name: | ||
1484 | continue | ||
1485 | if checker is not None: | ||
1486 | try: | ||
1487 | result = checker(candidate) | ||
1488 | except StopIteration: | ||
1489 | # The checker has decided we should no longer | ||
1490 | # run the generator. | ||
1491 | break | ||
1492 | if checker is None or result: | ||
1493 | if self._select_debug: | ||
1494 | print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) | ||
1495 | if id(candidate) not in new_context_ids: | ||
1496 | # If a tag matches a selector more than once, | ||
1497 | # don't include it in the context more than once. | ||
1498 | new_context.append(candidate) | ||
1499 | new_context_ids.add(id(candidate)) | ||
1500 | if limit and len(new_context) >= limit: | ||
1501 | break | ||
1502 | elif self._select_debug: | ||
1503 | print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) | ||
1504 | |||
1505 | |||
1506 | current_context = new_context | ||
1507 | |||
1508 | if self._select_debug: | ||
1509 | print("Final verdict:") | ||
1510 | for i in current_context: | ||
1511 | print(" %s %s" % (i.name, i.attrs)) | ||
1512 | return current_context | ||
1513 | 2122 | ||
1514 | # Old names for backwards compatibility | 2123 | # Old names for backwards compatibility |
1515 | def childGenerator(self): | 2124 | def childGenerator(self): |
2125 | """Deprecated generator.""" | ||
1516 | return self.children | 2126 | return self.children |
1517 | 2127 | ||
1518 | def recursiveChildGenerator(self): | 2128 | def recursiveChildGenerator(self): |
2129 | """Deprecated generator.""" | ||
1519 | return self.descendants | 2130 | return self.descendants |
1520 | 2131 | ||
1521 | def has_key(self, key): | 2132 | def has_key(self, key): |
1522 | """This was kind of misleading because has_key() (attributes) | 2133 | """Deprecated method. This was kind of misleading because has_key() |
1523 | was different from __in__ (contents). has_key() is gone in | 2134 | (attributes) was different from __in__ (contents). |
1524 | Python 3, anyway.""" | 2135 | |
1525 | warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( | 2136 | has_key() is gone in Python 3, anyway. |
1526 | key)) | 2137 | """ |
2138 | warnings.warn( | ||
2139 | 'has_key is deprecated. Use has_attr(key) instead.', | ||
2140 | DeprecationWarning, stacklevel=2 | ||
2141 | ) | ||
1527 | return self.has_attr(key) | 2142 | return self.has_attr(key) |
1528 | 2143 | ||
1529 | # Next, a couple classes to represent queries and their results. | 2144 | # Next, a couple classes to represent queries and their results. |
1530 | class SoupStrainer(object): | 2145 | class SoupStrainer(object): |
1531 | """Encapsulates a number of ways of matching a markup element (tag or | 2146 | """Encapsulates a number of ways of matching a markup element (tag or |
1532 | text).""" | 2147 | string). |
2148 | |||
2149 | This is primarily used to underpin the find_* methods, but you can | ||
2150 | create one yourself and pass it in as `parse_only` to the | ||
2151 | `BeautifulSoup` constructor, to parse a subset of a large | ||
2152 | document. | ||
2153 | """ | ||
2154 | |||
2155 | def __init__(self, name=None, attrs={}, string=None, **kwargs): | ||
2156 | """Constructor. | ||
2157 | |||
2158 | The SoupStrainer constructor takes the same arguments passed | ||
2159 | into the find_* methods. See the online documentation for | ||
2160 | detailed explanations. | ||
2161 | |||
2162 | :param name: A filter on tag name. | ||
2163 | :param attrs: A dictionary of filters on attribute values. | ||
2164 | :param string: A filter for a NavigableString with specific text. | ||
2165 | :kwargs: A dictionary of filters on attribute values. | ||
2166 | """ | ||
2167 | if string is None and 'text' in kwargs: | ||
2168 | string = kwargs.pop('text') | ||
2169 | warnings.warn( | ||
2170 | "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", | ||
2171 | DeprecationWarning, stacklevel=2 | ||
2172 | ) | ||
1533 | 2173 | ||
1534 | def __init__(self, name=None, attrs={}, text=None, **kwargs): | ||
1535 | self.name = self._normalize_search_value(name) | 2174 | self.name = self._normalize_search_value(name) |
1536 | if not isinstance(attrs, dict): | 2175 | if not isinstance(attrs, dict): |
1537 | # Treat a non-dict value for attrs as a search for the 'class' | 2176 | # Treat a non-dict value for attrs as a search for the 'class' |
@@ -1556,12 +2195,15 @@ class SoupStrainer(object): | |||
1556 | normalized_attrs[key] = self._normalize_search_value(value) | 2195 | normalized_attrs[key] = self._normalize_search_value(value) |
1557 | 2196 | ||
1558 | self.attrs = normalized_attrs | 2197 | self.attrs = normalized_attrs |
1559 | self.text = self._normalize_search_value(text) | 2198 | self.string = self._normalize_search_value(string) |
2199 | |||
2200 | # DEPRECATED but just in case someone is checking this. | ||
2201 | self.text = self.string | ||
1560 | 2202 | ||
1561 | def _normalize_search_value(self, value): | 2203 | def _normalize_search_value(self, value): |
1562 | # Leave it alone if it's a Unicode string, a callable, a | 2204 | # Leave it alone if it's a Unicode string, a callable, a |
1563 | # regular expression, a boolean, or None. | 2205 | # regular expression, a boolean, or None. |
1564 | if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match') | 2206 | if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') |
1565 | or isinstance(value, bool) or value is None): | 2207 | or isinstance(value, bool) or value is None): |
1566 | return value | 2208 | return value |
1567 | 2209 | ||
@@ -1589,19 +2231,40 @@ class SoupStrainer(object): | |||
1589 | return str(str(value)) | 2231 | return str(str(value)) |
1590 | 2232 | ||
1591 | def __str__(self): | 2233 | def __str__(self): |
1592 | if self.text: | 2234 | """A human-readable representation of this SoupStrainer.""" |
1593 | return self.text | 2235 | if self.string: |
2236 | return self.string | ||
1594 | else: | 2237 | else: |
1595 | return "%s|%s" % (self.name, self.attrs) | 2238 | return "%s|%s" % (self.name, self.attrs) |
1596 | 2239 | ||
1597 | def search_tag(self, markup_name=None, markup_attrs={}): | 2240 | def search_tag(self, markup_name=None, markup_attrs={}): |
2241 | """Check whether a Tag with the given name and attributes would | ||
2242 | match this SoupStrainer. | ||
2243 | |||
2244 | Used prospectively to decide whether to even bother creating a Tag | ||
2245 | object. | ||
2246 | |||
2247 | :param markup_name: A tag name as found in some markup. | ||
2248 | :param markup_attrs: A dictionary of attributes as found in some markup. | ||
2249 | |||
2250 | :return: True if the prospective tag would match this SoupStrainer; | ||
2251 | False otherwise. | ||
2252 | """ | ||
1598 | found = None | 2253 | found = None |
1599 | markup = None | 2254 | markup = None |
1600 | if isinstance(markup_name, Tag): | 2255 | if isinstance(markup_name, Tag): |
1601 | markup = markup_name | 2256 | markup = markup_name |
1602 | markup_attrs = markup | 2257 | markup_attrs = markup |
2258 | |||
2259 | if isinstance(self.name, str): | ||
2260 | # Optimization for a very common case where the user is | ||
2261 | # searching for a tag with one specific name, and we're | ||
2262 | # looking at a tag with a different name. | ||
2263 | if markup and not markup.prefix and self.name != markup.name: | ||
2264 | return False | ||
2265 | |||
1603 | call_function_with_tag_data = ( | 2266 | call_function_with_tag_data = ( |
1604 | isinstance(self.name, collections.abc.Callable) | 2267 | isinstance(self.name, Callable) |
1605 | and not isinstance(markup_name, Tag)) | 2268 | and not isinstance(markup_name, Tag)) |
1606 | 2269 | ||
1607 | if ((not self.name) | 2270 | if ((not self.name) |
@@ -1630,13 +2293,22 @@ class SoupStrainer(object): | |||
1630 | found = markup | 2293 | found = markup |
1631 | else: | 2294 | else: |
1632 | found = markup_name | 2295 | found = markup_name |
1633 | if found and self.text and not self._matches(found.string, self.text): | 2296 | if found and self.string and not self._matches(found.string, self.string): |
1634 | found = None | 2297 | found = None |
1635 | return found | 2298 | return found |
2299 | |||
2300 | # For BS3 compatibility. | ||
1636 | searchTag = search_tag | 2301 | searchTag = search_tag |
1637 | 2302 | ||
1638 | def search(self, markup): | 2303 | def search(self, markup): |
1639 | # print 'looking for %s in %s' % (self, markup) | 2304 | """Find all items in `markup` that match this SoupStrainer. |
2305 | |||
2306 | Used by the core _find_all() method, which is ultimately | ||
2307 | called by all find_* methods. | ||
2308 | |||
2309 | :param markup: A PageElement or a list of them. | ||
2310 | """ | ||
2311 | # print('looking for %s in %s' % (self, markup)) | ||
1640 | found = None | 2312 | found = None |
1641 | # If given a list of items, scan it for a text element that | 2313 | # If given a list of items, scan it for a text element that |
1642 | # matches. | 2314 | # matches. |
@@ -1649,49 +2321,44 @@ class SoupStrainer(object): | |||
1649 | # If it's a Tag, make sure its name or attributes match. | 2321 | # If it's a Tag, make sure its name or attributes match. |
1650 | # Don't bother with Tags if we're searching for text. | 2322 | # Don't bother with Tags if we're searching for text. |
1651 | elif isinstance(markup, Tag): | 2323 | elif isinstance(markup, Tag): |
1652 | if not self.text or self.name or self.attrs: | 2324 | if not self.string or self.name or self.attrs: |
1653 | found = self.search_tag(markup) | 2325 | found = self.search_tag(markup) |
1654 | # If it's text, make sure the text matches. | 2326 | # If it's text, make sure the text matches. |
1655 | elif isinstance(markup, NavigableString) or \ | 2327 | elif isinstance(markup, NavigableString) or \ |
1656 | isinstance(markup, str): | 2328 | isinstance(markup, str): |
1657 | if not self.name and not self.attrs and self._matches(markup, self.text): | 2329 | if not self.name and not self.attrs and self._matches(markup, self.string): |
1658 | found = markup | 2330 | found = markup |
1659 | else: | 2331 | else: |
1660 | raise Exception( | 2332 | raise Exception( |
1661 | "I don't know how to match against a %s" % markup.__class__) | 2333 | "I don't know how to match against a %s" % markup.__class__) |
1662 | return found | 2334 | return found |
1663 | 2335 | ||
1664 | def _matches(self, markup, match_against): | 2336 | def _matches(self, markup, match_against, already_tried=None): |
1665 | # print u"Matching %s against %s" % (markup, match_against) | 2337 | # print(u"Matching %s against %s" % (markup, match_against)) |
1666 | result = False | 2338 | result = False |
1667 | if isinstance(markup, list) or isinstance(markup, tuple): | 2339 | if isinstance(markup, list) or isinstance(markup, tuple): |
1668 | # This should only happen when searching a multi-valued attribute | 2340 | # This should only happen when searching a multi-valued attribute |
1669 | # like 'class'. | 2341 | # like 'class'. |
1670 | if (isinstance(match_against, str) | 2342 | for item in markup: |
1671 | and ' ' in match_against): | 2343 | if self._matches(item, match_against): |
1672 | # A bit of a special case. If they try to match "foo | 2344 | return True |
1673 | # bar" on a multivalue attribute's value, only accept | 2345 | # We didn't match any particular value of the multivalue |
1674 | # the literal value "foo bar" | 2346 | # attribute, but maybe we match the attribute value when |
1675 | # | 2347 | # considered as a string. |
1676 | # XXX This is going to be pretty slow because we keep | 2348 | if self._matches(' '.join(markup), match_against): |
1677 | # splitting match_against. But it shouldn't come up | 2349 | return True |
1678 | # too often. | 2350 | return False |
1679 | return (whitespace_re.split(match_against) == markup) | ||
1680 | else: | ||
1681 | for item in markup: | ||
1682 | if self._matches(item, match_against): | ||
1683 | return True | ||
1684 | return False | ||
1685 | 2351 | ||
1686 | if match_against is True: | 2352 | if match_against is True: |
1687 | # True matches any non-None value. | 2353 | # True matches any non-None value. |
1688 | return markup is not None | 2354 | return markup is not None |
1689 | 2355 | ||
1690 | if isinstance(match_against, collections.abc.Callable): | 2356 | if isinstance(match_against, Callable): |
1691 | return match_against(markup) | 2357 | return match_against(markup) |
1692 | 2358 | ||
1693 | # Custom callables take the tag as an argument, but all | 2359 | # Custom callables take the tag as an argument, but all |
1694 | # other ways of matching match the tag name as a string. | 2360 | # other ways of matching match the tag name as a string. |
2361 | original_markup = markup | ||
1695 | if isinstance(markup, Tag): | 2362 | if isinstance(markup, Tag): |
1696 | markup = markup.name | 2363 | markup = markup.name |
1697 | 2364 | ||
@@ -1702,23 +2369,67 @@ class SoupStrainer(object): | |||
1702 | # None matches None, False, an empty string, an empty list, and so on. | 2369 | # None matches None, False, an empty string, an empty list, and so on. |
1703 | return not match_against | 2370 | return not match_against |
1704 | 2371 | ||
1705 | if isinstance(match_against, str): | 2372 | if (hasattr(match_against, '__iter__') |
2373 | and not isinstance(match_against, str)): | ||
2374 | # We're asked to match against an iterable of items. | ||
2375 | # The markup must match at least one item in the | ||
2376 | # iterable. We'll try each one in turn. | ||
2377 | # | ||
2378 | # To avoid infinite recursion we need to keep track of | ||
2379 | # items we've already seen. | ||
2380 | if not already_tried: | ||
2381 | already_tried = set() | ||
2382 | for item in match_against: | ||
2383 | if item.__hash__: | ||
2384 | key = item | ||
2385 | else: | ||
2386 | key = id(item) | ||
2387 | if key in already_tried: | ||
2388 | continue | ||
2389 | else: | ||
2390 | already_tried.add(key) | ||
2391 | if self._matches(original_markup, item, already_tried): | ||
2392 | return True | ||
2393 | else: | ||
2394 | return False | ||
2395 | |||
2396 | # Beyond this point we might need to run the test twice: once against | ||
2397 | # the tag's name and once against its prefixed name. | ||
2398 | match = False | ||
2399 | |||
2400 | if not match and isinstance(match_against, str): | ||
1706 | # Exact string match | 2401 | # Exact string match |
1707 | return markup == match_against | 2402 | match = markup == match_against |
1708 | 2403 | ||
1709 | if hasattr(match_against, 'match'): | 2404 | if not match and hasattr(match_against, 'search'): |
1710 | # Regexp match | 2405 | # Regexp match |
1711 | return match_against.search(markup) | 2406 | return match_against.search(markup) |
1712 | 2407 | ||
1713 | if hasattr(match_against, '__iter__'): | 2408 | if (not match |
1714 | # The markup must be an exact match against something | 2409 | and isinstance(original_markup, Tag) |
1715 | # in the iterable. | 2410 | and original_markup.prefix): |
1716 | return markup in match_against | 2411 | # Try the whole thing again with the prefixed tag name. |
2412 | return self._matches( | ||
2413 | original_markup.prefix + ':' + original_markup.name, match_against | ||
2414 | ) | ||
2415 | |||
2416 | return match | ||
1717 | 2417 | ||
1718 | 2418 | ||
1719 | class ResultSet(list): | 2419 | class ResultSet(list): |
1720 | """A ResultSet is just a list that keeps track of the SoupStrainer | 2420 | """A ResultSet is just a list that keeps track of the SoupStrainer |
1721 | that created it.""" | 2421 | that created it.""" |
1722 | def __init__(self, source, result=()): | 2422 | def __init__(self, source, result=()): |
2423 | """Constructor. | ||
2424 | |||
2425 | :param source: A SoupStrainer. | ||
2426 | :param result: A list of PageElements. | ||
2427 | """ | ||
1723 | super(ResultSet, self).__init__(result) | 2428 | super(ResultSet, self).__init__(result) |
1724 | self.source = source | 2429 | self.source = source |
2430 | |||
2431 | def __getattr__(self, key): | ||
2432 | """Raise a helpful exception to explain a common code fix.""" | ||
2433 | raise AttributeError( | ||
2434 | "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key | ||
2435 | ) | ||