summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/element.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/element.py')
-rw-r--r--bitbake/lib/bs4/element.py2219
1 files changed, 1465 insertions, 754 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
index 68be42d138..0aefe734b2 100644
--- a/bitbake/lib/bs4/element.py
+++ b/bitbake/lib/bs4/element.py
@@ -1,14 +1,27 @@
1# Use of this source code is governed by the MIT license.
1__license__ = "MIT" 2__license__ = "MIT"
2 3
3import collections.abc 4try:
5 from collections.abc import Callable # Python 3.6
6except ImportError as e:
7 from collections import Callable
4import re 8import re
5import sys 9import sys
6import warnings 10import warnings
7from bs4.dammit import EntitySubstitution 11
12from bs4.css import CSS
13from bs4.formatter import (
14 Formatter,
15 HTMLFormatter,
16 XMLFormatter,
17)
8 18
9DEFAULT_OUTPUT_ENCODING = "utf-8" 19DEFAULT_OUTPUT_ENCODING = "utf-8"
10PY3K = (sys.version_info[0] > 2)
11 20
21nonwhitespace_re = re.compile(r"\S+")
22
23# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
24# the off chance someone imported it for their own use.
12whitespace_re = re.compile(r"\s+") 25whitespace_re = re.compile(r"\s+")
13 26
14def _alias(attr): 27def _alias(attr):
@@ -23,12 +36,49 @@ def _alias(attr):
23 return alias 36 return alias
24 37
25 38
39# These encodings are recognized by Python (so PageElement.encode
40# could theoretically support them) but XML and HTML don't recognize
41# them (so they should not show up in an XML or HTML document as that
42# document's encoding).
43#
44# If an XML document is encoded in one of these encodings, no encoding
45# will be mentioned in the XML declaration. If an HTML document is
46# encoded in one of these encodings, and the HTML document has a
47# <meta> tag that mentions an encoding, the encoding will be given as
48# the empty string.
49#
50# Source:
51# https://docs.python.org/3/library/codecs.html#python-specific-encodings
52PYTHON_SPECIFIC_ENCODINGS = set([
53 "idna",
54 "mbcs",
55 "oem",
56 "palmos",
57 "punycode",
58 "raw_unicode_escape",
59 "undefined",
60 "unicode_escape",
61 "raw-unicode-escape",
62 "unicode-escape",
63 "string-escape",
64 "string_escape",
65])
66
67
26class NamespacedAttribute(str): 68class NamespacedAttribute(str):
69 """A namespaced string (e.g. 'xml:lang') that remembers the namespace
70 ('xml') and the name ('lang') that were used to create it.
71 """
27 72
28 def __new__(cls, prefix, name, namespace=None): 73 def __new__(cls, prefix, name=None, namespace=None):
29 if name is None: 74 if not name:
75 # This is the default namespace. Its name "has no value"
76 # per https://www.w3.org/TR/xml-names/#defaulting
77 name = None
78
79 if not name:
30 obj = str.__new__(cls, prefix) 80 obj = str.__new__(cls, prefix)
31 elif prefix is None: 81 elif not prefix:
32 # Not really namespaced. 82 # Not really namespaced.
33 obj = str.__new__(cls, name) 83 obj = str.__new__(cls, name)
34 else: 84 else:
@@ -54,6 +104,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
54 return obj 104 return obj
55 105
56 def encode(self, encoding): 106 def encode(self, encoding):
107 """When an HTML document is being encoded to a given encoding, the
108 value of a meta tag's 'charset' is the name of the encoding.
109 """
110 if encoding in PYTHON_SPECIFIC_ENCODINGS:
111 return ''
57 return encoding 112 return encoding
58 113
59 114
@@ -79,118 +134,44 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
79 return obj 134 return obj
80 135
81 def encode(self, encoding): 136 def encode(self, encoding):
137 if encoding in PYTHON_SPECIFIC_ENCODINGS:
138 return ''
82 def rewrite(match): 139 def rewrite(match):
83 return match.group(1) + encoding 140 return match.group(1) + encoding
84 return self.CHARSET_RE.sub(rewrite, self.original_value) 141 return self.CHARSET_RE.sub(rewrite, self.original_value)
85 142
86class HTMLAwareEntitySubstitution(EntitySubstitution):
87
88 """Entity substitution rules that are aware of some HTML quirks.
89 143
90 Specifically, the contents of <script> and <style> tags should not 144class PageElement(object):
91 undergo entity substitution. 145 """Contains the navigational information for some part of the page:
146 that is, its current location in the parse tree.
92 147
93 Incoming NavigableString objects are checked to see if they're the 148 NavigableString, Tag, etc. are all subclasses of PageElement.
94 direct children of a <script> or <style> tag.
95 """ 149 """
96 150
97 cdata_containing_tags = set(["script", "style"]) 151 # In general, we can't tell just by looking at an element whether
152 # it's contained in an XML document or an HTML document. But for
153 # Tags (q.v.) we can store this information at parse time.
154 known_xml = None
98 155
99 preformatted_tags = set(["pre"]) 156 def setup(self, parent=None, previous_element=None, next_element=None,
100 157 previous_sibling=None, next_sibling=None):
101 @classmethod 158 """Sets up the initial relations between this element and
102 def _substitute_if_appropriate(cls, ns, f): 159 other elements.
103 if (isinstance(ns, NavigableString)
104 and ns.parent is not None
105 and ns.parent.name in cls.cdata_containing_tags):
106 # Do nothing.
107 return ns
108 # Substitute.
109 return f(ns)
110 160
111 @classmethod 161 :param parent: The parent of this element.
112 def substitute_html(cls, ns):
113 return cls._substitute_if_appropriate(
114 ns, EntitySubstitution.substitute_html)
115 162
116 @classmethod 163 :param previous_element: The element parsed immediately before
117 def substitute_xml(cls, ns): 164 this one.
118 return cls._substitute_if_appropriate(
119 ns, EntitySubstitution.substitute_xml)
120 165
121class PageElement(object): 166 :param next_element: The element parsed immediately after
122 """Contains the navigational information for some part of the page 167 this one.
123 (either a tag or a piece of text)"""
124
125 # There are five possible values for the "formatter" argument passed in
126 # to methods like encode() and prettify():
127 #
128 # "html" - All Unicode characters with corresponding HTML entities
129 # are converted to those entities on output.
130 # "minimal" - Bare ampersands and angle brackets are converted to
131 # XML entities: &amp; &lt; &gt;
132 # None - The null formatter. Unicode characters are never
133 # converted to entities. This is not recommended, but it's
134 # faster than "minimal".
135 # A function - This function will be called on every string that
136 # needs to undergo entity substitution.
137 #
138
139 # In an HTML document, the default "html" and "minimal" functions
140 # will leave the contents of <script> and <style> tags alone. For
141 # an XML document, all tags will be given the same treatment.
142
143 HTML_FORMATTERS = {
144 "html" : HTMLAwareEntitySubstitution.substitute_html,
145 "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
146 None : None
147 }
148
149 XML_FORMATTERS = {
150 "html" : EntitySubstitution.substitute_html,
151 "minimal" : EntitySubstitution.substitute_xml,
152 None : None
153 }
154
155 def format_string(self, s, formatter='minimal'):
156 """Format the given string using the given formatter."""
157 if not isinstance(formatter, collections.abc.Callable):
158 formatter = self._formatter_for_name(formatter)
159 if formatter is None:
160 output = s
161 else:
162 output = formatter(s)
163 return output
164 168
165 @property 169 :param previous_sibling: The most recently encountered element
166 def _is_xml(self): 170 on the same level of the parse tree as this one.
167 """Is this element part of an XML tree or an HTML tree?
168 171
169 This is used when mapping a formatter name ("minimal") to an 172 :param next_sibling: The next element to be encountered
170 appropriate function (one that performs entity-substitution on 173 on the same level of the parse tree as this one.
171 the contents of <script> and <style> tags, or not). It's
172 inefficient, but it should be called very rarely.
173 """ 174 """
174 if self.parent is None:
175 # This is the top-level object. It should have .is_xml set
176 # from tree creation. If not, take a guess--BS is usually
177 # used on HTML markup.
178 return getattr(self, 'is_xml', False)
179 return self.parent._is_xml
180
181 def _formatter_for_name(self, name):
182 "Look up a formatter function based on its name and the tree."
183 if self._is_xml:
184 return self.XML_FORMATTERS.get(
185 name, EntitySubstitution.substitute_xml)
186 else:
187 return self.HTML_FORMATTERS.get(
188 name, HTMLAwareEntitySubstitution.substitute_xml)
189
190 def setup(self, parent=None, previous_element=None, next_element=None,
191 previous_sibling=None, next_sibling=None):
192 """Sets up the initial relations between this element and
193 other elements."""
194 self.parent = parent 175 self.parent = parent
195 176
196 self.previous_element = previous_element 177 self.previous_element = previous_element
@@ -198,48 +179,156 @@ class PageElement(object):
198 self.previous_element.next_element = self 179 self.previous_element.next_element = self
199 180
200 self.next_element = next_element 181 self.next_element = next_element
201 if self.next_element: 182 if self.next_element is not None:
202 self.next_element.previous_element = self 183 self.next_element.previous_element = self
203 184
204 self.next_sibling = next_sibling 185 self.next_sibling = next_sibling
205 if self.next_sibling: 186 if self.next_sibling is not None:
206 self.next_sibling.previous_sibling = self 187 self.next_sibling.previous_sibling = self
207 188
208 if (not previous_sibling 189 if (previous_sibling is None
209 and self.parent is not None and self.parent.contents): 190 and self.parent is not None and self.parent.contents):
210 previous_sibling = self.parent.contents[-1] 191 previous_sibling = self.parent.contents[-1]
211 192
212 self.previous_sibling = previous_sibling 193 self.previous_sibling = previous_sibling
213 if previous_sibling: 194 if previous_sibling is not None:
214 self.previous_sibling.next_sibling = self 195 self.previous_sibling.next_sibling = self
215 196
197 def format_string(self, s, formatter):
198 """Format the given string using the given formatter.
199
200 :param s: A string.
201 :param formatter: A Formatter object, or a string naming one of the standard formatters.
202 """
203 if formatter is None:
204 return s
205 if not isinstance(formatter, Formatter):
206 formatter = self.formatter_for_name(formatter)
207 output = formatter.substitute(s)
208 return output
209
210 def formatter_for_name(self, formatter):
211 """Look up or create a Formatter for the given identifier,
212 if necessary.
213
214 :param formatter: Can be a Formatter object (used as-is), a
215 function (used as the entity substitution hook for an
216 XMLFormatter or HTMLFormatter), or a string (used to look
217 up an XMLFormatter or HTMLFormatter in the appropriate
218 registry).
219 """
220 if isinstance(formatter, Formatter):
221 return formatter
222 if self._is_xml:
223 c = XMLFormatter
224 else:
225 c = HTMLFormatter
226 if isinstance(formatter, Callable):
227 return c(entity_substitution=formatter)
228 return c.REGISTRY[formatter]
229
230 @property
231 def _is_xml(self):
232 """Is this element part of an XML tree or an HTML tree?
233
234 This is used in formatter_for_name, when deciding whether an
235 XMLFormatter or HTMLFormatter is more appropriate. It can be
236 inefficient, but it should be called very rarely.
237 """
238 if self.known_xml is not None:
239 # Most of the time we will have determined this when the
240 # document is parsed.
241 return self.known_xml
242
243 # Otherwise, it's likely that this element was created by
244 # direct invocation of the constructor from within the user's
245 # Python code.
246 if self.parent is None:
247 # This is the top-level object. It should have .known_xml set
248 # from tree creation. If not, take a guess--BS is usually
249 # used on HTML markup.
250 return getattr(self, 'is_xml', False)
251 return self.parent._is_xml
252
216 nextSibling = _alias("next_sibling") # BS3 253 nextSibling = _alias("next_sibling") # BS3
217 previousSibling = _alias("previous_sibling") # BS3 254 previousSibling = _alias("previous_sibling") # BS3
218 255
219 def replace_with(self, replace_with): 256 default = object()
220 if not self.parent: 257 def _all_strings(self, strip=False, types=default):
258 """Yield all strings of certain classes, possibly stripping them.
259
260 This is implemented differently in Tag and NavigableString.
261 """
262 raise NotImplementedError()
263
264 @property
265 def stripped_strings(self):
266 """Yield all strings in this PageElement, stripping them first.
267
268 :yield: A sequence of stripped strings.
269 """
270 for string in self._all_strings(True):
271 yield string
272
273 def get_text(self, separator="", strip=False,
274 types=default):
275 """Get all child strings of this PageElement, concatenated using the
276 given separator.
277
278 :param separator: Strings will be concatenated using this separator.
279
280 :param strip: If True, strings will be stripped before being
281 concatenated.
282
283 :param types: A tuple of NavigableString subclasses. Any
284 strings of a subclass not found in this list will be
285 ignored. Although there are exceptions, the default
286 behavior in most cases is to consider only NavigableString
287 and CData objects. That means no comments, processing
288 instructions, etc.
289
290 :return: A string.
291 """
292 return separator.join([s for s in self._all_strings(
293 strip, types=types)])
294 getText = get_text
295 text = property(get_text)
296
297 def replace_with(self, *args):
298 """Replace this PageElement with one or more PageElements, keeping the
299 rest of the tree the same.
300
301 :param args: One or more PageElements.
302 :return: `self`, no longer part of the tree.
303 """
304 if self.parent is None:
221 raise ValueError( 305 raise ValueError(
222 "Cannot replace one element with another when the" 306 "Cannot replace one element with another when the "
223 "element to be replaced is not part of a tree.") 307 "element to be replaced is not part of a tree.")
224 if replace_with is self: 308 if len(args) == 1 and args[0] is self:
225 return 309 return
226 if replace_with is self.parent: 310 if any(x is self.parent for x in args):
227 raise ValueError("Cannot replace a Tag with its parent.") 311 raise ValueError("Cannot replace a Tag with its parent.")
228 old_parent = self.parent 312 old_parent = self.parent
229 my_index = self.parent.index(self) 313 my_index = self.parent.index(self)
230 self.extract() 314 self.extract(_self_index=my_index)
231 old_parent.insert(my_index, replace_with) 315 for idx, replace_with in enumerate(args, start=my_index):
316 old_parent.insert(idx, replace_with)
232 return self 317 return self
233 replaceWith = replace_with # BS3 318 replaceWith = replace_with # BS3
234 319
235 def unwrap(self): 320 def unwrap(self):
321 """Replace this PageElement with its contents.
322
323 :return: `self`, no longer part of the tree.
324 """
236 my_parent = self.parent 325 my_parent = self.parent
237 if not self.parent: 326 if self.parent is None:
238 raise ValueError( 327 raise ValueError(
239 "Cannot replace an element with its contents when that" 328 "Cannot replace an element with its contents when that"
240 "element is not part of a tree.") 329 "element is not part of a tree.")
241 my_index = self.parent.index(self) 330 my_index = self.parent.index(self)
242 self.extract() 331 self.extract(_self_index=my_index)
243 for child in reversed(self.contents[:]): 332 for child in reversed(self.contents[:]):
244 my_parent.insert(my_index, child) 333 my_parent.insert(my_index, child)
245 return self 334 return self
@@ -247,14 +336,29 @@ class PageElement(object):
247 replaceWithChildren = unwrap # BS3 336 replaceWithChildren = unwrap # BS3
248 337
249 def wrap(self, wrap_inside): 338 def wrap(self, wrap_inside):
339 """Wrap this PageElement inside another one.
340
341 :param wrap_inside: A PageElement.
342 :return: `wrap_inside`, occupying the position in the tree that used
343 to be occupied by `self`, and with `self` inside it.
344 """
250 me = self.replace_with(wrap_inside) 345 me = self.replace_with(wrap_inside)
251 wrap_inside.append(me) 346 wrap_inside.append(me)
252 return wrap_inside 347 return wrap_inside
253 348
254 def extract(self): 349 def extract(self, _self_index=None):
255 """Destructively rips this element out of the tree.""" 350 """Destructively rips this element out of the tree.
351
352 :param _self_index: The location of this element in its parent's
353 .contents, if known. Passing this in allows for a performance
354 optimization.
355
356 :return: `self`, no longer part of the tree.
357 """
256 if self.parent is not None: 358 if self.parent is not None:
257 del self.parent.contents[self.parent.index(self)] 359 if _self_index is None:
360 _self_index = self.parent.index(self)
361 del self.parent.contents[_self_index]
258 362
259 #Find the two elements that would be next to each other if 363 #Find the two elements that would be next to each other if
260 #this element (and any children) hadn't been parsed. Connect 364 #this element (and any children) hadn't been parsed. Connect
@@ -281,8 +385,13 @@ class PageElement(object):
281 return self 385 return self
282 386
283 def _last_descendant(self, is_initialized=True, accept_self=True): 387 def _last_descendant(self, is_initialized=True, accept_self=True):
284 "Finds the last element beneath this object to be parsed." 388 """Finds the last element beneath this object to be parsed.
285 if is_initialized and self.next_sibling: 389
390 :param is_initialized: Has `setup` been called on this PageElement
391 yet?
392 :param accept_self: Is `self` an acceptable answer to the question?
393 """
394 if is_initialized and self.next_sibling is not None:
286 last_child = self.next_sibling.previous_element 395 last_child = self.next_sibling.previous_element
287 else: 396 else:
288 last_child = self 397 last_child = self
@@ -295,6 +404,14 @@ class PageElement(object):
295 _lastRecursiveChild = _last_descendant 404 _lastRecursiveChild = _last_descendant
296 405
297 def insert(self, position, new_child): 406 def insert(self, position, new_child):
407 """Insert a new PageElement in the list of this PageElement's children.
408
409 This works the same way as `list.insert`.
410
411 :param position: The numeric position that should be occupied
412 in `self.contents` by the new PageElement.
413 :param new_child: A PageElement.
414 """
298 if new_child is None: 415 if new_child is None:
299 raise ValueError("Cannot insert None into a tag.") 416 raise ValueError("Cannot insert None into a tag.")
300 if new_child is self: 417 if new_child is self:
@@ -303,6 +420,14 @@ class PageElement(object):
303 and not isinstance(new_child, NavigableString)): 420 and not isinstance(new_child, NavigableString)):
304 new_child = NavigableString(new_child) 421 new_child = NavigableString(new_child)
305 422
423 from bs4 import BeautifulSoup
424 if isinstance(new_child, BeautifulSoup):
425 # We don't want to end up with a situation where one BeautifulSoup
426 # object contains another. Insert the children one at a time.
427 for subchild in list(new_child.contents):
428 self.insert(position, subchild)
429 position += 1
430 return
306 position = min(position, len(self.contents)) 431 position = min(position, len(self.contents))
307 if hasattr(new_child, 'parent') and new_child.parent is not None: 432 if hasattr(new_child, 'parent') and new_child.parent is not None:
308 # We're 'inserting' an element that's already one 433 # We're 'inserting' an element that's already one
@@ -361,160 +486,326 @@ class PageElement(object):
361 self.contents.insert(position, new_child) 486 self.contents.insert(position, new_child)
362 487
363 def append(self, tag): 488 def append(self, tag):
364 """Appends the given tag to the contents of this tag.""" 489 """Appends the given PageElement to the contents of this one.
490
491 :param tag: A PageElement.
492 """
365 self.insert(len(self.contents), tag) 493 self.insert(len(self.contents), tag)
366 494
367 def insert_before(self, predecessor): 495 def extend(self, tags):
368 """Makes the given element the immediate predecessor of this one. 496 """Appends the given PageElements to this one's contents.
369 497
370 The two elements will have the same parent, and the given element 498 :param tags: A list of PageElements. If a single Tag is
499 provided instead, this PageElement's contents will be extended
500 with that Tag's contents.
501 """
502 if isinstance(tags, Tag):
503 tags = tags.contents
504 if isinstance(tags, list):
505 # Moving items around the tree may change their position in
506 # the original list. Make a list that won't change.
507 tags = list(tags)
508 for tag in tags:
509 self.append(tag)
510
511 def insert_before(self, *args):
512 """Makes the given element(s) the immediate predecessor of this one.
513
514 All the elements will have the same parent, and the given elements
371 will be immediately before this one. 515 will be immediately before this one.
516
517 :param args: One or more PageElements.
372 """ 518 """
373 if self is predecessor:
374 raise ValueError("Can't insert an element before itself.")
375 parent = self.parent 519 parent = self.parent
376 if parent is None: 520 if parent is None:
377 raise ValueError( 521 raise ValueError(
378 "Element has no parent, so 'before' has no meaning.") 522 "Element has no parent, so 'before' has no meaning.")
379 # Extract first so that the index won't be screwed up if they 523 if any(x is self for x in args):
380 # are siblings. 524 raise ValueError("Can't insert an element before itself.")
381 if isinstance(predecessor, PageElement): 525 for predecessor in args:
382 predecessor.extract() 526 # Extract first so that the index won't be screwed up if they
383 index = parent.index(self) 527 # are siblings.
384 parent.insert(index, predecessor) 528 if isinstance(predecessor, PageElement):
385 529 predecessor.extract()
386 def insert_after(self, successor): 530 index = parent.index(self)
387 """Makes the given element the immediate successor of this one. 531 parent.insert(index, predecessor)
388 532
389 The two elements will have the same parent, and the given element 533 def insert_after(self, *args):
534 """Makes the given element(s) the immediate successor of this one.
535
536 The elements will have the same parent, and the given elements
390 will be immediately after this one. 537 will be immediately after this one.
538
539 :param args: One or more PageElements.
391 """ 540 """
392 if self is successor: 541 # Do all error checking before modifying the tree.
393 raise ValueError("Can't insert an element after itself.")
394 parent = self.parent 542 parent = self.parent
395 if parent is None: 543 if parent is None:
396 raise ValueError( 544 raise ValueError(
397 "Element has no parent, so 'after' has no meaning.") 545 "Element has no parent, so 'after' has no meaning.")
398 # Extract first so that the index won't be screwed up if they 546 if any(x is self for x in args):
399 # are siblings. 547 raise ValueError("Can't insert an element after itself.")
400 if isinstance(successor, PageElement): 548
401 successor.extract() 549 offset = 0
402 index = parent.index(self) 550 for successor in args:
403 parent.insert(index+1, successor) 551 # Extract first so that the index won't be screwed up if they
404 552 # are siblings.
405 def find_next(self, name=None, attrs={}, text=None, **kwargs): 553 if isinstance(successor, PageElement):
406 """Returns the first item that matches the given criteria and 554 successor.extract()
407 appears after this Tag in the document.""" 555 index = parent.index(self)
408 return self._find_one(self.find_all_next, name, attrs, text, **kwargs) 556 parent.insert(index+1+offset, successor)
557 offset += 1
558
559 def find_next(self, name=None, attrs={}, string=None, **kwargs):
560 """Find the first PageElement that matches the given criteria and
561 appears later in the document than this PageElement.
562
563 All find_* methods take a common set of arguments. See the online
564 documentation for detailed explanations.
565
566 :param name: A filter on tag name.
567 :param attrs: A dictionary of filters on attribute values.
568 :param string: A filter for a NavigableString with specific text.
569 :kwargs: A dictionary of filters on attribute values.
570 :return: A PageElement.
571 :rtype: bs4.element.Tag | bs4.element.NavigableString
572 """
573 return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
409 findNext = find_next # BS3 574 findNext = find_next # BS3
410 575
411 def find_all_next(self, name=None, attrs={}, text=None, limit=None, 576 def find_all_next(self, name=None, attrs={}, string=None, limit=None,
412 **kwargs): 577 **kwargs):
413 """Returns all items that match the given criteria and appear 578 """Find all PageElements that match the given criteria and appear
414 after this Tag in the document.""" 579 later in the document than this PageElement.
415 return self._find_all(name, attrs, text, limit, self.next_elements, 580
416 **kwargs) 581 All find_* methods take a common set of arguments. See the online
582 documentation for detailed explanations.
583
584 :param name: A filter on tag name.
585 :param attrs: A dictionary of filters on attribute values.
586 :param string: A filter for a NavigableString with specific text.
587 :param limit: Stop looking after finding this many results.
588 :kwargs: A dictionary of filters on attribute values.
589 :return: A ResultSet containing PageElements.
590 """
591 _stacklevel = kwargs.pop('_stacklevel', 2)
592 return self._find_all(name, attrs, string, limit, self.next_elements,
593 _stacklevel=_stacklevel+1, **kwargs)
417 findAllNext = find_all_next # BS3 594 findAllNext = find_all_next # BS3
418 595
419 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): 596 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
420 """Returns the closest sibling to this Tag that matches the 597 """Find the closest sibling to this PageElement that matches the
421 given criteria and appears after this Tag in the document.""" 598 given criteria and appears later in the document.
422 return self._find_one(self.find_next_siblings, name, attrs, text, 599
600 All find_* methods take a common set of arguments. See the
601 online documentation for detailed explanations.
602
603 :param name: A filter on tag name.
604 :param attrs: A dictionary of filters on attribute values.
605 :param string: A filter for a NavigableString with specific text.
606 :kwargs: A dictionary of filters on attribute values.
607 :return: A PageElement.
608 :rtype: bs4.element.Tag | bs4.element.NavigableString
609 """
610 return self._find_one(self.find_next_siblings, name, attrs, string,
423 **kwargs) 611 **kwargs)
424 findNextSibling = find_next_sibling # BS3 612 findNextSibling = find_next_sibling # BS3
425 613
426 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, 614 def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
427 **kwargs): 615 **kwargs):
428 """Returns the siblings of this Tag that match the given 616 """Find all siblings of this PageElement that match the given criteria
429 criteria and appear after this Tag in the document.""" 617 and appear later in the document.
430 return self._find_all(name, attrs, text, limit, 618
431 self.next_siblings, **kwargs) 619 All find_* methods take a common set of arguments. See the online
620 documentation for detailed explanations.
621
622 :param name: A filter on tag name.
623 :param attrs: A dictionary of filters on attribute values.
624 :param string: A filter for a NavigableString with specific text.
625 :param limit: Stop looking after finding this many results.
626 :kwargs: A dictionary of filters on attribute values.
627 :return: A ResultSet of PageElements.
628 :rtype: bs4.element.ResultSet
629 """
630 _stacklevel = kwargs.pop('_stacklevel', 2)
631 return self._find_all(
632 name, attrs, string, limit,
633 self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
634 )
432 findNextSiblings = find_next_siblings # BS3 635 findNextSiblings = find_next_siblings # BS3
433 fetchNextSiblings = find_next_siblings # BS2 636 fetchNextSiblings = find_next_siblings # BS2
434 637
435 def find_previous(self, name=None, attrs={}, text=None, **kwargs): 638 def find_previous(self, name=None, attrs={}, string=None, **kwargs):
436 """Returns the first item that matches the given criteria and 639 """Look backwards in the document from this PageElement and find the
437 appears before this Tag in the document.""" 640 first PageElement that matches the given criteria.
641
642 All find_* methods take a common set of arguments. See the online
643 documentation for detailed explanations.
644
645 :param name: A filter on tag name.
646 :param attrs: A dictionary of filters on attribute values.
647 :param string: A filter for a NavigableString with specific text.
648 :kwargs: A dictionary of filters on attribute values.
649 :return: A PageElement.
650 :rtype: bs4.element.Tag | bs4.element.NavigableString
651 """
438 return self._find_one( 652 return self._find_one(
439 self.find_all_previous, name, attrs, text, **kwargs) 653 self.find_all_previous, name, attrs, string, **kwargs)
440 findPrevious = find_previous # BS3 654 findPrevious = find_previous # BS3
441 655
442 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, 656 def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
443 **kwargs): 657 **kwargs):
444 """Returns all items that match the given criteria and appear 658 """Look backwards in the document from this PageElement and find all
445 before this Tag in the document.""" 659 PageElements that match the given criteria.
446 return self._find_all(name, attrs, text, limit, self.previous_elements, 660
447 **kwargs) 661 All find_* methods take a common set of arguments. See the online
662 documentation for detailed explanations.
663
664 :param name: A filter on tag name.
665 :param attrs: A dictionary of filters on attribute values.
666 :param string: A filter for a NavigableString with specific text.
667 :param limit: Stop looking after finding this many results.
668 :kwargs: A dictionary of filters on attribute values.
669 :return: A ResultSet of PageElements.
670 :rtype: bs4.element.ResultSet
671 """
672 _stacklevel = kwargs.pop('_stacklevel', 2)
673 return self._find_all(
674 name, attrs, string, limit, self.previous_elements,
675 _stacklevel=_stacklevel+1, **kwargs
676 )
448 findAllPrevious = find_all_previous # BS3 677 findAllPrevious = find_all_previous # BS3
449 fetchPrevious = find_all_previous # BS2 678 fetchPrevious = find_all_previous # BS2
450 679
451 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): 680 def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
452 """Returns the closest sibling to this Tag that matches the 681 """Returns the closest sibling to this PageElement that matches the
453 given criteria and appears before this Tag in the document.""" 682 given criteria and appears earlier in the document.
454 return self._find_one(self.find_previous_siblings, name, attrs, text, 683
684 All find_* methods take a common set of arguments. See the online
685 documentation for detailed explanations.
686
687 :param name: A filter on tag name.
688 :param attrs: A dictionary of filters on attribute values.
689 :param string: A filter for a NavigableString with specific text.
690 :kwargs: A dictionary of filters on attribute values.
691 :return: A PageElement.
692 :rtype: bs4.element.Tag | bs4.element.NavigableString
693 """
694 return self._find_one(self.find_previous_siblings, name, attrs, string,
455 **kwargs) 695 **kwargs)
456 findPreviousSibling = find_previous_sibling # BS3 696 findPreviousSibling = find_previous_sibling # BS3
457 697
458 def find_previous_siblings(self, name=None, attrs={}, text=None, 698 def find_previous_siblings(self, name=None, attrs={}, string=None,
459 limit=None, **kwargs): 699 limit=None, **kwargs):
460 """Returns the siblings of this Tag that match the given 700 """Returns all siblings to this PageElement that match the
461 criteria and appear before this Tag in the document.""" 701 given criteria and appear earlier in the document.
462 return self._find_all(name, attrs, text, limit, 702
463 self.previous_siblings, **kwargs) 703 All find_* methods take a common set of arguments. See the online
704 documentation for detailed explanations.
705
706 :param name: A filter on tag name.
707 :param attrs: A dictionary of filters on attribute values.
708 :param string: A filter for a NavigableString with specific text.
709 :param limit: Stop looking after finding this many results.
710 :kwargs: A dictionary of filters on attribute values.
711 :return: A ResultSet of PageElements.
712 :rtype: bs4.element.ResultSet
713 """
714 _stacklevel = kwargs.pop('_stacklevel', 2)
715 return self._find_all(
716 name, attrs, string, limit,
717 self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
718 )
464 findPreviousSiblings = find_previous_siblings # BS3 719 findPreviousSiblings = find_previous_siblings # BS3
465 fetchPreviousSiblings = find_previous_siblings # BS2 720 fetchPreviousSiblings = find_previous_siblings # BS2
466 721
467 def find_parent(self, name=None, attrs={}, **kwargs): 722 def find_parent(self, name=None, attrs={}, **kwargs):
468 """Returns the closest parent of this Tag that matches the given 723 """Find the closest parent of this PageElement that matches the given
469 criteria.""" 724 criteria.
725
726 All find_* methods take a common set of arguments. See the online
727 documentation for detailed explanations.
728
729 :param name: A filter on tag name.
730 :param attrs: A dictionary of filters on attribute values.
731 :kwargs: A dictionary of filters on attribute values.
732
733 :return: A PageElement.
734 :rtype: bs4.element.Tag | bs4.element.NavigableString
735 """
470 # NOTE: We can't use _find_one because findParents takes a different 736 # NOTE: We can't use _find_one because findParents takes a different
471 # set of arguments. 737 # set of arguments.
472 r = None 738 r = None
473 l = self.find_parents(name, attrs, 1, **kwargs) 739 l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
474 if l: 740 if l:
475 r = l[0] 741 r = l[0]
476 return r 742 return r
477 findParent = find_parent # BS3 743 findParent = find_parent # BS3
478 744
479 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 745 def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
480 """Returns the parents of this Tag that match the given 746 """Find all parents of this PageElement that match the given criteria.
481 criteria.""" 747
748 All find_* methods take a common set of arguments. See the online
749 documentation for detailed explanations.
482 750
751 :param name: A filter on tag name.
752 :param attrs: A dictionary of filters on attribute values.
753 :param limit: Stop looking after finding this many results.
754 :kwargs: A dictionary of filters on attribute values.
755
756 :return: A PageElement.
757 :rtype: bs4.element.Tag | bs4.element.NavigableString
758 """
759 _stacklevel = kwargs.pop('_stacklevel', 2)
483 return self._find_all(name, attrs, None, limit, self.parents, 760 return self._find_all(name, attrs, None, limit, self.parents,
484 **kwargs) 761 _stacklevel=_stacklevel+1, **kwargs)
485 findParents = find_parents # BS3 762 findParents = find_parents # BS3
486 fetchParents = find_parents # BS2 763 fetchParents = find_parents # BS2
487 764
488 @property 765 @property
489 def next(self): 766 def next(self):
767 """The PageElement, if any, that was parsed just after this one.
768
769 :return: A PageElement.
770 :rtype: bs4.element.Tag | bs4.element.NavigableString
771 """
490 return self.next_element 772 return self.next_element
491 773
492 @property 774 @property
493 def previous(self): 775 def previous(self):
776 """The PageElement, if any, that was parsed just before this one.
777
778 :return: A PageElement.
779 :rtype: bs4.element.Tag | bs4.element.NavigableString
780 """
494 return self.previous_element 781 return self.previous_element
495 782
496 #These methods do the real heavy lifting. 783 #These methods do the real heavy lifting.
497 784
498 def _find_one(self, method, name, attrs, text, **kwargs): 785 def _find_one(self, method, name, attrs, string, **kwargs):
499 r = None 786 r = None
500 l = method(name, attrs, text, 1, **kwargs) 787 l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
501 if l: 788 if l:
502 r = l[0] 789 r = l[0]
503 return r 790 return r
504 791
505 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 792 def _find_all(self, name, attrs, string, limit, generator, **kwargs):
506 "Iterates over a generator looking for things that match." 793 "Iterates over a generator looking for things that match."
794 _stacklevel = kwargs.pop('_stacklevel', 3)
507 795
508 if text is None and 'string' in kwargs: 796 if string is None and 'text' in kwargs:
509 text = kwargs['string'] 797 string = kwargs.pop('text')
510 del kwargs['string'] 798 warnings.warn(
799 "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
800 DeprecationWarning, stacklevel=_stacklevel
801 )
511 802
512 if isinstance(name, SoupStrainer): 803 if isinstance(name, SoupStrainer):
513 strainer = name 804 strainer = name
514 else: 805 else:
515 strainer = SoupStrainer(name, attrs, text, **kwargs) 806 strainer = SoupStrainer(name, attrs, string, **kwargs)
516 807
517 if text is None and not limit and not attrs and not kwargs: 808 if string is None and not limit and not attrs and not kwargs:
518 if name is True or name is None: 809 if name is True or name is None:
519 # Optimization to find all tags. 810 # Optimization to find all tags.
520 result = (element for element in generator 811 result = (element for element in generator
@@ -522,9 +813,23 @@ class PageElement(object):
522 return ResultSet(strainer, result) 813 return ResultSet(strainer, result)
523 elif isinstance(name, str): 814 elif isinstance(name, str):
524 # Optimization to find all tags with a given name. 815 # Optimization to find all tags with a given name.
816 if name.count(':') == 1:
817 # This is a name with a prefix. If this is a namespace-aware document,
818 # we need to match the local name against tag.name. If not,
819 # we need to match the fully-qualified name against tag.name.
820 prefix, local_name = name.split(':', 1)
821 else:
822 prefix = None
823 local_name = name
525 result = (element for element in generator 824 result = (element for element in generator
526 if isinstance(element, Tag) 825 if isinstance(element, Tag)
527 and element.name == name) 826 and (
827 element.name == name
828 ) or (
829 element.name == local_name
830 and (prefix is None or element.prefix == prefix)
831 )
832 )
528 return ResultSet(strainer, result) 833 return ResultSet(strainer, result)
529 results = ResultSet(strainer) 834 results = ResultSet(strainer)
530 while True: 835 while True:
@@ -544,6 +849,10 @@ class PageElement(object):
544 #NavigableStrings and Tags. 849 #NavigableStrings and Tags.
545 @property 850 @property
546 def next_elements(self): 851 def next_elements(self):
852 """All PageElements that were parsed after this one.
853
854 :yield: A sequence of PageElements.
855 """
547 i = self.next_element 856 i = self.next_element
548 while i is not None: 857 while i is not None:
549 yield i 858 yield i
@@ -551,6 +860,11 @@ class PageElement(object):
551 860
552 @property 861 @property
553 def next_siblings(self): 862 def next_siblings(self):
863 """All PageElements that are siblings of this one but were parsed
864 later.
865
866 :yield: A sequence of PageElements.
867 """
554 i = self.next_sibling 868 i = self.next_sibling
555 while i is not None: 869 while i is not None:
556 yield i 870 yield i
@@ -558,6 +872,10 @@ class PageElement(object):
558 872
559 @property 873 @property
560 def previous_elements(self): 874 def previous_elements(self):
875 """All PageElements that were parsed before this one.
876
877 :yield: A sequence of PageElements.
878 """
561 i = self.previous_element 879 i = self.previous_element
562 while i is not None: 880 while i is not None:
563 yield i 881 yield i
@@ -565,6 +883,11 @@ class PageElement(object):
565 883
566 @property 884 @property
567 def previous_siblings(self): 885 def previous_siblings(self):
886 """All PageElements that are siblings of this one but were parsed
887 earlier.
888
889 :yield: A sequence of PageElements.
890 """
568 i = self.previous_sibling 891 i = self.previous_sibling
569 while i is not None: 892 while i is not None:
570 yield i 893 yield i
@@ -572,87 +895,23 @@ class PageElement(object):
572 895
573 @property 896 @property
574 def parents(self): 897 def parents(self):
898 """All PageElements that are parents of this PageElement.
899
900 :yield: A sequence of PageElements.
901 """
575 i = self.parent 902 i = self.parent
576 while i is not None: 903 while i is not None:
577 yield i 904 yield i
578 i = i.parent 905 i = i.parent
579 906
580 # Methods for supporting CSS selectors. 907 @property
581 908 def decomposed(self):
582 tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') 909 """Check whether a PageElement has been decomposed.
583
584 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
585 # \---------------------------/ \---/\-------------/ \-------/
586 # | | | |
587 # | | | The value
588 # | | ~,|,^,$,* or =
589 # | Attribute
590 # Tag
591 attribselect_re = re.compile(
592 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
593 r'=?"?(?P<value>[^\]"]*)"?\]$'
594 )
595
596 def _attr_value_as_string(self, value, default=None):
597 """Force an attribute value into a string representation.
598 910
599 A multi-valued attribute will be converted into a 911 :rtype: bool
600 space-separated stirng.
601 """ 912 """
602 value = self.get(value, default) 913 return getattr(self, '_decomposed', False) or False
603 if isinstance(value, list) or isinstance(value, tuple): 914
604 value =" ".join(value)
605 return value
606
607 def _tag_name_matches_and(self, function, tag_name):
608 if not tag_name:
609 return function
610 else:
611 def _match(tag):
612 return tag.name == tag_name and function(tag)
613 return _match
614
615 def _attribute_checker(self, operator, attribute, value=''):
616 """Create a function that performs a CSS selector operation.
617
618 Takes an operator, attribute and optional value. Returns a
619 function that will return True for elements that match that
620 combination.
621 """
622 if operator == '=':
623 # string representation of `attribute` is equal to `value`
624 return lambda el: el._attr_value_as_string(attribute) == value
625 elif operator == '~':
626 # space-separated list representation of `attribute`
627 # contains `value`
628 def _includes_value(element):
629 attribute_value = element.get(attribute, [])
630 if not isinstance(attribute_value, list):
631 attribute_value = attribute_value.split()
632 return value in attribute_value
633 return _includes_value
634 elif operator == '^':
635 # string representation of `attribute` starts with `value`
636 return lambda el: el._attr_value_as_string(
637 attribute, '').startswith(value)
638 elif operator == '$':
639 # string represenation of `attribute` ends with `value`
640 return lambda el: el._attr_value_as_string(
641 attribute, '').endswith(value)
642 elif operator == '*':
643 # string representation of `attribute` contains `value`
644 return lambda el: value in el._attr_value_as_string(attribute, '')
645 elif operator == '|':
646 # string representation of `attribute` is either exactly
647 # `value` or starts with `value` and then a dash.
648 def _is_or_starts_with_dash(element):
649 attribute_value = element._attr_value_as_string(attribute, '')
650 return (attribute_value == value or attribute_value.startswith(
651 value + '-'))
652 return _is_or_starts_with_dash
653 else:
654 return lambda el: el.has_attr(attribute)
655
656 # Old non-property versions of the generators, for backwards 915 # Old non-property versions of the generators, for backwards
657 # compatibility with BS3. 916 # compatibility with BS3.
658 def nextGenerator(self): 917 def nextGenerator(self):
@@ -672,6 +931,11 @@ class PageElement(object):
672 931
673 932
674class NavigableString(str, PageElement): 933class NavigableString(str, PageElement):
934 """A Python Unicode string that is part of a parse tree.
935
936 When Beautiful Soup parses the markup <b>penguin</b>, it will
937 create a NavigableString for the string "penguin".
938 """
675 939
676 PREFIX = '' 940 PREFIX = ''
677 SUFFIX = '' 941 SUFFIX = ''
@@ -691,12 +955,22 @@ class NavigableString(str, PageElement):
691 u.setup() 955 u.setup()
692 return u 956 return u
693 957
694 def __copy__(self): 958 def __deepcopy__(self, memo, recursive=False):
695 """A copy of a NavigableString has the same contents and class 959 """A copy of a NavigableString has the same contents and class
696 as the original, but it is not connected to the parse tree. 960 as the original, but it is not connected to the parse tree.
961
962 :param recursive: This parameter is ignored; it's only defined
963 so that NavigableString.__deepcopy__ implements the same
964 signature as Tag.__deepcopy__.
697 """ 965 """
698 return type(self)(self) 966 return type(self)(self)
699 967
968 def __copy__(self):
969 """A copy of a NavigableString can only be a deep copy, because
970 only one PageElement can occupy a given place in a parse tree.
971 """
972 return self.__deepcopy__({})
973
700 def __getnewargs__(self): 974 def __getnewargs__(self):
701 return (str(self),) 975 return (str(self),)
702 976
@@ -712,55 +986,146 @@ class NavigableString(str, PageElement):
712 self.__class__.__name__, attr)) 986 self.__class__.__name__, attr))
713 987
714 def output_ready(self, formatter="minimal"): 988 def output_ready(self, formatter="minimal"):
989 """Run the string through the provided formatter.
990
991 :param formatter: A Formatter object, or a string naming one of the standard formatters.
992 """
715 output = self.format_string(self, formatter) 993 output = self.format_string(self, formatter)
716 return self.PREFIX + output + self.SUFFIX 994 return self.PREFIX + output + self.SUFFIX
717 995
718 @property 996 @property
719 def name(self): 997 def name(self):
998 """Since a NavigableString is not a Tag, it has no .name.
999
1000 This property is implemented so that code like this doesn't crash
1001 when run on a mixture of Tag and NavigableString objects:
1002 [x.name for x in tag.children]
1003 """
720 return None 1004 return None
721 1005
722 @name.setter 1006 @name.setter
723 def name(self, name): 1007 def name(self, name):
1008 """Prevent NavigableString.name from ever being set."""
724 raise AttributeError("A NavigableString cannot be given a name.") 1009 raise AttributeError("A NavigableString cannot be given a name.")
725 1010
1011 def _all_strings(self, strip=False, types=PageElement.default):
1012 """Yield all strings of certain classes, possibly stripping them.
1013
1014 This makes it easy for NavigableString to implement methods
1015 like get_text() as conveniences, creating a consistent
1016 text-extraction API across all PageElements.
1017
1018 :param strip: If True, all strings will be stripped before being
1019 yielded.
1020
1021 :param types: A tuple of NavigableString subclasses. If this
1022 NavigableString isn't one of those subclasses, the
1023 sequence will be empty. By default, the subclasses
1024 considered are NavigableString and CData objects. That
1025 means no comments, processing instructions, etc.
1026
1027 :yield: A sequence that either contains this string, or is empty.
1028
1029 """
1030 if types is self.default:
1031 # This is kept in Tag because it's full of subclasses of
1032 # this class, which aren't defined until later in the file.
1033 types = Tag.DEFAULT_INTERESTING_STRING_TYPES
1034
1035 # Do nothing if the caller is looking for specific types of
1036 # string, and we're of a different type.
1037 #
1038 # We check specific types instead of using isinstance(self,
1039 # types) because all of these classes subclass
1040 # NavigableString. Anyone who's using this feature probably
1041 # wants generic NavigableStrings but not other stuff.
1042 my_type = type(self)
1043 if types is not None:
1044 if isinstance(types, type):
1045 # Looking for a single type.
1046 if my_type is not types:
1047 return
1048 elif my_type not in types:
1049 # Looking for one of a list of types.
1050 return
1051
1052 value = self
1053 if strip:
1054 value = value.strip()
1055 if len(value) > 0:
1056 yield value
1057 strings = property(_all_strings)
1058
726class PreformattedString(NavigableString): 1059class PreformattedString(NavigableString):
727 """A NavigableString not subject to the normal formatting rules. 1060 """A NavigableString not subject to the normal formatting rules.
728 1061
729 The string will be passed into the formatter (to trigger side effects), 1062 This is an abstract class used for special kinds of strings such
730 but the return value will be ignored. 1063 as comments (the Comment class) and CDATA blocks (the CData
1064 class).
731 """ 1065 """
732 1066
733 def output_ready(self, formatter="minimal"): 1067 PREFIX = ''
734 """CData strings are passed into the formatter. 1068 SUFFIX = ''
735 But the return value is ignored.""" 1069
736 self.format_string(self, formatter) 1070 def output_ready(self, formatter=None):
1071 """Make this string ready for output by adding any subclass-specific
1072 prefix or suffix.
1073
1074 :param formatter: A Formatter object, or a string naming one
1075 of the standard formatters. The string will be passed into the
1076 Formatter, but only to trigger any side effects: the return
1077 value is ignored.
1078
1079 :return: The string, with any subclass-specific prefix and
1080 suffix added on.
1081 """
1082 if formatter is not None:
1083 ignore = self.format_string(self, formatter)
737 return self.PREFIX + self + self.SUFFIX 1084 return self.PREFIX + self + self.SUFFIX
738 1085
739class CData(PreformattedString): 1086class CData(PreformattedString):
740 1087 """A CDATA block."""
741 PREFIX = '<![CDATA[' 1088 PREFIX = '<![CDATA['
742 SUFFIX = ']]>' 1089 SUFFIX = ']]>'
743 1090
744class ProcessingInstruction(PreformattedString): 1091class ProcessingInstruction(PreformattedString):
1092 """A SGML processing instruction."""
745 1093
746 PREFIX = '<?' 1094 PREFIX = '<?'
747 SUFFIX = '>' 1095 SUFFIX = '>'
748 1096
749class Comment(PreformattedString): 1097class XMLProcessingInstruction(ProcessingInstruction):
1098 """An XML processing instruction."""
1099 PREFIX = '<?'
1100 SUFFIX = '?>'
750 1101
1102class Comment(PreformattedString):
1103 """An HTML or XML comment."""
751 PREFIX = '<!--' 1104 PREFIX = '<!--'
752 SUFFIX = '-->' 1105 SUFFIX = '-->'
753 1106
754 1107
755class Declaration(PreformattedString): 1108class Declaration(PreformattedString):
1109 """An XML declaration."""
756 PREFIX = '<?' 1110 PREFIX = '<?'
757 SUFFIX = '?>' 1111 SUFFIX = '?>'
758 1112
759 1113
760class Doctype(PreformattedString): 1114class Doctype(PreformattedString):
761 1115 """A document type declaration."""
762 @classmethod 1116 @classmethod
763 def for_name_and_ids(cls, name, pub_id, system_id): 1117 def for_name_and_ids(cls, name, pub_id, system_id):
1118 """Generate an appropriate document type declaration for a given
1119 public ID and system ID.
1120
1121 :param name: The name of the document's root element, e.g. 'html'.
1122 :param pub_id: The Formal Public Identifier for this document type,
1123 e.g. '-//W3C//DTD XHTML 1.1//EN'
1124 :param system_id: The system identifier for this document type,
1125 e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
1126
1127 :return: A Doctype.
1128 """
764 value = name or '' 1129 value = name or ''
765 if pub_id is not None: 1130 if pub_id is not None:
766 value += ' PUBLIC "%s"' % pub_id 1131 value += ' PUBLIC "%s"' % pub_id
@@ -775,14 +1140,105 @@ class Doctype(PreformattedString):
775 SUFFIX = '>\n' 1140 SUFFIX = '>\n'
776 1141
777 1142
1143class Stylesheet(NavigableString):
1144 """A NavigableString representing an stylesheet (probably
1145 CSS).
1146
1147 Used to distinguish embedded stylesheets from textual content.
1148 """
1149 pass
1150
1151
1152class Script(NavigableString):
1153 """A NavigableString representing an executable script (probably
1154 Javascript).
1155
1156 Used to distinguish executable code from textual content.
1157 """
1158 pass
1159
1160
1161class TemplateString(NavigableString):
1162 """A NavigableString representing a string found inside an HTML
1163 template embedded in a larger document.
1164
1165 Used to distinguish such strings from the main body of the document.
1166 """
1167 pass
1168
1169
1170class RubyTextString(NavigableString):
1171 """A NavigableString representing the contents of the <rt> HTML
1172 element.
1173
1174 https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
1175
1176 Can be used to distinguish such strings from the strings they're
1177 annotating.
1178 """
1179 pass
1180
1181
1182class RubyParenthesisString(NavigableString):
1183 """A NavigableString representing the contents of the <rp> HTML
1184 element.
1185
1186 https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
1187 """
1188 pass
1189
1190
778class Tag(PageElement): 1191class Tag(PageElement):
1192 """Represents an HTML or XML tag that is part of a parse tree, along
1193 with its attributes and contents.
779 1194
780 """Represents a found HTML tag with its attributes and contents.""" 1195 When Beautiful Soup parses the markup <b>penguin</b>, it will
1196 create a Tag object representing the <b> tag.
1197 """
781 1198
782 def __init__(self, parser=None, builder=None, name=None, namespace=None, 1199 def __init__(self, parser=None, builder=None, name=None, namespace=None,
783 prefix=None, attrs=None, parent=None, previous=None): 1200 prefix=None, attrs=None, parent=None, previous=None,
784 "Basic constructor." 1201 is_xml=None, sourceline=None, sourcepos=None,
785 1202 can_be_empty_element=None, cdata_list_attributes=None,
1203 preserve_whitespace_tags=None,
1204 interesting_string_types=None,
1205 namespaces=None
1206 ):
1207 """Basic constructor.
1208
1209 :param parser: A BeautifulSoup object.
1210 :param builder: A TreeBuilder.
1211 :param name: The name of the tag.
1212 :param namespace: The URI of this Tag's XML namespace, if any.
1213 :param prefix: The prefix for this Tag's XML namespace, if any.
1214 :param attrs: A dictionary of this Tag's attribute values.
1215 :param parent: The PageElement to use as this Tag's parent.
1216 :param previous: The PageElement that was parsed immediately before
1217 this tag.
1218 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1219 HTML tag.
1220 :param sourceline: The line number where this tag was found in its
1221 source document.
1222 :param sourcepos: The character position within `sourceline` where this
1223 tag was found.
1224 :param can_be_empty_element: If True, this tag should be
1225 represented as <tag/>. If False, this tag should be represented
1226 as <tag></tag>.
1227 :param cdata_list_attributes: A list of attributes whose values should
1228 be treated as CDATA if they ever show up on this tag.
1229 :param preserve_whitespace_tags: A list of tag names whose contents
1230 should have their whitespace preserved.
1231 :param interesting_string_types: This is a NavigableString
1232 subclass or a tuple of them. When iterating over this
1233 Tag's strings in methods like Tag.strings or Tag.get_text,
1234 these are the types of strings that are interesting enough
1235 to be considered. The default is to consider
1236 NavigableString and CData the only interesting string
1237 subtypes.
1238 :param namespaces: A dictionary mapping currently active
1239 namespace prefixes to URIs. This can be used later to
1240 construct CSS selectors.
1241 """
786 if parser is None: 1242 if parser is None:
787 self.parser_class = None 1243 self.parser_class = None
788 else: 1244 else:
@@ -793,7 +1249,12 @@ class Tag(PageElement):
793 raise ValueError("No value provided for new tag's name.") 1249 raise ValueError("No value provided for new tag's name.")
794 self.name = name 1250 self.name = name
795 self.namespace = namespace 1251 self.namespace = namespace
1252 self._namespaces = namespaces or {}
796 self.prefix = prefix 1253 self.prefix = prefix
1254 if ((not builder or builder.store_line_numbers)
1255 and (sourceline is not None or sourcepos is not None)):
1256 self.sourceline = sourceline
1257 self.sourcepos = sourcepos
797 if attrs is None: 1258 if attrs is None:
798 attrs = {} 1259 attrs = {}
799 elif attrs: 1260 elif attrs:
@@ -804,32 +1265,109 @@ class Tag(PageElement):
804 attrs = dict(attrs) 1265 attrs = dict(attrs)
805 else: 1266 else:
806 attrs = dict(attrs) 1267 attrs = dict(attrs)
1268
1269 # If possible, determine ahead of time whether this tag is an
1270 # XML tag.
1271 if builder:
1272 self.known_xml = builder.is_xml
1273 else:
1274 self.known_xml = is_xml
807 self.attrs = attrs 1275 self.attrs = attrs
808 self.contents = [] 1276 self.contents = []
809 self.setup(parent, previous) 1277 self.setup(parent, previous)
810 self.hidden = False 1278 self.hidden = False
811 1279
812 # Set up any substitutions, such as the charset in a META tag. 1280 if builder is None:
813 if builder is not None: 1281 # In the absence of a TreeBuilder, use whatever values were
1282 # passed in here. They're probably None, unless this is a copy of some
1283 # other tag.
1284 self.can_be_empty_element = can_be_empty_element
1285 self.cdata_list_attributes = cdata_list_attributes
1286 self.preserve_whitespace_tags = preserve_whitespace_tags
1287 self.interesting_string_types = interesting_string_types
1288 else:
1289 # Set up any substitutions for this tag, such as the charset in a META tag.
814 builder.set_up_substitutions(self) 1290 builder.set_up_substitutions(self)
1291
1292 # Ask the TreeBuilder whether this tag might be an empty-element tag.
815 self.can_be_empty_element = builder.can_be_empty_element(name) 1293 self.can_be_empty_element = builder.can_be_empty_element(name)
816 else: 1294
817 self.can_be_empty_element = False 1295 # Keep track of the list of attributes of this tag that
1296 # might need to be treated as a list.
1297 #
1298 # For performance reasons, we store the whole data structure
1299 # rather than asking the question of every tag. Asking would
1300 # require building a new data structure every time, and
1301 # (unlike can_be_empty_element), we almost never need
1302 # to check this.
1303 self.cdata_list_attributes = builder.cdata_list_attributes
1304
1305 # Keep track of the names that might cause this tag to be treated as a
1306 # whitespace-preserved tag.
1307 self.preserve_whitespace_tags = builder.preserve_whitespace_tags
1308
1309 if self.name in builder.string_containers:
1310 # This sort of tag uses a special string container
1311 # subclass for most of its strings. When we ask the
1312 self.interesting_string_types = builder.string_containers[self.name]
1313 else:
1314 self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
818 1315
819 parserClass = _alias("parser_class") # BS3 1316 parserClass = _alias("parser_class") # BS3
820 1317
821 def __copy__(self): 1318 def __deepcopy__(self, memo, recursive=True):
822 """A copy of a Tag is a new Tag, unconnected to the parse tree. 1319 """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
823 Its contents are a copy of the old Tag's contents. 1320 Its contents are a copy of the old Tag's contents.
824 """ 1321 """
825 clone = type(self)(None, self.builder, self.name, self.namespace, 1322 clone = self._clone()
826 self.nsprefix, self.attrs) 1323
1324 if recursive:
1325 # Clone this tag's descendants recursively, but without
1326 # making any recursive function calls.
1327 tag_stack = [clone]
1328 for event, element in self._event_stream(self.descendants):
1329 if event is Tag.END_ELEMENT_EVENT:
1330 # Stop appending incoming Tags to the Tag that was
1331 # just closed.
1332 tag_stack.pop()
1333 else:
1334 descendant_clone = element.__deepcopy__(
1335 memo, recursive=False
1336 )
1337 # Add to its parent's .contents
1338 tag_stack[-1].append(descendant_clone)
1339
1340 if event is Tag.START_ELEMENT_EVENT:
1341 # Add the Tag itself to the stack so that its
1342 # children will be .appended to it.
1343 tag_stack.append(descendant_clone)
1344 return clone
1345
1346 def __copy__(self):
1347 """A copy of a Tag must always be a deep copy, because a Tag's
1348 children can only have one parent at a time.
1349 """
1350 return self.__deepcopy__({})
1351
1352 def _clone(self):
1353 """Create a new Tag just like this one, but with no
1354 contents and unattached to any parse tree.
1355
1356 This is the first step in the deepcopy process.
1357 """
1358 clone = type(self)(
1359 None, None, self.name, self.namespace,
1360 self.prefix, self.attrs, is_xml=self._is_xml,
1361 sourceline=self.sourceline, sourcepos=self.sourcepos,
1362 can_be_empty_element=self.can_be_empty_element,
1363 cdata_list_attributes=self.cdata_list_attributes,
1364 preserve_whitespace_tags=self.preserve_whitespace_tags,
1365 interesting_string_types=self.interesting_string_types
1366 )
827 for attr in ('can_be_empty_element', 'hidden'): 1367 for attr in ('can_be_empty_element', 'hidden'):
828 setattr(clone, attr, getattr(self, attr)) 1368 setattr(clone, attr, getattr(self, attr))
829 for child in self.contents:
830 clone.append(child.__copy__())
831 return clone 1369 return clone
832 1370
833 @property 1371 @property
834 def is_empty_element(self): 1372 def is_empty_element(self):
835 """Is this tag an empty-element tag? (aka a self-closing tag) 1373 """Is this tag an empty-element tag? (aka a self-closing tag)
@@ -850,13 +1388,17 @@ class Tag(PageElement):
850 1388
851 @property 1389 @property
852 def string(self): 1390 def string(self):
853 """Convenience property to get the single string within this tag. 1391 """Convenience property to get the single string within this
1392 PageElement.
854 1393
855 :Return: If this tag has a single string child, return value 1394 TODO It might make sense to have NavigableString.string return
856 is that string. If this tag has no children, or more than one 1395 itself.
857 child, return value is None. If this tag has one child tag, 1396
1397 :return: If this element has a single string child, return
1398 value is that string. If this element has one child tag,
858 return value is the 'string' attribute of the child tag, 1399 return value is the 'string' attribute of the child tag,
859 recursively. 1400 recursively. If this element is itself a string, has no
1401 children, or has more than one child, return value is None.
860 """ 1402 """
861 if len(self.contents) != 1: 1403 if len(self.contents) != 1:
862 return None 1404 return None
@@ -867,57 +1409,75 @@ class Tag(PageElement):
867 1409
868 @string.setter 1410 @string.setter
869 def string(self, string): 1411 def string(self, string):
1412 """Replace this PageElement's contents with `string`."""
870 self.clear() 1413 self.clear()
871 self.append(string.__class__(string)) 1414 self.append(string.__class__(string))
872 1415
873 def _all_strings(self, strip=False, types=(NavigableString, CData)): 1416 DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
1417 def _all_strings(self, strip=False, types=PageElement.default):
874 """Yield all strings of certain classes, possibly stripping them. 1418 """Yield all strings of certain classes, possibly stripping them.
875 1419
876 By default, yields only NavigableString and CData objects. So 1420 :param strip: If True, all strings will be stripped before being
877 no comments, processing instructions, etc. 1421 yielded.
1422
1423 :param types: A tuple of NavigableString subclasses. Any strings of
1424 a subclass not found in this list will be ignored. By
1425 default, the subclasses considered are the ones found in
1426 self.interesting_string_types. If that's not specified,
1427 only NavigableString and CData objects will be
1428 considered. That means no comments, processing
1429 instructions, etc.
1430
1431 :yield: A sequence of strings.
1432
878 """ 1433 """
1434 if types is self.default:
1435 types = self.interesting_string_types
1436
879 for descendant in self.descendants: 1437 for descendant in self.descendants:
880 if ( 1438 if (types is None and not isinstance(descendant, NavigableString)):
881 (types is None and not isinstance(descendant, NavigableString)) 1439 continue
882 or 1440 descendant_type = type(descendant)
883 (types is not None and type(descendant) not in types)): 1441 if isinstance(types, type):
1442 if descendant_type is not types:
1443 # We're not interested in strings of this type.
1444 continue
1445 elif types is not None and descendant_type not in types:
1446 # We're not interested in strings of this type.
884 continue 1447 continue
885 if strip: 1448 if strip:
886 descendant = descendant.strip() 1449 descendant = descendant.strip()
887 if len(descendant) == 0: 1450 if len(descendant) == 0:
888 continue 1451 continue
889 yield descendant 1452 yield descendant
890
891 strings = property(_all_strings) 1453 strings = property(_all_strings)
892 1454
893 @property 1455 def decompose(self):
894 def stripped_strings(self): 1456 """Recursively destroys this PageElement and its children.
895 for string in self._all_strings(True):
896 yield string
897 1457
898 def get_text(self, separator="", strip=False, 1458 This element will be removed from the tree and wiped out; so
899 types=(NavigableString, CData)): 1459 will everything beneath it.
900 """
901 Get all child strings, concatenated using the given separator.
902 """
903 return separator.join([s for s in self._all_strings(
904 strip, types=types)])
905 getText = get_text
906 text = property(get_text)
907 1460
908 def decompose(self): 1461 The behavior of a decomposed PageElement is undefined and you
909 """Recursively destroys the contents of this tree.""" 1462 should never use one for anything, but if you need to _check_
1463 whether an element has been decomposed, you can use the
1464 `decomposed` property.
1465 """
910 self.extract() 1466 self.extract()
911 i = self 1467 i = self
912 while i is not None: 1468 while i is not None:
913 next = i.next_element 1469 n = i.next_element
914 i.__dict__.clear() 1470 i.__dict__.clear()
915 i.contents = [] 1471 i.contents = []
916 i = next 1472 i._decomposed = True
1473 i = n
917 1474
918 def clear(self, decompose=False): 1475 def clear(self, decompose=False):
919 """ 1476 """Wipe out all children of this PageElement by calling extract()
920 Extract all children. If decompose is True, decompose instead. 1477 on them.
1478
1479 :param decompose: If this is True, decompose() (a more
1480 destructive method) will be called instead of extract().
921 """ 1481 """
922 if decompose: 1482 if decompose:
923 for element in self.contents[:]: 1483 for element in self.contents[:]:
@@ -929,10 +1489,51 @@ class Tag(PageElement):
929 for element in self.contents[:]: 1489 for element in self.contents[:]:
930 element.extract() 1490 element.extract()
931 1491
932 def index(self, element): 1492 def smooth(self):
1493 """Smooth out this element's children by consolidating consecutive
1494 strings.
1495
1496 This makes pretty-printed output look more natural following a
1497 lot of operations that modified the tree.
933 """ 1498 """
934 Find the index of a child by identity, not value. Avoids issues with 1499 # Mark the first position of every pair of children that need
935 tag.contents.index(element) getting the index of equal elements. 1500 # to be consolidated. Do this rather than making a copy of
1501 # self.contents, since in most cases very few strings will be
1502 # affected.
1503 marked = []
1504 for i, a in enumerate(self.contents):
1505 if isinstance(a, Tag):
1506 # Recursively smooth children.
1507 a.smooth()
1508 if i == len(self.contents)-1:
1509 # This is the last item in .contents, and it's not a
1510 # tag. There's no chance it needs any work.
1511 continue
1512 b = self.contents[i+1]
1513 if (isinstance(a, NavigableString)
1514 and isinstance(b, NavigableString)
1515 and not isinstance(a, PreformattedString)
1516 and not isinstance(b, PreformattedString)
1517 ):
1518 marked.append(i)
1519
1520 # Go over the marked positions in reverse order, so that
1521 # removing items from .contents won't affect the remaining
1522 # positions.
1523 for i in reversed(marked):
1524 a = self.contents[i]
1525 b = self.contents[i+1]
1526 b.extract()
1527 n = NavigableString(a+b)
1528 a.replace_with(n)
1529
1530 def index(self, element):
1531 """Find the index of a child by identity, not value.
1532
1533 Avoids issues with tag.contents.index(element) getting the
1534 index of equal elements.
1535
1536 :param element: Look for this PageElement in `self.contents`.
936 """ 1537 """
937 for i, child in enumerate(self.contents): 1538 for i, child in enumerate(self.contents):
938 if child is element: 1539 if child is element:
@@ -945,23 +1546,38 @@ class Tag(PageElement):
945 attribute.""" 1546 attribute."""
946 return self.attrs.get(key, default) 1547 return self.attrs.get(key, default)
947 1548
1549 def get_attribute_list(self, key, default=None):
1550 """The same as get(), but always returns a list.
1551
1552 :param key: The attribute to look for.
1553 :param default: Use this value if the attribute is not present
1554 on this PageElement.
1555 :return: A list of values, probably containing only a single
1556 value.
1557 """
1558 value = self.get(key, default)
1559 if not isinstance(value, list):
1560 value = [value]
1561 return value
1562
948 def has_attr(self, key): 1563 def has_attr(self, key):
1564 """Does this PageElement have an attribute with the given name?"""
949 return key in self.attrs 1565 return key in self.attrs
950 1566
951 def __hash__(self): 1567 def __hash__(self):
952 return str(self).__hash__() 1568 return str(self).__hash__()
953 1569
954 def __getitem__(self, key): 1570 def __getitem__(self, key):
955 """tag[key] returns the value of the 'key' attribute for the tag, 1571 """tag[key] returns the value of the 'key' attribute for the Tag,
956 and throws an exception if it's not there.""" 1572 and throws an exception if it's not there."""
957 return self.attrs[key] 1573 return self.attrs[key]
958 1574
959 def __iter__(self): 1575 def __iter__(self):
960 "Iterating over a tag iterates over its contents." 1576 "Iterating over a Tag iterates over its contents."
961 return iter(self.contents) 1577 return iter(self.contents)
962 1578
963 def __len__(self): 1579 def __len__(self):
964 "The length of a tag is the length of its list of contents." 1580 "The length of a Tag is the length of its list of contents."
965 return len(self.contents) 1581 return len(self.contents)
966 1582
967 def __contains__(self, x): 1583 def __contains__(self, x):
@@ -981,29 +1597,33 @@ class Tag(PageElement):
981 self.attrs.pop(key, None) 1597 self.attrs.pop(key, None)
982 1598
983 def __call__(self, *args, **kwargs): 1599 def __call__(self, *args, **kwargs):
984 """Calling a tag like a function is the same as calling its 1600 """Calling a Tag like a function is the same as calling its
985 find_all() method. Eg. tag('a') returns a list of all the A tags 1601 find_all() method. Eg. tag('a') returns a list of all the A tags
986 found within this tag.""" 1602 found within this tag."""
987 return self.find_all(*args, **kwargs) 1603 return self.find_all(*args, **kwargs)
988 1604
989 def __getattr__(self, tag): 1605 def __getattr__(self, tag):
990 #print "Getattr %s.%s" % (self.__class__, tag) 1606 """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
1607 #print("Getattr %s.%s" % (self.__class__, tag))
991 if len(tag) > 3 and tag.endswith('Tag'): 1608 if len(tag) > 3 and tag.endswith('Tag'):
992 # BS3: soup.aTag -> "soup.find("a") 1609 # BS3: soup.aTag -> "soup.find("a")
993 tag_name = tag[:-3] 1610 tag_name = tag[:-3]
994 warnings.warn( 1611 warnings.warn(
995 '.%sTag is deprecated, use .find("%s") instead.' % ( 1612 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
996 tag_name, tag_name)) 1613 name=tag_name
1614 ),
1615 DeprecationWarning, stacklevel=2
1616 )
997 return self.find(tag_name) 1617 return self.find(tag_name)
998 # We special case contents to avoid recursion. 1618 # We special case contents to avoid recursion.
999 elif not tag.startswith("__") and not tag=="contents": 1619 elif not tag.startswith("__") and not tag == "contents":
1000 return self.find(tag) 1620 return self.find(tag)
1001 raise AttributeError( 1621 raise AttributeError(
1002 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1622 "'%s' object has no attribute '%s'" % (self.__class__, tag))
1003 1623
1004 def __eq__(self, other): 1624 def __eq__(self, other):
1005 """Returns true iff this tag has the same name, the same attributes, 1625 """Returns true iff this Tag has the same name, the same attributes,
1006 and the same contents (recursively) as the given tag.""" 1626 and the same contents (recursively) as `other`."""
1007 if self is other: 1627 if self is other:
1008 return True 1628 return True
1009 if (not hasattr(other, 'name') or 1629 if (not hasattr(other, 'name') or
@@ -1019,69 +1639,235 @@ class Tag(PageElement):
1019 return True 1639 return True
1020 1640
1021 def __ne__(self, other): 1641 def __ne__(self, other):
1022 """Returns true iff this tag is not identical to the other tag, 1642 """Returns true iff this Tag is not identical to `other`,
1023 as defined in __eq__.""" 1643 as defined in __eq__."""
1024 return not self == other 1644 return not self == other
1025 1645
1026 def __repr__(self, encoding="unicode-escape"): 1646 def __repr__(self, encoding="unicode-escape"):
1027 """Renders this tag as a string.""" 1647 """Renders this PageElement as a string.
1028 if PY3K:
1029 # "The return value must be a string object", i.e. Unicode
1030 return self.decode()
1031 else:
1032 # "The return value must be a string object", i.e. a bytestring.
1033 # By convention, the return value of __repr__ should also be
1034 # an ASCII string.
1035 return self.encode(encoding)
1036 1648
1037 def __unicode__(self): 1649 :param encoding: The encoding to use (Python 2 only).
1650 TODO: This is now ignored and a warning should be issued
1651 if a value is provided.
1652 :return: A (Unicode) string.
1653 """
1654 # "The return value must be a string object", i.e. Unicode
1038 return self.decode() 1655 return self.decode()
1039 1656
1040 def __str__(self): 1657 def __unicode__(self):
1041 if PY3K: 1658 """Renders this PageElement as a Unicode string."""
1042 return self.decode() 1659 return self.decode()
1043 else:
1044 return self.encode()
1045 1660
1046 if PY3K: 1661 __str__ = __repr__ = __unicode__
1047 __str__ = __repr__ = __unicode__
1048 1662
1049 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 1663 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
1050 indent_level=None, formatter="minimal", 1664 indent_level=None, formatter="minimal",
1051 errors="xmlcharrefreplace"): 1665 errors="xmlcharrefreplace"):
1666 """Render a bytestring representation of this PageElement and its
1667 contents.
1668
1669 :param encoding: The destination encoding.
1670 :param indent_level: Each line of the rendering will be
1671 indented this many levels. (The formatter decides what a
1672 'level' means in terms of spaces or other characters
1673 output.) Used internally in recursive calls while
1674 pretty-printing.
1675 :param formatter: A Formatter object, or a string naming one of
1676 the standard formatters.
1677 :param errors: An error handling strategy such as
1678 'xmlcharrefreplace'. This value is passed along into
1679 encode() and its value should be one of the constants
1680 defined by Python.
1681 :return: A bytestring.
1682
1683 """
1052 # Turn the data structure into Unicode, then encode the 1684 # Turn the data structure into Unicode, then encode the
1053 # Unicode. 1685 # Unicode.
1054 u = self.decode(indent_level, encoding, formatter) 1686 u = self.decode(indent_level, encoding, formatter)
1055 return u.encode(encoding, errors) 1687 return u.encode(encoding, errors)
1056 1688
1057 def _should_pretty_print(self, indent_level):
1058 """Should this tag be pretty-printed?"""
1059 return (
1060 indent_level is not None and
1061 (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1062 or self._is_xml))
1063
1064 def decode(self, indent_level=None, 1689 def decode(self, indent_level=None,
1065 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1690 eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1066 formatter="minimal"): 1691 formatter="minimal",
1067 """Returns a Unicode representation of this tag and its contents. 1692 iterator=None):
1693 pieces = []
1694 # First off, turn a non-Formatter `formatter` into a Formatter
1695 # object. This will stop the lookup from happening over and
1696 # over again.
1697 if not isinstance(formatter, Formatter):
1698 formatter = self.formatter_for_name(formatter)
1699
1700 if indent_level is True:
1701 indent_level = 0
1702
1703 # The currently active tag that put us into string literal
1704 # mode. Until this element is closed, children will be treated
1705 # as string literals and not pretty-printed. String literal
1706 # mode is turned on immediately after this tag begins, and
1707 # turned off immediately before it's closed. This means there
1708 # will be whitespace before and after the tag itself.
1709 string_literal_tag = None
1710
1711 for event, element in self._event_stream(iterator):
1712 if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
1713 piece = element._format_tag(
1714 eventual_encoding, formatter, opening=True
1715 )
1716 elif event is Tag.END_ELEMENT_EVENT:
1717 piece = element._format_tag(
1718 eventual_encoding, formatter, opening=False
1719 )
1720 if indent_level is not None:
1721 indent_level -= 1
1722 else:
1723 piece = element.output_ready(formatter)
1724
1725 # Now we need to apply the 'prettiness' -- extra
1726 # whitespace before and/or after this tag. This can get
1727 # complicated because certain tags, like <pre> and
1728 # <script>, can't be prettified, since adding whitespace would
1729 # change the meaning of the content.
1730
1731 # The default behavior is to add whitespace before and
1732 # after an element when string literal mode is off, and to
1733 # leave things as they are when string literal mode is on.
1734 if string_literal_tag:
1735 indent_before = indent_after = False
1736 else:
1737 indent_before = indent_after = True
1738
1739 # The only time the behavior is more complex than that is
1740 # when we encounter an opening or closing tag that might
1741 # put us into or out of string literal mode.
1742 if (event is Tag.START_ELEMENT_EVENT
1743 and not string_literal_tag
1744 and not element._should_pretty_print()):
1745 # We are about to enter string literal mode. Add
1746 # whitespace before this tag, but not after. We
1747 # will stay in string literal mode until this tag
1748 # is closed.
1749 indent_before = True
1750 indent_after = False
1751 string_literal_tag = element
1752 elif (event is Tag.END_ELEMENT_EVENT
1753 and element is string_literal_tag):
1754 # We are about to exit string literal mode by closing
1755 # the tag that sent us into that mode. Add whitespace
1756 # after this tag, but not before.
1757 indent_before = False
1758 indent_after = True
1759 string_literal_tag = None
1760
1761 # Now we know whether to add whitespace before and/or
1762 # after this element.
1763 if indent_level is not None:
1764 if (indent_before or indent_after):
1765 if isinstance(element, NavigableString):
1766 piece = piece.strip()
1767 if piece:
1768 piece = self._indent_string(
1769 piece, indent_level, formatter,
1770 indent_before, indent_after
1771 )
1772 if event == Tag.START_ELEMENT_EVENT:
1773 indent_level += 1
1774 pieces.append(piece)
1775 return "".join(pieces)
1776
1777 # Names for the different events yielded by _event_stream
1778 START_ELEMENT_EVENT = object()
1779 END_ELEMENT_EVENT = object()
1780 EMPTY_ELEMENT_EVENT = object()
1781 STRING_ELEMENT_EVENT = object()
1782
1783 def _event_stream(self, iterator=None):
1784 """Yield a sequence of events that can be used to reconstruct the DOM
1785 for this element.
1786
1787 This lets us recreate the nested structure of this element
1788 (e.g. when formatting it as a string) without using recursive
1789 method calls.
1790
1791 This is similar in concept to the SAX API, but it's a simpler
1792 interface designed for internal use. The events are different
1793 from SAX and the arguments associated with the events are Tags
1794 and other Beautiful Soup objects.
1795
1796 :param iterator: An alternate iterator to use when traversing
1797 the tree.
1798 """
1799 tag_stack = []
1068 1800
1069 :param eventual_encoding: The tag is destined to be 1801 iterator = iterator or self.self_and_descendants
1070 encoded into this encoding. This method is _not_ 1802
1071 responsible for performing that encoding. This information 1803 for c in iterator:
1072 is passed in so that it can be substituted in if the 1804 # If the parent of the element we're about to yield is not
1073 document contains a <META> tag that mentions the document's 1805 # the tag currently on the stack, it means that the tag on
1074 encoding. 1806 # the stack closed before this element appeared.
1807 while tag_stack and c.parent != tag_stack[-1]:
1808 now_closed_tag = tag_stack.pop()
1809 yield Tag.END_ELEMENT_EVENT, now_closed_tag
1810
1811 if isinstance(c, Tag):
1812 if c.is_empty_element:
1813 yield Tag.EMPTY_ELEMENT_EVENT, c
1814 else:
1815 yield Tag.START_ELEMENT_EVENT, c
1816 tag_stack.append(c)
1817 continue
1818 else:
1819 yield Tag.STRING_ELEMENT_EVENT, c
1820
1821 while tag_stack:
1822 now_closed_tag = tag_stack.pop()
1823 yield Tag.END_ELEMENT_EVENT, now_closed_tag
1824
1825 def _indent_string(self, s, indent_level, formatter,
1826 indent_before, indent_after):
1827 """Add indentation whitespace before and/or after a string.
1828
1829 :param s: The string to amend with whitespace.
1830 :param indent_level: The indentation level; affects how much
1831 whitespace goes before the string.
1832 :param indent_before: Whether or not to add whitespace
1833 before the string.
1834 :param indent_after: Whether or not to add whitespace
1835 (a newline) after the string.
1075 """ 1836 """
1837 space_before = ''
1838 if indent_before and indent_level:
1839 space_before = (formatter.indent * indent_level)
1076 1840
1077 # First off, turn a string formatter into a function. This 1841 space_after = ''
1078 # will stop the lookup from happening over and over again. 1842 if indent_after:
1079 if not isinstance(formatter, collections.abc.Callable): 1843 space_after = "\n"
1080 formatter = self._formatter_for_name(formatter)
1081 1844
1082 attrs = [] 1845 return space_before + s + space_after
1083 if self.attrs: 1846
1084 for key, val in sorted(self.attrs.items()): 1847 def _format_tag(self, eventual_encoding, formatter, opening):
1848 if self.hidden:
1849 # A hidden tag is invisible, although its contents
1850 # are visible.
1851 return ''
1852
1853 # A tag starts with the < character (see below).
1854
1855 # Then the / character, if this is a closing tag.
1856 closing_slash = ''
1857 if not opening:
1858 closing_slash = '/'
1859
1860 # Then an optional namespace prefix.
1861 prefix = ''
1862 if self.prefix:
1863 prefix = self.prefix + ":"
1864
1865 # Then a list of attribute values, if this is an opening tag.
1866 attribute_string = ''
1867 if opening:
1868 attributes = formatter.attributes(self)
1869 attrs = []
1870 for key, val in attributes:
1085 if val is None: 1871 if val is None:
1086 decoded = key 1872 decoded = key
1087 else: 1873 else:
@@ -1090,71 +1876,52 @@ class Tag(PageElement):
1090 elif not isinstance(val, str): 1876 elif not isinstance(val, str):
1091 val = str(val) 1877 val = str(val)
1092 elif ( 1878 elif (
1093 isinstance(val, AttributeValueWithCharsetSubstitution) 1879 isinstance(val, AttributeValueWithCharsetSubstitution)
1094 and eventual_encoding is not None): 1880 and eventual_encoding is not None
1881 ):
1095 val = val.encode(eventual_encoding) 1882 val = val.encode(eventual_encoding)
1096 1883
1097 text = self.format_string(val, formatter) 1884 text = formatter.attribute_value(val)
1098 decoded = ( 1885 decoded = (
1099 str(key) + '=' 1886 str(key) + '='
1100 + EntitySubstitution.quoted_attribute_value(text)) 1887 + formatter.quoted_attribute_value(text))
1101 attrs.append(decoded) 1888 attrs.append(decoded)
1102 close = '' 1889 if attrs:
1103 closeTag = '' 1890 attribute_string = ' ' + ' '.join(attrs)
1104
1105 prefix = ''
1106 if self.prefix:
1107 prefix = self.prefix + ":"
1108 1891
1892 # Then an optional closing slash (for a void element in an
1893 # XML document).
1894 void_element_closing_slash = ''
1109 if self.is_empty_element: 1895 if self.is_empty_element:
1110 close = '/' 1896 void_element_closing_slash = formatter.void_element_close_prefix or ''
1111 else:
1112 closeTag = '</%s%s>' % (prefix, self.name)
1113
1114 pretty_print = self._should_pretty_print(indent_level)
1115 space = ''
1116 indent_space = ''
1117 if indent_level is not None:
1118 indent_space = (' ' * (indent_level - 1))
1119 if pretty_print:
1120 space = indent_space
1121 indent_contents = indent_level + 1
1122 else:
1123 indent_contents = None
1124 contents = self.decode_contents(
1125 indent_contents, eventual_encoding, formatter)
1126 1897
1127 if self.hidden: 1898 # Put it all together.
1128 # This is the 'document root' object. 1899 return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
1129 s = contents 1900
1130 else: 1901 def _should_pretty_print(self, indent_level=1):
1131 s = [] 1902 """Should this tag be pretty-printed?
1132 attribute_string = '' 1903
1133 if attrs: 1904 Most of them should, but some (such as <pre> in HTML
1134 attribute_string = ' ' + ' '.join(attrs) 1905 documents) should not.
1135 if indent_level is not None: 1906 """
1136 # Even if this particular tag is not pretty-printed, 1907 return (
1137 # we should indent up to the start of the tag. 1908 indent_level is not None
1138 s.append(indent_space) 1909 and (
1139 s.append('<%s%s%s%s>' % ( 1910 not self.preserve_whitespace_tags
1140 prefix, self.name, attribute_string, close)) 1911 or self.name not in self.preserve_whitespace_tags
1141 if pretty_print: 1912 )
1142 s.append("\n") 1913 )
1143 s.append(contents)
1144 if pretty_print and contents and contents[-1] != "\n":
1145 s.append("\n")
1146 if pretty_print and closeTag:
1147 s.append(space)
1148 s.append(closeTag)
1149 if indent_level is not None and closeTag and self.next_sibling:
1150 # Even if this particular tag is not pretty-printed,
1151 # we're now done with the tag, and we should add a
1152 # newline if appropriate.
1153 s.append("\n")
1154 s = ''.join(s)
1155 return s
1156 1914
1157 def prettify(self, encoding=None, formatter="minimal"): 1915 def prettify(self, encoding=None, formatter="minimal"):
1916 """Pretty-print this PageElement as a string.
1917
1918 :param encoding: The eventual encoding of the string. If this is None,
1919 a Unicode string will be returned.
1920 :param formatter: A Formatter object, or a string naming one of
1921 the standard formatters.
1922 :return: A Unicode string (if encoding==None) or a bytestring
1923 (otherwise).
1924 """
1158 if encoding is None: 1925 if encoding is None:
1159 return self.decode(True, formatter=formatter) 1926 return self.decode(True, formatter=formatter)
1160 else: 1927 else:
@@ -1166,62 +1933,50 @@ class Tag(PageElement):
1166 """Renders the contents of this tag as a Unicode string. 1933 """Renders the contents of this tag as a Unicode string.
1167 1934
1168 :param indent_level: Each line of the rendering will be 1935 :param indent_level: Each line of the rendering will be
1169 indented this many spaces. 1936 indented this many levels. (The formatter decides what a
1937 'level' means in terms of spaces or other characters
1938 output.) Used internally in recursive calls while
1939 pretty-printing.
1170 1940
1171 :param eventual_encoding: The tag is destined to be 1941 :param eventual_encoding: The tag is destined to be
1172 encoded into this encoding. This method is _not_ 1942 encoded into this encoding. decode_contents() is _not_
1173 responsible for performing that encoding. This information 1943 responsible for performing that encoding. This information
1174 is passed in so that it can be substituted in if the 1944 is passed in so that it can be substituted in if the
1175 document contains a <META> tag that mentions the document's 1945 document contains a <META> tag that mentions the document's
1176 encoding. 1946 encoding.
1177 1947
1178 :param formatter: The output formatter responsible for converting 1948 :param formatter: A Formatter object, or a string naming one of
1179 entities to Unicode characters. 1949 the standard Formatters.
1180 """ 1950
1181 # First off, turn a string formatter into a function. This 1951 """
1182 # will stop the lookup from happening over and over again. 1952 return self.decode(indent_level, eventual_encoding, formatter,
1183 if not isinstance(formatter, collections.abc.Callable): 1953 iterator=self.descendants)
1184 formatter = self._formatter_for_name(formatter)
1185
1186 pretty_print = (indent_level is not None)
1187 s = []
1188 for c in self:
1189 text = None
1190 if isinstance(c, NavigableString):
1191 text = c.output_ready(formatter)
1192 elif isinstance(c, Tag):
1193 s.append(c.decode(indent_level, eventual_encoding,
1194 formatter))
1195 if text and indent_level and not self.name == 'pre':
1196 text = text.strip()
1197 if text:
1198 if pretty_print and not self.name == 'pre':
1199 s.append(" " * (indent_level - 1))
1200 s.append(text)
1201 if pretty_print and not self.name == 'pre':
1202 s.append("\n")
1203 return ''.join(s)
1204 1954
1205 def encode_contents( 1955 def encode_contents(
1206 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1956 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1207 formatter="minimal"): 1957 formatter="minimal"):
1208 """Renders the contents of this tag as a bytestring. 1958 """Renders the contents of this PageElement as a bytestring.
1209 1959
1210 :param indent_level: Each line of the rendering will be 1960 :param indent_level: Each line of the rendering will be
1211 indented this many spaces. 1961 indented this many levels. (The formatter decides what a
1962 'level' means in terms of spaces or other characters
1963 output.) Used internally in recursive calls while
1964 pretty-printing.
1212 1965
1213 :param eventual_encoding: The bytestring will be in this encoding. 1966 :param eventual_encoding: The bytestring will be in this encoding.
1214 1967
1215 :param formatter: The output formatter responsible for converting 1968 :param formatter: A Formatter object, or a string naming one of
1216 entities to Unicode characters. 1969 the standard Formatters.
1217 """
1218 1970
1971 :return: A bytestring.
1972 """
1219 contents = self.decode_contents(indent_level, encoding, formatter) 1973 contents = self.decode_contents(indent_level, encoding, formatter)
1220 return contents.encode(encoding) 1974 return contents.encode(encoding)
1221 1975
1222 # Old method for BS3 compatibility 1976 # Old method for BS3 compatibility
1223 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1977 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1224 prettyPrint=False, indentLevel=0): 1978 prettyPrint=False, indentLevel=0):
1979 """Deprecated method for BS3 compatibility."""
1225 if not prettyPrint: 1980 if not prettyPrint:
1226 indentLevel = None 1981 indentLevel = None
1227 return self.encode_contents( 1982 return self.encode_contents(
@@ -1229,44 +1984,88 @@ class Tag(PageElement):
1229 1984
1230 #Soup methods 1985 #Soup methods
1231 1986
1232 def find(self, name=None, attrs={}, recursive=True, text=None, 1987 def find(self, name=None, attrs={}, recursive=True, string=None,
1233 **kwargs): 1988 **kwargs):
1234 """Return only the first child of this Tag matching the given 1989 """Look in the children of this PageElement and find the first
1235 criteria.""" 1990 PageElement that matches the given criteria.
1991
1992 All find_* methods take a common set of arguments. See the online
1993 documentation for detailed explanations.
1994
1995 :param name: A filter on tag name.
1996 :param attrs: A dictionary of filters on attribute values.
1997 :param recursive: If this is True, find() will perform a
1998 recursive search of this PageElement's children. Otherwise,
1999 only the direct children will be considered.
2000 :param limit: Stop looking after finding this many results.
2001 :kwargs: A dictionary of filters on attribute values.
2002 :return: A PageElement.
2003 :rtype: bs4.element.Tag | bs4.element.NavigableString
2004 """
1236 r = None 2005 r = None
1237 l = self.find_all(name, attrs, recursive, text, 1, **kwargs) 2006 l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
2007 **kwargs)
1238 if l: 2008 if l:
1239 r = l[0] 2009 r = l[0]
1240 return r 2010 return r
1241 findChild = find 2011 findChild = find #BS2
1242 2012
1243 def find_all(self, name=None, attrs={}, recursive=True, text=None, 2013 def find_all(self, name=None, attrs={}, recursive=True, string=None,
1244 limit=None, **kwargs): 2014 limit=None, **kwargs):
1245 """Extracts a list of Tag objects that match the given 2015 """Look in the children of this PageElement and find all
1246 criteria. You can specify the name of the Tag and any 2016 PageElements that match the given criteria.
1247 attributes you want the Tag to have. 2017
1248 2018 All find_* methods take a common set of arguments. See the online
1249 The value of a key-value pair in the 'attrs' map can be a 2019 documentation for detailed explanations.
1250 string, a list of strings, a regular expression object, or a 2020
1251 callable that takes a string and returns whether or not the 2021 :param name: A filter on tag name.
1252 string matches for some custom definition of 'matches'. The 2022 :param attrs: A dictionary of filters on attribute values.
1253 same is true of the tag name.""" 2023 :param recursive: If this is True, find_all() will perform a
1254 2024 recursive search of this PageElement's children. Otherwise,
2025 only the direct children will be considered.
2026 :param limit: Stop looking after finding this many results.
2027 :kwargs: A dictionary of filters on attribute values.
2028 :return: A ResultSet of PageElements.
2029 :rtype: bs4.element.ResultSet
2030 """
1255 generator = self.descendants 2031 generator = self.descendants
1256 if not recursive: 2032 if not recursive:
1257 generator = self.children 2033 generator = self.children
1258 return self._find_all(name, attrs, text, limit, generator, **kwargs) 2034 _stacklevel = kwargs.pop('_stacklevel', 2)
2035 return self._find_all(name, attrs, string, limit, generator,
2036 _stacklevel=_stacklevel+1, **kwargs)
1259 findAll = find_all # BS3 2037 findAll = find_all # BS3
1260 findChildren = find_all # BS2 2038 findChildren = find_all # BS2
1261 2039
1262 #Generator methods 2040 #Generator methods
1263 @property 2041 @property
1264 def children(self): 2042 def children(self):
2043 """Iterate over all direct children of this PageElement.
2044
2045 :yield: A sequence of PageElements.
2046 """
1265 # return iter() to make the purpose of the method clear 2047 # return iter() to make the purpose of the method clear
1266 return iter(self.contents) # XXX This seems to be untested. 2048 return iter(self.contents) # XXX This seems to be untested.
1267 2049
1268 @property 2050 @property
2051 def self_and_descendants(self):
2052 """Iterate over this PageElement and its children in a
2053 breadth-first sequence.
2054
2055 :yield: A sequence of PageElements.
2056 """
2057 if not self.hidden:
2058 yield self
2059 for i in self.descendants:
2060 yield i
2061
2062 @property
1269 def descendants(self): 2063 def descendants(self):
2064 """Iterate over all children of this PageElement in a
2065 breadth-first sequence.
2066
2067 :yield: A sequence of PageElements.
2068 """
1270 if not len(self.contents): 2069 if not len(self.contents):
1271 return 2070 return
1272 stopNode = self._last_descendant().next_element 2071 stopNode = self._last_descendant().next_element
@@ -1276,262 +2075,102 @@ class Tag(PageElement):
1276 current = current.next_element 2075 current = current.next_element
1277 2076
1278 # CSS selector code 2077 # CSS selector code
2078 def select_one(self, selector, namespaces=None, **kwargs):
2079 """Perform a CSS selection operation on the current element.
1279 2080
1280 _selector_combinators = ['>', '+', '~'] 2081 :param selector: A CSS selector.
1281 _select_debug = False
1282 def select_one(self, selector):
1283 """Perform a CSS selection operation on the current element."""
1284 value = self.select(selector, limit=1)
1285 if value:
1286 return value[0]
1287 return None
1288 2082
1289 def select(self, selector, _candidate_generator=None, limit=None): 2083 :param namespaces: A dictionary mapping namespace prefixes
1290 """Perform a CSS selection operation on the current element.""" 2084 used in the CSS selector to namespace URIs. By default,
1291 2085 Beautiful Soup will use the prefixes it encountered while
1292 # Handle grouping selectors if ',' exists, ie: p,a 2086 parsing the document.
1293 if ',' in selector:
1294 context = []
1295 for partial_selector in selector.split(','):
1296 partial_selector = partial_selector.strip()
1297 if partial_selector == '':
1298 raise ValueError('Invalid group selection syntax: %s' % selector)
1299 candidates = self.select(partial_selector, limit=limit)
1300 for candidate in candidates:
1301 if candidate not in context:
1302 context.append(candidate)
1303
1304 if limit and len(context) >= limit:
1305 break
1306 return context
1307 2087
1308 tokens = selector.split() 2088 :param kwargs: Keyword arguments to be passed into Soup Sieve's
1309 current_context = [self] 2089 soupsieve.select() method.
1310 2090
1311 if tokens[-1] in self._selector_combinators: 2091 :return: A Tag.
1312 raise ValueError( 2092 :rtype: bs4.element.Tag
1313 'Final combinator "%s" is missing an argument.' % tokens[-1]) 2093 """
2094 return self.css.select_one(selector, namespaces, **kwargs)
1314 2095
1315 if self._select_debug: 2096 def select(self, selector, namespaces=None, limit=None, **kwargs):
1316 print('Running CSS selector "%s"' % selector) 2097 """Perform a CSS selection operation on the current element.
1317 2098
1318 for index, token in enumerate(tokens): 2099 This uses the SoupSieve library.
1319 new_context = []
1320 new_context_ids = set([])
1321 2100
1322 if tokens[index-1] in self._selector_combinators: 2101 :param selector: A string containing a CSS selector.
1323 # This token was consumed by the previous combinator. Skip it.
1324 if self._select_debug:
1325 print(' Token was consumed by the previous combinator.')
1326 continue
1327 2102
1328 if self._select_debug: 2103 :param namespaces: A dictionary mapping namespace prefixes
1329 print(' Considering token "%s"' % token) 2104 used in the CSS selector to namespace URIs. By default,
1330 recursive_candidate_generator = None 2105 Beautiful Soup will use the prefixes it encountered while
1331 tag_name = None 2106 parsing the document.
1332 2107
1333 # Each operation corresponds to a checker function, a rule 2108 :param limit: After finding this number of results, stop looking.
1334 # for determining whether a candidate matches the 2109
1335 # selector. Candidates are generated by the active 2110 :param kwargs: Keyword arguments to be passed into SoupSieve's
1336 # iterator. 2111 soupsieve.select() method.
1337 checker = None 2112
1338 2113 :return: A ResultSet of Tags.
1339 m = self.attribselect_re.match(token) 2114 :rtype: bs4.element.ResultSet
1340 if m is not None: 2115 """
1341 # Attribute selector 2116 return self.css.select(selector, namespaces, limit, **kwargs)
1342 tag_name, attribute, operator, value = m.groups() 2117
1343 checker = self._attribute_checker(operator, attribute, value) 2118 @property
1344 2119 def css(self):
1345 elif '#' in token: 2120 """Return an interface to the CSS selector API."""
1346 # ID selector 2121 return CSS(self)
1347 tag_name, tag_id = token.split('#', 1)
1348 def id_matches(tag):
1349 return tag.get('id', None) == tag_id
1350 checker = id_matches
1351
1352 elif '.' in token:
1353 # Class selector
1354 tag_name, klass = token.split('.', 1)
1355 classes = set(klass.split('.'))
1356 def classes_match(candidate):
1357 return classes.issubset(candidate.get('class', []))
1358 checker = classes_match
1359
1360 elif ':' in token:
1361 # Pseudo-class
1362 tag_name, pseudo = token.split(':', 1)
1363 if tag_name == '':
1364 raise ValueError(
1365 "A pseudo-class must be prefixed with a tag name.")
1366 pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1367 found = []
1368 if pseudo_attributes is None:
1369 pseudo_type = pseudo
1370 pseudo_value = None
1371 else:
1372 pseudo_type, pseudo_value = pseudo_attributes.groups()
1373 if pseudo_type == 'nth-of-type':
1374 try:
1375 pseudo_value = int(pseudo_value)
1376 except:
1377 raise NotImplementedError(
1378 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1379 if pseudo_value < 1:
1380 raise ValueError(
1381 'nth-of-type pseudo-class value must be at least 1.')
1382 class Counter(object):
1383 def __init__(self, destination):
1384 self.count = 0
1385 self.destination = destination
1386
1387 def nth_child_of_type(self, tag):
1388 self.count += 1
1389 if self.count == self.destination:
1390 return True
1391 if self.count > self.destination:
1392 # Stop the generator that's sending us
1393 # these things.
1394 raise StopIteration()
1395 return False
1396 checker = Counter(pseudo_value).nth_child_of_type
1397 else:
1398 raise NotImplementedError(
1399 'Only the following pseudo-classes are implemented: nth-of-type.')
1400
1401 elif token == '*':
1402 # Star selector -- matches everything
1403 pass
1404 elif token == '>':
1405 # Run the next token as a CSS selector against the
1406 # direct children of each tag in the current context.
1407 recursive_candidate_generator = lambda tag: tag.children
1408 elif token == '~':
1409 # Run the next token as a CSS selector against the
1410 # siblings of each tag in the current context.
1411 recursive_candidate_generator = lambda tag: tag.next_siblings
1412 elif token == '+':
1413 # For each tag in the current context, run the next
1414 # token as a CSS selector against the tag's next
1415 # sibling that's a tag.
1416 def next_tag_sibling(tag):
1417 yield tag.find_next_sibling(True)
1418 recursive_candidate_generator = next_tag_sibling
1419
1420 elif self.tag_name_re.match(token):
1421 # Just a tag name.
1422 tag_name = token
1423 else:
1424 raise ValueError(
1425 'Unsupported or invalid CSS selector: "%s"' % token)
1426 if recursive_candidate_generator:
1427 # This happens when the selector looks like "> foo".
1428 #
1429 # The generator calls select() recursively on every
1430 # member of the current context, passing in a different
1431 # candidate generator and a different selector.
1432 #
1433 # In the case of "> foo", the candidate generator is
1434 # one that yields a tag's direct children (">"), and
1435 # the selector is "foo".
1436 next_token = tokens[index+1]
1437 def recursive_select(tag):
1438 if self._select_debug:
1439 print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
1440 print('-' * 40)
1441 for i in tag.select(next_token, recursive_candidate_generator):
1442 if self._select_debug:
1443 print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
1444 yield i
1445 if self._select_debug:
1446 print('-' * 40)
1447 _use_candidate_generator = recursive_select
1448 elif _candidate_generator is None:
1449 # By default, a tag's candidates are all of its
1450 # children. If tag_name is defined, only yield tags
1451 # with that name.
1452 if self._select_debug:
1453 if tag_name:
1454 check = "[any]"
1455 else:
1456 check = tag_name
1457 print(' Default candidate generator, tag name="%s"' % check)
1458 if self._select_debug:
1459 # This is redundant with later code, but it stops
1460 # a bunch of bogus tags from cluttering up the
1461 # debug log.
1462 def default_candidate_generator(tag):
1463 for child in tag.descendants:
1464 if not isinstance(child, Tag):
1465 continue
1466 if tag_name and not child.name == tag_name:
1467 continue
1468 yield child
1469 _use_candidate_generator = default_candidate_generator
1470 else:
1471 _use_candidate_generator = lambda tag: tag.descendants
1472 else:
1473 _use_candidate_generator = _candidate_generator
1474
1475 count = 0
1476 for tag in current_context:
1477 if self._select_debug:
1478 print(" Running candidate generator on %s %s" % (
1479 tag.name, repr(tag.attrs)))
1480 for candidate in _use_candidate_generator(tag):
1481 if not isinstance(candidate, Tag):
1482 continue
1483 if tag_name and candidate.name != tag_name:
1484 continue
1485 if checker is not None:
1486 try:
1487 result = checker(candidate)
1488 except StopIteration:
1489 # The checker has decided we should no longer
1490 # run the generator.
1491 break
1492 if checker is None or result:
1493 if self._select_debug:
1494 print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
1495 if id(candidate) not in new_context_ids:
1496 # If a tag matches a selector more than once,
1497 # don't include it in the context more than once.
1498 new_context.append(candidate)
1499 new_context_ids.add(id(candidate))
1500 if limit and len(new_context) >= limit:
1501 break
1502 elif self._select_debug:
1503 print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
1504
1505
1506 current_context = new_context
1507
1508 if self._select_debug:
1509 print("Final verdict:")
1510 for i in current_context:
1511 print(" %s %s" % (i.name, i.attrs))
1512 return current_context
1513 2122
1514 # Old names for backwards compatibility 2123 # Old names for backwards compatibility
1515 def childGenerator(self): 2124 def childGenerator(self):
2125 """Deprecated generator."""
1516 return self.children 2126 return self.children
1517 2127
1518 def recursiveChildGenerator(self): 2128 def recursiveChildGenerator(self):
2129 """Deprecated generator."""
1519 return self.descendants 2130 return self.descendants
1520 2131
1521 def has_key(self, key): 2132 def has_key(self, key):
1522 """This was kind of misleading because has_key() (attributes) 2133 """Deprecated method. This was kind of misleading because has_key()
1523 was different from __in__ (contents). has_key() is gone in 2134 (attributes) was different from __in__ (contents).
1524 Python 3, anyway.""" 2135
1525 warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( 2136 has_key() is gone in Python 3, anyway.
1526 key)) 2137 """
2138 warnings.warn(
2139 'has_key is deprecated. Use has_attr(key) instead.',
2140 DeprecationWarning, stacklevel=2
2141 )
1527 return self.has_attr(key) 2142 return self.has_attr(key)
1528 2143
1529# Next, a couple classes to represent queries and their results. 2144# Next, a couple classes to represent queries and their results.
1530class SoupStrainer(object): 2145class SoupStrainer(object):
1531 """Encapsulates a number of ways of matching a markup element (tag or 2146 """Encapsulates a number of ways of matching a markup element (tag or
1532 text).""" 2147 string).
2148
2149 This is primarily used to underpin the find_* methods, but you can
2150 create one yourself and pass it in as `parse_only` to the
2151 `BeautifulSoup` constructor, to parse a subset of a large
2152 document.
2153 """
2154
2155 def __init__(self, name=None, attrs={}, string=None, **kwargs):
2156 """Constructor.
2157
2158 The SoupStrainer constructor takes the same arguments passed
2159 into the find_* methods. See the online documentation for
2160 detailed explanations.
2161
2162 :param name: A filter on tag name.
2163 :param attrs: A dictionary of filters on attribute values.
2164 :param string: A filter for a NavigableString with specific text.
2165 :kwargs: A dictionary of filters on attribute values.
2166 """
2167 if string is None and 'text' in kwargs:
2168 string = kwargs.pop('text')
2169 warnings.warn(
2170 "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
2171 DeprecationWarning, stacklevel=2
2172 )
1533 2173
1534 def __init__(self, name=None, attrs={}, text=None, **kwargs):
1535 self.name = self._normalize_search_value(name) 2174 self.name = self._normalize_search_value(name)
1536 if not isinstance(attrs, dict): 2175 if not isinstance(attrs, dict):
1537 # Treat a non-dict value for attrs as a search for the 'class' 2176 # Treat a non-dict value for attrs as a search for the 'class'
@@ -1556,12 +2195,15 @@ class SoupStrainer(object):
1556 normalized_attrs[key] = self._normalize_search_value(value) 2195 normalized_attrs[key] = self._normalize_search_value(value)
1557 2196
1558 self.attrs = normalized_attrs 2197 self.attrs = normalized_attrs
1559 self.text = self._normalize_search_value(text) 2198 self.string = self._normalize_search_value(string)
2199
2200 # DEPRECATED but just in case someone is checking this.
2201 self.text = self.string
1560 2202
1561 def _normalize_search_value(self, value): 2203 def _normalize_search_value(self, value):
1562 # Leave it alone if it's a Unicode string, a callable, a 2204 # Leave it alone if it's a Unicode string, a callable, a
1563 # regular expression, a boolean, or None. 2205 # regular expression, a boolean, or None.
1564 if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match') 2206 if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
1565 or isinstance(value, bool) or value is None): 2207 or isinstance(value, bool) or value is None):
1566 return value 2208 return value
1567 2209
@@ -1589,19 +2231,40 @@ class SoupStrainer(object):
1589 return str(str(value)) 2231 return str(str(value))
1590 2232
1591 def __str__(self): 2233 def __str__(self):
1592 if self.text: 2234 """A human-readable representation of this SoupStrainer."""
1593 return self.text 2235 if self.string:
2236 return self.string
1594 else: 2237 else:
1595 return "%s|%s" % (self.name, self.attrs) 2238 return "%s|%s" % (self.name, self.attrs)
1596 2239
1597 def search_tag(self, markup_name=None, markup_attrs={}): 2240 def search_tag(self, markup_name=None, markup_attrs={}):
2241 """Check whether a Tag with the given name and attributes would
2242 match this SoupStrainer.
2243
2244 Used prospectively to decide whether to even bother creating a Tag
2245 object.
2246
2247 :param markup_name: A tag name as found in some markup.
2248 :param markup_attrs: A dictionary of attributes as found in some markup.
2249
2250 :return: True if the prospective tag would match this SoupStrainer;
2251 False otherwise.
2252 """
1598 found = None 2253 found = None
1599 markup = None 2254 markup = None
1600 if isinstance(markup_name, Tag): 2255 if isinstance(markup_name, Tag):
1601 markup = markup_name 2256 markup = markup_name
1602 markup_attrs = markup 2257 markup_attrs = markup
2258
2259 if isinstance(self.name, str):
2260 # Optimization for a very common case where the user is
2261 # searching for a tag with one specific name, and we're
2262 # looking at a tag with a different name.
2263 if markup and not markup.prefix and self.name != markup.name:
2264 return False
2265
1603 call_function_with_tag_data = ( 2266 call_function_with_tag_data = (
1604 isinstance(self.name, collections.abc.Callable) 2267 isinstance(self.name, Callable)
1605 and not isinstance(markup_name, Tag)) 2268 and not isinstance(markup_name, Tag))
1606 2269
1607 if ((not self.name) 2270 if ((not self.name)
@@ -1630,13 +2293,22 @@ class SoupStrainer(object):
1630 found = markup 2293 found = markup
1631 else: 2294 else:
1632 found = markup_name 2295 found = markup_name
1633 if found and self.text and not self._matches(found.string, self.text): 2296 if found and self.string and not self._matches(found.string, self.string):
1634 found = None 2297 found = None
1635 return found 2298 return found
2299
2300 # For BS3 compatibility.
1636 searchTag = search_tag 2301 searchTag = search_tag
1637 2302
1638 def search(self, markup): 2303 def search(self, markup):
1639 # print 'looking for %s in %s' % (self, markup) 2304 """Find all items in `markup` that match this SoupStrainer.
2305
2306 Used by the core _find_all() method, which is ultimately
2307 called by all find_* methods.
2308
2309 :param markup: A PageElement or a list of them.
2310 """
2311 # print('looking for %s in %s' % (self, markup))
1640 found = None 2312 found = None
1641 # If given a list of items, scan it for a text element that 2313 # If given a list of items, scan it for a text element that
1642 # matches. 2314 # matches.
@@ -1649,49 +2321,44 @@ class SoupStrainer(object):
1649 # If it's a Tag, make sure its name or attributes match. 2321 # If it's a Tag, make sure its name or attributes match.
1650 # Don't bother with Tags if we're searching for text. 2322 # Don't bother with Tags if we're searching for text.
1651 elif isinstance(markup, Tag): 2323 elif isinstance(markup, Tag):
1652 if not self.text or self.name or self.attrs: 2324 if not self.string or self.name or self.attrs:
1653 found = self.search_tag(markup) 2325 found = self.search_tag(markup)
1654 # If it's text, make sure the text matches. 2326 # If it's text, make sure the text matches.
1655 elif isinstance(markup, NavigableString) or \ 2327 elif isinstance(markup, NavigableString) or \
1656 isinstance(markup, str): 2328 isinstance(markup, str):
1657 if not self.name and not self.attrs and self._matches(markup, self.text): 2329 if not self.name and not self.attrs and self._matches(markup, self.string):
1658 found = markup 2330 found = markup
1659 else: 2331 else:
1660 raise Exception( 2332 raise Exception(
1661 "I don't know how to match against a %s" % markup.__class__) 2333 "I don't know how to match against a %s" % markup.__class__)
1662 return found 2334 return found
1663 2335
1664 def _matches(self, markup, match_against): 2336 def _matches(self, markup, match_against, already_tried=None):
1665 # print u"Matching %s against %s" % (markup, match_against) 2337 # print(u"Matching %s against %s" % (markup, match_against))
1666 result = False 2338 result = False
1667 if isinstance(markup, list) or isinstance(markup, tuple): 2339 if isinstance(markup, list) or isinstance(markup, tuple):
1668 # This should only happen when searching a multi-valued attribute 2340 # This should only happen when searching a multi-valued attribute
1669 # like 'class'. 2341 # like 'class'.
1670 if (isinstance(match_against, str) 2342 for item in markup:
1671 and ' ' in match_against): 2343 if self._matches(item, match_against):
1672 # A bit of a special case. If they try to match "foo 2344 return True
1673 # bar" on a multivalue attribute's value, only accept 2345 # We didn't match any particular value of the multivalue
1674 # the literal value "foo bar" 2346 # attribute, but maybe we match the attribute value when
1675 # 2347 # considered as a string.
1676 # XXX This is going to be pretty slow because we keep 2348 if self._matches(' '.join(markup), match_against):
1677 # splitting match_against. But it shouldn't come up 2349 return True
1678 # too often. 2350 return False
1679 return (whitespace_re.split(match_against) == markup)
1680 else:
1681 for item in markup:
1682 if self._matches(item, match_against):
1683 return True
1684 return False
1685 2351
1686 if match_against is True: 2352 if match_against is True:
1687 # True matches any non-None value. 2353 # True matches any non-None value.
1688 return markup is not None 2354 return markup is not None
1689 2355
1690 if isinstance(match_against, collections.abc.Callable): 2356 if isinstance(match_against, Callable):
1691 return match_against(markup) 2357 return match_against(markup)
1692 2358
1693 # Custom callables take the tag as an argument, but all 2359 # Custom callables take the tag as an argument, but all
1694 # other ways of matching match the tag name as a string. 2360 # other ways of matching match the tag name as a string.
2361 original_markup = markup
1695 if isinstance(markup, Tag): 2362 if isinstance(markup, Tag):
1696 markup = markup.name 2363 markup = markup.name
1697 2364
@@ -1702,23 +2369,67 @@ class SoupStrainer(object):
1702 # None matches None, False, an empty string, an empty list, and so on. 2369 # None matches None, False, an empty string, an empty list, and so on.
1703 return not match_against 2370 return not match_against
1704 2371
1705 if isinstance(match_against, str): 2372 if (hasattr(match_against, '__iter__')
2373 and not isinstance(match_against, str)):
2374 # We're asked to match against an iterable of items.
2375 # The markup must match at least one item in the
2376 # iterable. We'll try each one in turn.
2377 #
2378 # To avoid infinite recursion we need to keep track of
2379 # items we've already seen.
2380 if not already_tried:
2381 already_tried = set()
2382 for item in match_against:
2383 if item.__hash__:
2384 key = item
2385 else:
2386 key = id(item)
2387 if key in already_tried:
2388 continue
2389 else:
2390 already_tried.add(key)
2391 if self._matches(original_markup, item, already_tried):
2392 return True
2393 else:
2394 return False
2395
2396 # Beyond this point we might need to run the test twice: once against
2397 # the tag's name and once against its prefixed name.
2398 match = False
2399
2400 if not match and isinstance(match_against, str):
1706 # Exact string match 2401 # Exact string match
1707 return markup == match_against 2402 match = markup == match_against
1708 2403
1709 if hasattr(match_against, 'match'): 2404 if not match and hasattr(match_against, 'search'):
1710 # Regexp match 2405 # Regexp match
1711 return match_against.search(markup) 2406 return match_against.search(markup)
1712 2407
1713 if hasattr(match_against, '__iter__'): 2408 if (not match
1714 # The markup must be an exact match against something 2409 and isinstance(original_markup, Tag)
1715 # in the iterable. 2410 and original_markup.prefix):
1716 return markup in match_against 2411 # Try the whole thing again with the prefixed tag name.
2412 return self._matches(
2413 original_markup.prefix + ':' + original_markup.name, match_against
2414 )
2415
2416 return match
1717 2417
1718 2418
1719class ResultSet(list): 2419class ResultSet(list):
1720 """A ResultSet is just a list that keeps track of the SoupStrainer 2420 """A ResultSet is just a list that keeps track of the SoupStrainer
1721 that created it.""" 2421 that created it."""
1722 def __init__(self, source, result=()): 2422 def __init__(self, source, result=()):
2423 """Constructor.
2424
2425 :param source: A SoupStrainer.
2426 :param result: A list of PageElements.
2427 """
1723 super(ResultSet, self).__init__(result) 2428 super(ResultSet, self).__init__(result)
1724 self.source = source 2429 self.source = source
2430
2431 def __getattr__(self, key):
2432 """Raise a helpful exception to explain a common code fix."""
2433 raise AttributeError(
2434 "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
2435 )