summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/__init__.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/__init__.py')
-rw-r--r--bitbake/lib/bs4/__init__.py680
1 files changed, 526 insertions, 154 deletions
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py
index e35725b86e..d8ad5e1dc1 100644
--- a/bitbake/lib/bs4/__init__.py
+++ b/bitbake/lib/bs4/__init__.py
@@ -1,65 +1,99 @@
1"""Beautiful Soup 1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
2Elixir and Tonic 2
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/ 3http://www.crummy.com/software/BeautifulSoup/
5 4
6Beautiful Soup uses a pluggable XML or HTML parser to parse a 5Beautiful Soup uses a pluggable XML or HTML parser to parse a
7(possibly invalid) document into a tree representation. Beautiful Soup 6(possibly invalid) document into a tree representation. Beautiful Soup
8provides provides methods and Pythonic idioms that make it easy to 7provides methods and Pythonic idioms that make it easy to navigate,
9navigate, search, and modify the parse tree. 8search, and modify the parse tree.
10 9
11Beautiful Soup works with Python 2.6 and up. It works better if lxml 10Beautiful Soup works with Python 3.6 and up. It works better if lxml
12and/or html5lib is installed. 11and/or html5lib is installed.
13 12
14For more than you ever wanted to know about Beautiful Soup, see the 13For more than you ever wanted to know about Beautiful Soup, see the
15documentation: 14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
16http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17""" 15"""
18 16
19__author__ = "Leonard Richardson (leonardr@segfault.org)" 17__author__ = "Leonard Richardson (leonardr@segfault.org)"
20__version__ = "4.4.1" 18__version__ = "4.12.3"
21__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" 19__copyright__ = "Copyright (c) 2004-2024 Leonard Richardson"
20# Use of this source code is governed by the MIT license.
22__license__ = "MIT" 21__license__ = "MIT"
23 22
24__all__ = ['BeautifulSoup'] 23__all__ = ['BeautifulSoup']
25 24
25from collections import Counter
26import os 26import os
27import re 27import re
28import sys
29import traceback
28import warnings 30import warnings
29 31
30from .builder import builder_registry, ParserRejectedMarkup 32# The very first thing we do is give a useful error if someone is
33# running this code under Python 2.
34if sys.version_info.major < 3:
35 raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
36
37from .builder import (
38 builder_registry,
39 ParserRejectedMarkup,
40 XMLParsedAsHTMLWarning,
41 HTMLParserTreeBuilder
42)
31from .dammit import UnicodeDammit 43from .dammit import UnicodeDammit
32from .element import ( 44from .element import (
33 CData, 45 CData,
34 Comment, 46 Comment,
47 CSS,
35 DEFAULT_OUTPUT_ENCODING, 48 DEFAULT_OUTPUT_ENCODING,
36 Declaration, 49 Declaration,
37 Doctype, 50 Doctype,
38 NavigableString, 51 NavigableString,
39 PageElement, 52 PageElement,
40 ProcessingInstruction, 53 ProcessingInstruction,
54 PYTHON_SPECIFIC_ENCODINGS,
41 ResultSet, 55 ResultSet,
56 Script,
57 Stylesheet,
42 SoupStrainer, 58 SoupStrainer,
43 Tag, 59 Tag,
60 TemplateString,
44 ) 61 )
45 62
46# The very first thing we do is give a useful error if someone is 63# Define some custom warnings.
47# running this code under Python 3 without converting it. 64class GuessedAtParserWarning(UserWarning):
48'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 65 """The warning issued when BeautifulSoup has to guess what parser to
66 use -- probably because no parser was specified in the constructor.
67 """
49 68
50class BeautifulSoup(Tag): 69class MarkupResemblesLocatorWarning(UserWarning):
70 """The warning issued when BeautifulSoup is given 'markup' that
71 actually looks like a resource locator -- a URL or a path to a file
72 on disk.
51 """ 73 """
52 This class defines the basic interface called by the tree builders.
53 74
54 These methods will be called by the parser: 75
55 reset() 76class BeautifulSoup(Tag):
56 feed(markup) 77 """A data structure representing a parsed HTML or XML document.
78
79 Most of the methods you'll call on a BeautifulSoup object are inherited from
80 PageElement or Tag.
81
82 Internally, this class defines the basic interface called by the
83 tree builders when converting an HTML/XML document into a data
84 structure. The interface abstracts away the differences between
85 parsers. To write a new tree builder, you'll need to understand
86 these methods as a whole.
87
88 These methods will be called by the BeautifulSoup constructor:
89 * reset()
90 * feed(markup)
57 91
58 The tree builder may call these methods from its feed() implementation: 92 The tree builder may call these methods from its feed() implementation:
59 handle_starttag(name, attrs) # See note about return value 93 * handle_starttag(name, attrs) # See note about return value
60 handle_endtag(name) 94 * handle_endtag(name)
61 handle_data(data) # Appends to the current data node 95 * handle_data(data) # Appends to the current data node
62 endData(containerClass=NavigableString) # Ends the current data node 96 * endData(containerClass) # Ends the current data node
63 97
64 No matter how complicated the underlying parser is, you should be 98 No matter how complicated the underlying parser is, you should be
65 able to build a tree using 'start tag' events, 'end tag' events, 99 able to build a tree using 'start tag' events, 'end tag' events,
@@ -69,24 +103,77 @@ class BeautifulSoup(Tag):
69 like HTML's <br> tag), call handle_starttag and then 103 like HTML's <br> tag), call handle_starttag and then
70 handle_endtag. 104 handle_endtag.
71 """ 105 """
106
107 # Since BeautifulSoup subclasses Tag, it's possible to treat it as
108 # a Tag with a .name. This name makes it clear the BeautifulSoup
109 # object isn't a real markup tag.
72 ROOT_TAG_NAME = '[document]' 110 ROOT_TAG_NAME = '[document]'
73 111
74 # If the end-user gives no indication which tree builder they 112 # If the end-user gives no indication which tree builder they
75 # want, look for one with these features. 113 # want, look for one with these features.
76 DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 114 DEFAULT_BUILDER_FEATURES = ['html', 'fast']
77 115
116 # A string containing all ASCII whitespace characters, used in
117 # endData() to detect data chunks that seem 'empty'.
78 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 118 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
79 119
80 NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" 120 NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
81 121
82 def __init__(self, markup="", features=None, builder=None, 122 def __init__(self, markup="", features=None, builder=None,
83 parse_only=None, from_encoding=None, exclude_encodings=None, 123 parse_only=None, from_encoding=None, exclude_encodings=None,
84 **kwargs): 124 element_classes=None, **kwargs):
85 """The Soup object is initialized as the 'root tag', and the 125 """Constructor.
86 provided markup (which can be a string or a file-like object) 126
87 is fed into the underlying parser.""" 127 :param markup: A string or a file-like object representing
88 128 markup to be parsed.
129
130 :param features: Desirable features of the parser to be
131 used. This may be the name of a specific parser ("lxml",
132 "lxml-xml", "html.parser", or "html5lib") or it may be the
133 type of markup to be used ("html", "html5", "xml"). It's
134 recommended that you name a specific parser, so that
135 Beautiful Soup gives you the same results across platforms
136 and virtual environments.
137
138 :param builder: A TreeBuilder subclass to instantiate (or
139 instance to use) instead of looking one up based on
140 `features`. You only need to use this if you've implemented a
141 custom TreeBuilder.
142
143 :param parse_only: A SoupStrainer. Only parts of the document
144 matching the SoupStrainer will be considered. This is useful
145 when parsing part of a document that would otherwise be too
146 large to fit into memory.
147
148 :param from_encoding: A string indicating the encoding of the
149 document to be parsed. Pass this in if Beautiful Soup is
150 guessing wrongly about the document's encoding.
151
152 :param exclude_encodings: A list of strings indicating
153 encodings known to be wrong. Pass this in if you don't know
154 the document's encoding but you know Beautiful Soup's guess is
155 wrong.
156
157 :param element_classes: A dictionary mapping BeautifulSoup
158 classes like Tag and NavigableString, to other classes you'd
159 like to be instantiated instead as the parse tree is
160 built. This is useful for subclassing Tag or NavigableString
161 to modify default behavior.
162
163 :param kwargs: For backwards compatibility purposes, the
164 constructor accepts certain keyword arguments used in
165 Beautiful Soup 3. None of these arguments do anything in
166 Beautiful Soup 4; they will result in a warning and then be
167 ignored.
168
169 Apart from this, any keyword arguments passed into the
170 BeautifulSoup constructor are propagated to the TreeBuilder
171 constructor. This makes it possible to configure a
172 TreeBuilder by passing in arguments, not just by saying which
173 one to use.
174 """
89 if 'convertEntities' in kwargs: 175 if 'convertEntities' in kwargs:
176 del kwargs['convertEntities']
90 warnings.warn( 177 warnings.warn(
91 "BS4 does not respect the convertEntities argument to the " 178 "BS4 does not respect the convertEntities argument to the "
92 "BeautifulSoup constructor. Entities are always converted " 179 "BeautifulSoup constructor. Entities are always converted "
@@ -125,10 +212,10 @@ class BeautifulSoup(Tag):
125 if old_name in kwargs: 212 if old_name in kwargs:
126 warnings.warn( 213 warnings.warn(
127 'The "%s" argument to the BeautifulSoup constructor ' 214 'The "%s" argument to the BeautifulSoup constructor '
128 'has been renamed to "%s."' % (old_name, new_name)) 215 'has been renamed to "%s."' % (old_name, new_name),
129 value = kwargs[old_name] 216 DeprecationWarning, stacklevel=3
130 del kwargs[old_name] 217 )
131 return value 218 return kwargs.pop(old_name)
132 return None 219 return None
133 220
134 parse_only = parse_only or deprecated_argument( 221 parse_only = parse_only or deprecated_argument(
@@ -137,13 +224,23 @@ class BeautifulSoup(Tag):
137 from_encoding = from_encoding or deprecated_argument( 224 from_encoding = from_encoding or deprecated_argument(
138 "fromEncoding", "from_encoding") 225 "fromEncoding", "from_encoding")
139 226
140 if len(kwargs) > 0: 227 if from_encoding and isinstance(markup, str):
141 arg = list(kwargs.keys()).pop() 228 warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
142 raise TypeError( 229 from_encoding = None
143 "__init__() got an unexpected keyword argument '%s'" % arg) 230
144 231 self.element_classes = element_classes or dict()
145 if builder is None: 232
146 original_features = features 233 # We need this information to track whether or not the builder
234 # was specified well enough that we can omit the 'you need to
235 # specify a parser' warning.
236 original_builder = builder
237 original_features = features
238
239 if isinstance(builder, type):
240 # A builder class was passed in; it needs to be instantiated.
241 builder_class = builder
242 builder = None
243 elif builder is None:
147 if isinstance(features, str): 244 if isinstance(features, str):
148 features = [features] 245 features = [features]
149 if features is None or len(features) == 0: 246 if features is None or len(features) == 0:
@@ -154,85 +251,227 @@ class BeautifulSoup(Tag):
154 "Couldn't find a tree builder with the features you " 251 "Couldn't find a tree builder with the features you "
155 "requested: %s. Do you need to install a parser library?" 252 "requested: %s. Do you need to install a parser library?"
156 % ",".join(features)) 253 % ",".join(features))
157 builder = builder_class() 254
158 if not (original_features == builder.NAME or 255 # At this point either we have a TreeBuilder instance in
159 original_features in builder.ALTERNATE_NAMES): 256 # builder, or we have a builder_class that we can instantiate
257 # with the remaining **kwargs.
258 if builder is None:
259 builder = builder_class(**kwargs)
260 if not original_builder and not (
261 original_features == builder.NAME or
262 original_features in builder.ALTERNATE_NAMES
263 ) and markup:
264 # The user did not tell us which TreeBuilder to use,
265 # and we had to guess. Issue a warning.
160 if builder.is_xml: 266 if builder.is_xml:
161 markup_type = "XML" 267 markup_type = "XML"
162 else: 268 else:
163 markup_type = "HTML" 269 markup_type = "HTML"
164 warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
165 parser=builder.NAME,
166 markup_type=markup_type))
167 270
271 # This code adapted from warnings.py so that we get the same line
272 # of code as our warnings.warn() call gets, even if the answer is wrong
273 # (as it may be in a multithreading situation).
274 caller = None
275 try:
276 caller = sys._getframe(1)
277 except ValueError:
278 pass
279 if caller:
280 globals = caller.f_globals
281 line_number = caller.f_lineno
282 else:
283 globals = sys.__dict__
284 line_number= 1
285 filename = globals.get('__file__')
286 if filename:
287 fnl = filename.lower()
288 if fnl.endswith((".pyc", ".pyo")):
289 filename = filename[:-1]
290 if filename:
291 # If there is no filename at all, the user is most likely in a REPL,
292 # and the warning is not necessary.
293 values = dict(
294 filename=filename,
295 line_number=line_number,
296 parser=builder.NAME,
297 markup_type=markup_type
298 )
299 warnings.warn(
300 self.NO_PARSER_SPECIFIED_WARNING % values,
301 GuessedAtParserWarning, stacklevel=2
302 )
303 else:
304 if kwargs:
305 warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
306
168 self.builder = builder 307 self.builder = builder
169 self.is_xml = builder.is_xml 308 self.is_xml = builder.is_xml
170 self.builder.soup = self 309 self.known_xml = self.is_xml
171 310 self._namespaces = dict()
172 self.parse_only = parse_only 311 self.parse_only = parse_only
173 312
174 if hasattr(markup, 'read'): # It's a file-type object. 313 if hasattr(markup, 'read'): # It's a file-type object.
175 markup = markup.read() 314 markup = markup.read()
176 elif len(markup) <= 256: 315 elif len(markup) <= 256 and (
177 # Print out warnings for a couple beginner problems 316 (isinstance(markup, bytes) and not b'<' in markup)
317 or (isinstance(markup, str) and not '<' in markup)
318 ):
319 # Issue warnings for a couple beginner problems
178 # involving passing non-markup to Beautiful Soup. 320 # involving passing non-markup to Beautiful Soup.
179 # Beautiful Soup will still parse the input as markup, 321 # Beautiful Soup will still parse the input as markup,
180 # just in case that's what the user really wants. 322 # since that is sometimes the intended behavior.
181 if (isinstance(markup, str) 323 if not self._markup_is_url(markup):
182 and not os.path.supports_unicode_filenames): 324 self._markup_resembles_filename(markup)
183 possible_filename = markup.encode("utf8")
184 else:
185 possible_filename = markup
186 is_file = False
187 try:
188 is_file = os.path.exists(possible_filename)
189 except Exception as e:
190 # This is almost certainly a problem involving
191 # characters not valid in filenames on this
192 # system. Just let it go.
193 pass
194 if is_file:
195 if isinstance(markup, str):
196 markup = markup.encode("utf8")
197 warnings.warn(
198 '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
199 if markup[:5] == "http:" or markup[:6] == "https:":
200 # TODO: This is ugly but I couldn't get it to work in
201 # Python 3 otherwise.
202 if ((isinstance(markup, bytes) and not b' ' in markup)
203 or (isinstance(markup, str) and not ' ' in markup)):
204 if isinstance(markup, str):
205 markup = markup.encode("utf8")
206 warnings.warn(
207 '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
208 325
326 rejections = []
327 success = False
209 for (self.markup, self.original_encoding, self.declared_html_encoding, 328 for (self.markup, self.original_encoding, self.declared_html_encoding,
210 self.contains_replacement_characters) in ( 329 self.contains_replacement_characters) in (
211 self.builder.prepare_markup( 330 self.builder.prepare_markup(
212 markup, from_encoding, exclude_encodings=exclude_encodings)): 331 markup, from_encoding, exclude_encodings=exclude_encodings)):
213 self.reset() 332 self.reset()
333 self.builder.initialize_soup(self)
214 try: 334 try:
215 self._feed() 335 self._feed()
336 success = True
216 break 337 break
217 except ParserRejectedMarkup: 338 except ParserRejectedMarkup as e:
339 rejections.append(e)
218 pass 340 pass
219 341
342 if not success:
343 other_exceptions = [str(e) for e in rejections]
344 raise ParserRejectedMarkup(
345 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
346 )
347
220 # Clear out the markup and remove the builder's circular 348 # Clear out the markup and remove the builder's circular
221 # reference to this object. 349 # reference to this object.
222 self.markup = None 350 self.markup = None
223 self.builder.soup = None 351 self.builder.soup = None
224 352
225 def __copy__(self): 353 def _clone(self):
226 return type(self)(self.encode(), builder=self.builder) 354 """Create a new BeautifulSoup object with the same TreeBuilder,
355 but not associated with any markup.
356
357 This is the first step of the deepcopy process.
358 """
359 clone = type(self)("", None, self.builder)
227 360
361 # Keep track of the encoding of the original document,
362 # since we won't be parsing it again.
363 clone.original_encoding = self.original_encoding
364 return clone
365
228 def __getstate__(self): 366 def __getstate__(self):
229 # Frequently a tree builder can't be pickled. 367 # Frequently a tree builder can't be pickled.
230 d = dict(self.__dict__) 368 d = dict(self.__dict__)
231 if 'builder' in d and not self.builder.picklable: 369 if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
232 del d['builder'] 370 d['builder'] = type(self.builder)
371 # Store the contents as a Unicode string.
372 d['contents'] = []
373 d['markup'] = self.decode()
374
375 # If _most_recent_element is present, it's a Tag object left
376 # over from initial parse. It might not be picklable and we
377 # don't need it.
378 if '_most_recent_element' in d:
379 del d['_most_recent_element']
233 return d 380 return d
234 381
382 def __setstate__(self, state):
383 # If necessary, restore the TreeBuilder by looking it up.
384 self.__dict__ = state
385 if isinstance(self.builder, type):
386 self.builder = self.builder()
387 elif not self.builder:
388 # We don't know which builder was used to build this
389 # parse tree, so use a default we know is always available.
390 self.builder = HTMLParserTreeBuilder()
391 self.builder.soup = self
392 self.reset()
393 self._feed()
394 return state
395
396
397 @classmethod
398 def _decode_markup(cls, markup):
399 """Ensure `markup` is bytes so it's safe to send into warnings.warn.
400
401 TODO: warnings.warn had this problem back in 2010 but it might not
402 anymore.
403 """
404 if isinstance(markup, bytes):
405 decoded = markup.decode('utf-8', 'replace')
406 else:
407 decoded = markup
408 return decoded
409
410 @classmethod
411 def _markup_is_url(cls, markup):
412 """Error-handling method to raise a warning if incoming markup looks
413 like a URL.
414
415 :param markup: A string.
416 :return: Whether or not the markup resembles a URL
417 closely enough to justify a warning.
418 """
419 if isinstance(markup, bytes):
420 space = b' '
421 cant_start_with = (b"http:", b"https:")
422 elif isinstance(markup, str):
423 space = ' '
424 cant_start_with = ("http:", "https:")
425 else:
426 return False
427
428 if any(markup.startswith(prefix) for prefix in cant_start_with):
429 if not space in markup:
430 warnings.warn(
431 'The input looks more like a URL than markup. You may want to use'
432 ' an HTTP client like requests to get the document behind'
433 ' the URL, and feed that document to Beautiful Soup.',
434 MarkupResemblesLocatorWarning,
435 stacklevel=3
436 )
437 return True
438 return False
439
440 @classmethod
441 def _markup_resembles_filename(cls, markup):
442 """Error-handling method to raise a warning if incoming markup
443 resembles a filename.
444
445 :param markup: A bytestring or string.
446 :return: Whether or not the markup resembles a filename
447 closely enough to justify a warning.
448 """
449 path_characters = '/\\'
450 extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
451 if isinstance(markup, bytes):
452 path_characters = path_characters.encode("utf8")
453 extensions = [x.encode('utf8') for x in extensions]
454 filelike = False
455 if any(x in markup for x in path_characters):
456 filelike = True
457 else:
458 lower = markup.lower()
459 if any(lower.endswith(ext) for ext in extensions):
460 filelike = True
461 if filelike:
462 warnings.warn(
463 'The input looks more like a filename than markup. You may'
464 ' want to open this file and pass the filehandle into'
465 ' Beautiful Soup.',
466 MarkupResemblesLocatorWarning, stacklevel=3
467 )
468 return True
469 return False
470
235 def _feed(self): 471 def _feed(self):
472 """Internal method that parses previously set markup, creating a large
473 number of Tag and NavigableString objects.
474 """
236 # Convert the document to Unicode. 475 # Convert the document to Unicode.
237 self.builder.reset() 476 self.builder.reset()
238 477
@@ -243,48 +482,111 @@ class BeautifulSoup(Tag):
243 self.popTag() 482 self.popTag()
244 483
245 def reset(self): 484 def reset(self):
485 """Reset this object to a state as though it had never parsed any
486 markup.
487 """
246 Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 488 Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
247 self.hidden = 1 489 self.hidden = 1
248 self.builder.reset() 490 self.builder.reset()
249 self.current_data = [] 491 self.current_data = []
250 self.currentTag = None 492 self.currentTag = None
251 self.tagStack = [] 493 self.tagStack = []
494 self.open_tag_counter = Counter()
252 self.preserve_whitespace_tag_stack = [] 495 self.preserve_whitespace_tag_stack = []
496 self.string_container_stack = []
497 self._most_recent_element = None
253 self.pushTag(self) 498 self.pushTag(self)
254 499
255 def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 500 def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
256 """Create a new tag associated with this soup.""" 501 sourceline=None, sourcepos=None, **kwattrs):
257 return Tag(None, self.builder, name, namespace, nsprefix, attrs) 502 """Create a new Tag associated with this BeautifulSoup object.
503
504 :param name: The name of the new Tag.
505 :param namespace: The URI of the new Tag's XML namespace, if any.
506 :param prefix: The prefix for the new Tag's XML namespace, if any.
507 :param attrs: A dictionary of this Tag's attribute values; can
508 be used instead of `kwattrs` for attributes like 'class'
509 that are reserved words in Python.
510 :param sourceline: The line number where this tag was
511 (purportedly) found in its source document.
512 :param sourcepos: The character position within `sourceline` where this
513 tag was (purportedly) found.
514 :param kwattrs: Keyword arguments for the new Tag's attribute values.
258 515
259 def new_string(self, s, subclass=NavigableString): 516 """
260 """Create a new NavigableString associated with this soup.""" 517 kwattrs.update(attrs)
261 return subclass(s) 518 return self.element_classes.get(Tag, Tag)(
519 None, self.builder, name, namespace, nsprefix, kwattrs,
520 sourceline=sourceline, sourcepos=sourcepos
521 )
522
523 def string_container(self, base_class=None):
524 container = base_class or NavigableString
525
526 # There may be a general override of NavigableString.
527 container = self.element_classes.get(
528 container, container
529 )
530
531 # On top of that, we may be inside a tag that needs a special
532 # container class.
533 if self.string_container_stack and container is NavigableString:
534 container = self.builder.string_containers.get(
535 self.string_container_stack[-1].name, container
536 )
537 return container
538
539 def new_string(self, s, subclass=None):
540 """Create a new NavigableString associated with this BeautifulSoup
541 object.
542 """
543 container = self.string_container(subclass)
544 return container(s)
262 545
263 def insert_before(self, successor): 546 def insert_before(self, *args):
547 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
548 it because there is nothing before or after it in the parse tree.
549 """
264 raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 550 raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
265 551
266 def insert_after(self, successor): 552 def insert_after(self, *args):
553 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
554 it because there is nothing before or after it in the parse tree.
555 """
267 raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 556 raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
268 557
269 def popTag(self): 558 def popTag(self):
559 """Internal method called by _popToTag when a tag is closed."""
270 tag = self.tagStack.pop() 560 tag = self.tagStack.pop()
561 if tag.name in self.open_tag_counter:
562 self.open_tag_counter[tag.name] -= 1
271 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 563 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
272 self.preserve_whitespace_tag_stack.pop() 564 self.preserve_whitespace_tag_stack.pop()
273 #print "Pop", tag.name 565 if self.string_container_stack and tag == self.string_container_stack[-1]:
566 self.string_container_stack.pop()
567 #print("Pop", tag.name)
274 if self.tagStack: 568 if self.tagStack:
275 self.currentTag = self.tagStack[-1] 569 self.currentTag = self.tagStack[-1]
276 return self.currentTag 570 return self.currentTag
277 571
278 def pushTag(self, tag): 572 def pushTag(self, tag):
279 #print "Push", tag.name 573 """Internal method called by handle_starttag when a tag is opened."""
280 if self.currentTag: 574 #print("Push", tag.name)
575 if self.currentTag is not None:
281 self.currentTag.contents.append(tag) 576 self.currentTag.contents.append(tag)
282 self.tagStack.append(tag) 577 self.tagStack.append(tag)
283 self.currentTag = self.tagStack[-1] 578 self.currentTag = self.tagStack[-1]
579 if tag.name != self.ROOT_TAG_NAME:
580 self.open_tag_counter[tag.name] += 1
284 if tag.name in self.builder.preserve_whitespace_tags: 581 if tag.name in self.builder.preserve_whitespace_tags:
285 self.preserve_whitespace_tag_stack.append(tag) 582 self.preserve_whitespace_tag_stack.append(tag)
583 if tag.name in self.builder.string_containers:
584 self.string_container_stack.append(tag)
286 585
287 def endData(self, containerClass=NavigableString): 586 def endData(self, containerClass=None):
587 """Method called by the TreeBuilder when the end of a data segment
588 occurs.
589 """
288 if self.current_data: 590 if self.current_data:
289 current_data = ''.join(self.current_data) 591 current_data = ''.join(self.current_data)
290 # If whitespace is not preserved, and this string contains 592 # If whitespace is not preserved, and this string contains
@@ -311,61 +613,93 @@ class BeautifulSoup(Tag):
311 not self.parse_only.search(current_data)): 613 not self.parse_only.search(current_data)):
312 return 614 return
313 615
616 containerClass = self.string_container(containerClass)
314 o = containerClass(current_data) 617 o = containerClass(current_data)
315 self.object_was_parsed(o) 618 self.object_was_parsed(o)
316 619
317 def object_was_parsed(self, o, parent=None, most_recent_element=None): 620 def object_was_parsed(self, o, parent=None, most_recent_element=None):
318 """Add an object to the parse tree.""" 621 """Method called by the TreeBuilder to integrate an object into the parse tree."""
319 parent = parent or self.currentTag 622 if parent is None:
320 previous_element = most_recent_element or self._most_recent_element 623 parent = self.currentTag
624 if most_recent_element is not None:
625 previous_element = most_recent_element
626 else:
627 previous_element = self._most_recent_element
321 628
322 next_element = previous_sibling = next_sibling = None 629 next_element = previous_sibling = next_sibling = None
323 if isinstance(o, Tag): 630 if isinstance(o, Tag):
324 next_element = o.next_element 631 next_element = o.next_element
325 next_sibling = o.next_sibling 632 next_sibling = o.next_sibling
326 previous_sibling = o.previous_sibling 633 previous_sibling = o.previous_sibling
327 if not previous_element: 634 if previous_element is None:
328 previous_element = o.previous_element 635 previous_element = o.previous_element
329 636
637 fix = parent.next_element is not None
638
330 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) 639 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
331 640
332 self._most_recent_element = o 641 self._most_recent_element = o
333 parent.contents.append(o) 642 parent.contents.append(o)
334 643
335 if parent.next_sibling: 644 # Check if we are inserting into an already parsed node.
336 # This node is being inserted into an element that has 645 if fix:
337 # already been parsed. Deal with any dangling references. 646 self._linkage_fixer(parent)
338 index = parent.contents.index(o) 647
339 if index == 0: 648 def _linkage_fixer(self, el):
340 previous_element = parent 649 """Make sure linkage of this fragment is sound."""
341 previous_sibling = None 650
342 else: 651 first = el.contents[0]
343 previous_element = previous_sibling = parent.contents[index-1] 652 child = el.contents[-1]
344 if index == len(parent.contents)-1: 653 descendant = child
345 next_element = parent.next_sibling 654
346 next_sibling = None 655 if child is first and el.parent is not None:
347 else: 656 # Parent should be linked to first child
348 next_element = next_sibling = parent.contents[index+1] 657 el.next_element = child
349 658 # We are no longer linked to whatever this element is
350 o.previous_element = previous_element 659 prev_el = child.previous_element
351 if previous_element: 660 if prev_el is not None and prev_el is not el:
352 previous_element.next_element = o 661 prev_el.next_element = None
353 o.next_element = next_element 662 # First child should be linked to the parent, and no previous siblings.
354 if next_element: 663 child.previous_element = el
355 next_element.previous_element = o 664 child.previous_sibling = None
356 o.next_sibling = next_sibling 665
357 if next_sibling: 666 # We have no sibling as we've been appended as the last.
358 next_sibling.previous_sibling = o 667 child.next_sibling = None
359 o.previous_sibling = previous_sibling 668
360 if previous_sibling: 669 # This index is a tag, dig deeper for a "last descendant"
361 previous_sibling.next_sibling = o 670 if isinstance(child, Tag) and child.contents:
671 descendant = child._last_descendant(False)
672
673 # As the final step, link last descendant. It should be linked
674 # to the parent's next sibling (if found), else walk up the chain
675 # and find a parent with a sibling. It should have no next sibling.
676 descendant.next_element = None
677 descendant.next_sibling = None
678 target = el
679 while True:
680 if target is None:
681 break
682 elif target.next_sibling is not None:
683 descendant.next_element = target.next_sibling
684 target.next_sibling.previous_element = child
685 break
686 target = target.parent
362 687
363 def _popToTag(self, name, nsprefix=None, inclusivePop=True): 688 def _popToTag(self, name, nsprefix=None, inclusivePop=True):
364 """Pops the tag stack up to and including the most recent 689 """Pops the tag stack up to and including the most recent
365 instance of the given tag. If inclusivePop is false, pops the tag 690 instance of the given tag.
366 stack up to but *not* including the most recent instqance of 691
367 the given tag.""" 692 If there are no open tags with the given name, nothing will be
368 #print "Popping to %s" % name 693 popped.
694
695 :param name: Pop up to the most recent tag with this name.
696 :param nsprefix: The namespace prefix that goes with `name`.
697 :param inclusivePop: It this is false, pops the tag stack up
698 to but *not* including the most recent instqance of the
699 given tag.
700
701 """
702 #print("Popping to %s" % name)
369 if name == self.ROOT_TAG_NAME: 703 if name == self.ROOT_TAG_NAME:
370 # The BeautifulSoup object itself can never be popped. 704 # The BeautifulSoup object itself can never be popped.
371 return 705 return
@@ -374,6 +708,8 @@ class BeautifulSoup(Tag):
374 708
375 stack_size = len(self.tagStack) 709 stack_size = len(self.tagStack)
376 for i in range(stack_size - 1, 0, -1): 710 for i in range(stack_size - 1, 0, -1):
711 if not self.open_tag_counter.get(name):
712 break
377 t = self.tagStack[i] 713 t = self.tagStack[i]
378 if (name == t.name and nsprefix == t.prefix): 714 if (name == t.name and nsprefix == t.prefix):
379 if inclusivePop: 715 if inclusivePop:
@@ -383,16 +719,26 @@ class BeautifulSoup(Tag):
383 719
384 return most_recently_popped 720 return most_recently_popped
385 721
386 def handle_starttag(self, name, namespace, nsprefix, attrs): 722 def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
387 """Push a start tag on to the stack. 723 sourcepos=None, namespaces=None):
388 724 """Called by the tree builder when a new tag is encountered.
389 If this method returns None, the tag was rejected by the 725
390 SoupStrainer. You should proceed as if the tag had not occured 726 :param name: Name of the tag.
727 :param nsprefix: Namespace prefix for the tag.
728 :param attrs: A dictionary of attribute values.
729 :param sourceline: The line number where this tag was found in its
730 source document.
731 :param sourcepos: The character position within `sourceline` where this
732 tag was found.
733 :param namespaces: A dictionary of all namespace prefix mappings
734 currently in scope in the document.
735
736 If this method returns None, the tag was rejected by an active
737 SoupStrainer. You should proceed as if the tag had not occurred
391 in the document. For instance, if this was a self-closing tag, 738 in the document. For instance, if this was a self-closing tag,
392 don't call handle_endtag. 739 don't call handle_endtag.
393 """ 740 """
394 741 # print("Start tag %s: %s" % (name, attrs))
395 # print "Start tag %s: %s" % (name, attrs)
396 self.endData() 742 self.endData()
397 743
398 if (self.parse_only and len(self.tagStack) <= 1 744 if (self.parse_only and len(self.tagStack) <= 1
@@ -400,34 +746,54 @@ class BeautifulSoup(Tag):
400 or not self.parse_only.search_tag(name, attrs))): 746 or not self.parse_only.search_tag(name, attrs))):
401 return None 747 return None
402 748
403 tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 749 tag = self.element_classes.get(Tag, Tag)(
404 self.currentTag, self._most_recent_element) 750 self, self.builder, name, namespace, nsprefix, attrs,
751 self.currentTag, self._most_recent_element,
752 sourceline=sourceline, sourcepos=sourcepos,
753 namespaces=namespaces
754 )
405 if tag is None: 755 if tag is None:
406 return tag 756 return tag
407 if self._most_recent_element: 757 if self._most_recent_element is not None:
408 self._most_recent_element.next_element = tag 758 self._most_recent_element.next_element = tag
409 self._most_recent_element = tag 759 self._most_recent_element = tag
410 self.pushTag(tag) 760 self.pushTag(tag)
411 return tag 761 return tag
412 762
413 def handle_endtag(self, name, nsprefix=None): 763 def handle_endtag(self, name, nsprefix=None):
414 #print "End tag: " + name 764 """Called by the tree builder when an ending tag is encountered.
765
766 :param name: Name of the tag.
767 :param nsprefix: Namespace prefix for the tag.
768 """
769 #print("End tag: " + name)
415 self.endData() 770 self.endData()
416 self._popToTag(name, nsprefix) 771 self._popToTag(name, nsprefix)
417 772
418 def handle_data(self, data): 773 def handle_data(self, data):
774 """Called by the tree builder when a chunk of textual data is encountered."""
419 self.current_data.append(data) 775 self.current_data.append(data)
420 776
421 def decode(self, pretty_print=False, 777 def decode(self, pretty_print=False,
422 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 778 eventual_encoding=DEFAULT_OUTPUT_ENCODING,
423 formatter="minimal"): 779 formatter="minimal", iterator=None):
424 """Returns a string or Unicode representation of this document. 780 """Returns a string or Unicode representation of the parse tree
425 To get Unicode, pass None for encoding.""" 781 as an HTML or XML document.
426 782
783 :param pretty_print: If this is True, indentation will be used to
784 make the document more readable.
785 :param eventual_encoding: The encoding of the final document.
786 If this is None, the document will be a Unicode string.
787 """
427 if self.is_xml: 788 if self.is_xml:
428 # Print the XML declaration 789 # Print the XML declaration
429 encoding_part = '' 790 encoding_part = ''
430 if eventual_encoding is not None: 791 if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
792 # This is a special Python encoding; it can't actually
793 # go into an XML document because it means nothing
794 # outside of Python.
795 eventual_encoding = None
796 if eventual_encoding != None:
431 encoding_part = ' encoding="%s"' % eventual_encoding 797 encoding_part = ' encoding="%s"' % eventual_encoding
432 prefix = '<?xml version="1.0"%s?>\n' % encoding_part 798 prefix = '<?xml version="1.0"%s?>\n' % encoding_part
433 else: 799 else:
@@ -437,9 +803,9 @@ class BeautifulSoup(Tag):
437 else: 803 else:
438 indent_level = 0 804 indent_level = 0
439 return prefix + super(BeautifulSoup, self).decode( 805 return prefix + super(BeautifulSoup, self).decode(
440 indent_level, eventual_encoding, formatter) 806 indent_level, eventual_encoding, formatter, iterator)
441 807
442# Alias to make it easier to type import: 'from bs4 import _soup' 808# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
443_s = BeautifulSoup 809_s = BeautifulSoup
444_soup = BeautifulSoup 810_soup = BeautifulSoup
445 811
@@ -450,19 +816,25 @@ class BeautifulStoneSoup(BeautifulSoup):
450 kwargs['features'] = 'xml' 816 kwargs['features'] = 'xml'
451 warnings.warn( 817 warnings.warn(
452 'The BeautifulStoneSoup class is deprecated. Instead of using ' 818 'The BeautifulStoneSoup class is deprecated. Instead of using '
453 'it, pass features="xml" into the BeautifulSoup constructor.') 819 'it, pass features="xml" into the BeautifulSoup constructor.',
820 DeprecationWarning, stacklevel=2
821 )
454 super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 822 super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455 823
456 824
457class StopParsing(Exception): 825class StopParsing(Exception):
826 """Exception raised by a TreeBuilder if it's unable to continue parsing."""
458 pass 827 pass
459 828
460class FeatureNotFound(ValueError): 829class FeatureNotFound(ValueError):
830 """Exception raised by the BeautifulSoup constructor if no parser with the
831 requested features is found.
832 """
461 pass 833 pass
462 834
463 835
464#By default, act as an HTML pretty-printer. 836#If this file is run as a script, act as an HTML pretty-printer.
465if __name__ == '__main__': 837if __name__ == '__main__':
466 import sys 838 import sys
467 soup = BeautifulSoup(sys.stdin) 839 soup = BeautifulSoup(sys.stdin)
468 print(soup.prettify()) 840 print((soup.prettify()))