diff options
| author | Richard Purdie <richard.purdie@linuxfoundation.org> | 2024-05-31 12:04:03 +0100 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2024-05-31 12:43:18 +0100 |
| commit | 12fa81e8d67f0d9755decde5c5b766f56b2af8db (patch) | |
| tree | de58af9a17e4760de36091d525d7eba8bc6f1578 /bitbake/lib/bs4/__init__.py | |
| parent | 99ff46cc9bb12619af55c892452cee3b90a545f0 (diff) | |
| download | poky-12fa81e8d67f0d9755decde5c5b766f56b2af8db.tar.gz | |
bs4: Update to 4.12.3 from 4.4.1
It makes sense to switch to a more recent version and keep up to date
with upstream changes and things like new Python version support.
(Bitbake rev: f5462156036e71911c66d07dbf3303cde862785b)
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/__init__.py')
| -rw-r--r-- | bitbake/lib/bs4/__init__.py | 680 |
1 files changed, 526 insertions, 154 deletions
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py index e35725b86e..d8ad5e1dc1 100644 --- a/bitbake/lib/bs4/__init__.py +++ b/bitbake/lib/bs4/__init__.py | |||
| @@ -1,65 +1,99 @@ | |||
| 1 | """Beautiful Soup | 1 | """Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". |
| 2 | Elixir and Tonic | 2 | |
| 3 | "The Screen-Scraper's Friend" | ||
| 4 | http://www.crummy.com/software/BeautifulSoup/ | 3 | http://www.crummy.com/software/BeautifulSoup/ |
| 5 | 4 | ||
| 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a | 5 | Beautiful Soup uses a pluggable XML or HTML parser to parse a |
| 7 | (possibly invalid) document into a tree representation. Beautiful Soup | 6 | (possibly invalid) document into a tree representation. Beautiful Soup |
| 8 | provides provides methods and Pythonic idioms that make it easy to | 7 | provides methods and Pythonic idioms that make it easy to navigate, |
| 9 | navigate, search, and modify the parse tree. | 8 | search, and modify the parse tree. |
| 10 | 9 | ||
| 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml | 10 | Beautiful Soup works with Python 3.6 and up. It works better if lxml |
| 12 | and/or html5lib is installed. | 11 | and/or html5lib is installed. |
| 13 | 12 | ||
| 14 | For more than you ever wanted to know about Beautiful Soup, see the | 13 | For more than you ever wanted to know about Beautiful Soup, see the |
| 15 | documentation: | 14 | documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ |
| 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ | ||
| 17 | """ | 15 | """ |
| 18 | 16 | ||
| 19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" | 17 | __author__ = "Leonard Richardson (leonardr@segfault.org)" |
| 20 | __version__ = "4.4.1" | 18 | __version__ = "4.12.3" |
| 21 | __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" | 19 | __copyright__ = "Copyright (c) 2004-2024 Leonard Richardson" |
| 20 | # Use of this source code is governed by the MIT license. | ||
| 22 | __license__ = "MIT" | 21 | __license__ = "MIT" |
| 23 | 22 | ||
| 24 | __all__ = ['BeautifulSoup'] | 23 | __all__ = ['BeautifulSoup'] |
| 25 | 24 | ||
| 25 | from collections import Counter | ||
| 26 | import os | 26 | import os |
| 27 | import re | 27 | import re |
| 28 | import sys | ||
| 29 | import traceback | ||
| 28 | import warnings | 30 | import warnings |
| 29 | 31 | ||
| 30 | from .builder import builder_registry, ParserRejectedMarkup | 32 | # The very first thing we do is give a useful error if someone is |
| 33 | # running this code under Python 2. | ||
| 34 | if sys.version_info.major < 3: | ||
| 35 | raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.') | ||
| 36 | |||
| 37 | from .builder import ( | ||
| 38 | builder_registry, | ||
| 39 | ParserRejectedMarkup, | ||
| 40 | XMLParsedAsHTMLWarning, | ||
| 41 | HTMLParserTreeBuilder | ||
| 42 | ) | ||
| 31 | from .dammit import UnicodeDammit | 43 | from .dammit import UnicodeDammit |
| 32 | from .element import ( | 44 | from .element import ( |
| 33 | CData, | 45 | CData, |
| 34 | Comment, | 46 | Comment, |
| 47 | CSS, | ||
| 35 | DEFAULT_OUTPUT_ENCODING, | 48 | DEFAULT_OUTPUT_ENCODING, |
| 36 | Declaration, | 49 | Declaration, |
| 37 | Doctype, | 50 | Doctype, |
| 38 | NavigableString, | 51 | NavigableString, |
| 39 | PageElement, | 52 | PageElement, |
| 40 | ProcessingInstruction, | 53 | ProcessingInstruction, |
| 54 | PYTHON_SPECIFIC_ENCODINGS, | ||
| 41 | ResultSet, | 55 | ResultSet, |
| 56 | Script, | ||
| 57 | Stylesheet, | ||
| 42 | SoupStrainer, | 58 | SoupStrainer, |
| 43 | Tag, | 59 | Tag, |
| 60 | TemplateString, | ||
| 44 | ) | 61 | ) |
| 45 | 62 | ||
| 46 | # The very first thing we do is give a useful error if someone is | 63 | # Define some custom warnings. |
| 47 | # running this code under Python 3 without converting it. | 64 | class GuessedAtParserWarning(UserWarning): |
| 48 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' | 65 | """The warning issued when BeautifulSoup has to guess what parser to |
| 66 | use -- probably because no parser was specified in the constructor. | ||
| 67 | """ | ||
| 49 | 68 | ||
| 50 | class BeautifulSoup(Tag): | 69 | class MarkupResemblesLocatorWarning(UserWarning): |
| 70 | """The warning issued when BeautifulSoup is given 'markup' that | ||
| 71 | actually looks like a resource locator -- a URL or a path to a file | ||
| 72 | on disk. | ||
| 51 | """ | 73 | """ |
| 52 | This class defines the basic interface called by the tree builders. | ||
| 53 | 74 | ||
| 54 | These methods will be called by the parser: | 75 | |
| 55 | reset() | 76 | class BeautifulSoup(Tag): |
| 56 | feed(markup) | 77 | """A data structure representing a parsed HTML or XML document. |
| 78 | |||
| 79 | Most of the methods you'll call on a BeautifulSoup object are inherited from | ||
| 80 | PageElement or Tag. | ||
| 81 | |||
| 82 | Internally, this class defines the basic interface called by the | ||
| 83 | tree builders when converting an HTML/XML document into a data | ||
| 84 | structure. The interface abstracts away the differences between | ||
| 85 | parsers. To write a new tree builder, you'll need to understand | ||
| 86 | these methods as a whole. | ||
| 87 | |||
| 88 | These methods will be called by the BeautifulSoup constructor: | ||
| 89 | * reset() | ||
| 90 | * feed(markup) | ||
| 57 | 91 | ||
| 58 | The tree builder may call these methods from its feed() implementation: | 92 | The tree builder may call these methods from its feed() implementation: |
| 59 | handle_starttag(name, attrs) # See note about return value | 93 | * handle_starttag(name, attrs) # See note about return value |
| 60 | handle_endtag(name) | 94 | * handle_endtag(name) |
| 61 | handle_data(data) # Appends to the current data node | 95 | * handle_data(data) # Appends to the current data node |
| 62 | endData(containerClass=NavigableString) # Ends the current data node | 96 | * endData(containerClass) # Ends the current data node |
| 63 | 97 | ||
| 64 | No matter how complicated the underlying parser is, you should be | 98 | No matter how complicated the underlying parser is, you should be |
| 65 | able to build a tree using 'start tag' events, 'end tag' events, | 99 | able to build a tree using 'start tag' events, 'end tag' events, |
| @@ -69,24 +103,77 @@ class BeautifulSoup(Tag): | |||
| 69 | like HTML's <br> tag), call handle_starttag and then | 103 | like HTML's <br> tag), call handle_starttag and then |
| 70 | handle_endtag. | 104 | handle_endtag. |
| 71 | """ | 105 | """ |
| 106 | |||
| 107 | # Since BeautifulSoup subclasses Tag, it's possible to treat it as | ||
| 108 | # a Tag with a .name. This name makes it clear the BeautifulSoup | ||
| 109 | # object isn't a real markup tag. | ||
| 72 | ROOT_TAG_NAME = '[document]' | 110 | ROOT_TAG_NAME = '[document]' |
| 73 | 111 | ||
| 74 | # If the end-user gives no indication which tree builder they | 112 | # If the end-user gives no indication which tree builder they |
| 75 | # want, look for one with these features. | 113 | # want, look for one with these features. |
| 76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] | 114 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] |
| 77 | 115 | ||
| 116 | # A string containing all ASCII whitespace characters, used in | ||
| 117 | # endData() to detect data chunks that seem 'empty'. | ||
| 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' | 118 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' |
| 79 | 119 | ||
| 80 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" | 120 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" |
| 81 | 121 | ||
| 82 | def __init__(self, markup="", features=None, builder=None, | 122 | def __init__(self, markup="", features=None, builder=None, |
| 83 | parse_only=None, from_encoding=None, exclude_encodings=None, | 123 | parse_only=None, from_encoding=None, exclude_encodings=None, |
| 84 | **kwargs): | 124 | element_classes=None, **kwargs): |
| 85 | """The Soup object is initialized as the 'root tag', and the | 125 | """Constructor. |
| 86 | provided markup (which can be a string or a file-like object) | 126 | |
| 87 | is fed into the underlying parser.""" | 127 | :param markup: A string or a file-like object representing |
| 88 | 128 | markup to be parsed. | |
| 129 | |||
| 130 | :param features: Desirable features of the parser to be | ||
| 131 | used. This may be the name of a specific parser ("lxml", | ||
| 132 | "lxml-xml", "html.parser", or "html5lib") or it may be the | ||
| 133 | type of markup to be used ("html", "html5", "xml"). It's | ||
| 134 | recommended that you name a specific parser, so that | ||
| 135 | Beautiful Soup gives you the same results across platforms | ||
| 136 | and virtual environments. | ||
| 137 | |||
| 138 | :param builder: A TreeBuilder subclass to instantiate (or | ||
| 139 | instance to use) instead of looking one up based on | ||
| 140 | `features`. You only need to use this if you've implemented a | ||
| 141 | custom TreeBuilder. | ||
| 142 | |||
| 143 | :param parse_only: A SoupStrainer. Only parts of the document | ||
| 144 | matching the SoupStrainer will be considered. This is useful | ||
| 145 | when parsing part of a document that would otherwise be too | ||
| 146 | large to fit into memory. | ||
| 147 | |||
| 148 | :param from_encoding: A string indicating the encoding of the | ||
| 149 | document to be parsed. Pass this in if Beautiful Soup is | ||
| 150 | guessing wrongly about the document's encoding. | ||
| 151 | |||
| 152 | :param exclude_encodings: A list of strings indicating | ||
| 153 | encodings known to be wrong. Pass this in if you don't know | ||
| 154 | the document's encoding but you know Beautiful Soup's guess is | ||
| 155 | wrong. | ||
| 156 | |||
| 157 | :param element_classes: A dictionary mapping BeautifulSoup | ||
| 158 | classes like Tag and NavigableString, to other classes you'd | ||
| 159 | like to be instantiated instead as the parse tree is | ||
| 160 | built. This is useful for subclassing Tag or NavigableString | ||
| 161 | to modify default behavior. | ||
| 162 | |||
| 163 | :param kwargs: For backwards compatibility purposes, the | ||
| 164 | constructor accepts certain keyword arguments used in | ||
| 165 | Beautiful Soup 3. None of these arguments do anything in | ||
| 166 | Beautiful Soup 4; they will result in a warning and then be | ||
| 167 | ignored. | ||
| 168 | |||
| 169 | Apart from this, any keyword arguments passed into the | ||
| 170 | BeautifulSoup constructor are propagated to the TreeBuilder | ||
| 171 | constructor. This makes it possible to configure a | ||
| 172 | TreeBuilder by passing in arguments, not just by saying which | ||
| 173 | one to use. | ||
| 174 | """ | ||
| 89 | if 'convertEntities' in kwargs: | 175 | if 'convertEntities' in kwargs: |
| 176 | del kwargs['convertEntities'] | ||
| 90 | warnings.warn( | 177 | warnings.warn( |
| 91 | "BS4 does not respect the convertEntities argument to the " | 178 | "BS4 does not respect the convertEntities argument to the " |
| 92 | "BeautifulSoup constructor. Entities are always converted " | 179 | "BeautifulSoup constructor. Entities are always converted " |
| @@ -125,10 +212,10 @@ class BeautifulSoup(Tag): | |||
| 125 | if old_name in kwargs: | 212 | if old_name in kwargs: |
| 126 | warnings.warn( | 213 | warnings.warn( |
| 127 | 'The "%s" argument to the BeautifulSoup constructor ' | 214 | 'The "%s" argument to the BeautifulSoup constructor ' |
| 128 | 'has been renamed to "%s."' % (old_name, new_name)) | 215 | 'has been renamed to "%s."' % (old_name, new_name), |
| 129 | value = kwargs[old_name] | 216 | DeprecationWarning, stacklevel=3 |
| 130 | del kwargs[old_name] | 217 | ) |
| 131 | return value | 218 | return kwargs.pop(old_name) |
| 132 | return None | 219 | return None |
| 133 | 220 | ||
| 134 | parse_only = parse_only or deprecated_argument( | 221 | parse_only = parse_only or deprecated_argument( |
| @@ -137,13 +224,23 @@ class BeautifulSoup(Tag): | |||
| 137 | from_encoding = from_encoding or deprecated_argument( | 224 | from_encoding = from_encoding or deprecated_argument( |
| 138 | "fromEncoding", "from_encoding") | 225 | "fromEncoding", "from_encoding") |
| 139 | 226 | ||
| 140 | if len(kwargs) > 0: | 227 | if from_encoding and isinstance(markup, str): |
| 141 | arg = list(kwargs.keys()).pop() | 228 | warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") |
| 142 | raise TypeError( | 229 | from_encoding = None |
| 143 | "__init__() got an unexpected keyword argument '%s'" % arg) | 230 | |
| 144 | 231 | self.element_classes = element_classes or dict() | |
| 145 | if builder is None: | 232 | |
| 146 | original_features = features | 233 | # We need this information to track whether or not the builder |
| 234 | # was specified well enough that we can omit the 'you need to | ||
| 235 | # specify a parser' warning. | ||
| 236 | original_builder = builder | ||
| 237 | original_features = features | ||
| 238 | |||
| 239 | if isinstance(builder, type): | ||
| 240 | # A builder class was passed in; it needs to be instantiated. | ||
| 241 | builder_class = builder | ||
| 242 | builder = None | ||
| 243 | elif builder is None: | ||
| 147 | if isinstance(features, str): | 244 | if isinstance(features, str): |
| 148 | features = [features] | 245 | features = [features] |
| 149 | if features is None or len(features) == 0: | 246 | if features is None or len(features) == 0: |
| @@ -154,85 +251,227 @@ class BeautifulSoup(Tag): | |||
| 154 | "Couldn't find a tree builder with the features you " | 251 | "Couldn't find a tree builder with the features you " |
| 155 | "requested: %s. Do you need to install a parser library?" | 252 | "requested: %s. Do you need to install a parser library?" |
| 156 | % ",".join(features)) | 253 | % ",".join(features)) |
| 157 | builder = builder_class() | 254 | |
| 158 | if not (original_features == builder.NAME or | 255 | # At this point either we have a TreeBuilder instance in |
| 159 | original_features in builder.ALTERNATE_NAMES): | 256 | # builder, or we have a builder_class that we can instantiate |
| 257 | # with the remaining **kwargs. | ||
| 258 | if builder is None: | ||
| 259 | builder = builder_class(**kwargs) | ||
| 260 | if not original_builder and not ( | ||
| 261 | original_features == builder.NAME or | ||
| 262 | original_features in builder.ALTERNATE_NAMES | ||
| 263 | ) and markup: | ||
| 264 | # The user did not tell us which TreeBuilder to use, | ||
| 265 | # and we had to guess. Issue a warning. | ||
| 160 | if builder.is_xml: | 266 | if builder.is_xml: |
| 161 | markup_type = "XML" | 267 | markup_type = "XML" |
| 162 | else: | 268 | else: |
| 163 | markup_type = "HTML" | 269 | markup_type = "HTML" |
| 164 | warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( | ||
| 165 | parser=builder.NAME, | ||
| 166 | markup_type=markup_type)) | ||
| 167 | 270 | ||
| 271 | # This code adapted from warnings.py so that we get the same line | ||
| 272 | # of code as our warnings.warn() call gets, even if the answer is wrong | ||
| 273 | # (as it may be in a multithreading situation). | ||
| 274 | caller = None | ||
| 275 | try: | ||
| 276 | caller = sys._getframe(1) | ||
| 277 | except ValueError: | ||
| 278 | pass | ||
| 279 | if caller: | ||
| 280 | globals = caller.f_globals | ||
| 281 | line_number = caller.f_lineno | ||
| 282 | else: | ||
| 283 | globals = sys.__dict__ | ||
| 284 | line_number= 1 | ||
| 285 | filename = globals.get('__file__') | ||
| 286 | if filename: | ||
| 287 | fnl = filename.lower() | ||
| 288 | if fnl.endswith((".pyc", ".pyo")): | ||
| 289 | filename = filename[:-1] | ||
| 290 | if filename: | ||
| 291 | # If there is no filename at all, the user is most likely in a REPL, | ||
| 292 | # and the warning is not necessary. | ||
| 293 | values = dict( | ||
| 294 | filename=filename, | ||
| 295 | line_number=line_number, | ||
| 296 | parser=builder.NAME, | ||
| 297 | markup_type=markup_type | ||
| 298 | ) | ||
| 299 | warnings.warn( | ||
| 300 | self.NO_PARSER_SPECIFIED_WARNING % values, | ||
| 301 | GuessedAtParserWarning, stacklevel=2 | ||
| 302 | ) | ||
| 303 | else: | ||
| 304 | if kwargs: | ||
| 305 | warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") | ||
| 306 | |||
| 168 | self.builder = builder | 307 | self.builder = builder |
| 169 | self.is_xml = builder.is_xml | 308 | self.is_xml = builder.is_xml |
| 170 | self.builder.soup = self | 309 | self.known_xml = self.is_xml |
| 171 | 310 | self._namespaces = dict() | |
| 172 | self.parse_only = parse_only | 311 | self.parse_only = parse_only |
| 173 | 312 | ||
| 174 | if hasattr(markup, 'read'): # It's a file-type object. | 313 | if hasattr(markup, 'read'): # It's a file-type object. |
| 175 | markup = markup.read() | 314 | markup = markup.read() |
| 176 | elif len(markup) <= 256: | 315 | elif len(markup) <= 256 and ( |
| 177 | # Print out warnings for a couple beginner problems | 316 | (isinstance(markup, bytes) and not b'<' in markup) |
| 317 | or (isinstance(markup, str) and not '<' in markup) | ||
| 318 | ): | ||
| 319 | # Issue warnings for a couple beginner problems | ||
| 178 | # involving passing non-markup to Beautiful Soup. | 320 | # involving passing non-markup to Beautiful Soup. |
| 179 | # Beautiful Soup will still parse the input as markup, | 321 | # Beautiful Soup will still parse the input as markup, |
| 180 | # just in case that's what the user really wants. | 322 | # since that is sometimes the intended behavior. |
| 181 | if (isinstance(markup, str) | 323 | if not self._markup_is_url(markup): |
| 182 | and not os.path.supports_unicode_filenames): | 324 | self._markup_resembles_filename(markup) |
| 183 | possible_filename = markup.encode("utf8") | ||
| 184 | else: | ||
| 185 | possible_filename = markup | ||
| 186 | is_file = False | ||
| 187 | try: | ||
| 188 | is_file = os.path.exists(possible_filename) | ||
| 189 | except Exception as e: | ||
| 190 | # This is almost certainly a problem involving | ||
| 191 | # characters not valid in filenames on this | ||
| 192 | # system. Just let it go. | ||
| 193 | pass | ||
| 194 | if is_file: | ||
| 195 | if isinstance(markup, str): | ||
| 196 | markup = markup.encode("utf8") | ||
| 197 | warnings.warn( | ||
| 198 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) | ||
| 199 | if markup[:5] == "http:" or markup[:6] == "https:": | ||
| 200 | # TODO: This is ugly but I couldn't get it to work in | ||
| 201 | # Python 3 otherwise. | ||
| 202 | if ((isinstance(markup, bytes) and not b' ' in markup) | ||
| 203 | or (isinstance(markup, str) and not ' ' in markup)): | ||
| 204 | if isinstance(markup, str): | ||
| 205 | markup = markup.encode("utf8") | ||
| 206 | warnings.warn( | ||
| 207 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) | ||
| 208 | 325 | ||
| 326 | rejections = [] | ||
| 327 | success = False | ||
| 209 | for (self.markup, self.original_encoding, self.declared_html_encoding, | 328 | for (self.markup, self.original_encoding, self.declared_html_encoding, |
| 210 | self.contains_replacement_characters) in ( | 329 | self.contains_replacement_characters) in ( |
| 211 | self.builder.prepare_markup( | 330 | self.builder.prepare_markup( |
| 212 | markup, from_encoding, exclude_encodings=exclude_encodings)): | 331 | markup, from_encoding, exclude_encodings=exclude_encodings)): |
| 213 | self.reset() | 332 | self.reset() |
| 333 | self.builder.initialize_soup(self) | ||
| 214 | try: | 334 | try: |
| 215 | self._feed() | 335 | self._feed() |
| 336 | success = True | ||
| 216 | break | 337 | break |
| 217 | except ParserRejectedMarkup: | 338 | except ParserRejectedMarkup as e: |
| 339 | rejections.append(e) | ||
| 218 | pass | 340 | pass |
| 219 | 341 | ||
| 342 | if not success: | ||
| 343 | other_exceptions = [str(e) for e in rejections] | ||
| 344 | raise ParserRejectedMarkup( | ||
| 345 | "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) | ||
| 346 | ) | ||
| 347 | |||
| 220 | # Clear out the markup and remove the builder's circular | 348 | # Clear out the markup and remove the builder's circular |
| 221 | # reference to this object. | 349 | # reference to this object. |
| 222 | self.markup = None | 350 | self.markup = None |
| 223 | self.builder.soup = None | 351 | self.builder.soup = None |
| 224 | 352 | ||
| 225 | def __copy__(self): | 353 | def _clone(self): |
| 226 | return type(self)(self.encode(), builder=self.builder) | 354 | """Create a new BeautifulSoup object with the same TreeBuilder, |
| 355 | but not associated with any markup. | ||
| 356 | |||
| 357 | This is the first step of the deepcopy process. | ||
| 358 | """ | ||
| 359 | clone = type(self)("", None, self.builder) | ||
| 227 | 360 | ||
| 361 | # Keep track of the encoding of the original document, | ||
| 362 | # since we won't be parsing it again. | ||
| 363 | clone.original_encoding = self.original_encoding | ||
| 364 | return clone | ||
| 365 | |||
| 228 | def __getstate__(self): | 366 | def __getstate__(self): |
| 229 | # Frequently a tree builder can't be pickled. | 367 | # Frequently a tree builder can't be pickled. |
| 230 | d = dict(self.__dict__) | 368 | d = dict(self.__dict__) |
| 231 | if 'builder' in d and not self.builder.picklable: | 369 | if 'builder' in d and d['builder'] is not None and not self.builder.picklable: |
| 232 | del d['builder'] | 370 | d['builder'] = type(self.builder) |
| 371 | # Store the contents as a Unicode string. | ||
| 372 | d['contents'] = [] | ||
| 373 | d['markup'] = self.decode() | ||
| 374 | |||
| 375 | # If _most_recent_element is present, it's a Tag object left | ||
| 376 | # over from initial parse. It might not be picklable and we | ||
| 377 | # don't need it. | ||
| 378 | if '_most_recent_element' in d: | ||
| 379 | del d['_most_recent_element'] | ||
| 233 | return d | 380 | return d |
| 234 | 381 | ||
| 382 | def __setstate__(self, state): | ||
| 383 | # If necessary, restore the TreeBuilder by looking it up. | ||
| 384 | self.__dict__ = state | ||
| 385 | if isinstance(self.builder, type): | ||
| 386 | self.builder = self.builder() | ||
| 387 | elif not self.builder: | ||
| 388 | # We don't know which builder was used to build this | ||
| 389 | # parse tree, so use a default we know is always available. | ||
| 390 | self.builder = HTMLParserTreeBuilder() | ||
| 391 | self.builder.soup = self | ||
| 392 | self.reset() | ||
| 393 | self._feed() | ||
| 394 | return state | ||
| 395 | |||
| 396 | |||
| 397 | @classmethod | ||
| 398 | def _decode_markup(cls, markup): | ||
| 399 | """Ensure `markup` is bytes so it's safe to send into warnings.warn. | ||
| 400 | |||
| 401 | TODO: warnings.warn had this problem back in 2010 but it might not | ||
| 402 | anymore. | ||
| 403 | """ | ||
| 404 | if isinstance(markup, bytes): | ||
| 405 | decoded = markup.decode('utf-8', 'replace') | ||
| 406 | else: | ||
| 407 | decoded = markup | ||
| 408 | return decoded | ||
| 409 | |||
| 410 | @classmethod | ||
| 411 | def _markup_is_url(cls, markup): | ||
| 412 | """Error-handling method to raise a warning if incoming markup looks | ||
| 413 | like a URL. | ||
| 414 | |||
| 415 | :param markup: A string. | ||
| 416 | :return: Whether or not the markup resembles a URL | ||
| 417 | closely enough to justify a warning. | ||
| 418 | """ | ||
| 419 | if isinstance(markup, bytes): | ||
| 420 | space = b' ' | ||
| 421 | cant_start_with = (b"http:", b"https:") | ||
| 422 | elif isinstance(markup, str): | ||
| 423 | space = ' ' | ||
| 424 | cant_start_with = ("http:", "https:") | ||
| 425 | else: | ||
| 426 | return False | ||
| 427 | |||
| 428 | if any(markup.startswith(prefix) for prefix in cant_start_with): | ||
| 429 | if not space in markup: | ||
| 430 | warnings.warn( | ||
| 431 | 'The input looks more like a URL than markup. You may want to use' | ||
| 432 | ' an HTTP client like requests to get the document behind' | ||
| 433 | ' the URL, and feed that document to Beautiful Soup.', | ||
| 434 | MarkupResemblesLocatorWarning, | ||
| 435 | stacklevel=3 | ||
| 436 | ) | ||
| 437 | return True | ||
| 438 | return False | ||
| 439 | |||
| 440 | @classmethod | ||
| 441 | def _markup_resembles_filename(cls, markup): | ||
| 442 | """Error-handling method to raise a warning if incoming markup | ||
| 443 | resembles a filename. | ||
| 444 | |||
| 445 | :param markup: A bytestring or string. | ||
| 446 | :return: Whether or not the markup resembles a filename | ||
| 447 | closely enough to justify a warning. | ||
| 448 | """ | ||
| 449 | path_characters = '/\\' | ||
| 450 | extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt'] | ||
| 451 | if isinstance(markup, bytes): | ||
| 452 | path_characters = path_characters.encode("utf8") | ||
| 453 | extensions = [x.encode('utf8') for x in extensions] | ||
| 454 | filelike = False | ||
| 455 | if any(x in markup for x in path_characters): | ||
| 456 | filelike = True | ||
| 457 | else: | ||
| 458 | lower = markup.lower() | ||
| 459 | if any(lower.endswith(ext) for ext in extensions): | ||
| 460 | filelike = True | ||
| 461 | if filelike: | ||
| 462 | warnings.warn( | ||
| 463 | 'The input looks more like a filename than markup. You may' | ||
| 464 | ' want to open this file and pass the filehandle into' | ||
| 465 | ' Beautiful Soup.', | ||
| 466 | MarkupResemblesLocatorWarning, stacklevel=3 | ||
| 467 | ) | ||
| 468 | return True | ||
| 469 | return False | ||
| 470 | |||
| 235 | def _feed(self): | 471 | def _feed(self): |
| 472 | """Internal method that parses previously set markup, creating a large | ||
| 473 | number of Tag and NavigableString objects. | ||
| 474 | """ | ||
| 236 | # Convert the document to Unicode. | 475 | # Convert the document to Unicode. |
| 237 | self.builder.reset() | 476 | self.builder.reset() |
| 238 | 477 | ||
| @@ -243,48 +482,111 @@ class BeautifulSoup(Tag): | |||
| 243 | self.popTag() | 482 | self.popTag() |
| 244 | 483 | ||
| 245 | def reset(self): | 484 | def reset(self): |
| 485 | """Reset this object to a state as though it had never parsed any | ||
| 486 | markup. | ||
| 487 | """ | ||
| 246 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) | 488 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) |
| 247 | self.hidden = 1 | 489 | self.hidden = 1 |
| 248 | self.builder.reset() | 490 | self.builder.reset() |
| 249 | self.current_data = [] | 491 | self.current_data = [] |
| 250 | self.currentTag = None | 492 | self.currentTag = None |
| 251 | self.tagStack = [] | 493 | self.tagStack = [] |
| 494 | self.open_tag_counter = Counter() | ||
| 252 | self.preserve_whitespace_tag_stack = [] | 495 | self.preserve_whitespace_tag_stack = [] |
| 496 | self.string_container_stack = [] | ||
| 497 | self._most_recent_element = None | ||
| 253 | self.pushTag(self) | 498 | self.pushTag(self) |
| 254 | 499 | ||
| 255 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): | 500 | def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, |
| 256 | """Create a new tag associated with this soup.""" | 501 | sourceline=None, sourcepos=None, **kwattrs): |
| 257 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) | 502 | """Create a new Tag associated with this BeautifulSoup object. |
| 503 | |||
| 504 | :param name: The name of the new Tag. | ||
| 505 | :param namespace: The URI of the new Tag's XML namespace, if any. | ||
| 506 | :param prefix: The prefix for the new Tag's XML namespace, if any. | ||
| 507 | :param attrs: A dictionary of this Tag's attribute values; can | ||
| 508 | be used instead of `kwattrs` for attributes like 'class' | ||
| 509 | that are reserved words in Python. | ||
| 510 | :param sourceline: The line number where this tag was | ||
| 511 | (purportedly) found in its source document. | ||
| 512 | :param sourcepos: The character position within `sourceline` where this | ||
| 513 | tag was (purportedly) found. | ||
| 514 | :param kwattrs: Keyword arguments for the new Tag's attribute values. | ||
| 258 | 515 | ||
| 259 | def new_string(self, s, subclass=NavigableString): | 516 | """ |
| 260 | """Create a new NavigableString associated with this soup.""" | 517 | kwattrs.update(attrs) |
| 261 | return subclass(s) | 518 | return self.element_classes.get(Tag, Tag)( |
| 519 | None, self.builder, name, namespace, nsprefix, kwattrs, | ||
| 520 | sourceline=sourceline, sourcepos=sourcepos | ||
| 521 | ) | ||
| 522 | |||
| 523 | def string_container(self, base_class=None): | ||
| 524 | container = base_class or NavigableString | ||
| 525 | |||
| 526 | # There may be a general override of NavigableString. | ||
| 527 | container = self.element_classes.get( | ||
| 528 | container, container | ||
| 529 | ) | ||
| 530 | |||
| 531 | # On top of that, we may be inside a tag that needs a special | ||
| 532 | # container class. | ||
| 533 | if self.string_container_stack and container is NavigableString: | ||
| 534 | container = self.builder.string_containers.get( | ||
| 535 | self.string_container_stack[-1].name, container | ||
| 536 | ) | ||
| 537 | return container | ||
| 538 | |||
| 539 | def new_string(self, s, subclass=None): | ||
| 540 | """Create a new NavigableString associated with this BeautifulSoup | ||
| 541 | object. | ||
| 542 | """ | ||
| 543 | container = self.string_container(subclass) | ||
| 544 | return container(s) | ||
| 262 | 545 | ||
| 263 | def insert_before(self, successor): | 546 | def insert_before(self, *args): |
| 547 | """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement | ||
| 548 | it because there is nothing before or after it in the parse tree. | ||
| 549 | """ | ||
| 264 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") | 550 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") |
| 265 | 551 | ||
| 266 | def insert_after(self, successor): | 552 | def insert_after(self, *args): |
| 553 | """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement | ||
| 554 | it because there is nothing before or after it in the parse tree. | ||
| 555 | """ | ||
| 267 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") | 556 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") |
| 268 | 557 | ||
| 269 | def popTag(self): | 558 | def popTag(self): |
| 559 | """Internal method called by _popToTag when a tag is closed.""" | ||
| 270 | tag = self.tagStack.pop() | 560 | tag = self.tagStack.pop() |
| 561 | if tag.name in self.open_tag_counter: | ||
| 562 | self.open_tag_counter[tag.name] -= 1 | ||
| 271 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: | 563 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: |
| 272 | self.preserve_whitespace_tag_stack.pop() | 564 | self.preserve_whitespace_tag_stack.pop() |
| 273 | #print "Pop", tag.name | 565 | if self.string_container_stack and tag == self.string_container_stack[-1]: |
| 566 | self.string_container_stack.pop() | ||
| 567 | #print("Pop", tag.name) | ||
| 274 | if self.tagStack: | 568 | if self.tagStack: |
| 275 | self.currentTag = self.tagStack[-1] | 569 | self.currentTag = self.tagStack[-1] |
| 276 | return self.currentTag | 570 | return self.currentTag |
| 277 | 571 | ||
| 278 | def pushTag(self, tag): | 572 | def pushTag(self, tag): |
| 279 | #print "Push", tag.name | 573 | """Internal method called by handle_starttag when a tag is opened.""" |
| 280 | if self.currentTag: | 574 | #print("Push", tag.name) |
| 575 | if self.currentTag is not None: | ||
| 281 | self.currentTag.contents.append(tag) | 576 | self.currentTag.contents.append(tag) |
| 282 | self.tagStack.append(tag) | 577 | self.tagStack.append(tag) |
| 283 | self.currentTag = self.tagStack[-1] | 578 | self.currentTag = self.tagStack[-1] |
| 579 | if tag.name != self.ROOT_TAG_NAME: | ||
| 580 | self.open_tag_counter[tag.name] += 1 | ||
| 284 | if tag.name in self.builder.preserve_whitespace_tags: | 581 | if tag.name in self.builder.preserve_whitespace_tags: |
| 285 | self.preserve_whitespace_tag_stack.append(tag) | 582 | self.preserve_whitespace_tag_stack.append(tag) |
| 583 | if tag.name in self.builder.string_containers: | ||
| 584 | self.string_container_stack.append(tag) | ||
| 286 | 585 | ||
| 287 | def endData(self, containerClass=NavigableString): | 586 | def endData(self, containerClass=None): |
| 587 | """Method called by the TreeBuilder when the end of a data segment | ||
| 588 | occurs. | ||
| 589 | """ | ||
| 288 | if self.current_data: | 590 | if self.current_data: |
| 289 | current_data = ''.join(self.current_data) | 591 | current_data = ''.join(self.current_data) |
| 290 | # If whitespace is not preserved, and this string contains | 592 | # If whitespace is not preserved, and this string contains |
| @@ -311,61 +613,93 @@ class BeautifulSoup(Tag): | |||
| 311 | not self.parse_only.search(current_data)): | 613 | not self.parse_only.search(current_data)): |
| 312 | return | 614 | return |
| 313 | 615 | ||
| 616 | containerClass = self.string_container(containerClass) | ||
| 314 | o = containerClass(current_data) | 617 | o = containerClass(current_data) |
| 315 | self.object_was_parsed(o) | 618 | self.object_was_parsed(o) |
| 316 | 619 | ||
| 317 | def object_was_parsed(self, o, parent=None, most_recent_element=None): | 620 | def object_was_parsed(self, o, parent=None, most_recent_element=None): |
| 318 | """Add an object to the parse tree.""" | 621 | """Method called by the TreeBuilder to integrate an object into the parse tree.""" |
| 319 | parent = parent or self.currentTag | 622 | if parent is None: |
| 320 | previous_element = most_recent_element or self._most_recent_element | 623 | parent = self.currentTag |
| 624 | if most_recent_element is not None: | ||
| 625 | previous_element = most_recent_element | ||
| 626 | else: | ||
| 627 | previous_element = self._most_recent_element | ||
| 321 | 628 | ||
| 322 | next_element = previous_sibling = next_sibling = None | 629 | next_element = previous_sibling = next_sibling = None |
| 323 | if isinstance(o, Tag): | 630 | if isinstance(o, Tag): |
| 324 | next_element = o.next_element | 631 | next_element = o.next_element |
| 325 | next_sibling = o.next_sibling | 632 | next_sibling = o.next_sibling |
| 326 | previous_sibling = o.previous_sibling | 633 | previous_sibling = o.previous_sibling |
| 327 | if not previous_element: | 634 | if previous_element is None: |
| 328 | previous_element = o.previous_element | 635 | previous_element = o.previous_element |
| 329 | 636 | ||
| 637 | fix = parent.next_element is not None | ||
| 638 | |||
| 330 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) | 639 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) |
| 331 | 640 | ||
| 332 | self._most_recent_element = o | 641 | self._most_recent_element = o |
| 333 | parent.contents.append(o) | 642 | parent.contents.append(o) |
| 334 | 643 | ||
| 335 | if parent.next_sibling: | 644 | # Check if we are inserting into an already parsed node. |
| 336 | # This node is being inserted into an element that has | 645 | if fix: |
| 337 | # already been parsed. Deal with any dangling references. | 646 | self._linkage_fixer(parent) |
| 338 | index = parent.contents.index(o) | 647 | |
| 339 | if index == 0: | 648 | def _linkage_fixer(self, el): |
| 340 | previous_element = parent | 649 | """Make sure linkage of this fragment is sound.""" |
| 341 | previous_sibling = None | 650 | |
| 342 | else: | 651 | first = el.contents[0] |
| 343 | previous_element = previous_sibling = parent.contents[index-1] | 652 | child = el.contents[-1] |
| 344 | if index == len(parent.contents)-1: | 653 | descendant = child |
| 345 | next_element = parent.next_sibling | 654 | |
| 346 | next_sibling = None | 655 | if child is first and el.parent is not None: |
| 347 | else: | 656 | # Parent should be linked to first child |
| 348 | next_element = next_sibling = parent.contents[index+1] | 657 | el.next_element = child |
| 349 | 658 | # We are no longer linked to whatever this element is | |
| 350 | o.previous_element = previous_element | 659 | prev_el = child.previous_element |
| 351 | if previous_element: | 660 | if prev_el is not None and prev_el is not el: |
| 352 | previous_element.next_element = o | 661 | prev_el.next_element = None |
| 353 | o.next_element = next_element | 662 | # First child should be linked to the parent, and no previous siblings. |
| 354 | if next_element: | 663 | child.previous_element = el |
| 355 | next_element.previous_element = o | 664 | child.previous_sibling = None |
| 356 | o.next_sibling = next_sibling | 665 | |
| 357 | if next_sibling: | 666 | # We have no sibling as we've been appended as the last. |
| 358 | next_sibling.previous_sibling = o | 667 | child.next_sibling = None |
| 359 | o.previous_sibling = previous_sibling | 668 | |
| 360 | if previous_sibling: | 669 | # This index is a tag, dig deeper for a "last descendant" |
| 361 | previous_sibling.next_sibling = o | 670 | if isinstance(child, Tag) and child.contents: |
| 671 | descendant = child._last_descendant(False) | ||
| 672 | |||
| 673 | # As the final step, link last descendant. It should be linked | ||
| 674 | # to the parent's next sibling (if found), else walk up the chain | ||
| 675 | # and find a parent with a sibling. It should have no next sibling. | ||
| 676 | descendant.next_element = None | ||
| 677 | descendant.next_sibling = None | ||
| 678 | target = el | ||
| 679 | while True: | ||
| 680 | if target is None: | ||
| 681 | break | ||
| 682 | elif target.next_sibling is not None: | ||
| 683 | descendant.next_element = target.next_sibling | ||
| 684 | target.next_sibling.previous_element = child | ||
| 685 | break | ||
| 686 | target = target.parent | ||
| 362 | 687 | ||
| 363 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): | 688 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): |
| 364 | """Pops the tag stack up to and including the most recent | 689 | """Pops the tag stack up to and including the most recent |
| 365 | instance of the given tag. If inclusivePop is false, pops the tag | 690 | instance of the given tag. |
| 366 | stack up to but *not* including the most recent instqance of | 691 | |
| 367 | the given tag.""" | 692 | If there are no open tags with the given name, nothing will be |
| 368 | #print "Popping to %s" % name | 693 | popped. |
| 694 | |||
| 695 | :param name: Pop up to the most recent tag with this name. | ||
| 696 | :param nsprefix: The namespace prefix that goes with `name`. | ||
| 697 | :param inclusivePop: It this is false, pops the tag stack up | ||
| 698 | to but *not* including the most recent instqance of the | ||
| 699 | given tag. | ||
| 700 | |||
| 701 | """ | ||
| 702 | #print("Popping to %s" % name) | ||
| 369 | if name == self.ROOT_TAG_NAME: | 703 | if name == self.ROOT_TAG_NAME: |
| 370 | # The BeautifulSoup object itself can never be popped. | 704 | # The BeautifulSoup object itself can never be popped. |
| 371 | return | 705 | return |
| @@ -374,6 +708,8 @@ class BeautifulSoup(Tag): | |||
| 374 | 708 | ||
| 375 | stack_size = len(self.tagStack) | 709 | stack_size = len(self.tagStack) |
| 376 | for i in range(stack_size - 1, 0, -1): | 710 | for i in range(stack_size - 1, 0, -1): |
| 711 | if not self.open_tag_counter.get(name): | ||
| 712 | break | ||
| 377 | t = self.tagStack[i] | 713 | t = self.tagStack[i] |
| 378 | if (name == t.name and nsprefix == t.prefix): | 714 | if (name == t.name and nsprefix == t.prefix): |
| 379 | if inclusivePop: | 715 | if inclusivePop: |
| @@ -383,16 +719,26 @@ class BeautifulSoup(Tag): | |||
| 383 | 719 | ||
| 384 | return most_recently_popped | 720 | return most_recently_popped |
| 385 | 721 | ||
| 386 | def handle_starttag(self, name, namespace, nsprefix, attrs): | 722 | def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, |
| 387 | """Push a start tag on to the stack. | 723 | sourcepos=None, namespaces=None): |
| 388 | 724 | """Called by the tree builder when a new tag is encountered. | |
| 389 | If this method returns None, the tag was rejected by the | 725 | |
| 390 | SoupStrainer. You should proceed as if the tag had not occured | 726 | :param name: Name of the tag. |
| 727 | :param nsprefix: Namespace prefix for the tag. | ||
| 728 | :param attrs: A dictionary of attribute values. | ||
| 729 | :param sourceline: The line number where this tag was found in its | ||
| 730 | source document. | ||
| 731 | :param sourcepos: The character position within `sourceline` where this | ||
| 732 | tag was found. | ||
| 733 | :param namespaces: A dictionary of all namespace prefix mappings | ||
| 734 | currently in scope in the document. | ||
| 735 | |||
| 736 | If this method returns None, the tag was rejected by an active | ||
| 737 | SoupStrainer. You should proceed as if the tag had not occurred | ||
| 391 | in the document. For instance, if this was a self-closing tag, | 738 | in the document. For instance, if this was a self-closing tag, |
| 392 | don't call handle_endtag. | 739 | don't call handle_endtag. |
| 393 | """ | 740 | """ |
| 394 | 741 | # print("Start tag %s: %s" % (name, attrs)) | |
| 395 | # print "Start tag %s: %s" % (name, attrs) | ||
| 396 | self.endData() | 742 | self.endData() |
| 397 | 743 | ||
| 398 | if (self.parse_only and len(self.tagStack) <= 1 | 744 | if (self.parse_only and len(self.tagStack) <= 1 |
| @@ -400,34 +746,54 @@ class BeautifulSoup(Tag): | |||
| 400 | or not self.parse_only.search_tag(name, attrs))): | 746 | or not self.parse_only.search_tag(name, attrs))): |
| 401 | return None | 747 | return None |
| 402 | 748 | ||
| 403 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, | 749 | tag = self.element_classes.get(Tag, Tag)( |
| 404 | self.currentTag, self._most_recent_element) | 750 | self, self.builder, name, namespace, nsprefix, attrs, |
| 751 | self.currentTag, self._most_recent_element, | ||
| 752 | sourceline=sourceline, sourcepos=sourcepos, | ||
| 753 | namespaces=namespaces | ||
| 754 | ) | ||
| 405 | if tag is None: | 755 | if tag is None: |
| 406 | return tag | 756 | return tag |
| 407 | if self._most_recent_element: | 757 | if self._most_recent_element is not None: |
| 408 | self._most_recent_element.next_element = tag | 758 | self._most_recent_element.next_element = tag |
| 409 | self._most_recent_element = tag | 759 | self._most_recent_element = tag |
| 410 | self.pushTag(tag) | 760 | self.pushTag(tag) |
| 411 | return tag | 761 | return tag |
| 412 | 762 | ||
| 413 | def handle_endtag(self, name, nsprefix=None): | 763 | def handle_endtag(self, name, nsprefix=None): |
| 414 | #print "End tag: " + name | 764 | """Called by the tree builder when an ending tag is encountered. |
| 765 | |||
| 766 | :param name: Name of the tag. | ||
| 767 | :param nsprefix: Namespace prefix for the tag. | ||
| 768 | """ | ||
| 769 | #print("End tag: " + name) | ||
| 415 | self.endData() | 770 | self.endData() |
| 416 | self._popToTag(name, nsprefix) | 771 | self._popToTag(name, nsprefix) |
| 417 | 772 | ||
| 418 | def handle_data(self, data): | 773 | def handle_data(self, data): |
| 774 | """Called by the tree builder when a chunk of textual data is encountered.""" | ||
| 419 | self.current_data.append(data) | 775 | self.current_data.append(data) |
| 420 | 776 | ||
| 421 | def decode(self, pretty_print=False, | 777 | def decode(self, pretty_print=False, |
| 422 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, | 778 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, |
| 423 | formatter="minimal"): | 779 | formatter="minimal", iterator=None): |
| 424 | """Returns a string or Unicode representation of this document. | 780 | """Returns a string or Unicode representation of the parse tree |
| 425 | To get Unicode, pass None for encoding.""" | 781 | as an HTML or XML document. |
| 426 | 782 | ||
| 783 | :param pretty_print: If this is True, indentation will be used to | ||
| 784 | make the document more readable. | ||
| 785 | :param eventual_encoding: The encoding of the final document. | ||
| 786 | If this is None, the document will be a Unicode string. | ||
| 787 | """ | ||
| 427 | if self.is_xml: | 788 | if self.is_xml: |
| 428 | # Print the XML declaration | 789 | # Print the XML declaration |
| 429 | encoding_part = '' | 790 | encoding_part = '' |
| 430 | if eventual_encoding is not None: | 791 | if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: |
| 792 | # This is a special Python encoding; it can't actually | ||
| 793 | # go into an XML document because it means nothing | ||
| 794 | # outside of Python. | ||
| 795 | eventual_encoding = None | ||
| 796 | if eventual_encoding != None: | ||
| 431 | encoding_part = ' encoding="%s"' % eventual_encoding | 797 | encoding_part = ' encoding="%s"' % eventual_encoding |
| 432 | prefix = '<?xml version="1.0"%s?>\n' % encoding_part | 798 | prefix = '<?xml version="1.0"%s?>\n' % encoding_part |
| 433 | else: | 799 | else: |
| @@ -437,9 +803,9 @@ class BeautifulSoup(Tag): | |||
| 437 | else: | 803 | else: |
| 438 | indent_level = 0 | 804 | indent_level = 0 |
| 439 | return prefix + super(BeautifulSoup, self).decode( | 805 | return prefix + super(BeautifulSoup, self).decode( |
| 440 | indent_level, eventual_encoding, formatter) | 806 | indent_level, eventual_encoding, formatter, iterator) |
| 441 | 807 | ||
| 442 | # Alias to make it easier to type import: 'from bs4 import _soup' | 808 | # Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' |
| 443 | _s = BeautifulSoup | 809 | _s = BeautifulSoup |
| 444 | _soup = BeautifulSoup | 810 | _soup = BeautifulSoup |
| 445 | 811 | ||
| @@ -450,19 +816,25 @@ class BeautifulStoneSoup(BeautifulSoup): | |||
| 450 | kwargs['features'] = 'xml' | 816 | kwargs['features'] = 'xml' |
| 451 | warnings.warn( | 817 | warnings.warn( |
| 452 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' | 818 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' |
| 453 | 'it, pass features="xml" into the BeautifulSoup constructor.') | 819 | 'it, pass features="xml" into the BeautifulSoup constructor.', |
| 820 | DeprecationWarning, stacklevel=2 | ||
| 821 | ) | ||
| 454 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) | 822 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) |
| 455 | 823 | ||
| 456 | 824 | ||
| 457 | class StopParsing(Exception): | 825 | class StopParsing(Exception): |
| 826 | """Exception raised by a TreeBuilder if it's unable to continue parsing.""" | ||
| 458 | pass | 827 | pass |
| 459 | 828 | ||
| 460 | class FeatureNotFound(ValueError): | 829 | class FeatureNotFound(ValueError): |
| 830 | """Exception raised by the BeautifulSoup constructor if no parser with the | ||
| 831 | requested features is found. | ||
| 832 | """ | ||
| 461 | pass | 833 | pass |
| 462 | 834 | ||
| 463 | 835 | ||
| 464 | #By default, act as an HTML pretty-printer. | 836 | #If this file is run as a script, act as an HTML pretty-printer. |
| 465 | if __name__ == '__main__': | 837 | if __name__ == '__main__': |
| 466 | import sys | 838 | import sys |
| 467 | soup = BeautifulSoup(sys.stdin) | 839 | soup = BeautifulSoup(sys.stdin) |
| 468 | print(soup.prettify()) | 840 | print((soup.prettify())) |
