1 files changed, 1465 insertions, 754 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
index 68be42d138..0aefe734b2 100644
--- a/bitbake/lib/bs4/element.py
+++ b/bitbake/lib/bs4/element.py
@@ -1,14 +1,27 @@
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"
-import collections.abc
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError as e:
+    from collections import Callable
 import re
 import sys
 import warnings
-from bs4.dammit import EntitySubstitution
+from bs4.css import CSS
+from bs4.formatter import (
+    Formatter,
+    HTMLFormatter,
+    XMLFormatter,
+)
 DEFAULT_OUTPUT_ENCODING = "utf-8"
-PY3K = (sys.version_info[0] > 2)
+nonwhitespace_re = re.compile(r"\S+")
+# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
+# the off chance someone imported it for their own use.
 whitespace_re = re.compile(r"\s+")
 def _alias(attr):
@@ -23,12 +36,49 @@ def _alias(attr):
    return alias
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+    "idna",
+    "mbcs",
+    "oem",
+    "palmos",
+    "punycode",
+    "raw_unicode_escape",
+    "undefined",
+    "unicode_escape",
+    "raw-unicode-escape",
+    "unicode-escape",
+    "string-escape",
+    "string_escape",
+])
 class NamespacedAttribute(str):
+    """A namespaced string (e.g. 'xml:lang') that remembers the namespace
+    ('xml') and the name ('lang') that were used to create it.
+    """
-    def __new__(cls, prefix, name, namespace=None):
+    def __new__(cls, prefix, name=None, namespace=None):
-        if name is None:
+        if not name:
+            # This is the default namespace. Its name "has no value"
+            # per https://www.w3.org/TR/xml-names/#defaulting
+            name = None
+        if not name:
            obj = str.__new__(cls, prefix)
-        elif prefix is None:
+        elif not prefix:
            # Not really namespaced.
            obj = str.__new__(cls, name)
        else:
@@ -54,6 +104,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
        return obj
    def encode(self, encoding):
+        """When an HTML document is being encoded to a given encoding, the
+        value of a meta tag's 'charset' is the name of the encoding.
+        """
+        if encoding in PYTHON_SPECIFIC_ENCODINGS:
+            return ''
        return encoding
@@ -79,118 +134,44 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
        return obj
    def encode(self, encoding):
+        if encoding in PYTHON_SPECIFIC_ENCODINGS:
+            return ''
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
-class HTMLAwareEntitySubstitution(EntitySubstitution):
-    """Entity substitution rules that are aware of some HTML quirks.
-    Specifically, the contents of <script> and <style> tags should not
+class PageElement(object):
-    undergo entity substitution.
+    """Contains the navigational information for some part of the page:
+    that is, its current location in the parse tree.
-    Incoming NavigableString objects are checked to see if they're the
+    NavigableString, Tag, etc. are all subclasses of PageElement.
-    direct children of a <script> or <style> tag.
    """
-    cdata_containing_tags = set(["script", "style"])
+    # In general, we can't tell just by looking at an element whether
+    # it's contained in an XML document or an HTML document. But for
+    # Tags (q.v.) we can store this information at parse time.
+    known_xml = None
-    preformatted_tags = set(["pre"])
+    def setup(self, parent=None, previous_element=None, next_element=None,
+              previous_sibling=None, next_sibling=None):
-    @classmethod
+        """Sets up the initial relations between this element and
-    def _substitute_if_appropriate(cls, ns, f):
+        other elements.
-        if (isinstance(ns, NavigableString)
-            and ns.parent is not None
-            and ns.parent.name in cls.cdata_containing_tags):
-            # Do nothing.
-            return ns
-        # Substitute.
-        return f(ns)
-    @classmethod
+        :param parent: The parent of this element.
-    def substitute_html(cls, ns):
-        return cls._substitute_if_appropriate(
-            ns, EntitySubstitution.substitute_html)
-    @classmethod
+        :param previous_element: The element parsed immediately before
-    def substitute_xml(cls, ns):
+            this one.
-        return cls._substitute_if_appropriate(
-            ns, EntitySubstitution.substitute_xml)
-class PageElement(object):
+        :param next_element: The element parsed immediately after
-    """Contains the navigational information for some part of the page
+            this one.
-    (either a tag or a piece of text)"""
-    # There are five possible values for the "formatter" argument passed in
-    # to methods like encode() and prettify():
-    #
-    # "html" - All Unicode characters with corresponding HTML entities
-    #   are converted to those entities on output.
-    # "minimal" - Bare ampersands and angle brackets are converted to
-    #   XML entities: &amp; &lt; &gt;
-    # None - The null formatter. Unicode characters are never
-    #   converted to entities.  This is not recommended, but it's
-    #   faster than "minimal".
-    # A function - This function will be called on every string that
-    #  needs to undergo entity substitution.
-    #
-    # In an HTML document, the default "html" and "minimal" functions
-    # will leave the contents of <script> and <style> tags alone. For
-    # an XML document, all tags will be given the same treatment.
-    HTML_FORMATTERS = {
-        "html" : HTMLAwareEntitySubstitution.substitute_html,
-        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
-        None : None
-        }
-    XML_FORMATTERS = {
-        "html" : EntitySubstitution.substitute_html,
-        "minimal" : EntitySubstitution.substitute_xml,
-        None : None
-        }
-    def format_string(self, s, formatter='minimal'):
-        """Format the given string using the given formatter."""
-        if not isinstance(formatter, collections.abc.Callable):
-            formatter = self._formatter_for_name(formatter)
-        if formatter is None:
-            output = s
-        else:
-            output = formatter(s)
-        return output
-    @property
+        :param previous_sibling: The most recently encountered element
-    def _is_xml(self):
+            on the same level of the parse tree as this one.
-        """Is this element part of an XML tree or an HTML tree?
-        This is used when mapping a formatter name ("minimal") to an
+        :param next_sibling: The next element to be encountered
-        appropriate function (one that performs entity-substitution on
+            on the same level of the parse tree as this one.
-        the contents of <script> and <style> tags, or not). It's
-        inefficient, but it should be called very rarely.
        """
-        if self.parent is None:
-            # This is the top-level object. It should have .is_xml set
-            # from tree creation. If not, take a guess--BS is usually
-            # used on HTML markup.
-            return getattr(self, 'is_xml', False)
-        return self.parent._is_xml
-    def _formatter_for_name(self, name):
-        "Look up a formatter function based on its name and the tree."
-        if self._is_xml:
-            return self.XML_FORMATTERS.get(
-                name, EntitySubstitution.substitute_xml)
-        else:
-            return self.HTML_FORMATTERS.get(
-                name, HTMLAwareEntitySubstitution.substitute_xml)
-    def setup(self, parent=None, previous_element=None, next_element=None,
-              previous_sibling=None, next_sibling=None):
-        """Sets up the initial relations between this element and
-        other elements."""
        self.parent = parent
        self.previous_element = previous_element
@@ -198,48 +179,156 @@ class PageElement(object):
            self.previous_element.next_element = self
        self.next_element = next_element
-        if self.next_element:
+        if self.next_element is not None:
            self.next_element.previous_element = self
        self.next_sibling = next_sibling
-        if self.next_sibling:
+        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self
-        if (not previous_sibling
+        if (previous_sibling is None
            and self.parent is not None and self.parent.contents):
            previous_sibling = self.parent.contents[-1]
        self.previous_sibling = previous_sibling
-        if previous_sibling:
+        if previous_sibling is not None:
            self.previous_sibling.next_sibling = self
+    def format_string(self, s, formatter):
+        """Format the given string using the given formatter.
+        :param s: A string.
+        :param formatter: A Formatter object, or a string naming one of the standard formatters.
+        """
+        if formatter is None:
+            return s
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        output = formatter.substitute(s)
+        return output
+    def formatter_for_name(self, formatter):
+        """Look up or create a Formatter for the given identifier,
+        if necessary.
+        :param formatter: Can be a Formatter object (used as-is), a
+            function (used as the entity substitution hook for an
+            XMLFormatter or HTMLFormatter), or a string (used to look
+            up an XMLFormatter or HTMLFormatter in the appropriate
+            registry).
+        """
+        if isinstance(formatter, Formatter):
+            return formatter
+        if self._is_xml:
+            c = XMLFormatter
+        else:
+            c = HTMLFormatter
+        if isinstance(formatter, Callable):
+            return c(entity_substitution=formatter)
+        return c.REGISTRY[formatter]
+    @property
+    def _is_xml(self):
+        """Is this element part of an XML tree or an HTML tree?
+        This is used in formatter_for_name, when deciding whether an
+        XMLFormatter or HTMLFormatter is more appropriate. It can be
+        inefficient, but it should be called very rarely.
+        """
+        if self.known_xml is not None:
+            # Most of the time we will have determined this when the
+            # document is parsed.
+            return self.known_xml
+        # Otherwise, it's likely that this element was created by
+        # direct invocation of the constructor from within the user's
+        # Python code.
+        if self.parent is None:
+            # This is the top-level object. It should have .known_xml set
+            # from tree creation. If not, take a guess--BS is usually
+            # used on HTML markup.
+            return getattr(self, 'is_xml', False)
+        return self.parent._is_xml
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3
-    def replace_with(self, replace_with):
+    default = object()
-        if not self.parent:
+    def _all_strings(self, strip=False, types=default):
+        """Yield all strings of certain classes, possibly stripping them.
+        This is implemented differently in Tag and NavigableString.
+        """
+        raise NotImplementedError()
+    @property
+    def stripped_strings(self):
+        """Yield all strings in this PageElement, stripping them first.
+        :yield: A sequence of stripped strings.
+        """
+        for string in self._all_strings(True):
+            yield string
+    def get_text(self, separator="", strip=False,
+                 types=default):
+        """Get all child strings of this PageElement, concatenated using the
+        given separator.
+        :param separator: Strings will be concatenated using this separator.
+        :param strip: If True, strings will be stripped before being
+            concatenated.
+        :param types: A tuple of NavigableString subclasses. Any
+            strings of a subclass not found in this list will be
+            ignored. Although there are exceptions, the default
+            behavior in most cases is to consider only NavigableString
+            and CData objects. That means no comments, processing
+            instructions, etc.
+        :return: A string.
+        """
+        return separator.join([s for s in self._all_strings(
+                    strip, types=types)])
+    getText = get_text
+    text = property(get_text)
+    def replace_with(self, *args):
+        """Replace this PageElement with one or more PageElements, keeping the
+        rest of the tree the same.
+        :param args: One or more PageElements.
+        :return: `self`, no longer part of the tree.
+        """
+        if self.parent is None:
            raise ValueError(
-                "Cannot replace one element with another when the"
+                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
-        if replace_with is self:
+        if len(args) == 1 and args[0] is self:
            return
-        if replace_with is self.parent:
+        if any(x is self.parent for x in args):
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
-        self.extract()
+        self.extract(_self_index=my_index)
-        old_parent.insert(my_index, replace_with)
+        for idx, replace_with in enumerate(args, start=my_index):
+            old_parent.insert(idx, replace_with)
        return self
    replaceWith = replace_with  # BS3
    def unwrap(self):
+        """Replace this PageElement with its contents.
+        :return: `self`, no longer part of the tree.
+        """
        my_parent = self.parent
-        if not self.parent:
+        if self.parent is None:
            raise ValueError(
                "Cannot replace an element with its contents when that"
                "element is not part of a tree.")
        my_index = self.parent.index(self)
-        self.extract()
+        self.extract(_self_index=my_index)
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
@@ -247,14 +336,29 @@ class PageElement(object):
    replaceWithChildren = unwrap  # BS3
    def wrap(self, wrap_inside):
+        """Wrap this PageElement inside another one.
+        :param wrap_inside: A PageElement.
+        :return: `wrap_inside`, occupying the position in the tree that used
+           to be occupied by `self`, and with `self` inside it.
+        """
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside
-    def extract(self):
+    def extract(self, _self_index=None):
-        """Destructively rips this element out of the tree."""
+        """Destructively rips this element out of the tree.
+        :param _self_index: The location of this element in its parent's
+           .contents, if known. Passing this in allows for a performance
+           optimization.
+        :return: `self`, no longer part of the tree.
+        """
        if self.parent is not None:
-            del self.parent.contents[self.parent.index(self)]
+            if _self_index is None:
+                _self_index = self.parent.index(self)
+            del self.parent.contents[_self_index]
        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
@@ -281,8 +385,13 @@ class PageElement(object):
        return self
    def _last_descendant(self, is_initialized=True, accept_self=True):
-        "Finds the last element beneath this object to be parsed."
+        """Finds the last element beneath this object to be parsed.
-        if is_initialized and self.next_sibling:
+        :param is_initialized: Has `setup` been called on this PageElement
+            yet?
+        :param accept_self: Is `self` an acceptable answer to the question?
+        """
+        if is_initialized and self.next_sibling is not None:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
@@ -295,6 +404,14 @@ class PageElement(object):
    _lastRecursiveChild = _last_descendant
    def insert(self, position, new_child):
+        """Insert a new PageElement in the list of this PageElement's children.
+        This works the same way as `list.insert`.
+        :param position: The numeric position that should be occupied
+           in `self.contents` by the new PageElement.
+        :param new_child: A PageElement.
+        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
@@ -303,6 +420,14 @@ class PageElement(object):
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)
+        from bs4 import BeautifulSoup
+        if isinstance(new_child, BeautifulSoup):
+            # We don't want to end up with a situation where one BeautifulSoup
+            # object contains another. Insert the children one at a time.
+            for subchild in list(new_child.contents):
+                self.insert(position, subchild)
+                position += 1
+            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
@@ -361,160 +486,326 @@ class PageElement(object):
        self.contents.insert(position, new_child)
    def append(self, tag):
-        """Appends the given tag to the contents of this tag."""
+        """Appends the given PageElement to the contents of this one.
+        :param tag: A PageElement.
+        """
        self.insert(len(self.contents), tag)
-    def insert_before(self, predecessor):
+    def extend(self, tags):
-        """Makes the given element the immediate predecessor of this one.
+        """Appends the given PageElements to this one's contents.
-        The two elements will have the same parent, and the given element
+        :param tags: A list of PageElements. If a single Tag is
+            provided instead, this PageElement's contents will be extended
+            with that Tag's contents.
+        """
+        if isinstance(tags, Tag):
+            tags = tags.contents
+        if isinstance(tags, list):
+            # Moving items around the tree may change their position in
+            # the original list. Make a list that won't change.
+            tags = list(tags)
+        for tag in tags:
+            self.append(tag)
+    def insert_before(self, *args):
+        """Makes the given element(s) the immediate predecessor of this one.
+        All the elements will have the same parent, and the given elements
        will be immediately before this one.
+        :param args: One or more PageElements.
        """
-        if self is predecessor:
-            raise ValueError("Can't insert an element before itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
-        # Extract first so that the index won't be screwed up if they
+        if any(x is self for x in args):
-        # are siblings.
+                raise ValueError("Can't insert an element before itself.")
-        if isinstance(predecessor, PageElement):
+        for predecessor in args:
-            predecessor.extract()
+            # Extract first so that the index won't be screwed up if they
-        index = parent.index(self)
+            # are siblings.
-        parent.insert(index, predecessor)
+            if isinstance(predecessor, PageElement):
+                predecessor.extract()
-    def insert_after(self, successor):
+            index = parent.index(self)
-        """Makes the given element the immediate successor of this one.
+            parent.insert(index, predecessor)
-        The two elements will have the same parent, and the given element
+    def insert_after(self, *args):
+        """Makes the given element(s) the immediate successor of this one.
+        The elements will have the same parent, and the given elements
        will be immediately after this one.
+        :param args: One or more PageElements.
        """
-        if self is successor:
+        # Do all error checking before modifying the tree.
-            raise ValueError("Can't insert an element after itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
-        # Extract first so that the index won't be screwed up if they
+        if any(x is self for x in args):
-        # are siblings.
+            raise ValueError("Can't insert an element after itself.")
-        if isinstance(successor, PageElement):
-            successor.extract()
+        offset = 0
-        index = parent.index(self)
+        for successor in args:
-        parent.insert(index+1, successor)
+            # Extract first so that the index won't be screwed up if they
+            # are siblings.
-    def find_next(self, name=None, attrs={}, text=None, **kwargs):
+            if isinstance(successor, PageElement):
-        """Returns the first item that matches the given criteria and
+                successor.extract()
-        appears after this Tag in the document."""
+            index = parent.index(self)
-        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
+            parent.insert(index+1+offset, successor)
+            offset += 1
+    def find_next(self, name=None, attrs={}, string=None, **kwargs):
+        """Find the first PageElement that matches the given criteria and
+        appears later in the document than this PageElement.
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
+        return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
    findNext = find_next  # BS3
-    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
+    def find_all_next(self, name=None, attrs={}, string=None, limit=None,
                    **kwargs):
-        """Returns all items that match the given criteria and appear
+        """Find all PageElements that match the given criteria and appear
-        after this Tag in the document."""
+        later in the document than this PageElement.
-        return self._find_all(name, attrs, text, limit, self.next_elements,
-                             **kwargs)
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet containing PageElements.
+        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(name, attrs, string, limit, self.next_elements,
+                              _stacklevel=_stacklevel+1, **kwargs)
    findAllNext = find_all_next  # BS3
-    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
+    def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
-        """Returns the closest sibling to this Tag that matches the
+        """Find the closest sibling to this PageElement that matches the
-        given criteria and appears after this Tag in the document."""
+        given criteria and appears later in the document.
-        return self._find_one(self.find_next_siblings, name, attrs, text,
+        All find_* methods take a common set of arguments. See the
+        online documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
+        return self._find_one(self.find_next_siblings, name, attrs, string,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3
-    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
+    def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
                           **kwargs):
-        """Returns the siblings of this Tag that match the given
+        """Find all siblings of this PageElement that match the given criteria
-        criteria and appear after this Tag in the document."""
+        and appear later in the document.
-        return self._find_all(name, attrs, text, limit,
-                              self.next_siblings, **kwargs)
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(
+            name, attrs, string, limit,
+            self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
+        )
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2
-    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
+    def find_previous(self, name=None, attrs={}, string=None, **kwargs):
-        """Returns the first item that matches the given criteria and
+        """Look backwards in the document from this PageElement and find the
-        appears before this Tag in the document."""
+        first PageElement that matches the given criteria.
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
        return self._find_one(
-            self.find_all_previous, name, attrs, text, **kwargs)
+            self.find_all_previous, name, attrs, string, **kwargs)
    findPrevious = find_previous  # BS3
-    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
+    def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                        **kwargs):
-        """Returns all items that match the given criteria and appear
+        """Look backwards in the document from this PageElement and find all
-        before this Tag in the document."""
+        PageElements that match the given criteria.
-        return self._find_all(name, attrs, text, limit, self.previous_elements,
-                           **kwargs)
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(
+            name, attrs, string, limit, self.previous_elements,
+            _stacklevel=_stacklevel+1, **kwargs
+        )
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2
-    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
+    def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
-        """Returns the closest sibling to this Tag that matches the
+        """Returns the closest sibling to this PageElement that matches the
-        given criteria and appears before this Tag in the document."""
+        given criteria and appears earlier in the document.
-        return self._find_one(self.find_previous_siblings, name, attrs, text,
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
+        return self._find_one(self.find_previous_siblings, name, attrs, string,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3
-    def find_previous_siblings(self, name=None, attrs={}, text=None,
+    def find_previous_siblings(self, name=None, attrs={}, string=None,
                               limit=None, **kwargs):
-        """Returns the siblings of this Tag that match the given
+        """Returns all siblings to this PageElement that match the
-        criteria and appear before this Tag in the document."""
+        given criteria and appear earlier in the document.
-        return self._find_all(name, attrs, text, limit,
-                              self.previous_siblings, **kwargs)
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(
+            name, attrs, string, limit,
+            self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
+        )
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2
    def find_parent(self, name=None, attrs={}, **kwargs):
-        """Returns the closest parent of this Tag that matches the given
+        """Find the closest parent of this PageElement that matches the given
-        criteria."""
+        criteria.
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
-        l = self.find_parents(name, attrs, 1, **kwargs)
+        l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3
    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
-        """Returns the parents of this Tag that match the given
+        """Find all parents of this PageElement that match the given criteria.
-        criteria."""
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
        return self._find_all(name, attrs, None, limit, self.parents,
-                             **kwargs)
+                              _stacklevel=_stacklevel+1, **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2
    @property
    def next(self):
+        """The PageElement, if any, that was parsed just after this one.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
        return self.next_element
    @property
    def previous(self):
+        """The PageElement, if any, that was parsed just before this one.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
        return self.previous_element
    #These methods do the real heavy lifting.
-    def _find_one(self, method, name, attrs, text, **kwargs):
+    def _find_one(self, method, name, attrs, string, **kwargs):
        r = None
-        l = method(name, attrs, text, 1, **kwargs)
+        l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
        if l:
            r = l[0]
        return r
-    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
+    def _find_all(self, name, attrs, string, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."
+        _stacklevel = kwargs.pop('_stacklevel', 3)
-        if text is None and 'string' in kwargs:
+        if string is None and 'text' in kwargs:
-            text = kwargs['string']
+            string = kwargs.pop('text')
-            del kwargs['string']
+            warnings.warn(
+                "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
+                DeprecationWarning, stacklevel=_stacklevel
+            )
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
-            strainer = SoupStrainer(name, attrs, text, **kwargs)
+            strainer = SoupStrainer(name, attrs, string, **kwargs)
-        if text is None and not limit and not attrs and not kwargs:
+        if string is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
@@ -522,9 +813,23 @@ class PageElement(object):
                return ResultSet(strainer, result)
            elif isinstance(name, str):
                # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix. If this is a namespace-aware document,
+                    # we need to match the local name against tag.name. If not,
+                    # we need to match the fully-qualified name against tag.name.
+                    prefix, local_name = name.split(':', 1)
+                else:
+                    prefix = None
+                    local_name = name
                result = (element for element in generator
                          if isinstance(element, Tag)
-                            and element.name == name)
+                          and ((  # the Tag check must guard both name alternatives
+                              element.name == name
+                          ) or (
+                              element.name == local_name
+                              and (prefix is None or element.prefix == prefix)
+                          ))
+                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        while True:
@@ -544,6 +849,10 @@ class PageElement(object):
    #NavigableStrings and Tags.
    @property
    def next_elements(self):
+        """All PageElements that were parsed after this one.
+        :yield: A sequence of PageElements.
+        """
        i = self.next_element
        while i is not None:
            yield i
@@ -551,6 +860,11 @@ class PageElement(object):
    @property
    def next_siblings(self):
+        """All PageElements that are siblings of this one but were parsed
+        later.
+        :yield: A sequence of PageElements.
+        """
        i = self.next_sibling
        while i is not None:
            yield i
@@ -558,6 +872,10 @@ class PageElement(object):
    @property
    def previous_elements(self):
+        """All PageElements that were parsed before this one.
+        :yield: A sequence of PageElements.
+        """
        i = self.previous_element
        while i is not None:
            yield i
@@ -565,6 +883,11 @@ class PageElement(object):
    @property
    def previous_siblings(self):
+        """All PageElements that are siblings of this one but were parsed
+        earlier.
+        :yield: A sequence of PageElements.
+        """
        i = self.previous_sibling
        while i is not None:
            yield i
@@ -572,87 +895,23 @@ class PageElement(object):
    @property
    def parents(self):
+        """All PageElements that are parents of this PageElement.
+        :yield: A sequence of PageElements.
+        """
        i = self.parent
        while i is not None:
            yield i
            i = i.parent
-    # Methods for supporting CSS selectors.
+    @property
+    def decomposed(self):
-    tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
+        """Check whether a PageElement has been decomposed.
-    # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
-    #   \---------------------------/  \---/\-------------/    \-------/
-    #     |                              |         |               |
-    #     |                              |         |           The value
-    #     |                              |    ~,|,^,$,* or =
-    #     |                           Attribute
-    #    Tag
-    attribselect_re = re.compile(
-        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
-        r'=?"?(?P<value>[^\]"]*)"?\]$'
-        )
-    def _attr_value_as_string(self, value, default=None):
-        """Force an attribute value into a string representation.
-        A multi-valued attribute will be converted into a
+        :rtype: bool
-        space-separated stirng.
        """
-        value = self.get(value, default)
+        return getattr(self, '_decomposed', False) or False
-        if isinstance(value, list) or isinstance(value, tuple):
+   
-            value =" ".join(value)
-        return value
-    def _tag_name_matches_and(self, function, tag_name):
-        if not tag_name:
-            return function
-        else:
-            def _match(tag):
-                return tag.name == tag_name and function(tag)
-            return _match
-    def _attribute_checker(self, operator, attribute, value=''):
-        """Create a function that performs a CSS selector operation.
-        Takes an operator, attribute and optional value. Returns a
-        function that will return True for elements that match that
-        combination.
-        """
-        if operator == '=':
-            # string representation of `attribute` is equal to `value`
-            return lambda el: el._attr_value_as_string(attribute) == value
-        elif operator == '~':
-            # space-separated list representation of `attribute`
-            # contains `value`
-            def _includes_value(element):
-                attribute_value = element.get(attribute, [])
-                if not isinstance(attribute_value, list):
-                    attribute_value = attribute_value.split()
-                return value in attribute_value
-            return _includes_value
-        elif operator == '^':
-            # string representation of `attribute` starts with `value`
-            return lambda el: el._attr_value_as_string(
-                attribute, '').startswith(value)
-        elif operator == '$':
-            # string represenation of `attribute` ends with `value`
-            return lambda el: el._attr_value_as_string(
-                attribute, '').endswith(value)
-        elif operator == '*':
-            # string representation of `attribute` contains `value`
-            return lambda el: value in el._attr_value_as_string(attribute, '')
-        elif operator == '|':
-            # string representation of `attribute` is either exactly
-            # `value` or starts with `value` and then a dash.
-            def _is_or_starts_with_dash(element):
-                attribute_value = element._attr_value_as_string(attribute, '')
-                return (attribute_value == value or attribute_value.startswith(
-                        value + '-'))
-            return _is_or_starts_with_dash
-        else:
-            return lambda el: el.has_attr(attribute)
    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
@@ -672,6 +931,11 @@ class PageElement(object):
 class NavigableString(str, PageElement):
+    """A Python Unicode string that is part of a parse tree.
+    When Beautiful Soup parses the markup <b>penguin</b>, it will
+    create a NavigableString for the string "penguin".
+    """
    PREFIX = ''
    SUFFIX = ''
@@ -691,12 +955,22 @@ class NavigableString(str, PageElement):
        u.setup()
        return u
-    def __copy__(self):
+    def __deepcopy__(self, memo, recursive=False):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
+        :param recursive: This parameter is ignored; it's only defined
+           so that NavigableString.__deepcopy__ implements the same
+           signature as Tag.__deepcopy__.
        """
        return type(self)(self)
+    def __copy__(self):
+        """A copy of a NavigableString can only be a deep copy, because
+        only one PageElement can occupy a given place in a parse tree.
+        """
+        return self.__deepcopy__({})
    def __getnewargs__(self):
        return (str(self),)
@@ -712,55 +986,146 @@ class NavigableString(str, PageElement):
                    self.__class__.__name__, attr))
    def output_ready(self, formatter="minimal"):
+        """Run the string through the provided formatter.
+        :param formatter: A Formatter object, or a string naming one of the standard formatters.
+        """
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX
    @property
    def name(self):
+        """Since a NavigableString is not a Tag, it has no .name.
+        This property is implemented so that code like this doesn't crash
+        when run on a mixture of Tag and NavigableString objects:
+            [x.name for x in tag.children]
+        """
        return None
    @name.setter
    def name(self, name):
+        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
+    def _all_strings(self, strip=False, types=PageElement.default):
+        """Yield all strings of certain classes, possibly stripping them.
+        This makes it easy for NavigableString to implement methods
+        like get_text() as conveniences, creating a consistent
+        text-extraction API across all PageElements.
+        :param strip: If True, all strings will be stripped before being
+            yielded.
+        :param types: A tuple of NavigableString subclasses. If this
+            NavigableString isn't one of those subclasses, the
+            sequence will be empty. By default, the subclasses
+            considered are NavigableString and CData objects. That
+            means no comments, processing instructions, etc.
+        :yield: A sequence that either contains this string, or is empty.
+        """
+        if types is self.default:
+            # This is kept in Tag because it's full of subclasses of
+            # this class, which aren't defined until later in the file.
+            types = Tag.DEFAULT_INTERESTING_STRING_TYPES
+        # Do nothing if the caller is looking for specific types of
+        # string, and we're of a different type.
+        #
+        # We check specific types instead of using isinstance(self,
+        # types) because all of these classes subclass
+        # NavigableString. Anyone who's using this feature probably
+        # wants generic NavigableStrings but not other stuff.
+        my_type = type(self)
+        if types is not None:
+            if isinstance(types, type):
+                # Looking for a single type.
+                if my_type is not types:
+                    return
+            elif my_type not in types:
+                # Looking for one of a list of types.
+                return
+        value = self
+        if strip:
+            value = value.strip()
+        if len(value) > 0:
+            yield value
+    strings = property(_all_strings)
 class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.
-    The string will be passed into the formatter (to trigger side effects),
+    This is an abstract class used for special kinds of strings such
-    but the return value will be ignored.
+    as comments (the Comment class) and CDATA blocks (the CData
+    class).
    """
-    def output_ready(self, formatter="minimal"):
+    PREFIX = ''
-        """CData strings are passed into the formatter.
+    SUFFIX = ''
-        But the return value is ignored."""
-        self.format_string(self, formatter)
+    def output_ready(self, formatter=None):
+        """Make this string ready for output by adding any subclass-specific
+            prefix or suffix.
+        :param formatter: A Formatter object, or a string naming one
+            of the standard formatters. The string will be passed into the
+            Formatter, but only to trigger any side effects: the return
+            value is ignored.
+        :return: The string, with any subclass-specific prefix and
+           suffix added on.
+        """
+        if formatter is not None:
+            ignore = self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
 class CData(PreformattedString):
+    """A CDATA block."""
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
 class ProcessingInstruction(PreformattedString):
+    """An SGML processing instruction."""
    PREFIX = '<?'
    SUFFIX = '>'
-class Comment(PreformattedString):
+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = '<?'
+    SUFFIX = '?>'
+class Comment(PreformattedString):
+    """An HTML or XML comment."""
    PREFIX = '<!--'
    SUFFIX = '-->'
 class Declaration(PreformattedString):
+    """An XML declaration."""
    PREFIX = '<?'
    SUFFIX = '?>'
 class Doctype(PreformattedString):
+    """A document type declaration."""
    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
+        """Generate an appropriate document type declaration for a given
+        public ID and system ID.
+        :param name: The name of the document's root element, e.g. 'html'.
+        :param pub_id: The Formal Public Identifier for this document type,
+            e.g. '-//W3C//DTD XHTML 1.1//EN'
+        :param system_id: The system identifier for this document type,
+            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+        :return: A Doctype.
+        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
@@ -775,14 +1140,105 @@ class Doctype(PreformattedString):
    SUFFIX = '>\n'
+class Stylesheet(NavigableString):
+    """A NavigableString representing a stylesheet (probably
+    CSS).
+    Used to distinguish embedded stylesheets from textual content.
+    """
+    pass
+class Script(NavigableString):
+    """A NavigableString representing an executable script (probably
+    Javascript).
+    Used to distinguish executable code from textual content.
+    """
+    pass
+class TemplateString(NavigableString):
+    """A NavigableString representing a string found inside an HTML
+    template embedded in a larger document.
+    Used to distinguish such strings from the main body of the document.
+    """
+    pass
+class RubyTextString(NavigableString):
+    """A NavigableString representing the contents of the <rt> HTML
+    element.
+    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
+    Can be used to distinguish such strings from the strings they're
+    annotating.
+    """
+    pass
+class RubyParenthesisString(NavigableString):
+    """A NavigableString representing the contents of the <rp> HTML
+    element.
+    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
+    """
+    pass
 class Tag(PageElement):
+    """Represents an HTML or XML tag that is part of a parse tree, along
+    with its attributes and contents.
-    """Represents a found HTML tag with its attributes and contents."""
+    When Beautiful Soup parses the markup <b>penguin</b>, it will
+    create a Tag object representing the <b> tag.
+    """
    def __init__(self, parser=None, builder=None, name=None, namespace=None,
-                 prefix=None, attrs=None, parent=None, previous=None):
+                 prefix=None, attrs=None, parent=None, previous=None,
-        "Basic constructor."
+                 is_xml=None, sourceline=None, sourcepos=None,
+                 can_be_empty_element=None, cdata_list_attributes=None,
+                 preserve_whitespace_tags=None,
+                 interesting_string_types=None,
+                 namespaces=None
+    ):
+        """Basic constructor.
+        :param parser: A BeautifulSoup object.
+        :param builder: A TreeBuilder.
+        :param name: The name of the tag.
+        :param namespace: The URI of this Tag's XML namespace, if any.
+        :param prefix: The prefix for this Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values.
+        :param parent: The PageElement to use as this Tag's parent.
+        :param previous: The PageElement that was parsed immediately before
+            this tag.
+        :param is_xml: If True, this is an XML tag. Otherwise, this is an
+            HTML tag.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+        :param can_be_empty_element: If True, this tag should be
+            represented as <tag/>. If False, this tag should be represented
+            as <tag></tag>.
+        :param cdata_list_attributes: A list of attributes whose values should
+            be treated as CDATA if they ever show up on this tag.
+        :param preserve_whitespace_tags: A list of tag names whose contents
+            should have their whitespace preserved.
+        :param interesting_string_types: This is a NavigableString
+            subclass or a tuple of them. When iterating over this
+            Tag's strings in methods like Tag.strings or Tag.get_text,
+            these are the types of strings that are interesting enough
+            to be considered. The default is to consider
+            NavigableString and CData the only interesting string
+            subtypes.
+        :param namespaces: A dictionary mapping currently active
+            namespace prefixes to URIs. This can be used later to
+            construct CSS selectors.
+        """
        if parser is None:
            self.parser_class = None
        else:
@@ -793,7 +1249,12 @@ class Tag(PageElement):
            raise ValueError("No value provided for new tag's name.")
        self.name = name
        self.namespace = namespace
+        self._namespaces = namespaces or {}
        self.prefix = prefix
+        if ((not builder or builder.store_line_numbers)
+            and (sourceline is not None or sourcepos is not None)):
+            self.sourceline = sourceline
+            self.sourcepos = sourcepos
        if attrs is None:
            attrs = {}
        elif attrs:
@@ -804,32 +1265,109 @@ class Tag(PageElement):
                attrs = dict(attrs)
        else:
            attrs = dict(attrs)
+        # If possible, determine ahead of time whether this tag is an
+        # XML tag.
+        if builder:
+            self.known_xml = builder.is_xml
+        else:
+            self.known_xml = is_xml
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
-        # Set up any substitutions, such as the charset in a META tag.
+        if builder is None:
-        if builder is not None:
+            # In the absence of a TreeBuilder, use whatever values were
+            # passed in here. They're probably None, unless this is a copy of some
+            # other tag.
+            self.can_be_empty_element = can_be_empty_element
+            self.cdata_list_attributes = cdata_list_attributes
+            self.preserve_whitespace_tags = preserve_whitespace_tags
+            self.interesting_string_types = interesting_string_types
+        else:
+            # Set up any substitutions for this tag, such as the charset in a META tag.
            builder.set_up_substitutions(self)
+            # Ask the TreeBuilder whether this tag might be an empty-element tag.
            self.can_be_empty_element = builder.can_be_empty_element(name)
-        else:
-            self.can_be_empty_element = False
+            # Keep track of the list of attributes of this tag that
+            # might need to be treated as a list.
+            #
+            # For performance reasons, we store the whole data structure
+            # rather than asking the question of every tag. Asking would
+            # require building a new data structure every time, and
+            # (unlike can_be_empty_element), we almost never need
+            # to check this.
+            self.cdata_list_attributes = builder.cdata_list_attributes
+            # Keep track of the names that might cause this tag to be treated as a
+            # whitespace-preserved tag.
+            self.preserve_whitespace_tags = builder.preserve_whitespace_tags
+            if self.name in builder.string_containers:
+                # This sort of tag keeps most of its strings in a special
+                # container subclass; record it as the interesting type.
+                self.interesting_string_types = builder.string_containers[self.name]
+            else:
+                self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
    parserClass = _alias("parser_class")  # BS3
-    def __copy__(self):
+    def __deepcopy__(self, memo, recursive=True):
-        """A copy of a Tag is a new Tag, unconnected to the parse tree.
+        """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
        Its contents are a copy of the old Tag's contents.
        """
-        clone = type(self)(None, self.builder, self.name, self.namespace,
+        clone = self._clone()
-                           self.nsprefix, self.attrs)
+        if recursive:
+            # Clone this tag's descendants recursively, but without
+            # making any recursive function calls.
+            tag_stack = [clone]
+            for event, element in self._event_stream(self.descendants):
+                if event is Tag.END_ELEMENT_EVENT:
+                    # Stop appending incoming Tags to the Tag that was
+                    # just closed.
+                    tag_stack.pop()
+                else:
+                    descendant_clone = element.__deepcopy__(
+                        memo, recursive=False
+                    )
+                    # Add to its parent's .contents
+                    tag_stack[-1].append(descendant_clone)
+                    if event is Tag.START_ELEMENT_EVENT:
+                        # Add the Tag itself to the stack so that its
+                        # children will be .appended to it.
+                        tag_stack.append(descendant_clone)
+        return clone
+    def __copy__(self):
+        """A copy of a Tag must always be a deep copy, because a Tag's
+        children can only have one parent at a time.
+        """
+        return self.__deepcopy__({})
+    def _clone(self):
+        """Create a new Tag just like this one, but with no
+        contents and unattached to any parse tree.
+        This is the first step in the deepcopy process.
+        """
+        clone = type(self)(
+            None, None, self.name, self.namespace,
+            self.prefix, self.attrs, is_xml=self._is_xml,
+            sourceline=self.sourceline, sourcepos=self.sourcepos,
+            can_be_empty_element=self.can_be_empty_element,
+            cdata_list_attributes=self.cdata_list_attributes,
+            preserve_whitespace_tags=self.preserve_whitespace_tags,
+            interesting_string_types=self.interesting_string_types
+        )
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
-        for child in self.contents:
-            clone.append(child.__copy__())
        return clone
+    
    @property
    def is_empty_element(self):
        """Is this tag an empty-element tag? (aka a self-closing tag)
@@ -850,13 +1388,17 @@ class Tag(PageElement):
    @property
    def string(self):
-        """Convenience property to get the single string within this tag.
+        """Convenience property to get the single string within this
+        PageElement.
-        :Return: If this tag has a single string child, return value
+        TODO It might make sense to have NavigableString.string return
-         is that string. If this tag has no children, or more than one
+        itself.
-         child, return value is None. If this tag has one child tag,
+        :return: If this element has a single string child, return
+         value is that string. If this element has one child tag,
         return value is the 'string' attribute of the child tag,
-         recursively.
+         recursively. If this element is itself a string, has no
+         children, or has more than one child, return value is None.
        """
        if len(self.contents) != 1:
            return None
@@ -867,57 +1409,75 @@ class Tag(PageElement):
    @string.setter
    def string(self, string):
+        """Replace this PageElement's contents with `string`."""
        self.clear()
        self.append(string.__class__(string))
-    def _all_strings(self, strip=False, types=(NavigableString, CData)):
+    DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
+    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield all strings of certain classes, possibly stripping them.
-        By default, yields only NavigableString and CData objects. So
+        :param strip: If True, all strings will be stripped before being
-        no comments, processing instructions, etc.
+            yielded.
+        :param types: A tuple of NavigableString subclasses. Any strings of
+            a subclass not found in this list will be ignored. By
+            default, the subclasses considered are the ones found in
+            self.interesting_string_types. If that's not specified,
+            only NavigableString and CData objects will be
+            considered. That means no comments, processing
+            instructions, etc.
+        :yield: A sequence of strings.
        """
+        if types is self.default:
+            types = self.interesting_string_types
        for descendant in self.descendants:
-            if (
+            if (types is None and not isinstance(descendant, NavigableString)):
-                (types is None and not isinstance(descendant, NavigableString))
+                continue
-                or
+            descendant_type = type(descendant)
-                (types is not None and type(descendant) not in types)):
+            if isinstance(types, type):
+                if descendant_type is not types:
+                    # We're not interested in strings of this type.
+                    continue
+            elif types is not None and descendant_type not in types:
+                # We're not interested in strings of this type.
                continue
            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0:
                    continue
            yield descendant
    strings = property(_all_strings)
-    @property
+    def decompose(self):
-    def stripped_strings(self):
+        """Recursively destroys this PageElement and its children.
-        for string in self._all_strings(True):
-            yield string
-    def get_text(self, separator="", strip=False,
+        This element will be removed from the tree and wiped out; so
-                 types=(NavigableString, CData)):
+        will everything beneath it.
-        """
-        Get all child strings, concatenated using the given separator.
-        """
-        return separator.join([s for s in self._all_strings(
-                    strip, types=types)])
-    getText = get_text
-    text = property(get_text)
-    def decompose(self):
+        The behavior of a decomposed PageElement is undefined and you
-        """Recursively destroys the contents of this tree."""
+        should never use one for anything, but if you need to _check_
+        whether an element has been decomposed, you can use the
+        `decomposed` property.
+        """
        self.extract()
        i = self
        while i is not None:
-            next = i.next_element
+            n = i.next_element
            i.__dict__.clear()
            i.contents = []
-            i = next
+            i._decomposed = True
+            i = n
    def clear(self, decompose=False):
-        """
+        """Wipe out all children of this PageElement by calling extract()
-        Extract all children. If decompose is True, decompose instead.
+           on them.
+        :param decompose: If this is True, decompose() (a more
+            destructive method) will be called instead of extract().
        """
        if decompose:
            for element in self.contents[:]:
@@ -929,10 +1489,51 @@ class Tag(PageElement):
            for element in self.contents[:]:
                element.extract()
-    def index(self, element):
+    def smooth(self):
+        """Smooth out this element's children by consolidating consecutive
+        strings.
+        This makes pretty-printed output look more natural following a
+        lot of operations that modified the tree.
+        Child Tags are smoothed recursively. PreformattedString
+        objects are never merged, and each merged pair is replaced by
+        a plain NavigableString.
         """
-        Find the index of a child by identity, not value. Avoids issues with
+        # Mark the first position of every pair of children that need
-        tag.contents.index(element) getting the index of equal elements.
+        # to be consolidated.  Do this rather than making a copy of
+        # self.contents, since in most cases very few strings will be
+        # affected.
+        marked = []
+        for i, a in enumerate(self.contents):
+            if isinstance(a, Tag):
+                # Recursively smooth children.
+                a.smooth()
+            if i == len(self.contents)-1:
+                # This is the last item in .contents, so there is no
+                # following sibling it could be merged with.
+                continue
+            b = self.contents[i+1]
+            if (isinstance(a, NavigableString)
+                and isinstance(b, NavigableString)
+                and not isinstance(a, PreformattedString)
+                and not isinstance(b, PreformattedString)
+            ):
+                marked.append(i)
+        # Go over the marked positions in reverse order, so that
+        # removing items from .contents won't affect the remaining
+        # positions.
+        for i in reversed(marked):
+            a = self.contents[i]
+            b = self.contents[i+1]
+            b.extract()
+            n = NavigableString(a+b)
+            a.replace_with(n)
+    def index(self, element):
+        """Find the index of a child by identity, not value.
+        Avoids issues with tag.contents.index(element) getting the
+        index of equal elements.
+        :param element: Look for this PageElement in `self.contents`.
        """
        for i, child in enumerate(self.contents):
            if child is element:
@@ -945,23 +1546,38 @@ class Tag(PageElement):
        attribute."""
        return self.attrs.get(key, default)
+    def get_attribute_list(self, key, default=None):
+        """The same as get(), but always returns a list.
+        :param key: The attribute to look for.
+        :param default: Use this value if the attribute is not present
+            on this PageElement.
+        :return: A list of values, probably containing only a single
+            value. If the attribute is missing and `default` is not
+            a list, this is `[default]` (so `[None]` by default).
+        """
+        value = self.get(key, default)
+        if not isinstance(value, list):
+            value = [value]
+        return value
    def has_attr(self, key):
+        """Does this PageElement have an attribute with the given name?"""
        return key in self.attrs
     def __hash__(self):
+        """Hash this element by the hash of its str() rendering."""
         return str(self).__hash__()
    def __getitem__(self, key):
-        """tag[key] returns the value of the 'key' attribute for the tag,
+        """tag[key] returns the value of the 'key' attribute for the Tag,
        and throws an exception if it's not there."""
        return self.attrs[key]
    def __iter__(self):
-        "Iterating over a tag iterates over its contents."
+        "Iterating over a Tag iterates over its contents."
        return iter(self.contents)
    def __len__(self):
-        "The length of a tag is the length of its list of contents."
+        "The length of a Tag is the length of its list of contents."
        return len(self.contents)
    def __contains__(self, x):
@@ -981,29 +1597,33 @@ class Tag(PageElement):
        self.attrs.pop(key, None)
    def __call__(self, *args, **kwargs):
-        """Calling a tag like a function is the same as calling its
+        """Calling a Tag like a function is the same as calling its
        find_all() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.find_all(*args, **kwargs)
     def __getattr__(self, tag):
-        #print "Getattr %s.%s" % (self.__class__, tag)
+        """Calling tag.subtag is the same as calling tag.find(name="subtag")
+        Names longer than three characters that end in 'Tag' are the
+        deprecated BS3 spelling: a DeprecationWarning is issued and the
+        name without the 'Tag' suffix is looked up instead.
+        :param tag: The attribute name being looked up.
+        :return: Whatever find() returns for that name.
+        :raises AttributeError: If the name does not end in 'Tag' and
+            either starts with '__' or is 'contents'.
+        """
+        #print("Getattr %s.%s" % (self.__class__, tag))
         if len(tag) > 3 and tag.endswith('Tag'):
             # BS3: soup.aTag -> "soup.find("a")
             tag_name = tag[:-3]
             warnings.warn(
-                '.%sTag is deprecated, use .find("%s") instead.' % (
+                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
-                    tag_name, tag_name))
+                    name=tag_name
+                ),
+                DeprecationWarning, stacklevel=2
+            )
             return self.find(tag_name)
         # We special case contents to avoid recursion.
-        elif not tag.startswith("__") and not tag=="contents":
+        elif not tag.startswith("__") and not tag == "contents":
             return self.find(tag)
         raise AttributeError(
             "'%s' object has no attribute '%s'" % (self.__class__, tag))
    def __eq__(self, other):
-        """Returns true iff this tag has the same name, the same attributes,
+        """Returns true iff this Tag has the same name, the same attributes,
-        and the same contents (recursively) as the given tag."""
+        and the same contents (recursively) as `other`."""
        if self is other:
            return True
        if (not hasattr(other, 'name') or
@@ -1019,69 +1639,235 @@ class Tag(PageElement):
        return True
    def __ne__(self, other):
-        """Returns true iff this tag is not identical to the other tag,
+        """Returns true iff this Tag is not identical to `other`,
        as defined in __eq__."""
        return not self == other
     def __repr__(self, encoding="unicode-escape"):
-        """Renders this tag as a string."""
+        """Renders this PageElement as a string.
-        if PY3K:
-            # "The return value must be a string object", i.e. Unicode
-            return self.decode()
-        else:
-            # "The return value must be a string object", i.e. a bytestring.
-            # By convention, the return value of __repr__ should also be
-            # an ASCII string.
-            return self.encode(encoding)
-    def __unicode__(self):
+        :param encoding: The encoding to use (Python 2 only).
+            TODO: This is now ignored and a warning should be issued
+            if a value is provided.
+        :return: A (Unicode) string.
+        """
+        # "The return value must be a string object", i.e. Unicode
         return self.decode()
-    def __str__(self):
+    def __unicode__(self):
-        if PY3K:
+        """Renders this PageElement as a Unicode string."""
-            return self.decode()
+        return self.decode()
-        else:
-            return self.encode()
-    if PY3K:
+    # NOTE: this also rebinds __repr__, so the __repr__ defined above
+    # (including its `encoding` parameter) is replaced by __unicode__.
+    __str__ = __repr__ = __unicode__
-        __str__ = __repr__ = __unicode__
     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
                indent_level=None, formatter="minimal",
                errors="xmlcharrefreplace"):
+        """Render a bytestring representation of this PageElement and its
+        contents.
+        :param encoding: The destination encoding. It is also passed
+            to decode() as its eventual_encoding argument.
+        :param indent_level: Each line of the rendering will be
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
+        :param formatter: A Formatter object, or a string naming one of
+            the standard formatters.
+        :param errors: An error handling strategy such as
+            'xmlcharrefreplace'. This value is passed along into
+            encode() and its value should be one of the constants
+            defined by Python.
+        :return: A bytestring.
+        """
         # Turn the data structure into Unicode, then encode the
         # Unicode.
         u = self.decode(indent_level, encoding, formatter)
         return u.encode(encoding, errors)
-    def _should_pretty_print(self, indent_level):
-        """Should this tag be pretty-printed?"""
-        return (
-            indent_level is not None and
-            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
-             or self._is_xml))
     def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               formatter="minimal"):
+               formatter="minimal",
-        """Returns a Unicode representation of this tag and its contents.
+               iterator=None):
+        """Render this PageElement and its contents as a Unicode string.
+        The tree is walked through the events yielded by
+        _event_stream() rather than through recursive calls.
+        :param indent_level: If this is None, no pretty-printing
+           whitespace is added. Otherwise each line is indented this
+           many levels (True is treated as 0).
+        :param eventual_encoding: The encoding the result is destined
+           for. decode() does not perform that encoding; the value is
+           only handed to _format_tag(), which substitutes it into
+           attribute values that mention the document's encoding.
+        :param formatter: A Formatter object, or a string naming one of
+            the standard formatters.
+        :param iterator: An alternate iterator to use when traversing
+            the tree; passed straight to _event_stream().
+        :return: A Unicode string.
+        """
+        pieces = []
+        # First off, turn a non-Formatter `formatter` into a Formatter
+        # object. This will stop the lookup from happening over and
+        # over again.
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        if indent_level is True:
+            indent_level = 0
+        # The currently active tag that put us into string literal
+        # mode. Until this element is closed, children will be treated
+        # as string literals and not pretty-printed. String literal
+        # mode is turned on immediately after this tag begins, and
+        # turned off immediately before it's closed. This means there
+        # will be whitespace before and after the tag itself.
+        string_literal_tag = None
+        for event, element in self._event_stream(iterator):
+            if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
+                piece = element._format_tag(
+                    eventual_encoding, formatter, opening=True
+                )
+            elif event is Tag.END_ELEMENT_EVENT:
+                piece = element._format_tag(
+                    eventual_encoding, formatter, opening=False
+                )
+                if indent_level is not None:
+                    indent_level -= 1
+            else:
+                piece = element.output_ready(formatter)
+            # Now we need to apply the 'prettiness' -- extra
+            # whitespace before and/or after this tag. This can get
+            # complicated because certain tags, like <pre> and
+            # <script>, can't be prettified, since adding whitespace would
+            # change the meaning of the content.
+            # The default behavior is to add whitespace before and
+            # after an element when string literal mode is off, and to
+            # leave things as they are when string literal mode is on.
+            if string_literal_tag:
+                indent_before = indent_after = False
+            else:
+                indent_before = indent_after = True
+            # The only time the behavior is more complex than that is
+            # when we encounter an opening or closing tag that might
+            # put us into or out of string literal mode.
+            # (_should_pretty_print() is called with its default
+            # indent_level, so only the tag's whitespace rules decide.)
+            if (event is Tag.START_ELEMENT_EVENT
+                and not string_literal_tag
+                and not element._should_pretty_print()):
+                    # We are about to enter string literal mode. Add
+                    # whitespace before this tag, but not after. We
+                    # will stay in string literal mode until this tag
+                    # is closed.
+                    indent_before = True
+                    indent_after = False
+                    string_literal_tag = element
+            elif (event is Tag.END_ELEMENT_EVENT
+                  and element is string_literal_tag):
+                # We are about to exit string literal mode by closing
+                # the tag that sent us into that mode. Add whitespace
+                # after this tag, but not before.
+                indent_before = False
+                indent_after = True
+                string_literal_tag = None
+            # Now we know whether to add whitespace before and/or
+            # after this element.
+            if indent_level is not None:
+                if (indent_before or indent_after):
+                    if isinstance(element, NavigableString):
+                        piece = piece.strip()
+                    if piece:
+                        piece = self._indent_string(
+                            piece, indent_level, formatter,
+                            indent_before, indent_after
+                        )
+                # Whatever follows a just-opened tag is one level
+                # deeper; empty-element events leave the level alone.
+                if event == Tag.START_ELEMENT_EVENT:
+                    indent_level += 1
+            pieces.append(piece)
+        return "".join(pieces)
+    # Names for the different events yielded by _event_stream
+    START_ELEMENT_EVENT = object()
+    END_ELEMENT_EVENT = object()
+    EMPTY_ELEMENT_EVENT = object()
+    STRING_ELEMENT_EVENT = object()
+    def _event_stream(self, iterator=None):
+        """Yield a sequence of events that can be used to reconstruct the DOM
+        for this element.
+        This lets us recreate the nested structure of this element
+        (e.g. when formatting it as a string) without using recursive
+        method calls.
+        This is similar in concept to the SAX API, but it's a simpler
+        interface designed for internal use. The events are different
+        from SAX and the arguments associated with the events are Tags
+        and other Beautiful Soup objects.
+        :param iterator: An alternate iterator to use when traversing
+         the tree. Defaults to self.self_and_descendants.
+        :yield: (event, element) tuples, where event is one of the
+         Tag.*_EVENT constants.
+        """
+        tag_stack = []
-        :param eventual_encoding: The tag is destined to be
+        iterator = iterator or self.self_and_descendants
-           encoded into this encoding. This method is _not_
-           responsible for performing that encoding. This information
+        for c in iterator:
-           is passed in so that it can be substituted in if the
+            # If the parent of the element we're about to yield is not
-           document contains a <META> tag that mentions the document's
+            # the tag currently on the stack, it means that the tag on
-           encoding.
+            # the stack closed before this element appeared.
+            # NOTE(review): `!=` goes through Tag.__ne__/__eq__, which
+            # compare by value rather than identity; `is not` looks
+            # like the intent here -- confirm.
+            while tag_stack and c.parent != tag_stack[-1]:
+                now_closed_tag = tag_stack.pop()
+                yield Tag.END_ELEMENT_EVENT, now_closed_tag
+            if isinstance(c, Tag):
+                if c.is_empty_element:
+                    yield Tag.EMPTY_ELEMENT_EVENT, c
+                else:
+                    yield Tag.START_ELEMENT_EVENT, c
+                    tag_stack.append(c)
+                    continue
+            else:
+                yield Tag.STRING_ELEMENT_EVENT, c
+        # The iterator is exhausted: close every tag that is still open.
+        while tag_stack:
+            now_closed_tag = tag_stack.pop()
+            yield Tag.END_ELEMENT_EVENT, now_closed_tag
+    def _indent_string(self, s, indent_level, formatter,
+                       indent_before, indent_after):
+        """Add indentation whitespace before and/or after a string.
+        :param s: The string to amend with whitespace.
+        :param indent_level: The indentation level; affects how much
+           whitespace goes before the string. At level 0 (or None)
+           nothing is prepended.
+        :param formatter: The object whose `indent` string is repeated
+           `indent_level` times to build the leading whitespace.
+        :param indent_before: Whether or not to add whitespace
+           before the string.
+        :param indent_after: Whether or not to add whitespace
+           (a newline) after the string.
+        :return: `s` with the requested whitespace added.
         """
+        space_before = ''
+        if indent_before and indent_level:
+            space_before = (formatter.indent * indent_level)
-        # First off, turn a string formatter into a function. This
+        space_after = ''
-        # will stop the lookup from happening over and over again.
+        if indent_after:
-        if not isinstance(formatter, collections.abc.Callable):
+            space_after = "\n"
-            formatter = self._formatter_for_name(formatter)
-        attrs = []
+        return space_before + s + space_after
-        if self.attrs:
-            for key, val in sorted(self.attrs.items()):
+    def _format_tag(self, eventual_encoding, formatter, opening):
+        if self.hidden:
+            # A hidden tag is invisible, although its contents
+            # are visible.
+            return ''
+        # A tag starts with the < character (see below).
+        # Then the / character, if this is a closing tag.
+        closing_slash = ''
+        if not opening:
+            closing_slash = '/'
+        # Then an optional namespace prefix.
+        prefix = ''
+        if self.prefix:
+            prefix = self.prefix + ":"
+        # Then a list of attribute values, if this is an opening tag.
+        attribute_string = ''
+        if opening:
+            attributes = formatter.attributes(self)
+            attrs = []
+            for key, val in attributes:
                if val is None:
                    decoded = key
                else:
@@ -1090,71 +1876,52 @@ class Tag(PageElement):
                    elif not isinstance(val, str):
                        val = str(val)
                    elif (
-                        isinstance(val, AttributeValueWithCharsetSubstitution)
+                            isinstance(val, AttributeValueWithCharsetSubstitution)
-                        and eventual_encoding is not None):
+                            and eventual_encoding is not None
+                    ):
                        val = val.encode(eventual_encoding)
-                    text = self.format_string(val, formatter)
+                    text = formatter.attribute_value(val)
                    decoded = (
                        str(key) + '='
-                        + EntitySubstitution.quoted_attribute_value(text))
+                        + formatter.quoted_attribute_value(text))
                attrs.append(decoded)
-        close = ''
+            if attrs:
-        closeTag = ''
+                attribute_string = ' ' + ' '.join(attrs)
-        prefix = ''
-        if self.prefix:
-            prefix = self.prefix + ":"
+        # Then an optional closing slash (for a void element in an
+        # XML document).
+        void_element_closing_slash = ''
        if self.is_empty_element:
-            close = '/'
+            void_element_closing_slash = formatter.void_element_close_prefix or ''
-        else:
-            closeTag = '</%s%s>' % (prefix, self.name)
-        pretty_print = self._should_pretty_print(indent_level)
-        space = ''
-        indent_space = ''
-        if indent_level is not None:
-            indent_space = (' ' * (indent_level - 1))
-        if pretty_print:
-            space = indent_space
-            indent_contents = indent_level + 1
-        else:
-            indent_contents = None
-        contents = self.decode_contents(
-            indent_contents, eventual_encoding, formatter)
-        if self.hidden:
+        # Put it all together.
-            # This is the 'document root' object.
+        return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
-            s = contents
-        else:
+    def _should_pretty_print(self, indent_level=1):
-            s = []
+        """Should this tag be pretty-printed?
-            attribute_string = ''
-            if attrs:
+        Most of them should, but some (such as <pre> in HTML
-                attribute_string = ' ' + ' '.join(attrs)
+        documents) should not.
-            if indent_level is not None:
+        :param indent_level: Passing None always yields False; any
+            other value leaves the decision to the tag name.
+        :return: False if indent_level is None or this tag's name is
+            in self.preserve_whitespace_tags; True otherwise.
+        """
-                # Even if this particular tag is not pretty-printed,
+        return (
-                # we should indent up to the start of the tag.
+            indent_level is not None
-                s.append(indent_space)
+            and (
-            s.append('<%s%s%s%s>' % (
+                not self.preserve_whitespace_tags
-                    prefix, self.name, attribute_string, close))
+                or self.name not in self.preserve_whitespace_tags
-            if pretty_print:
+            )
-                s.append("\n")
+        )
-            s.append(contents)
-            if pretty_print and contents and contents[-1] != "\n":
-                s.append("\n")
-            if pretty_print and closeTag:
-                s.append(space)
-            s.append(closeTag)
-            if indent_level is not None and closeTag and self.next_sibling:
-                # Even if this particular tag is not pretty-printed,
-                # we're now done with the tag, and we should add a
-                # newline if appropriate.
-                s.append("\n")
-            s = ''.join(s)
-        return s
    def prettify(self, encoding=None, formatter="minimal"):
+        """Pretty-print this PageElement as a string.
+        :param encoding: The eventual encoding of the string. If this is None,
+            a Unicode string will be returned.
+        :param formatter: A Formatter object, or a string naming one of
+            the standard formatters.
+        :return: A Unicode string (if encoding==None) or a bytestring
+            (otherwise).
+        """
        if encoding is None:
            return self.decode(True, formatter=formatter)
        else:
@@ -1166,62 +1933,50 @@ class Tag(PageElement):
        """Renders the contents of this tag as a Unicode string.
        :param indent_level: Each line of the rendering will be
-           indented this many spaces.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
        :param eventual_encoding: The tag is destined to be
-           encoded into this encoding. This method is _not_
+           encoded into this encoding. decode_contents() is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.
-        :param formatter: The output formatter responsible for converting
+        :param formatter: A Formatter object, or a string naming one of
-           entities to Unicode characters.
+            the standard Formatters.
-        """
-        # First off, turn a string formatter into a function. This
+        """
-        # will stop the lookup from happening over and over again.
+        return self.decode(indent_level, eventual_encoding, formatter,
-        if not isinstance(formatter, collections.abc.Callable):
+                           iterator=self.descendants)
-            formatter = self._formatter_for_name(formatter)
-        pretty_print = (indent_level is not None)
-        s = []
-        for c in self:
-            text = None
-            if isinstance(c, NavigableString):
-                text = c.output_ready(formatter)
-            elif isinstance(c, Tag):
-                s.append(c.decode(indent_level, eventual_encoding,
-                                  formatter))
-            if text and indent_level and not self.name == 'pre':
-                text = text.strip()
-            if text:
-                if pretty_print and not self.name == 'pre':
-                    s.append(" " * (indent_level - 1))
-                s.append(text)
-                if pretty_print and not self.name == 'pre':
-                    s.append("\n")
-        return ''.join(s)
     def encode_contents(
         self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
         formatter="minimal"):
-        """Renders the contents of this tag as a bytestring.
+        """Renders the contents of this PageElement as a bytestring.
         :param indent_level: Each line of the rendering will be
-           indented this many spaces.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
-        :param eventual_encoding: The bytestring will be in this encoding.
+        :param encoding: The bytestring will be in this encoding.
-        :param formatter: The output formatter responsible for converting
+        :param formatter: A Formatter object, or a string naming one of
-           entities to Unicode characters.
+            the standard Formatters.
-        """
+        :return: A bytestring.
+        """
         contents = self.decode_contents(indent_level, encoding, formatter)
         return contents.encode(encoding)
    # Old method for BS3 compatibility
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
+        """Deprecated method for BS3 compatibility."""
        if not prettyPrint:
            indentLevel = None
        return self.encode_contents(
@@ -1229,44 +1984,88 @@ class Tag(PageElement):
    #Soup methods
-    def find(self, name=None, attrs={}, recursive=True, text=None,
+    def find(self, name=None, attrs={}, recursive=True, string=None,
              **kwargs):
-        """Return only the first child of this Tag matching the given
+        """Look in the children of this PageElement and find the first
-        criteria."""
+        PageElement that matches the given criteria.
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param recursive: If this is True, find() will perform a
+            recursive search of this PageElement's children. Otherwise,
+            only the direct children will be considered.
+        :param string: A filter on string content; forwarded to
+            find_all() (this argument was previously named `text`).
+        :kwargs: A dictionary of filters on attribute values.
+        :return: The first matching PageElement, or None if nothing
+            matches.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString | None
+        """
         r = None
-        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
+        # The positional 1 is find_all()'s `limit`. _stacklevel is an
+        # internal keyword that find_all() pops from kwargs and passes
+        # on to _find_all().
+        l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
+                          **kwargs)
         if l:
             r = l[0]
         return r
-    findChild = find
+    findChild = find #BS2
-    def find_all(self, name=None, attrs={}, recursive=True, text=None,
+    def find_all(self, name=None, attrs={}, recursive=True, string=None,
                  limit=None, **kwargs):
-        """Extracts a list of Tag objects that match the given
+        """Look in the children of this PageElement and find all
-        criteria.  You can specify the name of the Tag and any
+        PageElements that match the given criteria.
-        attributes you want the Tag to have.
+        All find_* methods take a common set of arguments. See the online
-        The value of a key-value pair in the 'attrs' map can be a
+        documentation for detailed explanations.
-        string, a list of strings, a regular expression object, or a
-        callable that takes a string and returns whether or not the
+        :param name: A filter on tag name.
-        string matches for some custom definition of 'matches'. The
+        :param attrs: A dictionary of filters on attribute values.
-        same is true of the tag name."""
+        :param recursive: If this is True, find_all() will perform a
+            recursive search of this PageElement's children. Otherwise,
+            only the direct children will be considered.
+        :param string: A filter on string content (this argument was
+            previously named `text`).
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values. The
+            internal `_stacklevel` key, if present, is removed before
+            the rest is used.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
         generator = self.descendants
         if not recursive:
             generator = self.children
-        return self._find_all(name, attrs, text, limit, generator, **kwargs)
+        # _stacklevel is bumped by one per call layer on its way to
+        # _find_all(); presumably it ends up as a warnings stacklevel
+        # -- confirm against _find_all().
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(name, attrs, string, limit, generator,
+                              _stacklevel=_stacklevel+1, **kwargs)
     findAll = find_all       # BS3
     findChildren = find_all  # BS2
    #Generator methods
    @property
    def children(self):
+        """Iterate over all direct children of this PageElement.
+        :yield: A sequence of PageElements.
+        """
        # return iter() to make the purpose of the method clear
        return iter(self.contents)  # XXX This seems to be untested.
    @property
+    def self_and_descendants(self):
+        """Iterate over this PageElement and its children in a
+        depth-first sequence (the order produced by self.descendants).
+        This element itself is skipped when it is hidden.
+        :yield: A sequence of PageElements.
+        """
+        if not self.hidden:
+            yield self
+        for i in self.descendants:
+            yield i
+    @property
    def descendants(self):
+        """Iterate over all children of this PageElement in a
+        depth-first sequence (document order, following next_element).
+        :yield: A sequence of PageElements.
+        """
        if not len(self.contents):
            return
        stopNode = self._last_descendant().next_element
@@ -1276,262 +2075,102 @@ class Tag(PageElement):
            current = current.next_element
    # CSS selector code
+    def select_one(self, selector, namespaces=None, **kwargs):
+        """Perform a CSS selection operation on the current element.
-    _selector_combinators = ['>', '+', '~']
+        :param selector: A CSS selector.
-    _select_debug = False
-    def select_one(self, selector):
-        """Perform a CSS selection operation on the current element."""
-        value = self.select(selector, limit=1)
-        if value:
-            return value[0]
-        return None
-    def select(self, selector, _candidate_generator=None, limit=None):
+        :param namespaces: A dictionary mapping namespace prefixes
-        """Perform a CSS selection operation on the current element."""
+           used in the CSS selector to namespace URIs. By default,
+           Beautiful Soup will use the prefixes it encountered while
-        # Handle grouping selectors if ',' exists, ie: p,a
+           parsing the document.
-        if ',' in selector:
-            context = []
-            for partial_selector in selector.split(','):
-                partial_selector = partial_selector.strip()
-                if partial_selector == '':
-                    raise ValueError('Invalid group selection syntax: %s' % selector)
-                candidates = self.select(partial_selector, limit=limit)
-                for candidate in candidates:
-                    if candidate not in context:
-                        context.append(candidate)
-                if limit and len(context) >= limit:
-                    break
-            return context
-        tokens = selector.split()
+        :param kwargs: Keyword arguments to be passed into Soup Sieve's
-        current_context = [self]
+           soupsieve.select() method.
-        if tokens[-1] in self._selector_combinators:
+        :return: A Tag.
-            raise ValueError(
+        :rtype: bs4.element.Tag
-                'Final combinator "%s" is missing an argument.' % tokens[-1])
+        """
+        return self.css.select_one(selector, namespaces, **kwargs)
-        if self._select_debug:
+    def select(self, selector, namespaces=None, limit=None, **kwargs):
-            print('Running CSS selector "%s"' % selector)
+        """Perform a CSS selection operation on the current element.
-        for index, token in enumerate(tokens):
+        This uses the SoupSieve library.
-            new_context = []
-            new_context_ids = set([])
-            if tokens[index-1] in self._selector_combinators:
+        :param selector: A string containing a CSS selector.
-                # This token was consumed by the previous combinator. Skip it.
-                if self._select_debug:
-                    print('  Token was consumed by the previous combinator.')
-                continue
-            if self._select_debug:
+        :param namespaces: A dictionary mapping namespace prefixes
-                print(' Considering token "%s"' % token)
+           used in the CSS selector to namespace URIs. By default,
-            recursive_candidate_generator = None
+           Beautiful Soup will use the prefixes it encountered while
-            tag_name = None
+           parsing the document.
-            # Each operation corresponds to a checker function, a rule
+        :param limit: After finding this number of results, stop looking.
-            # for determining whether a candidate matches the
-            # selector. Candidates are generated by the active
+        :param kwargs: Keyword arguments to be passed into SoupSieve's
-            # iterator.
+           soupsieve.select() method.
-            checker = None
+        :return: A ResultSet of Tags.
-            m = self.attribselect_re.match(token)
+        :rtype: bs4.element.ResultSet
-            if m is not None:
+        """
-                # Attribute selector
+        return self.css.select(selector, namespaces, limit, **kwargs)
-                tag_name, attribute, operator, value = m.groups()
-                checker = self._attribute_checker(operator, attribute, value)
+    @property
+    def css(self):
-            elif '#' in token:
+        """Return an interface to the CSS selector API."""
-                # ID selector
+        return CSS(self)
-                tag_name, tag_id = token.split('#', 1)
-                def id_matches(tag):
-                    return tag.get('id', None) == tag_id
-                checker = id_matches
-            elif '.' in token:
-                # Class selector
-                tag_name, klass = token.split('.', 1)
-                classes = set(klass.split('.'))
-                def classes_match(candidate):
-                    return classes.issubset(candidate.get('class', []))
-                checker = classes_match
-            elif ':' in token:
-                # Pseudo-class
-                tag_name, pseudo = token.split(':', 1)
-                if tag_name == '':
-                    raise ValueError(
-                        "A pseudo-class must be prefixed with a tag name.")
-                pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
-                found = []
-                if pseudo_attributes is None:
-                    pseudo_type = pseudo
-                    pseudo_value = None
-                else:
-                    pseudo_type, pseudo_value = pseudo_attributes.groups()
-                if pseudo_type == 'nth-of-type':
-                    try:
-                        pseudo_value = int(pseudo_value)
-                    except:
-                        raise NotImplementedError(
-                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
-                    if pseudo_value < 1:
-                        raise ValueError(
-                            'nth-of-type pseudo-class value must be at least 1.')
-                    class Counter(object):
-                        def __init__(self, destination):
-                            self.count = 0
-                            self.destination = destination
-                        def nth_child_of_type(self, tag):
-                            self.count += 1
-                            if self.count == self.destination:
-                                return True
-                            if self.count > self.destination:
-                                # Stop the generator that's sending us
-                                # these things.
-                                raise StopIteration()
-                            return False
-                    checker = Counter(pseudo_value).nth_child_of_type
-                else:
-                    raise NotImplementedError(
-                        'Only the following pseudo-classes are implemented: nth-of-type.')
-            elif token == '*':
-                # Star selector -- matches everything
-                pass
-            elif token == '>':
-                # Run the next token as a CSS selector against the
-                # direct children of each tag in the current context.
-                recursive_candidate_generator = lambda tag: tag.children
-            elif token == '~':
-                # Run the next token as a CSS selector against the
-                # siblings of each tag in the current context.
-                recursive_candidate_generator = lambda tag: tag.next_siblings
-            elif token == '+':
-                # For each tag in the current context, run the next
-                # token as a CSS selector against the tag's next
-                # sibling that's a tag.
-                def next_tag_sibling(tag):
-                    yield tag.find_next_sibling(True)
-                recursive_candidate_generator = next_tag_sibling
-            elif self.tag_name_re.match(token):
-                # Just a tag name.
-                tag_name = token
-            else:
-                raise ValueError(
-                    'Unsupported or invalid CSS selector: "%s"' % token)
-            if recursive_candidate_generator:
-                # This happens when the selector looks like  "> foo".
-                #
-                # The generator calls select() recursively on every
-                # member of the current context, passing in a different
-                # candidate generator and a different selector.
-                #
-                # In the case of "> foo", the candidate generator is
-                # one that yields a tag's direct children (">"), and
-                # the selector is "foo".
-                next_token = tokens[index+1]
-                def recursive_select(tag):
-                    if self._select_debug:
-                        print('    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
-                        print('-' * 40)
-                    for i in tag.select(next_token, recursive_candidate_generator):
-                        if self._select_debug:
-                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
-                        yield i
-                    if self._select_debug:
-                        print('-' * 40)
-                _use_candidate_generator = recursive_select
-            elif _candidate_generator is None:
-                # By default, a tag's candidates are all of its
-                # children. If tag_name is defined, only yield tags
-                # with that name.
-                if self._select_debug:
-                    if tag_name:
-                        check = "[any]"
-                    else:
-                        check = tag_name
-                    print('   Default candidate generator, tag name="%s"' % check)
-                if self._select_debug:
-                    # This is redundant with later code, but it stops
-                    # a bunch of bogus tags from cluttering up the
-                    # debug log.
-                    def default_candidate_generator(tag):
-                        for child in tag.descendants:
-                            if not isinstance(child, Tag):
-                                continue
-                            if tag_name and not child.name == tag_name:
-                                continue
-                            yield child
-                    _use_candidate_generator = default_candidate_generator
-                else:
-                    _use_candidate_generator = lambda tag: tag.descendants
-            else:
-                _use_candidate_generator = _candidate_generator
-            count = 0
-            for tag in current_context:
-                if self._select_debug:
-                    print("    Running candidate generator on %s %s" % (
-                        tag.name, repr(tag.attrs)))
-                for candidate in _use_candidate_generator(tag):
-                    if not isinstance(candidate, Tag):
-                        continue
-                    if tag_name and candidate.name != tag_name:
-                        continue
-                    if checker is not None:
-                        try:
-                            result = checker(candidate)
-                        except StopIteration:
-                            # The checker has decided we should no longer
-                            # run the generator.
-                            break
-                    if checker is None or result:
-                        if self._select_debug:
-                            print("     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
-                        if id(candidate) not in new_context_ids:
-                            # If a tag matches a selector more than once,
-                            # don't include it in the context more than once.
-                            new_context.append(candidate)
-                            new_context_ids.add(id(candidate))
-                            if limit and len(new_context) >= limit:
-                                break
-                    elif self._select_debug:
-                        print("     FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
-            current_context = new_context
-        if self._select_debug:
-            print("Final verdict:")
-            for i in current_context:
-                print(" %s %s" % (i.name, i.attrs))
-        return current_context
    # Old names for backwards compatibility
    def childGenerator(self):
+        """Deprecated generator."""
        return self.children
    def recursiveChildGenerator(self):
+        """Deprecated generator."""
        return self.descendants
    def has_key(self, key):
-        """This was kind of misleading because has_key() (attributes)
+        """Deprecated method. This was kind of misleading because has_key()
-        was different from __in__ (contents). has_key() is gone in
+        (attributes) was different from __contains__ (contents).
-        Python 3, anyway."""
-        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
+        has_key() is gone in Python 3, anyway.
-                key))
+        """
+        warnings.warn(
+            'has_key is deprecated. Use has_attr(key) instead.',
+            DeprecationWarning, stacklevel=2
+        )
        return self.has_attr(key)
 # Next, a couple classes to represent queries and their results.
 class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
-    text)."""
+    string).
+    This is primarily used to underpin the find_* methods, but you can
+    create one yourself and pass it in as `parse_only` to the
+    `BeautifulSoup` constructor, to parse a subset of a large
+    document.
+    """
+    def __init__(self, name=None, attrs={}, string=None, **kwargs):
+        """Constructor.
+        The SoupStrainer constructor takes the same arguments passed
+        into the find_* methods. See the online documentation for
+        detailed explanations.
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param string: A filter for a NavigableString with specific text.
+        :param kwargs: A dictionary of filters on attribute values.
+        """
+        if string is None and 'text' in kwargs:
+            string = kwargs.pop('text')
+            warnings.warn(
+                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
+                DeprecationWarning, stacklevel=2
+            )
-    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
@@ -1556,12 +2195,15 @@ class SoupStrainer(object):
            normalized_attrs[key] = self._normalize_search_value(value)
        self.attrs = normalized_attrs
-        self.text = self._normalize_search_value(text)
+        self.string = self._normalize_search_value(string)
+        # DEPRECATED but just in case someone is checking this.
+        self.text = self.string
    def _normalize_search_value(self, value):
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
-        if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match')
+        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value
@@ -1589,19 +2231,40 @@ class SoupStrainer(object):
        return str(str(value))
    def __str__(self):
-        if self.text:
+        """A human-readable representation of this SoupStrainer."""
-            return self.text
+        if self.string:
+            return self.string
        else:
            return "%s|%s" % (self.name, self.attrs)
    def search_tag(self, markup_name=None, markup_attrs={}):
+        """Check whether a Tag with the given name and attributes would
+        match this SoupStrainer.
+        Used prospectively to decide whether to even bother creating a Tag
+        object.
+        :param markup_name: A tag name as found in some markup.
+        :param markup_attrs: A dictionary of attributes as found in some markup.
+        :return: The matching Tag or tag name (a truthy value) if the
+            prospective tag matches this SoupStrainer; None or False otherwise.
+        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
+        if isinstance(self.name, str):
+            # Optimization for a very common case where the user is
+            # searching for a tag with one specific name, and we're
+            # looking at a tag with a different name.
+            if markup and not markup.prefix and self.name != markup.name:
+                 return False
        call_function_with_tag_data = (
-            isinstance(self.name, collections.abc.Callable)
+            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))
        if ((not self.name)
@@ -1630,13 +2293,22 @@ class SoupStrainer(object):
                    found = markup
                else:
                    found = markup_name
-        if found and self.text and not self._matches(found.string, self.text):
+        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found
+    # For BS3 compatibility.
    searchTag = search_tag
    def search(self, markup):
-        # print 'looking for %s in %s' % (self, markup)
+        """Find all items in `markup` that match this SoupStrainer.
+        Used by the core _find_all() method, which is ultimately
+        called by all find_* methods.
+        :param markup: A PageElement or a list of them.
+        """
+        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
@@ -1649,49 +2321,44 @@ class SoupStrainer(object):
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
-            if not self.text or self.name or self.attrs:
+            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
-            if not self.name and not self.attrs and self._matches(markup, self.text):
+            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found
-    def _matches(self, markup, match_against):
+    def _matches(self, markup, match_against, already_tried=None):
-        # print u"Matching %s against %s" % (markup, match_against)
+        # print(u"Matching %s against %s" % (markup, match_against))
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
-            if (isinstance(match_against, str)
+            for item in markup:
-                and ' ' in match_against):
+                if self._matches(item, match_against):
-                # A bit of a special case. If they try to match "foo
+                    return True
-                # bar" on a multivalue attribute's value, only accept
+            # We didn't match any particular value of the multivalue
-                # the literal value "foo bar"
+            # attribute, but maybe we match the attribute value when
-                #
+            # considered as a string.
-                # XXX This is going to be pretty slow because we keep
+            if self._matches(' '.join(markup), match_against):
-                # splitting match_against. But it shouldn't come up
+                return True
-                # too often.
+            return False
-                return (whitespace_re.split(match_against) == markup)
-            else:
-                for item in markup:
-                    if self._matches(item, match_against):
-                        return True
-                return False
        if match_against is True:
            # True matches any non-None value.
            return markup is not None
-        if isinstance(match_against, collections.abc.Callable):
+        if isinstance(match_against, Callable):
            return match_against(markup)
        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
+        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name
@@ -1702,23 +2369,67 @@ class SoupStrainer(object):
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against
-        if isinstance(match_against, str):
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, str)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+        if not match and isinstance(match_against, str):
            # Exact string match
-            return markup == match_against
+            match = markup == match_against
-        if hasattr(match_against, 'match'):
+        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)
-        if hasattr(match_against, '__iter__'):
+        if (not match
-            # The markup must be an exact match against something
+            and isinstance(original_markup, Tag)
-            # in the iterable.
+            and original_markup.prefix):
-            return markup in match_against
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+        return match
 class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source, result=()):
+        """Constructor.
+        :param source: A SoupStrainer.
+        :param result: A list of PageElements.
+        """
        super(ResultSet, self).__init__(result)
        self.source = source
+    def __getattr__(self, key):
+        """Raise a helpful exception to explain a common code fix."""
+        raise AttributeError(
+            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
+        )