summaryrefslogtreecommitdiffstats
path: root/bitbake/lib/bs4/formatter.py
blob: 9fa1b57cb6dc2d4484c5710245844dc4c9c282ec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from bs4.dammit import EntitySubstitution

class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `PageElement.encode`. Most people won't need to think about
    formatters, and most people who need to think about them can pass
    in one of these predefined strings as `formatter` rather than
    making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    # Values accepted for the `language` constructor argument.
    HTML = 'html'
    XML = 'xml'

    # Per-keyword defaults used by _default() when the language is not XML.
    HTML_DEFAULTS = dict(
        cdata_containing_tags={"script", "style"},
    )

    def _default(self, language, value, kwarg):
        """Resolve the effective value of a constructor keyword argument.

        :param language: The markup language the formatter is built for.
        :param value: The value the caller supplied, or None if omitted.
        :param kwarg: The keyword's name, used as a key into HTML_DEFAULTS.
        :return: `value` if it is not None; an empty set when `language`
           is Formatter.XML; otherwise the HTML_DEFAULTS entry for `kwarg`.
        """
        if value is not None:
            return value
        if language == self.XML:
            return set()
        # NOTE: this hands back the class-level object itself, not a copy,
        # so every formatter relying on the default shares one set.
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
            empty_attributes_are_booleans=False, indent=1,
    ):
        r"""Constructor.

        :param language: This should be Formatter.XML if you are formatting
           XML markup and Formatter.HTML if you are formatting HTML markup.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The list of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: Render attributes whose value
            is the empty string as HTML-style boolean attributes.
            (Attributes whose value is None are always rendered this way.)

        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
            is used to indent each level. The default behavior is to
            indent one space per level. None is treated like 0, and a
            value that is neither an integer nor a string falls back
            to a single space.
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # Normalize `indent` to the literal string emitted per nesting level.
        if indent is None:
            indent = 0
        if isinstance(indent, int):
            # Negative counts are clamped to zero, i.e. no indentation.
            indent = ' ' * max(indent, 0)
        elif not isinstance(indent, str):
            # Unsupported type: fall back to one space per level.
            indent = ' '
        self.indent = indent

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        if not self.entity_substitution:
            return ns
        # Imported at call time rather than at module level.
        from .element import NavigableString
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags):
            # Text inside a CDATA-containing tag is passed through as-is.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then attributes whose
        values are set to the empty string will be treated as boolean
        attributes.

        :param tag: An object whose `attrs` attribute is either None or
           a mapping of attribute names to values.
        :return: A sorted list of (name, value) 2-tuples; the list is
           empty when `tag.attrs` is None.
        """
        if tag.attrs is None:
            return []
        return sorted(
            (k, (None if self.empty_attributes_are_booleans and v == '' else v))
            for k, v in tag.attrs.items()
        )
   
class HTMLFormatter(Formatter):
    """A generic Formatter for HTML."""

    # Maps a formatter name (or None) to a preconfigured formatter
    # instance; filled in at the bottom of this module.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Create a Formatter whose language is fixed to Formatter.HTML.

        Every positional and keyword argument is forwarded to
        Formatter.__init__ after the language argument.
        """
        super().__init__(self.HTML, *args, **kwargs)

    
class XMLFormatter(Formatter):
    """A generic Formatter for XML."""

    # Maps a formatter name (or None) to a preconfigured formatter
    # instance; filled in at the bottom of this module.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Create a Formatter whose language is fixed to Formatter.XML.

        Every positional and keyword argument is forwarded to
        Formatter.__init__ after the language argument.
        """
        super().__init__(self.XML, *args, **kwargs)


# Set up aliases for the default formatters.
#
# Each registry maps the name a caller may pass as `formatter` (or None)
# to a ready-made formatter instance for that markup language.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None,
    empty_attributes_are_booleans=True,
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# Built as an XMLFormatter like its siblings. Wrapping one Formatter in
# another would pass a Formatter instance as the `language` argument, so
# the registered object would not be recognised as an XML formatter.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)