Diffstat (limited to 'bitbake')
-rwxr-xr-x  bitbake/bin/bitbake | 2
-rwxr-xr-x  bitbake/bin/bitbake-hashclient | 61
-rwxr-xr-x  bitbake/bin/bitbake-hashserv | 10
-rwxr-xr-x  bitbake/bin/bitbake-layers | 14
-rwxr-xr-x  bitbake/bin/bitbake-prserv | 106
-rwxr-xr-x  bitbake/bin/bitbake-selftest | 2
-rw-r--r--  bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables-context.rst | 91
-rw-r--r--  bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables.rst | 9
-rw-r--r--  bitbake/doc/index.rst | 1
-rw-r--r--  bitbake/lib/bb/__init__.py | 34
-rw-r--r--  bitbake/lib/bb/asyncrpc/__init__.py | 2
-rw-r--r--  bitbake/lib/bb/asyncrpc/client.py | 126
-rw-r--r--  bitbake/lib/bb/asyncrpc/serv.py | 37
-rw-r--r--  bitbake/lib/bb/build.py | 2
-rw-r--r--  bitbake/lib/bb/codeparser.py | 31
-rw-r--r--  bitbake/lib/bb/cooker.py | 33
-rw-r--r--  bitbake/lib/bb/fetch2/crate.py | 9
-rw-r--r--  bitbake/lib/bb/fetch2/gcp.py | 1
-rw-r--r--  bitbake/lib/bb/fetch2/npmsw.py | 2
-rw-r--r--  bitbake/lib/bb/fetch2/svn.py | 3
-rw-r--r--  bitbake/lib/bb/fetch2/wget.py | 23
-rw-r--r--  bitbake/lib/bb/parse/__init__.py | 12
-rw-r--r--  bitbake/lib/bb/parse/parse_py/BBHandler.py | 8
-rw-r--r--  bitbake/lib/bb/runqueue.py | 99
-rw-r--r--  bitbake/lib/bb/siggen.py | 114
-rw-r--r--  bitbake/lib/bb/tests/codeparser.py | 40
-rw-r--r--  bitbake/lib/bb/tests/fetch.py | 32
-rw-r--r--  bitbake/lib/bb/ui/buildinfohelper.py | 5
-rw-r--r--  bitbake/lib/bblayers/action.py | 4
-rw-r--r--  bitbake/lib/bs4/AUTHORS | 49
-rw-r--r--  bitbake/lib/bs4/AUTHORS.txt | 43
-rw-r--r--  bitbake/lib/bs4/CHANGELOG (renamed from bitbake/lib/bs4/NEWS.txt) | 779
-rw-r--r--  bitbake/lib/bs4/LICENSE (renamed from bitbake/lib/bs4/COPYING.txt) | 11
-rw-r--r--  bitbake/lib/bs4/__init__.py | 680
-rw-r--r--  bitbake/lib/bs4/builder/__init__.py | 382
-rw-r--r--  bitbake/lib/bs4/builder/_html5lib.py | 251
-rw-r--r--  bitbake/lib/bs4/builder/_htmlparser.py | 433
-rw-r--r--  bitbake/lib/bs4/builder/_lxml.py | 212
-rw-r--r--  bitbake/lib/bs4/css.py | 274
-rw-r--r--  bitbake/lib/bs4/dammit.py | 411
-rw-r--r--  bitbake/lib/bs4/diagnose.py | 83
-rw-r--r--  bitbake/lib/bs4/element.py | 2219
-rw-r--r--  bitbake/lib/bs4/formatter.py | 185
-rw-r--r--  bitbake/lib/bs4/testing.py | 686
-rw-r--r--  bitbake/lib/bs4/tests/__init__.py | 1
-rw-r--r--  bitbake/lib/bs4/tests/test_builder_registry.py | 147
-rw-r--r--  bitbake/lib/bs4/tests/test_docs.py | 32
-rw-r--r--  bitbake/lib/bs4/tests/test_html5lib.py | 98
-rw-r--r--  bitbake/lib/bs4/tests/test_htmlparser.py | 31
-rw-r--r--  bitbake/lib/bs4/tests/test_lxml.py | 70
-rw-r--r--  bitbake/lib/bs4/tests/test_soup.py | 479
-rw-r--r--  bitbake/lib/bs4/tests/test_tree.py | 2004
-rw-r--r--  bitbake/lib/hashserv/__init__.py | 31
-rw-r--r--  bitbake/lib/hashserv/client.py | 227
-rw-r--r--  bitbake/lib/hashserv/tests.py | 78
-rw-r--r--  bitbake/lib/prserv/__init__.py | 97
-rw-r--r--  bitbake/lib/prserv/client.py | 46
-rw-r--r--  bitbake/lib/prserv/db.py | 430
-rw-r--r--  bitbake/lib/prserv/serv.py | 252
-rw-r--r--  bitbake/lib/prserv/tests.py | 386
60 files changed, 6356 insertions(+), 5664 deletions(-)
diff --git a/bitbake/bin/bitbake b/bitbake/bin/bitbake
index f2d168c522..8622a7bf94 100755
--- a/bitbake/bin/bitbake
+++ b/bitbake/bin/bitbake
@@ -27,7 +27,7 @@ from bb.main import bitbake_main, BitBakeConfigParameters, BBMainException
 
 bb.utils.check_system_locale()
 
-__version__ = "2.7.3"
+__version__ = "2.9.1"
 
 if __name__ == "__main__":
     if __version__ != bb.__version__:
diff --git a/bitbake/bin/bitbake-hashclient b/bitbake/bin/bitbake-hashclient
index 610787ed2b..5d6f67046b 100755
--- a/bitbake/bin/bitbake-hashclient
+++ b/bitbake/bin/bitbake-hashclient
@@ -16,6 +16,7 @@ import time
 import warnings
 import netrc
 import json
+import statistics
 warnings.simplefilter("default")
 
 try:
@@ -81,6 +82,7 @@ def main():
             nonlocal found_hashes
             nonlocal missed_hashes
             nonlocal max_time
+            nonlocal times
 
             with hashserv.create_client(args.address) as client:
                 for i in range(args.requests):
@@ -98,29 +100,41 @@ def main():
                     else:
                         missed_hashes += 1
 
-                    max_time = max(elapsed, max_time)
+                    times.append(elapsed)
                     pbar.update()
 
         max_time = 0
         found_hashes = 0
         missed_hashes = 0
         lock = threading.Lock()
-        total_requests = args.clients * args.requests
+        times = []
         start_time = time.perf_counter()
-        with ProgressBar(total=total_requests) as pbar:
+        with ProgressBar(total=args.clients * args.requests) as pbar:
             threads = [threading.Thread(target=thread_main, args=(pbar, lock), daemon=False) for _ in range(args.clients)]
             for t in threads:
                 t.start()
 
             for t in threads:
                 t.join()
+            total_elapsed = time.perf_counter() - start_time
 
-        elapsed = time.perf_counter() - start_time
         with lock:
-            print("%d requests in %.1fs. %.1f requests per second" % (total_requests, elapsed, total_requests / elapsed))
-            print("Average request time %.8fs" % (elapsed / total_requests))
-            print("Max request time was %.8fs" % max_time)
-            print("Found %d hashes, missed %d" % (found_hashes, missed_hashes))
+            mean = statistics.mean(times)
+            median = statistics.median(times)
+            stddev = statistics.pstdev(times)
+
+            print(f"Number of clients: {args.clients}")
+            print(f"Requests per client: {args.requests}")
+            print(f"Number of requests: {len(times)}")
+            print(f"Total elapsed time: {total_elapsed:.3f}s")
+            print(f"Total request rate: {len(times)/total_elapsed:.3f} req/s")
+            print(f"Average request time: {mean:.3f}s")
+            print(f"Median request time: {median:.3f}s")
+            print(f"Request time std dev: {stddev:.3f}s")
+            print(f"Maximum request time: {max(times):.3f}s")
+            print(f"Minimum request time: {min(times):.3f}s")
+            print(f"Hashes found: {found_hashes}")
+            print(f"Hashes missed: {missed_hashes}")
 
         if args.report:
             with ProgressBar(total=args.requests) as pbar:
@@ -225,6 +239,32 @@ def main():
         print("true" if result else "false")
         return 0
 
+    def handle_ping(args, client):
+        times = []
+        for i in range(1, args.count + 1):
+            if not args.quiet:
+                print(f"Ping {i} of {args.count}... ", end="")
+            start_time = time.perf_counter()
+            client.ping()
+            elapsed = time.perf_counter() - start_time
+            times.append(elapsed)
+            if not args.quiet:
+                print(f"{elapsed:.3f}s")
+
+        mean = statistics.mean(times)
+        median = statistics.median(times)
+        std_dev = statistics.pstdev(times)
+
+        if not args.quiet:
+            print("------------------------")
+        print(f"Number of pings: {len(times)}")
+        print(f"Average round trip time: {mean:.3f}s")
+        print(f"Median round trip time: {median:.3f}s")
+        print(f"Round trip time std dev: {std_dev:.3f}s")
+        print(f"Min time is: {min(times):.3f}s")
+        print(f"Max time is: {max(times):.3f}s")
+        return 0
+
     parser = argparse.ArgumentParser(description='Hash Equivalence Client')
     parser.add_argument('--address', default=DEFAULT_ADDRESS, help='Server address (default "%(default)s")')
     parser.add_argument('--log', default='WARNING', help='Set logging level')
@@ -322,6 +362,11 @@ def main():
     unihash_exists_parser.add_argument("unihash", help="Unihash to check")
     unihash_exists_parser.set_defaults(func=handle_unihash_exists)
 
+    ping_parser = subparsers.add_parser('ping', help="Ping server")
+    ping_parser.add_argument("-n", "--count", type=int, help="Number of pings. Default is %(default)s", default=10)
+    ping_parser.add_argument("-q", "--quiet", action="store_true", help="Don't print each ping; only print results")
+    ping_parser.set_defaults(func=handle_ping)
+
     args = parser.parse_args()
 
     logger = logging.getLogger('hashserv')
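Editor's note: the `ping` subcommand above is, at heart, a timing loop over `client.ping()` summarized with the stdlib `statistics` module. A minimal standalone sketch of the same measurement pattern; the callable passed in is a stand-in for a real hash equivalence client's `ping()`:

```python
import statistics
import time

def time_round_trips(ping, count=10):
    # Time `count` calls of a zero-argument callable and summarize them,
    # mirroring the statistics the new `ping` subcommand prints.
    times = []
    for _ in range(count):
        start = time.perf_counter()
        ping()  # stand-in for client.ping() on a hash equivalence client
        times.append(time.perf_counter() - start)
    return {
        "mean": statistics.mean(times),
        "median": statistics.median(times),
        "stddev": statistics.pstdev(times),  # population std dev, as above
        "min": min(times),
        "max": max(times),
    }

# Example with a no-op stand-in for a real client:
print(time_round_trips(lambda: None, count=5))
```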
diff --git a/bitbake/bin/bitbake-hashserv b/bitbake/bin/bitbake-hashserv
index 4bfb7abfbc..01503736b9 100755
--- a/bitbake/bin/bitbake-hashserv
+++ b/bitbake/bin/bitbake-hashserv
@@ -125,6 +125,11 @@ The following permissions are supported by the server:
         default=os.environ.get("HASHSERVER_ADMIN_PASSWORD", None),
         help="Create default admin user with password ADMIN_PASSWORD ($HASHSERVER_ADMIN_PASSWORD)",
     )
+    parser.add_argument(
+        "--reuseport",
+        action="store_true",
+        help="Enable SO_REUSEPORT, allowing multiple servers to bind to the same port for load balancing",
+    )
 
     args = parser.parse_args()
 
@@ -132,7 +137,9 @@ The following permissions are supported by the server:
 
     level = getattr(logging, args.log.upper(), None)
     if not isinstance(level, int):
-        raise ValueError("Invalid log level: %s (Try ERROR/WARNING/INFO/DEBUG)" % args.log)
+        raise ValueError(
+            "Invalid log level: %s (Try ERROR/WARNING/INFO/DEBUG)" % args.log
+        )
 
     logger.setLevel(level)
     console = logging.StreamHandler()
@@ -155,6 +162,7 @@ The following permissions are supported by the server:
         anon_perms=anon_perms,
         admin_username=args.admin_user,
         admin_password=args.admin_password,
+        reuseport=args.reuseport,
     )
     server.serve_forever()
     return 0
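Editor's note: the `--reuseport` flag above is eventually passed to `asyncio.start_server()` (see the asyncrpc/serv.py changes later in this patch). A minimal sketch of what `reuse_port` buys, with a trivial echo handler standing in for the hash server protocol; it runs until interrupted, and raises `ValueError` on platforms without SO_REUSEPORT:

```python
import asyncio

async def handle(reader, writer):
    # Trivial echo handler standing in for the real server protocol.
    writer.write(await reader.read(100))
    await writer.drain()
    writer.close()

async def main():
    # reuse_port=True sets SO_REUSEPORT, so several independent server
    # processes can bind the same host:port and the kernel load-balances
    # incoming connections among them.
    server = await asyncio.start_server(handle, "127.0.0.1", 8686, reuse_port=True)
    async with server:
        await server.serve_forever()

asyncio.run(main())
```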
diff --git a/bitbake/bin/bitbake-layers b/bitbake/bin/bitbake-layers
index d4b1d1aaf2..aebb5100c2 100755
--- a/bitbake/bin/bitbake-layers
+++ b/bitbake/bin/bitbake-layers
@@ -33,7 +33,7 @@ def main():
                                     add_help=False)
     parser.add_argument('-d', '--debug', help='Enable debug output', action='store_true')
     parser.add_argument('-q', '--quiet', help='Print only errors', action='store_true')
-    parser.add_argument('-F', '--force', help='Force add without recipe parse verification', action='store_true')
+    parser.add_argument('-F', '--force', help='Forced execution: can be specified multiple times. -F will force add without recipe parse verification and -FF will additionally force the run without layer parsing.', action='count', default=0)
     parser.add_argument('--color', choices=['auto', 'always', 'never'], default='auto', help='Colorize output (where %(metavar)s is %(choices)s)', metavar='COLOR')
 
     global_args, unparsed_args = parser.parse_known_args()
@@ -59,16 +59,20 @@ def main():
     plugins = []
     tinfoil = bb.tinfoil.Tinfoil(tracking=True)
     tinfoil.logger.setLevel(logger.getEffectiveLevel())
-    try:
+    if global_args.force > 1:
+        bbpaths = []
+    else:
         tinfoil.prepare(True)
-        for path in ([topdir] +
-                tinfoil.config_data.getVar('BBPATH').split(':')):
+        bbpaths = tinfoil.config_data.getVar('BBPATH').split(':')
+
+    try:
+        for path in ([topdir] + bbpaths):
             pluginpath = os.path.join(path, 'lib', 'bblayers')
             bb.utils.load_plugins(logger, plugins, pluginpath)
 
         registered = False
         for plugin in plugins:
-            if hasattr(plugin, 'tinfoil_init'):
+            if hasattr(plugin, 'tinfoil_init') and global_args.force <= 1:
                 plugin.tinfoil_init(tinfoil)
             if hasattr(plugin, 'register_commands'):
                 registered = True
diff --git a/bitbake/bin/bitbake-prserv b/bitbake/bin/bitbake-prserv
index 5be42f3ce5..580e021fda 100755
--- a/bitbake/bin/bitbake-prserv
+++ b/bitbake/bin/bitbake-prserv
@@ -7,49 +7,97 @@
 
 import os
 import sys,logging
-import optparse
+import argparse
 import warnings
 warnings.simplefilter("default")
 
-sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)),'lib'))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), "lib"))
 
 import prserv
 import prserv.serv
 
-__version__="1.0.0"
+VERSION = "2.0.0"
 
-PRHOST_DEFAULT='0.0.0.0'
+PRHOST_DEFAULT="0.0.0.0"
 PRPORT_DEFAULT=8585
 
+def init_logger(logfile, loglevel):
+    numeric_level = getattr(logging, loglevel.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError("Invalid log level: %s" % loglevel)
+    FORMAT = "%(asctime)-15s %(message)s"
+    logging.basicConfig(level=numeric_level, filename=logfile, format=FORMAT)
+
 def main():
-    parser = optparse.OptionParser(
-        version="Bitbake PR Service Core version %s, %%prog version %s" % (prserv.__version__, __version__),
-        usage = "%prog < --start | --stop > [options]")
-
-    parser.add_option("-f", "--file", help="database filename(default: prserv.sqlite3)", action="store",
-                      dest="dbfile", type="string", default="prserv.sqlite3")
-    parser.add_option("-l", "--log", help="log filename(default: prserv.log)", action="store",
-                      dest="logfile", type="string", default="prserv.log")
-    parser.add_option("--loglevel", help="logging level, i.e. CRITICAL, ERROR, WARNING, INFO, DEBUG",
-                      action = "store", type="string", dest="loglevel", default = "INFO")
-    parser.add_option("--start", help="start daemon",
-                      action="store_true", dest="start")
-    parser.add_option("--stop", help="stop daemon",
-                      action="store_true", dest="stop")
-    parser.add_option("--host", help="ip address to bind", action="store",
-                      dest="host", type="string", default=PRHOST_DEFAULT)
-    parser.add_option("--port", help="port number(default: 8585)", action="store",
-                      dest="port", type="int", default=PRPORT_DEFAULT)
-    parser.add_option("-r", "--read-only", help="open database in read-only mode",
-                      action="store_true")
+    parser = argparse.ArgumentParser(
+        description="BitBake PR Server. Version=%s" % VERSION,
+        formatter_class=argparse.RawTextHelpFormatter)
 
+    parser.add_argument(
+        "-f",
+        "--file",
+        default="prserv.sqlite3",
+        help="database filename (default: prserv.sqlite3)",
+    )
+    parser.add_argument(
+        "-l",
+        "--log",
+        default="prserv.log",
+        help="log filename(default: prserv.log)",
+    )
+    parser.add_argument(
+        "--loglevel",
+        default="INFO",
+        help="logging level, i.e. CRITICAL, ERROR, WARNING, INFO, DEBUG",
+    )
+    parser.add_argument(
+        "--start",
+        action="store_true",
+        help="start daemon",
+    )
+    parser.add_argument(
+        "--stop",
+        action="store_true",
+        help="stop daemon",
+    )
+    parser.add_argument(
+        "--host",
+        help="ip address to bind",
+        default=PRHOST_DEFAULT,
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=PRPORT_DEFAULT,
+        help="port number (default: 8585)",
+    )
+    parser.add_argument(
+        "-r",
+        "--read-only",
+        action="store_true",
+        help="open database in read-only mode",
+    )
+    parser.add_argument(
+        "-u",
+        "--upstream",
+        default=os.environ.get("PRSERVER_UPSTREAM", None),
+        help="Upstream PR service (host:port)",
+    )
 
-    options, args = parser.parse_args(sys.argv)
-    prserv.init_logger(os.path.abspath(options.logfile),options.loglevel)
+    args = parser.parse_args()
+    init_logger(os.path.abspath(args.log), args.loglevel)
 
-    if options.start:
-        ret=prserv.serv.start_daemon(options.dbfile, options.host, options.port,os.path.abspath(options.logfile), options.read_only)
-    elif options.stop:
-        ret=prserv.serv.stop_daemon(options.host, options.port)
+    if args.start:
+        ret=prserv.serv.start_daemon(
+            args.file,
+            args.host,
+            args.port,
+            os.path.abspath(args.log),
+            args.read_only,
+            args.upstream
+        )
+    elif args.stop:
+        ret=prserv.serv.stop_daemon(args.host, args.port)
     else:
         ret=parser.print_help()
     return ret
diff --git a/bitbake/bin/bitbake-selftest b/bitbake/bin/bitbake-selftest
index f25f23b1ae..ce901232fe 100755
--- a/bitbake/bin/bitbake-selftest
+++ b/bitbake/bin/bitbake-selftest
@@ -15,6 +15,7 @@ import unittest
 try:
     import bb
     import hashserv
+    import prserv
     import layerindexlib
 except RuntimeError as exc:
     sys.exit(str(exc))
@@ -33,6 +34,7 @@ tests = ["bb.tests.codeparser",
          "bb.tests.utils",
          "bb.tests.compression",
          "hashserv.tests",
+         "prserv.tests",
          "layerindexlib.tests.layerindexobj",
          "layerindexlib.tests.restapi",
          "layerindexlib.tests.cooker"]
diff --git a/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables-context.rst b/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables-context.rst
new file mode 100644
index 0000000000..e9c454ba11
--- /dev/null
+++ b/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables-context.rst
@@ -0,0 +1,91 @@
+.. SPDX-License-Identifier: CC-BY-2.5
+
+================
+Variable Context
+================
+
+|
+
+Some variables only take effect, or may only be used, in certain contexts. Some
+should only be used in global files like ``.conf``, while others are intended only
+for local files like ``.bb``. This chapter describes some important variable
+contexts.
+
+.. _ref-varcontext-configuration:
+
+BitBake's own configuration
+===========================
+
+Variables starting with ``BB_`` usually configure the behaviour of BitBake itself.
+For example, one could configure:
+
+- System resources, like disk space to be used (:term:`BB_DISKMON_DIRS`),
+  or the number of tasks to be run in parallel by BitBake (:term:`BB_NUMBER_THREADS`).
+
+- How the fetchers shall behave, e.g., :term:`BB_FETCH_PREMIRRORONLY` is used
+  by BitBake to determine if BitBake's fetcher shall search only
+  :term:`PREMIRRORS` for files.
+
+Those variables are usually configured globally.
+
+BitBake configuration
+=====================
+
+There are variables:
+
+- Like :term:`B` or :term:`T`, that are used to specify directories used by
+  BitBake during the build of a particular recipe. Those variables are
+  specified in ``bitbake.conf``. Some, like :term:`B`, are quite often
+  overwritten in recipes.
+
+- Starting with ``FAKEROOT``, to configure how the ``fakeroot`` command is
+  handled. Those are usually set by ``bitbake.conf`` and might get adapted in a
+  ``bbclass``.
+
+- Detailing where BitBake will store and fetch information from, for
+  data reuse between build runs, like :term:`CACHE`, :term:`DL_DIR` or
+  :term:`PERSISTENT_DIR`. Those are usually global.
+
+
+Layers and files
+================
+
+Variables starting with ``LAYER`` configure how BitBake handles layers.
+Additionally, variables starting with ``BB`` configure how layers and files are
+handled. For example:
+
+- :term:`LAYERDEPENDS` is used to configure on which layers a given layer
+  depends.
+
+- The configured layers are contained in :term:`BBLAYERS` and files in
+  :term:`BBFILES`.
+
+Those variables are often used in the files ``layer.conf`` and ``bblayers.conf``.
+
+Recipes and packages
+====================
+
+Variables handling recipes and packages can be split into:
+
+- :term:`PN`, :term:`PV` or :term:`PF`, for example, which contain information
+  about the name or version of a recipe or package. Usually, the default set
+  in ``bitbake.conf`` is used, but those are from time to time overwritten in
+  recipes.
+
+- :term:`SUMMARY`, :term:`DESCRIPTION`, :term:`LICENSE` or :term:`HOMEPAGE`,
+  which contain the expected information and should be set specifically for
+  every recipe.
+
+- In recipes, variables are also used to control build and runtime
+  dependencies of recipes/packages on other recipes/packages. The most
+  common are :term:`PROVIDES`, :term:`RPROVIDES`, :term:`DEPENDS`,
+  and :term:`RDEPENDS`.
+
+- There are further variables starting with ``SRC`` that specify the sources in
+  a recipe, like :term:`SRC_URI` or :term:`SRCDATE`. Those are also usually set
+  in recipes.
+
+- Which version or provider of a recipe should be given preference when
+  multiple recipes would provide the same item is controlled by variables
+  starting with ``PREFERRED_``. Those are normally set in the configuration
+  files of a ``MACHINE`` or ``DISTRO``.
diff --git a/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables.rst b/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables.rst
index d3936935cd..899e584f91 100644
--- a/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables.rst
+++ b/bitbake/doc/bitbake-user-manual/bitbake-user-manual-ref-variables.rst
@@ -432,6 +432,15 @@ overview of their function and contents.
       ``ConfigParsed`` event can set the variable to trigger the re-parse.
       You must be careful to avoid recursive loops with this functionality.
 
+   :term:`BB_LOADFACTOR_MAX`
+      Setting this to a value will cause BitBake to check the system load
+      average before executing new tasks. If the load average is above the
+      number of CPUs multiplied by this factor, no new task will be started
+      unless no task is currently executing. A value of "1.5" has been found
+      to work reasonably well. This is helpful for systems which don't have
+      pressure regulation enabled; pressure-based regulation is more granular,
+      and pressure values take precedence over the load factor.
+
    :term:`BB_LOGCONFIG`
       Specifies the name of a config file that contains the user logging
       configuration. See
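Editor's note: for illustration, the check :term:`BB_LOADFACTOR_MAX` describes amounts to roughly the following; the function and parameter names here are illustrative, not BitBake's internals:

```python
import os

def can_start_new_task(load_factor_max, running_tasks):
    # Compare the 1-minute load average (Unix-only) against
    # number-of-CPUs * BB_LOADFACTOR_MAX.
    load_1min = os.getloadavg()[0]
    if load_1min <= os.cpu_count() * load_factor_max:
        return True
    # Over the threshold: still allow a task when nothing at all is
    # executing, so the build can never fully stall.
    return running_tasks == 0

print(can_start_new_task(1.5, running_tasks=4))
```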
diff --git a/bitbake/doc/index.rst b/bitbake/doc/index.rst
index 3ff8b1580f..ee1660ac15 100644
--- a/bitbake/doc/index.rst
+++ b/bitbake/doc/index.rst
@@ -13,6 +13,7 @@ BitBake User Manual
    bitbake-user-manual/bitbake-user-manual-intro
    bitbake-user-manual/bitbake-user-manual-execution
    bitbake-user-manual/bitbake-user-manual-metadata
+   bitbake-user-manual/bitbake-user-manual-ref-variables-context
    bitbake-user-manual/bitbake-user-manual-fetching
    bitbake-user-manual/bitbake-user-manual-ref-variables
    bitbake-user-manual/bitbake-user-manual-hello
diff --git a/bitbake/lib/bb/__init__.py b/bitbake/lib/bb/__init__.py
index 768cce84e9..574e0de5be 100644
--- a/bitbake/lib/bb/__init__.py
+++ b/bitbake/lib/bb/__init__.py
@@ -9,7 +9,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 
-__version__ = "2.7.3"
+__version__ = "2.9.1"
 
 import sys
 if sys.version_info < (3, 8, 0):
@@ -36,6 +36,7 @@ class BBHandledException(Exception):
 
 import os
 import logging
+from collections import namedtuple
 
 
 class NullHandler(logging.Handler):
@@ -103,26 +104,6 @@ class BBLoggerAdapter(logging.LoggerAdapter, BBLoggerMixin):
         self.setup_bblogger(logger.name)
         super().__init__(logger, *args, **kwargs)
 
-    if sys.version_info < (3, 6):
-        # These properties were added in Python 3.6. Add them in older versions
-        # for compatibility
-        @property
-        def manager(self):
-            return self.logger.manager
-
-        @manager.setter
-        def manager(self, value):
-            self.logger.manager = value
-
-        @property
-        def name(self):
-            return self.logger.name
-
-        def __repr__(self):
-            logger = self.logger
-            level = logger.getLevelName(logger.getEffectiveLevel())
-            return '<%s %s (%s)>' % (self.__class__.__name__, logger.name, level)
-
 logging.LoggerAdapter = BBLoggerAdapter
 
 logger = logging.getLogger("BitBake")
@@ -227,3 +208,14 @@ def deprecate_import(current, modulename, fromlist, renames = None):
 
     setattr(sys.modules[current], newname, newobj)
 
+TaskData = namedtuple("TaskData", [
+    "pn",
+    "taskname",
+    "fn",
+    "deps",
+    "provides",
+    "taskhash",
+    "unihash",
+    "hashfn",
+    "taskhash_deps",
+])
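Editor's note: the `TaskData` namedtuple added above replaces plain lists indexed by position in runqueue.py (see the `taskdepdata_cache` changes later in this patch). A small sketch of the access pattern, with made-up field values:

```python
from collections import namedtuple

TaskData = namedtuple("TaskData", [
    "pn", "taskname", "fn", "deps", "provides",
    "taskhash", "unihash", "hashfn", "taskhash_deps",
])

# Field values here are invented for illustration.
td = TaskData(pn="zlib", taskname="do_compile", fn="/path/zlib_1.3.bb",
              deps=set(), provides=["zlib"], taskhash="abc123",
              unihash="abc123", hashfn="sstate-hash-fn", taskhash_deps=None)

# namedtuples are immutable; updates build a new instance. This is the
# pattern runqueue.py now uses when a unihash gets remapped:
td = td._replace(unihash="def456")
print(td.unihash, td.deps)  # named access replaces magic indexes like td[6]
```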
diff --git a/bitbake/lib/bb/asyncrpc/__init__.py b/bitbake/lib/bb/asyncrpc/__init__.py
index 639e1607f8..a4371643d7 100644
--- a/bitbake/lib/bb/asyncrpc/__init__.py
+++ b/bitbake/lib/bb/asyncrpc/__init__.py
@@ -5,7 +5,7 @@
 #
 
 
-from .client import AsyncClient, Client, ClientPool
+from .client import AsyncClient, Client
 from .serv import AsyncServer, AsyncServerConnection
 from .connection import DEFAULT_MAX_CHUNK
 from .exceptions import (
diff --git a/bitbake/lib/bb/asyncrpc/client.py b/bitbake/lib/bb/asyncrpc/client.py
index 29a5ab76aa..11179b0fcb 100644
--- a/bitbake/lib/bb/asyncrpc/client.py
+++ b/bitbake/lib/bb/asyncrpc/client.py
@@ -10,11 +10,41 @@ import json
 import os
 import socket
 import sys
+import re
 import contextlib
 from threading import Thread
 from .connection import StreamConnection, WebsocketConnection, DEFAULT_MAX_CHUNK
 from .exceptions import ConnectionClosedError, InvokeError
 
+UNIX_PREFIX = "unix://"
+WS_PREFIX = "ws://"
+WSS_PREFIX = "wss://"
+
+ADDR_TYPE_UNIX = 0
+ADDR_TYPE_TCP = 1
+ADDR_TYPE_WS = 2
+
+WEBSOCKETS_MIN_VERSION = (9, 1)
+# Need websockets 10 with python 3.10+
+if sys.version_info >= (3, 10, 0):
+    WEBSOCKETS_MIN_VERSION = (10, 0)
+
+
+def parse_address(addr):
+    if addr.startswith(UNIX_PREFIX):
+        return (ADDR_TYPE_UNIX, (addr[len(UNIX_PREFIX) :],))
+    elif addr.startswith(WS_PREFIX) or addr.startswith(WSS_PREFIX):
+        return (ADDR_TYPE_WS, (addr,))
+    else:
+        m = re.match(r"\[(?P<host>[^\]]*)\]:(?P<port>\d+)$", addr)
+        if m is not None:
+            host = m.group("host")
+            port = m.group("port")
+        else:
+            host, port = addr.split(":")
+
+        return (ADDR_TYPE_TCP, (host, int(port)))
+
 
 class AsyncClient(object):
     def __init__(
@@ -63,6 +93,24 @@ class AsyncClient(object):
     async def connect_websocket(self, uri):
         import websockets
 
+        try:
+            version = tuple(
+                int(v)
+                for v in websockets.__version__.split(".")[
+                    0 : len(WEBSOCKETS_MIN_VERSION)
+                ]
+            )
+        except ValueError:
+            raise ImportError(
+                f"Unable to parse websockets version '{websockets.__version__}'"
+            )
+
+        if version < WEBSOCKETS_MIN_VERSION:
+            min_ver_str = ".".join(str(v) for v in WEBSOCKETS_MIN_VERSION)
+            raise ImportError(
+                f"Websockets version {websockets.__version__} is less than minimum required version {min_ver_str}"
+            )
+
         async def connect_sock():
             websocket = await websockets.connect(uri, ping_interval=None)
             return WebsocketConnection(websocket, self.timeout)
@@ -202,85 +250,9 @@ class Client(object):
     def close(self):
         if self.loop:
             self.loop.run_until_complete(self.client.close())
-            if sys.version_info >= (3, 6):
-                self.loop.run_until_complete(self.loop.shutdown_asyncgens())
-            self.loop.close()
-            self.loop = None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-        return False
-
-
-class ClientPool(object):
-    def __init__(self, max_clients):
-        self.avail_clients = []
-        self.num_clients = 0
-        self.max_clients = max_clients
-        self.loop = None
-        self.client_condition = None
-
-    @abc.abstractmethod
-    async def _new_client(self):
-        raise NotImplementedError("Must be implemented in derived class")
-
-    def close(self):
-        if self.client_condition:
-            self.client_condition = None
-
-        if self.loop:
-            self.loop.run_until_complete(self.__close_clients())
             self.loop.run_until_complete(self.loop.shutdown_asyncgens())
             self.loop.close()
             self.loop = None
-
-    def run_tasks(self, tasks):
-        if not self.loop:
-            self.loop = asyncio.new_event_loop()
-
-        thread = Thread(target=self.__thread_main, args=(tasks,))
-        thread.start()
-        thread.join()
-
-    @contextlib.asynccontextmanager
-    async def get_client(self):
-        async with self.client_condition:
-            if self.avail_clients:
-                client = self.avail_clients.pop()
-            elif self.num_clients < self.max_clients:
-                self.num_clients += 1
-                client = await self._new_client()
-            else:
-                while not self.avail_clients:
-                    await self.client_condition.wait()
-                client = self.avail_clients.pop()
-
-        try:
-            yield client
-        finally:
-            async with self.client_condition:
-                self.avail_clients.append(client)
-                self.client_condition.notify()
-
-    def __thread_main(self, tasks):
-        async def process_task(task):
-            async with self.get_client() as client:
-                await task(client)
-
-        asyncio.set_event_loop(self.loop)
-        if not self.client_condition:
-            self.client_condition = asyncio.Condition()
-        tasks = [process_task(t) for t in tasks]
-        self.loop.run_until_complete(asyncio.gather(*tasks))
-
-    async def __close_clients(self):
-        for c in self.avail_clients:
-            await c.close()
-        self.avail_clients = []
-        self.num_clients = 0
 
     def __enter__(self):
         return self
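Editor's note: the `parse_address()` helper added above can be exercised standalone. This sketch duplicates its logic together with its prefix constants, so the address forms it accepts are visible at a glance:

```python
import re

UNIX_PREFIX = "unix://"
WS_PREFIX = "ws://"
WSS_PREFIX = "wss://"
ADDR_TYPE_UNIX, ADDR_TYPE_TCP, ADDR_TYPE_WS = 0, 1, 2

def parse_address(addr):
    # Same logic as the helper added in the diff above.
    if addr.startswith(UNIX_PREFIX):
        return (ADDR_TYPE_UNIX, (addr[len(UNIX_PREFIX):],))
    elif addr.startswith(WS_PREFIX) or addr.startswith(WSS_PREFIX):
        return (ADDR_TYPE_WS, (addr,))
    else:
        # Bracketed IPv6 literals ("[::1]:8686") first, then plain host:port.
        m = re.match(r"\[(?P<host>[^\]]*)\]:(?P<port>\d+)$", addr)
        if m is not None:
            host, port = m.group("host"), m.group("port")
        else:
            host, port = addr.split(":")
        return (ADDR_TYPE_TCP, (host, int(port)))

for a in ("unix:///run/hash.sock", "wss://example.com/ws", "[::1]:8686", "localhost:8686"):
    print(a, "->", parse_address(a))
```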
diff --git a/bitbake/lib/bb/asyncrpc/serv.py b/bitbake/lib/bb/asyncrpc/serv.py
index a66117acad..667217c5c1 100644
--- a/bitbake/lib/bb/asyncrpc/serv.py
+++ b/bitbake/lib/bb/asyncrpc/serv.py
@@ -138,14 +138,20 @@ class StreamServer(object):
 
 
 class TCPStreamServer(StreamServer):
-    def __init__(self, host, port, handler, logger):
+    def __init__(self, host, port, handler, logger, *, reuseport=False):
         super().__init__(handler, logger)
         self.host = host
         self.port = port
+        self.reuseport = reuseport
 
     def start(self, loop):
         self.server = loop.run_until_complete(
-            asyncio.start_server(self.handle_stream_client, self.host, self.port)
+            asyncio.start_server(
+                self.handle_stream_client,
+                self.host,
+                self.port,
+                reuse_port=self.reuseport,
+            )
         )
 
         for s in self.server.sockets:
@@ -209,11 +215,12 @@ class UnixStreamServer(StreamServer):
209 215
210 216
211class WebsocketsServer(object): 217class WebsocketsServer(object):
212 def __init__(self, host, port, handler, logger): 218 def __init__(self, host, port, handler, logger, *, reuseport=False):
213 self.host = host 219 self.host = host
214 self.port = port 220 self.port = port
215 self.handler = handler 221 self.handler = handler
216 self.logger = logger 222 self.logger = logger
223 self.reuseport = reuseport
217 224
218 def start(self, loop): 225 def start(self, loop):
219 import websockets.server 226 import websockets.server
@@ -224,6 +231,7 @@ class WebsocketsServer(object):
                 self.host,
                 self.port,
                 ping_interval=None,
+                reuse_port=self.reuseport,
             )
         )
 
@@ -262,14 +270,26 @@ class AsyncServer(object):
         self.loop = None
         self.run_tasks = []
 
-    def start_tcp_server(self, host, port):
-        self.server = TCPStreamServer(host, port, self._client_handler, self.logger)
+    def start_tcp_server(self, host, port, *, reuseport=False):
+        self.server = TCPStreamServer(
+            host,
+            port,
+            self._client_handler,
+            self.logger,
+            reuseport=reuseport,
+        )
 
     def start_unix_server(self, path):
         self.server = UnixStreamServer(path, self._client_handler, self.logger)
 
-    def start_websocket_server(self, host, port):
-        self.server = WebsocketsServer(host, port, self._client_handler, self.logger)
+    def start_websocket_server(self, host, port, reuseport=False):
+        self.server = WebsocketsServer(
+            host,
+            port,
+            self._client_handler,
+            self.logger,
+            reuseport=reuseport,
+        )
 
     async def _client_handler(self, socket):
         address = socket.address
@@ -368,8 +388,7 @@ class AsyncServer(object):
 
         self._serve_forever(tasks)
 
-        if sys.version_info >= (3, 6):
-            self.loop.run_until_complete(self.loop.shutdown_asyncgens())
+        self.loop.run_until_complete(self.loop.shutdown_asyncgens())
         self.loop.close()
 
         queue = multiprocessing.Queue()
diff --git a/bitbake/lib/bb/build.py b/bitbake/lib/bb/build.py
index 44d08f5c55..ab8bce3d57 100644
--- a/bitbake/lib/bb/build.py
+++ b/bitbake/lib/bb/build.py
@@ -197,6 +197,8 @@ def exec_func(func, d, dirs = None):
             for cdir in d.expand(cleandirs).split():
                 bb.utils.remove(cdir, True)
                 bb.utils.mkdirhier(cdir)
+                if cdir == oldcwd:
+                    os.chdir(cdir)
 
     if flags and dirs is None:
         dirs = flags.get('dirs')
diff --git a/bitbake/lib/bb/codeparser.py b/bitbake/lib/bb/codeparser.py
index 2e8b7ced3c..691bdff75e 100644
--- a/bitbake/lib/bb/codeparser.py
+++ b/bitbake/lib/bb/codeparser.py
@@ -484,19 +484,34 @@ class ShellParser():
         """
 
         words = list(words)
-        for word in list(words):
+        for word in words:
             wtree = pyshlex.make_wordtree(word[1])
             for part in wtree:
                 if not isinstance(part, list):
                     continue
 
-                if part[0] in ('`', '$('):
-                    command = pyshlex.wordtree_as_string(part[1:-1])
-                    self._parse_shell(command)
-
-                    if word[0] in ("cmd_name", "cmd_word"):
-                        if word in words:
-                            words.remove(word)
+                candidates = [part]
+
+                # If command is of type:
+                #
+                # var="... $(cmd [...]) ..."
+                #
+                # Then iterate on what's between the quotes and if we find a
+                # list, make that what we check for below.
+                if len(part) >= 3 and part[0] == '"':
+                    for p in part[1:-1]:
+                        if isinstance(p, list):
+                            candidates.append(p)
+
+                for candidate in candidates:
+                    if len(candidate) >= 2:
+                        if candidate[0] in ('`', '$('):
+                            command = pyshlex.wordtree_as_string(candidate[1:-1])
+                            self._parse_shell(command)
+
+                            if word[0] in ("cmd_name", "cmd_word"):
+                                if word in words:
+                                    words.remove(word)
 
         usetoken = False
         for word in words:
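Editor's note: the change above teaches the shell parser to look inside double-quoted word parts for command substitutions, e.g. `FOO="pre $(uname -m) post"` now registers `uname` as an executed command. A toy illustration of the candidate-collection idea on a hand-built, wordtree-like nested list (the real tree comes from bb.pysh.pyshlex.make_wordtree; the shape here is illustrative only):

```python
# Hand-built stand-in for a pyshlex word tree of: FOO="pre $(uname -m) post"
wtree = ['"', 'pre ', ['$(', 'uname -m', ')'], ' post', '"']

candidates = [wtree]
# New logic: when a part is a double-quoted string, the nested lists between
# the quotes are also treated as possible command substitutions.
if len(wtree) >= 3 and wtree[0] == '"':
    for p in wtree[1:-1]:
        if isinstance(p, list):
            candidates.append(p)

for candidate in candidates:
    if len(candidate) >= 2 and candidate[0] in ('`', '$('):
        print("command substitution found:", candidate[1:-1])
# -> command substitution found: ['uname -m']
```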
diff --git a/bitbake/lib/bb/cooker.py b/bitbake/lib/bb/cooker.py
index c5bfef55d6..6754f986bf 100644
--- a/bitbake/lib/bb/cooker.py
+++ b/bitbake/lib/bb/cooker.py
@@ -315,13 +315,13 @@ class BBCooker:
         dbfile = (self.data.getVar("PERSISTENT_DIR") or self.data.getVar("CACHE")) + "/hashserv.db"
         upstream = self.data.getVar("BB_HASHSERVE_UPSTREAM") or None
         if upstream:
-            import socket
             try:
-                sock = socket.create_connection(upstream.split(":"), 5)
-                sock.close()
-            except socket.error as e:
+                with hashserv.create_client(upstream) as client:
+                    client.ping()
+            except (ConnectionError, ImportError) as e:
                 bb.warn("BB_HASHSERVE_UPSTREAM is not valid, unable to connect hash equivalence server at '%s': %s"
                         % (upstream, repr(e)))
+                upstream = None
 
         self.hashservaddr = "unix://%s/hashserve.sock" % self.data.getVar("TOPDIR")
         self.hashserv = hashserv.create_server(
@@ -680,14 +680,14 @@ class BBCooker:
         bb.event.fire(bb.event.TreeDataPreparationCompleted(len(fulltargetlist)), self.data)
         return taskdata, runlist
 
-    def prepareTreeData(self, pkgs_to_build, task):
+    def prepareTreeData(self, pkgs_to_build, task, halt=False):
         """
         Prepare a runqueue and taskdata object for iteration over pkgs_to_build
         """
 
         # We set halt to False here to prevent unbuildable targets raising
         # an exception when we're just generating data
-        taskdata, runlist = self.buildTaskData(pkgs_to_build, task, False, allowincomplete=True)
+        taskdata, runlist = self.buildTaskData(pkgs_to_build, task, halt, allowincomplete=True)
 
         return runlist, taskdata
 
@@ -701,7 +701,7 @@ class BBCooker:
         if not task.startswith("do_"):
             task = "do_%s" % task
 
-        runlist, taskdata = self.prepareTreeData(pkgs_to_build, task)
+        runlist, taskdata = self.prepareTreeData(pkgs_to_build, task, halt=True)
         rq = bb.runqueue.RunQueue(self, self.data, self.recipecaches, taskdata, runlist)
         rq.rqdata.prepare()
         return self.buildDependTree(rq, taskdata)
@@ -1459,7 +1459,6 @@ class BBCooker:
 
             if t in task or getAllTaskSignatures:
                 try:
-                    rq.rqdata.prepare_task_hash(tid)
                     sig.append([pn, t, rq.rqdata.get_task_unihash(tid)])
                 except KeyError:
                     sig.append(self.getTaskSignatures(target, [t])[0])
@@ -1813,8 +1812,8 @@ class CookerCollectFiles(object):
             bb.event.fire(CookerExit(), eventdata)
 
         # We need to track where we look so that we can know when the cache is invalid. There
-        # is no nice way to do this, this is horrid. We intercept the os.listdir()
-        # (or os.scandir() for python 3.6+) calls while we run glob().
+        # is no nice way to do this, this is horrid. We intercept the os.listdir() and os.scandir()
+        # calls while we run glob().
         origlistdir = os.listdir
         if hasattr(os, 'scandir'):
             origscandir = os.scandir
@@ -2225,9 +2224,8 @@ class CookerParser(object):
 
         for process in self.processes:
             process.join()
-            # Added in 3.7, cleans up zombies
-            if hasattr(process, "close"):
-                process.close()
+            # clean up zombies
+            process.close()
 
         bb.codeparser.parser_cache_save()
         bb.codeparser.parser_cache_savemerge()
@@ -2237,12 +2235,13 @@ class CookerParser(object):
             profiles = []
             for i in self.process_names:
                 logfile = "profile-parse-%s.log" % i
-                if os.path.exists(logfile):
+                if os.path.exists(logfile) and os.path.getsize(logfile):
                     profiles.append(logfile)
 
-            pout = "profile-parse.log.processed"
-            bb.utils.process_profilelog(profiles, pout = pout)
-            print("Processed parsing statistics saved to %s" % (pout))
+            if profiles:
+                pout = "profile-parse.log.processed"
+                bb.utils.process_profilelog(profiles, pout = pout)
+                print("Processed parsing statistics saved to %s" % (pout))
 
     def final_cleanup(self):
         if self.syncthread:
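Editor's note: the first cooker hunk above validates BB_HASHSERVE_UPSTREAM with a protocol-level ping rather than a bare TCP connect, and drops the upstream on failure. A hedged sketch of that pattern as a standalone helper, assuming BitBake's lib/ directory is on sys.path so `hashserv` imports (`create_client()` and `ping()` are the calls used in the diff; the helper name is illustrative):

```python
import hashserv  # BitBake's lib/hashserv; assumes bitbake/lib is on sys.path

def validate_upstream(upstream):
    # Probe the configured upstream with a real protocol ping; on any
    # connection problem, warn and fall back to running without it.
    if not upstream:
        return None
    try:
        with hashserv.create_client(upstream) as client:
            client.ping()
    except (ConnectionError, ImportError) as e:
        print("unable to reach hash equivalence server at %r: %r" % (upstream, e))
        return None
    return upstream
```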
diff --git a/bitbake/lib/bb/fetch2/crate.py b/bitbake/lib/bb/fetch2/crate.py
index 01d49435c3..e611736f06 100644
--- a/bitbake/lib/bb/fetch2/crate.py
+++ b/bitbake/lib/bb/fetch2/crate.py
@@ -70,6 +70,7 @@ class Crate(Wget):
             host = 'crates.io/api/v1/crates'
 
         ud.url = "https://%s/%s/%s/download" % (host, name, version)
+        ud.versionsurl = "https://%s/%s/versions" % (host, name)
         ud.parm['downloadfilename'] = "%s-%s.crate" % (name, version)
         if 'name' not in ud.parm:
             ud.parm['name'] = '%s-%s' % (name, version)
@@ -139,3 +140,11 @@ class Crate(Wget):
         mdpath = os.path.join(bbpath, cratepath, mdfile)
         with open(mdpath, "w") as f:
             json.dump(metadata, f)
+
+    def latest_versionstring(self, ud, d):
+        from functools import cmp_to_key
+        json_data = json.loads(self._fetch_index(ud.versionsurl, ud, d))
+        versions = [(0, i["num"], "") for i in json_data["versions"]]
+        versions = sorted(versions, key=cmp_to_key(bb.utils.vercmp))
+
+        return (versions[-1][1], "")
diff --git a/bitbake/lib/bb/fetch2/gcp.py b/bitbake/lib/bb/fetch2/gcp.py
index f40ce2eaa5..eb3e0c6a6b 100644
--- a/bitbake/lib/bb/fetch2/gcp.py
+++ b/bitbake/lib/bb/fetch2/gcp.py
@@ -23,6 +23,7 @@ import urllib.parse, urllib.error
 from bb.fetch2 import FetchMethod
 from bb.fetch2 import FetchError
 from bb.fetch2 import logger
+from bb.fetch2 import runfetchcmd
 
 class GCP(FetchMethod):
     """
diff --git a/bitbake/lib/bb/fetch2/npmsw.py b/bitbake/lib/bb/fetch2/npmsw.py
index ff5f8dc755..b55e885d7b 100644
--- a/bitbake/lib/bb/fetch2/npmsw.py
+++ b/bitbake/lib/bb/fetch2/npmsw.py
@@ -268,7 +268,7 @@ class NpmShrinkWrap(FetchMethod):
 
     def unpack(self, ud, rootdir, d):
         """Unpack the downloaded dependencies"""
-        destdir = d.getVar("S")
+        destdir = rootdir
         destsuffix = ud.parm.get("destsuffix")
         if destsuffix:
             destdir = os.path.join(rootdir, destsuffix)
diff --git a/bitbake/lib/bb/fetch2/svn.py b/bitbake/lib/bb/fetch2/svn.py
index d40e4d2909..0852108e7d 100644
--- a/bitbake/lib/bb/fetch2/svn.py
+++ b/bitbake/lib/bb/fetch2/svn.py
@@ -210,3 +210,6 @@ class Svn(FetchMethod):
 
     def _build_revision(self, ud, d):
         return ud.revision
+
+    def supports_checksum(self, urldata):
+        return False
diff --git a/bitbake/lib/bb/fetch2/wget.py b/bitbake/lib/bb/fetch2/wget.py
index dc025800e6..d76b1d0d38 100644
--- a/bitbake/lib/bb/fetch2/wget.py
+++ b/bitbake/lib/bb/fetch2/wget.py
@@ -87,7 +87,10 @@ class Wget(FetchMethod):
         if not ud.localfile:
             ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))
 
-        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp"
+        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30"
+
+        if ud.type == 'ftp' or ud.type == 'ftps':
+            self.basecmd += " --passive-ftp"
 
         if not self.check_certs(d):
             self.basecmd += " --no-check-certificate"
@@ -131,6 +134,15 @@ class Wget(FetchMethod):
 
         self._runwget(ud, d, fetchcmd, False)
 
+        # Sanity check since wget can pretend it succeed when it didn't
+        # Also, this used to happen if sourceforge sent us to the mirror page
+        if not os.path.exists(localpath):
+            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, localpath), uri)
+
+        if os.path.getsize(localpath) == 0:
+            os.remove(localpath)
+            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)
+
         # Try and verify any checksum now, meaning if it isn't correct, we don't remove the
         # original file, which might be a race (imagine two recipes referencing the same
         # source, one with an incorrect checksum)
@@ -140,15 +152,6 @@ class Wget(FetchMethod):
             # Our lock prevents multiple writers but mirroring code may grab incomplete files
             os.rename(localpath, localpath[:-4])
 
-        # Sanity check since wget can pretend it succeed when it didn't
-        # Also, this used to happen if sourceforge sent us to the mirror page
-        if not os.path.exists(ud.localpath):
-            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)
-
-        if os.path.getsize(ud.localpath) == 0:
-            os.remove(ud.localpath)
-            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)
-
         return True
 
     def checkstatus(self, fetch, ud, d, try_again=True):
diff --git a/bitbake/lib/bb/parse/__init__.py b/bitbake/lib/bb/parse/__init__.py
index a4358f1374..7ffdaa6fd7 100644
--- a/bitbake/lib/bb/parse/__init__.py
+++ b/bitbake/lib/bb/parse/__init__.py
@@ -49,20 +49,23 @@ class SkipPackage(SkipRecipe):
 __mtime_cache = {}
 def cached_mtime(f):
     if f not in __mtime_cache:
-        __mtime_cache[f] = os.stat(f)[stat.ST_MTIME]
+        res = os.stat(f)
+        __mtime_cache[f] = (res.st_mtime_ns, res.st_size, res.st_ino)
     return __mtime_cache[f]
 
 def cached_mtime_noerror(f):
     if f not in __mtime_cache:
         try:
-            __mtime_cache[f] = os.stat(f)[stat.ST_MTIME]
+            res = os.stat(f)
+            __mtime_cache[f] = (res.st_mtime_ns, res.st_size, res.st_ino)
         except OSError:
             return 0
     return __mtime_cache[f]
 
 def check_mtime(f, mtime):
     try:
-        current_mtime = os.stat(f)[stat.ST_MTIME]
+        res = os.stat(f)
+        current_mtime = (res.st_mtime_ns, res.st_size, res.st_ino)
         __mtime_cache[f] = current_mtime
     except OSError:
         current_mtime = 0
@@ -70,7 +73,8 @@ def check_mtime(f, mtime):
 
 def update_mtime(f):
     try:
-        __mtime_cache[f] = os.stat(f)[stat.ST_MTIME]
+        res = os.stat(f)
+        __mtime_cache[f] = (res.st_mtime_ns, res.st_size, res.st_ino)
     except OSError:
         if f in __mtime_cache:
             del __mtime_cache[f]
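Editor's note: the cache above widens its key from `st_mtime` seconds to the triple `(st_mtime_ns, st_size, st_ino)`, so sub-second edits (nanosecond mtime), size changes, and file replacement (new inode) are all detected. A self-contained demonstration of the fingerprint comparison:

```python
import os
import tempfile

def file_fingerprint(path):
    # The same triple the parse cache now stores: nanosecond mtime catches
    # sub-second edits, size catches growth/shrinkage, inode catches
    # atomic replace (rename-over).
    res = os.stat(path)
    return (res.st_mtime_ns, res.st_size, res.st_ino)

with tempfile.NamedTemporaryFile("w", delete=False) as f:
    f.write("x")
before = file_fingerprint(f.name)
with open(f.name, "a") as g:
    g.write("y")  # may land within the same second, but the size changed
print(file_fingerprint(f.name) != before)  # True
os.unlink(f.name)
```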
diff --git a/bitbake/lib/bb/parse/parse_py/BBHandler.py b/bitbake/lib/bb/parse/parse_py/BBHandler.py
index cd1c998f8f..c13e4b9755 100644
--- a/bitbake/lib/bb/parse/parse_py/BBHandler.py
+++ b/bitbake/lib/bb/parse/parse_py/BBHandler.py
@@ -34,6 +34,7 @@ __infunc__ = []
 __inpython__ = False
 __body__ = []
 __classname__ = ""
+__residue__ = []
 
 cached_statements = {}
 
@@ -80,7 +81,7 @@ def inherit(files, fn, lineno, d, deferred=False):
         __inherit_cache = d.getVar('__inherit_cache', False) or []
 
 def get_statements(filename, absolute_filename, base_name):
-    global cached_statements
+    global cached_statements, __residue__, __body__
 
     try:
         return cached_statements[absolute_filename]
@@ -100,6 +101,11 @@ def get_statements(filename, absolute_filename, base_name):
         # add a blank line to close out any python definition
         feeder(lineno, "", filename, base_name, statements, eof=True)
 
+        if __residue__:
+            raise ParseError("Unparsed lines %s: %s" % (filename, str(__residue__)), filename, lineno)
+        if __body__:
+            raise ParseError("Unparsed lines from unclosed function %s: %s" % (filename, str(__body__)), filename, lineno)
+
     if filename.endswith(".bbclass") or filename.endswith(".inc"):
         cached_statements[absolute_filename] = statements
         return statements
diff --git a/bitbake/lib/bb/runqueue.py b/bitbake/lib/bb/runqueue.py
index bc7e18175d..93079a9776 100644
--- a/bitbake/lib/bb/runqueue.py
+++ b/bitbake/lib/bb/runqueue.py
@@ -1273,27 +1273,41 @@ class RunQueueData:
 
         bb.parse.siggen.set_setscene_tasks(self.runq_setscene_tids)
 
+        starttime = time.time()
+        lasttime = starttime
+
         # Iterate over the task list and call into the siggen code
         dealtwith = set()
         todeal = set(self.runtaskentries)
         while todeal:
+            ready = set()
             for tid in todeal.copy():
                 if not (self.runtaskentries[tid].depends - dealtwith):
-                    dealtwith.add(tid)
-                    todeal.remove(tid)
-                    self.prepare_task_hash(tid)
-                    bb.event.check_for_interrupts(self.cooker.data)
+                    self.runtaskentries[tid].taskhash_deps = bb.parse.siggen.prep_taskhash(tid, self.runtaskentries[tid].depends, self.dataCaches)
+                    # get_taskhash for a given tid *must* be called before get_unihash* below
+                    self.runtaskentries[tid].hash = bb.parse.siggen.get_taskhash(tid, self.runtaskentries[tid].depends, self.dataCaches)
+                    ready.add(tid)
+            unihashes = bb.parse.siggen.get_unihashes(ready)
+            for tid in ready:
+                dealtwith.add(tid)
+                todeal.remove(tid)
+                self.runtaskentries[tid].unihash = unihashes[tid]
+
+            bb.event.check_for_interrupts(self.cooker.data)
+
+            if time.time() > (lasttime + 30):
+                lasttime = time.time()
+                hashequiv_logger.verbose("Initial setup loop progress: %s of %s in %s" % (len(todeal), len(self.runtaskentries), lasttime - starttime))
+
+        endtime = time.time()
+        if (endtime-starttime > 60):
+            hashequiv_logger.verbose("Initial setup loop took: %s" % (endtime-starttime))
 
         bb.parse.siggen.writeout_file_checksum_cache()
 
         #self.dump_data()
         return len(self.runtaskentries)
 
-    def prepare_task_hash(self, tid):
-        bb.parse.siggen.prep_taskhash(tid, self.runtaskentries[tid].depends, self.dataCaches)
-        self.runtaskentries[tid].hash = bb.parse.siggen.get_taskhash(tid, self.runtaskentries[tid].depends, self.dataCaches)
-        self.runtaskentries[tid].unihash = bb.parse.siggen.get_unihash(tid)
-
     def dump_data(self):
         """
         Dump some debug information on the internal data structures
@@ -2438,14 +2452,17 @@ class RunQueueExecute:
2438 taskdepdata_cache = {} 2452 taskdepdata_cache = {}
2439 for task in self.rqdata.runtaskentries: 2453 for task in self.rqdata.runtaskentries:
2440 (mc, fn, taskname, taskfn) = split_tid_mcfn(task) 2454 (mc, fn, taskname, taskfn) = split_tid_mcfn(task)
2441 pn = self.rqdata.dataCaches[mc].pkg_fn[taskfn] 2455 taskdepdata_cache[task] = bb.TaskData(
2442 deps = self.rqdata.runtaskentries[task].depends 2456 pn = self.rqdata.dataCaches[mc].pkg_fn[taskfn],
2443 provides = self.rqdata.dataCaches[mc].fn_provides[taskfn] 2457 taskname = taskname,
2444 taskhash = self.rqdata.runtaskentries[task].hash 2458 fn = fn,
2445 unihash = self.rqdata.runtaskentries[task].unihash 2459 deps = self.filtermcdeps(task, mc, self.rqdata.runtaskentries[task].depends),
2446 deps = self.filtermcdeps(task, mc, deps) 2460 provides = self.rqdata.dataCaches[mc].fn_provides[taskfn],
2447 hashfn = self.rqdata.dataCaches[mc].hashfn[taskfn] 2461 taskhash = self.rqdata.runtaskentries[task].hash,
2448 taskdepdata_cache[task] = [pn, taskname, fn, deps, provides, taskhash, unihash, hashfn] 2462 unihash = self.rqdata.runtaskentries[task].unihash,
2463 hashfn = self.rqdata.dataCaches[mc].hashfn[taskfn],
2464 taskhash_deps = self.rqdata.runtaskentries[task].taskhash_deps,
2465 )
2449 2466
2450 self.taskdepdata_cache = taskdepdata_cache 2467 self.taskdepdata_cache = taskdepdata_cache
2451 2468
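Switching taskdepdata_cache entries from positional lists to the bb.TaskData namedtuple makes later field accesses self-describing (.deps instead of [3]). Since namedtuples are immutable, in-place updates become _replace() calls, as the next hunk shows. A standalone illustration with an abridged field set:

    from collections import namedtuple

    # Abridged stand-in for bb.TaskData; the real tuple carries more fields.
    TaskData = namedtuple("TaskData", ["pn", "taskname", "deps", "unihash"])

    td = TaskData(pn="zlib", taskname="do_compile", deps=set(), unihash="abc")
    td = td._replace(unihash="def")  # immutable, so _replace() returns a copy
    print(td.unihash)                # named access replaces magic indexes like [6]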
@@ -2460,9 +2477,11 @@ class RunQueueExecute:
2460 while next: 2477 while next:
2461 additional = [] 2478 additional = []
2462 for revdep in next: 2479 for revdep in next:
2463 self.taskdepdata_cache[revdep][6] = self.rqdata.runtaskentries[revdep].unihash 2480 self.taskdepdata_cache[revdep] = self.taskdepdata_cache[revdep]._replace(
2481 unihash=self.rqdata.runtaskentries[revdep].unihash
2482 )
2464 taskdepdata[revdep] = self.taskdepdata_cache[revdep] 2483 taskdepdata[revdep] = self.taskdepdata_cache[revdep]
2465 for revdep2 in self.taskdepdata_cache[revdep][3]: 2484 for revdep2 in self.taskdepdata_cache[revdep].deps:
2466 if revdep2 not in taskdepdata: 2485 if revdep2 not in taskdepdata:
2467 additional.append(revdep2) 2486 additional.append(revdep2)
2468 next = additional 2487 next = additional
@@ -2556,17 +2575,28 @@ class RunQueueExecute:
2556 elif self.rqdata.runtaskentries[p].depends.isdisjoint(total): 2575 elif self.rqdata.runtaskentries[p].depends.isdisjoint(total):
2557 next.add(p) 2576 next.add(p)
2558 2577
2578 starttime = time.time()
2579 lasttime = starttime
2580
2559 # When an item doesn't have dependencies in total, we can process it. Drop items from total when handled 2581 # When an item doesn't have dependencies in total, we can process it. Drop items from total when handled
2560 while next: 2582 while next:
2561 current = next.copy() 2583 current = next.copy()
2562 next = set() 2584 next = set()
2585 ready = {}
2563 for tid in current: 2586 for tid in current:
2564 if self.rqdata.runtaskentries[p].depends and not self.rqdata.runtaskentries[tid].depends.isdisjoint(total): 2587 if self.rqdata.runtaskentries[p].depends and not self.rqdata.runtaskentries[tid].depends.isdisjoint(total):
2565 continue 2588 continue
2589 # get_taskhash for a given tid *must* be called before get_unihash* below
2590 ready[tid] = bb.parse.siggen.get_taskhash(tid, self.rqdata.runtaskentries[tid].depends, self.rqdata.dataCaches)
2591
2592 unihashes = bb.parse.siggen.get_unihashes(ready.keys())
2593
2594 for tid in ready:
2566 orighash = self.rqdata.runtaskentries[tid].hash 2595 orighash = self.rqdata.runtaskentries[tid].hash
2567 newhash = bb.parse.siggen.get_taskhash(tid, self.rqdata.runtaskentries[tid].depends, self.rqdata.dataCaches) 2596 newhash = ready[tid]
2568 origuni = self.rqdata.runtaskentries[tid].unihash 2597 origuni = self.rqdata.runtaskentries[tid].unihash
2569 newuni = bb.parse.siggen.get_unihash(tid) 2598 newuni = unihashes[tid]
2599
2570 # FIXME, need to check it can come from sstate at all for determinism? 2600 # FIXME, need to check it can come from sstate at all for determinism?
2571 remapped = False 2601 remapped = False
2572 if newuni == origuni: 2602 if newuni == origuni:
@@ -2587,6 +2617,15 @@ class RunQueueExecute:
2587 next |= self.rqdata.runtaskentries[tid].revdeps 2617 next |= self.rqdata.runtaskentries[tid].revdeps
2588 total.remove(tid) 2618 total.remove(tid)
2589 next.intersection_update(total) 2619 next.intersection_update(total)
2620 bb.event.check_for_interrupts(self.cooker.data)
2621
2622 if time.time() > (lasttime + 30):
2623 lasttime = time.time()
2624 hashequiv_logger.verbose("Rehash loop slow progress: %s in %s" % (len(total), lasttime - starttime))
2625
2626 endtime = time.time()
2627 if (endtime-starttime > 60):
2628 hashequiv_logger.verbose("Rehash loop took more than 60s: %s" % (endtime-starttime))
2590 2629
2591 if changed: 2630 if changed:
2592 for mc in self.rq.worker: 2631 for mc in self.rq.worker:
@@ -2806,13 +2845,19 @@ class RunQueueExecute:
2806 additional = [] 2845 additional = []
2807 for revdep in next: 2846 for revdep in next:
2808 (mc, fn, taskname, taskfn) = split_tid_mcfn(revdep) 2847 (mc, fn, taskname, taskfn) = split_tid_mcfn(revdep)
2809 pn = self.rqdata.dataCaches[mc].pkg_fn[taskfn]
2810 deps = getsetscenedeps(revdep) 2848 deps = getsetscenedeps(revdep)
2811 provides = self.rqdata.dataCaches[mc].fn_provides[taskfn] 2849
2812 taskhash = self.rqdata.runtaskentries[revdep].hash 2850 taskdepdata[revdep] = bb.TaskData(
2813 unihash = self.rqdata.runtaskentries[revdep].unihash 2851 pn = self.rqdata.dataCaches[mc].pkg_fn[taskfn],
2814 hashfn = self.rqdata.dataCaches[mc].hashfn[taskfn] 2852 taskname = taskname,
2815 taskdepdata[revdep] = [pn, taskname, fn, deps, provides, taskhash, unihash, hashfn] 2853 fn = fn,
2854 deps = deps,
2855 provides = self.rqdata.dataCaches[mc].fn_provides[taskfn],
2856 taskhash = self.rqdata.runtaskentries[revdep].hash,
2857 unihash = self.rqdata.runtaskentries[revdep].unihash,
2858 hashfn = self.rqdata.dataCaches[mc].hashfn[taskfn],
2859 taskhash_deps = self.rqdata.runtaskentries[revdep].taskhash_deps,
2860 )
2816 for revdep2 in deps: 2861 for revdep2 in deps:
2817 if revdep2 not in taskdepdata: 2862 if revdep2 not in taskdepdata:
2818 additional.append(revdep2) 2863 additional.append(revdep2)
diff --git a/bitbake/lib/bb/siggen.py b/bitbake/lib/bb/siggen.py
index 2a0ecf57e1..92066da00c 100644
--- a/bitbake/lib/bb/siggen.py
+++ b/bitbake/lib/bb/siggen.py
@@ -15,6 +15,7 @@ import difflib
15import simplediff 15import simplediff
16import json 16import json
17import types 17import types
18from contextlib import contextmanager
18import bb.compress.zstd 19import bb.compress.zstd
19from bb.checksum import FileChecksumCache 20from bb.checksum import FileChecksumCache
20from bb import runqueue 21from bb import runqueue
@@ -28,6 +29,14 @@ hashequiv_logger = logging.getLogger('BitBake.SigGen.HashEquiv')
28# The minimum version of the find_siginfo function we need 29# The minimum version of the find_siginfo function we need
29find_siginfo_minversion = 2 30find_siginfo_minversion = 2
30 31
32HASHSERV_ENVVARS = [
33 "SSL_CERT_DIR",
34 "SSL_CERT_FILE",
35 "NO_PROXY",
36 "HTTPS_PROXY",
37 "HTTP_PROXY"
38]
39
31def check_siggen_version(siggen): 40def check_siggen_version(siggen):
32 if not hasattr(siggen, "find_siginfo_version"): 41 if not hasattr(siggen, "find_siginfo_version"):
33 bb.fatal("Siggen from metadata (OE-Core?) is too old, please update it (no version found)") 42 bb.fatal("Siggen from metadata (OE-Core?) is too old, please update it (no version found)")
@@ -372,7 +381,7 @@ class SignatureGeneratorBasic(SignatureGenerator):
372 self.taints[tid] = taint 381 self.taints[tid] = taint
373 logger.warning("%s is tainted from a forced run" % tid) 382 logger.warning("%s is tainted from a forced run" % tid)
374 383
375 return 384 return set(dep for _, dep in self.runtaskdeps[tid])
376 385
377 def get_taskhash(self, tid, deps, dataCaches): 386 def get_taskhash(self, tid, deps, dataCaches):
378 387
@@ -531,19 +540,28 @@ class SignatureGeneratorUniHashMixIn(object):
531 def __init__(self, data): 540 def __init__(self, data):
532 self.extramethod = {} 541 self.extramethod = {}
533 # NOTE: The cache only tracks hashes that exist. Hashes that don't 542 # NOTE: The cache only tracks hashes that exist. Hashes that don't
534 # exist are always queries from the server since it is possible for 543 # exist are always queried from the server since it is possible for
535 # hashes to appear over time, but much less likely for them to 544 # hashes to appear over time, but much less likely for them to
536 # disappear 545 # disappear
537 self.unihash_exists_cache = set() 546 self.unihash_exists_cache = set()
538 self.username = None 547 self.username = None
539 self.password = None 548 self.password = None
549 self.env = {}
550
551 origenv = data.getVar("BB_ORIGENV")
552 for e in HASHSERV_ENVVARS:
553 value = data.getVar(e)
554 if not value and origenv:
555 value = origenv.getVar(e)
556 if value:
557 self.env[e] = value
540 super().__init__(data) 558 super().__init__(data)
541 559
542 def get_taskdata(self): 560 def get_taskdata(self):
543 return (self.server, self.method, self.extramethod, self.max_parallel, self.username, self.password) + super().get_taskdata() 561 return (self.server, self.method, self.extramethod, self.username, self.password, self.env) + super().get_taskdata()
544 562
545 def set_taskdata(self, data): 563 def set_taskdata(self, data):
546 self.server, self.method, self.extramethod, self.max_parallel, self.username, self.password = data[:6] 564 self.server, self.method, self.extramethod, self.username, self.password, self.env = data[:6]
547 super().set_taskdata(data[6:]) 565 super().set_taskdata(data[6:])
548 566
549 def get_hashserv_creds(self): 567 def get_hashserv_creds(self):
@@ -555,15 +573,27 @@ class SignatureGeneratorUniHashMixIn(object):
555 573
556 return {} 574 return {}
557 575
558 def client(self): 576 @contextmanager
559 if getattr(self, '_client', None) is None: 577 def _client_env(self):
560 self._client = hashserv.create_client(self.server, **self.get_hashserv_creds()) 578 orig_env = os.environ.copy()
561 return self._client 579 try:
580 for k, v in self.env.items():
581 os.environ[k] = v
562 582
563 def client_pool(self): 583 yield
564 if getattr(self, '_client_pool', None) is None: 584 finally:
565 self._client_pool = hashserv.client.ClientPool(self.server, self.max_parallel, **self.get_hashserv_creds()) 585 for k, v in self.env.items():
566 return self._client_pool 586 if k in orig_env:
587 os.environ[k] = orig_env[k]
588 else:
589 del os.environ[k]
590
591 @contextmanager
592 def client(self):
593 with self._client_env():
594 if getattr(self, '_client', None) is None:
595 self._client = hashserv.create_client(self.server, **self.get_hashserv_creds())
596 yield self._client
567 597
568 def reset(self, data): 598 def reset(self, data):
569 self.__close_clients() 599 self.__close_clients()
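Every use of the hashserv client now runs under _client_env(), which overlays the captured proxy/SSL variables onto os.environ and restores the previous state on exit, even if the body raises. The save/overlay/restore shape, reduced to a self-contained helper:

    import os
    from contextlib import contextmanager

    @contextmanager
    def temp_environ(overrides):
        """Temporarily overlay os.environ with 'overrides'; restore on exit."""
        saved = os.environ.copy()
        try:
            os.environ.update(overrides)
            yield
        finally:
            for k in overrides:
                if k in saved:
                    os.environ[k] = saved[k]
                else:
                    os.environ.pop(k, None)

    # e.g. with temp_environ({"HTTPS_PROXY": "http://proxy:8080"}): ...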
@@ -574,12 +604,13 @@ class SignatureGeneratorUniHashMixIn(object):
574 return super().exit() 604 return super().exit()
575 605
576 def __close_clients(self): 606 def __close_clients(self):
577 if getattr(self, '_client', None) is not None: 607 with self._client_env():
578 self._client.close() 608 if getattr(self, '_client', None) is not None:
579 self._client = None 609 self._client.close()
580 if getattr(self, '_client_pool', None) is not None: 610 self._client = None
581 self._client_pool.close() 611 if getattr(self, '_client_pool', None) is not None:
582 self._client_pool = None 612 self._client_pool.close()
613 self._client_pool = None
583 614
584 def get_stampfile_hash(self, tid): 615 def get_stampfile_hash(self, tid):
585 if tid in self.taskhash: 616 if tid in self.taskhash:
@@ -640,23 +671,20 @@ class SignatureGeneratorUniHashMixIn(object):
640 if len(query) == 0: 671 if len(query) == 0:
641 return {} 672 return {}
642 673
643 uncached_query = {} 674 query_keys = []
644 result = {} 675 result = {}
645 for key, unihash in query.items(): 676 for key, unihash in query.items():
646 if unihash in self.unihash_exists_cache: 677 if unihash in self.unihash_exists_cache:
647 result[key] = True 678 result[key] = True
648 else: 679 else:
649 uncached_query[key] = unihash 680 query_keys.append(key)
650 681
651 if self.max_parallel <= 1 or len(uncached_query) <= 1: 682 if query_keys:
652 # No parallelism required. Make the query serially with the single client 683 with self.client() as client:
653 uncached_result = { 684 query_result = client.unihash_exists_batch(query[k] for k in query_keys)
654 key: self.client().unihash_exists(value) for key, value in uncached_query.items()
655 }
656 else:
657 uncached_result = self.client_pool().unihashes_exist(uncached_query)
658 685
659 for key, exists in uncached_result.items(): 686 for idx, key in enumerate(query_keys):
687 exists = query_result[idx]
660 if exists: 688 if exists:
661 self.unihash_exists_cache.add(query[key]) 689 self.unihash_exists_cache.add(query[key])
662 result[key] = exists 690 result[key] = exists
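The rewrite relies on a positional contract: unihash_exists_batch() takes an iterable of hashes and returns one result per input, in order, so keys and answers are re-associated by index once cache hits are filtered out. The filter/batch/zip pattern in generic form (batch_exists() is a hypothetical stand-in for the client call):

    def exists_with_cache(query, cache, batch_exists):
        # 'query' maps key -> hash; batch_exists() must return one bool per
        # input, in input order (stand-in for client.unihash_exists_batch()).
        result = {key: True for key, h in query.items() if h in cache}
        missing = [key for key in query if key not in result]
        if missing:
            answers = batch_exists(query[k] for k in missing)
            for key, exists in zip(missing, answers):
                if exists:
                    cache.add(query[key])
                result[key] = exists
        return result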
@@ -672,27 +700,20 @@ class SignatureGeneratorUniHashMixIn(object):
672 unihash 700 unihash
673 """ 701 """
674 result = {} 702 result = {}
675 queries = {} 703 query_tids = []
676 query_result = {}
677 704
678 for tid in tids: 705 for tid in tids:
679 unihash = self.get_cached_unihash(tid) 706 unihash = self.get_cached_unihash(tid)
680 if unihash: 707 if unihash:
681 result[tid] = unihash 708 result[tid] = unihash
682 else: 709 else:
683 queries[tid] = (self._get_method(tid), self.taskhash[tid]) 710 query_tids.append(tid)
684 711
685 if len(queries) == 0: 712 if query_tids:
686 return result 713 with self.client() as client:
687 714 unihashes = client.get_unihash_batch((self._get_method(tid), self.taskhash[tid]) for tid in query_tids)
688 if self.max_parallel <= 1 or len(queries) <= 1:
689 # No parallelism required. Make the query serially with the single client
690 for tid, args in queries.items():
691 query_result[tid] = self.client().get_unihash(*args)
692 else:
693 query_result = self.client_pool().get_unihashes(queries)
694 715
695 for tid, unihash in query_result.items(): 716 for idx, tid in enumerate(query_tids):
696 # In the absence of being able to discover a unique hash from the 717 # In the absence of being able to discover a unique hash from the
697 # server, make it be equivalent to the taskhash. The unique "hash" only 718 # server, make it be equivalent to the taskhash. The unique "hash" only
698 # really needs to be a unique string (not even necessarily a hash), but 719 # really needs to be a unique string (not even necessarily a hash), but
@@ -707,6 +728,8 @@ class SignatureGeneratorUniHashMixIn(object):
707 # to the server, there is a better chance that they will agree on 728 # to the server, there is a better chance that they will agree on
708 # the unique hash. 729 # the unique hash.
709 taskhash = self.taskhash[tid] 730 taskhash = self.taskhash[tid]
731 unihash = unihashes[idx]
732
710 if unihash: 733 if unihash:
711 # A unique hash equal to the taskhash is not very interesting, 734 # A unique hash equal to the taskhash is not very interesting,
712 # so it is reported it at debug level 2. If they differ, that 735 # so it is reported it at debug level 2. If they differ, that
@@ -785,7 +808,9 @@ class SignatureGeneratorUniHashMixIn(object):
785 if tid in self.extramethod: 808 if tid in self.extramethod:
786 method = method + self.extramethod[tid] 809 method = method + self.extramethod[tid]
787 810
788 data = self.client().report_unihash(taskhash, method, outhash, unihash, extra_data) 811 with self.client() as client:
812 data = client.report_unihash(taskhash, method, outhash, unihash, extra_data)
813
789 new_unihash = data['unihash'] 814 new_unihash = data['unihash']
790 815
791 if new_unihash != unihash: 816 if new_unihash != unihash:
@@ -816,7 +841,9 @@ class SignatureGeneratorUniHashMixIn(object):
816 if tid in self.extramethod: 841 if tid in self.extramethod:
817 method = method + self.extramethod[tid] 842 method = method + self.extramethod[tid]
818 843
819 data = self.client().report_unihash_equiv(taskhash, method, wanted_unihash, extra_data) 844 with self.client() as client:
845 data = client.report_unihash_equiv(taskhash, method, wanted_unihash, extra_data)
846
820 hashequiv_logger.verbose('Reported task %s as unihash %s to %s (%s)' % (tid, wanted_unihash, self.server, str(data))) 847 hashequiv_logger.verbose('Reported task %s as unihash %s to %s (%s)' % (tid, wanted_unihash, self.server, str(data)))
821 848
822 if data is None: 849 if data is None:
@@ -849,7 +876,6 @@ class SignatureGeneratorTestEquivHash(SignatureGeneratorUniHashMixIn, SignatureG
849 super().init_rundepcheck(data) 876 super().init_rundepcheck(data)
850 self.server = data.getVar('BB_HASHSERVE') 877 self.server = data.getVar('BB_HASHSERVE')
851 self.method = "sstate_output_hash" 878 self.method = "sstate_output_hash"
852 self.max_parallel = 1
853 879
854def clean_checksum_file_path(file_checksum_tuple): 880def clean_checksum_file_path(file_checksum_tuple):
855 f, cs = file_checksum_tuple 881 f, cs = file_checksum_tuple
diff --git a/bitbake/lib/bb/tests/codeparser.py b/bitbake/lib/bb/tests/codeparser.py
index f6585fb3aa..c0d1362a0c 100644
--- a/bitbake/lib/bb/tests/codeparser.py
+++ b/bitbake/lib/bb/tests/codeparser.py
@@ -106,6 +106,46 @@ ${D}${libdir}/pkgconfig/*.pc
106 self.parseExpression("foo=$(echo bar)") 106 self.parseExpression("foo=$(echo bar)")
107 self.assertExecs(set(["echo"])) 107 self.assertExecs(set(["echo"]))
108 108
109 def test_assign_subshell_expansion_quotes(self):
110 self.parseExpression('foo="$(echo bar)"')
111 self.assertExecs(set(["echo"]))
112
113 def test_assign_subshell_expansion_nested(self):
114 self.parseExpression('foo="$(func1 "$(func2 bar$(func3))")"')
115 self.assertExecs(set(["func1", "func2", "func3"]))
116
117 def test_assign_subshell_expansion_multiple(self):
118 self.parseExpression('foo="$(func1 "$(func2)") $(func3)"')
119 self.assertExecs(set(["func1", "func2", "func3"]))
120
121 def test_assign_subshell_expansion_escaped_quotes(self):
122 self.parseExpression('foo="\\"fo\\"o$(func1)"')
123 self.assertExecs(set(["func1"]))
124
125 def test_assign_subshell_expansion_empty(self):
126 self.parseExpression('foo="bar$()foo"')
127 self.assertExecs(set())
128
129 def test_assign_subshell_backticks(self):
130 self.parseExpression("foo=`echo bar`")
131 self.assertExecs(set(["echo"]))
132
133 def test_assign_subshell_backticks_quotes(self):
134 self.parseExpression('foo="`echo bar`"')
135 self.assertExecs(set(["echo"]))
136
137 def test_assign_subshell_backticks_multiple(self):
138 self.parseExpression('foo="`func1 bar` `func2`"')
139 self.assertExecs(set(["func1", "func2"]))
140
141 def test_assign_subshell_backticks_escaped_quotes(self):
142 self.parseExpression('foo="\\"fo\\"o`func1`"')
143 self.assertExecs(set(["func1"]))
144
145 def test_assign_subshell_backticks_empty(self):
146 self.parseExpression('foo="bar``foo"')
147 self.assertExecs(set())
148
109 def test_shell_unexpanded(self): 149 def test_shell_unexpanded(self):
110 self.setEmptyVars(["QT_BASE_NAME"]) 150 self.setEmptyVars(["QT_BASE_NAME"])
111 self.parseExpression('echo "${QT_BASE_NAME}"') 151 self.parseExpression('echo "${QT_BASE_NAME}"')
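These tests pin down executable extraction from quoted, nested, and backtick-style command substitutions. Outside the test harness, the same check can be made directly against the shell parser; a sketch, assuming the bb.codeparser.ShellParser API that the harness above wraps:

    import logging
    import bb.codeparser

    # Feed an expression with nested substitutions to the shell parser and
    # inspect the executables it found (assumed API, mirroring the tests).
    parser = bb.codeparser.ShellParser("example", logging.getLogger("BitBake"))
    execs = parser.parse_shell('foo="$(func1 "$(func2 bar$(func3))")"')
    print(execs)  # expected: {'func1', 'func2', 'func3'}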
diff --git a/bitbake/lib/bb/tests/fetch.py b/bitbake/lib/bb/tests/fetch.py
index 85c1f79ff3..1e55cdd299 100644
--- a/bitbake/lib/bb/tests/fetch.py
+++ b/bitbake/lib/bb/tests/fetch.py
@@ -511,7 +511,8 @@ class MirrorUriTest(FetcherTest):
511 mirrorvar = "http://.*/.* file:///somepath/downloads/ " \ 511 mirrorvar = "http://.*/.* file:///somepath/downloads/ " \
512 "git://someserver.org/bitbake git://git.openembedded.org/bitbake " \ 512 "git://someserver.org/bitbake git://git.openembedded.org/bitbake " \
513 "https://.*/.* file:///someotherpath/downloads/ " \ 513 "https://.*/.* file:///someotherpath/downloads/ " \
514 "http://.*/.* file:///someotherpath/downloads/" 514 "http://.*/.* file:///someotherpath/downloads/ " \
515 "svn://svn.server1.com/ svn://svn.server2.com/"
515 516
516 def test_urireplace(self): 517 def test_urireplace(self):
517 self.d.setVar("FILESPATH", ".") 518 self.d.setVar("FILESPATH", ".")
@@ -535,6 +536,13 @@ class MirrorUriTest(FetcherTest):
535 uris, uds = bb.fetch2.build_mirroruris(fetcher, mirrors, self.d) 536 uris, uds = bb.fetch2.build_mirroruris(fetcher, mirrors, self.d)
536 self.assertEqual(uris, ['file:///someotherpath/downloads/bitbake-1.0.tar.gz']) 537 self.assertEqual(uris, ['file:///someotherpath/downloads/bitbake-1.0.tar.gz'])
537 538
539 def test_urilistsvn(self):
540 # Catch svn:// -> svn:// bug
541 fetcher = bb.fetch.FetchData("svn://svn.server1.com/isource/svnroot/reponame/tags/tagname;module=path_in_tagnamefolder;protocol=https;rev=2", self.d)
542 mirrors = bb.fetch2.mirror_from_string(self.mirrorvar)
543 uris, uds = bb.fetch2.build_mirroruris(fetcher, mirrors, self.d)
544 self.assertEqual(uris, ['svn://svn.server2.com/isource/svnroot/reponame/tags/tagname;module=path_in_tagnamefolder;protocol=https;rev=2'])
545
538 def test_mirror_of_mirror(self): 546 def test_mirror_of_mirror(self):
539 # Test if mirror of a mirror works 547 # Test if mirror of a mirror works
540 mirrorvar = self.mirrorvar + " http://.*/.* http://otherdownloads.yoctoproject.org/downloads/" 548 mirrorvar = self.mirrorvar + " http://.*/.* http://otherdownloads.yoctoproject.org/downloads/"
@@ -1493,6 +1501,12 @@ class FetchLatestVersionTest(FetcherTest):
1493 : "2.8", 1501 : "2.8",
1494 } 1502 }
1495 1503
1504 test_crate_uris = {
1505 # basic example; version pattern "A.B.C+cargo-D.E.F"
1506 ("cargo-c", "crate://crates.io/cargo-c/0.9.18+cargo-0.69")
1507 : "0.9.29"
1508 }
1509
1496 @skipIfNoNetwork() 1510 @skipIfNoNetwork()
1497 def test_git_latest_versionstring(self): 1511 def test_git_latest_versionstring(self):
1498 for k, v in self.test_git_uris.items(): 1512 for k, v in self.test_git_uris.items():
@@ -1511,7 +1525,7 @@ class FetchLatestVersionTest(FetcherTest):
1511 1525
1512 def test_wget_latest_versionstring(self): 1526 def test_wget_latest_versionstring(self):
1513 testdata = os.path.dirname(os.path.abspath(__file__)) + "/fetch-testdata" 1527 testdata = os.path.dirname(os.path.abspath(__file__)) + "/fetch-testdata"
1514 server = HTTPService(testdata) 1528 server = HTTPService(testdata, host="127.0.0.1")
1515 server.start() 1529 server.start()
1516 port = server.port 1530 port = server.port
1517 try: 1531 try:
@@ -1519,10 +1533,10 @@ class FetchLatestVersionTest(FetcherTest):
1519 self.d.setVar("PN", k[0]) 1533 self.d.setVar("PN", k[0])
1520 checkuri = "" 1534 checkuri = ""
1521 if k[2]: 1535 if k[2]:
1522 checkuri = "http://localhost:%s/" % port + k[2] 1536 checkuri = "http://127.0.0.1:%s/" % port + k[2]
1523 self.d.setVar("UPSTREAM_CHECK_URI", checkuri) 1537 self.d.setVar("UPSTREAM_CHECK_URI", checkuri)
1524 self.d.setVar("UPSTREAM_CHECK_REGEX", k[3]) 1538 self.d.setVar("UPSTREAM_CHECK_REGEX", k[3])
1525 url = "http://localhost:%s/" % port + k[1] 1539 url = "http://127.0.0.1:%s/" % port + k[1]
1526 ud = bb.fetch2.FetchData(url, self.d) 1540 ud = bb.fetch2.FetchData(url, self.d)
1527 pupver = ud.method.latest_versionstring(ud, self.d) 1541 pupver = ud.method.latest_versionstring(ud, self.d)
1528 verstring = pupver[0] 1542 verstring = pupver[0]
@@ -1532,6 +1546,16 @@ class FetchLatestVersionTest(FetcherTest):
1532 finally: 1546 finally:
1533 server.stop() 1547 server.stop()
1534 1548
1549 @skipIfNoNetwork()
1550 def test_crate_latest_versionstring(self):
1551 for k, v in self.test_crate_uris.items():
1552 self.d.setVar("PN", k[0])
1553 ud = bb.fetch2.FetchData(k[1], self.d)
1554 pupver = ud.method.latest_versionstring(ud, self.d)
1555 verstring = pupver[0]
1556 self.assertTrue(verstring, msg="Could not find upstream version for %s" % k[0])
1557 r = bb.utils.vercmp_string(v, verstring)
1558 self.assertTrue(r == -1 or r == 0, msg="Package %s, version: %s <= %s" % (k[0], v, verstring))
1535 1559
1536class FetchCheckStatusTest(FetcherTest): 1560class FetchCheckStatusTest(FetcherTest):
1537 test_wget_uris = ["https://downloads.yoctoproject.org/releases/sato/sato-engine-0.1.tar.gz", 1561 test_wget_uris = ["https://downloads.yoctoproject.org/releases/sato/sato-engine-0.1.tar.gz",
diff --git a/bitbake/lib/bb/ui/buildinfohelper.py b/bitbake/lib/bb/ui/buildinfohelper.py
index 8b212b7803..4ee45d67a2 100644
--- a/bitbake/lib/bb/ui/buildinfohelper.py
+++ b/bitbake/lib/bb/ui/buildinfohelper.py
@@ -559,7 +559,10 @@ class ORMWrapper(object):
559 # we might have an invalid link; no way to detect this. just set it to None 559 # we might have an invalid link; no way to detect this. just set it to None
560 filetarget_obj = None 560 filetarget_obj = None
561 561
562 parent_obj = Target_File.objects.get(target = target_obj, path = parent_path, inodetype = Target_File.ITYPE_DIRECTORY) 562 try:
563 parent_obj = Target_File.objects.get(target = target_obj, path = parent_path, inodetype = Target_File.ITYPE_DIRECTORY)
564 except Target_File.DoesNotExist:
565 parent_obj = None
563 566
564 Target_File.objects.create( 567 Target_File.objects.create(
565 target = target_obj, 568 target = target_obj,
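The fix turns a hard crash on a missing parent directory row into a None parent, matching how the surrounding code already treats unresolvable file targets. A slightly more compact Django idiom with the same effect (a sketch reusing the names from this hunk; filter().first() returns None instead of raising DoesNotExist):

    parent_obj = Target_File.objects.filter(
        target=target_obj, path=parent_path,
        inodetype=Target_File.ITYPE_DIRECTORY).first()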
diff --git a/bitbake/lib/bblayers/action.py b/bitbake/lib/bblayers/action.py
index a8f2699335..a14f19948e 100644
--- a/bitbake/lib/bblayers/action.py
+++ b/bitbake/lib/bblayers/action.py
@@ -50,8 +50,8 @@ class ActionPlugin(LayerPlugin):
50 50
51 try: 51 try:
52 notadded, _ = bb.utils.edit_bblayers_conf(bblayers_conf, layerdirs, None) 52 notadded, _ = bb.utils.edit_bblayers_conf(bblayers_conf, layerdirs, None)
53 self.tinfoil.modified_files()
54 if not (args.force or notadded): 53 if not (args.force or notadded):
54 self.tinfoil.modified_files()
55 try: 55 try:
56 self.tinfoil.run_command('parseConfiguration') 56 self.tinfoil.run_command('parseConfiguration')
57 except (bb.tinfoil.TinfoilUIException, bb.BBHandledException): 57 except (bb.tinfoil.TinfoilUIException, bb.BBHandledException):
@@ -83,6 +83,8 @@ class ActionPlugin(LayerPlugin):
83 layerdir = os.path.abspath(item) 83 layerdir = os.path.abspath(item)
84 layerdirs.append(layerdir) 84 layerdirs.append(layerdir)
85 (_, notremoved) = bb.utils.edit_bblayers_conf(bblayers_conf, None, layerdirs) 85 (_, notremoved) = bb.utils.edit_bblayers_conf(bblayers_conf, None, layerdirs)
86 if args.force > 1:
87 return 0
86 self.tinfoil.modified_files() 88 self.tinfoil.modified_files()
87 if notremoved: 89 if notremoved:
88 for item in notremoved: 90 for item in notremoved:
diff --git a/bitbake/lib/bs4/AUTHORS b/bitbake/lib/bs4/AUTHORS
new file mode 100644
index 0000000000..1f14fe07de
--- /dev/null
+++ b/bitbake/lib/bs4/AUTHORS
@@ -0,0 +1,49 @@
1Behold, mortal, the origins of Beautiful Soup...
2================================================
3
4Leonard Richardson is the primary maintainer.
5
6Aaron DeVore and Isaac Muse have made significant contributions to the
7code base.
8
9Mark Pilgrim provided the encoding detection code that forms the base
10of UnicodeDammit.
11
12Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
13Soup 4 working under Python 3.
14
15Simon Willison wrote soupselect, which was used to make Beautiful Soup
16support CSS selectors. Isaac Muse wrote SoupSieve, which made it
17possible to _remove_ the CSS selector code from Beautiful Soup.
18
19Sam Ruby helped with a lot of edge cases.
20
21Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
22work in solving the nestable tags conundrum.
23
24An incomplete list of people who have contributed patches to Beautiful
25Soup:
26
27 Istvan Albert, Andrew Lin, Anthony Baxter, Oliver Beattie, Andrew
28Boyko, Tony Chang, Francisco Canas, "Delong", Zephyr Fang, Fuzzy,
29Roman Gaufman, Yoni Gilad, Richie Hindle, Toshihiro Kamiya, Peteris
30Krumins, Kent Johnson, Marek Kapolka, Andreas Kostyrka, Roel Kramer,
31Ben Last, Robert Leftwich, Stefaan Lippens, "liquider", Staffan
32Malmgren, Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon",
33Ed Oskiewicz, Martijn Peters, Greg Phillips, Giles Radford, Stefano
34Revera, Arthur Rudolph, Marko Samastur, James Salter, Jouni Seppänen,
35Alexander Schmolck, Tim Shirley, Geoffrey Sneddon, Ville Skyttä,
36"Vikas", Jens Svalgaard, Andy Theyers, Eric Weiser, Glyn Webster, John
37Wiseman, Paul Wright, Danny Yoo
38
39An incomplete list of people who made suggestions or found bugs or
40found ways to break Beautiful Soup:
41
42 Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
43 Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
44 Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
45 warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
46 Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
47 Summers, Dennis Sutch, Chris Smith, Aaron Swartz, Stuart
48 Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
49 Sousa Rocha, Yichun Wei, Per Vognsen
diff --git a/bitbake/lib/bs4/AUTHORS.txt b/bitbake/lib/bs4/AUTHORS.txt
deleted file mode 100644
index 2ac8fcc8cc..0000000000
--- a/bitbake/lib/bs4/AUTHORS.txt
+++ /dev/null
@@ -1,43 +0,0 @@
1Behold, mortal, the origins of Beautiful Soup...
2================================================
3
4Leonard Richardson is the primary programmer.
5
6Aaron DeVore is awesome.
7
8Mark Pilgrim provided the encoding detection code that forms the base
9of UnicodeDammit.
10
11Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
12Soup 4 working under Python 3.
13
14Simon Willison wrote soupselect, which was used to make Beautiful Soup
15support CSS selectors.
16
17Sam Ruby helped with a lot of edge cases.
18
19Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
20work in solving the nestable tags conundrum.
21
22An incomplete list of people have contributed patches to Beautiful
23Soup:
24
25 Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
26 Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
27 Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
28 Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
29 Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
30 Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
31 Webster, Paul Wright, Danny Yoo
32
33An incomplete list of people who made suggestions or found bugs or
34found ways to break Beautiful Soup:
35
36 Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
37 Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
38 Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
39 warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
40 Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
41 Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
42 Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
43 Sousa Rocha, Yichun Wei, Per Vognsen
diff --git a/bitbake/lib/bs4/NEWS.txt b/bitbake/lib/bs4/CHANGELOG
index 88a60a2458..2701446a6d 100644
--- a/bitbake/lib/bs4/NEWS.txt
+++ b/bitbake/lib/bs4/CHANGELOG
@@ -1,3 +1,776 @@
1= 4.12.3 (20240117)
2
3* The Beautiful Soup documentation now has a Spanish translation, thanks
4 to Carlos Romero. Delong Wang's Chinese translation has been updated
5 to cover Beautiful Soup 4.12.0.
6
7* Fixed a regression such that if you set .hidden on a tag, the tag
8 becomes invisible but its contents are still visible. User manipulation
9 of .hidden is not a documented or supported feature, so don't do this,
10 but it wasn't too difficult to keep the old behavior working.
11
12* Fixed a case found by Mengyuhan where html.parser giving up on
13 markup would result in an AssertionError instead of a
14 ParserRejectedMarkup exception.
15
16* Added the correct stacklevel to instances of the XMLParsedAsHTMLWarning.
17 [bug=2034451]
18
19* Corrected the syntax of the license definition in pyproject.toml. Patch
20 by Louis Maddox. [bug=2032848]
21
22* Corrected a typo in a test that was causing test failures when run against
23 libxml2 2.12.1. [bug=2045481]
24
25= 4.12.2 (20230407)
26
27* Fixed an unhandled exception in BeautifulSoup.decode_contents
28 and methods that call it. [bug=2015545]
29
30= 4.12.1 (20230405)
31
32NOTE: the following things are likely to be dropped in the next
33feature release of Beautiful Soup:
34
35 Official support for Python 3.6.
36 Inclusion of unit tests and test data in the wheel file.
37 Two scripts: demonstrate_parser_differences.py and test-all-versions.
38
39Changes:
40
41* This version of Beautiful Soup replaces setup.py and setup.cfg
42 with pyproject.toml. Beautiful Soup now uses tox as its test backend
43 and hatch to do builds.
44
45* The main functional improvement in this version is a nonrecursive technique
46 for regenerating a tree. This technique is used to avoid situations where,
47 in previous versions, doing something to a very deeply nested tree
48 would overflow the Python interpreter stack:
49
50 1. Outputting a tree as a string, e.g. with
51 BeautifulSoup.encode() [bug=1471755]
52
53 2. Making copies of trees (copy.copy() and
54 copy.deepcopy() from the Python standard library). [bug=1709837]
55
56 3. Pickling a BeautifulSoup object. (Note that pickling a Tag
57 object can still cause an overflow.)
58
59* Making a copy of a BeautifulSoup object no longer parses the
60 document again, which should improve performance significantly.
61
62* When a BeautifulSoup object is unpickled, Beautiful Soup now
63 tries to associate an appropriate TreeBuilder object with it.
64
65* Tag.prettify() will now consistently end prettified markup with
66 a newline.
67
68* Added unit tests for fuzz test cases created by third
69 parties. Some of these tests are skipped since they point
70 to problems outside of Beautiful Soup, but this change
71 puts them all in one convenient place.
72
73* PageElement now implements the known_xml attribute. (This was technically
74 a bug, but it shouldn't be an issue in normal use.) [bug=2007895]
75
76* The demonstrate_parser_differences.py script was still written in
77 Python 2. I've converted it to Python 3, but since no one has
78 mentioned this over the years, it's a sign that no one uses this
79 script and it's not serving its purpose.
80
81= 4.12.0 (20230320)
82
83* Introduced the .css property, which centralizes all access to
84 the Soup Sieve API. This allows Beautiful Soup to give direct
85 access to as much of Soup Sieve that makes sense, without cluttering
86 the BeautifulSoup and Tag classes with a lot of new methods.
87
88 This does mean one addition to the BeautifulSoup and Tag classes
89 (the .css property itself), so this might be a breaking change if you
90 happen to use Beautiful Soup to parse XML that includes a tag called
91 <css>. In particular, code like this will stop working in 4.12.0:
92
93 soup.css['id']
94
95 Code like this will work just as before:
96
97 soup.select_one('css')['id']
98
99 The Soup Sieve methods supported through the .css property are
100 select(), select_one(), iselect(), closest(), match(), filter(),
101 escape(), and compile(). The BeautifulSoup and Tag classes still
102 support the select() and select_one() methods; they have not been
103 deprecated, but they have been demoted to convenience methods.
104
105 [bug=2003677]
106
107* When the html.parser parser decides it can't parse a document, Beautiful
108 Soup now consistently propagates this fact by raising a
109 ParserRejectedMarkup error. [bug=2007343]
110
111* Removed some error checking code from diagnose(), which is redundant with
112 similar (but more Pythonic) code in the BeautifulSoup constructor.
113 [bug=2007344]
114
115* Added intersphinx references to the documentation so that other
116 projects have a target to point to when they reference Beautiful
117 Soup classes. [bug=1453370]
118
119= 4.11.2 (20230131)
120
121* Fixed test failures caused by nondeterministic behavior of
122 UnicodeDammit's character detection, depending on the platform setup.
123 [bug=1973072]
124
125* Fixed another crash when overriding multi_valued_attributes and using the
126 html5lib parser. [bug=1948488]
127
128* The HTMLFormatter and XMLFormatter constructors no longer return a
129 value. [bug=1992693]
130
131* Tag.interesting_string_types is now propagated when a tag is
132 copied. [bug=1990400]
133
134* Warnings now do their best to provide an appropriate stacklevel,
135 improving the usefulness of the message. [bug=1978744]
136
137* Passing a Tag's .contents into PageElement.extend() now works the
138 same way as passing the Tag itself.
139
140* Soup Sieve tests will be skipped if the library is not installed.
141
142= 4.11.1 (20220408)
143
144This release was done to ensure that the unit tests are packaged along
145with the released source. There are no functionality changes in this
146release, but there are a few other packaging changes:
147
148* The Japanese and Korean translations of the documentation are included.
149* The changelog is now packaged as CHANGELOG, and the license file is
150 packaged as LICENSE. NEWS.txt and COPYING.txt are still present,
151 but may be removed in the future.
152* TODO.txt is no longer packaged, since a TODO is not relevant for released
153 code.
154
155= 4.11.0 (20220407)
156
157* Ported unit tests to use pytest.
158
159* Added special string classes, RubyParenthesisString and RubyTextString,
160 to make it possible to treat ruby text specially in get_text() calls.
161 [bug=1941980]
162
163* It's now possible to customize the way output is indented by
164 providing a value for the 'indent' argument to the Formatter
165 constructor. The 'indent' argument works very similarly to the
166 argument of the same name in the Python standard library's
167 json.dump() function. [bug=1955497]
168
169* If the charset-normalizer Python module
170 (https://pypi.org/project/charset-normalizer/) is installed, Beautiful
171 Soup will use it to detect the character sets of incoming documents.
172 This is also the module used by newer versions of the Requests library.
173 For the sake of backwards compatibility, chardet and cchardet both take
174 precedence if installed. [bug=1955346]
175
176* Added a workaround for an lxml bug
177 (https://bugs.launchpad.net/lxml/+bug/1948551) that causes
178 problems when parsing a Unicode string beginning with BYTE ORDER MARK.
179 [bug=1947768]
180
181* Issue a warning when an HTML parser is used to parse a document that
182 looks like XML but not XHTML. [bug=1939121]
183
184* Do a better job of keeping track of namespaces as an XML document is
185 parsed, so that CSS selectors that use namespaces will do the right
186 thing more often. [bug=1946243]
187
188* Some time ago, the misleadingly named "text" argument to find-type
189 methods was renamed to the more accurate "string." But this supposed
190 "renaming" didn't make it into important places like the method
191 signatures or the docstrings. That's corrected in this
192 version. "text" still works, but will give a DeprecationWarning.
193 [bug=1947038]
194
195* Fixed a crash when pickling a BeautifulSoup object that has no
196 tree builder. [bug=1934003]
197
198* Fixed a crash when overriding multi_valued_attributes and using the
199 html5lib parser. [bug=1948488]
200
201* Standardized the wording of the MarkupResemblesLocatorWarning
202 warnings to omit untrusted input and make the warnings less
203 judgmental about what you ought to be doing. [bug=1955450]
204
205* Removed support for the iconv_codec library, which doesn't seem
206 to exist anymore and was never put up on PyPI. (The closest
207 replacement on PyPI, iconv_codecs, is GPL-licensed, so we can't use
208 it--it's also quite old.)
209
210= 4.10.0 (20210907)
211
212* This is the first release of Beautiful Soup to only support Python
213 3. I dropped Python 2 support to maintain support for newer versions
214 (58 and up) of setuptools. See:
215 https://github.com/pypa/setuptools/issues/2769 [bug=1942919]
216
217* The behavior of methods like .get_text() and .strings now differs
218 depending on the type of tag. The change is visible with HTML tags
219 like <script>, <style>, and <template>. Starting in 4.9.0, methods
220 like get_text() returned no results on such tags, because the
221 contents of those tags are not considered 'text' within the document
222 as a whole.
223
224 But a user who calls script.get_text() is working from a different
225 definition of 'text' than a user who calls div.get_text()--otherwise
226 there would be no need to call script.get_text() at all. In 4.10.0,
227 the contents of (e.g.) a <script> tag are considered 'text' during a
228 get_text() call on the tag itself, but not considered 'text' during
229 a get_text() call on the tag's parent.
230
231 Because of this change, calling get_text() on each child of a tag
232 may now return a different result than calling get_text() on the tag
233 itself. That's because different tags now have different
234 understandings of what counts as 'text'. [bug=1906226] [bug=1868861]
235
236* NavigableString and its subclasses now implement the get_text()
237 method, as well as the properties .strings and
238 .stripped_strings. These methods will either return the string
239 itself, or nothing, so the only reason to use this is when iterating
240 over a list of mixed Tag and NavigableString objects. [bug=1904309]
241
242* The 'html5' formatter now treats attributes whose values are the
243 empty string as HTML boolean attributes. Previously (and in other
244 formatters), an attribute value must be set as None to be treated as
245 a boolean attribute. In a future release, I plan to also give this
246 behavior to the 'html' formatter. Patch by Isaac Muse. [bug=1915424]
247
248* The 'replace_with()' method now takes a variable number of arguments,
249 and can be used to replace a single element with a sequence of elements.
250 Patch by Bill Chandos. [rev=605]
251
252* Corrected output when the namespace prefix associated with a
253 namespaced attribute is the empty string, as opposed to
254 None. [bug=1915583]
255
256* Performance improvement when processing tags that speeds up overall
257 tree construction by 2%. Patch by Morotti. [bug=1899358]
258
259* Corrected the use of special string container classes in cases when a
260 single tag may contain strings with different containers; such as
261 the <template> tag, which may contain both TemplateString objects
262 and Comment objects. [bug=1913406]
263
264* The html.parser tree builder can now handle named entities
265 found in the HTML5 spec in much the same way that the html5lib
266 tree builder does. Note that the lxml HTML tree builder doesn't handle
267 named entities this way. [bug=1924908]
268
269* Added a second way to specify encodings to UnicodeDammit and
270 EncodingDetector, based on the order of precedence defined in the
271 HTML5 spec, starting at:
272 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
273
274 Encodings in 'known_definite_encodings' are tried first, then
275 byte-order-mark sniffing is run, then encodings in 'user_encodings'
276 are tried. The old argument, 'override_encodings', is now a
277 deprecated alias for 'known_definite_encodings'.
278
279 This changes the default behavior of the html.parser and lxml tree
280 builders, in a way that may slightly improve encoding
281 detection but will probably have no effect. [bug=1889014]
282
283* Improve the warning issued when a directory name (as opposed to
284 the name of a regular file) is passed as markup into the BeautifulSoup
285 constructor. [bug=1913628]
286
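A usage sketch for the two encoding tiers described in the 4.10.0 entry above, using the argument names it documents (known_definite_encodings is tried first, user_encodings after byte-order-mark sniffing):

    from bs4 import UnicodeDammit

    data = "Sacr\xe9 bleu!".encode("latin-1")   # not valid UTF-8
    dammit = UnicodeDammit(data,
                           known_definite_encodings=["utf-8"],
                           user_encodings=["latin-1"])
    print(dammit.unicode_markup, dammit.original_encoding)  # falls back to latin-1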
287= 4.9.3 (20201003)
288
289This is the final release of Beautiful Soup to support Python
2902. Beautiful Soup's official support for Python 2 ended on 01 January,
2912021. In the Launchpad Git repository, the final revision to support
292Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7; it is
293tagged as "python2".
294
295* Implemented a significant performance optimization to the process of
296 searching the parse tree. Patch by Morotti. [bug=1898212]
297
298= 4.9.2 (20200926)
299
300* Fixed a bug that caused too many tags to be popped from the tag
301 stack during tree building, when encountering a closing tag that had
302 no matching opening tag. [bug=1880420]
303
304* Fixed a bug that inconsistently moved elements over when passing
305 a Tag, rather than a list, into Tag.extend(). [bug=1885710]
306
307* Specify the soupsieve dependency in a way that complies with
308 PEP 508. Patch by Mike Nerone. [bug=1893696]
309
310* Change the signatures for BeautifulSoup.insert_before and insert_after
311 (which are not implemented) to match PageElement.insert_before and
312 insert_after, quieting warnings in some IDEs. [bug=1897120]
313
314= 4.9.1 (20200517)
315
316* Added a keyword argument 'on_duplicate_attribute' to the
317 BeautifulSoupHTMLParser constructor (used by the html.parser tree
318 builder) which lets you customize the handling of markup that
319 contains the same attribute more than once, as in:
320 <a href="url1" href="url2"> [bug=1878209]
321
322* Added a distinct subclass, GuessedAtParserWarning, for the warning
323 issued when BeautifulSoup is instantiated without a parser being
324 specified. [bug=1873787]
325
326* Added a distinct subclass, MarkupResemblesLocatorWarning, for the
327 warning issued when BeautifulSoup is instantiated with 'markup' that
328 actually seems to be a URL or the path to a file on
329 disk. [bug=1873787]
330
331* The new NavigableString subclasses (Stylesheet, Script, and
332 TemplateString) can now be imported directly from the bs4 package.
333
334* If you encode a document with a Python-specific encoding like
335 'unicode_escape', that encoding is no longer mentioned in the final
336 XML or HTML document. Instead, encoding information is omitted or
337 left blank. [bug=1874955]
338
339* Fixed test failures when run against soupsieve 2.0. Patch by Tomáš
340 Chvátal. [bug=1872279]
341
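A sketch of the 4.9.1 on_duplicate_attribute keyword described above, passed through the BeautifulSoup constructor to the html.parser builder ('ignore' keeps the first value seen; the default keeps the last):

    from bs4 import BeautifulSoup

    markup = '<a href="url1" href="url2">link</a>'
    soup = BeautifulSoup(markup, "html.parser", on_duplicate_attribute="ignore")
    print(soup.a["href"])  # url1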
342= 4.9.0 (20200405)
343
344* Added PageElement.decomposed, a new property which lets you
345 check whether you've already called decompose() on a Tag or
346 NavigableString.
347
348* Embedded CSS and Javascript is now stored in distinct Stylesheet and
349 Script tags, which are ignored by methods like get_text() since most
350 people don't consider this sort of content to be 'text'. This
351 feature is not supported by the html5lib treebuilder. [bug=1868861]
352
353* Added a Russian translation by 'authoress' to the repository.
354
355* Fixed an unhandled exception when formatting a Tag that had been
356 decomposed. [bug=1857767]
357
358* Fixed a bug that happened when passing a Unicode filename containing
359 non-ASCII characters as markup into Beautiful Soup, on a system that
360 allows Unicode filenames. [bug=1866717]
361
362* Added a performance optimization to PageElement.extract(). Patch by
363 Arthur Darcet.
364
365= 4.8.2 (20191224)
366
367* Added Python docstrings to all public methods of the most commonly
368 used classes.
369
370* Added a Chinese translation by Deron Wang and a Brazilian Portuguese
371 translation by Cezar Peixeiro to the repository.
372
373* Fixed two deprecation warnings. Patches by Colin
374 Watson and Nicholas Neumann. [bug=1847592] [bug=1855301]
375
376* The html.parser tree builder now correctly handles DOCTYPEs that are
377 not uppercase. [bug=1848401]
378
379* PageElement.select() now returns a ResultSet rather than a regular
380 list, making it consistent with methods like find_all().
381
382= 4.8.1 (20191006)
383
384* When the html.parser or html5lib parsers are in use, Beautiful Soup
385 will, by default, record the position in the original document where
386 each tag was encountered. This includes line number (Tag.sourceline)
387 and position within a line (Tag.sourcepos). Based on code by Chris
388 Mayo. [bug=1742921]
389
390* When instantiating a BeautifulSoup object, it's now possible to
391 provide a dictionary ('element_classes') of the classes you'd like to be
392 instantiated instead of Tag, NavigableString, etc.
393
394* Fixed the definition of the default XML namespace when using
395 lxml 4.4. Patch by Isaac Muse. [bug=1840141]
396
397* Fixed a crash when pretty-printing tags that were not created
398 during initial parsing. [bug=1838903]
399
400* Copying a Tag preserves information that was originally obtained from
401 the TreeBuilder used to build the original Tag. [bug=1838903]
402
403* Raise an explanatory exception when the underlying parser
404 completely rejects the incoming markup. [bug=1838877]
405
406* Avoid a crash when trying to detect the declared encoding of a
407 Unicode document. [bug=1838877]
408
409* Avoid a crash when unpickling certain parse trees generated
410 using html5lib on Python 3. [bug=1843545]
411
412= 4.8.0 (20190720, "One Small Soup")
413
414This release focuses on making it easier to customize Beautiful Soup's
415input mechanism (the TreeBuilder) and output mechanism (the Formatter).
416
417* You can customize the TreeBuilder object by passing keyword
418 arguments into the BeautifulSoup constructor. Those keyword
419 arguments will be passed along into the TreeBuilder constructor.
420
421 The main reason to do this right now is to change which
422 attributes are treated as multi-valued attributes (the way 'class'
423 is treated by default). You can do this with the
424 'multi_valued_attributes' argument. [bug=1832978]
425
426* The role of Formatter objects has been greatly expanded. The Formatter
427 class now controls the following:
428
429 - The function to call to perform entity substitution. (This was
430 previously Formatter's only job.)
431 - Which tags should be treated as containing CDATA and have their
432 contents exempt from entity substitution.
433 - The order in which a tag's attributes are output. [bug=1812422]
434 - Whether or not to put a '/' inside a void element, e.g. '<br/>' vs '<br>'
435
436 All preexisting code should work as before.
437
438* Added a new method to the API, Tag.smooth(), which consolidates
439 multiple adjacent NavigableString elements. [bug=1697296]
440
441* &apos; (which is valid in XML, XHTML, and HTML 5, but not HTML 4) is always
442 recognized as a named entity and converted to a single quote. [bug=1818721]
443
444= 4.7.1 (20190106)
445
446* Fixed a significant performance problem introduced in 4.7.0. [bug=1810617]
447
448* Fixed an incorrectly raised exception when inserting a tag before or
449 after an identical tag. [bug=1810692]
450
451* Beautiful Soup will no longer try to keep track of namespaces that
452 are not defined with a prefix; this can confuse soupsieve. [bug=1810680]
453
454* Tried even harder to avoid the deprecation warning originally fixed in
455 4.6.1. [bug=1778909]
456
457= 4.7.0 (20181231)
458
459* Beautiful Soup's CSS Selector implementation has been replaced by a
460 dependency on Isaac Muse's SoupSieve project (the soupsieve package
461 on PyPI). The good news is that SoupSieve has a much more robust and
462 complete implementation of CSS selectors, resolving a large number
463 of longstanding issues. The bad news is that from this point onward,
464 SoupSieve must be installed if you want to use the select() method.
465
466 You don't have to change anything if you installed Beautiful Soup
467 through pip (SoupSieve will be automatically installed when you
468 upgrade Beautiful Soup) or if you don't use CSS selectors from
469 within Beautiful Soup.
470
471 SoupSieve documentation: https://facelessuser.github.io/soupsieve/
472
473* Added the PageElement.extend() method, which works like list.append().
474 [bug=1514970]
475
476* PageElement.insert_before() and insert_after() now take a variable
477 number of arguments. [bug=1514970]
478
479* Fix a number of problems with the tree builder that caused
480 trees that were superficially okay, but which fell apart when bits
481 were extracted. Patch by Isaac Muse. [bug=1782928,1809910]
482
483* Fixed a problem with the tree builder in which elements that
484 contained no content (such as empty comments and all-whitespace
485 elements) were not being treated as part of the tree. Patch by Isaac
486 Muse. [bug=1798699]
487
488* Fixed a problem with multi-valued attributes where the value
489 contained whitespace. Thanks to Jens Svalgaard for the
490 fix. [bug=1787453]
491
492* Clarified ambiguous license statements in the source code. Beautiful
493 Soup is released under the MIT license, and has been since 4.4.0.
494
495* This file has been renamed from NEWS.txt to CHANGELOG.
496
497= 4.6.3 (20180812)
498
499* Exactly the same as 4.6.2. Re-released to make the README file
500 render properly on PyPI.
501
502= 4.6.2 (20180812)
503
504* Fix an exception when a custom formatter was asked to format a void
505 element. [bug=1784408]
506
507= 4.6.1 (20180728)
508
509* Stop data loss when encountering an empty numeric entity, and
510 possibly in other cases. Thanks to tos.kamiya for the fix. [bug=1698503]
511
512* Preserve XML namespaces introduced inside an XML document, not just
513 the ones introduced at the top level. [bug=1718787]
514
515* Added a new formatter, "html5", which represents void elements
516 as "<element>" rather than "<element/>". [bug=1716272]
517
518* Fixed a problem where the html.parser tree builder interpreted
519 a string like "&foo " as the character entity "&foo;" [bug=1728706]
520
521* Correctly handle invalid HTML numeric character entities like &#147;
522 which reference code points that are not Unicode code points. Note
523 that this is only fixed when Beautiful Soup is used with the
524 html.parser parser -- html5lib already worked and I couldn't fix it
525 with lxml. [bug=1782933]
526
527* Improved the warning given when no parser is specified. [bug=1780571]
528
529* When markup contains duplicate elements, a select() call that
530 includes multiple match clauses will match all relevant
531 elements. [bug=1770596]
532
533* Fixed code that was causing deprecation warnings in recent Python 3
534 versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496]
535
536* Fixed a Windows crash in diagnose() when checking whether a long
537 markup string is a filename. [bug=1737121]
538
539* Stopped HTMLParser from raising an exception in very rare cases of
540 bad markup. [bug=1708831]
541
542* Fixed a bug where find_all() was not working when asked to find a
543 tag with a namespaced name in an XML document that was parsed as
544 HTML. [bug=1723783]
545
546* You can get finer control over formatting by subclassing
547 bs4.element.Formatter and passing a Formatter instance into (e.g.)
548 encode(). [bug=1716272]
549
550* You can pass a dictionary of `attrs` into
551 BeautifulSoup.new_tag. This makes it possible to create a tag with
552 an attribute like 'name' that would otherwise be masked by another
553 argument of new_tag. [bug=1779276]
554
555* Clarified the deprecation warning when accessing tag.fooTag, to cover
556 the possibility that you might really have been looking for a tag
557 called 'fooTag'.
558
559= 4.6.0 (20170507) =
560
561* Added the `Tag.get_attribute_list` method, which acts like `Tag.get` for
562 getting the value of an attribute, but which always returns a list,
563 whether or not the attribute is a multi-value attribute. [bug=1678589]
564
565* It's now possible to use a tag's namespace prefix when searching,
566 e.g. soup.find('namespace:tag') [bug=1655332]
567
568* Improved the handling of empty-element tags like <br> when using the
569 html.parser parser. [bug=1676935]
570
571* HTML parsers treat all HTML4 and HTML5 empty element tags (aka void
572 element tags) correctly. [bug=1656909]
573
574* Namespace prefix is preserved when an XML tag is copied. Thanks
575 to Vikas for a patch and test. [bug=1685172]
576
577= 4.5.3 (20170102) =
578
579* Fixed foster parenting when html5lib is the tree builder. Thanks to
580 Geoffrey Sneddon for a patch and test.
581
582* Fixed yet another problem that caused the html5lib tree builder to
583 create a disconnected parse tree. [bug=1629825]
584
585= 4.5.2 (20170102) =
586
587* Apart from the version number, this release is identical to
588 4.5.3. Due to user error, it could not be completely uploaded to
589 PyPI. Use 4.5.3 instead.
590
591= 4.5.1 (20160802) =
592
593* Fixed a crash when passing Unicode markup that contained a
594 processing instruction into the lxml HTML parser on Python
595 3. [bug=1608048]
596
597= 4.5.0 (20160719) =
598
599* Beautiful Soup is no longer compatible with Python 2.6. This
600 actually happened a few releases ago, but it's now official.
601
602* Beautiful Soup will now work with versions of html5lib greater than
603 0.99999999. [bug=1603299]
604
605* If a search against each individual value of a multi-valued
606 attribute fails, the search will be run one final time against the
607 complete attribute value considered as a single string. That is, if
608 a tag has class="foo bar" and neither "foo" nor "bar" matches, but
609 "foo bar" does, the tag is now considered a match.
610
611 This happened in previous versions, but only when the value being
612 searched for was a string. Now it also works when that value is
613 a regular expression, a list of strings, etc. [bug=1476868]
614
615* Fixed a bug that deranged the tree when a whitespace element was
616 reparented into a tag that contained an identical whitespace
617 element. [bug=1505351]
618
619* Added support for CSS selector values that contain quoted spaces,
620 such as tag[style="display: foo"]. [bug=1540588]
621
622* Corrected handling of XML processing instructions. [bug=1504393]
623
624* Corrected an encoding error that happened when a BeautifulSoup
625 object was copied. [bug=1554439]
626
627* The contents of <textarea> tags will no longer be modified when the
628 tree is prettified. [bug=1555829]
629
630* When a BeautifulSoup object is pickled but its tree builder cannot
631 be pickled, its .builder attribute is set to None instead of being
632 destroyed. This avoids a performance problem once the object is
633 unpickled. [bug=1523629]
634
635* Specify the file and line number when warning about a
636 BeautifulSoup object being instantiated without a parser being
637 specified. [bug=1574647]
638
639* The `limit` argument to `select()` now works correctly, though it's
640 not implemented very efficiently. [bug=1520530]
641
642* Fixed a Python 3 ByteWarning when a URL was passed in as though it
643 were markup. Thanks to James Salter for a patch and
644 test. [bug=1533762]
645
646* We don't run the check for a filename passed in as markup if the
647 'filename' contains a less-than character; the less-than character
648 indicates it's most likely a very small document. [bug=1577864]
649
650= 4.4.1 (20150928) =
651
652* Fixed a bug that deranged the tree when part of it was
653 removed. Thanks to Eric Weiser for the patch and John Wiseman for a
654 test. [bug=1481520]
655
656* Fixed a parse bug with the html5lib tree-builder. Thanks to Roel
657 Kramer for the patch. [bug=1483781]
658
659* Improved the implementation of CSS selector grouping. Thanks to
660 Orangain for the patch. [bug=1484543]
661
662* Fixed the test_detect_utf8 test so that it works when chardet is
663 installed. [bug=1471359]
664
665* Corrected the output of Declaration objects. [bug=1477847]
666
667
668= 4.4.0 (20150703) =
669
670Especially important changes:
671
672* Added a warning when you instantiate a BeautifulSoup object without
673 explicitly naming a parser. [bug=1398866]
674
675* __repr__ now returns an ASCII bytestring in Python 2, and a Unicode
676 string in Python 3, instead of a UTF8-encoded bytestring in both
677 versions. In Python 3, __str__ now returns a Unicode string instead
678 of a bytestring. [bug=1420131]
679
680* The `text` argument to the find_* methods is now called `string`,
681 which is more accurate. `text` still works, but `string` is the
682 argument described in the documentation. `text` may eventually
683 change its meaning, but not for a very long time. [bug=1366856]
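
  For example, these two calls are equivalent:

      soup.find_all('p', text='hello')    # old spelling, still works
      soup.find_all('p', string='hello')  # documented spelling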
684
685* Changed the way soup objects work under copy.copy(). Copying a
686 NavigableString or a Tag will give you a new NavigableString that's
687 equal to the old one but not connected to the parse tree. Patch by
688  Martijn Pieters. [bug=1307490]
689
690* Started using a standard MIT license. [bug=1294662]
691
692* Added a Chinese translation of the documentation by Delong .w.
693
694New features:
695
696* Introduced the select_one() method, which uses a CSS selector but
697 only returns the first match, instead of a list of
698 matches. [bug=1349367]
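
  For example:

      soup.select_one('p.title')  # first matching Tag, or None
      soup.select('p.title')      # list of every match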
699
700* You can now create a Tag object without specifying a
701 TreeBuilder. Patch by Martijn Pieters. [bug=1307471]
702
703* You can now create a NavigableString or a subclass just by invoking
704 the constructor. [bug=1294315]
705
706* Added an `exclude_encodings` argument to UnicodeDammit and to the
707 Beautiful Soup constructor, which lets you prohibit the detection of
708 an encoding that you know is wrong. [bug=1469408]
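
  A hedged sketch, where `data` stands in for a bytestring whose encoding
  has to be guessed:

      soup = BeautifulSoup(data, 'html.parser',
                           exclude_encodings=['ISO-8859-7'])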
709
710* The select() method now supports selector grouping. Patch by
711  Francisco Canas. [bug=1191917]
712
713Bug fixes:
714
715* Fixed yet another problem that caused the html5lib tree builder to
716 create a disconnected parse tree. [bug=1237763]
717
718* Force object_was_parsed() to keep the tree intact even when an element
719 from later in the document is moved into place. [bug=1430633]
720
721* Fixed yet another bug that caused a disconnected tree when html5lib
722 copied an element from one part of the tree to another. [bug=1270611]
723
724* Fixed a bug where Element.extract() could create an infinite loop in
725 the remaining tree.
726
727* The select() method can now find tags whose names contain
728 dashes. Patch by Francisco Canas. [bug=1276211]
729
730* The select() method can now find tags with attributes whose names
731 contain dashes. Patch by Marek Kapolka. [bug=1304007]
732
733* Improved the lxml tree builder's handling of processing
734 instructions. [bug=1294645]
735
736* Restored the helpful syntax error that happens when you try to
737 import the Python 2 edition of Beautiful Soup under Python
738 3. [bug=1213387]
739
740* In Python 3.4 and above, set the new convert_charrefs argument to
741 the html.parser constructor to avoid a warning and future
742 failures. Patch by Stefano Revera. [bug=1375721]
743
744* The warning when you pass in a filename or URL as markup will now be
745 displayed correctly even if the filename or URL is a Unicode
746 string. [bug=1268888]
747
748* If the initial <html> tag contains a CDATA list attribute such as
749 'class', the html5lib tree builder will now turn its value into a
750 list, as it would with any other tag. [bug=1296481]
751
752* Fixed an import error in Python 3.5 caused by the removal of the
753 HTMLParseError class. [bug=1420063]
754
755* Improved docstring for encode_contents() and
756 decode_contents(). [bug=1441543]
757
758* Fixed a crash in Unicode, Dammit's encoding detector when the name
759 of the encoding itself contained invalid bytes. [bug=1360913]
760
761* Improved the exception raised when you call .unwrap() or
762 .replace_with() on an element that's not attached to a tree.
763
764* Raise a NotImplementedError whenever an unsupported CSS pseudoclass
765 is used in select(). Previously some cases did not result in a
766 NotImplementedError.
767
768* It's now possible to pickle a BeautifulSoup object no matter which
769 tree builder was used to create it. However, the only tree builder
770 that survives the pickling process is the HTMLParserTreeBuilder
771 ('html.parser'). If you unpickle a BeautifulSoup object created with
772 some other tree builder, soup.builder will be None. [bug=1231545]
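
  For example, as described here (note that the 4.12 code later in this
  patch instead restores a default html.parser builder on unpickling):

      import pickle
      soup = BeautifulSoup('<p>hi</p>', 'lxml')
      restored = pickle.loads(pickle.dumps(soup))
      restored.builder  # None, since lxml's builder can't be pickled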
773
1= 4.3.2 (20131002) = 774= 4.3.2 (20131002) =
2 775
3* Fixed a bug in which short Unicode input was improperly encoded to 776* Fixed a bug in which short Unicode input was improperly encoded to
@@ -331,7 +1104,7 @@
331* Renamed Tag.nsprefix to Tag.prefix, for consistency with 1104* Renamed Tag.nsprefix to Tag.prefix, for consistency with
332 NamespacedAttribute. 1105 NamespacedAttribute.
333 1106
334* Fixed a test failure that occured on Python 3.x when chardet was 1107* Fixed a test failure that occurred on Python 3.x when chardet was
335 installed. 1108 installed.
336 1109
337* Made prettify() return Unicode by default, so it will look nice on 1110* Made prettify() return Unicode by default, so it will look nice on
@@ -365,7 +1138,7 @@
365 1138
366* Restored compatibility with Python 2.6. 1139* Restored compatibility with Python 2.6.
367 1140
368* The install process no longer installs docs or auxillary text files. 1141* The install process no longer installs docs or auxiliary text files.
369 1142
370* It's now possible to deepcopy a BeautifulSoup object created with 1143* It's now possible to deepcopy a BeautifulSoup object created with
371 Python's built-in HTML parser. 1144 Python's built-in HTML parser.
@@ -604,7 +1377,7 @@ Added an import that makes BS work in Python 2.3.
604Fixed a UnicodeDecodeError when unpickling documents that contain 1377Fixed a UnicodeDecodeError when unpickling documents that contain
605non-ASCII characters. 1378non-ASCII characters.
606 1379
607Fixed a TypeError that occured in some circumstances when a tag 1380Fixed a TypeError that occurred in some circumstances when a tag
608contained no text. 1381contained no text.
609 1382
610Jump through hoops to avoid the use of chardet, which can be extremely 1383Jump through hoops to avoid the use of chardet, which can be extremely
diff --git a/bitbake/lib/bs4/COPYING.txt b/bitbake/lib/bs4/LICENSE
index d668d13f04..08e3a9cf8c 100644
--- a/bitbake/lib/bs4/COPYING.txt
+++ b/bitbake/lib/bs4/LICENSE
@@ -1,6 +1,6 @@
1Beautiful Soup is made available under the MIT license: 1Beautiful Soup is made available under the MIT license:
2 2
3 Copyright (c) 2004-2012 Leonard Richardson 3 Copyright (c) Leonard Richardson
4 4
5 Permission is hereby granted, free of charge, to any person obtaining 5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the 6 a copy of this software and associated documentation files (the
@@ -20,7 +20,12 @@ Beautiful Soup is made available under the MIT license:
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE, DAMMIT. 23 SOFTWARE.
24 24
25Beautiful Soup incorporates code from the html5lib library, which is 25Beautiful Soup incorporates code from the html5lib library, which is
26also made available under the MIT license. 26also made available under the MIT license. Copyright (c) James Graham
27and other contributors
28
29Beautiful Soup has an optional dependency on the soupsieve library,
30which is also made available under the MIT license. Copyright (c)
31Isaac Muse
diff --git a/bitbake/lib/bs4/__init__.py b/bitbake/lib/bs4/__init__.py
index e35725b86e..d8ad5e1dc1 100644
--- a/bitbake/lib/bs4/__init__.py
+++ b/bitbake/lib/bs4/__init__.py
@@ -1,65 +1,99 @@
1"""Beautiful Soup 1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
2Elixir and Tonic 2
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/ 3http://www.crummy.com/software/BeautifulSoup/
5 4
6Beautiful Soup uses a pluggable XML or HTML parser to parse a 5Beautiful Soup uses a pluggable XML or HTML parser to parse a
7(possibly invalid) document into a tree representation. Beautiful Soup 6(possibly invalid) document into a tree representation. Beautiful Soup
8provides provides methods and Pythonic idioms that make it easy to 7provides methods and Pythonic idioms that make it easy to navigate,
9navigate, search, and modify the parse tree. 8search, and modify the parse tree.
10 9
11Beautiful Soup works with Python 2.6 and up. It works better if lxml 10Beautiful Soup works with Python 3.6 and up. It works better if lxml
12and/or html5lib is installed. 11and/or html5lib is installed.
13 12
14For more than you ever wanted to know about Beautiful Soup, see the 13For more than you ever wanted to know about Beautiful Soup, see the
15documentation: 14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
16http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17""" 15"""
18 16
19__author__ = "Leonard Richardson (leonardr@segfault.org)" 17__author__ = "Leonard Richardson (leonardr@segfault.org)"
20__version__ = "4.4.1" 18__version__ = "4.12.3"
21__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" 19__copyright__ = "Copyright (c) 2004-2024 Leonard Richardson"
20# Use of this source code is governed by the MIT license.
22__license__ = "MIT" 21__license__ = "MIT"
23 22
24__all__ = ['BeautifulSoup'] 23__all__ = ['BeautifulSoup']
25 24
25from collections import Counter
26import os 26import os
27import re 27import re
28import sys
29import traceback
28import warnings 30import warnings
29 31
30from .builder import builder_registry, ParserRejectedMarkup 32# The very first thing we do is give a useful error if someone is
33# running this code under Python 2.
34if sys.version_info.major < 3:
35 raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
36
37from .builder import (
38 builder_registry,
39 ParserRejectedMarkup,
40 XMLParsedAsHTMLWarning,
41 HTMLParserTreeBuilder
42)
31from .dammit import UnicodeDammit 43from .dammit import UnicodeDammit
32from .element import ( 44from .element import (
33 CData, 45 CData,
34 Comment, 46 Comment,
47 CSS,
35 DEFAULT_OUTPUT_ENCODING, 48 DEFAULT_OUTPUT_ENCODING,
36 Declaration, 49 Declaration,
37 Doctype, 50 Doctype,
38 NavigableString, 51 NavigableString,
39 PageElement, 52 PageElement,
40 ProcessingInstruction, 53 ProcessingInstruction,
54 PYTHON_SPECIFIC_ENCODINGS,
41 ResultSet, 55 ResultSet,
56 Script,
57 Stylesheet,
42 SoupStrainer, 58 SoupStrainer,
43 Tag, 59 Tag,
60 TemplateString,
44 ) 61 )
45 62
46# The very first thing we do is give a useful error if someone is 63# Define some custom warnings.
47# running this code under Python 3 without converting it. 64class GuessedAtParserWarning(UserWarning):
48'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 65 """The warning issued when BeautifulSoup has to guess what parser to
66 use -- probably because no parser was specified in the constructor.
67 """
49 68
50class BeautifulSoup(Tag): 69class MarkupResemblesLocatorWarning(UserWarning):
70 """The warning issued when BeautifulSoup is given 'markup' that
71 actually looks like a resource locator -- a URL or a path to a file
72 on disk.
51 """ 73 """
52 This class defines the basic interface called by the tree builders.
53 74
54 These methods will be called by the parser: 75
55 reset() 76class BeautifulSoup(Tag):
56 feed(markup) 77 """A data structure representing a parsed HTML or XML document.
78
79 Most of the methods you'll call on a BeautifulSoup object are inherited from
80 PageElement or Tag.
81
82 Internally, this class defines the basic interface called by the
83 tree builders when converting an HTML/XML document into a data
84 structure. The interface abstracts away the differences between
85 parsers. To write a new tree builder, you'll need to understand
86 these methods as a whole.
87
88 These methods will be called by the BeautifulSoup constructor:
89 * reset()
90 * feed(markup)
57 91
58 The tree builder may call these methods from its feed() implementation: 92 The tree builder may call these methods from its feed() implementation:
59 handle_starttag(name, attrs) # See note about return value 93 * handle_starttag(name, attrs) # See note about return value
60 handle_endtag(name) 94 * handle_endtag(name)
61 handle_data(data) # Appends to the current data node 95 * handle_data(data) # Appends to the current data node
62 endData(containerClass=NavigableString) # Ends the current data node 96 * endData(containerClass) # Ends the current data node
63 97
64 No matter how complicated the underlying parser is, you should be 98 No matter how complicated the underlying parser is, you should be
65 able to build a tree using 'start tag' events, 'end tag' events, 99 able to build a tree using 'start tag' events, 'end tag' events,
@@ -69,24 +103,77 @@ class BeautifulSoup(Tag):
69 like HTML's <br> tag), call handle_starttag and then 103 like HTML's <br> tag), call handle_starttag and then
70 handle_endtag. 104 handle_endtag.
71 """ 105 """
106
107 # Since BeautifulSoup subclasses Tag, it's possible to treat it as
108 # a Tag with a .name. This name makes it clear the BeautifulSoup
109 # object isn't a real markup tag.
72 ROOT_TAG_NAME = '[document]' 110 ROOT_TAG_NAME = '[document]'
73 111
74 # If the end-user gives no indication which tree builder they 112 # If the end-user gives no indication which tree builder they
75 # want, look for one with these features. 113 # want, look for one with these features.
76 DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 114 DEFAULT_BUILDER_FEATURES = ['html', 'fast']
77 115
116 # A string containing all ASCII whitespace characters, used in
117 # endData() to detect data chunks that seem 'empty'.
78 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 118 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
79 119
80 NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" 120 NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
81 121
82 def __init__(self, markup="", features=None, builder=None, 122 def __init__(self, markup="", features=None, builder=None,
83 parse_only=None, from_encoding=None, exclude_encodings=None, 123 parse_only=None, from_encoding=None, exclude_encodings=None,
84 **kwargs): 124 element_classes=None, **kwargs):
85 """The Soup object is initialized as the 'root tag', and the 125 """Constructor.
86 provided markup (which can be a string or a file-like object) 126
87 is fed into the underlying parser.""" 127 :param markup: A string or a file-like object representing
88 128 markup to be parsed.
129
130 :param features: Desirable features of the parser to be
131 used. This may be the name of a specific parser ("lxml",
132 "lxml-xml", "html.parser", or "html5lib") or it may be the
133 type of markup to be used ("html", "html5", "xml"). It's
134 recommended that you name a specific parser, so that
135 Beautiful Soup gives you the same results across platforms
136 and virtual environments.
137
138 :param builder: A TreeBuilder subclass to instantiate (or
139 instance to use) instead of looking one up based on
140 `features`. You only need to use this if you've implemented a
141 custom TreeBuilder.
142
143 :param parse_only: A SoupStrainer. Only parts of the document
144 matching the SoupStrainer will be considered. This is useful
145 when parsing part of a document that would otherwise be too
146 large to fit into memory.
147
148 :param from_encoding: A string indicating the encoding of the
149 document to be parsed. Pass this in if Beautiful Soup is
150 guessing wrongly about the document's encoding.
151
152 :param exclude_encodings: A list of strings indicating
153 encodings known to be wrong. Pass this in if you don't know
154 the document's encoding but you know Beautiful Soup's guess is
155 wrong.
156
157 :param element_classes: A dictionary mapping BeautifulSoup
158 classes like Tag and NavigableString, to other classes you'd
159 like to be instantiated instead as the parse tree is
160 built. This is useful for subclassing Tag or NavigableString
161 to modify default behavior.
162
163 :param kwargs: For backwards compatibility purposes, the
164 constructor accepts certain keyword arguments used in
165 Beautiful Soup 3. None of these arguments do anything in
166 Beautiful Soup 4; they will result in a warning and then be
167 ignored.
168
169 Apart from this, any keyword arguments passed into the
170 BeautifulSoup constructor are propagated to the TreeBuilder
171 constructor. This makes it possible to configure a
172 TreeBuilder by passing in arguments, not just by saying which
173 one to use.
174 """
89 if 'convertEntities' in kwargs: 175 if 'convertEntities' in kwargs:
176 del kwargs['convertEntities']
90 warnings.warn( 177 warnings.warn(
91 "BS4 does not respect the convertEntities argument to the " 178 "BS4 does not respect the convertEntities argument to the "
92 "BeautifulSoup constructor. Entities are always converted " 179 "BeautifulSoup constructor. Entities are always converted "
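
# A hedged usage sketch for the `element_classes` argument documented in the
# constructor docstring above; the subclass name is illustrative, not part of
# this patch:
#
#     from bs4 import BeautifulSoup
#     from bs4.element import NavigableString
#
#     class TrackedString(NavigableString):
#         pass
#
#     soup = BeautifulSoup("<p>hi</p>", "html.parser",
#                          element_classes={NavigableString: TrackedString})
#     type(soup.p.string)  # TrackedString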
@@ -125,10 +212,10 @@ class BeautifulSoup(Tag):
125 if old_name in kwargs: 212 if old_name in kwargs:
126 warnings.warn( 213 warnings.warn(
127 'The "%s" argument to the BeautifulSoup constructor ' 214 'The "%s" argument to the BeautifulSoup constructor '
128 'has been renamed to "%s."' % (old_name, new_name)) 215 'has been renamed to "%s."' % (old_name, new_name),
129 value = kwargs[old_name] 216 DeprecationWarning, stacklevel=3
130 del kwargs[old_name] 217 )
131 return value 218 return kwargs.pop(old_name)
132 return None 219 return None
133 220
134 parse_only = parse_only or deprecated_argument( 221 parse_only = parse_only or deprecated_argument(
@@ -137,13 +224,23 @@ class BeautifulSoup(Tag):
137 from_encoding = from_encoding or deprecated_argument( 224 from_encoding = from_encoding or deprecated_argument(
138 "fromEncoding", "from_encoding") 225 "fromEncoding", "from_encoding")
139 226
140 if len(kwargs) > 0: 227 if from_encoding and isinstance(markup, str):
141 arg = list(kwargs.keys()).pop() 228 warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
142 raise TypeError( 229 from_encoding = None
143 "__init__() got an unexpected keyword argument '%s'" % arg) 230
144 231 self.element_classes = element_classes or dict()
145 if builder is None: 232
146 original_features = features 233 # We need this information to track whether or not the builder
234 # was specified well enough that we can omit the 'you need to
235 # specify a parser' warning.
236 original_builder = builder
237 original_features = features
238
239 if isinstance(builder, type):
240 # A builder class was passed in; it needs to be instantiated.
241 builder_class = builder
242 builder = None
243 elif builder is None:
147 if isinstance(features, str): 244 if isinstance(features, str):
148 features = [features] 245 features = [features]
149 if features is None or len(features) == 0: 246 if features is None or len(features) == 0:
@@ -154,85 +251,227 @@ class BeautifulSoup(Tag):
154 "Couldn't find a tree builder with the features you " 251 "Couldn't find a tree builder with the features you "
155 "requested: %s. Do you need to install a parser library?" 252 "requested: %s. Do you need to install a parser library?"
156 % ",".join(features)) 253 % ",".join(features))
157 builder = builder_class() 254
158 if not (original_features == builder.NAME or 255 # At this point either we have a TreeBuilder instance in
159 original_features in builder.ALTERNATE_NAMES): 256 # builder, or we have a builder_class that we can instantiate
257 # with the remaining **kwargs.
258 if builder is None:
259 builder = builder_class(**kwargs)
260 if not original_builder and not (
261 original_features == builder.NAME or
262 original_features in builder.ALTERNATE_NAMES
263 ) and markup:
264 # The user did not tell us which TreeBuilder to use,
265 # and we had to guess. Issue a warning.
160 if builder.is_xml: 266 if builder.is_xml:
161 markup_type = "XML" 267 markup_type = "XML"
162 else: 268 else:
163 markup_type = "HTML" 269 markup_type = "HTML"
164 warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
165 parser=builder.NAME,
166 markup_type=markup_type))
167 270
271 # This code adapted from warnings.py so that we get the same line
272 # of code as our warnings.warn() call gets, even if the answer is wrong
273 # (as it may be in a multithreading situation).
274 caller = None
275 try:
276 caller = sys._getframe(1)
277 except ValueError:
278 pass
279 if caller:
280 globals = caller.f_globals
281 line_number = caller.f_lineno
282 else:
283 globals = sys.__dict__
284 line_number = 1
285 filename = globals.get('__file__')
286 if filename:
287 fnl = filename.lower()
288 if fnl.endswith((".pyc", ".pyo")):
289 filename = filename[:-1]
290 if filename:
291 # If there is no filename at all, the user is most likely in a REPL,
292 # and the warning is not necessary.
293 values = dict(
294 filename=filename,
295 line_number=line_number,
296 parser=builder.NAME,
297 markup_type=markup_type
298 )
299 warnings.warn(
300 self.NO_PARSER_SPECIFIED_WARNING % values,
301 GuessedAtParserWarning, stacklevel=2
302 )
303 else:
304 if kwargs:
305 warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
306
168 self.builder = builder 307 self.builder = builder
169 self.is_xml = builder.is_xml 308 self.is_xml = builder.is_xml
170 self.builder.soup = self 309 self.known_xml = self.is_xml
171 310 self._namespaces = dict()
172 self.parse_only = parse_only 311 self.parse_only = parse_only
173 312
174 if hasattr(markup, 'read'): # It's a file-type object. 313 if hasattr(markup, 'read'): # It's a file-type object.
175 markup = markup.read() 314 markup = markup.read()
176 elif len(markup) <= 256: 315 elif len(markup) <= 256 and (
177 # Print out warnings for a couple beginner problems 316 (isinstance(markup, bytes) and not b'<' in markup)
317 or (isinstance(markup, str) and not '<' in markup)
318 ):
319 # Issue warnings for a couple beginner problems
178 # involving passing non-markup to Beautiful Soup. 320 # involving passing non-markup to Beautiful Soup.
179 # Beautiful Soup will still parse the input as markup, 321 # Beautiful Soup will still parse the input as markup,
180 # just in case that's what the user really wants. 322 # since that is sometimes the intended behavior.
181 if (isinstance(markup, str) 323 if not self._markup_is_url(markup):
182 and not os.path.supports_unicode_filenames): 324 self._markup_resembles_filename(markup)
183 possible_filename = markup.encode("utf8")
184 else:
185 possible_filename = markup
186 is_file = False
187 try:
188 is_file = os.path.exists(possible_filename)
189 except Exception as e:
190 # This is almost certainly a problem involving
191 # characters not valid in filenames on this
192 # system. Just let it go.
193 pass
194 if is_file:
195 if isinstance(markup, str):
196 markup = markup.encode("utf8")
197 warnings.warn(
198 '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
199 if markup[:5] == "http:" or markup[:6] == "https:":
200 # TODO: This is ugly but I couldn't get it to work in
201 # Python 3 otherwise.
202 if ((isinstance(markup, bytes) and not b' ' in markup)
203 or (isinstance(markup, str) and not ' ' in markup)):
204 if isinstance(markup, str):
205 markup = markup.encode("utf8")
206 warnings.warn(
207 '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
208 325
326 rejections = []
327 success = False
209 for (self.markup, self.original_encoding, self.declared_html_encoding, 328 for (self.markup, self.original_encoding, self.declared_html_encoding,
210 self.contains_replacement_characters) in ( 329 self.contains_replacement_characters) in (
211 self.builder.prepare_markup( 330 self.builder.prepare_markup(
212 markup, from_encoding, exclude_encodings=exclude_encodings)): 331 markup, from_encoding, exclude_encodings=exclude_encodings)):
213 self.reset() 332 self.reset()
333 self.builder.initialize_soup(self)
214 try: 334 try:
215 self._feed() 335 self._feed()
336 success = True
216 break 337 break
217 except ParserRejectedMarkup: 338 except ParserRejectedMarkup as e:
339 rejections.append(e)
218 pass 340 pass
219 341
342 if not success:
343 other_exceptions = [str(e) for e in rejections]
344 raise ParserRejectedMarkup(
345 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
346 )
347
220 # Clear out the markup and remove the builder's circular 348 # Clear out the markup and remove the builder's circular
221 # reference to this object. 349 # reference to this object.
222 self.markup = None 350 self.markup = None
223 self.builder.soup = None 351 self.builder.soup = None
224 352
225 def __copy__(self): 353 def _clone(self):
226 return type(self)(self.encode(), builder=self.builder) 354 """Create a new BeautifulSoup object with the same TreeBuilder,
355 but not associated with any markup.
356
357 This is the first step of the deepcopy process.
358 """
359 clone = type(self)("", None, self.builder)
227 360
361 # Keep track of the encoding of the original document,
362 # since we won't be parsing it again.
363 clone.original_encoding = self.original_encoding
364 return clone
365
228 def __getstate__(self): 366 def __getstate__(self):
229 # Frequently a tree builder can't be pickled. 367 # Frequently a tree builder can't be pickled.
230 d = dict(self.__dict__) 368 d = dict(self.__dict__)
231 if 'builder' in d and not self.builder.picklable: 369 if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
232 del d['builder'] 370 d['builder'] = type(self.builder)
371 # Store the contents as a Unicode string.
372 d['contents'] = []
373 d['markup'] = self.decode()
374
375 # If _most_recent_element is present, it's a Tag object left
376 # over from initial parse. It might not be picklable and we
377 # don't need it.
378 if '_most_recent_element' in d:
379 del d['_most_recent_element']
233 return d 380 return d
234 381
382 def __setstate__(self, state):
383 # If necessary, restore the TreeBuilder by looking it up.
384 self.__dict__ = state
385 if isinstance(self.builder, type):
386 self.builder = self.builder()
387 elif not self.builder:
388 # We don't know which builder was used to build this
389 # parse tree, so use a default we know is always available.
390 self.builder = HTMLParserTreeBuilder()
391 self.builder.soup = self
392 self.reset()
393 self._feed()
394 return state
395
396
397 @classmethod
398 def _decode_markup(cls, markup):
399 """Ensure `markup` is a Unicode string so it's safe to send into warnings.warn.
400
401 TODO: warnings.warn had this problem back in 2010 but it might not
402 anymore.
403 """
404 if isinstance(markup, bytes):
405 decoded = markup.decode('utf-8', 'replace')
406 else:
407 decoded = markup
408 return decoded
409
410 @classmethod
411 def _markup_is_url(cls, markup):
412 """Error-handling method to raise a warning if incoming markup looks
413 like a URL.
414
415 :param markup: A string.
416 :return: Whether or not the markup resembles a URL
417 closely enough to justify a warning.
418 """
419 if isinstance(markup, bytes):
420 space = b' '
421 cant_start_with = (b"http:", b"https:")
422 elif isinstance(markup, str):
423 space = ' '
424 cant_start_with = ("http:", "https:")
425 else:
426 return False
427
428 if any(markup.startswith(prefix) for prefix in cant_start_with):
429 if not space in markup:
430 warnings.warn(
431 'The input looks more like a URL than markup. You may want to use'
432 ' an HTTP client like requests to get the document behind'
433 ' the URL, and feed that document to Beautiful Soup.',
434 MarkupResemblesLocatorWarning,
435 stacklevel=3
436 )
437 return True
438 return False
439
440 @classmethod
441 def _markup_resembles_filename(cls, markup):
442 """Error-handling method to raise a warning if incoming markup
443 resembles a filename.
444
445 :param markup: A bytestring or string.
446 :return: Whether or not the markup resembles a filename
447 closely enough to justify a warning.
448 """
449 path_characters = '/\\'
450 extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
451 if isinstance(markup, bytes):
452 path_characters = path_characters.encode("utf8")
453 extensions = [x.encode('utf8') for x in extensions]
454 filelike = False
455 if any(x in markup for x in path_characters):
456 filelike = True
457 else:
458 lower = markup.lower()
459 if any(lower.endswith(ext) for ext in extensions):
460 filelike = True
461 if filelike:
462 warnings.warn(
463 'The input looks more like a filename than markup. You may'
464 ' want to open this file and pass the filehandle into'
465 ' Beautiful Soup.',
466 MarkupResemblesLocatorWarning, stacklevel=3
467 )
468 return True
469 return False
470
235 def _feed(self): 471 def _feed(self):
472 """Internal method that parses previously set markup, creating a large
473 number of Tag and NavigableString objects.
474 """
236 # Convert the document to Unicode. 475 # Convert the document to Unicode.
237 self.builder.reset() 476 self.builder.reset()
238 477
@@ -243,48 +482,111 @@ class BeautifulSoup(Tag):
243 self.popTag() 482 self.popTag()
244 483
245 def reset(self): 484 def reset(self):
485 """Reset this object to a state as though it had never parsed any
486 markup.
487 """
246 Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 488 Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
247 self.hidden = 1 489 self.hidden = 1
248 self.builder.reset() 490 self.builder.reset()
249 self.current_data = [] 491 self.current_data = []
250 self.currentTag = None 492 self.currentTag = None
251 self.tagStack = [] 493 self.tagStack = []
494 self.open_tag_counter = Counter()
252 self.preserve_whitespace_tag_stack = [] 495 self.preserve_whitespace_tag_stack = []
496 self.string_container_stack = []
497 self._most_recent_element = None
253 self.pushTag(self) 498 self.pushTag(self)
254 499
255 def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 500 def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
256 """Create a new tag associated with this soup.""" 501 sourceline=None, sourcepos=None, **kwattrs):
257 return Tag(None, self.builder, name, namespace, nsprefix, attrs) 502 """Create a new Tag associated with this BeautifulSoup object.
503
504 :param name: The name of the new Tag.
505 :param namespace: The URI of the new Tag's XML namespace, if any.
506 :param nsprefix: The prefix for the new Tag's XML namespace, if any.
507 :param attrs: A dictionary of this Tag's attribute values; can
508 be used instead of `kwattrs` for attributes like 'class'
509 that are reserved words in Python.
510 :param sourceline: The line number where this tag was
511 (purportedly) found in its source document.
512 :param sourcepos: The character position within `sourceline` where this
513 tag was (purportedly) found.
514 :param kwattrs: Keyword arguments for the new Tag's attribute values.
258 515
259 def new_string(self, s, subclass=NavigableString): 516 """
260 """Create a new NavigableString associated with this soup.""" 517 kwattrs.update(attrs)
261 return subclass(s) 518 return self.element_classes.get(Tag, Tag)(
519 None, self.builder, name, namespace, nsprefix, kwattrs,
520 sourceline=sourceline, sourcepos=sourcepos
521 )
522
523 def string_container(self, base_class=None):
524 container = base_class or NavigableString
525
526 # There may be a general override of NavigableString.
527 container = self.element_classes.get(
528 container, container
529 )
530
531 # On top of that, we may be inside a tag that needs a special
532 # container class.
533 if self.string_container_stack and container is NavigableString:
534 container = self.builder.string_containers.get(
535 self.string_container_stack[-1].name, container
536 )
537 return container
538
539 def new_string(self, s, subclass=None):
540 """Create a new NavigableString associated with this BeautifulSoup
541 object.
542 """
543 container = self.string_container(subclass)
544 return container(s)
262 545
263 def insert_before(self, successor): 546 def insert_before(self, *args):
547 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
548 it because there is nothing before or after it in the parse tree.
549 """
264 raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 550 raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
265 551
266 def insert_after(self, successor): 552 def insert_after(self, *args):
553 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
554 it because there is nothing before or after it in the parse tree.
555 """
267 raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 556 raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
268 557
269 def popTag(self): 558 def popTag(self):
559 """Internal method called by _popToTag when a tag is closed."""
270 tag = self.tagStack.pop() 560 tag = self.tagStack.pop()
561 if tag.name in self.open_tag_counter:
562 self.open_tag_counter[tag.name] -= 1
271 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 563 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
272 self.preserve_whitespace_tag_stack.pop() 564 self.preserve_whitespace_tag_stack.pop()
273 #print "Pop", tag.name 565 if self.string_container_stack and tag == self.string_container_stack[-1]:
566 self.string_container_stack.pop()
567 #print("Pop", tag.name)
274 if self.tagStack: 568 if self.tagStack:
275 self.currentTag = self.tagStack[-1] 569 self.currentTag = self.tagStack[-1]
276 return self.currentTag 570 return self.currentTag
277 571
278 def pushTag(self, tag): 572 def pushTag(self, tag):
279 #print "Push", tag.name 573 """Internal method called by handle_starttag when a tag is opened."""
280 if self.currentTag: 574 #print("Push", tag.name)
575 if self.currentTag is not None:
281 self.currentTag.contents.append(tag) 576 self.currentTag.contents.append(tag)
282 self.tagStack.append(tag) 577 self.tagStack.append(tag)
283 self.currentTag = self.tagStack[-1] 578 self.currentTag = self.tagStack[-1]
579 if tag.name != self.ROOT_TAG_NAME:
580 self.open_tag_counter[tag.name] += 1
284 if tag.name in self.builder.preserve_whitespace_tags: 581 if tag.name in self.builder.preserve_whitespace_tags:
285 self.preserve_whitespace_tag_stack.append(tag) 582 self.preserve_whitespace_tag_stack.append(tag)
583 if tag.name in self.builder.string_containers:
584 self.string_container_stack.append(tag)
286 585
287 def endData(self, containerClass=NavigableString): 586 def endData(self, containerClass=None):
587 """Method called by the TreeBuilder when the end of a data segment
588 occurs.
589 """
288 if self.current_data: 590 if self.current_data:
289 current_data = ''.join(self.current_data) 591 current_data = ''.join(self.current_data)
290 # If whitespace is not preserved, and this string contains 592 # If whitespace is not preserved, and this string contains
@@ -311,61 +613,93 @@ class BeautifulSoup(Tag):
311 not self.parse_only.search(current_data)): 613 not self.parse_only.search(current_data)):
312 return 614 return
313 615
616 containerClass = self.string_container(containerClass)
314 o = containerClass(current_data) 617 o = containerClass(current_data)
315 self.object_was_parsed(o) 618 self.object_was_parsed(o)
316 619
317 def object_was_parsed(self, o, parent=None, most_recent_element=None): 620 def object_was_parsed(self, o, parent=None, most_recent_element=None):
318 """Add an object to the parse tree.""" 621 """Method called by the TreeBuilder to integrate an object into the parse tree."""
319 parent = parent or self.currentTag 622 if parent is None:
320 previous_element = most_recent_element or self._most_recent_element 623 parent = self.currentTag
624 if most_recent_element is not None:
625 previous_element = most_recent_element
626 else:
627 previous_element = self._most_recent_element
321 628
322 next_element = previous_sibling = next_sibling = None 629 next_element = previous_sibling = next_sibling = None
323 if isinstance(o, Tag): 630 if isinstance(o, Tag):
324 next_element = o.next_element 631 next_element = o.next_element
325 next_sibling = o.next_sibling 632 next_sibling = o.next_sibling
326 previous_sibling = o.previous_sibling 633 previous_sibling = o.previous_sibling
327 if not previous_element: 634 if previous_element is None:
328 previous_element = o.previous_element 635 previous_element = o.previous_element
329 636
637 fix = parent.next_element is not None
638
330 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) 639 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
331 640
332 self._most_recent_element = o 641 self._most_recent_element = o
333 parent.contents.append(o) 642 parent.contents.append(o)
334 643
335 if parent.next_sibling: 644 # Check if we are inserting into an already parsed node.
336 # This node is being inserted into an element that has 645 if fix:
337 # already been parsed. Deal with any dangling references. 646 self._linkage_fixer(parent)
338 index = parent.contents.index(o) 647
339 if index == 0: 648 def _linkage_fixer(self, el):
340 previous_element = parent 649 """Make sure linkage of this fragment is sound."""
341 previous_sibling = None 650
342 else: 651 first = el.contents[0]
343 previous_element = previous_sibling = parent.contents[index-1] 652 child = el.contents[-1]
344 if index == len(parent.contents)-1: 653 descendant = child
345 next_element = parent.next_sibling 654
346 next_sibling = None 655 if child is first and el.parent is not None:
347 else: 656 # Parent should be linked to first child
348 next_element = next_sibling = parent.contents[index+1] 657 el.next_element = child
349 658 # We are no longer linked to whatever this element is
350 o.previous_element = previous_element 659 prev_el = child.previous_element
351 if previous_element: 660 if prev_el is not None and prev_el is not el:
352 previous_element.next_element = o 661 prev_el.next_element = None
353 o.next_element = next_element 662 # First child should be linked to the parent, and no previous siblings.
354 if next_element: 663 child.previous_element = el
355 next_element.previous_element = o 664 child.previous_sibling = None
356 o.next_sibling = next_sibling 665
357 if next_sibling: 666 # We have no sibling as we've been appended as the last.
358 next_sibling.previous_sibling = o 667 child.next_sibling = None
359 o.previous_sibling = previous_sibling 668
360 if previous_sibling: 669 # This index is a tag, dig deeper for a "last descendant"
361 previous_sibling.next_sibling = o 670 if isinstance(child, Tag) and child.contents:
671 descendant = child._last_descendant(False)
672
673 # As the final step, link last descendant. It should be linked
674 # to the parent's next sibling (if found), else walk up the chain
675 # and find a parent with a sibling. It should have no next sibling.
676 descendant.next_element = None
677 descendant.next_sibling = None
678 target = el
679 while True:
680 if target is None:
681 break
682 elif target.next_sibling is not None:
683 descendant.next_element = target.next_sibling
684 target.next_sibling.previous_element = child
685 break
686 target = target.parent
362 687
363 def _popToTag(self, name, nsprefix=None, inclusivePop=True): 688 def _popToTag(self, name, nsprefix=None, inclusivePop=True):
364 """Pops the tag stack up to and including the most recent 689 """Pops the tag stack up to and including the most recent
365 instance of the given tag. If inclusivePop is false, pops the tag 690 instance of the given tag.
366 stack up to but *not* including the most recent instqance of 691
367 the given tag.""" 692 If there are no open tags with the given name, nothing will be
368 #print "Popping to %s" % name 693 popped.
694
695 :param name: Pop up to the most recent tag with this name.
696 :param nsprefix: The namespace prefix that goes with `name`.
697 :param inclusivePop: If this is false, pops the tag stack up
698 to but *not* including the most recent instance of the
699 given tag.
700
701 """
702 #print("Popping to %s" % name)
369 if name == self.ROOT_TAG_NAME: 703 if name == self.ROOT_TAG_NAME:
370 # The BeautifulSoup object itself can never be popped. 704 # The BeautifulSoup object itself can never be popped.
371 return 705 return
@@ -374,6 +708,8 @@ class BeautifulSoup(Tag):
374 708
375 stack_size = len(self.tagStack) 709 stack_size = len(self.tagStack)
376 for i in range(stack_size - 1, 0, -1): 710 for i in range(stack_size - 1, 0, -1):
711 if not self.open_tag_counter.get(name):
712 break
377 t = self.tagStack[i] 713 t = self.tagStack[i]
378 if (name == t.name and nsprefix == t.prefix): 714 if (name == t.name and nsprefix == t.prefix):
379 if inclusivePop: 715 if inclusivePop:
@@ -383,16 +719,26 @@ class BeautifulSoup(Tag):
383 719
384 return most_recently_popped 720 return most_recently_popped
385 721
386 def handle_starttag(self, name, namespace, nsprefix, attrs): 722 def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
387 """Push a start tag on to the stack. 723 sourcepos=None, namespaces=None):
388 724 """Called by the tree builder when a new tag is encountered.
389 If this method returns None, the tag was rejected by the 725
390 SoupStrainer. You should proceed as if the tag had not occured 726 :param name: Name of the tag.
727 :param nsprefix: Namespace prefix for the tag.
728 :param attrs: A dictionary of attribute values.
729 :param sourceline: The line number where this tag was found in its
730 source document.
731 :param sourcepos: The character position within `sourceline` where this
732 tag was found.
733 :param namespaces: A dictionary of all namespace prefix mappings
734 currently in scope in the document.
735
736 If this method returns None, the tag was rejected by an active
737 SoupStrainer. You should proceed as if the tag had not occurred
391 in the document. For instance, if this was a self-closing tag, 738 in the document. For instance, if this was a self-closing tag,
392 don't call handle_endtag. 739 don't call handle_endtag.
393 """ 740 """
394 741 # print("Start tag %s: %s" % (name, attrs))
395 # print "Start tag %s: %s" % (name, attrs)
396 self.endData() 742 self.endData()
397 743
398 if (self.parse_only and len(self.tagStack) <= 1 744 if (self.parse_only and len(self.tagStack) <= 1
@@ -400,34 +746,54 @@ class BeautifulSoup(Tag):
400 or not self.parse_only.search_tag(name, attrs))): 746 or not self.parse_only.search_tag(name, attrs))):
401 return None 747 return None
402 748
403 tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 749 tag = self.element_classes.get(Tag, Tag)(
404 self.currentTag, self._most_recent_element) 750 self, self.builder, name, namespace, nsprefix, attrs,
751 self.currentTag, self._most_recent_element,
752 sourceline=sourceline, sourcepos=sourcepos,
753 namespaces=namespaces
754 )
405 if tag is None: 755 if tag is None:
406 return tag 756 return tag
407 if self._most_recent_element: 757 if self._most_recent_element is not None:
408 self._most_recent_element.next_element = tag 758 self._most_recent_element.next_element = tag
409 self._most_recent_element = tag 759 self._most_recent_element = tag
410 self.pushTag(tag) 760 self.pushTag(tag)
411 return tag 761 return tag
412 762
413 def handle_endtag(self, name, nsprefix=None): 763 def handle_endtag(self, name, nsprefix=None):
414 #print "End tag: " + name 764 """Called by the tree builder when an ending tag is encountered.
765
766 :param name: Name of the tag.
767 :param nsprefix: Namespace prefix for the tag.
768 """
769 #print("End tag: " + name)
415 self.endData() 770 self.endData()
416 self._popToTag(name, nsprefix) 771 self._popToTag(name, nsprefix)
417 772
418 def handle_data(self, data): 773 def handle_data(self, data):
774 """Called by the tree builder when a chunk of textual data is encountered."""
419 self.current_data.append(data) 775 self.current_data.append(data)
420 776
421 def decode(self, pretty_print=False, 777 def decode(self, pretty_print=False,
422 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 778 eventual_encoding=DEFAULT_OUTPUT_ENCODING,
423 formatter="minimal"): 779 formatter="minimal", iterator=None):
424 """Returns a string or Unicode representation of this document. 780 """Returns a string or Unicode representation of the parse tree
425 To get Unicode, pass None for encoding.""" 781 as an HTML or XML document.
426 782
783 :param pretty_print: If this is True, indentation will be used to
784 make the document more readable.
785 :param eventual_encoding: The encoding of the final document.
786 If this is None, the document will be a Unicode string.
787 """
427 if self.is_xml: 788 if self.is_xml:
428 # Print the XML declaration 789 # Print the XML declaration
429 encoding_part = '' 790 encoding_part = ''
430 if eventual_encoding is not None: 791 if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
792 # This is a special Python encoding; it can't actually
793 # go into an XML document because it means nothing
794 # outside of Python.
795 eventual_encoding = None
796 if eventual_encoding != None:
431 encoding_part = ' encoding="%s"' % eventual_encoding 797 encoding_part = ' encoding="%s"' % eventual_encoding
432 prefix = '<?xml version="1.0"%s?>\n' % encoding_part 798 prefix = '<?xml version="1.0"%s?>\n' % encoding_part
433 else: 799 else:
@@ -437,9 +803,9 @@ class BeautifulSoup(Tag):
437 else: 803 else:
438 indent_level = 0 804 indent_level = 0
439 return prefix + super(BeautifulSoup, self).decode( 805 return prefix + super(BeautifulSoup, self).decode(
440 indent_level, eventual_encoding, formatter) 806 indent_level, eventual_encoding, formatter, iterator)
441 807
442# Alias to make it easier to type import: 'from bs4 import _soup' 808# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
443_s = BeautifulSoup 809_s = BeautifulSoup
444_soup = BeautifulSoup 810_soup = BeautifulSoup
445 811
@@ -450,19 +816,25 @@ class BeautifulStoneSoup(BeautifulSoup):
450 kwargs['features'] = 'xml' 816 kwargs['features'] = 'xml'
451 warnings.warn( 817 warnings.warn(
452 'The BeautifulStoneSoup class is deprecated. Instead of using ' 818 'The BeautifulStoneSoup class is deprecated. Instead of using '
453 'it, pass features="xml" into the BeautifulSoup constructor.') 819 'it, pass features="xml" into the BeautifulSoup constructor.',
820 DeprecationWarning, stacklevel=2
821 )
454 super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 822 super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455 823
456 824
457class StopParsing(Exception): 825class StopParsing(Exception):
826 """Exception raised by a TreeBuilder if it's unable to continue parsing."""
458 pass 827 pass
459 828
460class FeatureNotFound(ValueError): 829class FeatureNotFound(ValueError):
830 """Exception raised by the BeautifulSoup constructor if no parser with the
831 requested features is found.
832 """
461 pass 833 pass
462 834
463 835
464#By default, act as an HTML pretty-printer. 836#If this file is run as a script, act as an HTML pretty-printer.
465if __name__ == '__main__': 837if __name__ == '__main__':
466 import sys 838 import sys
467 soup = BeautifulSoup(sys.stdin) 839 soup = BeautifulSoup(sys.stdin)
468 print(soup.prettify()) 840 print(soup.prettify())
diff --git a/bitbake/lib/bs4/builder/__init__.py b/bitbake/lib/bs4/builder/__init__.py
index 6ccd4d23d6..ffb31fc25e 100644
--- a/bitbake/lib/bs4/builder/__init__.py
+++ b/bitbake/lib/bs4/builder/__init__.py
@@ -1,11 +1,21 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1from collections import defaultdict 4from collections import defaultdict
2import itertools 5import itertools
6import re
7import warnings
3import sys 8import sys
4from bs4.element import ( 9from bs4.element import (
5 CharsetMetaAttributeValue, 10 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue, 11 ContentMetaAttributeValue,
7 whitespace_re 12 RubyParenthesisString,
8 ) 13 RubyTextString,
14 Stylesheet,
15 Script,
16 TemplateString,
17 nonwhitespace_re
18)
9 19
10__all__ = [ 20__all__ = [
11 'HTMLTreeBuilder', 21 'HTMLTreeBuilder',
@@ -22,20 +32,41 @@ XML = 'xml'
22HTML = 'html' 32HTML = 'html'
23HTML_5 = 'html5' 33HTML_5 = 'html5'
24 34
35class XMLParsedAsHTMLWarning(UserWarning):
36 """The warning issued when an HTML parser is used to parse
37 XML that is not XHTML.
38 """
39 MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
40
25 41
26class TreeBuilderRegistry(object): 42class TreeBuilderRegistry(object):
27 43 """A way of looking up TreeBuilder subclasses by their name or by desired
44 features.
45 """
46
28 def __init__(self): 47 def __init__(self):
29 self.builders_for_feature = defaultdict(list) 48 self.builders_for_feature = defaultdict(list)
30 self.builders = [] 49 self.builders = []
31 50
32 def register(self, treebuilder_class): 51 def register(self, treebuilder_class):
33 """Register a treebuilder based on its advertised features.""" 52 """Register a treebuilder based on its advertised features.
53
54 :param treebuilder_class: A subclass of TreeBuilder. Its .features
55 attribute should list its features.
56 """
34 for feature in treebuilder_class.features: 57 for feature in treebuilder_class.features:
35 self.builders_for_feature[feature].insert(0, treebuilder_class) 58 self.builders_for_feature[feature].insert(0, treebuilder_class)
36 self.builders.insert(0, treebuilder_class) 59 self.builders.insert(0, treebuilder_class)
37 60
38 def lookup(self, *features): 61 def lookup(self, *features):
62 """Look up a TreeBuilder subclass with the desired features.
63
64 :param features: A list of features to look for. If none are
65 provided, the most recently registered TreeBuilder subclass
66 will be used.
67 :return: A TreeBuilder subclass, or None if there's no
68 registered subclass with all the requested features.
69 """
39 if len(self.builders) == 0: 70 if len(self.builders) == 0:
40 # There are no builders at all. 71 # There are no builders at all.
41 return None 72 return None
@@ -78,7 +109,7 @@ class TreeBuilderRegistry(object):
78builder_registry = TreeBuilderRegistry() 109builder_registry = TreeBuilderRegistry()
79 110
80class TreeBuilder(object): 111class TreeBuilder(object):
81 """Turn a document into a Beautiful Soup object tree.""" 112 """Turn a textual document into a Beautiful Soup object tree."""
82 113
83 NAME = "[Unknown tree builder]" 114 NAME = "[Unknown tree builder]"
84 ALTERNATE_NAMES = [] 115 ALTERNATE_NAMES = []
@@ -86,19 +117,89 @@ class TreeBuilder(object):
86 117
87 is_xml = False 118 is_xml = False
88 picklable = False 119 picklable = False
89 preserve_whitespace_tags = set()
90 empty_element_tags = None # A tag will be considered an empty-element 120 empty_element_tags = None # A tag will be considered an empty-element
91 # tag when and only when it has no contents. 121 # tag when and only when it has no contents.
92 122
93 # A value for these tag/attribute combinations is a space- or 123 # A value for these tag/attribute combinations is a space- or
94 # comma-separated list of CDATA, rather than a single CDATA. 124 # comma-separated list of CDATA, rather than a single CDATA.
95 cdata_list_attributes = {} 125 DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
96 126
97 127 # Whitespace should be preserved inside these tags.
98 def __init__(self): 128 DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
129
130 # The textual contents of tags with these names should be
131 # instantiated with some class other than NavigableString.
132 DEFAULT_STRING_CONTAINERS = {}
133
134 USE_DEFAULT = object()
135
136 # Most parsers don't keep track of line numbers.
137 TRACKS_LINE_NUMBERS = False
138
139 def __init__(self, multi_valued_attributes=USE_DEFAULT,
140 preserve_whitespace_tags=USE_DEFAULT,
141 store_line_numbers=USE_DEFAULT,
142 string_containers=USE_DEFAULT,
143 ):
144 """Constructor.
145
146 :param multi_valued_attributes: If this is set to None, the
147 TreeBuilder will not turn any values for attributes like
148 'class' into lists. Setting this to a dictionary will
149 customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
150 for an example.
151
152 Internally, these are called "CDATA list attributes", but that
153 probably doesn't make sense to an end-user, so the argument name
154 is `multi_valued_attributes`.
155
156 :param preserve_whitespace_tags: A list of tags to treat
157 the way <pre> tags are treated in HTML. Tags in this list
158 are immune from pretty-printing; their contents will always be
159 output as-is.
160
161 :param string_containers: A dictionary mapping tag names to
162 the classes that should be instantiated to contain the textual
163 contents of those tags. The default is to use NavigableString
164 for every tag, no matter what the name. You can override the
165 default by changing DEFAULT_STRING_CONTAINERS.
166
167 :param store_line_numbers: If the parser keeps track of the
168 line numbers and positions of the original markup, that
169 information will, by default, be stored in each corresponding
170 `Tag` object. You can turn this off by passing
171 store_line_numbers=False. If the parser you're using doesn't
172 keep track of this information, then setting store_line_numbers=True
173 will do nothing.
174 """
99 self.soup = None 175 self.soup = None
100 176 if multi_valued_attributes is self.USE_DEFAULT:
177 multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
178 self.cdata_list_attributes = multi_valued_attributes
179 if preserve_whitespace_tags is self.USE_DEFAULT:
180 preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
181 self.preserve_whitespace_tags = preserve_whitespace_tags
182 if store_line_numbers == self.USE_DEFAULT:
183 store_line_numbers = self.TRACKS_LINE_NUMBERS
184 self.store_line_numbers = store_line_numbers
185 if string_containers == self.USE_DEFAULT:
186 string_containers = self.DEFAULT_STRING_CONTAINERS
187 self.string_containers = string_containers
188
189 def initialize_soup(self, soup):
190 """The BeautifulSoup object has been initialized and is now
191 being associated with the TreeBuilder.
192
193 :param soup: A BeautifulSoup object.
194 """
195 self.soup = soup
196
101 def reset(self): 197 def reset(self):
198 """Do any work necessary to reset the underlying parser
199 for a new document.
200
201 By default, this does nothing.
202 """
102 pass 203 pass
103 204
104 def can_be_empty_element(self, tag_name): 205 def can_be_empty_element(self, tag_name):
@@ -110,24 +211,58 @@ class TreeBuilder(object):
110 For instance: an HTMLBuilder does not consider a <p> tag to be 211 For instance: an HTMLBuilder does not consider a <p> tag to be
111 an empty-element tag (it's not in 212 an empty-element tag (it's not in
112 HTMLBuilder.empty_element_tags). This means an empty <p> tag 213 HTMLBuilder.empty_element_tags). This means an empty <p> tag
113 will be presented as "<p></p>", not "<p />". 214 will be presented as "<p></p>", not "<p/>" or "<p>".
114 215
115 The default implementation has no opinion about which tags are 216 The default implementation has no opinion about which tags are
116 empty-element tags, so a tag will be presented as an 217 empty-element tags, so a tag will be presented as an
117 empty-element tag if and only if it has no contents. 218 empty-element tag if and only if it has no children.
118 "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will 219 "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
119 be left alone. 220 be left alone.
221
222 :param tag_name: The name of a markup tag.
120 """ 223 """
121 if self.empty_element_tags is None: 224 if self.empty_element_tags is None:
122 return True 225 return True
123 return tag_name in self.empty_element_tags 226 return tag_name in self.empty_element_tags
124 227
125 def feed(self, markup): 228 def feed(self, markup):
229 """Run some incoming markup through some parsing process,
230 populating the `BeautifulSoup` object in self.soup.
231
232 This method is not implemented in TreeBuilder; it must be
233 implemented in subclasses.
234
235 :return: None.
236 """
126 raise NotImplementedError() 237 raise NotImplementedError()
127 238
128 def prepare_markup(self, markup, user_specified_encoding=None, 239 def prepare_markup(self, markup, user_specified_encoding=None,
129 document_declared_encoding=None): 240 document_declared_encoding=None, exclude_encodings=None):
130 return markup, None, None, False 241 """Run any preliminary steps necessary to make incoming markup
242 acceptable to the parser.
243
244 :param markup: Some markup -- probably a bytestring.
245 :param user_specified_encoding: The user asked to try this encoding.
246 :param document_declared_encoding: The markup itself claims to be
247 in this encoding. NOTE: This argument is not used by the
248 calling code and can probably be removed.
249 :param exclude_encodings: The user asked _not_ to try any of
250 these encodings.
251
252 :yield: A series of 4-tuples:
253 (markup, encoding, declared encoding,
254 has undergone character replacement)
255
256 Each 4-tuple represents a strategy for converting the
257 document to Unicode and parsing it. Each strategy will be tried
258 in turn.
259
260 By default, the only strategy is to parse the markup
261 as-is. See `LXMLTreeBuilderForXML` and
262 `HTMLParserTreeBuilder` for implementations that take into
263 account the quirks of particular parsers.
264 """
265 yield markup, None, None, False
131 266
132 def test_fragment_to_document(self, fragment): 267 def test_fragment_to_document(self, fragment):
133 """Wrap an HTML fragment to make it look like a document. 268 """Wrap an HTML fragment to make it look like a document.
@@ -139,16 +274,36 @@ class TreeBuilder(object):
139 results against other HTML fragments. 274 results against other HTML fragments.
140 275
141 This method should not be used outside of tests. 276 This method should not be used outside of tests.
277
278 :param fragment: A string -- fragment of HTML.
279 :return: A string -- a full HTML document.
142 """ 280 """
143 return fragment 281 return fragment
144 282
145 def set_up_substitutions(self, tag): 283 def set_up_substitutions(self, tag):
284 """Set up any substitutions that will need to be performed on
285 a `Tag` when it's output as a string.
286
287 By default, this does nothing. See `HTMLTreeBuilder` for a
288 case where this is used.
289
290 :param tag: A `Tag`
291 :return: Whether or not a substitution was performed.
292 """
146 return False 293 return False
147 294
148 def _replace_cdata_list_attribute_values(self, tag_name, attrs): 295 def _replace_cdata_list_attribute_values(self, tag_name, attrs):
149 """Replaces class="foo bar" with class=["foo", "bar"] 296 """When an attribute value is associated with a tag that can
297 have multiple values for that attribute, convert the string
298 value to a list of strings.
150 299
151 Modifies its input in place. 300 Basically, replaces class="foo bar" with class=["foo", "bar"]
301
302 NOTE: This method modifies its input in place.
303
304 :param tag_name: The name of a tag.
305 :param attrs: A dictionary containing the tag's attributes.
306 Any appropriate attribute values will be modified in place.
152 """ 307 """
153 if not attrs: 308 if not attrs:
154 return attrs 309 return attrs
@@ -163,7 +318,7 @@ class TreeBuilder(object):
163 # values. Split it into a list. 318 # values. Split it into a list.
164 value = attrs[attr] 319 value = attrs[attr]
165 if isinstance(value, str): 320 if isinstance(value, str):
166 values = whitespace_re.split(value) 321 values = nonwhitespace_re.findall(value)
167 else: 322 else:
168 # html5lib sometimes calls setAttributes twice 323 # html5lib sometimes calls setAttributes twice
169 # for the same tag when rearranging the parse 324 # for the same tag when rearranging the parse
@@ -174,9 +329,13 @@ class TreeBuilder(object):
174 values = value 329 values = value
175 attrs[attr] = values 330 attrs[attr] = values
176 return attrs 331 return attrs
177 332
178class SAXTreeBuilder(TreeBuilder): 333class SAXTreeBuilder(TreeBuilder):
179 """A Beautiful Soup treebuilder that listens for SAX events.""" 334 """A Beautiful Soup treebuilder that listens for SAX events.
335
336 This is not currently used for anything, but it demonstrates
337 how a simple TreeBuilder would work.
338 """
180 339
181 def feed(self, markup): 340 def feed(self, markup):
182 raise NotImplementedError() 341 raise NotImplementedError()
@@ -186,11 +345,11 @@ class SAXTreeBuilder(TreeBuilder):
186 345
187 def startElement(self, name, attrs): 346 def startElement(self, name, attrs):
188 attrs = dict((key[1], value) for key, value in list(attrs.items())) 347 attrs = dict((key[1], value) for key, value in list(attrs.items()))
189 #print "Start %s, %r" % (name, attrs) 348 #print("Start %s, %r" % (name, attrs))
190 self.soup.handle_starttag(name, attrs) 349 self.soup.handle_starttag(name, attrs)
191 350
192 def endElement(self, name): 351 def endElement(self, name):
193 #print "End %s" % name 352 #print("End %s" % name)
194 self.soup.handle_endtag(name) 353 self.soup.handle_endtag(name)
195 354
196 def startElementNS(self, nsTuple, nodeName, attrs): 355 def startElementNS(self, nsTuple, nodeName, attrs):
@@ -227,10 +386,44 @@ class HTMLTreeBuilder(TreeBuilder):
227 Such as which tags are empty-element tags. 386 Such as which tags are empty-element tags.
228 """ 387 """
229 388
230 preserve_whitespace_tags = set(['pre', 'textarea']) 389 empty_element_tags = set([
231 empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 390 # These are from HTML5.
232 'spacer', 'link', 'frame', 'base']) 391 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
233 392
393 # These are from earlier versions of HTML and are removed in HTML5.
394 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
395 ])
396
397 # The HTML standard defines these as block-level elements. Beautiful
398 # Soup does not treat these elements differently from other elements,
399 # but it may do so eventually, and this information is available if
400 # you need to use it.
401 block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
402
403 # These HTML tags need special treatment so they can be
404 # represented by a string class other than NavigableString.
405 #
406 # For some of these tags, it's because the HTML standard defines
407 # an unusual content model for them. I made this list by going
408 # through the HTML spec
409 # (https://html.spec.whatwg.org/#metadata-content) and looking for
410 # "metadata content" elements that can contain strings.
411 #
412 # The Ruby tags (<rt> and <rp>) are here despite being normal
413 # "phrasing content" tags, because the content they contain is
414 # qualitatively different from other text in the document, and it
415 # can be useful to be able to distinguish it.
416 #
417 # TODO: Arguably <noscript> could go here but it seems
418 # qualitatively different from the other tags.
419 DEFAULT_STRING_CONTAINERS = {
420 'rt' : RubyTextString,
421 'rp' : RubyParenthesisString,
422 'style': Stylesheet,
423 'script': Script,
424 'template': TemplateString,
425 }
426
234 # The HTML standard defines these attributes as containing a 427 # The HTML standard defines these attributes as containing a
235 # space-separated list of values, not a single value. That is, 428 # space-separated list of values, not a single value. That is,
236 # class="foo bar" means that the 'class' attribute has two values, 429 # class="foo bar" means that the 'class' attribute has two values,
@@ -238,7 +431,7 @@ class HTMLTreeBuilder(TreeBuilder):
238 # encounter one of these attributes, we will parse its value into 431 # encounter one of these attributes, we will parse its value into
239 # a list of values if possible. Upon output, the list will be 432 # a list of values if possible. Upon output, the list will be
240 # converted back into a string. 433 # converted back into a string.
241 cdata_list_attributes = { 434 DEFAULT_CDATA_LIST_ATTRIBUTES = {
242 "*" : ['class', 'accesskey', 'dropzone'], 435 "*" : ['class', 'accesskey', 'dropzone'],
243 "a" : ['rel', 'rev'], 436 "a" : ['rel', 'rev'],
244 "link" : ['rel', 'rev'], 437 "link" : ['rel', 'rev'],
@@ -255,7 +448,19 @@ class HTMLTreeBuilder(TreeBuilder):
255 "output" : ["for"], 448 "output" : ["for"],
256 } 449 }
257 450
451 DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
452
258 def set_up_substitutions(self, tag): 453 def set_up_substitutions(self, tag):
454 """Replace the declared encoding in a <meta> tag with a placeholder,
455 to be substituted when the tag is output to a string.
456
457 An HTML document may come in to Beautiful Soup as one
458 encoding, but exit in a different encoding, and the <meta> tag
459 needs to be changed to reflect this.
460
461 :param tag: A `Tag`
462 :return: Whether or not a substitution was performed.
463 """
259 # We are only interested in <meta> tags 464 # We are only interested in <meta> tags
260 if tag.name != 'meta': 465 if tag.name != 'meta':
261 return False 466 return False
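The set_up_substitutions() docstring above describes the <meta> rewriting in prose; the effect is easiest to see end-to-end. A minimal sketch, assuming the vendored bs4 behaves like upstream 4.x here:

    from bs4 import BeautifulSoup

    # Bytes declared as ISO-8859-1; b"\xe9" is "é" in that encoding.
    markup = b'<html><head><meta charset="ISO-8859-1"></head><body>caf\xe9</body></html>'
    soup = BeautifulSoup(markup, "html.parser")

    # On re-encoding, the declared charset inside <meta> is substituted to
    # match the encoding actually being written out.
    print(soup.encode("utf-8"))  # ...<meta charset="utf-8"/>...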
@@ -288,10 +493,107 @@ class HTMLTreeBuilder(TreeBuilder):
288 493
289 return (meta_encoding is not None) 494 return (meta_encoding is not None)
290 495
496class DetectsXMLParsedAsHTML(object):
497 """A mixin class for any class (a TreeBuilder, or some class used by a
498 TreeBuilder) that's in a position to detect whether an XML
499 document is being incorrectly parsed as HTML, and issue an
500 appropriate warning.
501
502 This requires being able to observe an incoming processing
503 instruction that might be an XML declaration, and also able to
504 observe tags as they're opened. If you can't do that for a given
505 TreeBuilder, there's a less reliable implementation based on
506 examining the raw markup.
507 """
508
509 # Regular expression for seeing if markup has an <html> tag.
510 LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
511 LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
512
513 XML_PREFIX = '<?xml'
514 XML_PREFIX_B = b'<?xml'
515
516 @classmethod
517 def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
518 """Perform a check on some markup to see if it looks like XML
519 that's not XHTML. If so, issue a warning.
520
521 This is much less reliable than doing the check while parsing,
522 but some of the tree builders can't do that.
523
524 :param stacklevel: The stacklevel of the code calling this
525 function.
526
527 :return: True if the markup looks like non-XHTML XML, False
528 otherwise.
529
530 """
531 if isinstance(markup, bytes):
532 prefix = cls.XML_PREFIX_B
533 looks_like_html = cls.LOOKS_LIKE_HTML_B
534 else:
535 prefix = cls.XML_PREFIX
536 looks_like_html = cls.LOOKS_LIKE_HTML
537
538 if (markup is not None
539 and markup.startswith(prefix)
540 and not looks_like_html.search(markup[:500])
541 ):
542 cls._warn(stacklevel=stacklevel+2)
543 return True
544 return False
545
546 @classmethod
547 def _warn(cls, stacklevel=5):
548 """Issue a warning about XML being parsed as HTML."""
549 warnings.warn(
550 XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
551 stacklevel=stacklevel
552 )
553
554 def _initialize_xml_detector(self):
555 """Call this method before parsing a document."""
556 self._first_processing_instruction = None
557 self._root_tag = None
558
559 def _document_might_be_xml(self, processing_instruction):
560 """Call this method when encountering an XML declaration, or a
561 "processing instruction" that might be an XML declaration.
562 """
563 if (self._first_processing_instruction is not None
564 or self._root_tag is not None):
565 # The document has already started. Don't bother checking
566 # anymore.
567 return
568
569 self._first_processing_instruction = processing_instruction
570
571 # We won't know until we encounter the first tag whether or
572 # not this is actually a problem.
573
574 def _root_tag_encountered(self, name):
575 """Call this when you encounter the document's root tag.
576
577 This is where we actually check whether an XML document is
578 being incorrectly parsed as HTML, and issue the warning.
579 """
580 if self._root_tag is not None:
581 # This method was incorrectly called multiple times. Do
582 # nothing.
583 return
584
585 self._root_tag = name
586 if (name != 'html' and self._first_processing_instruction is not None
587 and self._first_processing_instruction.lower().startswith('xml ')):
588 # We encountered an XML declaration and then a tag other
589 # than 'html'. This is a reliable indicator that a
590 # non-XHTML document is being parsed as XML.
591 self._warn()
592
593
291def register_treebuilders_from(module): 594def register_treebuilders_from(module):
292 """Copy TreeBuilders from the given module into this module.""" 595 """Copy TreeBuilders from the given module into this module."""
293 # I'm fairly sure this is not the best way to do this. 596 this_module = sys.modules[__name__]
294 this_module = sys.modules['bs4.builder']
295 for name in module.__all__: 597 for name in module.__all__:
296 obj = getattr(module, name) 598 obj = getattr(module, name)
297 599
@@ -302,12 +604,22 @@ def register_treebuilders_from(module):
302 this_module.builder_registry.register(obj) 604 this_module.builder_registry.register(obj)
303 605
304class ParserRejectedMarkup(Exception): 606class ParserRejectedMarkup(Exception):
305 pass 607 """An Exception to be raised when the underlying parser simply
306 608 refuses to parse the given markup.
609 """
610 def __init__(self, message_or_exception):
611 """Explain why the parser rejected the given markup, either
612 with a textual explanation or another exception.
613 """
614 if isinstance(message_or_exception, Exception):
615 e = message_or_exception
616 message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
617 super(ParserRejectedMarkup, self).__init__(message_or_exception)
618
307# Builders are registered in reverse order of priority, so that custom 619# Builders are registered in reverse order of priority, so that custom
308# builder registrations will take precedence. In general, we want lxml 620# builder registrations will take precedence. In general, we want lxml
309# to take precedence over html5lib, because it's faster. And we only 621# to take precedence over html5lib, because it's faster. And we only
310# want to use HTMLParser as a last result. 622# want to use HTMLParser as a last resort.
311from . import _htmlparser 623from . import _htmlparser
312register_treebuilders_from(_htmlparser) 624register_treebuilders_from(_htmlparser)
313try: 625try:
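Together, the new constructor arguments let callers customize a parse without subclassing a TreeBuilder; BeautifulSoup forwards these keyword arguments to the builder. A minimal sketch of the multi_valued_attributes behavior, assuming the vendored copy matches upstream bs4 4.x:

    from bs4 import BeautifulSoup

    # By default 'class' is a CDATA-list attribute and parses into a list.
    soup = BeautifulSoup('<p class="foo bar">x</p>', "html.parser")
    print(soup.p["class"])  # ['foo', 'bar']

    # multi_valued_attributes=None disables the splitting entirely.
    soup = BeautifulSoup('<p class="foo bar">x</p>', "html.parser",
                         multi_valued_attributes=None)
    print(soup.p["class"])  # foo bar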
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py
index 9e9216ef9c..7c46a85118 100644
--- a/bitbake/lib/bs4/builder/_html5lib.py
+++ b/bitbake/lib/bs4/builder/_html5lib.py
@@ -1,9 +1,14 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1__all__ = [ 4__all__ = [
2 'HTML5TreeBuilder', 5 'HTML5TreeBuilder',
3 ] 6 ]
4 7
5import warnings 8import warnings
9import re
6from bs4.builder import ( 10from bs4.builder import (
11 DetectsXMLParsedAsHTML,
7 PERMISSIVE, 12 PERMISSIVE,
8 HTML, 13 HTML,
9 HTML_5, 14 HTML_5,
@@ -11,17 +16,13 @@ from bs4.builder import (
11 ) 16 )
12from bs4.element import ( 17from bs4.element import (
13 NamespacedAttribute, 18 NamespacedAttribute,
14 whitespace_re, 19 nonwhitespace_re,
15) 20)
16import html5lib 21import html5lib
17try: 22from html5lib.constants import (
18 # html5lib >= 0.99999999/1.0b9 23 namespaces,
19 from html5lib.treebuilders import base as treebuildersbase 24 prefixes,
20except ImportError: 25 )
21 # html5lib <= 0.9999999/1.0b8
22 from html5lib.treebuilders import _base as treebuildersbase
23from html5lib.constants import namespaces
24
25from bs4.element import ( 26from bs4.element import (
26 Comment, 27 Comment,
27 Doctype, 28 Doctype,
@@ -29,13 +30,37 @@ from bs4.element import (
29 Tag, 30 Tag,
30 ) 31 )
31 32
33try:
34 # Pre-0.99999999
35 from html5lib.treebuilders import _base as treebuilder_base
36 new_html5lib = False
37except ImportError as e:
38 # 0.99999999 and up
39 from html5lib.treebuilders import base as treebuilder_base
40 new_html5lib = True
41
32class HTML5TreeBuilder(HTMLTreeBuilder): 42class HTML5TreeBuilder(HTMLTreeBuilder):
33 """Use html5lib to build a tree.""" 43 """Use html5lib to build a tree.
44
45 Note that this TreeBuilder does not support some features common
46 to HTML TreeBuilders. Some of these features could theoretically
47 be implemented, but at the very least it's quite difficult,
48 because html5lib moves the parse tree around as it's being built.
49
50 * This TreeBuilder doesn't use different subclasses of NavigableString
51 based on the name of the tag in which the string was found.
52
53 * You can't use a SoupStrainer to parse only part of a document.
54 """
34 55
35 NAME = "html5lib" 56 NAME = "html5lib"
36 57
37 features = [NAME, PERMISSIVE, HTML_5, HTML] 58 features = [NAME, PERMISSIVE, HTML_5, HTML]
38 59
60 # html5lib can tell us which line number and position in the
61 # original file is the source of an element.
62 TRACKS_LINE_NUMBERS = True
63
39 def prepare_markup(self, markup, user_specified_encoding, 64 def prepare_markup(self, markup, user_specified_encoding,
40 document_declared_encoding=None, exclude_encodings=None): 65 document_declared_encoding=None, exclude_encodings=None):
41 # Store the user-specified encoding for use later on. 66 # Store the user-specified encoding for use later on.
@@ -45,27 +70,56 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
45 # ATM because the html5lib TreeBuilder doesn't use 70 # ATM because the html5lib TreeBuilder doesn't use
46 # UnicodeDammit. 71 # UnicodeDammit.
47 if exclude_encodings: 72 if exclude_encodings:
48 warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 73 warnings.warn(
74 "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
75 stacklevel=3
76 )
77
78 # html5lib only parses HTML, so if it's given XML that's worth
79 # noting.
80 DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
81 markup, stacklevel=3
82 )
83
49 yield (markup, None, None, False) 84 yield (markup, None, None, False)
50 85
51 # These methods are defined by Beautiful Soup. 86 # These methods are defined by Beautiful Soup.
52 def feed(self, markup): 87 def feed(self, markup):
53 if self.soup.parse_only is not None: 88 if self.soup.parse_only is not None:
54 warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 89 warnings.warn(
90 "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
91 stacklevel=4
92 )
55 parser = html5lib.HTMLParser(tree=self.create_treebuilder) 93 parser = html5lib.HTMLParser(tree=self.create_treebuilder)
56 doc = parser.parse(markup, encoding=self.user_specified_encoding) 94 self.underlying_builder.parser = parser
57 95 extra_kwargs = dict()
96 if not isinstance(markup, str):
97 if new_html5lib:
98 extra_kwargs['override_encoding'] = self.user_specified_encoding
99 else:
100 extra_kwargs['encoding'] = self.user_specified_encoding
101 doc = parser.parse(markup, **extra_kwargs)
102
58 # Set the character encoding detected by the tokenizer. 103 # Set the character encoding detected by the tokenizer.
59 if isinstance(markup, str): 104 if isinstance(markup, str):
60 # We need to special-case this because html5lib sets 105 # We need to special-case this because html5lib sets
61 # charEncoding to UTF-8 if it gets Unicode input. 106 # charEncoding to UTF-8 if it gets Unicode input.
62 doc.original_encoding = None 107 doc.original_encoding = None
63 else: 108 else:
64 doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 109 original_encoding = parser.tokenizer.stream.charEncoding[0]
65 110 if not isinstance(original_encoding, str):
111 # In 0.99999999 and up, the encoding is an html5lib
112 # Encoding object. We want to use a string for compatibility
113 # with other tree builders.
114 original_encoding = original_encoding.name
115 doc.original_encoding = original_encoding
116 self.underlying_builder.parser = None
117
66 def create_treebuilder(self, namespaceHTMLElements): 118 def create_treebuilder(self, namespaceHTMLElements):
67 self.underlying_builder = TreeBuilderForHtml5lib( 119 self.underlying_builder = TreeBuilderForHtml5lib(
68 self.soup, namespaceHTMLElements) 120 namespaceHTMLElements, self.soup,
121 store_line_numbers=self.store_line_numbers
122 )
69 return self.underlying_builder 123 return self.underlying_builder
70 124
71 def test_fragment_to_document(self, fragment): 125 def test_fragment_to_document(self, fragment):
@@ -73,12 +127,30 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
73 return '<html><head></head><body>%s</body></html>' % fragment 127 return '<html><head></head><body>%s</body></html>' % fragment
74 128
75 129
76class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder): 130class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
77 131
78 def __init__(self, soup, namespaceHTMLElements): 132 def __init__(self, namespaceHTMLElements, soup=None,
79 self.soup = soup 133 store_line_numbers=True, **kwargs):
134 if soup:
135 self.soup = soup
136 else:
137 from bs4 import BeautifulSoup
138 # TODO: Why is the parser 'html.parser' here? To avoid an
139 # infinite loop?
140 self.soup = BeautifulSoup(
141 "", "html.parser", store_line_numbers=store_line_numbers,
142 **kwargs
143 )
144 # TODO: What are **kwargs exactly? Should they be passed in
145 # here in addition to/instead of being passed to the BeautifulSoup
146 # constructor?
80 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 147 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
81 148
149 # This will be set later to an html5lib.html5parser.HTMLParser
150 # object, which we can use to track the current line number.
151 self.parser = None
152 self.store_line_numbers = store_line_numbers
153
82 def documentClass(self): 154 def documentClass(self):
83 self.soup.reset() 155 self.soup.reset()
84 return Element(self.soup, self.soup, None) 156 return Element(self.soup, self.soup, None)
@@ -92,14 +164,26 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
92 self.soup.object_was_parsed(doctype) 164 self.soup.object_was_parsed(doctype)
93 165
94 def elementClass(self, name, namespace): 166 def elementClass(self, name, namespace):
95 tag = self.soup.new_tag(name, namespace) 167 kwargs = {}
168 if self.parser and self.store_line_numbers:
169 # This represents the point immediately after the end of the
170 # tag. We don't know when the tag started, but we do know
171 # where it ended -- the character just before this one.
172 sourceline, sourcepos = self.parser.tokenizer.stream.position()
173 kwargs['sourceline'] = sourceline
174 kwargs['sourcepos'] = sourcepos-1
175 tag = self.soup.new_tag(name, namespace, **kwargs)
176
96 return Element(tag, self.soup, namespace) 177 return Element(tag, self.soup, namespace)
97 178
98 def commentClass(self, data): 179 def commentClass(self, data):
99 return TextNode(Comment(data), self.soup) 180 return TextNode(Comment(data), self.soup)
100 181
101 def fragmentClass(self): 182 def fragmentClass(self):
102 self.soup = BeautifulSoup("") 183 from bs4 import BeautifulSoup
184 # TODO: Why is the parser 'html.parser' here? To avoid an
185 # infinite loop?
186 self.soup = BeautifulSoup("", "html.parser")
103 self.soup.name = "[document_fragment]" 187 self.soup.name = "[document_fragment]"
104 return Element(self.soup, self.soup, None) 188 return Element(self.soup, self.soup, None)
105 189
@@ -111,7 +195,57 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
111 return self.soup 195 return self.soup
112 196
113 def getFragment(self): 197 def getFragment(self):
114 return treebuildersbase.TreeBuilder.getFragment(self).element 198 return treebuilder_base.TreeBuilder.getFragment(self).element
199
200 def testSerializer(self, element):
201 from bs4 import BeautifulSoup
202 rv = []
203 doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
204
205 def serializeElement(element, indent=0):
206 if isinstance(element, BeautifulSoup):
207 pass
208 if isinstance(element, Doctype):
209 m = doctype_re.match(element)
210 if m:
211 name = m.group(1)
212 if m.lastindex > 1:
213 publicId = m.group(2) or ""
214 systemId = m.group(3) or m.group(4) or ""
215 rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
216 (' ' * indent, name, publicId, systemId))
217 else:
218 rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
219 else:
220 rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
221 elif isinstance(element, Comment):
222 rv.append("|%s<!-- %s -->" % (' ' * indent, element))
223 elif isinstance(element, NavigableString):
224 rv.append("|%s\"%s\"" % (' ' * indent, element))
225 else:
226 if element.namespace:
227 name = "%s %s" % (prefixes[element.namespace],
228 element.name)
229 else:
230 name = element.name
231 rv.append("|%s<%s>" % (' ' * indent, name))
232 if element.attrs:
233 attributes = []
234 for name, value in list(element.attrs.items()):
235 if isinstance(name, NamespacedAttribute):
236 name = "%s %s" % (prefixes[name.namespace], name.name)
237 if isinstance(value, list):
238 value = " ".join(value)
239 attributes.append((name, value))
240
241 for name, value in sorted(attributes):
242 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
243 indent += 2
244 for child in element.children:
245 serializeElement(child, indent)
246 serializeElement(element, 0)
247
248 return "\n".join(rv)
115 249
116class AttrList(object): 250class AttrList(object):
117 def __init__(self, element): 251 def __init__(self, element):
@@ -122,14 +256,14 @@ class AttrList(object):
122 def __setitem__(self, name, value): 256 def __setitem__(self, name, value):
123 # If this attribute is a multi-valued attribute for this element, 257 # If this attribute is a multi-valued attribute for this element,
124 # turn its value into a list. 258 # turn its value into a list.
125 list_attr = HTML5TreeBuilder.cdata_list_attributes 259 list_attr = self.element.cdata_list_attributes or {}
126 if (name in list_attr['*'] 260 if (name in list_attr.get('*', [])
127 or (self.element.name in list_attr 261 or (self.element.name in list_attr
128 and name in list_attr[self.element.name])): 262 and name in list_attr.get(self.element.name, []))):
129 # A node that is being cloned may have already undergone 263 # A node that is being cloned may have already undergone
130 # this procedure. 264 # this procedure.
131 if not isinstance(value, list): 265 if not isinstance(value, list):
132 value = whitespace_re.split(value) 266 value = nonwhitespace_re.findall(value)
133 self.element[name] = value 267 self.element[name] = value
134 def items(self): 268 def items(self):
135 return list(self.attrs.items()) 269 return list(self.attrs.items())
@@ -143,9 +277,9 @@ class AttrList(object):
143 return name in list(self.attrs.keys()) 277 return name in list(self.attrs.keys())
144 278
145 279
146class Element(treebuildersbase.Node): 280class Element(treebuilder_base.Node):
147 def __init__(self, element, soup, namespace): 281 def __init__(self, element, soup, namespace):
148 treebuildersbase.Node.__init__(self, element.name) 282 treebuilder_base.Node.__init__(self, element.name)
149 self.element = element 283 self.element = element
150 self.soup = soup 284 self.soup = soup
151 self.namespace = namespace 285 self.namespace = namespace
@@ -164,13 +298,15 @@ class Element(treebuildersbase.Node):
164 child = node 298 child = node
165 elif node.element.__class__ == NavigableString: 299 elif node.element.__class__ == NavigableString:
166 string_child = child = node.element 300 string_child = child = node.element
301 node.parent = self
167 else: 302 else:
168 child = node.element 303 child = node.element
304 node.parent = self
169 305
170 if not isinstance(child, str) and child.parent is not None: 306 if not isinstance(child, str) and child.parent is not None:
171 node.element.extract() 307 node.element.extract()
172 308
173 if (string_child and self.element.contents 309 if (string_child is not None and self.element.contents
174 and self.element.contents[-1].__class__ == NavigableString): 310 and self.element.contents[-1].__class__ == NavigableString):
175 # We are appending a string onto another string. 311 # We are appending a string onto another string.
176 # TODO This has O(n^2) performance, for input like 312 # TODO This has O(n^2) performance, for input like
@@ -203,12 +339,12 @@ class Element(treebuildersbase.Node):
203 most_recent_element=most_recent_element) 339 most_recent_element=most_recent_element)
204 340
205 def getAttributes(self): 341 def getAttributes(self):
342 if isinstance(self.element, Comment):
343 return {}
206 return AttrList(self.element) 344 return AttrList(self.element)
207 345
208 def setAttributes(self, attributes): 346 def setAttributes(self, attributes):
209
210 if attributes is not None and len(attributes) > 0: 347 if attributes is not None and len(attributes) > 0:
211
212 converted_attributes = [] 348 converted_attributes = []
213 for name, value in list(attributes.items()): 349 for name, value in list(attributes.items()):
214 if isinstance(name, tuple): 350 if isinstance(name, tuple):
@@ -230,11 +366,11 @@ class Element(treebuildersbase.Node):
230 attributes = property(getAttributes, setAttributes) 366 attributes = property(getAttributes, setAttributes)
231 367
232 def insertText(self, data, insertBefore=None): 368 def insertText(self, data, insertBefore=None):
369 text = TextNode(self.soup.new_string(data), self.soup)
233 if insertBefore: 370 if insertBefore:
234 text = TextNode(self.soup.new_string(data), self.soup) 371 self.insertBefore(text, insertBefore)
235 self.insertBefore(data, insertBefore)
236 else: 372 else:
237 self.appendChild(data) 373 self.appendChild(text)
238 374
239 def insertBefore(self, node, refNode): 375 def insertBefore(self, node, refNode):
240 index = self.element.index(refNode.element) 376 index = self.element.index(refNode.element)
@@ -253,9 +389,10 @@ class Element(treebuildersbase.Node):
253 389
254 def reparentChildren(self, new_parent): 390 def reparentChildren(self, new_parent):
255 """Move all of this tag's children into another tag.""" 391 """Move all of this tag's children into another tag."""
256 # print "MOVE", self.element.contents 392 # print("MOVE", self.element.contents)
257 # print "FROM", self.element 393 # print("FROM", self.element)
258 # print "TO", new_parent.element 394 # print("TO", new_parent.element)
395
259 element = self.element 396 element = self.element
260 new_parent_element = new_parent.element 397 new_parent_element = new_parent.element
261 # Determine what this tag's next_element will be once all the children 398 # Determine what this tag's next_element will be once all the children
@@ -274,29 +411,35 @@ class Element(treebuildersbase.Node):
274 new_parents_last_descendant_next_element = new_parent_element.next_element 411 new_parents_last_descendant_next_element = new_parent_element.next_element
275 412
276 to_append = element.contents 413 to_append = element.contents
277 append_after = new_parent_element.contents
278 if len(to_append) > 0: 414 if len(to_append) > 0:
279 # Set the first child's previous_element and previous_sibling 415 # Set the first child's previous_element and previous_sibling
280 # to elements within the new parent 416 # to elements within the new parent
281 first_child = to_append[0] 417 first_child = to_append[0]
282 if new_parents_last_descendant: 418 if new_parents_last_descendant is not None:
283 first_child.previous_element = new_parents_last_descendant 419 first_child.previous_element = new_parents_last_descendant
284 else: 420 else:
285 first_child.previous_element = new_parent_element 421 first_child.previous_element = new_parent_element
286 first_child.previous_sibling = new_parents_last_child 422 first_child.previous_sibling = new_parents_last_child
287 if new_parents_last_descendant: 423 if new_parents_last_descendant is not None:
288 new_parents_last_descendant.next_element = first_child 424 new_parents_last_descendant.next_element = first_child
289 else: 425 else:
290 new_parent_element.next_element = first_child 426 new_parent_element.next_element = first_child
291 if new_parents_last_child: 427 if new_parents_last_child is not None:
292 new_parents_last_child.next_sibling = first_child 428 new_parents_last_child.next_sibling = first_child
293 429
294 # Fix the last child's next_element and next_sibling 430 # Find the very last element being moved. It is now the
295 last_child = to_append[-1] 431 # parent's last descendant. It has no .next_sibling and
296 last_child.next_element = new_parents_last_descendant_next_element 432 # its .next_element is whatever the previous last
297 if new_parents_last_descendant_next_element: 433 # descendant had.
298 new_parents_last_descendant_next_element.previous_element = last_child 434 last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
299 last_child.next_sibling = None 435
436 last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
437 if new_parents_last_descendant_next_element is not None:
438 # TODO: This code has no test coverage and I'm not sure
439 # how to get html5lib to go through this path, but it's
440 # just the other side of the previous line.
441 new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
442 last_childs_last_descendant.next_sibling = None
300 443
301 for child in to_append: 444 for child in to_append:
302 child.parent = new_parent_element 445 child.parent = new_parent_element
@@ -306,9 +449,9 @@ class Element(treebuildersbase.Node):
306 element.contents = [] 449 element.contents = []
307 element.next_element = final_next_element 450 element.next_element = final_next_element
308 451
309 # print "DONE WITH MOVE" 452 # print("DONE WITH MOVE")
310 # print "FROM", self.element 453 # print("FROM", self.element)
311 # print "TO", new_parent_element 454 # print("TO", new_parent_element)
312 455
313 def cloneNode(self): 456 def cloneNode(self):
314 tag = self.soup.new_tag(self.element.name, self.namespace) 457 tag = self.soup.new_tag(self.element.name, self.namespace)
@@ -321,7 +464,7 @@ class Element(treebuildersbase.Node):
321 return self.element.contents 464 return self.element.contents
322 465
323 def getNameTuple(self): 466 def getNameTuple(self):
324 if self.namespace is None: 467 if self.namespace == None:
325 return namespaces["html"], self.name 468 return namespaces["html"], self.name
326 else: 469 else:
327 return self.namespace, self.name 470 return self.namespace, self.name
@@ -330,7 +473,7 @@ class Element(treebuildersbase.Node):
330 473
331class TextNode(Element): 474class TextNode(Element):
332 def __init__(self, element, soup): 475 def __init__(self, element, soup):
333 treebuildersbase.Node.__init__(self, None) 476 treebuilder_base.Node.__init__(self, None)
334 self.element = element 477 self.element = element
335 self.soup = soup 478 self.soup = soup
336 479
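Since both this builder and html.parser now set TRACKS_LINE_NUMBERS, Tag objects record where in the source they started. A minimal sketch with html.parser (html5lib behaves the same when installed, except that, per elementClass() above, its sourcepos marks the point just after the tag ends):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html>\n  <p>text</p>\n</html>", "html.parser")
    print(soup.p.sourceline, soup.p.sourcepos)  # 2 2

    # Tracking can be switched off per parse:
    soup = BeautifulSoup("<p>text</p>", "html.parser", store_line_numbers=False)
    print(soup.p.sourceline)  # None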
diff --git a/bitbake/lib/bs4/builder/_htmlparser.py b/bitbake/lib/bs4/builder/_htmlparser.py
index bb0a63f2f3..3cc187f892 100644
--- a/bitbake/lib/bs4/builder/_htmlparser.py
+++ b/bitbake/lib/bs4/builder/_htmlparser.py
@@ -1,35 +1,18 @@
1# encoding: utf-8
1"""Use the HTMLParser library to parse HTML files that aren't too bad.""" 2"""Use the HTMLParser library to parse HTML files that aren't too bad."""
2 3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
3__all__ = [ 7__all__ = [
4 'HTMLParserTreeBuilder', 8 'HTMLParserTreeBuilder',
5 ] 9 ]
6 10
7from html.parser import HTMLParser 11from html.parser import HTMLParser
8 12
9try:
10 from html.parser import HTMLParseError
11except ImportError as e:
12 # HTMLParseError is removed in Python 3.5. Since it can never be
13 # thrown in 3.5, we can just define our own class as a placeholder.
14 class HTMLParseError(Exception):
15 pass
16
17import sys 13import sys
18import warnings 14import warnings
19 15
20# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
21# argument, which we'd like to set to False. Unfortunately,
22# http://bugs.python.org/issue13273 makes strict=True a better bet
23# before Python 3.2.3.
24#
25# At the end of this file, we monkeypatch HTMLParser so that
26# strict=True works well on Python 3.2.2.
27major, minor, release = sys.version_info[:3]
28CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
29CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
30CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
31
32
33from bs4.element import ( 16from bs4.element import (
34 CData, 17 CData,
35 Comment, 18 Comment,
@@ -40,6 +23,8 @@ from bs4.element import (
40from bs4.dammit import EntitySubstitution, UnicodeDammit 23from bs4.dammit import EntitySubstitution, UnicodeDammit
41 24
42from bs4.builder import ( 25from bs4.builder import (
26 DetectsXMLParsedAsHTML,
27 ParserRejectedMarkup,
43 HTML, 28 HTML,
44 HTMLTreeBuilder, 29 HTMLTreeBuilder,
45 STRICT, 30 STRICT,
@@ -48,8 +33,84 @@ from bs4.builder import (
48 33
49HTMLPARSER = 'html.parser' 34HTMLPARSER = 'html.parser'
50 35
51class BeautifulSoupHTMLParser(HTMLParser): 36class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
52 def handle_starttag(self, name, attrs): 37 """A subclass of the Python standard library's HTMLParser class, which
38 listens for HTMLParser events and translates them into calls
39 to Beautiful Soup's tree construction API.
40 """
41
42 # Strategies for handling duplicate attributes
43 IGNORE = 'ignore'
44 REPLACE = 'replace'
45
46 def __init__(self, *args, **kwargs):
47 """Constructor.
48
49 :param on_duplicate_attribute: A strategy for what to do if a
50 tag includes the same attribute more than once. Accepted
51 values are: REPLACE (replace earlier values with later
52 ones, the default), IGNORE (keep the earliest value
53 encountered), or a callable. A callable must take three
54 arguments: the dictionary of attributes already processed,
55 the name of the duplicate attribute, and the most recent value
56 encountered.
57 """
58 self.on_duplicate_attribute = kwargs.pop(
59 'on_duplicate_attribute', self.REPLACE
60 )
61 HTMLParser.__init__(self, *args, **kwargs)
62
63 # Keep a list of empty-element tags that were encountered
64 # without an explicit closing tag. If we encounter a closing tag
65 # of this type, we'll associate it with one of those entries.
66 #
67 # This isn't a stack because we don't care about the
68 # order. It's a list of closing tags we've already handled and
69 # will ignore, assuming they ever show up.
70 self.already_closed_empty_element = []
71
72 self._initialize_xml_detector()
73
74 def error(self, message):
75 # NOTE: This method is required so long as Python 3.9 is
76 # supported. The corresponding code is removed from HTMLParser
77 # in 3.5, but not removed from ParserBase until 3.10.
78 # https://github.com/python/cpython/issues/76025
79 #
80 # The original implementation turned the error into a warning,
81 # but in every case I discovered, this made HTMLParser
82 # immediately crash with an error message that was less
83 # helpful than the warning. The new implementation makes it
84 # more clear that html.parser just can't parse this
85 # markup. The 3.10 implementation does the same, though it
86 # raises AssertionError rather than calling a method. (We
87 # catch this error and wrap it in a ParserRejectedMarkup.)
88 raise ParserRejectedMarkup(message)
89
90 def handle_startendtag(self, name, attrs):
91 """Handle an incoming empty-element tag.
92
93 This is only called when the markup looks like <tag/>.
94
95 :param name: Name of the tag.
96 :param attrs: Dictionary of the tag's attributes.
97 """
98 # is_startend() tells handle_starttag not to close the tag
99 # just because its name matches a known empty-element tag. We
100 # know that this is an empty-element tag and we want to call
101 # handle_endtag ourselves.
102 tag = self.handle_starttag(name, attrs, handle_empty_element=False)
103 self.handle_endtag(name)
104
105 def handle_starttag(self, name, attrs, handle_empty_element=True):
106 """Handle an opening tag, e.g. '<tag>'
107
108 :param name: Name of the tag.
109 :param attrs: Dictionary of the tag's attributes.
110 :param handle_empty_element: True if this tag is known to be
111 an empty-element tag (i.e. there is not expected to be any
112 closing tag).
113 """
53 # XXX namespace 114 # XXX namespace
54 attr_dict = {} 115 attr_dict = {}
55 for key, value in attrs: 116 for key, value in attrs:
@@ -57,20 +118,78 @@ class BeautifulSoupHTMLParser(HTMLParser):
57 # for consistency with the other tree builders. 118 # for consistency with the other tree builders.
58 if value is None: 119 if value is None:
59 value = '' 120 value = ''
60 attr_dict[key] = value 121 if key in attr_dict:
122 # A single attribute shows up multiple times in this
123 # tag. How to handle it depends on the
124 # on_duplicate_attribute setting.
125 on_dupe = self.on_duplicate_attribute
126 if on_dupe == self.IGNORE:
127 pass
128 elif on_dupe in (None, self.REPLACE):
129 attr_dict[key] = value
130 else:
131 on_dupe(attr_dict, key, value)
132 else:
133 attr_dict[key] = value
61 attrvalue = '""' 134 attrvalue = '""'
62 self.soup.handle_starttag(name, None, None, attr_dict) 135 #print("START", name)
63 136 sourceline, sourcepos = self.getpos()
64 def handle_endtag(self, name): 137 tag = self.soup.handle_starttag(
65 self.soup.handle_endtag(name) 138 name, None, None, attr_dict, sourceline=sourceline,
66 139 sourcepos=sourcepos
140 )
141 if tag and tag.is_empty_element and handle_empty_element:
142 # Unlike other parsers, html.parser doesn't send separate end tag
143 # events for empty-element tags. (It's handled in
144 # handle_startendtag, but only if the original markup looked like
145 # <tag/>.)
146 #
147 # So we need to call handle_endtag() ourselves. Since we
148 # know the start event is identical to the end event, we
149 # don't want handle_endtag() to cross off any previous end
150 # events for tags of this name.
151 self.handle_endtag(name, check_already_closed=False)
152
153 # But we might encounter an explicit closing tag for this tag
154 # later on. If so, we want to ignore it.
155 self.already_closed_empty_element.append(name)
156
157 if self._root_tag is None:
158 self._root_tag_encountered(name)
159
160 def handle_endtag(self, name, check_already_closed=True):
161 """Handle a closing tag, e.g. '</tag>'
162
163 :param name: A tag name.
164 :param check_already_closed: True if this tag is expected to
165 be the closing portion of an empty-element tag,
166 e.g. '<tag></tag>'.
167 """
168 #print("END", name)
169 if check_already_closed and name in self.already_closed_empty_element:
170 # This is a redundant end tag for an empty-element tag.
171 # We've already called handle_endtag() for it, so just
172 # check it off the list.
173 #print("ALREADY CLOSED", name)
174 self.already_closed_empty_element.remove(name)
175 else:
176 self.soup.handle_endtag(name)
177
67 def handle_data(self, data): 178 def handle_data(self, data):
179 """Handle some textual data that shows up between tags."""
68 self.soup.handle_data(data) 180 self.soup.handle_data(data)
69 181
70 def handle_charref(self, name): 182 def handle_charref(self, name):
71 # XXX workaround for a bug in HTMLParser. Remove this once 183 """Handle a numeric character reference by converting it to the
72 # it's fixed in all supported versions. 184 corresponding Unicode character and treating it as textual
73 # http://bugs.python.org/issue13633 185 data.
186
187 :param name: Character number, possibly in hexadecimal.
188 """
189 # TODO: This was originally a workaround for a bug in
190 # HTMLParser. (http://bugs.python.org/issue13633) The bug has
191 # been fixed, but removing this code still makes some
192 # Beautiful Soup tests fail. This needs investigation.
74 if name.startswith('x'): 193 if name.startswith('x'):
75 real_name = int(name.lstrip('x'), 16) 194 real_name = int(name.lstrip('x'), 16)
76 elif name.startswith('X'): 195 elif name.startswith('X'):
@@ -78,37 +197,71 @@ class BeautifulSoupHTMLParser(HTMLParser):
78 else: 197 else:
79 real_name = int(name) 198 real_name = int(name)
80 199
81 try: 200 data = None
82 data = chr(real_name) 201 if real_name < 256:
83 except (ValueError, OverflowError) as e: 202 # HTML numeric entities are supposed to reference Unicode
84 data = "\N{REPLACEMENT CHARACTER}" 203 # code points, but sometimes they reference code points in
85 204 # some other encoding (ahem, Windows-1252). E.g. &#147;
205 # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
206 # code tries to detect this situation and compensate.
207 for encoding in (self.soup.original_encoding, 'windows-1252'):
208 if not encoding:
209 continue
210 try:
211 data = bytearray([real_name]).decode(encoding)
212 except UnicodeDecodeError as e:
213 pass
214 if not data:
215 try:
216 data = chr(real_name)
217 except (ValueError, OverflowError) as e:
218 pass
219 data = data or "\N{REPLACEMENT CHARACTER}"
86 self.handle_data(data) 220 self.handle_data(data)
87 221
88 def handle_entityref(self, name): 222 def handle_entityref(self, name):
223 """Handle a named entity reference by converting it to the
224 corresponding Unicode character(s) and treating it as textual
225 data.
226
227 :param name: Name of the entity reference.
228 """
89 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 229 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
90 if character is not None: 230 if character is not None:
91 data = character 231 data = character
92 else: 232 else:
93 data = "&%s;" % name 233 # If this were XML, it would be ambiguous whether "&foo"
234 # was an character entity reference with a missing
235 # semicolon or the literal string "&foo". Since this is
236 # HTML, we have a complete list of all character entity references,
237 # and this one wasn't found, so assume it's the literal string "&foo".
238 data = "&%s" % name
94 self.handle_data(data) 239 self.handle_data(data)
95 240
96 def handle_comment(self, data): 241 def handle_comment(self, data):
242 """Handle an HTML comment.
243
244 :param data: The text of the comment.
245 """
97 self.soup.endData() 246 self.soup.endData()
98 self.soup.handle_data(data) 247 self.soup.handle_data(data)
99 self.soup.endData(Comment) 248 self.soup.endData(Comment)
100 249
101 def handle_decl(self, data): 250 def handle_decl(self, data):
251 """Handle a DOCTYPE declaration.
252
253 :param data: The text of the declaration.
254 """
102 self.soup.endData() 255 self.soup.endData()
103 if data.startswith("DOCTYPE "): 256 data = data[len("DOCTYPE "):]
104 data = data[len("DOCTYPE "):]
105 elif data == 'DOCTYPE':
106 # i.e. "<!DOCTYPE>"
107 data = ''
108 self.soup.handle_data(data) 257 self.soup.handle_data(data)
109 self.soup.endData(Doctype) 258 self.soup.endData(Doctype)
110 259
111 def unknown_decl(self, data): 260 def unknown_decl(self, data):
261 """Handle a declaration of unknown type -- probably a CDATA block.
262
263 :param data: The text of the declaration.
264 """
112 if data.upper().startswith('CDATA['): 265 if data.upper().startswith('CDATA['):
113 cls = CData 266 cls = CData
114 data = data[len('CDATA['):] 267 data = data[len('CDATA['):]
@@ -119,144 +272,116 @@ class BeautifulSoupHTMLParser(HTMLParser):
119 self.soup.endData(cls) 272 self.soup.endData(cls)
120 273
121 def handle_pi(self, data): 274 def handle_pi(self, data):
275 """Handle a processing instruction.
276
277 :param data: The text of the instruction.
278 """
122 self.soup.endData() 279 self.soup.endData()
123 self.soup.handle_data(data) 280 self.soup.handle_data(data)
281 self._document_might_be_xml(data)
124 self.soup.endData(ProcessingInstruction) 282 self.soup.endData(ProcessingInstruction)
125 283
126 284
127class HTMLParserTreeBuilder(HTMLTreeBuilder): 285class HTMLParserTreeBuilder(HTMLTreeBuilder):
128 286 """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
287 found in the Python standard library.
288 """
129 is_xml = False 289 is_xml = False
130 picklable = True 290 picklable = True
131 NAME = HTMLPARSER 291 NAME = HTMLPARSER
132 features = [NAME, HTML, STRICT] 292 features = [NAME, HTML, STRICT]
133 293
134 def __init__(self, *args, **kwargs): 294 # The html.parser knows which line number and position in the
135 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 295 # original file is the source of an element.
136 kwargs['strict'] = False 296 TRACKS_LINE_NUMBERS = True
137 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
138 kwargs['convert_charrefs'] = False
139 self.parser_args = (args, kwargs)
140 297
298 def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
299 """Constructor.
300
301 :param parser_args: Positional arguments to pass into
302 the BeautifulSoupHTMLParser constructor, once it's
303 invoked.
304 :param parser_kwargs: Keyword arguments to pass into
305 the BeautifulSoupHTMLParser constructor, once it's
306 invoked.
307 :param kwargs: Keyword arguments for the superclass constructor.
308 """
309 # Some keyword arguments will be pulled out of kwargs and placed
310 # into parser_kwargs.
311 extra_parser_kwargs = dict()
312 for arg in ('on_duplicate_attribute',):
313 if arg in kwargs:
314 value = kwargs.pop(arg)
315 extra_parser_kwargs[arg] = value
316 super(HTMLParserTreeBuilder, self).__init__(**kwargs)
317 parser_args = parser_args or []
318 parser_kwargs = parser_kwargs or {}
319 parser_kwargs.update(extra_parser_kwargs)
320 parser_kwargs['convert_charrefs'] = False
321 self.parser_args = (parser_args, parser_kwargs)
322
141 def prepare_markup(self, markup, user_specified_encoding=None, 323 def prepare_markup(self, markup, user_specified_encoding=None,
142 document_declared_encoding=None, exclude_encodings=None): 324 document_declared_encoding=None, exclude_encodings=None):
143 """ 325
144 :return: A 4-tuple (markup, original encoding, encoding 326 """Run any preliminary steps necessary to make incoming markup
145 declared within markup, whether any characters had to be 327 acceptable to the parser.
146 replaced with REPLACEMENT CHARACTER). 328
329 :param markup: Some markup -- probably a bytestring.
330 :param user_specified_encoding: The user asked to try this encoding.
331 :param document_declared_encoding: The markup itself claims to be
332 in this encoding.
333 :param exclude_encodings: The user asked _not_ to try any of
334 these encodings.
335
336 :yield: A series of 4-tuples:
337 (markup, encoding, declared encoding,
338 has undergone character replacement)
339
340 Each 4-tuple represents a strategy for converting the
341 document to Unicode and parsing it. Each strategy will be tried
342 in turn.
147 """ 343 """
148 if isinstance(markup, str): 344 if isinstance(markup, str):
345 # Parse Unicode as-is.
149 yield (markup, None, None, False) 346 yield (markup, None, None, False)
150 return 347 return
151 348
349 # Ask UnicodeDammit to sniff the most likely encoding.
350
351 # This was provided by the end-user; treat it as a known
352 # definite encoding per the algorithm laid out in the HTML5
353 # spec. (See the EncodingDetector class for details.)
354 known_definite_encodings = [user_specified_encoding]
355
356 # This was found in the document; treat it as a slightly lower-priority
357 # user encoding.
358 user_encodings = [document_declared_encoding]
359
152 try_encodings = [user_specified_encoding, document_declared_encoding] 360 try_encodings = [user_specified_encoding, document_declared_encoding]
153 dammit = UnicodeDammit(markup, try_encodings, is_html=True, 361 dammit = UnicodeDammit(
154 exclude_encodings=exclude_encodings) 362 markup,
363 known_definite_encodings=known_definite_encodings,
364 user_encodings=user_encodings,
365 is_html=True,
366 exclude_encodings=exclude_encodings
367 )
155 yield (dammit.markup, dammit.original_encoding, 368 yield (dammit.markup, dammit.original_encoding,
156 dammit.declared_html_encoding, 369 dammit.declared_html_encoding,
157 dammit.contains_replacement_characters) 370 dammit.contains_replacement_characters)
158 371
159 def feed(self, markup): 372 def feed(self, markup):
373 """Run some incoming markup through some parsing process,
374 populating the `BeautifulSoup` object in self.soup.
375 """
160 args, kwargs = self.parser_args 376 args, kwargs = self.parser_args
161 parser = BeautifulSoupHTMLParser(*args, **kwargs) 377 parser = BeautifulSoupHTMLParser(*args, **kwargs)
162 parser.soup = self.soup 378 parser.soup = self.soup
163 try: 379 try:
164 parser.feed(markup) 380 parser.feed(markup)
165 except HTMLParseError as e: 381 parser.close()
166 warnings.warn(RuntimeWarning( 382 except AssertionError as e:
167 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 383 # html.parser raises AssertionError in rare cases to
168 raise e 384 # indicate a fatal problem with the markup, especially
169 385 # when there's an error in the doctype declaration.
170# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 386 raise ParserRejectedMarkup(e)
171# 3.2.3 code. This ensures they don't treat markup like <p></p> as a 387 parser.already_closed_empty_element = []
172# string.
173#
174# XXX This code can be removed once most Python 3 users are on 3.2.3.
175if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
176 import re
177 attrfind_tolerant = re.compile(
178 r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
179 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
180 HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
181
182 locatestarttagend = re.compile(r"""
183 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
184 (?:\s+ # whitespace before attribute name
185 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
186 (?:\s*=\s* # value indicator
187 (?:'[^']*' # LITA-enclosed value
188 |\"[^\"]*\" # LIT-enclosed value
189 |[^'\">\s]+ # bare value
190 )
191 )?
192 )
193 )*
194 \s* # trailing whitespace
195""", re.VERBOSE)
196 BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
197
198 from html.parser import tagfind, attrfind
199
200 def parse_starttag(self, i):
201 self.__starttag_text = None
202 endpos = self.check_for_whole_start_tag(i)
203 if endpos < 0:
204 return endpos
205 rawdata = self.rawdata
206 self.__starttag_text = rawdata[i:endpos]
207
208 # Now parse the data between i+1 and j into a tag and attrs
209 attrs = []
210 match = tagfind.match(rawdata, i+1)
211 assert match, 'unexpected call to parse_starttag()'
212 k = match.end()
213 self.lasttag = tag = rawdata[i+1:k].lower()
214 while k < endpos:
215 if self.strict:
216 m = attrfind.match(rawdata, k)
217 else:
218 m = attrfind_tolerant.match(rawdata, k)
219 if not m:
220 break
221 attrname, rest, attrvalue = m.group(1, 2, 3)
222 if not rest:
223 attrvalue = None
224 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
225 attrvalue[:1] == '"' == attrvalue[-1:]:
226 attrvalue = attrvalue[1:-1]
227 if attrvalue:
228 attrvalue = self.unescape(attrvalue)
229 attrs.append((attrname.lower(), attrvalue))
230 k = m.end()
231
232 end = rawdata[k:endpos].strip()
233 if end not in (">", "/>"):
234 lineno, offset = self.getpos()
235 if "\n" in self.__starttag_text:
236 lineno = lineno + self.__starttag_text.count("\n")
237 offset = len(self.__starttag_text) \
238 - self.__starttag_text.rfind("\n")
239 else:
240 offset = offset + len(self.__starttag_text)
241 if self.strict:
242 self.error("junk characters in start tag: %r"
243 % (rawdata[k:endpos][:20],))
244 self.handle_data(rawdata[i:endpos])
245 return endpos
246 if end.endswith('/>'):
247 # XHTML-style empty tag: <span attr="value" />
248 self.handle_startendtag(tag, attrs)
249 else:
250 self.handle_starttag(tag, attrs)
251 if tag in self.CDATA_CONTENT_ELEMENTS:
252 self.set_cdata_mode(tag)
253 return endpos
254
255 def set_cdata_mode(self, elem):
256 self.cdata_elem = elem.lower()
257 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
258
259 BeautifulSoupHTMLParser.parse_starttag = parse_starttag
260 BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
261
262 CONSTRUCTOR_TAKES_STRICT = True
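To see the reworked contract in action: prepare_markup() now yields one conversion-and-parse strategy at a time, and feed() signals a fatal parse problem by raising ParserRejectedMarkup so the caller can move on to the next strategy. A minimal sketch against the updated builder (the byte string and encoding are illustrative, and assume the bundled bs4 4.x API):

    from bs4.builder import HTMLParserTreeBuilder

    builder = HTMLParserTreeBuilder()
    markup = b'<meta charset="latin-1"><p>caf\xe9</p>'

    # Each 4-tuple is one strategy: (markup, encoding, declared
    # encoding, whether characters were replaced). BeautifulSoup
    # tries them in order until one parses without being rejected.
    for data, encoding, declared, replaced in builder.prepare_markup(
            markup, user_specified_encoding="latin-1"):
        print(encoding, declared, replaced)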
diff --git a/bitbake/lib/bs4/builder/_lxml.py b/bitbake/lib/bs4/builder/_lxml.py
index 9c6c14ee65..4f7cf74681 100644
--- a/bitbake/lib/bs4/builder/_lxml.py
+++ b/bitbake/lib/bs4/builder/_lxml.py
@@ -1,19 +1,28 @@
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
1__all__ = [ 4__all__ = [
2 'LXMLTreeBuilderForXML', 5 'LXMLTreeBuilderForXML',
3 'LXMLTreeBuilder', 6 'LXMLTreeBuilder',
4 ] 7 ]
5 8
9try:
10 from collections.abc import Callable # Python 3.6
11except ImportError as e:
12 from collections import Callable
13
6from io import BytesIO 14from io import BytesIO
7from io import StringIO 15from io import StringIO
8import collections
9from lxml import etree 16from lxml import etree
10from bs4.element import ( 17from bs4.element import (
11 Comment, 18 Comment,
12 Doctype, 19 Doctype,
13 NamespacedAttribute, 20 NamespacedAttribute,
14 ProcessingInstruction, 21 ProcessingInstruction,
22 XMLProcessingInstruction,
15) 23)
16from bs4.builder import ( 24from bs4.builder import (
25 DetectsXMLParsedAsHTML,
17 FAST, 26 FAST,
18 HTML, 27 HTML,
19 HTMLTreeBuilder, 28 HTMLTreeBuilder,
@@ -25,10 +34,15 @@ from bs4.dammit import EncodingDetector
25 34
26LXML = 'lxml' 35LXML = 'lxml'
27 36
37def _invert(d):
38 "Invert a dictionary."
39 return dict((v,k) for k, v in list(d.items()))
40
28class LXMLTreeBuilderForXML(TreeBuilder): 41class LXMLTreeBuilderForXML(TreeBuilder):
29 DEFAULT_PARSER_CLASS = etree.XMLParser 42 DEFAULT_PARSER_CLASS = etree.XMLParser
30 43
31 is_xml = True 44 is_xml = True
45 processing_instruction_class = XMLProcessingInstruction
32 46
33 NAME = "lxml-xml" 47 NAME = "lxml-xml"
34 ALTERNATE_NAMES = ["xml"] 48 ALTERNATE_NAMES = ["xml"]
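The new _invert() helper flips a prefix-to-URI mapping into the URI-to-prefix direction that the lxml event handlers below need when resolving fully-qualified tag names:

    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    def _invert(d):
        "Invert a dictionary."
        return dict((v, k) for k, v in list(d.items()))

    print(_invert(DEFAULT_NSMAPS))
    # {'http://www.w3.org/XML/1998/namespace': 'xml'}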
@@ -40,26 +54,79 @@ class LXMLTreeBuilderForXML(TreeBuilder):
40 54
41 # This namespace mapping is specified in the XML Namespace 55 # This namespace mapping is specified in the XML Namespace
42 # standard. 56 # standard.
43 DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 57 DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
58
59 DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
60
61 # NOTE: If we parsed Element objects and looked at .sourceline,
62 # we'd be able to see the line numbers from the original document.
63 # But instead we build an XMLParser or HTMLParser object to serve
64 # as the target of parse messages, and those messages don't include
65 # line numbers.
66 # See: https://bugs.launchpad.net/lxml/+bug/1846906
67
68 def initialize_soup(self, soup):
69 """Let the BeautifulSoup object know about the standard namespace
70 mapping.
71
72 :param soup: A `BeautifulSoup`.
73 """
74 super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
75 self._register_namespaces(self.DEFAULT_NSMAPS)
76
77 def _register_namespaces(self, mapping):
78 """Let the BeautifulSoup object know about namespaces encountered
79 while parsing the document.
80
81 This might be useful later on when creating CSS selectors.
82
83 This will track (almost) all namespaces, even ones that were
84 only in scope for part of the document. If two namespaces have
85 the same prefix, only the first one encountered will be
86 tracked. Un-prefixed namespaces are not tracked.
44 87
88 :param mapping: A dictionary mapping namespace prefixes to URIs.
89 """
90 for key, value in list(mapping.items()):
91 # This is 'if key' and not 'if key is not None' because we
92 # don't track un-prefixed namespaces. Soupselect will
93 # treat an un-prefixed namespace as the default, which
94 # causes confusion in some cases.
95 if key and key not in self.soup._namespaces:
96 # Let the BeautifulSoup object know about a new namespace.
97 # If there are multiple namespaces defined with the same
98 # prefix, the first one in the document takes precedence.
99 self.soup._namespaces[key] = value
100
45 def default_parser(self, encoding): 101 def default_parser(self, encoding):
46 # This can either return a parser object or a class, which 102 """Find the default parser for the given encoding.
47 # will be instantiated with default arguments. 103
104 :param encoding: A string.
105 :return: Either a parser object or a class, which
106 will be instantiated with default arguments.
107 """
48 if self._default_parser is not None: 108 if self._default_parser is not None:
49 return self._default_parser 109 return self._default_parser
50 return etree.XMLParser( 110 return etree.XMLParser(
51 target=self, strip_cdata=False, recover=True, encoding=encoding) 111 target=self, strip_cdata=False, recover=True, encoding=encoding)
52 112
53 def parser_for(self, encoding): 113 def parser_for(self, encoding):
114 """Instantiate an appropriate parser for the given encoding.
115
116 :param encoding: A string.
117 :return: A parser object such as an `etree.XMLParser`.
118 """
54 # Use the default parser. 119 # Use the default parser.
55 parser = self.default_parser(encoding) 120 parser = self.default_parser(encoding)
56 121
57 if isinstance(parser, collections.Callable): 122 if isinstance(parser, Callable):
58 # Instantiate the parser with default arguments 123 # Instantiate the parser with default arguments
59 parser = parser(target=self, strip_cdata=False, encoding=encoding) 124 parser = parser(
125 target=self, strip_cdata=False, recover=True, encoding=encoding
126 )
60 return parser 127 return parser
61 128
62 def __init__(self, parser=None, empty_element_tags=None): 129 def __init__(self, parser=None, empty_element_tags=None, **kwargs):
63 # TODO: Issue a warning if parser is present but not a 130 # TODO: Issue a warning if parser is present but not a
64 # callable, since that means there's no way to create new 131 # callable, since that means there's no way to create new
65 # parsers for different encodings. 132 # parsers for different encodings.
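For context, default_parser() and parser_for() both build on lxml's target-parser protocol: the builder hands itself to etree.XMLParser as the target, and lxml reports parse events as method calls. A self-contained sketch of that protocol, independent of Beautiful Soup (the Target class and its printed output are illustrative only):

    from lxml import etree

    class Target:
        # lxml delivers events as method calls; tag names arrive
        # fully qualified, e.g. '{urn:b}c', which is what
        # _getNsTag() splits apart in the builder.
        def start(self, tag, attrib, nsmap={}):
            print("start", tag, dict(attrib), dict(nsmap))
        def end(self, tag):
            print("end", tag)
        def data(self, content):
            print("data", content)
        def close(self):
            return "done"

    parser = etree.XMLParser(target=Target(), strip_cdata=False, recover=True)
    # With a target parser, fromstring() returns whatever close() returns.
    print(etree.fromstring('<a xmlns:b="urn:b"><b:c>hi</b:c></a>', parser))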
@@ -67,8 +134,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
67 if empty_element_tags is not None: 134 if empty_element_tags is not None:
68 self.empty_element_tags = set(empty_element_tags) 135 self.empty_element_tags = set(empty_element_tags)
69 self.soup = None 136 self.soup = None
70 self.nsmaps = [self.DEFAULT_NSMAPS] 137 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
71 138 self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
139 super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
140
72 def _getNsTag(self, tag): 141 def _getNsTag(self, tag):
73 # Split the namespace URL out of a fully-qualified lxml tag 142 # Split the namespace URL out of a fully-qualified lxml tag
74 # name. Copied from lxml's src/lxml/sax.py. 143 # name. Copied from lxml's src/lxml/sax.py.
@@ -80,16 +149,51 @@ class LXMLTreeBuilderForXML(TreeBuilder):
80 def prepare_markup(self, markup, user_specified_encoding=None, 149 def prepare_markup(self, markup, user_specified_encoding=None,
81 exclude_encodings=None, 150 exclude_encodings=None,
82 document_declared_encoding=None): 151 document_declared_encoding=None):
83 """ 152 """Run any preliminary steps necessary to make incoming markup
84 :yield: A series of 4-tuples. 153 acceptable to the parser.
154
155 lxml really wants to get a bytestring and convert it to
156 Unicode itself. So instead of using UnicodeDammit to convert
157 the bytestring to Unicode using different encodings, this
158 implementation uses EncodingDetector to iterate over the
159 encodings, and tell lxml to try to parse the document as each
160 one in turn.
161
162 :param markup: Some markup -- hopefully a bytestring.
163 :param user_specified_encoding: The user asked to try this encoding.
164 :param document_declared_encoding: The markup itself claims to be
165 in this encoding.
166 :param exclude_encodings: The user asked _not_ to try any of
167 these encodings.
168
169 :yield: A series of 4-tuples:
85 (markup, encoding, declared encoding, 170 (markup, encoding, declared encoding,
86 has undergone character replacement) 171 has undergone character replacement)
87 172
88 Each 4-tuple represents a strategy for parsing the document. 173 Each 4-tuple represents a strategy for converting the
174 document to Unicode and parsing it. Each strategy will be tried
175 in turn.
89 """ 176 """
177 is_html = not self.is_xml
178 if is_html:
179 self.processing_instruction_class = ProcessingInstruction
180 # We're in HTML mode, so if we're given XML, that's worth
181 # noting.
182 DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
183 markup, stacklevel=3
184 )
185 else:
186 self.processing_instruction_class = XMLProcessingInstruction
187
90 if isinstance(markup, str): 188 if isinstance(markup, str):
91 # We were given Unicode. Maybe lxml can parse Unicode on 189 # We were given Unicode. Maybe lxml can parse Unicode on
92 # this system? 190 # this system?
191
192 # TODO: This is a workaround for
193 # https://bugs.launchpad.net/lxml/+bug/1948551.
194 # We can remove it once the upstream issue is fixed.
195 if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
196 markup = markup[1:]
93 yield markup, None, document_declared_encoding, False 197 yield markup, None, document_declared_encoding, False
94 198
95 if isinstance(markup, str): 199 if isinstance(markup, str):
@@ -98,14 +202,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
98 yield (markup.encode("utf8"), "utf8", 202 yield (markup.encode("utf8"), "utf8",
99 document_declared_encoding, False) 203 document_declared_encoding, False)
100 204
101 # Instead of using UnicodeDammit to convert the bytestring to 205 # This was provided by the end-user; treat it as a known
102 # Unicode using different encodings, use EncodingDetector to 206 # definite encoding per the algorithm laid out in the HTML5
103 # iterate over the encodings, and tell lxml to try to parse 207 # spec. (See the EncodingDetector class for details.)
104 # the document as each one in turn. 208 known_definite_encodings = [user_specified_encoding]
105 is_html = not self.is_xml 209
106 try_encodings = [user_specified_encoding, document_declared_encoding] 210 # This was found in the document; treat it as a slightly lower-priority
211 # user encoding.
212 user_encodings = [document_declared_encoding]
107 detector = EncodingDetector( 213 detector = EncodingDetector(
108 markup, try_encodings, is_html, exclude_encodings) 214 markup, known_definite_encodings=known_definite_encodings,
215 user_encodings=user_encodings, is_html=is_html,
216 exclude_encodings=exclude_encodings
217 )
109 for encoding in detector.encodings: 218 for encoding in detector.encodings:
110 yield (detector.markup, encoding, document_declared_encoding, False) 219 yield (detector.markup, encoding, document_declared_encoding, False)
111 220
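The net effect of this hunk is that lxml gets one parse attempt per candidate encoding rather than one pre-decoded Unicode string. A rough sketch of what the generator yields for a bytestring (requires lxml; the encodings are illustrative):

    from bs4.builder import LXMLTreeBuilderForXML

    builder = LXMLTreeBuilderForXML()
    strategies = builder.prepare_markup(
        b'<?xml version="1.0" encoding="iso-8859-1"?><r>caf\xe9</r>',
        user_specified_encoding="utf-8")
    # The known definite encoding comes first, then the encoding
    # declared in the document, then fallbacks such as windows-1252.
    for markup, encoding, declared, replaced in strategies:
        print(encoding)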
@@ -128,25 +237,45 @@ class LXMLTreeBuilderForXML(TreeBuilder):
128 self.parser.feed(data) 237 self.parser.feed(data)
129 self.parser.close() 238 self.parser.close()
130 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 239 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
131 raise ParserRejectedMarkup(str(e)) 240 raise ParserRejectedMarkup(e)
132 241
133 def close(self): 242 def close(self):
134 self.nsmaps = [self.DEFAULT_NSMAPS] 243 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
135 244
136 def start(self, name, attrs, nsmap={}): 245 def start(self, name, attrs, nsmap={}):
137 # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 246 # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
138 attrs = dict(attrs) 247 attrs = dict(attrs)
139 nsprefix = None 248 nsprefix = None
140 # Invert each namespace map as it comes in. 249 # Invert each namespace map as it comes in.
141 if len(self.nsmaps) > 1: 250 if len(nsmap) == 0 and len(self.nsmaps) > 1:
142 # There are no new namespaces for this tag, but 251 # There are no new namespaces for this tag, but
143 # non-default namespaces are in play, so we need a 252 # non-default namespaces are in play, so we need a
144 # separate tag stack to know when they end. 253 # separate tag stack to know when they end.
145 self.nsmaps.append(None) 254 self.nsmaps.append(None)
146 elif len(nsmap) > 0: 255 elif len(nsmap) > 0:
147 # A new namespace mapping has come into play. 256 # A new namespace mapping has come into play.
148 inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 257
149 self.nsmaps.append(inverted_nsmap) 258 # First, let the BeautifulSoup object know about it.

259 self._register_namespaces(nsmap)
260
261 # Then, add it to our running list of inverted namespace
262 # mappings.
263 self.nsmaps.append(_invert(nsmap))
264
265 # The currently active namespace prefixes have
266 # changed. Calculate the new mapping so it can be stored
267 # with all Tag objects created while these prefixes are in
268 # scope.
269 current_mapping = dict(self.active_namespace_prefixes[-1])
270 current_mapping.update(nsmap)
271
272 # We should not track un-prefixed namespaces as we can only hold one
273 # and it will be recognized as the default namespace by soupsieve,
274 # which may be confusing in some situations.
275 if '' in current_mapping:
276 del current_mapping['']
277 self.active_namespace_prefixes.append(current_mapping)
278
150 # Also treat the namespace mapping as a set of attributes on the 279 # Also treat the namespace mapping as a set of attributes on the
151 # tag, so we can recreate it later. 280 # tag, so we can recreate it later.
152 attrs = attrs.copy() 281 attrs = attrs.copy()
@@ -171,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
171 300
172 namespace, name = self._getNsTag(name) 301 namespace, name = self._getNsTag(name)
173 nsprefix = self._prefix_for_namespace(namespace) 302 nsprefix = self._prefix_for_namespace(namespace)
174 self.soup.handle_starttag(name, namespace, nsprefix, attrs) 303 self.soup.handle_starttag(
175 304 name, namespace, nsprefix, attrs,
305 namespaces=self.active_namespace_prefixes[-1]
306 )
307
176 def _prefix_for_namespace(self, namespace): 308 def _prefix_for_namespace(self, namespace):
177 """Find the currently active prefix for the given namespace.""" 309 """Find the currently active prefix for the given namespace."""
178 if namespace is None: 310 if namespace is None:
@@ -196,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
196 if len(self.nsmaps) > 1: 328 if len(self.nsmaps) > 1:
197 # This tag, or one of its parents, introduced a namespace 329 # This tag, or one of its parents, introduced a namespace
198 # mapping, so pop it off the stack. 330 # mapping, so pop it off the stack.
199 self.nsmaps.pop() 331 out_of_scope_nsmap = self.nsmaps.pop()
200 332
333 if out_of_scope_nsmap is not None:
334 # This tag introduced a namespace mapping which is no
335 # longer in scope. Recalculate the currently active
336 # namespace prefixes.
337 self.active_namespace_prefixes.pop()
338
201 def pi(self, target, data): 339 def pi(self, target, data):
202 self.soup.endData() 340 self.soup.endData()
203 self.soup.handle_data(target + ' ' + data) 341 data = target + ' ' + data
204 self.soup.endData(ProcessingInstruction) 342 self.soup.handle_data(data)
205 343 self.soup.endData(self.processing_instruction_class)
344
206 def data(self, content): 345 def data(self, content):
207 self.soup.handle_data(content) 346 self.soup.handle_data(content)
208 347
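Taken together, _register_namespaces(), start() and end() maintain a stack of namespace mappings whose net result is visible on the finished soup. A small sketch (requires lxml; _namespaces is an internal attribute, shown here only to illustrate the bookkeeping):

    from bs4 import BeautifulSoup

    doc = ('<root xmlns:dc="http://purl.org/dc/elements/1.1/">'
           '<dc:title>x</dc:title></root>')
    soup = BeautifulSoup(doc, "xml")
    # Prefixed namespaces seen during the parse are remembered;
    # un-prefixed (default) namespaces are deliberately not tracked.
    print(soup._namespaces)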
@@ -229,6 +368,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
229 368
230 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 369 features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
231 is_xml = False 370 is_xml = False
371 processing_instruction_class = ProcessingInstruction
232 372
233 def default_parser(self, encoding): 373 def default_parser(self, encoding):
234 return etree.HTMLParser 374 return etree.HTMLParser
@@ -240,7 +380,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
240 self.parser.feed(markup) 380 self.parser.feed(markup)
241 self.parser.close() 381 self.parser.close()
242 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 382 except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
243 raise ParserRejectedMarkup(str(e)) 383 raise ParserRejectedMarkup(e)
244 384
245 385
246 def test_fragment_to_document(self, fragment): 386 def test_fragment_to_document(self, fragment):
diff --git a/bitbake/lib/bs4/css.py b/bitbake/lib/bs4/css.py
new file mode 100644
index 0000000000..cd1fd2df88
--- /dev/null
+++ b/bitbake/lib/bs4/css.py
@@ -0,0 +1,274 @@
1"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
2
3# We don't use soupsieve
4soupsieve = None
5
6
7class CSS(object):
8 """A proxy object against the soupsieve library, to simplify its
9 CSS selector API.
10
11 Acquire this object through the .css attribute on the
12 BeautifulSoup object, or on the Tag you want to use as the
13 starting point for a CSS selector.
14
15 The main advantage of doing this is that the tag to be selected
16 against doesn't need to be explicitly specified in the function
17 calls, since it's already scoped to a tag.
18 """
19
20 def __init__(self, tag, api=soupsieve):
21 """Constructor.
22
23 You don't need to instantiate this class yourself; instead,
24 access the .css attribute on the BeautifulSoup object, or on
25 the Tag you want to use as the starting point for your CSS
26 selector.
27
28 :param tag: All CSS selectors will use this as their starting
29 point.
30
31 :param api: A plug-in replacement for the soupsieve module,
32 designed mainly for use in tests.
33 """
34 if api is None:
35 raise NotImplementedError(
36 "Cannot execute CSS selectors because the soupsieve package is not installed."
37 )
38 self.api = api
39 self.tag = tag
40
41 def escape(self, ident):
42 """Escape a CSS identifier.
43
44 This is a simple wrapper around soupsieve.escape(). See the
45 documentation for that function for more information.
46 """
47 if soupsieve is None:
48 raise NotImplementedError(
49 "Cannot escape CSS identifiers because the soupsieve package is not installed."
50 )
51 return self.api.escape(ident)
52
53 def _ns(self, ns, select):
54 """Normalize a dictionary of namespaces."""
55 if not isinstance(select, self.api.SoupSieve) and ns is None:
56 # If the selector is a precompiled pattern, it already has
57 # a namespace context compiled in, which cannot be
58 # replaced.
59 ns = self.tag._namespaces
60 return ns
61
62 def _rs(self, results):
63 """Normalize a list of results to a Resultset.
64
65 A ResultSet is more consistent with the rest of Beautiful
66 Soup's API, and ResultSet.__getattr__ has a helpful error
67 message if you try to treat a list of results as a single
68 result (a common mistake).
69 """
70 # Import here to avoid circular import
71 from bs4.element import ResultSet
72 return ResultSet(None, results)
73
74 def compile(self, select, namespaces=None, flags=0, **kwargs):
75 """Pre-compile a selector and return the compiled object.
76
77 :param selector: A CSS selector.
78
79 :param namespaces: A dictionary mapping namespace prefixes
80 used in the CSS selector to namespace URIs. By default,
81 Beautiful Soup will use the prefixes it encountered while
82 parsing the document.
83
84 :param flags: Flags to be passed into Soup Sieve's
85 soupsieve.compile() method.
86
87 :param kwargs: Keyword arguments to be passed into SoupSieve's
88 soupsieve.compile() method.
89
90 :return: A precompiled selector object.
91 :rtype: soupsieve.SoupSieve
92 """
93 return self.api.compile(
94 select, self._ns(namespaces, select), flags, **kwargs
95 )
96
97 def select_one(self, select, namespaces=None, flags=0, **kwargs):
98 """Perform a CSS selection operation on the current Tag and return the
99 first result.
100
101 This uses the Soup Sieve library. For more information, see
102 that library's documentation for the soupsieve.select_one()
103 method.
104
105 :param selector: A CSS selector.
106
107 :param namespaces: A dictionary mapping namespace prefixes
108 used in the CSS selector to namespace URIs. By default,
109 Beautiful Soup will use the prefixes it encountered while
110 parsing the document.
111
112 :param flags: Flags to be passed into Soup Sieve's
113 soupsieve.select_one() method.
114
115 :param kwargs: Keyword arguments to be passed into SoupSieve's
116 soupsieve.select_one() method.
117
118 :return: A Tag, or None if the selector has no match.
119 :rtype: bs4.element.Tag
120
121 """
122 return self.api.select_one(
123 select, self.tag, self._ns(namespaces, select), flags, **kwargs
124 )
125
126 def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
127 """Perform a CSS selection operation on the current Tag.
128
129 This uses the Soup Sieve library. For more information, see
130 that library's documentation for the soupsieve.select()
131 method.
132
133 :param selector: A string containing a CSS selector.
134
135 :param namespaces: A dictionary mapping namespace prefixes
136 used in the CSS selector to namespace URIs. By default,
137 Beautiful Soup will pass in the prefixes it encountered while
138 parsing the document.
139
140 :param limit: After finding this number of results, stop looking.
141
142 :param flags: Flags to be passed into Soup Sieve's
143 soupsieve.select() method.
144
145 :param kwargs: Keyword arguments to be passed into SoupSieve's
146 soupsieve.select() method.
147
148 :return: A ResultSet of Tag objects.
149 :rtype: bs4.element.ResultSet
150
151 """
152 if limit is None:
153 limit = 0
154
155 return self._rs(
156 self.api.select(
157 select, self.tag, self._ns(namespaces, select), limit, flags,
158 **kwargs
159 )
160 )
161
162 def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
163 """Perform a CSS selection operation on the current Tag.
164
165 This uses the Soup Sieve library. For more information, see
166 that library's documentation for the soupsieve.iselect()
167 method. It is the same as select(), but it returns a generator
168 instead of a list.
169
170 :param selector: A string containing a CSS selector.
171
172 :param namespaces: A dictionary mapping namespace prefixes
173 used in the CSS selector to namespace URIs. By default,
174 Beautiful Soup will pass in the prefixes it encountered while
175 parsing the document.
176
177 :param limit: After finding this number of results, stop looking.
178
179 :param flags: Flags to be passed into Soup Sieve's
180 soupsieve.iselect() method.
181
182 :param kwargs: Keyword arguments to be passed into SoupSieve's
183 soupsieve.iselect() method.
184
185 :return: A generator
186 :rtype: types.GeneratorType
187 """
188 return self.api.iselect(
189 select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
190 )
191
192 def closest(self, select, namespaces=None, flags=0, **kwargs):
193 """Find the Tag closest to this one that matches the given selector.
194
195 This uses the Soup Sieve library. For more information, see
196 that library's documentation for the soupsieve.closest()
197 method.
198
199 :param selector: A string containing a CSS selector.
200
201 :param namespaces: A dictionary mapping namespace prefixes
202 used in the CSS selector to namespace URIs. By default,
203 Beautiful Soup will pass in the prefixes it encountered while
204 parsing the document.
205
206 :param flags: Flags to be passed into Soup Sieve's
207 soupsieve.closest() method.
208
209 :param kwargs: Keyword arguments to be passed into SoupSieve's
210 soupsieve.closest() method.
211
212 :return: A Tag, or None if there is no match.
213 :rtype: bs4.Tag
214
215 """
216 return self.api.closest(
217 select, self.tag, self._ns(namespaces, select), flags, **kwargs
218 )
219
220 def match(self, select, namespaces=None, flags=0, **kwargs):
221 """Check whether this Tag matches the given CSS selector.
222
223 This uses the Soup Sieve library. For more information, see
224 that library's documentation for the soupsieve.match()
225 method.
226
227 :param select: A CSS selector.
228
229 :param namespaces: A dictionary mapping namespace prefixes
230 used in the CSS selector to namespace URIs. By default,
231 Beautiful Soup will pass in the prefixes it encountered while
232 parsing the document.
233
234 :param flags: Flags to be passed into Soup Sieve's
235 soupsieve.match() method.
236
237 :param kwargs: Keyword arguments to be passed into SoupSieve's
238 soupsieve.match() method.
239
240 :return: True if this Tag matches the selector; False otherwise.
241 :rtype: bool
242 """
243 return self.api.match(
244 select, self.tag, self._ns(namespaces, select), flags, **kwargs
245 )
246
247 def filter(self, select, namespaces=None, flags=0, **kwargs):
248 """Filter this Tag's direct children based on the given CSS selector.
249
250 This uses the Soup Sieve library. It works the same way as
251 passing this Tag into that library's soupsieve.filter()
252 method. For more information, see the
253 documentation for soupsieve.filter().
254
255 :param namespaces: A dictionary mapping namespace prefixes
256 used in the CSS selector to namespace URIs. By default,
257 Beautiful Soup will pass in the prefixes it encountered while
258 parsing the document.
259
260 :param flags: Flags to be passed into Soup Sieve's
261 soupsieve.filter() method.
262
263 :param kwargs: Keyword arguments to be passed into SoupSieve's
264 soupsieve.filter() method.
265
266 :return: A ResultSet of Tag objects.
267 :rtype: bs4.element.ResultSet
268
269 """
270 return self._rs(
271 self.api.filter(
272 select, self.tag, self._ns(namespaces, select), flags, **kwargs
273 )
274 )
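Since this bundled copy pins soupsieve to None, the CSS proxy exists only to fail loudly rather than break with an AttributeError deep inside a selector call. What a caller should expect from this in-tree copy:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p class='a'>x</p>", "html.parser")
    try:
        soup.css.select("p.a")
    except NotImplementedError as e:
        # Raised as soon as the proxy is constructed: "Cannot execute
        # CSS selectors because the soupsieve package is not installed."
        print(e)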
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py
index 7ad9e0dd1e..692433c57a 100644
--- a/bitbake/lib/bs4/dammit.py
+++ b/bitbake/lib/bs4/dammit.py
@@ -6,61 +6,185 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the 6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job. 7XML or HTML to reflect a new encoding; that's the tree builder's job.
8""" 8"""
9# Use of this source code is governed by the MIT license.
9__license__ = "MIT" 10__license__ = "MIT"
10 11
11import codecs
12from html.entities import codepoint2name 12from html.entities import codepoint2name
13from collections import defaultdict
14import codecs
13import re 15import re
14import logging 16import logging
15 17import string
16# Import a library to autodetect character encodings. 18
17chardet_type = None 19# Import a library to autodetect character encodings. We'll support
20# any of a number of libraries that all support the same API:
21#
22# * cchardet
23# * chardet
24# * charset-normalizer
25chardet_module = None
18try: 26try:
19 # First try the fast C implementation.
20 # PyPI package: cchardet 27 # PyPI package: cchardet
21 import cchardet 28 import cchardet as chardet_module
22 def chardet_dammit(s):
23 return cchardet.detect(s)['encoding']
24except ImportError: 29except ImportError:
25 try: 30 try:
26 # Fall back to the pure Python implementation
27 # Debian package: python-chardet 31 # Debian package: python-chardet
28 # PyPI package: chardet 32 # PyPI package: chardet
29 import chardet 33 import chardet as chardet_module
30 def chardet_dammit(s):
31 return chardet.detect(s)['encoding']
32 #import chardet.constants
33 #chardet.constants._debug = 1
34 except ImportError: 34 except ImportError:
35 # No chardet available. 35 try:
36 def chardet_dammit(s): 36 # PyPI package: charset-normalizer
37 import charset_normalizer as chardet_module
38 except ImportError:
39 # No chardet available.
40 chardet_module = None
41
42if chardet_module:
43 def chardet_dammit(s):
44 if isinstance(s, str):
37 return None 45 return None
46 return chardet_module.detect(s)['encoding']
47else:
48 def chardet_dammit(s):
49 return None
38 50
39xml_encoding_re = re.compile( 51# Build bytestring and Unicode versions of regular expressions for finding
40 r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) 52# a declared encoding inside an XML or HTML document.
41html_meta_re = re.compile( 53xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
42 r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) 54html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
55encoding_res = dict()
56encoding_res[bytes] = {
57 'html' : re.compile(html_meta.encode("ascii"), re.I),
58 'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
59}
60encoding_res[str] = {
61 'html' : re.compile(html_meta, re.I),
62 'xml' : re.compile(xml_encoding, re.I)
63}
64
65from html.entities import html5
43 66
44class EntitySubstitution(object): 67class EntitySubstitution(object):
45 68 """The ability to substitute XML or HTML entities for certain characters."""
46 """Substitute XML or HTML entities for the corresponding characters."""
47 69
48 def _populate_class_variables(): 70 def _populate_class_variables():
49 lookup = {} 71 """Initialize variables used by this class to manage the plethora of
50 reverse_lookup = {} 72 HTML5 named entities.
51 characters_for_re = [] 73
74 This function returns a 3-tuple containing two dictionaries
75 and a regular expression:
76
77 unicode_to_name - A mapping of Unicode strings like "⦨" to
78 entity names like "angmsdaa". When a single Unicode string has
79 multiple entity names, we try to choose the most commonly-used
80 name.
81
82 name_to_unicode: A mapping of entity names like "angmsdaa" to
83 Unicode strings like "⦨".
84
85 named_entity_re: A regular expression matching (almost) any
86 Unicode string that corresponds to an HTML5 named entity.
87 """
88 unicode_to_name = {}
89 name_to_unicode = {}
90
91 short_entities = set()
92 long_entities_by_first_character = defaultdict(set)
93
94 for name_with_semicolon, character in sorted(html5.items()):
95 # "It is intentional, for legacy compatibility, that many
96 # code points have multiple character reference names. For
97 # example, some appear both with and without the trailing
98 # semicolon, or with different capitalizations."
99 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
100 #
101 # The parsers are in charge of handling (or not) character
102 # references with no trailing semicolon, so we remove the
103 # semicolon whenever it appears.
104 if name_with_semicolon.endswith(';'):
105 name = name_with_semicolon[:-1]
106 else:
107 name = name_with_semicolon
108
109 # When parsing HTML, we want to recognize any known named
110 # entity and convert it to a sequence of Unicode
111 # characters.
112 if name not in name_to_unicode:
113 name_to_unicode[name] = character
114
115 # When _generating_ HTML, we want to recognize special
116 # character sequences that _could_ be converted to named
117 # entities.
118 unicode_to_name[character] = name
119
120 # We also need to build a regular expression that lets us
121 # _find_ those characters in output strings so we can
122 # replace them.
123 #
124 # This is tricky, for two reasons.
125
126 if (len(character) == 1 and ord(character) < 128
127 and character not in '<>&'):
128 # First, it would be annoying to turn single ASCII
129 # characters like | into named entities like
130 # &verbar;. The exceptions are <>&, which we _must_
131 # turn into named entities to produce valid HTML.
132 continue
133
134 if len(character) > 1 and all(ord(x) < 128 for x in character):
135 # We also do not want to turn _combinations_ of ASCII
136 # characters like 'fj' into named entities like '&fjlig;',
137 # though that's more debatable.
138 continue
139
140 # Second, some named entities have a Unicode value that's
141 # a subset of the Unicode value for some _other_ named
142 # entity. As an example, \u2267' is &GreaterFullEqual;,
143 # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
144 # expression needs to match the first two characters of
145 # "\u2267\u0338foo", but only the first character of
146 # "\u2267foo".
147 #
148 # In this step, we build two sets of characters that
149 # _eventually_ need to go into the regular expression. But
150 # we won't know exactly what the regular expression needs
151 # to look like until we've gone through the entire list of
152 # named entities.
153 if len(character) == 1:
154 short_entities.add(character)
155 else:
156 long_entities_by_first_character[character[0]].add(character)
157
158 # Now that we've been through the entire list of entities, we
159 # can create a regular expression that matches any of them.
160 particles = set()
161 for short in short_entities:
162 long_versions = long_entities_by_first_character[short]
163 if not long_versions:
164 particles.add(short)
165 else:
166 ignore = "".join([x[1] for x in long_versions])
167 # This finds, e.g. \u2267 but only if it is _not_
168 # followed by \u0338.
169 particles.add("%s(?![%s])" % (short, ignore))
170
171 for long_entities in list(long_entities_by_first_character.values()):
172 for long_entity in long_entities:
173 particles.add(long_entity)
174
175 re_definition = "(%s)" % "|".join(particles)
176
177 # If an entity shows up in both html5 and codepoint2name, it's
178 # likely that HTML5 gives it several different names, such as
179 # 'rsquo' and 'rsquor'. When converting Unicode characters to
180 # named entities, the codepoint2name name should take
181 # precedence where possible, since that's the more easily
182 # recognizable one.
52 for codepoint, name in list(codepoint2name.items()): 183 for codepoint, name in list(codepoint2name.items()):
53 character = chr(codepoint) 184 character = chr(codepoint)
54 if codepoint != 34: 185 unicode_to_name[character] = name
55 # There's no point in turning the quotation mark into 186
56 # &quot;, unless it happens within an attribute value, which 187 return unicode_to_name, name_to_unicode, re.compile(re_definition)
57 # is handled elsewhere.
58 characters_for_re.append(character)
59 lookup[character] = name
60 # But we do want to turn &quot; into the quotation mark.
61 reverse_lookup[name] = character
62 re_definition = "[%s]" % "".join(characters_for_re)
63 return lookup, reverse_lookup, re.compile(re_definition)
64 (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, 188 (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
65 CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() 189 CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
66 190
@@ -72,21 +196,23 @@ class EntitySubstitution(object):
72 ">": "gt", 196 ">": "gt",
73 } 197 }
74 198
75 BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|" 199 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
76 r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 200 "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
77 r")") 201 ")")
78 202
79 AMPERSAND_OR_BRACKET = re.compile(r"([<>&])") 203 AMPERSAND_OR_BRACKET = re.compile("([<>&])")
80 204
81 @classmethod 205 @classmethod
82 def _substitute_html_entity(cls, matchobj): 206 def _substitute_html_entity(cls, matchobj):
207 """Used with a regular expression to substitute the
208 appropriate HTML entity for a special character string."""
83 entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) 209 entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
84 return "&%s;" % entity 210 return "&%s;" % entity
85 211
86 @classmethod 212 @classmethod
87 def _substitute_xml_entity(cls, matchobj): 213 def _substitute_xml_entity(cls, matchobj):
88 """Used with a regular expression to substitute the 214 """Used with a regular expression to substitute the
89 appropriate XML entity for an XML special character.""" 215 appropriate XML entity for a special character string."""
90 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] 216 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
91 return "&%s;" % entity 217 return "&%s;" % entity
92 218
@@ -181,6 +307,8 @@ class EntitySubstitution(object):
181 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that 307
182 character with "&eacute;" will make it more readable to some 308 character with "&eacute;" will make it more readable to some
183 people. 309 people.
310
311 :param s: A Unicode string.
184 """ 312 """
185 return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( 313 return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
186 cls._substitute_html_entity, s) 314 cls._substitute_html_entity, s)
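The tables and regular expression built by _populate_class_variables() drive the substitute_html() and substitute_xml() class methods; for example (entity names come from the HTML5 table the new code loads, with codepoint2name names taking precedence):

    from bs4.dammit import EntitySubstitution

    print(EntitySubstitution.substitute_html("caf\u00e9"))  # caf&eacute;
    print(EntitySubstitution.substitute_xml("1 < 2 & 3"))   # 1 &lt; 2 &amp; 3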
@@ -192,23 +320,65 @@ class EncodingDetector:
192 Order of precedence: 320 Order of precedence:
193 321
194 1. Encodings you specifically tell EncodingDetector to try first 322 1. Encodings you specifically tell EncodingDetector to try first
195 (the override_encodings argument to the constructor). 323 (the known_definite_encodings argument to the constructor).
324
325 2. An encoding determined by sniffing the document's byte-order mark.
326
327 3. Encodings you specifically tell EncodingDetector to try if
328 byte-order mark sniffing fails (the user_encodings argument to the
329 constructor).
196 330
197 2. An encoding declared within the bytestring itself, either in an 331 4. An encoding declared within the bytestring itself, either in an
198 XML declaration (if the bytestring is to be interpreted as an XML 332 XML declaration (if the bytestring is to be interpreted as an XML
199 document), or in a <meta> tag (if the bytestring is to be 333 document), or in a <meta> tag (if the bytestring is to be
200 interpreted as an HTML document.) 334 interpreted as an HTML document.)
201 335
202 3. An encoding detected through textual analysis by chardet, 336 5. An encoding detected through textual analysis by chardet,
203 cchardet, or a similar external library. 337 cchardet, or a similar external library.
204 338
205 4. UTF-8. 339 6. UTF-8.
206 340
207 5. Windows-1252. 341 7. Windows-1252.
342
208 """ 343 """
209 def __init__(self, markup, override_encodings=None, is_html=False, 344 def __init__(self, markup, known_definite_encodings=None,
210 exclude_encodings=None): 345 is_html=False, exclude_encodings=None,
211 self.override_encodings = override_encodings or [] 346 user_encodings=None, override_encodings=None):
347 """Constructor.
348
349 :param markup: Some markup in an unknown encoding.
350
351 :param known_definite_encodings: When determining the encoding
352 of `markup`, these encodings will be tried first, in
353 order. In HTML terms, this corresponds to the "known
354 definite encoding" step defined here:
355 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
356
357 :param user_encodings: These encodings will be tried after the
358 `known_definite_encodings` have been tried and failed, and
359 after an attempt to sniff the encoding by looking at a
360 byte order mark has failed. In HTML terms, this
361 corresponds to the step "user has explicitly instructed
362 the user agent to override the document's character
363 encoding", defined here:
364 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
365
366 :param override_encodings: A deprecated alias for
367 known_definite_encodings. Any encodings here will be tried
368 immediately after the encodings in
369 known_definite_encodings.
370
371 :param is_html: If True, this markup is considered to be
372 HTML. Otherwise it's assumed to be XML.
373
374 :param exclude_encodings: These encodings will not be tried,
375 even if they otherwise would be.
376
377 """
378 self.known_definite_encodings = list(known_definite_encodings or [])
379 if override_encodings:
380 self.known_definite_encodings += override_encodings
381 self.user_encodings = user_encodings or []
212 exclude_encodings = exclude_encodings or [] 382 exclude_encodings = exclude_encodings or []
213 self.exclude_encodings = set([x.lower() for x in exclude_encodings]) 383 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
214 self.chardet_encoding = None 384 self.chardet_encoding = None
@@ -219,6 +389,12 @@ class EncodingDetector:
219 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) 389 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
220 390
221 def _usable(self, encoding, tried): 391 def _usable(self, encoding, tried):
392 """Should we even bother to try this encoding?
393
394 :param encoding: Name of an encoding.
395 :param tried: Encodings that have already been tried. This will be modified
396 as a side effect.
397 """
222 if encoding is not None: 398 if encoding is not None:
223 encoding = encoding.lower() 399 encoding = encoding.lower()
224 if encoding in self.exclude_encodings: 400 if encoding in self.exclude_encodings:
@@ -230,9 +406,14 @@ class EncodingDetector:
230 406
231 @property 407 @property
232 def encodings(self): 408 def encodings(self):
233 """Yield a number of encodings that might work for this markup.""" 409 """Yield a number of encodings that might work for this markup.
410
411 :yield: A sequence of strings.
412 """
234 tried = set() 413 tried = set()
235 for e in self.override_encodings: 414
415 # First, try the known definite encodings
416 for e in self.known_definite_encodings:
236 if self._usable(e, tried): 417 if self._usable(e, tried):
237 yield e 418 yield e
238 419
@@ -241,6 +422,12 @@ class EncodingDetector:
241 if self._usable(self.sniffed_encoding, tried): 422 if self._usable(self.sniffed_encoding, tried):
242 yield self.sniffed_encoding 423 yield self.sniffed_encoding
243 424
425 # Sniffing the byte-order mark did nothing; try the user
426 # encodings.
427 for e in self.user_encodings:
428 if self._usable(e, tried):
429 yield e
430
244 # Look within the document for an XML or HTML encoding 431 # Look within the document for an XML or HTML encoding
245 # declaration. 432 # declaration.
246 if self.declared_encoding is None: 433 if self.declared_encoding is None:
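The precedence list in the docstring maps directly onto this generator. A sketch of the order in which candidates come out (the tail of the sequence depends on whether a chardet-style library is installed):

    from bs4.dammit import EncodingDetector

    detector = EncodingDetector(
        b'<?xml version="1.0" encoding="utf8"?><r/>',
        known_definite_encodings=["iso-8859-1"],
        user_encodings=["cp1252"],
    )
    # known definite -> BOM sniffing -> user encodings -> declared
    # encoding -> chardet -> utf-8 -> windows-1252
    for encoding in detector.encodings:
        print(encoding)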
@@ -263,7 +450,11 @@ class EncodingDetector:
263 450
264 @classmethod 451 @classmethod
265 def strip_byte_order_mark(cls, data): 452 def strip_byte_order_mark(cls, data):
266 """If a byte-order mark is present, strip it and return the encoding it implies.""" 453 """If a byte-order mark is present, strip it and return the encoding it implies.
454
455 :param data: Some markup.
456 :return: A 2-tuple (modified data, implied encoding)
457 """
267 encoding = None 458 encoding = None
268 if isinstance(data, str): 459 if isinstance(data, str):
269 # Unicode data cannot have a byte-order mark. 460 # Unicode data cannot have a byte-order mark.
@@ -295,21 +486,36 @@ class EncodingDetector:
295 486
296 An HTML encoding is declared in a <meta> tag, hopefully near the 487 An HTML encoding is declared in a <meta> tag, hopefully near the
297 beginning of the document. 488 beginning of the document.
489
490 :param markup: Some markup.
491 :param is_html: If True, this markup is considered to be HTML. Otherwise
492 it's assumed to be XML.
493 :param search_entire_document: Since an encoding is supposed to be declared near the beginning
494 of the document, most of the time it's only necessary to search a few kilobytes of data.
495 Set this to True to force this method to search the entire document.
298 """ 496 """
299 if search_entire_document: 497 if search_entire_document:
300 xml_endpos = html_endpos = len(markup) 498 xml_endpos = html_endpos = len(markup)
301 else: 499 else:
302 xml_endpos = 1024 500 xml_endpos = 1024
303 html_endpos = max(2048, int(len(markup) * 0.05)) 501 html_endpos = max(2048, int(len(markup) * 0.05))
304 502
503 if isinstance(markup, bytes):
504 res = encoding_res[bytes]
505 else:
506 res = encoding_res[str]
507
508 xml_re = res['xml']
509 html_re = res['html']
305 declared_encoding = None 510 declared_encoding = None
306 declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) 511 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
307 if not declared_encoding_match and is_html: 512 if not declared_encoding_match and is_html:
308 declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) 513 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
309 if declared_encoding_match is not None: 514 if declared_encoding_match is not None:
310 declared_encoding = declared_encoding_match.groups()[0].decode( 515 declared_encoding = declared_encoding_match.groups()[0]
311 'ascii', 'replace')
312 if declared_encoding: 516 if declared_encoding:
517 if isinstance(declared_encoding, bytes):
518 declared_encoding = declared_encoding.decode('ascii', 'replace')
313 return declared_encoding.lower() 519 return declared_encoding.lower()
314 return None 520 return None
315 521
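With encoding_res keyed by both bytes and str, find_declared_encoding() now accepts either input type:

    from bs4.dammit import EncodingDetector

    print(EncodingDetector.find_declared_encoding(
        b'<meta charset="ISO-8859-8">', is_html=True))   # iso-8859-8
    print(EncodingDetector.find_declared_encoding(
        '<?xml version="1.0" encoding="UTF-16"?><r/>'))  # utf-16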
@@ -332,15 +538,53 @@ class UnicodeDammit:
332 "iso-8859-2", 538 "iso-8859-2",
333 ] 539 ]
334 540
335 def __init__(self, markup, override_encodings=[], 541 def __init__(self, markup, known_definite_encodings=[],
336 smart_quotes_to=None, is_html=False, exclude_encodings=[]): 542 smart_quotes_to=None, is_html=False, exclude_encodings=[],
543 user_encodings=None, override_encodings=None
544 ):
545 """Constructor.
546
547 :param markup: A bytestring representing markup in an unknown encoding.
548
549 :param known_definite_encodings: When determining the encoding
550 of `markup`, these encodings will be tried first, in
551 order. In HTML terms, this corresponds to the "known
552 definite encoding" step defined here:
553 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
554
555 :param user_encodings: These encodings will be tried after the
556 `known_definite_encodings` have been tried and failed, and
557 after an attempt to sniff the encoding by looking at a
558 byte order mark has failed. In HTML terms, this
559 corresponds to the step "user has explicitly instructed
560 the user agent to override the document's character
561 encoding", defined here:
562 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
563
564 :param override_encodings: A deprecated alias for
565 known_definite_encodings. Any encodings here will be tried
566 immediately after the encodings in
567 known_definite_encodings.
568
569 :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
570 to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
571 Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
572 will convert them to HTML entity references.
573 :param is_html: If True, this markup is considered to be HTML. Otherwise
574 it's assumed to be XML.
575 :param exclude_encodings: These encodings will not be considered, even
576 if the sniffing code thinks they might make sense.
577
578 """
337 self.smart_quotes_to = smart_quotes_to 579 self.smart_quotes_to = smart_quotes_to
338 self.tried_encodings = [] 580 self.tried_encodings = []
339 self.contains_replacement_characters = False 581 self.contains_replacement_characters = False
340 self.is_html = is_html 582 self.is_html = is_html
341 583 self.log = logging.getLogger(__name__)
342 self.detector = EncodingDetector( 584 self.detector = EncodingDetector(
343 markup, override_encodings, is_html, exclude_encodings) 585 markup, known_definite_encodings, is_html, exclude_encodings,
586 user_encodings, override_encodings
587 )
344 588
345 # Short-circuit if the data is in Unicode to begin with. 589 # Short-circuit if the data is in Unicode to begin with.
346 if isinstance(markup, str) or markup == '': 590 if isinstance(markup, str) or markup == '':
@@ -368,9 +612,10 @@ class UnicodeDammit:
368 if encoding != "ascii": 612 if encoding != "ascii":
369 u = self._convert_from(encoding, "replace") 613 u = self._convert_from(encoding, "replace")
370 if u is not None: 614 if u is not None:
371 logging.warning( 615 self.log.warning(
372 "Some characters could not be decoded, and were " 616 "Some characters could not be decoded, and were "
373 "replaced with REPLACEMENT CHARACTER.") 617 "replaced with REPLACEMENT CHARACTER."
618 )
374 self.contains_replacement_characters = True 619 self.contains_replacement_characters = True
375 break 620 break
376 621
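End to end, the renamed parameters look like this in use (a sketch; unicode_markup and original_encoding are the long-standing public attributes):

    from bs4.dammit import UnicodeDammit

    dammit = UnicodeDammit(b"Sacr\xe9 bleu!",
                           known_definite_encodings=["latin-1"])
    print(dammit.unicode_markup)     # Sacré bleu!
    print(dammit.original_encoding)  # latin-1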
@@ -399,6 +644,10 @@ class UnicodeDammit:
399 return sub 644 return sub
400 645
401 def _convert_from(self, proposed, errors="strict"): 646 def _convert_from(self, proposed, errors="strict"):
647 """Attempt to convert the markup to the proposed encoding.
648
649 :param proposed: The name of a character encoding.
650 """
402 proposed = self.find_codec(proposed) 651 proposed = self.find_codec(proposed)
403 if not proposed or (proposed, errors) in self.tried_encodings: 652 if not proposed or (proposed, errors) in self.tried_encodings:
404 return None 653 return None
@@ -413,30 +662,40 @@ class UnicodeDammit:
413 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 662 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
414 663
415 try: 664 try:
416 #print "Trying to convert document to %s (errors=%s)" % ( 665 #print("Trying to convert document to %s (errors=%s)" % (
417 # proposed, errors) 666 # proposed, errors))
418 u = self._to_unicode(markup, proposed, errors) 667 u = self._to_unicode(markup, proposed, errors)
419 self.markup = u 668 self.markup = u
420 self.original_encoding = proposed 669 self.original_encoding = proposed
421 except Exception as e: 670 except Exception as e:
422 #print "That didn't work!" 671 #print("That didn't work!")
423 #print e 672 #print(e)
424 return None 673 return None
425 #print "Correct encoding: %s" % proposed 674 #print("Correct encoding: %s" % proposed)
426 return self.markup 675 return self.markup
427 676
428 def _to_unicode(self, data, encoding, errors="strict"): 677 def _to_unicode(self, data, encoding, errors="strict"):
429 '''Given a string and its encoding, decodes the string into Unicode. 678 """Given a string and its encoding, decodes the string into Unicode.
430 %encoding is a string recognized by encodings.aliases''' 679
680 :param encoding: The name of an encoding.
681 """
431 return str(data, encoding, errors) 682 return str(data, encoding, errors)
432 683
433 @property 684 @property
434 def declared_html_encoding(self): 685 def declared_html_encoding(self):
686 """If the markup is an HTML document, returns the encoding declared _within_
687 the document.
688 """
435 if not self.is_html: 689 if not self.is_html:
436 return None 690 return None
437 return self.detector.declared_encoding 691 return self.detector.declared_encoding
438 692
439 def find_codec(self, charset): 693 def find_codec(self, charset):
694 """Convert the name of a character set to a codec name.
695
696 :param charset: The name of a character set.
697 :return: The name of a codec.
698 """
440 value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) 699 value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
441 or (charset and self._codec(charset.replace("-", ""))) 700 or (charset and self._codec(charset.replace("-", "")))
442 or (charset and self._codec(charset.replace("-", "_"))) 701 or (charset and self._codec(charset.replace("-", "_")))
@@ -726,7 +985,7 @@ class UnicodeDammit:
726 0xde : b'\xc3\x9e', # Þ 985 0xde : b'\xc3\x9e', # Þ
727 0xdf : b'\xc3\x9f', # ß 986 0xdf : b'\xc3\x9f', # ß
728 0xe0 : b'\xc3\xa0', # à 987 0xe0 : b'\xc3\xa0', # à
729 0xe1 : b'\xa1', # á 988 0xe1 : b'\xc3\xa1', # á
730 0xe2 : b'\xc3\xa2', # â 989 0xe2 : b'\xc3\xa2', # â
731 0xe3 : b'\xc3\xa3', # ã 990 0xe3 : b'\xc3\xa3', # ã
732 0xe4 : b'\xc3\xa4', # ä 991 0xe4 : b'\xc3\xa4', # ä
@@ -775,12 +1034,16 @@ class UnicodeDammit:
775 Currently the only situation supported is Windows-1252 (or its 1034 Currently the only situation supported is Windows-1252 (or its
776 subset ISO-8859-1), embedded in UTF-8. 1035 subset ISO-8859-1), embedded in UTF-8.
777 1036
778 The input must be a bytestring. If you've already converted 1037 :param in_bytes: A bytestring that you suspect contains
779 the document to Unicode, you're too late. 1038 characters from multiple encodings. Note that this _must_
780 1039 be a bytestring. If you've already converted the document
781 The output is a bytestring in which `embedded_encoding` 1040 to Unicode, you're too late.
782 characters have been converted to their `main_encoding` 1041 :param main_encoding: The primary encoding of `in_bytes`.
783 equivalents. 1042 :param embedded_encoding: The encoding that was used to embed characters
1043 in the main document.
1044 :return: A bytestring in which `embedded_encoding`
1045 characters have been converted to their `main_encoding`
1046 equivalents.
784 """ 1047 """
785 if embedded_encoding.replace('_', '-').lower() not in ( 1048 if embedded_encoding.replace('_', '-').lower() not in (
786 'windows-1252', 'windows_1252'): 1049 'windows-1252', 'windows_1252'):
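The detwingle() described above repairs documents that mix encodings; the canonical usage from the Beautiful Soup documentation:

    from bs4.dammit import UnicodeDammit

    snowmen = "\N{SNOWMAN}" * 3
    quote = ("\N{LEFT DOUBLE QUOTATION MARK}Hi!"
             "\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")

    # Windows-1252 bytes embedded in UTF-8 are rewritten as UTF-8,
    # so the whole document can then be decoded as UTF-8.
    print(UnicodeDammit.detwingle(doc).decode("utf8"))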
diff --git a/bitbake/lib/bs4/diagnose.py b/bitbake/lib/bs4/diagnose.py
index 083395fb46..e079772e69 100644
--- a/bitbake/lib/bs4/diagnose.py
+++ b/bitbake/lib/bs4/diagnose.py
@@ -1,9 +1,10 @@
1"""Diagnostic functions, mainly for use when doing tech support.""" 1"""Diagnostic functions, mainly for use when doing tech support."""
2 2
3# Use of this source code is governed by the MIT license.
3__license__ = "MIT" 4__license__ = "MIT"
4 5
5import cProfile 6import cProfile
6from io import StringIO 7from io import BytesIO
7from html.parser import HTMLParser 8from html.parser import HTMLParser
8import bs4 9import bs4
9from bs4 import BeautifulSoup, __version__ 10from bs4 import BeautifulSoup, __version__
@@ -19,9 +20,13 @@ import sys
19import cProfile 20import cProfile
20 21
21def diagnose(data): 22def diagnose(data):
22 """Diagnostic suite for isolating common problems.""" 23 """Diagnostic suite for isolating common problems.
23 print("Diagnostic running on Beautiful Soup %s" % __version__) 24
24 print("Python version %s" % sys.version) 25 :param data: A string containing markup that needs to be explained.
26 :return: None; diagnostics are printed to standard output.
27 """
28 print(("Diagnostic running on Beautiful Soup %s" % __version__))
29 print(("Python version %s" % sys.version))
25 30
26 basic_parsers = ["html.parser", "html5lib", "lxml"] 31 basic_parsers = ["html.parser", "html5lib", "lxml"]
27 for name in basic_parsers: 32 for name in basic_parsers:
@@ -35,61 +40,70 @@ def diagnose(data):
35 name)) 40 name))
36 41
37 if 'lxml' in basic_parsers: 42 if 'lxml' in basic_parsers:
38 basic_parsers.append(["lxml", "xml"]) 43 basic_parsers.append("lxml-xml")
39 try: 44 try:
40 from lxml import etree 45 from lxml import etree
41 print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) 46 print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
42 except ImportError as e: 47 except ImportError as e:
43 print ( 48 print(
44 "lxml is not installed or couldn't be imported.") 49 "lxml is not installed or couldn't be imported.")
45 50
46 51
47 if 'html5lib' in basic_parsers: 52 if 'html5lib' in basic_parsers:
48 try: 53 try:
49 import html5lib 54 import html5lib
50 print("Found html5lib version %s" % html5lib.__version__) 55 print(("Found html5lib version %s" % html5lib.__version__))
51 except ImportError as e: 56 except ImportError as e:
52 print ( 57 print(
53 "html5lib is not installed or couldn't be imported.") 58 "html5lib is not installed or couldn't be imported.")
54 59
55 if hasattr(data, 'read'): 60 if hasattr(data, 'read'):
56 data = data.read() 61 data = data.read()
57 elif os.path.exists(data):
58 print('"%s" looks like a filename. Reading data from the file.' % data)
59 data = open(data).read()
60 elif data.startswith("http:") or data.startswith("https:"):
61 print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
62 print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
63 return
64 print()
65 62
66 for parser in basic_parsers: 63 for parser in basic_parsers:
67 print("Trying to parse your markup with %s" % parser) 64 print(("Trying to parse your markup with %s" % parser))
68 success = False 65 success = False
69 try: 66 try:
70 soup = BeautifulSoup(data, parser) 67 soup = BeautifulSoup(data, features=parser)
71 success = True 68 success = True
72 except Exception as e: 69 except Exception as e:
73 print("%s could not parse the markup." % parser) 70 print(("%s could not parse the markup." % parser))
74 traceback.print_exc() 71 traceback.print_exc()
75 if success: 72 if success:
76 print("Here's what %s did with the markup:" % parser) 73 print(("Here's what %s did with the markup:" % parser))
77 print(soup.prettify()) 74 print((soup.prettify()))
78 75
79 print("-" * 80) 76 print(("-" * 80))
80 77
81def lxml_trace(data, html=True, **kwargs): 78def lxml_trace(data, html=True, **kwargs):
82 """Print out the lxml events that occur during parsing. 79 """Print out the lxml events that occur during parsing.
83 80
84 This lets you see how lxml parses a document when no Beautiful 81 This lets you see how lxml parses a document when no Beautiful
85 Soup code is running. 82 Soup code is running. You can use this to determine whether
83 an lxml-specific problem is in Beautiful Soup's lxml tree builders
84 or in lxml itself.
85
86 :param data: Some markup.
87 :param html: If True, markup will be parsed with lxml's HTML parser.
88 If False, lxml's XML parser will be used.
86 """ 89 """
87 from lxml import etree 90 from lxml import etree
88 for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 91 recover = kwargs.pop('recover', True)
92 if isinstance(data, str):
93 data = data.encode("utf8")
94 reader = BytesIO(data)
95 for event, element in etree.iterparse(
96 reader, html=html, recover=recover, **kwargs
97 ):
89 print(("%s, %4s, %s" % (event, element.tag, element.text))) 98 print(("%s, %4s, %s" % (event, element.tag, element.text)))
90 99
91class AnnouncingParser(HTMLParser): 100class AnnouncingParser(HTMLParser):
92 """Announces HTMLParser parse events, without doing anything else.""" 101 """Subclass of HTMLParser that announces parse events, without doing
102 anything else.
103
104 You can use this to get a picture of how html.parser sees a given
105 document. The easiest way to do this is to call `htmlparser_trace`.
106 """
93 107
94 def _p(self, s): 108 def _p(self, s):
95 print(s) 109 print(s)
@@ -126,6 +140,8 @@ def htmlparser_trace(data):
126 140
127 This lets you see how HTMLParser parses a document when no 141 This lets you see how HTMLParser parses a document when no
128 Beautiful Soup code is running. 142 Beautiful Soup code is running.
143
144 :param data: Some markup.
129 """ 145 """
130 parser = AnnouncingParser() 146 parser = AnnouncingParser()
131 parser.feed(data) 147 parser.feed(data)
@@ -168,9 +184,9 @@ def rdoc(num_elements=1000):
168 184
169def benchmark_parsers(num_elements=100000): 185def benchmark_parsers(num_elements=100000):
170 """Very basic head-to-head performance benchmark.""" 186 """Very basic head-to-head performance benchmark."""
171 print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 187 print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
172 data = rdoc(num_elements) 188 data = rdoc(num_elements)
173 print("Generated a large invalid HTML document (%d bytes)." % len(data)) 189 print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
174 190
175 for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 191 for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
176 success = False 192 success = False
@@ -180,26 +196,26 @@ def benchmark_parsers(num_elements=100000):
180 b = time.time() 196 b = time.time()
181 success = True 197 success = True
182 except Exception as e: 198 except Exception as e:
183 print("%s could not parse the markup." % parser) 199 print(("%s could not parse the markup." % parser))
184 traceback.print_exc() 200 traceback.print_exc()
185 if success: 201 if success:
186 print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) 202 print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
187 203
188 from lxml import etree 204 from lxml import etree
189 a = time.time() 205 a = time.time()
190 etree.HTML(data) 206 etree.HTML(data)
191 b = time.time() 207 b = time.time()
192 print("Raw lxml parsed the markup in %.2fs." % (b-a)) 208 print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
193 209
194 import html5lib 210 import html5lib
195 parser = html5lib.HTMLParser() 211 parser = html5lib.HTMLParser()
196 a = time.time() 212 a = time.time()
197 parser.parse(data) 213 parser.parse(data)
198 b = time.time() 214 b = time.time()
199 print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 215 print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
200 216
201def profile(num_elements=100000, parser="lxml"): 217def profile(num_elements=100000, parser="lxml"):
202 218 """Use Python's profiler on a randomly generated document."""
203 filehandle = tempfile.NamedTemporaryFile() 219 filehandle = tempfile.NamedTemporaryFile()
204 filename = filehandle.name 220 filename = filehandle.name
205 221
@@ -212,5 +228,6 @@ def profile(num_elements=100000, parser="lxml"):
212 stats.sort_stats("cumulative") 228 stats.sort_stats("cumulative")
213 stats.print_stats('_html5lib|bs4', 50) 229 stats.print_stats('_html5lib|bs4', 50)
214 230
231# If this file is run as a script, standard input is diagnosed.
215if __name__ == '__main__': 232if __name__ == '__main__':
216 diagnose(sys.stdin.read()) 233 diagnose(sys.stdin.read())
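A sketch of how the reworked diagnostics are invoked (lxml and html5lib are optional; lxml_trace fails with ImportError when lxml is absent):

    from bs4.diagnose import diagnose, htmlparser_trace, lxml_trace

    markup = "<p>Some<p>unclosed<b>markup"
    diagnose(markup)           # prints what each available parser builds
    htmlparser_trace(markup)   # announces raw html.parser events
    lxml_trace(markup)         # str input is now encoded and wrapped in BytesIO
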
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
index 68be42d138..0aefe734b2 100644
--- a/bitbake/lib/bs4/element.py
+++ b/bitbake/lib/bs4/element.py
@@ -1,14 +1,27 @@
1# Use of this source code is governed by the MIT license.
1__license__ = "MIT" 2__license__ = "MIT"
2 3
3import collections.abc 4try:
5 from collections.abc import Callable # Python 3.6
6except ImportError as e:
7 from collections import Callable
4import re 8import re
5import sys 9import sys
6import warnings 10import warnings
7from bs4.dammit import EntitySubstitution 11
12from bs4.css import CSS
13from bs4.formatter import (
14 Formatter,
15 HTMLFormatter,
16 XMLFormatter,
17)
8 18
9DEFAULT_OUTPUT_ENCODING = "utf-8" 19DEFAULT_OUTPUT_ENCODING = "utf-8"
10PY3K = (sys.version_info[0] > 2)
11 20
21nonwhitespace_re = re.compile(r"\S+")
22
23# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
24# the off chance someone imported it for their own use.
12whitespace_re = re.compile(r"\s+") 25whitespace_re = re.compile(r"\s+")
13 26
14def _alias(attr): 27def _alias(attr):
@@ -23,12 +36,49 @@ def _alias(attr):
23 return alias 36 return alias
24 37
25 38
39# These encodings are recognized by Python (so PageElement.encode
40# could theoretically support them) but XML and HTML don't recognize
41# them (so they should not show up in an XML or HTML document as that
42# document's encoding).
43#
44# If an XML document is encoded in one of these encodings, no encoding
45# will be mentioned in the XML declaration. If an HTML document is
46# encoded in one of these encodings, and the HTML document has a
47# <meta> tag that mentions an encoding, the encoding will be given as
48# the empty string.
49#
50# Source:
51# https://docs.python.org/3/library/codecs.html#python-specific-encodings
52PYTHON_SPECIFIC_ENCODINGS = set([
53 "idna",
54 "mbcs",
55 "oem",
56 "palmos",
57 "punycode",
58 "raw_unicode_escape",
59 "undefined",
60 "unicode_escape",
61 "raw-unicode-escape",
62 "unicode-escape",
63 "string-escape",
64 "string_escape",
65])
66
67
26class NamespacedAttribute(str): 68class NamespacedAttribute(str):
69 """A namespaced string (e.g. 'xml:lang') that remembers the namespace
70 ('xml') and the name ('lang') that were used to create it.
71 """
27 72
28 def __new__(cls, prefix, name, namespace=None): 73 def __new__(cls, prefix, name=None, namespace=None):
29 if name is None: 74 if not name:
75 # This is the default namespace. Its name "has no value"
76 # per https://www.w3.org/TR/xml-names/#defaulting
77 name = None
78
79 if not name:
30 obj = str.__new__(cls, prefix) 80 obj = str.__new__(cls, prefix)
31 elif prefix is None: 81 elif not prefix:
32 # Not really namespaced. 82 # Not really namespaced.
33 obj = str.__new__(cls, name) 83 obj = str.__new__(cls, name)
34 else: 84 else:
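A sketch of the revised NamespacedAttribute constructor above, where name now defaults to None and a missing name or prefix collapses to the other part:

    from bs4.element import NamespacedAttribute

    attr = NamespacedAttribute("xml", "lang", "http://www.w3.org/XML/1998/namespace")
    print(attr, attr.prefix, attr.name)   # xml:lang xml lang

    default_ns = NamespacedAttribute("xmlns")   # default namespace: name is None
    print(default_ns)                           # xmlns
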
@@ -54,6 +104,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
54 return obj 104 return obj
55 105
56 def encode(self, encoding): 106 def encode(self, encoding):
107 """When an HTML document is being encoded to a given encoding, the
108 value of a meta tag's 'charset' is the name of the encoding.
109 """
110 if encoding in PYTHON_SPECIFIC_ENCODINGS:
111 return ''
57 return encoding 112 return encoding
58 113
59 114
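The effect of the new PYTHON_SPECIFIC_ENCODINGS guard on meta-tag charset substitution, sketched with html.parser (exact serialization may vary by builder):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<meta charset="utf-8"/>', "html.parser")
    print(soup.encode("latin-1"))          # charset rewritten: <meta charset="latin-1"/>
    print(soup.encode("unicode_escape"))   # Python-only codec: <meta charset=""/>
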
@@ -79,118 +134,44 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
79 return obj 134 return obj
80 135
81 def encode(self, encoding): 136 def encode(self, encoding):
137 if encoding in PYTHON_SPECIFIC_ENCODINGS:
138 return ''
82 def rewrite(match): 139 def rewrite(match):
83 return match.group(1) + encoding 140 return match.group(1) + encoding
84 return self.CHARSET_RE.sub(rewrite, self.original_value) 141 return self.CHARSET_RE.sub(rewrite, self.original_value)
85 142
86class HTMLAwareEntitySubstitution(EntitySubstitution):
87
88 """Entity substitution rules that are aware of some HTML quirks.
89 143
90 Specifically, the contents of <script> and <style> tags should not 144class PageElement(object):
91 undergo entity substitution. 145 """Contains the navigational information for some part of the page:
146 that is, its current location in the parse tree.
92 147
93 Incoming NavigableString objects are checked to see if they're the 148 NavigableString, Tag, etc. are all subclasses of PageElement.
94 direct children of a <script> or <style> tag.
95 """ 149 """
96 150
97 cdata_containing_tags = set(["script", "style"]) 151 # In general, we can't tell just by looking at an element whether
152 # it's contained in an XML document or an HTML document. But for
153 # Tags (q.v.) we can store this information at parse time.
154 known_xml = None
98 155
99 preformatted_tags = set(["pre"]) 156 def setup(self, parent=None, previous_element=None, next_element=None,
100 157 previous_sibling=None, next_sibling=None):
101 @classmethod 158 """Sets up the initial relations between this element and
102 def _substitute_if_appropriate(cls, ns, f): 159 other elements.
103 if (isinstance(ns, NavigableString)
104 and ns.parent is not None
105 and ns.parent.name in cls.cdata_containing_tags):
106 # Do nothing.
107 return ns
108 # Substitute.
109 return f(ns)
110 160
111 @classmethod 161 :param parent: The parent of this element.
112 def substitute_html(cls, ns):
113 return cls._substitute_if_appropriate(
114 ns, EntitySubstitution.substitute_html)
115 162
116 @classmethod 163 :param previous_element: The element parsed immediately before
117 def substitute_xml(cls, ns): 164 this one.
118 return cls._substitute_if_appropriate(
119 ns, EntitySubstitution.substitute_xml)
120 165
121class PageElement(object): 166 :param next_element: The element parsed immediately after
122 """Contains the navigational information for some part of the page 167 this one.
123 (either a tag or a piece of text)"""
124
125 # There are five possible values for the "formatter" argument passed in
126 # to methods like encode() and prettify():
127 #
128 # "html" - All Unicode characters with corresponding HTML entities
129 # are converted to those entities on output.
130 # "minimal" - Bare ampersands and angle brackets are converted to
131 # XML entities: &amp; &lt; &gt;
132 # None - The null formatter. Unicode characters are never
133 # converted to entities. This is not recommended, but it's
134 # faster than "minimal".
135 # A function - This function will be called on every string that
136 # needs to undergo entity substitution.
137 #
138
139 # In an HTML document, the default "html" and "minimal" functions
140 # will leave the contents of <script> and <style> tags alone. For
141 # an XML document, all tags will be given the same treatment.
142
143 HTML_FORMATTERS = {
144 "html" : HTMLAwareEntitySubstitution.substitute_html,
145 "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
146 None : None
147 }
148
149 XML_FORMATTERS = {
150 "html" : EntitySubstitution.substitute_html,
151 "minimal" : EntitySubstitution.substitute_xml,
152 None : None
153 }
154
155 def format_string(self, s, formatter='minimal'):
156 """Format the given string using the given formatter."""
157 if not isinstance(formatter, collections.abc.Callable):
158 formatter = self._formatter_for_name(formatter)
159 if formatter is None:
160 output = s
161 else:
162 output = formatter(s)
163 return output
164 168
165 @property 169 :param previous_sibling: The most recently encountered element
166 def _is_xml(self): 170 on the same level of the parse tree as this one.
167 """Is this element part of an XML tree or an HTML tree?
168 171
169 This is used when mapping a formatter name ("minimal") to an 172 :param next_sibling: The next element to be encountered
170 appropriate function (one that performs entity-substitution on 173 on the same level of the parse tree as this one.
171 the contents of <script> and <style> tags, or not). It's
172 inefficient, but it should be called very rarely.
173 """ 174 """
174 if self.parent is None:
175 # This is the top-level object. It should have .is_xml set
176 # from tree creation. If not, take a guess--BS is usually
177 # used on HTML markup.
178 return getattr(self, 'is_xml', False)
179 return self.parent._is_xml
180
181 def _formatter_for_name(self, name):
182 "Look up a formatter function based on its name and the tree."
183 if self._is_xml:
184 return self.XML_FORMATTERS.get(
185 name, EntitySubstitution.substitute_xml)
186 else:
187 return self.HTML_FORMATTERS.get(
188 name, HTMLAwareEntitySubstitution.substitute_xml)
189
190 def setup(self, parent=None, previous_element=None, next_element=None,
191 previous_sibling=None, next_sibling=None):
192 """Sets up the initial relations between this element and
193 other elements."""
194 self.parent = parent 175 self.parent = parent
195 176
196 self.previous_element = previous_element 177 self.previous_element = previous_element
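The table-driven HTML_FORMATTERS/XML_FORMATTERS lookup removed above gives way to bs4.formatter objects; formatter_for_name() accepts a Formatter, a bare callable, or a registry name. A sketch:

    from bs4 import BeautifulSoup
    from bs4.formatter import HTMLFormatter

    soup = BeautifulSoup("<p>caf\xe9 &amp; <b>cr\xe8me</b></p>", "html.parser")
    print(soup.prettify(formatter="html"))     # named formatter: caf&eacute; etc.
    print(soup.prettify(formatter="minimal"))  # only &, <, > escaped
    print(soup.prettify(formatter=HTMLFormatter(entity_substitution=str.upper)))
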
@@ -198,48 +179,156 @@ class PageElement(object):
198 self.previous_element.next_element = self 179 self.previous_element.next_element = self
199 180
200 self.next_element = next_element 181 self.next_element = next_element
201 if self.next_element: 182 if self.next_element is not None:
202 self.next_element.previous_element = self 183 self.next_element.previous_element = self
203 184
204 self.next_sibling = next_sibling 185 self.next_sibling = next_sibling
205 if self.next_sibling: 186 if self.next_sibling is not None:
206 self.next_sibling.previous_sibling = self 187 self.next_sibling.previous_sibling = self
207 188
208 if (not previous_sibling 189 if (previous_sibling is None
209 and self.parent is not None and self.parent.contents): 190 and self.parent is not None and self.parent.contents):
210 previous_sibling = self.parent.contents[-1] 191 previous_sibling = self.parent.contents[-1]
211 192
212 self.previous_sibling = previous_sibling 193 self.previous_sibling = previous_sibling
213 if previous_sibling: 194 if previous_sibling is not None:
214 self.previous_sibling.next_sibling = self 195 self.previous_sibling.next_sibling = self
215 196
197 def format_string(self, s, formatter):
198 """Format the given string using the given formatter.
199
200 :param s: A string.
201 :param formatter: A Formatter object, or a string naming one of the standard formatters.
202 """
203 if formatter is None:
204 return s
205 if not isinstance(formatter, Formatter):
206 formatter = self.formatter_for_name(formatter)
207 output = formatter.substitute(s)
208 return output
209
210 def formatter_for_name(self, formatter):
211 """Look up or create a Formatter for the given identifier,
212 if necessary.
213
214 :param formatter: Can be a Formatter object (used as-is), a
215 function (used as the entity substitution hook for an
216 XMLFormatter or HTMLFormatter), or a string (used to look
217 up an XMLFormatter or HTMLFormatter in the appropriate
218 registry).
219 """
220 if isinstance(formatter, Formatter):
221 return formatter
222 if self._is_xml:
223 c = XMLFormatter
224 else:
225 c = HTMLFormatter
226 if isinstance(formatter, Callable):
227 return c(entity_substitution=formatter)
228 return c.REGISTRY[formatter]
229
230 @property
231 def _is_xml(self):
232 """Is this element part of an XML tree or an HTML tree?
233
234 This is used in formatter_for_name, when deciding whether an
235 XMLFormatter or HTMLFormatter is more appropriate. It can be
236 inefficient, but it should be called very rarely.
237 """
238 if self.known_xml is not None:
239 # Most of the time we will have determined this when the
240 # document is parsed.
241 return self.known_xml
242
243 # Otherwise, it's likely that this element was created by
244 # direct invocation of the constructor from within the user's
245 # Python code.
246 if self.parent is None:
247 # This is the top-level object. It should have .known_xml set
248 # from tree creation. If not, take a guess--BS is usually
249 # used on HTML markup.
250 return getattr(self, 'is_xml', False)
251 return self.parent._is_xml
252
216 nextSibling = _alias("next_sibling") # BS3 253 nextSibling = _alias("next_sibling") # BS3
217 previousSibling = _alias("previous_sibling") # BS3 254 previousSibling = _alias("previous_sibling") # BS3
218 255
219 def replace_with(self, replace_with): 256 default = object()
220 if not self.parent: 257 def _all_strings(self, strip=False, types=default):
258 """Yield all strings of certain classes, possibly stripping them.
259
260 This is implemented differently in Tag and NavigableString.
261 """
262 raise NotImplementedError()
263
264 @property
265 def stripped_strings(self):
266 """Yield all strings in this PageElement, stripping them first.
267
268 :yield: A sequence of stripped strings.
269 """
270 for string in self._all_strings(True):
271 yield string
272
273 def get_text(self, separator="", strip=False,
274 types=default):
275 """Get all child strings of this PageElement, concatenated using the
276 given separator.
277
278 :param separator: Strings will be concatenated using this separator.
279
280 :param strip: If True, strings will be stripped before being
281 concatenated.
282
283 :param types: A tuple of NavigableString subclasses. Any
284 strings of a subclass not found in this list will be
285 ignored. Although there are exceptions, the default
286 behavior in most cases is to consider only NavigableString
287 and CData objects. That means no comments, processing
288 instructions, etc.
289
290 :return: A string.
291 """
292 return separator.join([s for s in self._all_strings(
293 strip, types=types)])
294 getText = get_text
295 text = property(get_text)
296
297 def replace_with(self, *args):
298 """Replace this PageElement with one or more PageElements, keeping the
299 rest of the tree the same.
300
301 :param args: One or more PageElements.
302 :return: `self`, no longer part of the tree.
303 """
304 if self.parent is None:
221 raise ValueError( 305 raise ValueError(
222 "Cannot replace one element with another when the" 306 "Cannot replace one element with another when the "
223 "element to be replaced is not part of a tree.") 307 "element to be replaced is not part of a tree.")
224 if replace_with is self: 308 if len(args) == 1 and args[0] is self:
225 return 309 return
226 if replace_with is self.parent: 310 if any(x is self.parent for x in args):
227 raise ValueError("Cannot replace a Tag with its parent.") 311 raise ValueError("Cannot replace a Tag with its parent.")
228 old_parent = self.parent 312 old_parent = self.parent
229 my_index = self.parent.index(self) 313 my_index = self.parent.index(self)
230 self.extract() 314 self.extract(_self_index=my_index)
231 old_parent.insert(my_index, replace_with) 315 for idx, replace_with in enumerate(args, start=my_index):
316 old_parent.insert(idx, replace_with)
232 return self 317 return self
233 replaceWith = replace_with # BS3 318 replaceWith = replace_with # BS3
234 319
235 def unwrap(self): 320 def unwrap(self):
321 """Replace this PageElement with its contents.
322
323 :return: `self`, no longer part of the tree.
324 """
236 my_parent = self.parent 325 my_parent = self.parent
237 if not self.parent: 326 if self.parent is None:
238 raise ValueError( 327 raise ValueError(
239 "Cannot replace an element with its contents when that" 328 "Cannot replace an element with its contents when that"
240 "element is not part of a tree.") 329 "element is not part of a tree.")
241 my_index = self.parent.index(self) 330 my_index = self.parent.index(self)
242 self.extract() 331 self.extract(_self_index=my_index)
243 for child in reversed(self.contents[:]): 332 for child in reversed(self.contents[:]):
244 my_parent.insert(my_index, child) 333 my_parent.insert(my_index, child)
245 return self 334 return self
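replace_with() now accepts any number of replacements, and get_text()/stripped_strings are defined at the PageElement level; a sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>one <b>two</b> three</p>", "html.parser")
    soup.b.replace_with("2", soup.new_string("-and-a-half"))
    print(soup)                                        # <p>one 2-and-a-half three</p>
    print(soup.p.get_text(separator=" ", strip=True))  # one 2 -and-a-half three
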
@@ -247,14 +336,29 @@ class PageElement(object):
247 replaceWithChildren = unwrap # BS3 336 replaceWithChildren = unwrap # BS3
248 337
249 def wrap(self, wrap_inside): 338 def wrap(self, wrap_inside):
339 """Wrap this PageElement inside another one.
340
341 :param wrap_inside: A PageElement.
342 :return: `wrap_inside`, occupying the position in the tree that used
343 to be occupied by `self`, and with `self` inside it.
344 """
250 me = self.replace_with(wrap_inside) 345 me = self.replace_with(wrap_inside)
251 wrap_inside.append(me) 346 wrap_inside.append(me)
252 return wrap_inside 347 return wrap_inside
253 348
254 def extract(self): 349 def extract(self, _self_index=None):
255 """Destructively rips this element out of the tree.""" 350 """Destructively rips this element out of the tree.
351
352 :param _self_index: The location of this element in its parent's
353 .contents, if known. Passing this in allows for a performance
354 optimization.
355
356 :return: `self`, no longer part of the tree.
357 """
256 if self.parent is not None: 358 if self.parent is not None:
257 del self.parent.contents[self.parent.index(self)] 359 if _self_index is None:
360 _self_index = self.parent.index(self)
361 del self.parent.contents[_self_index]
258 362
259 #Find the two elements that would be next to each other if 363 #Find the two elements that would be next to each other if
260 #this element (and any children) hadn't been parsed. Connect 364 #this element (and any children) hadn't been parsed. Connect
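wrap() and extract() as they behave after this hunk (extract() takes the private _self_index fast path when the caller already knows the position):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>hello</p>", "html.parser")
    soup.p.wrap(soup.new_tag("div"))
    print(soup)              # <div><p>hello</p></div>

    p = soup.p.extract()     # returns <p>, detached from the tree
    print(soup, p.parent)    # <div></div> None
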
@@ -281,8 +385,13 @@ class PageElement(object):
281 return self 385 return self
282 386
283 def _last_descendant(self, is_initialized=True, accept_self=True): 387 def _last_descendant(self, is_initialized=True, accept_self=True):
284 "Finds the last element beneath this object to be parsed." 388 """Finds the last element beneath this object to be parsed.
285 if is_initialized and self.next_sibling: 389
390 :param is_initialized: Has `setup` been called on this PageElement
391 yet?
392 :param accept_self: Is `self` an acceptable answer to the question?
393 """
394 if is_initialized and self.next_sibling is not None:
286 last_child = self.next_sibling.previous_element 395 last_child = self.next_sibling.previous_element
287 else: 396 else:
288 last_child = self 397 last_child = self
@@ -295,6 +404,14 @@ class PageElement(object):
295 _lastRecursiveChild = _last_descendant 404 _lastRecursiveChild = _last_descendant
296 405
297 def insert(self, position, new_child): 406 def insert(self, position, new_child):
407 """Insert a new PageElement in the list of this PageElement's children.
408
409 This works the same way as `list.insert`.
410
411 :param position: The numeric position that should be occupied
412 in `self.children` by the new PageElement.
413 :param new_child: A PageElement.
414 """
298 if new_child is None: 415 if new_child is None:
299 raise ValueError("Cannot insert None into a tag.") 416 raise ValueError("Cannot insert None into a tag.")
300 if new_child is self: 417 if new_child is self:
@@ -303,6 +420,14 @@ class PageElement(object):
303 and not isinstance(new_child, NavigableString)): 420 and not isinstance(new_child, NavigableString)):
304 new_child = NavigableString(new_child) 421 new_child = NavigableString(new_child)
305 422
423 from bs4 import BeautifulSoup
424 if isinstance(new_child, BeautifulSoup):
425 # We don't want to end up with a situation where one BeautifulSoup
426 # object contains another. Insert the children one at a time.
427 for subchild in list(new_child.contents):
428 self.insert(position, subchild)
429 position += 1
430 return
306 position = min(position, len(self.contents)) 431 position = min(position, len(self.contents))
307 if hasattr(new_child, 'parent') and new_child.parent is not None: 432 if hasattr(new_child, 'parent') and new_child.parent is not None:
308 # We're 'inserting' an element that's already one 433 # We're 'inserting' an element that's already one
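The new guard above keeps one BeautifulSoup object from being nested inside another: inserting a soup splices in its children one at a time. A sketch:

    from bs4 import BeautifulSoup

    target = BeautifulSoup("<div></div>", "html.parser")
    fragment = BeautifulSoup("<b>one</b><i>two</i>", "html.parser")
    target.div.append(fragment)   # children spliced in, not the soup itself
    print(target)                 # <div><b>one</b><i>two</i></div>
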
@@ -361,160 +486,326 @@ class PageElement(object):
361 self.contents.insert(position, new_child) 486 self.contents.insert(position, new_child)
362 487
363 def append(self, tag): 488 def append(self, tag):
364 """Appends the given tag to the contents of this tag.""" 489 """Appends the given PageElement to the contents of this one.
490
491 :param tag: A PageElement.
492 """
365 self.insert(len(self.contents), tag) 493 self.insert(len(self.contents), tag)
366 494
367 def insert_before(self, predecessor): 495 def extend(self, tags):
368 """Makes the given element the immediate predecessor of this one. 496 """Appends the given PageElements to this one's contents.
369 497
370 The two elements will have the same parent, and the given element 498 :param tags: A list of PageElements. If a single Tag is
499 provided instead, this PageElement's contents will be extended
500 with that Tag's contents.
501 """
502 if isinstance(tags, Tag):
503 tags = tags.contents
504 if isinstance(tags, list):
505 # Moving items around the tree may change their position in
506 # the original list. Make a list that won't change.
507 tags = list(tags)
508 for tag in tags:
509 self.append(tag)
510
511 def insert_before(self, *args):
512 """Makes the given element(s) the immediate predecessor of this one.
513
514 All the elements will have the same parent, and the given elements
371 will be immediately before this one. 515 will be immediately before this one.
516
517 :param args: One or more PageElements.
372 """ 518 """
373 if self is predecessor:
374 raise ValueError("Can't insert an element before itself.")
375 parent = self.parent 519 parent = self.parent
376 if parent is None: 520 if parent is None:
377 raise ValueError( 521 raise ValueError(
378 "Element has no parent, so 'before' has no meaning.") 522 "Element has no parent, so 'before' has no meaning.")
379 # Extract first so that the index won't be screwed up if they 523 if any(x is self for x in args):
380 # are siblings. 524 raise ValueError("Can't insert an element before itself.")
381 if isinstance(predecessor, PageElement): 525 for predecessor in args:
382 predecessor.extract() 526 # Extract first so that the index won't be screwed up if they
383 index = parent.index(self) 527 # are siblings.
384 parent.insert(index, predecessor) 528 if isinstance(predecessor, PageElement):
385 529 predecessor.extract()
386 def insert_after(self, successor): 530 index = parent.index(self)
387 """Makes the given element the immediate successor of this one. 531 parent.insert(index, predecessor)
388 532
389 The two elements will have the same parent, and the given element 533 def insert_after(self, *args):
534 """Makes the given element(s) the immediate successor of this one.
535
536 The elements will have the same parent, and the given elements
390 will be immediately after this one. 537 will be immediately after this one.
538
539 :param args: One or more PageElements.
391 """ 540 """
392 if self is successor: 541 # Do all error checking before modifying the tree.
393 raise ValueError("Can't insert an element after itself.")
394 parent = self.parent 542 parent = self.parent
395 if parent is None: 543 if parent is None:
396 raise ValueError( 544 raise ValueError(
397 "Element has no parent, so 'after' has no meaning.") 545 "Element has no parent, so 'after' has no meaning.")
398 # Extract first so that the index won't be screwed up if they 546 if any(x is self for x in args):
399 # are siblings. 547 raise ValueError("Can't insert an element after itself.")
400 if isinstance(successor, PageElement): 548
401 successor.extract() 549 offset = 0
402 index = parent.index(self) 550 for successor in args:
403 parent.insert(index+1, successor) 551 # Extract first so that the index won't be screwed up if they
404 552 # are siblings.
405 def find_next(self, name=None, attrs={}, text=None, **kwargs): 553 if isinstance(successor, PageElement):
406 """Returns the first item that matches the given criteria and 554 successor.extract()
407 appears after this Tag in the document.""" 555 index = parent.index(self)
408 return self._find_one(self.find_all_next, name, attrs, text, **kwargs) 556 parent.insert(index+1+offset, successor)
557 offset += 1
558
559 def find_next(self, name=None, attrs={}, string=None, **kwargs):
560 """Find the first PageElement that matches the given criteria and
561 appears later in the document than this PageElement.
562
563 All find_* methods take a common set of arguments. See the online
564 documentation for detailed explanations.
565
566 :param name: A filter on tag name.
567 :param attrs: A dictionary of filters on attribute values.
568 :param string: A filter for a NavigableString with specific text.
569 :kwargs: A dictionary of filters on attribute values.
570 :return: A PageElement.
571 :rtype: bs4.element.Tag | bs4.element.NavigableString
572 """
573 return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
409 findNext = find_next # BS3 574 findNext = find_next # BS3
410 575
411 def find_all_next(self, name=None, attrs={}, text=None, limit=None, 576 def find_all_next(self, name=None, attrs={}, string=None, limit=None,
412 **kwargs): 577 **kwargs):
413 """Returns all items that match the given criteria and appear 578 """Find all PageElements that match the given criteria and appear
414 after this Tag in the document.""" 579 later in the document than this PageElement.
415 return self._find_all(name, attrs, text, limit, self.next_elements, 580
416 **kwargs) 581 All find_* methods take a common set of arguments. See the online
582 documentation for detailed explanations.
583
584 :param name: A filter on tag name.
585 :param attrs: A dictionary of filters on attribute values.
586 :param string: A filter for a NavigableString with specific text.
587 :param limit: Stop looking after finding this many results.
588 :kwargs: A dictionary of filters on attribute values.
589 :return: A ResultSet containing PageElements.
590 """
591 _stacklevel = kwargs.pop('_stacklevel', 2)
592 return self._find_all(name, attrs, string, limit, self.next_elements,
593 _stacklevel=_stacklevel+1, **kwargs)
417 findAllNext = find_all_next # BS3 594 findAllNext = find_all_next # BS3
418 595
419 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): 596 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
420 """Returns the closest sibling to this Tag that matches the 597 """Find the closest sibling to this PageElement that matches the
421 given criteria and appears after this Tag in the document.""" 598 given criteria and appears later in the document.
422 return self._find_one(self.find_next_siblings, name, attrs, text, 599
600 All find_* methods take a common set of arguments. See the
601 online documentation for detailed explanations.
602
603 :param name: A filter on tag name.
604 :param attrs: A dictionary of filters on attribute values.
605 :param string: A filter for a NavigableString with specific text.
606 :kwargs: A dictionary of filters on attribute values.
607 :return: A PageElement.
608 :rtype: bs4.element.Tag | bs4.element.NavigableString
609 """
610 return self._find_one(self.find_next_siblings, name, attrs, string,
423 **kwargs) 611 **kwargs)
424 findNextSibling = find_next_sibling # BS3 612 findNextSibling = find_next_sibling # BS3
425 613
426 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, 614 def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
427 **kwargs): 615 **kwargs):
428 """Returns the siblings of this Tag that match the given 616 """Find all siblings of this PageElement that match the given criteria
429 criteria and appear after this Tag in the document.""" 617 and appear later in the document.
430 return self._find_all(name, attrs, text, limit, 618
431 self.next_siblings, **kwargs) 619 All find_* methods take a common set of arguments. See the online
620 documentation for detailed explanations.
621
622 :param name: A filter on tag name.
623 :param attrs: A dictionary of filters on attribute values.
624 :param string: A filter for a NavigableString with specific text.
625 :param limit: Stop looking after finding this many results.
626 :kwargs: A dictionary of filters on attribute values.
627 :return: A ResultSet of PageElements.
628 :rtype: bs4.element.ResultSet
629 """
630 _stacklevel = kwargs.pop('_stacklevel', 2)
631 return self._find_all(
632 name, attrs, string, limit,
633 self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
634 )
432 findNextSiblings = find_next_siblings # BS3 635 findNextSiblings = find_next_siblings # BS3
433 fetchNextSiblings = find_next_siblings # BS2 636 fetchNextSiblings = find_next_siblings # BS2
434 637
435 def find_previous(self, name=None, attrs={}, text=None, **kwargs): 638 def find_previous(self, name=None, attrs={}, string=None, **kwargs):
436 """Returns the first item that matches the given criteria and 639 """Look backwards in the document from this PageElement and find the
437 appears before this Tag in the document.""" 640 first PageElement that matches the given criteria.
641
642 All find_* methods take a common set of arguments. See the online
643 documentation for detailed explanations.
644
645 :param name: A filter on tag name.
646 :param attrs: A dictionary of filters on attribute values.
647 :param string: A filter for a NavigableString with specific text.
648 :kwargs: A dictionary of filters on attribute values.
649 :return: A PageElement.
650 :rtype: bs4.element.Tag | bs4.element.NavigableString
651 """
438 return self._find_one( 652 return self._find_one(
439 self.find_all_previous, name, attrs, text, **kwargs) 653 self.find_all_previous, name, attrs, string, **kwargs)
440 findPrevious = find_previous # BS3 654 findPrevious = find_previous # BS3
441 655
442 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, 656 def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
443 **kwargs): 657 **kwargs):
444 """Returns all items that match the given criteria and appear 658 """Look backwards in the document from this PageElement and find all
445 before this Tag in the document.""" 659 PageElements that match the given criteria.
446 return self._find_all(name, attrs, text, limit, self.previous_elements, 660
447 **kwargs) 661 All find_* methods take a common set of arguments. See the online
662 documentation for detailed explanations.
663
664 :param name: A filter on tag name.
665 :param attrs: A dictionary of filters on attribute values.
666 :param string: A filter for a NavigableString with specific text.
667 :param limit: Stop looking after finding this many results.
668 :kwargs: A dictionary of filters on attribute values.
669 :return: A ResultSet of PageElements.
670 :rtype: bs4.element.ResultSet
671 """
672 _stacklevel = kwargs.pop('_stacklevel', 2)
673 return self._find_all(
674 name, attrs, string, limit, self.previous_elements,
675 _stacklevel=_stacklevel+1, **kwargs
676 )
448 findAllPrevious = find_all_previous # BS3 677 findAllPrevious = find_all_previous # BS3
449 fetchPrevious = find_all_previous # BS2 678 fetchPrevious = find_all_previous # BS2
450 679
451 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): 680 def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
452 """Returns the closest sibling to this Tag that matches the 681 """Returns the closest sibling to this PageElement that matches the
453 given criteria and appears before this Tag in the document.""" 682 given criteria and appears earlier in the document.
454 return self._find_one(self.find_previous_siblings, name, attrs, text, 683
684 All find_* methods take a common set of arguments. See the online
685 documentation for detailed explanations.
686
687 :param name: A filter on tag name.
688 :param attrs: A dictionary of filters on attribute values.
689 :param string: A filter for a NavigableString with specific text.
690 :kwargs: A dictionary of filters on attribute values.
691 :return: A PageElement.
692 :rtype: bs4.element.Tag | bs4.element.NavigableString
693 """
694 return self._find_one(self.find_previous_siblings, name, attrs, string,
455 **kwargs) 695 **kwargs)
456 findPreviousSibling = find_previous_sibling # BS3 696 findPreviousSibling = find_previous_sibling # BS3
457 697
458 def find_previous_siblings(self, name=None, attrs={}, text=None, 698 def find_previous_siblings(self, name=None, attrs={}, string=None,
459 limit=None, **kwargs): 699 limit=None, **kwargs):
460 """Returns the siblings of this Tag that match the given 700 """Returns all siblings to this PageElement that match the
461 criteria and appear before this Tag in the document.""" 701 given criteria and appear earlier in the document.
462 return self._find_all(name, attrs, text, limit, 702
463 self.previous_siblings, **kwargs) 703 All find_* methods take a common set of arguments. See the online
704 documentation for detailed explanations.
705
706 :param name: A filter on tag name.
707 :param attrs: A dictionary of filters on attribute values.
708 :param string: A filter for a NavigableString with specific text.
709 :param limit: Stop looking after finding this many results.
710 :kwargs: A dictionary of filters on attribute values.
711 :return: A ResultSet of PageElements.
712 :rtype: bs4.element.ResultSet
713 """
714 _stacklevel = kwargs.pop('_stacklevel', 2)
715 return self._find_all(
716 name, attrs, string, limit,
717 self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
718 )
464 findPreviousSiblings = find_previous_siblings # BS3 719 findPreviousSiblings = find_previous_siblings # BS3
465 fetchPreviousSiblings = find_previous_siblings # BS2 720 fetchPreviousSiblings = find_previous_siblings # BS2
466 721
467 def find_parent(self, name=None, attrs={}, **kwargs): 722 def find_parent(self, name=None, attrs={}, **kwargs):
468 """Returns the closest parent of this Tag that matches the given 723 """Find the closest parent of this PageElement that matches the given
469 criteria.""" 724 criteria.
725
726 All find_* methods take a common set of arguments. See the online
727 documentation for detailed explanations.
728
729 :param name: A filter on tag name.
730 :param attrs: A dictionary of filters on attribute values.
731 :kwargs: A dictionary of filters on attribute values.
732
733 :return: A PageElement.
734 :rtype: bs4.element.Tag | bs4.element.NavigableString
735 """
470 # NOTE: We can't use _find_one because findParents takes a different 736 # NOTE: We can't use _find_one because findParents takes a different
471 # set of arguments. 737 # set of arguments.
472 r = None 738 r = None
473 l = self.find_parents(name, attrs, 1, **kwargs) 739 l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
474 if l: 740 if l:
475 r = l[0] 741 r = l[0]
476 return r 742 return r
477 findParent = find_parent # BS3 743 findParent = find_parent # BS3
478 744
479 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 745 def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
480 """Returns the parents of this Tag that match the given 746 """Find all parents of this PageElement that match the given criteria.
481 criteria.""" 747
748 All find_* methods take a common set of arguments. See the online
749 documentation for detailed explanations.
482 750
751 :param name: A filter on tag name.
752 :param attrs: A dictionary of filters on attribute values.
753 :param limit: Stop looking after finding this many results.
754 :kwargs: A dictionary of filters on attribute values.
755
756 :return: A PageElement.
757 :rtype: bs4.element.Tag | bs4.element.NavigableString
758 """
759 _stacklevel = kwargs.pop('_stacklevel', 2)
483 return self._find_all(name, attrs, None, limit, self.parents, 760 return self._find_all(name, attrs, None, limit, self.parents,
484 **kwargs) 761 _stacklevel=_stacklevel+1, **kwargs)
485 findParents = find_parents # BS3 762 findParents = find_parents # BS3
486 fetchParents = find_parents # BS2 763 fetchParents = find_parents # BS2
487 764
488 @property 765 @property
489 def next(self): 766 def next(self):
767 """The PageElement, if any, that was parsed just after this one.
768
769 :return: A PageElement.
770 :rtype: bs4.element.Tag | bs4.element.NavigableString
771 """
490 return self.next_element 772 return self.next_element
491 773
492 @property 774 @property
493 def previous(self): 775 def previous(self):
776 """The PageElement, if any, that was parsed just before this one.
777
778 :return: A PageElement.
779 :rtype: bs4.element.Tag | bs4.element.NavigableString
780 """
494 return self.previous_element 781 return self.previous_element
495 782
496 #These methods do the real heavy lifting. 783 #These methods do the real heavy lifting.
497 784
498 def _find_one(self, method, name, attrs, text, **kwargs): 785 def _find_one(self, method, name, attrs, string, **kwargs):
499 r = None 786 r = None
500 l = method(name, attrs, text, 1, **kwargs) 787 l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
501 if l: 788 if l:
502 r = l[0] 789 r = l[0]
503 return r 790 return r
504 791
505 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 792 def _find_all(self, name, attrs, string, limit, generator, **kwargs):
506 "Iterates over a generator looking for things that match." 793 "Iterates over a generator looking for things that match."
794 _stacklevel = kwargs.pop('_stacklevel', 3)
507 795
508 if text is None and 'string' in kwargs: 796 if string is None and 'text' in kwargs:
509 text = kwargs['string'] 797 string = kwargs.pop('text')
510 del kwargs['string'] 798 warnings.warn(
799 "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
800 DeprecationWarning, stacklevel=_stacklevel
801 )
511 802
512 if isinstance(name, SoupStrainer): 803 if isinstance(name, SoupStrainer):
513 strainer = name 804 strainer = name
514 else: 805 else:
515 strainer = SoupStrainer(name, attrs, text, **kwargs) 806 strainer = SoupStrainer(name, attrs, string, **kwargs)
516 807
517 if text is None and not limit and not attrs and not kwargs: 808 if string is None and not limit and not attrs and not kwargs:
518 if name is True or name is None: 809 if name is True or name is None:
519 # Optimization to find all tags. 810 # Optimization to find all tags.
520 result = (element for element in generator 811 result = (element for element in generator
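Several behaviors land in this hunk; a combined sketch of extend(), the multi-argument insert_before()/insert_after(), and the text-to-string deprecation in the find_* family:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p><b>b</b></p>", "html.parser")
    soup.b.insert_after(" one", soup.new_string(" two"))  # several successors at once
    soup.p.extend([soup.new_tag("i"), " tail"])           # a list, or a Tag's contents
    print(soup)   # <p><b>b</b> one two<i></i> tail</p>

    soup.find_all(string="b")   # preferred spelling
    soup.find_all(text="b")     # still works, but warns: use 'string' instead
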
@@ -522,9 +813,23 @@ class PageElement(object):
522 return ResultSet(strainer, result) 813 return ResultSet(strainer, result)
523 elif isinstance(name, str): 814 elif isinstance(name, str):
524 # Optimization to find all tags with a given name. 815 # Optimization to find all tags with a given name.
816 if name.count(':') == 1:
817 # This is a name with a prefix. If this is a namespace-aware document,
818 # we need to match the local name against tag.name. If not,
819 # we need to match the fully-qualified name against tag.name.
820 prefix, local_name = name.split(':', 1)
821 else:
822 prefix = None
823 local_name = name
525 result = (element for element in generator 824 result = (element for element in generator
526 if isinstance(element, Tag) 825 if isinstance(element, Tag)
527 and element.name == name) 826 and (
827 element.name == name
828 ) or (
829 element.name == local_name
830 and (prefix is None or element.prefix == prefix)
831 )
832 )
528 return ResultSet(strainer, result) 833 return ResultSet(strainer, result)
529 results = ResultSet(strainer) 834 results = ResultSet(strainer)
530 while True: 835 while True:
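The prefixed-name branch above lets find_all("prefix:name") match namespaced tags by prefix plus local name (a sketch; the "xml" features require lxml):

    from bs4 import BeautifulSoup

    doc = '<root xmlns:ns="http://example.test/"><ns:item>x</ns:item></root>'
    soup = BeautifulSoup(doc, "xml")
    print(soup.find_all("ns:item"))   # matched via prefix "ns" + local name "item"
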
@@ -544,6 +849,10 @@ class PageElement(object):
544 #NavigableStrings and Tags. 849 #NavigableStrings and Tags.
545 @property 850 @property
546 def next_elements(self): 851 def next_elements(self):
852 """All PageElements that were parsed after this one.
853
854 :yield: A sequence of PageElements.
855 """
547 i = self.next_element 856 i = self.next_element
548 while i is not None: 857 while i is not None:
549 yield i 858 yield i
@@ -551,6 +860,11 @@ class PageElement(object):
551 860
552 @property 861 @property
553 def next_siblings(self): 862 def next_siblings(self):
863 """All PageElements that are siblings of this one but were parsed
864 later.
865
866 :yield: A sequence of PageElements.
867 """
554 i = self.next_sibling 868 i = self.next_sibling
555 while i is not None: 869 while i is not None:
556 yield i 870 yield i
@@ -558,6 +872,10 @@ class PageElement(object):
558 872
559 @property 873 @property
560 def previous_elements(self): 874 def previous_elements(self):
875 """All PageElements that were parsed before this one.
876
877 :yield: A sequence of PageElements.
878 """
561 i = self.previous_element 879 i = self.previous_element
562 while i is not None: 880 while i is not None:
563 yield i 881 yield i
@@ -565,6 +883,11 @@ class PageElement(object):
565 883
566 @property 884 @property
567 def previous_siblings(self): 885 def previous_siblings(self):
886 """All PageElements that are siblings of this one but were parsed
887 earlier.
888
889 :yield: A sequence of PageElements.
890 """
568 i = self.previous_sibling 891 i = self.previous_sibling
569 while i is not None: 892 while i is not None:
570 yield i 893 yield i
@@ -572,87 +895,23 @@ class PageElement(object):
572 895
573 @property 896 @property
574 def parents(self): 897 def parents(self):
898 """All PageElements that are parents of this PageElement.
899
900 :yield: A sequence of PageElements.
901 """
575 i = self.parent 902 i = self.parent
576 while i is not None: 903 while i is not None:
577 yield i 904 yield i
578 i = i.parent 905 i = i.parent
579 906
580 # Methods for supporting CSS selectors. 907 @property
581 908 def decomposed(self):
582 tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') 909 """Check whether a PageElement has been decomposed.
583
584 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
585 # \---------------------------/ \---/\-------------/ \-------/
586 # | | | |
587 # | | | The value
588 # | | ~,|,^,$,* or =
589 # | Attribute
590 # Tag
591 attribselect_re = re.compile(
592 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
593 r'=?"?(?P<value>[^\]"]*)"?\]$'
594 )
595
596 def _attr_value_as_string(self, value, default=None):
597 """Force an attribute value into a string representation.
598 910
599 A multi-valued attribute will be converted into a 911 :rtype: bool
600 space-separated string.
601 """ 912 """
602 value = self.get(value, default) 913 return getattr(self, '_decomposed', False) or False
603 if isinstance(value, list) or isinstance(value, tuple): 914
604 value =" ".join(value)
605 return value
606
607 def _tag_name_matches_and(self, function, tag_name):
608 if not tag_name:
609 return function
610 else:
611 def _match(tag):
612 return tag.name == tag_name and function(tag)
613 return _match
614
615 def _attribute_checker(self, operator, attribute, value=''):
616 """Create a function that performs a CSS selector operation.
617
618 Takes an operator, attribute and optional value. Returns a
619 function that will return True for elements that match that
620 combination.
621 """
622 if operator == '=':
623 # string representation of `attribute` is equal to `value`
624 return lambda el: el._attr_value_as_string(attribute) == value
625 elif operator == '~':
626 # space-separated list representation of `attribute`
627 # contains `value`
628 def _includes_value(element):
629 attribute_value = element.get(attribute, [])
630 if not isinstance(attribute_value, list):
631 attribute_value = attribute_value.split()
632 return value in attribute_value
633 return _includes_value
634 elif operator == '^':
635 # string representation of `attribute` starts with `value`
636 return lambda el: el._attr_value_as_string(
637 attribute, '').startswith(value)
638 elif operator == '$':
639 # string represenation of `attribute` ends with `value`
640 return lambda el: el._attr_value_as_string(
641 attribute, '').endswith(value)
642 elif operator == '*':
643 # string representation of `attribute` contains `value`
644 return lambda el: value in el._attr_value_as_string(attribute, '')
645 elif operator == '|':
646 # string representation of `attribute` is either exactly
647 # `value` or starts with `value` and then a dash.
648 def _is_or_starts_with_dash(element):
649 attribute_value = element._attr_value_as_string(attribute, '')
650 return (attribute_value == value or attribute_value.startswith(
651 value + '-'))
652 return _is_or_starts_with_dash
653 else:
654 return lambda el: el.has_attr(attribute)
655
656 # Old non-property versions of the generators, for backwards 915 # Old non-property versions of the generators, for backwards
657 # compatibility with BS3. 916 # compatibility with BS3.
658 def nextGenerator(self): 917 def nextGenerator(self):
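The new decomposed property reads the _decomposed flag; a sketch, assuming the matching Tag.decompose() change elsewhere in this release sets that flag:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p><b>gone</b></p>", "html.parser")
    b = soup.b
    b.decompose()
    print(b.decomposed, soup.p.decomposed)   # True False
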
@@ -672,6 +931,11 @@ class PageElement(object):
672 931
673 932
674class NavigableString(str, PageElement): 933class NavigableString(str, PageElement):
934 """A Python Unicode string that is part of a parse tree.
935
936 When Beautiful Soup parses the markup <b>penguin</b>, it will
937 create a NavigableString for the string "penguin".
938 """
675 939
676 PREFIX = '' 940 PREFIX = ''
677 SUFFIX = '' 941 SUFFIX = ''
@@ -691,12 +955,22 @@ class NavigableString(str, PageElement):
691 u.setup() 955 u.setup()
692 return u 956 return u
693 957
694 def __copy__(self): 958 def __deepcopy__(self, memo, recursive=False):
695 """A copy of a NavigableString has the same contents and class 959 """A copy of a NavigableString has the same contents and class
696 as the original, but it is not connected to the parse tree. 960 as the original, but it is not connected to the parse tree.
961
962 :param recursive: This parameter is ignored; it's only defined
963 so that NavigableString.__deepcopy__ implements the same
964 signature as Tag.__deepcopy__.
697 """ 965 """
698 return type(self)(self) 966 return type(self)(self)
699 967
968 def __copy__(self):
969 """A copy of a NavigableString can only be a deep copy, because
970 only one PageElement can occupy a given place in a parse tree.
971 """
972 return self.__deepcopy__({})
973
700 def __getnewargs__(self): 974 def __getnewargs__(self):
701 return (str(self),) 975 return (str(self),)
702 976
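Copy semantics after this hunk: copying a NavigableString always detaches it from the tree, whether via copy.copy() or copy.deepcopy(). A sketch:

    import copy
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<b>penguin</b>", "html.parser")
    s = soup.b.string
    clone = copy.copy(s)              # routed through __deepcopy__
    print(clone == s, clone.parent)   # True None
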
@@ -712,55 +986,146 @@ class NavigableString(str, PageElement):
712 self.__class__.__name__, attr)) 986 self.__class__.__name__, attr))
713 987
714 def output_ready(self, formatter="minimal"): 988 def output_ready(self, formatter="minimal"):
989 """Run the string through the provided formatter.
990
991 :param formatter: A Formatter object, or a string naming one of the standard formatters.
992 """
715 output = self.format_string(self, formatter) 993 output = self.format_string(self, formatter)
716 return self.PREFIX + output + self.SUFFIX 994 return self.PREFIX + output + self.SUFFIX
717 995
718 @property 996 @property
719 def name(self): 997 def name(self):
998 """Since a NavigableString is not a Tag, it has no .name.
999
1000 This property is implemented so that code like this doesn't crash
1001 when run on a mixture of Tag and NavigableString objects:
1002 [x.name for x in tag.children]
1003 """
720 return None 1004 return None
721 1005
722 @name.setter 1006 @name.setter
723 def name(self, name): 1007 def name(self, name):
1008 """Prevent NavigableString.name from ever being set."""
724 raise AttributeError("A NavigableString cannot be given a name.") 1009 raise AttributeError("A NavigableString cannot be given a name.")
725 1010
1011 def _all_strings(self, strip=False, types=PageElement.default):
1012 """Yield all strings of certain classes, possibly stripping them.
1013
1014 This makes it easy for NavigableString to implement methods
1015 like get_text() as conveniences, creating a consistent
1016 text-extraction API across all PageElements.
1017
1018 :param strip: If True, all strings will be stripped before being
1019 yielded.
1020
1021 :param types: A tuple of NavigableString subclasses. If this
1022 NavigableString isn't one of those subclasses, the
1023 sequence will be empty. By default, the subclasses
1024 considered are NavigableString and CData objects. That
1025 means no comments, processing instructions, etc.
1026
1027 :yield: A sequence that either contains this string, or is empty.
1028
1029 """
1030 if types is self.default:
1031 # This is kept in Tag because it's full of subclasses of
1032 # this class, which aren't defined until later in the file.
1033 types = Tag.DEFAULT_INTERESTING_STRING_TYPES
1034
1035 # Do nothing if the caller is looking for specific types of
1036 # string, and we're of a different type.
1037 #
1038 # We check specific types instead of using isinstance(self,
1039 # types) because all of these classes subclass
1040 # NavigableString. Anyone who's using this feature probably
1041 # wants generic NavigableStrings but not other stuff.
1042 my_type = type(self)
1043 if types is not None:
1044 if isinstance(types, type):
1045 # Looking for a single type.
1046 if my_type is not types:
1047 return
1048 elif my_type not in types:
1049 # Looking for one of a list of types.
1050 return
1051
1052 value = self
1053 if strip:
1054 value = value.strip()
1055 if len(value) > 0:
1056 yield value
1057 strings = property(_all_strings)
1058
726class PreformattedString(NavigableString): 1059class PreformattedString(NavigableString):
727 """A NavigableString not subject to the normal formatting rules. 1060 """A NavigableString not subject to the normal formatting rules.
728 1061
729 The string will be passed into the formatter (to trigger side effects), 1062 This is an abstract class used for special kinds of strings such
730 but the return value will be ignored. 1063 as comments (the Comment class) and CDATA blocks (the CData
1064 class).
731 """ 1065 """
732 1066
733 def output_ready(self, formatter="minimal"): 1067 PREFIX = ''
734 """CData strings are passed into the formatter. 1068 SUFFIX = ''
735 But the return value is ignored.""" 1069
736 self.format_string(self, formatter) 1070 def output_ready(self, formatter=None):
1071 """Make this string ready for output by adding any subclass-specific
1072 prefix or suffix.
1073
1074 :param formatter: A Formatter object, or a string naming one
1075 of the standard formatters. The string will be passed into the
1076 Formatter, but only to trigger any side effects: the return
1077 value is ignored.
1078
1079 :return: The string, with any subclass-specific prefix and
1080 suffix added on.
1081 """
1082 if formatter is not None:
1083 ignore = self.format_string(self, formatter)
737 return self.PREFIX + self + self.SUFFIX 1084 return self.PREFIX + self + self.SUFFIX
738 1085
739class CData(PreformattedString): 1086class CData(PreformattedString):
740 1087 """A CDATA block."""
741 PREFIX = '<![CDATA[' 1088 PREFIX = '<![CDATA['
742 SUFFIX = ']]>' 1089 SUFFIX = ']]>'
743 1090
744class ProcessingInstruction(PreformattedString): 1091class ProcessingInstruction(PreformattedString):
1092 """An SGML processing instruction."""
745 1093
746 PREFIX = '<?' 1094 PREFIX = '<?'
747 SUFFIX = '>' 1095 SUFFIX = '>'
748 1096
749class Comment(PreformattedString): 1097class XMLProcessingInstruction(ProcessingInstruction):
1098 """An XML processing instruction."""
1099 PREFIX = '<?'
1100 SUFFIX = '?>'
750 1101
1102class Comment(PreformattedString):
1103 """An HTML or XML comment."""
751 PREFIX = '<!--' 1104 PREFIX = '<!--'
752 SUFFIX = '-->' 1105 SUFFIX = '-->'
753 1106
754 1107
755class Declaration(PreformattedString): 1108class Declaration(PreformattedString):
1109 """An XML declaration."""
756 PREFIX = '<?' 1110 PREFIX = '<?'
757 SUFFIX = '?>' 1111 SUFFIX = '?>'
758 1112
759 1113
760class Doctype(PreformattedString): 1114class Doctype(PreformattedString):
761 1115 """A document type declaration."""
762 @classmethod 1116 @classmethod
763 def for_name_and_ids(cls, name, pub_id, system_id): 1117 def for_name_and_ids(cls, name, pub_id, system_id):
1118 """Generate an appropriate document type declaration for a given
1119 public ID and system ID.
1120
1121 :param name: The name of the document's root element, e.g. 'html'.
1122 :param pub_id: The Formal Public Identifier for this document type,
1123 e.g. '-//W3C//DTD XHTML 1.1//EN'
1124 :param system_id: The system identifier for this document type,
1125 e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
1126
1127 :return: A Doctype.
1128 """
764 value = name or '' 1129 value = name or ''
765 if pub_id is not None: 1130 if pub_id is not None:
766 value += ' PUBLIC "%s"' % pub_id 1131 value += ' PUBLIC "%s"' % pub_id
@@ -775,14 +1140,105 @@ class Doctype(PreformattedString):
775 SUFFIX = '>\n' 1140 SUFFIX = '>\n'
776 1141
777 1142
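
The PREFIX/SUFFIX pairs above are what output_ready() wraps around the raw text; the formatter is invoked only for its side effects. A short sketch, again assuming html.parser:

    from bs4 import BeautifulSoup, CData, Comment

    soup = BeautifulSoup("<p><!--hidden--></p>", "html.parser")
    comment = soup.p.string
    print(str(comment))                    # hidden   (the bare text)
    print(comment.output_ready())          # <!--hidden-->
    print(CData("x < y").output_ready())   # <![CDATA[x < y]]>
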
1143class Stylesheet(NavigableString):
1144 """A NavigableString representing a stylesheet (probably
1145 CSS).
1146
1147 Used to distinguish embedded stylesheets from textual content.
1148 """
1149 pass
1150
1151
1152class Script(NavigableString):
1153 """A NavigableString representing an executable script (probably
1154 JavaScript).
1155
1156 Used to distinguish executable code from textual content.
1157 """
1158 pass
1159
1160
1161class TemplateString(NavigableString):
1162 """A NavigableString representing a string found inside an HTML
1163 template embedded in a larger document.
1164
1165 Used to distinguish such strings from the main body of the document.
1166 """
1167 pass
1168
1169
1170class RubyTextString(NavigableString):
1171 """A NavigableString representing the contents of the <rt> HTML
1172 element.
1173
1174 https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
1175
1176 Can be used to distinguish such strings from the strings they're
1177 annotating.
1178 """
1179 pass
1180
1181
1182class RubyParenthesisString(NavigableString):
1183 """A NavigableString representing the contents of the <rp> HTML
1184 element.
1185
1186 https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
1187 """
1188 pass
1189
1190
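
These subclasses back the builders' string_containers feature: with html.parser, for instance, the contents of <style>, <script> and <template> are typed accordingly, and because they are not in DEFAULT_INTERESTING_STRING_TYPES they drop out of get_text(). A sketch (behavior as of bs4 4.9+):

    from bs4 import BeautifulSoup
    from bs4.element import Script, Stylesheet

    soup = BeautifulSoup(
        "<style>p {}</style><script>x = 1</script><p>hi</p>", "html.parser")
    print(type(soup.style.string) is Stylesheet)  # True
    print(type(soup.script.string) is Script)     # True
    print(soup.get_text())  # hi -- script/style text no longer counts as "text"
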
778class Tag(PageElement): 1191class Tag(PageElement):
1192 """Represents an HTML or XML tag that is part of a parse tree, along
1193 with its attributes and contents.
779 1194
780 """Represents a found HTML tag with its attributes and contents.""" 1195 When Beautiful Soup parses the markup <b>penguin</b>, it will
1196 create a Tag object representing the <b> tag.
1197 """
781 1198
782 def __init__(self, parser=None, builder=None, name=None, namespace=None, 1199 def __init__(self, parser=None, builder=None, name=None, namespace=None,
783 prefix=None, attrs=None, parent=None, previous=None): 1200 prefix=None, attrs=None, parent=None, previous=None,
784 "Basic constructor." 1201 is_xml=None, sourceline=None, sourcepos=None,
785 1202 can_be_empty_element=None, cdata_list_attributes=None,
1203 preserve_whitespace_tags=None,
1204 interesting_string_types=None,
1205 namespaces=None
1206 ):
1207 """Basic constructor.
1208
1209 :param parser: A BeautifulSoup object.
1210 :param builder: A TreeBuilder.
1211 :param name: The name of the tag.
1212 :param namespace: The URI of this Tag's XML namespace, if any.
1213 :param prefix: The prefix for this Tag's XML namespace, if any.
1214 :param attrs: A dictionary of this Tag's attribute values.
1215 :param parent: The PageElement to use as this Tag's parent.
1216 :param previous: The PageElement that was parsed immediately before
1217 this tag.
1218 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1219 HTML tag.
1220 :param sourceline: The line number where this tag was found in its
1221 source document.
1222 :param sourcepos: The character position within `sourceline` where this
1223 tag was found.
1224 :param can_be_empty_element: If True, this tag should be
1225 represented as <tag/>. If False, this tag should be represented
1226 as <tag></tag>.
1227 :param cdata_list_attributes: A list of attributes whose values should
1228 be treated as CDATA if they ever show up on this tag.
1229 :param preserve_whitespace_tags: A list of tag names whose contents
1230 should have their whitespace preserved.
1231 :param interesting_string_types: This is a NavigableString
1232 subclass or a tuple of them. When iterating over this
1233 Tag's strings in methods like Tag.strings or Tag.get_text,
1234 these are the types of strings that are interesting enough
1235 to be considered. The default is to consider
1236 NavigableString and CData the only interesting string
1237 subtypes.
1238 :param namespaces: A dictionary mapping currently active
1239 namespace prefixes to URIs. This can be used later to
1240 construct CSS selectors.
1241 """
786 if parser is None: 1242 if parser is None:
787 self.parser_class = None 1243 self.parser_class = None
788 else: 1244 else:
@@ -793,7 +1249,12 @@ class Tag(PageElement):
793 raise ValueError("No value provided for new tag's name.") 1249 raise ValueError("No value provided for new tag's name.")
794 self.name = name 1250 self.name = name
795 self.namespace = namespace 1251 self.namespace = namespace
1252 self._namespaces = namespaces or {}
796 self.prefix = prefix 1253 self.prefix = prefix
1254 if ((not builder or builder.store_line_numbers)
1255 and (sourceline is not None or sourcepos is not None)):
1256 self.sourceline = sourceline
1257 self.sourcepos = sourcepos
797 if attrs is None: 1258 if attrs is None:
798 attrs = {} 1259 attrs = {}
799 elif attrs: 1260 elif attrs:
@@ -804,32 +1265,109 @@ class Tag(PageElement):
804 attrs = dict(attrs) 1265 attrs = dict(attrs)
805 else: 1266 else:
806 attrs = dict(attrs) 1267 attrs = dict(attrs)
1268
1269 # If possible, determine ahead of time whether this tag is an
1270 # XML tag.
1271 if builder:
1272 self.known_xml = builder.is_xml
1273 else:
1274 self.known_xml = is_xml
807 self.attrs = attrs 1275 self.attrs = attrs
808 self.contents = [] 1276 self.contents = []
809 self.setup(parent, previous) 1277 self.setup(parent, previous)
810 self.hidden = False 1278 self.hidden = False
811 1279
812 # Set up any substitutions, such as the charset in a META tag. 1280 if builder is None:
813 if builder is not None: 1281 # In the absence of a TreeBuilder, use whatever values were
1282 # passed in here. They're probably None, unless this is a copy of some
1283 # other tag.
1284 self.can_be_empty_element = can_be_empty_element
1285 self.cdata_list_attributes = cdata_list_attributes
1286 self.preserve_whitespace_tags = preserve_whitespace_tags
1287 self.interesting_string_types = interesting_string_types
1288 else:
1289 # Set up any substitutions for this tag, such as the charset in a META tag.
814 builder.set_up_substitutions(self) 1290 builder.set_up_substitutions(self)
1291
1292 # Ask the TreeBuilder whether this tag might be an empty-element tag.
815 self.can_be_empty_element = builder.can_be_empty_element(name) 1293 self.can_be_empty_element = builder.can_be_empty_element(name)
816 else: 1294
817 self.can_be_empty_element = False 1295 # Keep track of the list of attributes of this tag that
1296 # might need to be treated as a list.
1297 #
1298 # For performance reasons, we store the whole data structure
1299 # rather than asking the question of every tag. Asking would
1300 # require building a new data structure every time, and
1301 # (unlike can_be_empty_element), we almost never need
1302 # to check this.
1303 self.cdata_list_attributes = builder.cdata_list_attributes
1304
1305 # Keep track of the names that might cause this tag to be treated as a
1306 # whitespace-preserved tag.
1307 self.preserve_whitespace_tags = builder.preserve_whitespace_tags
1308
1309 if self.name in builder.string_containers:
1310 # This sort of tag uses a special string container
1311 # subclass for most of its strings; treat that subclass as the interesting string type.
1312 self.interesting_string_types = builder.string_containers[self.name]
1313 else:
1314 self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
818 1315
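
Since builders store line numbers by default, a parsed Tag now remembers where it came from. A quick sketch with html.parser (which numbers lines from 1 and positions from 0):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html>\n  <p>one</p>\n</html>", "html.parser")
    print(soup.p.sourceline, soup.p.sourcepos)  # 2 2
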
819 parserClass = _alias("parser_class") # BS3 1316 parserClass = _alias("parser_class") # BS3
820 1317
821 def __copy__(self): 1318 def __deepcopy__(self, memo, recursive=True):
822 """A copy of a Tag is a new Tag, unconnected to the parse tree. 1319 """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
823 Its contents are a copy of the old Tag's contents. 1320 Its contents are a copy of the old Tag's contents.
824 """ 1321 """
825 clone = type(self)(None, self.builder, self.name, self.namespace, 1322 clone = self._clone()
826 self.nsprefix, self.attrs) 1323
1324 if recursive:
1325 # Clone this tag's descendants recursively, but without
1326 # making any recursive function calls.
1327 tag_stack = [clone]
1328 for event, element in self._event_stream(self.descendants):
1329 if event is Tag.END_ELEMENT_EVENT:
1330 # Stop appending incoming Tags to the Tag that was
1331 # just closed.
1332 tag_stack.pop()
1333 else:
1334 descendant_clone = element.__deepcopy__(
1335 memo, recursive=False
1336 )
1337 # Add to its parent's .contents
1338 tag_stack[-1].append(descendant_clone)
1339
1340 if event is Tag.START_ELEMENT_EVENT:
1341 # Add the Tag itself to the stack so that its
1342 # children will be .appended to it.
1343 tag_stack.append(descendant_clone)
1344 return clone
1345
1346 def __copy__(self):
1347 """A copy of a Tag must always be a deep copy, because a Tag's
1348 children can only have one parent at a time.
1349 """
1350 return self.__deepcopy__({})
1351
1352 def _clone(self):
1353 """Create a new Tag just like this one, but with no
1354 contents and unattached to any parse tree.
1355
1356 This is the first step in the deepcopy process.
1357 """
1358 clone = type(self)(
1359 None, None, self.name, self.namespace,
1360 self.prefix, self.attrs, is_xml=self._is_xml,
1361 sourceline=self.sourceline, sourcepos=self.sourcepos,
1362 can_be_empty_element=self.can_be_empty_element,
1363 cdata_list_attributes=self.cdata_list_attributes,
1364 preserve_whitespace_tags=self.preserve_whitespace_tags,
1365 interesting_string_types=self.interesting_string_types
1366 )
827 for attr in ('can_be_empty_element', 'hidden'): 1367 for attr in ('can_be_empty_element', 'hidden'):
828 setattr(clone, attr, getattr(self, attr)) 1368 setattr(clone, attr, getattr(self, attr))
829 for child in self.contents:
830 clone.append(child.__copy__())
831 return clone 1369 return clone
832 1370
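
Because a PageElement can only have one parent at a time, __copy__ now delegates to the (iterative) __deepcopy__, so every copy is a detached deep copy:

    import copy
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div><p>a</p></div>", "html.parser")
    dup = copy.copy(soup.div)  # equivalent to copy.deepcopy() for Tags
    print(dup.parent)          # None -- detached from the parse tree
    print(dup == soup.div)     # True -- structurally equal per __eq__
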
833 @property 1371 @property
834 def is_empty_element(self): 1372 def is_empty_element(self):
835 """Is this tag an empty-element tag? (aka a self-closing tag) 1373 """Is this tag an empty-element tag? (aka a self-closing tag)
@@ -850,13 +1388,17 @@ class Tag(PageElement):
850 1388
851 @property 1389 @property
852 def string(self): 1390 def string(self):
853 """Convenience property to get the single string within this tag. 1391 """Convenience property to get the single string within this
1392 PageElement.
854 1393
855 :Return: If this tag has a single string child, return value 1394 TODO It might make sense to have NavigableString.string return
856 is that string. If this tag has no children, or more than one 1395 itself.
857 child, return value is None. If this tag has one child tag, 1396
1397 :return: If this element has a single string child, return
1398 value is that string. If this element has one child tag,
858 return value is the 'string' attribute of the child tag, 1399 return value is the 'string' attribute of the child tag,
859 recursively. 1400 recursively. If this element is itself a string, has no
1401 children, or has more than one child, return value is None.
860 """ 1402 """
861 if len(self.contents) != 1: 1403 if len(self.contents) != 1:
862 return None 1404 return None
@@ -867,57 +1409,75 @@ class Tag(PageElement):
867 1409
868 @string.setter 1410 @string.setter
869 def string(self, string): 1411 def string(self, string):
1412 """Replace this PageElement's contents with `string`."""
870 self.clear() 1413 self.clear()
871 self.append(string.__class__(string)) 1414 self.append(string.__class__(string))
872 1415
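
.string is defined only when there is exactly one child (recursing through single-child tags), and the setter replaces the whole contents:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<b><i>one</i></b><p>a<u>b</u></p>", "html.parser")
    print(soup.b.string)     # one   (recurses into the single <i> child)
    print(soup.p.string)     # None  (two children)
    soup.b.i.string = "two"  # setter: clear() then append the new string
    print(soup.b)            # <b><i>two</i></b>
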
873 def _all_strings(self, strip=False, types=(NavigableString, CData)): 1416 DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
1417 def _all_strings(self, strip=False, types=PageElement.default):
874 """Yield all strings of certain classes, possibly stripping them. 1418 """Yield all strings of certain classes, possibly stripping them.
875 1419
876 By default, yields only NavigableString and CData objects. So 1420 :param strip: If True, all strings will be stripped before being
877 no comments, processing instructions, etc. 1421 yielded.
1422
1423 :param types: A tuple of NavigableString subclasses. Any strings of
1424 a subclass not found in this list will be ignored. By
1425 default, the subclasses considered are the ones found in
1426 self.interesting_string_types. If that's not specified,
1427 only NavigableString and CData objects will be
1428 considered. That means no comments, processing
1429 instructions, etc.
1430
1431 :yield: A sequence of strings.
1432
878 """ 1433 """
1434 if types is self.default:
1435 types = self.interesting_string_types
1436
879 for descendant in self.descendants: 1437 for descendant in self.descendants:
880 if ( 1438 if (types is None and not isinstance(descendant, NavigableString)):
881 (types is None and not isinstance(descendant, NavigableString)) 1439 continue
882 or 1440 descendant_type = type(descendant)
883 (types is not None and type(descendant) not in types)): 1441 if isinstance(types, type):
1442 if descendant_type is not types:
1443 # We're not interested in strings of this type.
1444 continue
1445 elif types is not None and descendant_type not in types:
1446 # We're not interested in strings of this type.
884 continue 1447 continue
885 if strip: 1448 if strip:
886 descendant = descendant.strip() 1449 descendant = descendant.strip()
887 if len(descendant) == 0: 1450 if len(descendant) == 0:
888 continue 1451 continue
889 yield descendant 1452 yield descendant
890
891 strings = property(_all_strings) 1453 strings = property(_all_strings)
892 1454
893 @property 1455 def decompose(self):
894 def stripped_strings(self): 1456 """Recursively destroys this PageElement and its children.
895 for string in self._all_strings(True):
896 yield string
897 1457
898 def get_text(self, separator="", strip=False, 1458 This element will be removed from the tree and wiped out; so
899 types=(NavigableString, CData)): 1459 will everything beneath it.
900 """
901 Get all child strings, concatenated using the given separator.
902 """
903 return separator.join([s for s in self._all_strings(
904 strip, types=types)])
905 getText = get_text
906 text = property(get_text)
907 1460
908 def decompose(self): 1461 The behavior of a decomposed PageElement is undefined and you
909 """Recursively destroys the contents of this tree.""" 1462 should never use one for anything, but if you need to _check_
1463 whether an element has been decomposed, you can use the
1464 `decomposed` property.
1465 """
910 self.extract() 1466 self.extract()
911 i = self 1467 i = self
912 while i is not None: 1468 while i is not None:
913 next = i.next_element 1469 n = i.next_element
914 i.__dict__.clear() 1470 i.__dict__.clear()
915 i.contents = [] 1471 i.contents = []
916 i = next 1472 i._decomposed = True
1473 i = n
917 1474
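
decompose() now also marks every wiped element, which the decomposed property (added on PageElement elsewhere in this patch) exposes. Sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div><p>gone</p></div>", "html.parser")
    p = soup.p
    p.decompose()
    print(soup)          # <div></div>
    print(p.decomposed)  # True -- the element must not be used again
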
918 def clear(self, decompose=False): 1475 def clear(self, decompose=False):
919 """ 1476 """Wipe out all children of this PageElement by calling extract()
920 Extract all children. If decompose is True, decompose instead. 1477 on them.
1478
1479 :param decompose: If this is True, decompose() (a more
1480 destructive method) will be called instead of extract().
921 """ 1481 """
922 if decompose: 1482 if decompose:
923 for element in self.contents[:]: 1483 for element in self.contents[:]:
@@ -929,10 +1489,51 @@ class Tag(PageElement):
929 for element in self.contents[:]: 1489 for element in self.contents[:]:
930 element.extract() 1490 element.extract()
931 1491
932 def index(self, element): 1492 def smooth(self):
1493 """Smooth out this element's children by consolidating consecutive
1494 strings.
1495
1496 This makes pretty-printed output look more natural following a
1497 lot of operations that modified the tree.
933 """ 1498 """
934 Find the index of a child by identity, not value. Avoids issues with 1499 # Mark the first position of every pair of children that need
935 tag.contents.index(element) getting the index of equal elements. 1500 # to be consolidated. Do this rather than making a copy of
1501 # self.contents, since in most cases very few strings will be
1502 # affected.
1503 marked = []
1504 for i, a in enumerate(self.contents):
1505 if isinstance(a, Tag):
1506 # Recursively smooth children.
1507 a.smooth()
1508 if i == len(self.contents)-1:
1509 # This is the last item in .contents, and it's not a
1510 # tag. There's no chance it needs any work.
1511 continue
1512 b = self.contents[i+1]
1513 if (isinstance(a, NavigableString)
1514 and isinstance(b, NavigableString)
1515 and not isinstance(a, PreformattedString)
1516 and not isinstance(b, PreformattedString)
1517 ):
1518 marked.append(i)
1519
1520 # Go over the marked positions in reverse order, so that
1521 # removing items from .contents won't affect the remaining
1522 # positions.
1523 for i in reversed(marked):
1524 a = self.contents[i]
1525 b = self.contents[i+1]
1526 b.extract()
1527 n = NavigableString(a+b)
1528 a.replace_with(n)
1529
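
smooth() exists because tree edits can leave adjacent string children that pretty-printing would render oddly. For example:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>a</p>", "html.parser")
    soup.p.append("b")
    print(soup.p.contents)  # ['a', 'b'] -- two adjacent NavigableStrings
    soup.smooth()
    print(soup.p.contents)  # ['ab']
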
1530 def index(self, element):
1531 """Find the index of a child by identity, not value.
1532
1533 Avoids issues with tag.contents.index(element) getting the
1534 index of equal elements.
1535
1536 :param element: Look for this PageElement in `self.contents`.
936 """ 1537 """
937 for i, child in enumerate(self.contents): 1538 for i, child in enumerate(self.contents):
938 if child is element: 1539 if child is element:
@@ -945,23 +1546,38 @@ class Tag(PageElement):
945 attribute.""" 1546 attribute."""
946 return self.attrs.get(key, default) 1547 return self.attrs.get(key, default)
947 1548
1549 def get_attribute_list(self, key, default=None):
1550 """The same as get(), but always returns a list.
1551
1552 :param key: The attribute to look for.
1553 :param default: Use this value if the attribute is not present
1554 on this PageElement.
1555 :return: A list of values, probably containing only a single
1556 value.
1557 """
1558 value = self.get(key, default)
1559 if not isinstance(value, list):
1560 value = [value]
1561 return value
1562
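
get_attribute_list() smooths over the fact that multi-valued attributes (like class) come back as lists while others come back as plain strings:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p id="intro" class="a b">hi</p>', "html.parser")
    print(soup.p.get("id"))                 # intro
    print(soup.p.get_attribute_list("id"))  # ['intro'] -- always a list
    print(soup.p.get("class"))              # ['a', 'b'] -- already a list
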
948 def has_attr(self, key): 1563 def has_attr(self, key):
1564 """Does this PageElement have an attribute with the given name?"""
949 return key in self.attrs 1565 return key in self.attrs
950 1566
951 def __hash__(self): 1567 def __hash__(self):
952 return str(self).__hash__() 1568 return str(self).__hash__()
953 1569
954 def __getitem__(self, key): 1570 def __getitem__(self, key):
955 """tag[key] returns the value of the 'key' attribute for the tag, 1571 """tag[key] returns the value of the 'key' attribute for the Tag,
956 and throws an exception if it's not there.""" 1572 and throws an exception if it's not there."""
957 return self.attrs[key] 1573 return self.attrs[key]
958 1574
959 def __iter__(self): 1575 def __iter__(self):
960 "Iterating over a tag iterates over its contents." 1576 "Iterating over a Tag iterates over its contents."
961 return iter(self.contents) 1577 return iter(self.contents)
962 1578
963 def __len__(self): 1579 def __len__(self):
964 "The length of a tag is the length of its list of contents." 1580 "The length of a Tag is the length of its list of contents."
965 return len(self.contents) 1581 return len(self.contents)
966 1582
967 def __contains__(self, x): 1583 def __contains__(self, x):
@@ -981,29 +1597,33 @@ class Tag(PageElement):
981 self.attrs.pop(key, None) 1597 self.attrs.pop(key, None)
982 1598
983 def __call__(self, *args, **kwargs): 1599 def __call__(self, *args, **kwargs):
984 """Calling a tag like a function is the same as calling its 1600 """Calling a Tag like a function is the same as calling its
985 find_all() method. E.g. tag('a') returns a list of all the A tags 1601
986 found within this tag.""" 1602 found within this tag."""
987 return self.find_all(*args, **kwargs) 1603 return self.find_all(*args, **kwargs)
988 1604
989 def __getattr__(self, tag): 1605 def __getattr__(self, tag):
990 #print "Getattr %s.%s" % (self.__class__, tag) 1606 """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
1607 #print("Getattr %s.%s" % (self.__class__, tag))
991 if len(tag) > 3 and tag.endswith('Tag'): 1608 if len(tag) > 3 and tag.endswith('Tag'):
992 # BS3: soup.aTag -> "soup.find("a") 1609 # BS3: soup.aTag -> "soup.find("a")
993 tag_name = tag[:-3] 1610 tag_name = tag[:-3]
994 warnings.warn( 1611 warnings.warn(
995 '.%sTag is deprecated, use .find("%s") instead.' % ( 1612 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
996 tag_name, tag_name)) 1613 name=tag_name
1614 ),
1615 DeprecationWarning, stacklevel=2
1616 )
997 return self.find(tag_name) 1617 return self.find(tag_name)
998 # We special case contents to avoid recursion. 1618 # We special case contents to avoid recursion.
999 elif not tag.startswith("__") and not tag=="contents": 1619 elif not tag.startswith("__") and not tag == "contents":
1000 return self.find(tag) 1620 return self.find(tag)
1001 raise AttributeError( 1621 raise AttributeError(
1002 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1622 "'%s' object has no attribute '%s'" % (self.__class__, tag))
1003 1623
1004 def __eq__(self, other): 1624 def __eq__(self, other):
1005 """Returns true iff this tag has the same name, the same attributes, 1625 """Returns true iff this Tag has the same name, the same attributes,
1006 and the same contents (recursively) as the given tag.""" 1626 and the same contents (recursively) as `other`."""
1007 if self is other: 1627 if self is other:
1008 return True 1628 return True
1009 if (not hasattr(other, 'name') or 1629 if (not hasattr(other, 'name') or
@@ -1019,69 +1639,235 @@ class Tag(PageElement):
1019 return True 1639 return True
1020 1640
1021 def __ne__(self, other): 1641 def __ne__(self, other):
1022 """Returns true iff this tag is not identical to the other tag, 1642 """Returns true iff this Tag is not identical to `other`,
1023 as defined in __eq__.""" 1643 as defined in __eq__."""
1024 return not self == other 1644 return not self == other
1025 1645
1026 def __repr__(self, encoding="unicode-escape"): 1646 def __repr__(self, encoding="unicode-escape"):
1027 """Renders this tag as a string.""" 1647 """Renders this PageElement as a string.
1028 if PY3K:
1029 # "The return value must be a string object", i.e. Unicode
1030 return self.decode()
1031 else:
1032 # "The return value must be a string object", i.e. a bytestring.
1033 # By convention, the return value of __repr__ should also be
1034 # an ASCII string.
1035 return self.encode(encoding)
1036 1648
1037 def __unicode__(self): 1649 :param encoding: The encoding to use (Python 2 only).
1650 TODO: This is now ignored and a warning should be issued
1651 if a value is provided.
1652 :return: A (Unicode) string.
1653 """
1654 # "The return value must be a string object", i.e. Unicode
1038 return self.decode() 1655 return self.decode()
1039 1656
1040 def __str__(self): 1657 def __unicode__(self):
1041 if PY3K: 1658 """Renders this PageElement as a Unicode string."""
1042 return self.decode() 1659 return self.decode()
1043 else:
1044 return self.encode()
1045 1660
1046 if PY3K: 1661 __str__ = __repr__ = __unicode__
1047 __str__ = __repr__ = __unicode__
1048 1662
1049 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 1663 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
1050 indent_level=None, formatter="minimal", 1664 indent_level=None, formatter="minimal",
1051 errors="xmlcharrefreplace"): 1665 errors="xmlcharrefreplace"):
1666 """Render a bytestring representation of this PageElement and its
1667 contents.
1668
1669 :param encoding: The destination encoding.
1670 :param indent_level: Each line of the rendering will be
1671 indented this many levels. (The formatter decides what a
1672 'level' means in terms of spaces or other output
1673 characters.) Used internally in recursive calls while
1674 pretty-printing.
1675 :param formatter: A Formatter object, or a string naming one of
1676 the standard formatters.
1677 :param errors: An error handling strategy such as
1678 'xmlcharrefreplace'. This value is passed along into
1679 encode() and its value should be one of the constants
1680 defined by Python.
1681 :return: A bytestring.
1682
1683 """
1052 # Turn the data structure into Unicode, then encode the 1684 # Turn the data structure into Unicode, then encode the
1053 # Unicode. 1685 # Unicode.
1054 u = self.decode(indent_level, encoding, formatter) 1686 u = self.decode(indent_level, encoding, formatter)
1055 return u.encode(encoding, errors) 1687 return u.encode(encoding, errors)
1056 1688
1057 def _should_pretty_print(self, indent_level):
1058 """Should this tag be pretty-printed?"""
1059 return (
1060 indent_level is not None and
1061 (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1062 or self._is_xml))
1063
1064 def decode(self, indent_level=None, 1689 def decode(self, indent_level=None,
1065 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1690 eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1066 formatter="minimal"): 1691 formatter="minimal",
1067 """Returns a Unicode representation of this tag and its contents. 1692 iterator=None):
1693 pieces = []
1694 # First off, turn a non-Formatter `formatter` into a Formatter
1695 # object. This will stop the lookup from happening over and
1696 # over again.
1697 if not isinstance(formatter, Formatter):
1698 formatter = self.formatter_for_name(formatter)
1699
1700 if indent_level is True:
1701 indent_level = 0
1702
1703 # The currently active tag that put us into string literal
1704 # mode. Until this element is closed, children will be treated
1705 # as string literals and not pretty-printed. String literal
1706 # mode is turned on immediately after this tag begins, and
1707 # turned off immediately before it's closed. This means there
1708 # will be whitespace before and after the tag itself.
1709 string_literal_tag = None
1710
1711 for event, element in self._event_stream(iterator):
1712 if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
1713 piece = element._format_tag(
1714 eventual_encoding, formatter, opening=True
1715 )
1716 elif event is Tag.END_ELEMENT_EVENT:
1717 piece = element._format_tag(
1718 eventual_encoding, formatter, opening=False
1719 )
1720 if indent_level is not None:
1721 indent_level -= 1
1722 else:
1723 piece = element.output_ready(formatter)
1724
1725 # Now we need to apply the 'prettiness' -- extra
1726 # whitespace before and/or after this tag. This can get
1727 # complicated because certain tags, like <pre> and
1728 # <script>, can't be prettified, since adding whitespace would
1729 # change the meaning of the content.
1730
1731 # The default behavior is to add whitespace before and
1732 # after an element when string literal mode is off, and to
1733 # leave things as they are when string literal mode is on.
1734 if string_literal_tag:
1735 indent_before = indent_after = False
1736 else:
1737 indent_before = indent_after = True
1738
1739 # The only time the behavior is more complex than that is
1740 # when we encounter an opening or closing tag that might
1741 # put us into or out of string literal mode.
1742 if (event is Tag.START_ELEMENT_EVENT
1743 and not string_literal_tag
1744 and not element._should_pretty_print()):
1745 # We are about to enter string literal mode. Add
1746 # whitespace before this tag, but not after. We
1747 # will stay in string literal mode until this tag
1748 # is closed.
1749 indent_before = True
1750 indent_after = False
1751 string_literal_tag = element
1752 elif (event is Tag.END_ELEMENT_EVENT
1753 and element is string_literal_tag):
1754 # We are about to exit string literal mode by closing
1755 # the tag that sent us into that mode. Add whitespace
1756 # after this tag, but not before.
1757 indent_before = False
1758 indent_after = True
1759 string_literal_tag = None
1760
1761 # Now we know whether to add whitespace before and/or
1762 # after this element.
1763 if indent_level is not None:
1764 if (indent_before or indent_after):
1765 if isinstance(element, NavigableString):
1766 piece = piece.strip()
1767 if piece:
1768 piece = self._indent_string(
1769 piece, indent_level, formatter,
1770 indent_before, indent_after
1771 )
1772 if event == Tag.START_ELEMENT_EVENT:
1773 indent_level += 1
1774 pieces.append(piece)
1775 return "".join(pieces)
1776
1777 # Names for the different events yielded by _event_stream
1778 START_ELEMENT_EVENT = object()
1779 END_ELEMENT_EVENT = object()
1780 EMPTY_ELEMENT_EVENT = object()
1781 STRING_ELEMENT_EVENT = object()
1782
1783 def _event_stream(self, iterator=None):
1784 """Yield a sequence of events that can be used to reconstruct the DOM
1785 for this element.
1786
1787 This lets us recreate the nested structure of this element
1788 (e.g. when formatting it as a string) without using recursive
1789 method calls.
1790
1791 This is similar in concept to the SAX API, but it's a simpler
1792 interface designed for internal use. The events are different
1793 from SAX and the arguments associated with the events are Tags
1794 and other Beautiful Soup objects.
1795
1796 :param iterator: An alternate iterator to use when traversing
1797 the tree.
1798 """
1799 tag_stack = []
1068 1800
1069 :param eventual_encoding: The tag is destined to be 1801 iterator = iterator or self.self_and_descendants
1070 encoded into this encoding. This method is _not_ 1802
1071 responsible for performing that encoding. This information 1803 for c in iterator:
1072 is passed in so that it can be substituted in if the 1804 # If the parent of the element we're about to yield is not
1073 document contains a <META> tag that mentions the document's 1805 # the tag currently on the stack, it means that the tag on
1074 encoding. 1806 # the stack closed before this element appeared.
1807 while tag_stack and c.parent != tag_stack[-1]:
1808 now_closed_tag = tag_stack.pop()
1809 yield Tag.END_ELEMENT_EVENT, now_closed_tag
1810
1811 if isinstance(c, Tag):
1812 if c.is_empty_element:
1813 yield Tag.EMPTY_ELEMENT_EVENT, c
1814 else:
1815 yield Tag.START_ELEMENT_EVENT, c
1816 tag_stack.append(c)
1817 continue
1818 else:
1819 yield Tag.STRING_ELEMENT_EVENT, c
1820
1821 while tag_stack:
1822 now_closed_tag = tag_stack.pop()
1823 yield Tag.END_ELEMENT_EVENT, now_closed_tag
1824
1825 def _indent_string(self, s, indent_level, formatter,
1826 indent_before, indent_after):
1827 """Add indentation whitespace before and/or after a string.
1828
1829 :param s: The string to amend with whitespace.
1830 :param indent_level: The indentation level; affects how much
1831 whitespace goes before the string.
1832 :param indent_before: Whether or not to add whitespace
1833 before the string.
1834 :param indent_after: Whether or not to add whitespace
1835 (a newline) after the string.
1075 """ 1836 """
1837 space_before = ''
1838 if indent_before and indent_level:
1839 space_before = (formatter.indent * indent_level)
1076 1840
1077 # First off, turn a string formatter into a function. This 1841 space_after = ''
1078 # will stop the lookup from happening over and over again. 1842 if indent_after:
1079 if not isinstance(formatter, collections.abc.Callable): 1843 space_after = "\n"
1080 formatter = self._formatter_for_name(formatter)
1081 1844
1082 attrs = [] 1845 return space_before + s + space_after
1083 if self.attrs: 1846
1084 for key, val in sorted(self.attrs.items()): 1847 def _format_tag(self, eventual_encoding, formatter, opening):
1848 if self.hidden:
1849 # A hidden tag is invisible, although its contents
1850 # are visible.
1851 return ''
1852
1853 # A tag starts with the < character (see below).
1854
1855 # Then the / character, if this is a closing tag.
1856 closing_slash = ''
1857 if not opening:
1858 closing_slash = '/'
1859
1860 # Then an optional namespace prefix.
1861 prefix = ''
1862 if self.prefix:
1863 prefix = self.prefix + ":"
1864
1865 # Then a list of attribute values, if this is an opening tag.
1866 attribute_string = ''
1867 if opening:
1868 attributes = formatter.attributes(self)
1869 attrs = []
1870 for key, val in attributes:
1085 if val is None: 1871 if val is None:
1086 decoded = key 1872 decoded = key
1087 else: 1873 else:
@@ -1090,71 +1876,52 @@ class Tag(PageElement):
1090 elif not isinstance(val, str): 1876 elif not isinstance(val, str):
1091 val = str(val) 1877 val = str(val)
1092 elif ( 1878 elif (
1093 isinstance(val, AttributeValueWithCharsetSubstitution) 1879 isinstance(val, AttributeValueWithCharsetSubstitution)
1094 and eventual_encoding is not None): 1880 and eventual_encoding is not None
1881 ):
1095 val = val.encode(eventual_encoding) 1882 val = val.encode(eventual_encoding)
1096 1883
1097 text = self.format_string(val, formatter) 1884 text = formatter.attribute_value(val)
1098 decoded = ( 1885 decoded = (
1099 str(key) + '=' 1886 str(key) + '='
1100 + EntitySubstitution.quoted_attribute_value(text)) 1887 + formatter.quoted_attribute_value(text))
1101 attrs.append(decoded) 1888 attrs.append(decoded)
1102 close = '' 1889 if attrs:
1103 closeTag = '' 1890 attribute_string = ' ' + ' '.join(attrs)
1104
1105 prefix = ''
1106 if self.prefix:
1107 prefix = self.prefix + ":"
1108 1891
1892 # Then an optional closing slash (for a void element in an
1893 # XML document).
1894 void_element_closing_slash = ''
1109 if self.is_empty_element: 1895 if self.is_empty_element:
1110 close = '/' 1896 void_element_closing_slash = formatter.void_element_close_prefix or ''
1111 else:
1112 closeTag = '</%s%s>' % (prefix, self.name)
1113
1114 pretty_print = self._should_pretty_print(indent_level)
1115 space = ''
1116 indent_space = ''
1117 if indent_level is not None:
1118 indent_space = (' ' * (indent_level - 1))
1119 if pretty_print:
1120 space = indent_space
1121 indent_contents = indent_level + 1
1122 else:
1123 indent_contents = None
1124 contents = self.decode_contents(
1125 indent_contents, eventual_encoding, formatter)
1126 1897
1127 if self.hidden: 1898 # Put it all together.
1128 # This is the 'document root' object. 1899 return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
1129 s = contents 1900
1130 else: 1901 def _should_pretty_print(self, indent_level=1):
1131 s = [] 1902 """Should this tag be pretty-printed?
1132 attribute_string = '' 1903
1133 if attrs: 1904 Most of them should, but some (such as <pre> in HTML
1134 attribute_string = ' ' + ' '.join(attrs) 1905 documents) should not.
1135 if indent_level is not None: 1906 """
1136 # Even if this particular tag is not pretty-printed, 1907 return (
1137 # we should indent up to the start of the tag. 1908 indent_level is not None
1138 s.append(indent_space) 1909 and (
1139 s.append('<%s%s%s%s>' % ( 1910 not self.preserve_whitespace_tags
1140 prefix, self.name, attribute_string, close)) 1911 or self.name not in self.preserve_whitespace_tags
1141 if pretty_print: 1912 )
1142 s.append("\n") 1913 )
1143 s.append(contents)
1144 if pretty_print and contents and contents[-1] != "\n":
1145 s.append("\n")
1146 if pretty_print and closeTag:
1147 s.append(space)
1148 s.append(closeTag)
1149 if indent_level is not None and closeTag and self.next_sibling:
1150 # Even if this particular tag is not pretty-printed,
1151 # we're now done with the tag, and we should add a
1152 # newline if appropriate.
1153 s.append("\n")
1154 s = ''.join(s)
1155 return s
1156 1914
1157 def prettify(self, encoding=None, formatter="minimal"): 1915 def prettify(self, encoding=None, formatter="minimal"):
1916 """Pretty-print this PageElement as a string.
1917
1918 :param encoding: The eventual encoding of the string. If this is None,
1919 a Unicode string will be returned.
1920 :param formatter: A Formatter object, or a string naming one of
1921 the standard formatters.
1922 :return: A Unicode string (if encoding==None) or a bytestring
1923 (otherwise).
1924 """
1158 if encoding is None: 1925 if encoding is None:
1159 return self.decode(True, formatter=formatter) 1926 return self.decode(True, formatter=formatter)
1160 else: 1927 else:
@@ -1166,62 +1933,50 @@ class Tag(PageElement):
1166 """Renders the contents of this tag as a Unicode string. 1933 """Renders the contents of this tag as a Unicode string.
1167 1934
1168 :param indent_level: Each line of the rendering will be 1935 :param indent_level: Each line of the rendering will be
1169 indented this many spaces. 1936 indented this many levels. (The formatter decides what a
1937 'level' means in terms of spaces or other output
1938 characters.) Used internally in recursive calls while
1939 pretty-printing.
1170 1940
1171 :param eventual_encoding: The tag is destined to be 1941 :param eventual_encoding: The tag is destined to be
1172 encoded into this encoding. This method is _not_ 1942 encoded into this encoding. decode_contents() is _not_
1173 responsible for performing that encoding. This information 1943 responsible for performing that encoding. This information
1174 is passed in so that it can be substituted in if the 1944 is passed in so that it can be substituted in if the
1175 document contains a <META> tag that mentions the document's 1945 document contains a <META> tag that mentions the document's
1176 encoding. 1946 encoding.
1177 1947
1178 :param formatter: The output formatter responsible for converting 1948 :param formatter: A Formatter object, or a string naming one of
1179 entities to Unicode characters. 1949 the standard Formatters.
1180 """ 1950
1181 # First off, turn a string formatter into a function. This 1951 """
1182 # will stop the lookup from happening over and over again. 1952 return self.decode(indent_level, eventual_encoding, formatter,
1183 if not isinstance(formatter, collections.abc.Callable): 1953 iterator=self.descendants)
1184 formatter = self._formatter_for_name(formatter)
1185
1186 pretty_print = (indent_level is not None)
1187 s = []
1188 for c in self:
1189 text = None
1190 if isinstance(c, NavigableString):
1191 text = c.output_ready(formatter)
1192 elif isinstance(c, Tag):
1193 s.append(c.decode(indent_level, eventual_encoding,
1194 formatter))
1195 if text and indent_level and not self.name == 'pre':
1196 text = text.strip()
1197 if text:
1198 if pretty_print and not self.name == 'pre':
1199 s.append(" " * (indent_level - 1))
1200 s.append(text)
1201 if pretty_print and not self.name == 'pre':
1202 s.append("\n")
1203 return ''.join(s)
1204 1954
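
The visible pretty-printing contract is unchanged even though decode() is now a single iterative pass over _event_stream() rather than a recursion through decode_contents(). Roughly (output assumes html.parser, where <pre> is whitespace-preserved):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div><p>hi</p><pre> keep </pre></div>", "html.parser")
    print(soup.div.prettify(), end="")
    # <div>
    #  <p>
    #   hi
    #  </p>
    #  <pre> keep </pre>   <- string-literal mode: contents left untouched
    # </div>
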
1205 def encode_contents( 1955 def encode_contents(
1206 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1956 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1207 formatter="minimal"): 1957 formatter="minimal"):
1208 """Renders the contents of this tag as a bytestring. 1958 """Renders the contents of this PageElement as a bytestring.
1209 1959
1210 :param indent_level: Each line of the rendering will be 1960 :param indent_level: Each line of the rendering will be
1211 indented this many spaces. 1961 indented this many levels. (The formatter decides what a
1962 'level' means in terms of spaces or other output
1963 characters.) Used internally in recursive calls while
1964 pretty-printing.
1212 1965
1213 :param encoding: The bytestring will be in this encoding. 1966
1214 1967
1215 :param formatter: The output formatter responsible for converting 1968 :param formatter: A Formatter object, or a string naming one of
1216 entities to Unicode characters. 1969 the standard Formatters.
1217 """
1218 1970
1971 :return: A bytestring.
1972 """
1219 contents = self.decode_contents(indent_level, encoding, formatter) 1973 contents = self.decode_contents(indent_level, encoding, formatter)
1220 return contents.encode(encoding) 1974 return contents.encode(encoding)
1221 1975
1222 # Old method for BS3 compatibility 1976 # Old method for BS3 compatibility
1223 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1977 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1224 prettyPrint=False, indentLevel=0): 1978 prettyPrint=False, indentLevel=0):
1979 """Deprecated method for BS3 compatibility."""
1225 if not prettyPrint: 1980 if not prettyPrint:
1226 indentLevel = None 1981 indentLevel = None
1227 return self.encode_contents( 1982 return self.encode_contents(
@@ -1229,44 +1984,88 @@ class Tag(PageElement):
1229 1984
1230 #Soup methods 1985 #Soup methods
1231 1986
1232 def find(self, name=None, attrs={}, recursive=True, text=None, 1987 def find(self, name=None, attrs={}, recursive=True, string=None,
1233 **kwargs): 1988 **kwargs):
1234 """Return only the first child of this Tag matching the given 1989 """Look in the children of this PageElement and find the first
1235 criteria.""" 1990 PageElement that matches the given criteria.
1991
1992 All find_* methods take a common set of arguments. See the online
1993 documentation for detailed explanations.
1994
1995 :param name: A filter on tag name.
1996 :param attrs: A dictionary of filters on attribute values.
1997 :param recursive: If this is True, find() will perform a
1998 recursive search of this PageElement's children. Otherwise,
1999 only the direct children will be considered.
2000 :param limit: Stop looking after finding this many results.
2001 :kwargs: A dictionary of filters on attribute values.
2002 :return: A PageElement.
2003 :rtype: bs4.element.Tag | bs4.element.NavigableString
2004 """
1236 r = None 2005 r = None
1237 l = self.find_all(name, attrs, recursive, text, 1, **kwargs) 2006 l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
2007 **kwargs)
1238 if l: 2008 if l:
1239 r = l[0] 2009 r = l[0]
1240 return r 2010 return r
1241 findChild = find 2011 findChild = find #BS2
1242 2012
1243 def find_all(self, name=None, attrs={}, recursive=True, text=None, 2013 def find_all(self, name=None, attrs={}, recursive=True, string=None,
1244 limit=None, **kwargs): 2014 limit=None, **kwargs):
1245 """Extracts a list of Tag objects that match the given 2015 """Look in the children of this PageElement and find all
1246 criteria. You can specify the name of the Tag and any 2016 PageElements that match the given criteria.
1247 attributes you want the Tag to have. 2017
1248 2018 All find_* methods take a common set of arguments. See the online
1249 The value of a key-value pair in the 'attrs' map can be a 2019 documentation for detailed explanations.
1250 string, a list of strings, a regular expression object, or a 2020
1251 callable that takes a string and returns whether or not the 2021 :param name: A filter on tag name.
1252 string matches for some custom definition of 'matches'. The 2022 :param attrs: A dictionary of filters on attribute values.
1253 same is true of the tag name.""" 2023 :param recursive: If this is True, find_all() will perform a
1254 2024 recursive search of this PageElement's children. Otherwise,
2025 only the direct children will be considered.
2026 :param limit: Stop looking after finding this many results.
2027 :kwargs: A dictionary of filters on attribute values.
2028 :return: A ResultSet of PageElements.
2029 :rtype: bs4.element.ResultSet
2030 """
1255 generator = self.descendants 2031 generator = self.descendants
1256 if not recursive: 2032 if not recursive:
1257 generator = self.children 2033 generator = self.children
1258 return self._find_all(name, attrs, text, limit, generator, **kwargs) 2034 _stacklevel = kwargs.pop('_stacklevel', 2)
2035 return self._find_all(name, attrs, string, limit, generator,
2036 _stacklevel=_stacklevel+1, **kwargs)
1259 findAll = find_all # BS3 2037 findAll = find_all # BS3
1260 findChildren = find_all # BS2 2038 findChildren = find_all # BS2
1261 2039
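
Across find() and find_all(), the old text argument is renamed string (the deprecated spelling is still accepted via kwargs elsewhere in this patch). E.g.:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a href="/a">one</a><a href="/b">two</a>', "html.parser")
    print(soup.find("a", string="two"))  # <a href="/b">two</a>
    print(soup.find_all("a", limit=1))   # [<a href="/a">one</a>]
    print(soup.find_all(href=True))      # keyword filters still match attributes
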
1262 #Generator methods 2040 #Generator methods
1263 @property 2041 @property
1264 def children(self): 2042 def children(self):
2043 """Iterate over all direct children of this PageElement.
2044
2045 :yield: A sequence of PageElements.
2046 """
1265 # return iter() to make the purpose of the method clear 2047 # return iter() to make the purpose of the method clear
1266 return iter(self.contents) # XXX This seems to be untested. 2048 return iter(self.contents) # XXX This seems to be untested.
1267 2049
1268 @property 2050 @property
2051 def self_and_descendants(self):
2052 """Iterate over this PageElement and its children in a
2053 depth-first, document-order sequence.
2054
2055 :yield: A sequence of PageElements.
2056 """
2057 if not self.hidden:
2058 yield self
2059 for i in self.descendants:
2060 yield i
2061
2062 @property
1269 def descendants(self): 2063 def descendants(self):
2064 """Iterate over all children of this PageElement in a
2065 depth-first, document-order sequence.
2066
2067 :yield: A sequence of PageElements.
2068 """
1270 if not len(self.contents): 2069 if not len(self.contents):
1271 return 2070 return
1272 stopNode = self._last_descendant().next_element 2071 stopNode = self._last_descendant().next_element
@@ -1276,262 +2075,102 @@ class Tag(PageElement):
1276 current = current.next_element 2075 current = current.next_element
1277 2076
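
The traversal itself is unchanged: it follows the next_element chain, i.e. document order. self_and_descendants simply prepends the element itself:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div><p>a<b>b</b></p></div>", "html.parser")
    print([e.name for e in soup.div.descendants])
    # ['p', None, 'b', None] -- strings report .name as None
    print([e.name for e in soup.div.self_and_descendants])
    # ['div', 'p', None, 'b', None]
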
1278 # CSS selector code 2077 # CSS selector code
2078 def select_one(self, selector, namespaces=None, **kwargs):
2079 """Perform a CSS selection operation on the current element.
1279 2080
1280 _selector_combinators = ['>', '+', '~'] 2081 :param selector: A CSS selector.
1281 _select_debug = False
1282 def select_one(self, selector):
1283 """Perform a CSS selection operation on the current element."""
1284 value = self.select(selector, limit=1)
1285 if value:
1286 return value[0]
1287 return None
1288 2082
1289 def select(self, selector, _candidate_generator=None, limit=None): 2083 :param namespaces: A dictionary mapping namespace prefixes
1290 """Perform a CSS selection operation on the current element.""" 2084 used in the CSS selector to namespace URIs. By default,
1291 2085 Beautiful Soup will use the prefixes it encountered while
1292 # Handle grouping selectors if ',' exists, ie: p,a 2086 parsing the document.
1293 if ',' in selector:
1294 context = []
1295 for partial_selector in selector.split(','):
1296 partial_selector = partial_selector.strip()
1297 if partial_selector == '':
1298 raise ValueError('Invalid group selection syntax: %s' % selector)
1299 candidates = self.select(partial_selector, limit=limit)
1300 for candidate in candidates:
1301 if candidate not in context:
1302 context.append(candidate)
1303
1304 if limit and len(context) >= limit:
1305 break
1306 return context
1307 2087
1308 tokens = selector.split() 2088 :param kwargs: Keyword arguments to be passed into Soup Sieve's
1309 current_context = [self] 2089 soupsieve.select() method.
1310 2090
1311 if tokens[-1] in self._selector_combinators: 2091 :return: A Tag.
1312 raise ValueError( 2092 :rtype: bs4.element.Tag
1313 'Final combinator "%s" is missing an argument.' % tokens[-1]) 2093 """
2094 return self.css.select_one(selector, namespaces, **kwargs)
1314 2095
1315 if self._select_debug: 2096 def select(self, selector, namespaces=None, limit=None, **kwargs):
1316 print('Running CSS selector "%s"' % selector) 2097 """Perform a CSS selection operation on the current element.
1317 2098
1318 for index, token in enumerate(tokens): 2099 This uses the SoupSieve library.
1319 new_context = []
1320 new_context_ids = set([])
1321 2100
1322 if tokens[index-1] in self._selector_combinators: 2101 :param selector: A string containing a CSS selector.
1323 # This token was consumed by the previous combinator. Skip it.
1324 if self._select_debug:
1325 print(' Token was consumed by the previous combinator.')
1326 continue
1327 2102
1328 if self._select_debug: 2103 :param namespaces: A dictionary mapping namespace prefixes
1329 print(' Considering token "%s"' % token) 2104 used in the CSS selector to namespace URIs. By default,
1330 recursive_candidate_generator = None 2105 Beautiful Soup will use the prefixes it encountered while
1331 tag_name = None 2106 parsing the document.
1332 2107
1333 # Each operation corresponds to a checker function, a rule 2108 :param limit: After finding this number of results, stop looking.
1334 # for determining whether a candidate matches the 2109
1335 # selector. Candidates are generated by the active 2110 :param kwargs: Keyword arguments to be passed into SoupSieve's
1336 # iterator. 2111 soupsieve.select() method.
1337 checker = None 2112
1338 2113 :return: A ResultSet of Tags.
1339 m = self.attribselect_re.match(token) 2114 :rtype: bs4.element.ResultSet
1340 if m is not None: 2115 """
1341 # Attribute selector 2116 return self.css.select(selector, namespaces, limit, **kwargs)
1342 tag_name, attribute, operator, value = m.groups() 2117
1343 checker = self._attribute_checker(operator, attribute, value) 2118 @property
1344 2119 def css(self):
1345 elif '#' in token: 2120 """Return an interface to the CSS selector API."""
1346 # ID selector 2121 return CSS(self)
1347 tag_name, tag_id = token.split('#', 1)
1348 def id_matches(tag):
1349 return tag.get('id', None) == tag_id
1350 checker = id_matches
1351
1352 elif '.' in token:
1353 # Class selector
1354 tag_name, klass = token.split('.', 1)
1355 classes = set(klass.split('.'))
1356 def classes_match(candidate):
1357 return classes.issubset(candidate.get('class', []))
1358 checker = classes_match
1359
1360 elif ':' in token:
1361 # Pseudo-class
1362 tag_name, pseudo = token.split(':', 1)
1363 if tag_name == '':
1364 raise ValueError(
1365 "A pseudo-class must be prefixed with a tag name.")
1366 pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1367 found = []
1368 if pseudo_attributes is None:
1369 pseudo_type = pseudo
1370 pseudo_value = None
1371 else:
1372 pseudo_type, pseudo_value = pseudo_attributes.groups()
1373 if pseudo_type == 'nth-of-type':
1374 try:
1375 pseudo_value = int(pseudo_value)
1376 except:
1377 raise NotImplementedError(
1378 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1379 if pseudo_value < 1:
1380 raise ValueError(
1381 'nth-of-type pseudo-class value must be at least 1.')
1382 class Counter(object):
1383 def __init__(self, destination):
1384 self.count = 0
1385 self.destination = destination
1386
1387 def nth_child_of_type(self, tag):
1388 self.count += 1
1389 if self.count == self.destination:
1390 return True
1391 if self.count > self.destination:
1392 # Stop the generator that's sending us
1393 # these things.
1394 raise StopIteration()
1395 return False
1396 checker = Counter(pseudo_value).nth_child_of_type
1397 else:
1398 raise NotImplementedError(
1399 'Only the following pseudo-classes are implemented: nth-of-type.')
1400
1401 elif token == '*':
1402 # Star selector -- matches everything
1403 pass
1404 elif token == '>':
1405 # Run the next token as a CSS selector against the
1406 # direct children of each tag in the current context.
1407 recursive_candidate_generator = lambda tag: tag.children
1408 elif token == '~':
1409 # Run the next token as a CSS selector against the
1410 # siblings of each tag in the current context.
1411 recursive_candidate_generator = lambda tag: tag.next_siblings
1412 elif token == '+':
1413 # For each tag in the current context, run the next
1414 # token as a CSS selector against the tag's next
1415 # sibling that's a tag.
1416 def next_tag_sibling(tag):
1417 yield tag.find_next_sibling(True)
1418 recursive_candidate_generator = next_tag_sibling
1419
1420 elif self.tag_name_re.match(token):
1421 # Just a tag name.
1422 tag_name = token
1423 else:
1424 raise ValueError(
1425 'Unsupported or invalid CSS selector: "%s"' % token)
1426 if recursive_candidate_generator:
1427 # This happens when the selector looks like "> foo".
1428 #
1429 # The generator calls select() recursively on every
1430 # member of the current context, passing in a different
1431 # candidate generator and a different selector.
1432 #
1433 # In the case of "> foo", the candidate generator is
1434 # one that yields a tag's direct children (">"), and
1435 # the selector is "foo".
1436 next_token = tokens[index+1]
1437 def recursive_select(tag):
1438 if self._select_debug:
1439 print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
1440 print('-' * 40)
1441 for i in tag.select(next_token, recursive_candidate_generator):
1442 if self._select_debug:
1443 print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
1444 yield i
1445 if self._select_debug:
1446 print('-' * 40)
1447 _use_candidate_generator = recursive_select
1448 elif _candidate_generator is None:
1449 # By default, a tag's candidates are all of its
1450 # children. If tag_name is defined, only yield tags
1451 # with that name.
1452 if self._select_debug:
1453 if tag_name:
1454 check = "[any]"
1455 else:
1456 check = tag_name
1457 print(' Default candidate generator, tag name="%s"' % check)
1458 if self._select_debug:
1459 # This is redundant with later code, but it stops
1460 # a bunch of bogus tags from cluttering up the
1461 # debug log.
1462 def default_candidate_generator(tag):
1463 for child in tag.descendants:
1464 if not isinstance(child, Tag):
1465 continue
1466 if tag_name and not child.name == tag_name:
1467 continue
1468 yield child
1469 _use_candidate_generator = default_candidate_generator
1470 else:
1471 _use_candidate_generator = lambda tag: tag.descendants
1472 else:
1473 _use_candidate_generator = _candidate_generator
1474
1475 count = 0
1476 for tag in current_context:
1477 if self._select_debug:
1478 print(" Running candidate generator on %s %s" % (
1479 tag.name, repr(tag.attrs)))
1480 for candidate in _use_candidate_generator(tag):
1481 if not isinstance(candidate, Tag):
1482 continue
1483 if tag_name and candidate.name != tag_name:
1484 continue
1485 if checker is not None:
1486 try:
1487 result = checker(candidate)
1488 except StopIteration:
1489 # The checker has decided we should no longer
1490 # run the generator.
1491 break
1492 if checker is None or result:
1493 if self._select_debug:
1494 print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
1495 if id(candidate) not in new_context_ids:
1496 # If a tag matches a selector more than once,
1497 # don't include it in the context more than once.
1498 new_context.append(candidate)
1499 new_context_ids.add(id(candidate))
1500 if limit and len(new_context) >= limit:
1501 break
1502 elif self._select_debug:
1503 print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
1504
1505
1506 current_context = new_context
1507
1508 if self._select_debug:
1509 print("Final verdict:")
1510 for i in current_context:
1511 print(" %s %s" % (i.name, i.attrs))
1512 return current_context
1513 2122
1514 # Old names for backwards compatibility 2123 # Old names for backwards compatibility
1515 def childGenerator(self): 2124 def childGenerator(self):
2125 """Deprecated generator."""
1516 return self.children 2126 return self.children
1517 2127
1518 def recursiveChildGenerator(self): 2128 def recursiveChildGenerator(self):
2129 """Deprecated generator."""
1519 return self.descendants 2130 return self.descendants
1520 2131
1521 def has_key(self, key): 2132 def has_key(self, key):
1522 """This was kind of misleading because has_key() (attributes) 2133 """Deprecated method. This was kind of misleading because has_key()
1523 was different from __in__ (contents). has_key() is gone in 2134 (attributes) was different from __in__ (contents).
1524 Python 3, anyway.""" 2135
1525 warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( 2136 has_key() is gone in Python 3, anyway.
1526 key)) 2137 """
2138 warnings.warn(
2139 'has_key is deprecated. Use has_attr(key) instead.',
2140 DeprecationWarning, stacklevel=2
2141 )
1527 return self.has_attr(key) 2142 return self.has_attr(key)
1528 2143
1529# Next, a couple classes to represent queries and their results. 2144# Next, a couple classes to represent queries and their results.
1530class SoupStrainer(object): 2145class SoupStrainer(object):
1531 """Encapsulates a number of ways of matching a markup element (tag or 2146 """Encapsulates a number of ways of matching a markup element (tag or
1532 text).""" 2147 string).
2148
2149 This is primarily used to underpin the find_* methods, but you can
2150 create one yourself and pass it in as `parse_only` to the
2151 `BeautifulSoup` constructor, to parse a subset of a large
2152 document.
2153 """
2154
2155 def __init__(self, name=None, attrs={}, string=None, **kwargs):
2156 """Constructor.
2157
2158 The SoupStrainer constructor takes the same arguments passed
2159 into the find_* methods. See the online documentation for
2160 detailed explanations.
2161
2162 :param name: A filter on tag name.
2163 :param attrs: A dictionary of filters on attribute values.
2164 :param string: A filter for a NavigableString with specific text.
2165 :kwargs: A dictionary of filters on attribute values.
2166 """
2167 if string is None and 'text' in kwargs:
2168 string = kwargs.pop('text')
2169 warnings.warn(
2170 "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
2171 DeprecationWarning, stacklevel=2
2172 )
1533 2173
1534 def __init__(self, name=None, attrs={}, text=None, **kwargs):
1535 self.name = self._normalize_search_value(name) 2174 self.name = self._normalize_search_value(name)
1536 if not isinstance(attrs, dict): 2175 if not isinstance(attrs, dict):
1537 # Treat a non-dict value for attrs as a search for the 'class' 2176 # Treat a non-dict value for attrs as a search for the 'class'
@@ -1556,12 +2195,15 @@ class SoupStrainer(object):
1556 normalized_attrs[key] = self._normalize_search_value(value) 2195 normalized_attrs[key] = self._normalize_search_value(value)
1557 2196
1558 self.attrs = normalized_attrs 2197 self.attrs = normalized_attrs
1559 self.text = self._normalize_search_value(text) 2198 self.string = self._normalize_search_value(string)
2199
2200 # DEPRECATED but just in case someone is checking this.
2201 self.text = self.string
1560 2202
1561 def _normalize_search_value(self, value): 2203 def _normalize_search_value(self, value):
1562 # Leave it alone if it's a Unicode string, a callable, a 2204 # Leave it alone if it's a Unicode string, a callable, a
1563 # regular expression, a boolean, or None. 2205 # regular expression, a boolean, or None.
1564 if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match') 2206 if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
1565 or isinstance(value, bool) or value is None): 2207 or isinstance(value, bool) or value is None):
1566 return value 2208 return value
1567 2209
@@ -1589,19 +2231,40 @@ class SoupStrainer(object):
1589 return str(str(value)) 2231 return str(str(value))
1590 2232
1591 def __str__(self): 2233 def __str__(self):
1592 if self.text: 2234 """A human-readable representation of this SoupStrainer."""
1593 return self.text 2235 if self.string:
2236 return self.string
1594 else: 2237 else:
1595 return "%s|%s" % (self.name, self.attrs) 2238 return "%s|%s" % (self.name, self.attrs)
1596 2239
1597 def search_tag(self, markup_name=None, markup_attrs={}): 2240 def search_tag(self, markup_name=None, markup_attrs={}):
2241 """Check whether a Tag with the given name and attributes would
2242 match this SoupStrainer.
2243
2244 Used prospectively to decide whether to even bother creating a Tag
2245 object.
2246
2247 :param markup_name: A tag name as found in some markup.
2248 :param markup_attrs: A dictionary of attributes as found in some markup.
2249
2250 :return: True if the prospective tag would match this SoupStrainer;
2251 False otherwise.
2252 """
1598 found = None 2253 found = None
1599 markup = None 2254 markup = None
1600 if isinstance(markup_name, Tag): 2255 if isinstance(markup_name, Tag):
1601 markup = markup_name 2256 markup = markup_name
1602 markup_attrs = markup 2257 markup_attrs = markup
2258
2259 if isinstance(self.name, str):
2260 # Optimization for a very common case where the user is
2261 # searching for a tag with one specific name, and we're
2262 # looking at a tag with a different name.
2263 if markup and not markup.prefix and self.name != markup.name:
2264 return False
2265
1603 call_function_with_tag_data = ( 2266 call_function_with_tag_data = (
1604 isinstance(self.name, collections.abc.Callable) 2267 isinstance(self.name, Callable)
1605 and not isinstance(markup_name, Tag)) 2268 and not isinstance(markup_name, Tag))
1606 2269
1607 if ((not self.name) 2270 if ((not self.name)
@@ -1630,13 +2293,22 @@ class SoupStrainer(object):
1630 found = markup 2293 found = markup
1631 else: 2294 else:
1632 found = markup_name 2295 found = markup_name
1633 if found and self.text and not self._matches(found.string, self.text): 2296 if found and self.string and not self._matches(found.string, self.string):
1634 found = None 2297 found = None
1635 return found 2298 return found
2299
2300 # For BS3 compatibility.
1636 searchTag = search_tag 2301 searchTag = search_tag
1637 2302
1638 def search(self, markup): 2303 def search(self, markup):
1639 # print 'looking for %s in %s' % (self, markup) 2304 """Find all items in `markup` that match this SoupStrainer.
2305
2306 Used by the core _find_all() method, which is ultimately
2307 called by all find_* methods.
2308
2309 :param markup: A PageElement or a list of them.
2310 """
2311 # print('looking for %s in %s' % (self, markup))
1640 found = None 2312 found = None
1641 # If given a list of items, scan it for a text element that 2313 # If given a list of items, scan it for a text element that
1642 # matches. 2314 # matches.
@@ -1649,49 +2321,44 @@ class SoupStrainer(object):
1649 # If it's a Tag, make sure its name or attributes match. 2321 # If it's a Tag, make sure its name or attributes match.
1650 # Don't bother with Tags if we're searching for text. 2322 # Don't bother with Tags if we're searching for text.
1651 elif isinstance(markup, Tag): 2323 elif isinstance(markup, Tag):
1652 if not self.text or self.name or self.attrs: 2324 if not self.string or self.name or self.attrs:
1653 found = self.search_tag(markup) 2325 found = self.search_tag(markup)
1654 # If it's text, make sure the text matches. 2326 # If it's text, make sure the text matches.
1655 elif isinstance(markup, NavigableString) or \ 2327 elif isinstance(markup, NavigableString) or \
1656 isinstance(markup, str): 2328 isinstance(markup, str):
1657 if not self.name and not self.attrs and self._matches(markup, self.text): 2329 if not self.name and not self.attrs and self._matches(markup, self.string):
1658 found = markup 2330 found = markup
1659 else: 2331 else:
1660 raise Exception( 2332 raise Exception(
1661 "I don't know how to match against a %s" % markup.__class__) 2333 "I don't know how to match against a %s" % markup.__class__)
1662 return found 2334 return found
1663 2335
1664 def _matches(self, markup, match_against): 2336 def _matches(self, markup, match_against, already_tried=None):
1665 # print u"Matching %s against %s" % (markup, match_against) 2337 # print(u"Matching %s against %s" % (markup, match_against))
1666 result = False 2338 result = False
1667 if isinstance(markup, list) or isinstance(markup, tuple): 2339 if isinstance(markup, list) or isinstance(markup, tuple):
1668 # This should only happen when searching a multi-valued attribute 2340 # This should only happen when searching a multi-valued attribute
1669 # like 'class'. 2341 # like 'class'.
1670 if (isinstance(match_against, str) 2342 for item in markup:
1671 and ' ' in match_against): 2343 if self._matches(item, match_against):
1672 # A bit of a special case. If they try to match "foo 2344 return True
1673 # bar" on a multivalue attribute's value, only accept 2345 # We didn't match any particular value of the multivalue
1674 # the literal value "foo bar" 2346 # attribute, but maybe we match the attribute value when
1675 # 2347 # considered as a string.
1676 # XXX This is going to be pretty slow because we keep 2348 if self._matches(' '.join(markup), match_against):
1677 # splitting match_against. But it shouldn't come up 2349 return True
1678 # too often. 2350 return False
1679 return (whitespace_re.split(match_against) == markup)
1680 else:
1681 for item in markup:
1682 if self._matches(item, match_against):
1683 return True
1684 return False
1685 2351
1686 if match_against is True: 2352 if match_against is True:
1687 # True matches any non-None value. 2353 # True matches any non-None value.
1688 return markup is not None 2354 return markup is not None
1689 2355
1690 if isinstance(match_against, collections.abc.Callable): 2356 if isinstance(match_against, Callable):
1691 return match_against(markup) 2357 return match_against(markup)
1692 2358
1693 # Custom callables take the tag as an argument, but all 2359 # Custom callables take the tag as an argument, but all
1694 # other ways of matching match the tag name as a string. 2360 # other ways of matching match the tag name as a string.
2361 original_markup = markup
1695 if isinstance(markup, Tag): 2362 if isinstance(markup, Tag):
1696 markup = markup.name 2363 markup = markup.name
1697 2364
@@ -1702,23 +2369,67 @@ class SoupStrainer(object):
1702 # None matches None, False, an empty string, an empty list, and so on. 2369 # None matches None, False, an empty string, an empty list, and so on.
1703 return not match_against 2370 return not match_against
1704 2371
1705 if isinstance(match_against, str): 2372 if (hasattr(match_against, '__iter__')
2373 and not isinstance(match_against, str)):
2374 # We're asked to match against an iterable of items.
2375 # The markup must match at least one item in the
2376 # iterable. We'll try each one in turn.
2377 #
2378 # To avoid infinite recursion we need to keep track of
2379 # items we've already seen.
2380 if not already_tried:
2381 already_tried = set()
2382 for item in match_against:
2383 if item.__hash__:
2384 key = item
2385 else:
2386 key = id(item)
2387 if key in already_tried:
2388 continue
2389 else:
2390 already_tried.add(key)
2391 if self._matches(original_markup, item, already_tried):
2392 return True
2393 else:
2394 return False
2395
2396 # Beyond this point we might need to run the test twice: once against
2397 # the tag's name and once against its prefixed name.
2398 match = False
2399
2400 if not match and isinstance(match_against, str):
1706 # Exact string match 2401 # Exact string match
1707 return markup == match_against 2402 match = markup == match_against
1708 2403
1709 if hasattr(match_against, 'match'): 2404 if not match and hasattr(match_against, 'search'):
1710 # Regexp match 2405 # Regexp match
1711 return match_against.search(markup) 2406 return match_against.search(markup)
1712 2407
1713 if hasattr(match_against, '__iter__'): 2408 if (not match
1714 # The markup must be an exact match against something 2409 and isinstance(original_markup, Tag)
1715 # in the iterable. 2410 and original_markup.prefix):
1716 return markup in match_against 2411 # Try the whole thing again with the prefixed tag name.
2412 return self._matches(
2413 original_markup.prefix + ':' + original_markup.name, match_against
2414 )
2415
2416 return match
1717 2417
1718 2418
1719class ResultSet(list): 2419class ResultSet(list):
1720 """A ResultSet is just a list that keeps track of the SoupStrainer 2420 """A ResultSet is just a list that keeps track of the SoupStrainer
1721 that created it.""" 2421 that created it."""
1722 def __init__(self, source, result=()): 2422 def __init__(self, source, result=()):
2423 """Constructor.
2424
2425 :param source: A SoupStrainer.
2426 :param result: A list of PageElements.
2427 """
1723 super(ResultSet, self).__init__(result) 2428 super(ResultSet, self).__init__(result)
1724 self.source = source 2429 self.source = source
2430
2431 def __getattr__(self, key):
2432 """Raise a helpful exception to explain a common code fix."""
2433 raise AttributeError(
2434 "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
2435 )
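
The renamed `string` argument and the `parse_only` hook above can be exercised with a short sketch (markup and tag names are illustrative; the stock html.parser builder is assumed):

    from bs4 import BeautifulSoup, SoupStrainer

    # Restrict parsing to <b> tags by passing a SoupStrainer as parse_only.
    strainer = SoupStrainer("b")
    soup = BeautifulSoup("A <b>bold</b> <meta/> <i>statement</i>",
                         "html.parser", parse_only=strainer)
    print(soup.decode())  # -> <b>bold</b>

    # The renamed keyword: match strings by content with 'string';
    # the old 'text' spelling is deprecated per the warning above.
    full = BeautifulSoup("A <b>bold</b> statement", "html.parser")
    print(full.find_all(string="bold"))  # -> ['bold']
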
diff --git a/bitbake/lib/bs4/formatter.py b/bitbake/lib/bs4/formatter.py
new file mode 100644
index 0000000000..9fa1b57cb6
--- /dev/null
+++ b/bitbake/lib/bs4/formatter.py
@@ -0,0 +1,185 @@
1from bs4.dammit import EntitySubstitution
2
3class Formatter(EntitySubstitution):
4 """Describes a strategy to use when outputting a parse tree to a string.
5
6 Some parts of this strategy come from the distinction between
7 HTML4, HTML5, and XML. Others are configurable by the user.
8
9 Formatters are passed in as the `formatter` argument to methods
10 like `PageElement.encode`. Most people won't need to think about
11 formatters, and most people who need to think about them can pass
12 in one of these predefined strings as `formatter` rather than
13 making a new Formatter object:
14
15 For HTML documents:
16 * 'html' - HTML entity substitution for generic HTML documents. (default)
17 * 'html5' - HTML entity substitution for HTML5 documents, as
18 well as some optimizations in the way tags are rendered.
19 * 'minimal' - Only make the substitutions necessary to guarantee
20 valid HTML.
21 * None - Do not perform any substitution. This will be faster
22 but may result in invalid markup.
23
24 For XML documents:
25 * 'html' - Entity substitution for XHTML documents.
26 * 'minimal' - Only make the substitutions necessary to guarantee
27 valid XML. (default)
28 * None - Do not perform any substitution. This will be faster
29 but may result in invalid markup.
30 """
31 # Registries of XML and HTML formatters.
32 XML_FORMATTERS = {}
33 HTML_FORMATTERS = {}
34
35 HTML = 'html'
36 XML = 'xml'
37
38 HTML_DEFAULTS = dict(
39 cdata_containing_tags=set(["script", "style"]),
40 )
41
42 def _default(self, language, value, kwarg):
43 if value is not None:
44 return value
45 if language == self.XML:
46 return set()
47 return self.HTML_DEFAULTS[kwarg]
48
49 def __init__(
50 self, language=None, entity_substitution=None,
51 void_element_close_prefix='/', cdata_containing_tags=None,
52 empty_attributes_are_booleans=False, indent=1,
53 ):
54 r"""Constructor.
55
56 :param language: This should be Formatter.XML if you are formatting
57 XML markup and Formatter.HTML if you are formatting HTML markup.
58
59 :param entity_substitution: A function to call to replace special
60 characters with XML/HTML entities. For examples, see
61 bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
62 :param void_element_close_prefix: By default, void elements
63 are represented as <tag/> (XML rules) rather than <tag>
64 (HTML rules). To get <tag>, pass in the empty string.
65 :param cdata_containing_tags: The list of tags that are defined
66 as containing CDATA in this dialect. For example, in HTML,
67 <script> and <style> tags are defined as containing CDATA,
68 and their contents should not be formatted.
69 :param empty_attributes_are_booleans: Render attributes whose value
70 is the empty string as HTML-style boolean attributes.
71 (Attributes whose value is None are always rendered this way.)
72
73 :param indent: If indent is a non-negative integer or string,
74 then the contents of elements will be indented
75 appropriately when pretty-printing. An indent level of 0,
76 negative, or "" will only insert newlines. Using a
77 positive integer indent indents that many spaces per
78 level. If indent is a string (such as "\t"), that string
79 is used to indent each level. The default behavior is to
80 indent one space per level.
81 """
82 self.language = language
83 self.entity_substitution = entity_substitution
84 self.void_element_close_prefix = void_element_close_prefix
85 self.cdata_containing_tags = self._default(
86 language, cdata_containing_tags, 'cdata_containing_tags'
87 )
88 self.empty_attributes_are_booleans=empty_attributes_are_booleans
89 if indent is None:
90 indent = 0
91 if isinstance(indent, int):
92 if indent < 0:
93 indent = 0
94 indent = ' ' * indent
95 elif isinstance(indent, str):
96 pass
97 else:
98 indent = ' '
99 self.indent = indent
100
101 def substitute(self, ns):
102 """Process a string that needs to undergo entity substitution.
103 This may be a string encountered in an attribute value or as
104 text.
105
106 :param ns: A string.
107 :return: A string with certain characters replaced by named
108 or numeric entities.
109 """
110 if not self.entity_substitution:
111 return ns
112 from .element import NavigableString
113 if (isinstance(ns, NavigableString)
114 and ns.parent is not None
115 and ns.parent.name in self.cdata_containing_tags):
116 # Do nothing.
117 return ns
118 # Substitute.
119 return self.entity_substitution(ns)
120
121 def attribute_value(self, value):
122 """Process the value of an attribute.
123
124 :param value: A string.
125 :return: A string with certain characters replaced by named
126 or numeric entities.
127 """
128 return self.substitute(value)
129
130 def attributes(self, tag):
131 """Reorder a tag's attributes however you want.
132
133 By default, attributes are sorted alphabetically. This makes
134 behavior consistent between Python 2 and Python 3, and preserves
135 backwards compatibility with older versions of Beautiful Soup.
136
137 If `empty_attributes_are_booleans` is True, then attributes whose
138 values are set to the empty string will be treated as boolean
139 attributes.
140 """
141 if tag.attrs is None:
142 return []
143 return sorted(
144 (k, (None if self.empty_attributes_are_booleans and v == '' else v))
145 for k, v in list(tag.attrs.items())
146 )
147
148class HTMLFormatter(Formatter):
149 """A generic Formatter for HTML."""
150 REGISTRY = {}
151 def __init__(self, *args, **kwargs):
152 super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
153
154
155class XMLFormatter(Formatter):
156 """A generic Formatter for XML."""
157 REGISTRY = {}
158 def __init__(self, *args, **kwargs):
159 super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
160
161
162# Set up aliases for the default formatters.
163HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
164 entity_substitution=EntitySubstitution.substitute_html
165)
166HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
167 entity_substitution=EntitySubstitution.substitute_html,
168 void_element_close_prefix=None,
169 empty_attributes_are_booleans=True,
170)
171HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
172 entity_substitution=EntitySubstitution.substitute_xml
173)
174HTMLFormatter.REGISTRY[None] = HTMLFormatter(
175 entity_substitution=None
176)
177XMLFormatter.REGISTRY["html"] = XMLFormatter(
178 entity_substitution=EntitySubstitution.substitute_html
179)
180XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
181 entity_substitution=EntitySubstitution.substitute_xml
182)
183XMLFormatter.REGISTRY[None] = Formatter(
184 Formatter.XML, entity_substitution=None
185)
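
A minimal sketch of how these registries are consumed (names come from this file; the html.parser builder is assumed, and prettify() honoring Formatter.indent assumes the updated element.py):

    from bs4 import BeautifulSoup
    from bs4.formatter import HTMLFormatter

    soup = BeautifulSoup('<p id="">&amp; bar</p>', "html.parser")

    # String keys resolve through HTMLFormatter.REGISTRY defined above.
    print(soup.p.decode(formatter="minimal"))  # <p id="">&amp; bar</p>
    print(soup.p.decode(formatter="html5"))    # empty id renders as a boolean attribute

    # A custom Formatter: the new 'indent' knob controls pretty-printing.
    tabbed = HTMLFormatter(indent="\t")
    print(soup.prettify(formatter=tabbed))
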
diff --git a/bitbake/lib/bs4/testing.py b/bitbake/lib/bs4/testing.py
deleted file mode 100644
index 6584ecf303..0000000000
--- a/bitbake/lib/bs4/testing.py
+++ /dev/null
@@ -1,686 +0,0 @@
1"""Helper classes for tests."""
2
3__license__ = "MIT"
4
5import pickle
6import copy
7import unittest
8from unittest import TestCase
9from bs4 import BeautifulSoup
10from bs4.element import (
11 CharsetMetaAttributeValue,
12 Comment,
13 ContentMetaAttributeValue,
14 Doctype,
15 SoupStrainer,
16)
17
18from bs4.builder._htmlparser import HTMLParserTreeBuilder
19default_builder = HTMLParserTreeBuilder
20
21
22class SoupTest(unittest.TestCase):
23
24 @property
25 def default_builder(self):
26 return default_builder()
27
28 def soup(self, markup, **kwargs):
29 """Build a Beautiful Soup object from markup."""
30 builder = kwargs.pop('builder', self.default_builder)
31 return BeautifulSoup(markup, builder=builder, **kwargs)
32
33 def document_for(self, markup):
34 """Turn an HTML fragment into a document.
35
36 The details depend on the builder.
37 """
38 return self.default_builder.test_fragment_to_document(markup)
39
40 def assertSoupEquals(self, to_parse, compare_parsed_to=None):
41 builder = self.default_builder
42 obj = BeautifulSoup(to_parse, builder=builder)
43 if compare_parsed_to is None:
44 compare_parsed_to = to_parse
45
46 self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
47
48 def assertConnectedness(self, element):
49 """Ensure that next_element and previous_element are properly
50 set for all descendants of the given element.
51 """
52 earlier = None
53 for e in element.descendants:
54 if earlier:
55 self.assertEqual(e, earlier.next_element)
56 self.assertEqual(earlier, e.previous_element)
57 earlier = e
58
59class HTMLTreeBuilderSmokeTest(SoupTest):
60
61 """A basic test of a treebuilder's competence.
62
63 Any HTML treebuilder, present or future, should be able to pass
64 these tests. With invalid markup, there's room for interpretation,
65 and different parsers can handle it differently. But with the
66 markup in these tests, there's not much room for interpretation.
67 """
68
69 def test_pickle_and_unpickle_identity(self):
70 # Pickling a tree, then unpickling it, yields a tree identical
71 # to the original.
72 tree = self.soup("<a><b>foo</a>")
73 dumped = pickle.dumps(tree, 2)
74 loaded = pickle.loads(dumped)
75 self.assertEqual(loaded.__class__, BeautifulSoup)
76 self.assertEqual(loaded.decode(), tree.decode())
77
78 def assertDoctypeHandled(self, doctype_fragment):
79 """Assert that a given doctype string is handled correctly."""
80 doctype_str, soup = self._document_with_doctype(doctype_fragment)
81
82 # Make sure a Doctype object was created.
83 doctype = soup.contents[0]
84 self.assertEqual(doctype.__class__, Doctype)
85 self.assertEqual(doctype, doctype_fragment)
86 self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
87
88 # Make sure that the doctype was correctly associated with the
89 # parse tree and that the rest of the document parsed.
90 self.assertEqual(soup.p.contents[0], 'foo')
91
92 def _document_with_doctype(self, doctype_fragment):
93 """Generate and parse a document with the given doctype."""
94 doctype = '<!DOCTYPE %s>' % doctype_fragment
95 markup = doctype + '\n<p>foo</p>'
96 soup = self.soup(markup)
97 return doctype, soup
98
99 def test_normal_doctypes(self):
100 """Make sure normal, everyday HTML doctypes are handled correctly."""
101 self.assertDoctypeHandled("html")
102 self.assertDoctypeHandled(
103 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
104
105 def test_empty_doctype(self):
106 soup = self.soup("<!DOCTYPE>")
107 doctype = soup.contents[0]
108 self.assertEqual("", doctype.strip())
109
110 def test_public_doctype_with_url(self):
111 doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
112 self.assertDoctypeHandled(doctype)
113
114 def test_system_doctype(self):
115 self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
116
117 def test_namespaced_system_doctype(self):
118 # We can handle a namespaced doctype with a system ID.
119 self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
120
121 def test_namespaced_public_doctype(self):
122 # Test a namespaced doctype with a public id.
123 self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
124
125 def test_real_xhtml_document(self):
126 """A real XHTML document should come out more or less the same as it went in."""
127 markup = b"""<?xml version="1.0" encoding="utf-8"?>
128<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
129<html xmlns="http://www.w3.org/1999/xhtml">
130<head><title>Hello.</title></head>
131<body>Goodbye.</body>
132</html>"""
133 soup = self.soup(markup)
134 self.assertEqual(
135 soup.encode("utf-8").replace(b"\n", b""),
136 markup.replace(b"\n", b""))
137
138 def test_processing_instruction(self):
139 markup = b"""<?PITarget PIContent?>"""
140 soup = self.soup(markup)
141 self.assertEqual(markup, soup.encode("utf8"))
142
143 def test_deepcopy(self):
144 """Make sure you can copy the tree builder.
145
146 This is important because the builder is part of a
147 BeautifulSoup object, and we want to be able to copy that.
148 """
149 copy.deepcopy(self.default_builder)
150
151 def test_p_tag_is_never_empty_element(self):
152 """A <p> tag is never designated as an empty-element tag.
153
154 Even if the markup shows it as an empty-element tag, it
155 shouldn't be presented that way.
156 """
157 soup = self.soup("<p/>")
158 self.assertFalse(soup.p.is_empty_element)
159 self.assertEqual(str(soup.p), "<p></p>")
160
161 def test_unclosed_tags_get_closed(self):
162 """A tag that's not closed by the end of the document should be closed.
163
164 This applies to all tags except empty-element tags.
165 """
166 self.assertSoupEquals("<p>", "<p></p>")
167 self.assertSoupEquals("<b>", "<b></b>")
168
169 self.assertSoupEquals("<br>", "<br/>")
170
171 def test_br_is_always_empty_element_tag(self):
172 """A <br> tag is designated as an empty-element tag.
173
174 Some parsers treat <br></br> as one <br/> tag, some parsers as
175 two tags, but it should always be an empty-element tag.
176 """
177 soup = self.soup("<br></br>")
178 self.assertTrue(soup.br.is_empty_element)
179 self.assertEqual(str(soup.br), "<br/>")
180
181 def test_nested_formatting_elements(self):
182 self.assertSoupEquals("<em><em></em></em>")
183
184 def test_double_head(self):
185 html = '''<!DOCTYPE html>
186<html>
187<head>
188<title>Ordinary HEAD element test</title>
189</head>
190<script type="text/javascript">
191alert("Help!");
192</script>
193<body>
194Hello, world!
195</body>
196</html>
197'''
198 soup = self.soup(html)
199 self.assertEqual("text/javascript", soup.find('script')['type'])
200
201 def test_comment(self):
202 # Comments are represented as Comment objects.
203 markup = "<p>foo<!--foobar-->baz</p>"
204 self.assertSoupEquals(markup)
205
206 soup = self.soup(markup)
207 comment = soup.find(text="foobar")
208 self.assertEqual(comment.__class__, Comment)
209
210 # The comment is properly integrated into the tree.
211 foo = soup.find(text="foo")
212 self.assertEqual(comment, foo.next_element)
213 baz = soup.find(text="baz")
214 self.assertEqual(comment, baz.previous_element)
215
216 def test_preserved_whitespace_in_pre_and_textarea(self):
217 """Whitespace must be preserved in <pre> and <textarea> tags."""
218 self.assertSoupEquals("<pre> </pre>")
219 self.assertSoupEquals("<textarea> woo </textarea>")
220
221 def test_nested_inline_elements(self):
222 """Inline elements can be nested indefinitely."""
223 b_tag = "<b>Inside a B tag</b>"
224 self.assertSoupEquals(b_tag)
225
226 nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
227 self.assertSoupEquals(nested_b_tag)
228
229 double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
230         self.assertSoupEquals(double_nested_b_tag)
231
232 def test_nested_block_level_elements(self):
233 """Block elements can be nested."""
234 soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
235 blockquote = soup.blockquote
236 self.assertEqual(blockquote.p.b.string, 'Foo')
237 self.assertEqual(blockquote.b.string, 'Foo')
238
239 def test_correctly_nested_tables(self):
240 """One table can go inside another one."""
241 markup = ('<table id="1">'
242 '<tr>'
243 "<td>Here's another table:"
244 '<table id="2">'
245 '<tr><td>foo</td></tr>'
246 '</table></td>')
247
248 self.assertSoupEquals(
249 markup,
250 '<table id="1"><tr><td>Here\'s another table:'
251 '<table id="2"><tr><td>foo</td></tr></table>'
252 '</td></tr></table>')
253
254 self.assertSoupEquals(
255 "<table><thead><tr><td>Foo</td></tr></thead>"
256 "<tbody><tr><td>Bar</td></tr></tbody>"
257 "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
258
259 def test_deeply_nested_multivalued_attribute(self):
260 # html5lib can set the attributes of the same tag many times
261 # as it rearranges the tree. This has caused problems with
262 # multivalued attributes.
263 markup = '<table><div><div class="css"></div></div></table>'
264 soup = self.soup(markup)
265 self.assertEqual(["css"], soup.div.div['class'])
266
267 def test_multivalued_attribute_on_html(self):
268 # html5lib uses a different API to set the attributes of the
269 # <html> tag. This has caused problems with multivalued
270 # attributes.
271 markup = '<html class="a b"></html>'
272 soup = self.soup(markup)
273 self.assertEqual(["a", "b"], soup.html['class'])
274
275 def test_angle_brackets_in_attribute_values_are_escaped(self):
276 self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
277
278 def test_entities_in_attributes_converted_to_unicode(self):
279 expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
280 self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
281 self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
282 self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
283 self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
284
285 def test_entities_in_text_converted_to_unicode(self):
286 expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
287 self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
288 self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
289 self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
290 self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
291
292 def test_quot_entity_converted_to_quotation_mark(self):
293 self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
294 '<p>I said "good day!"</p>')
295
296 def test_out_of_range_entity(self):
297 expect = "\N{REPLACEMENT CHARACTER}"
298 self.assertSoupEquals("&#10000000000000;", expect)
299 self.assertSoupEquals("&#x10000000000000;", expect)
300 self.assertSoupEquals("&#1000000000;", expect)
301
302 def test_multipart_strings(self):
303 "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
304 soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
305 self.assertEqual("p", soup.h2.string.next_element.name)
306 self.assertEqual("p", soup.p.name)
307 self.assertConnectedness(soup)
308
309 def test_head_tag_between_head_and_body(self):
310 "Prevent recurrence of a bug in the html5lib treebuilder."
311 content = """<html><head></head>
312 <link></link>
313 <body>foo</body>
314</html>
315"""
316 soup = self.soup(content)
317 self.assertNotEqual(None, soup.html.body)
318 self.assertConnectedness(soup)
319
320 def test_multiple_copies_of_a_tag(self):
321 "Prevent recurrence of a bug in the html5lib treebuilder."
322 content = """<!DOCTYPE html>
323<html>
324 <body>
325 <article id="a" >
326 <div><a href="1"></div>
327 <footer>
328 <a href="2"></a>
329 </footer>
330 </article>
331 </body>
332</html>
333"""
334 soup = self.soup(content)
335 self.assertConnectedness(soup.article)
336
337 def test_basic_namespaces(self):
338 """Parsers don't need to *understand* namespaces, but at the
339 very least they should not choke on namespaces or lose
340 data."""
341
342 markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
343 soup = self.soup(markup)
344 self.assertEqual(markup, soup.encode())
345 html = soup.html
346 self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
347 self.assertEqual(
348 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
349 self.assertEqual(
350 'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
351
352 def test_multivalued_attribute_value_becomes_list(self):
353 markup = b'<a class="foo bar">'
354 soup = self.soup(markup)
355 self.assertEqual(['foo', 'bar'], soup.a['class'])
356
357 #
358 # Generally speaking, tests below this point are more tests of
359 # Beautiful Soup than tests of the tree builders. But parsers are
360 # weird, so we run these tests separately for every tree builder
361 # to detect any differences between them.
362 #
363
364 def test_can_parse_unicode_document(self):
365 # A seemingly innocuous document... but it's in Unicode! And
366 # it contains characters that can't be represented in the
367 # encoding found in the declaration! The horror!
368 markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
369 soup = self.soup(markup)
370 self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
371
372 def test_soupstrainer(self):
373 """Parsers should be able to work with SoupStrainers."""
374 strainer = SoupStrainer("b")
375 soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
376 parse_only=strainer)
377 self.assertEqual(soup.decode(), "<b>bold</b>")
378
379 def test_single_quote_attribute_values_become_double_quotes(self):
380 self.assertSoupEquals("<foo attr='bar'></foo>",
381 '<foo attr="bar"></foo>')
382
383 def test_attribute_values_with_nested_quotes_are_left_alone(self):
384 text = """<foo attr='bar "brawls" happen'>a</foo>"""
385 self.assertSoupEquals(text)
386
387 def test_attribute_values_with_double_nested_quotes_get_quoted(self):
388 text = """<foo attr='bar "brawls" happen'>a</foo>"""
389 soup = self.soup(text)
390 soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
391 self.assertSoupEquals(
392 soup.foo.decode(),
393 """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
394
395 def test_ampersand_in_attribute_value_gets_escaped(self):
396 self.assertSoupEquals('<this is="really messed up & stuff"></this>',
397 '<this is="really messed up &amp; stuff"></this>')
398
399 self.assertSoupEquals(
400 '<a href="http://example.org?a=1&b=2;3">foo</a>',
401 '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
402
403 def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
404 self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
405
406 def test_entities_in_strings_converted_during_parsing(self):
407 # Both XML and HTML entities are converted to Unicode characters
408 # during parsing.
409 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
410 expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
411 self.assertSoupEquals(text, expected)
412
413 def test_smart_quotes_converted_on_the_way_in(self):
414 # Microsoft smart quotes are converted to Unicode characters during
415 # parsing.
416 quote = b"<p>\x91Foo\x92</p>"
417 soup = self.soup(quote)
418 self.assertEqual(
419 soup.p.string,
420 "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
421
422 def test_non_breaking_spaces_converted_on_the_way_in(self):
423 soup = self.soup("<a>&nbsp;&nbsp;</a>")
424 self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
425
426 def test_entities_converted_on_the_way_out(self):
427 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
428 expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
429 soup = self.soup(text)
430 self.assertEqual(soup.p.encode("utf-8"), expected)
431
432 def test_real_iso_latin_document(self):
433 # Smoke test of interrelated functionality, using an
434 # easy-to-understand document.
435
436 # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
437 unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
438
439 # That's because we're going to encode it into ISO-Latin-1, and use
440 # that to test.
441 iso_latin_html = unicode_html.encode("iso-8859-1")
442
443 # Parse the ISO-Latin-1 HTML.
444 soup = self.soup(iso_latin_html)
445 # Encode it to UTF-8.
446 result = soup.encode("utf-8")
447
448 # What do we expect the result to look like? Well, it would
449 # look like unicode_html, except that the META tag would say
450 # UTF-8 instead of ISO-Latin-1.
451 expected = unicode_html.replace("ISO-Latin-1", "utf-8")
452
453 # And, of course, it would be in UTF-8, not Unicode.
454 expected = expected.encode("utf-8")
455
456 # Ta-da!
457 self.assertEqual(result, expected)
458
459 def test_real_shift_jis_document(self):
460 # Smoke test to make sure the parser can handle a document in
461 # Shift-JIS encoding, without choking.
462 shift_jis_html = (
463 b'<html><head></head><body><pre>'
464 b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
465 b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
466 b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
467 b'</pre></body></html>')
468 unicode_html = shift_jis_html.decode("shift-jis")
469 soup = self.soup(unicode_html)
470
471 # Make sure the parse tree is correctly encoded to various
472 # encodings.
473 self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
474 self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
475
476 def test_real_hebrew_document(self):
477         # A real-world test to make sure we can convert ISO-8859-8 (a
478 # Hebrew encoding) to UTF-8.
479 hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
480 soup = self.soup(
481 hebrew_document, from_encoding="iso8859-8")
482 self.assertEqual(soup.original_encoding, 'iso8859-8')
483 self.assertEqual(
484 soup.encode('utf-8'),
485 hebrew_document.decode("iso8859-8").encode("utf-8"))
486
487 def test_meta_tag_reflects_current_encoding(self):
488 # Here's the <meta> tag saying that a document is
489 # encoded in Shift-JIS.
490 meta_tag = ('<meta content="text/html; charset=x-sjis" '
491 'http-equiv="Content-type"/>')
492
493 # Here's a document incorporating that meta tag.
494 shift_jis_html = (
495 '<html><head>\n%s\n'
496 '<meta http-equiv="Content-language" content="ja"/>'
497 '</head><body>Shift-JIS markup goes here.') % meta_tag
498 soup = self.soup(shift_jis_html)
499
500 # Parse the document, and the charset is seemingly unaffected.
501 parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
502 content = parsed_meta['content']
503 self.assertEqual('text/html; charset=x-sjis', content)
504
505 # But that value is actually a ContentMetaAttributeValue object.
506 self.assertTrue(isinstance(content, ContentMetaAttributeValue))
507
508 # And it will take on a value that reflects its current
509 # encoding.
510 self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
511
512 # For the rest of the story, see TestSubstitutions in
513 # test_tree.py.
514
515 def test_html5_style_meta_tag_reflects_current_encoding(self):
516 # Here's the <meta> tag saying that a document is
517 # encoded in Shift-JIS.
518 meta_tag = ('<meta id="encoding" charset="x-sjis" />')
519
520 # Here's a document incorporating that meta tag.
521 shift_jis_html = (
522 '<html><head>\n%s\n'
523 '<meta http-equiv="Content-language" content="ja"/>'
524 '</head><body>Shift-JIS markup goes here.') % meta_tag
525 soup = self.soup(shift_jis_html)
526
527 # Parse the document, and the charset is seemingly unaffected.
528 parsed_meta = soup.find('meta', id="encoding")
529 charset = parsed_meta['charset']
530 self.assertEqual('x-sjis', charset)
531
532 # But that value is actually a CharsetMetaAttributeValue object.
533 self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
534
535 # And it will take on a value that reflects its current
536 # encoding.
537 self.assertEqual('utf8', charset.encode("utf8"))
538
539 def test_tag_with_no_attributes_can_have_attributes_added(self):
540 data = self.soup("<a>text</a>")
541 data.a['foo'] = 'bar'
542 self.assertEqual('<a foo="bar">text</a>', data.a.decode())
543
544class XMLTreeBuilderSmokeTest(SoupTest):
545
546 def test_pickle_and_unpickle_identity(self):
547 # Pickling a tree, then unpickling it, yields a tree identical
548 # to the original.
549 tree = self.soup("<a><b>foo</a>")
550 dumped = pickle.dumps(tree, 2)
551 loaded = pickle.loads(dumped)
552 self.assertEqual(loaded.__class__, BeautifulSoup)
553 self.assertEqual(loaded.decode(), tree.decode())
554
555 def test_docstring_generated(self):
556 soup = self.soup("<root/>")
557 self.assertEqual(
558 soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
559
560 def test_xml_declaration(self):
561 markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
562 soup = self.soup(markup)
563 self.assertEqual(markup, soup.encode("utf8"))
564
565 def test_real_xhtml_document(self):
566 """A real XHTML document should come out *exactly* the same as it went in."""
567 markup = b"""<?xml version="1.0" encoding="utf-8"?>
568<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
569<html xmlns="http://www.w3.org/1999/xhtml">
570<head><title>Hello.</title></head>
571<body>Goodbye.</body>
572</html>"""
573 soup = self.soup(markup)
574 self.assertEqual(
575 soup.encode("utf-8"), markup)
576
577 def test_formatter_processes_script_tag_for_xml_documents(self):
578 doc = """
579 <script type="text/javascript">
580 </script>
581"""
582 soup = BeautifulSoup(doc, "lxml-xml")
583 # lxml would have stripped this while parsing, but we can add
584 # it later.
585 soup.script.string = 'console.log("< < hey > > ");'
586 encoded = soup.encode()
587 self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
588
589 def test_can_parse_unicode_document(self):
590 markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
591 soup = self.soup(markup)
592 self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
593
594 def test_popping_namespaced_tag(self):
595 markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
596 soup = self.soup(markup)
597 self.assertEqual(
598 str(soup.rss), markup)
599
600 def test_docstring_includes_correct_encoding(self):
601 soup = self.soup("<root/>")
602 self.assertEqual(
603 soup.encode("latin1"),
604 b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
605
606 def test_large_xml_document(self):
607 """A large XML document should come out the same as it went in."""
608 markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
609 + b'0' * (2**12)
610 + b'</root>')
611 soup = self.soup(markup)
612 self.assertEqual(soup.encode("utf-8"), markup)
613
614
615 def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
616 self.assertSoupEquals("<p>", "<p/>")
617 self.assertSoupEquals("<p>foo</p>")
618
619 def test_namespaces_are_preserved(self):
620 markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
621 soup = self.soup(markup)
622 root = soup.root
623 self.assertEqual("http://example.com/", root['xmlns:a'])
624 self.assertEqual("http://example.net/", root['xmlns:b'])
625
626 def test_closing_namespaced_tag(self):
627 markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
628 soup = self.soup(markup)
629 self.assertEqual(str(soup.p), markup)
630
631 def test_namespaced_attributes(self):
632 markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
633 soup = self.soup(markup)
634 self.assertEqual(str(soup.foo), markup)
635
636 def test_namespaced_attributes_xml_namespace(self):
637 markup = '<foo xml:lang="fr">bar</foo>'
638 soup = self.soup(markup)
639 self.assertEqual(str(soup.foo), markup)
640
641class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
642 """Smoke test for a tree builder that supports HTML5."""
643
644 def test_real_xhtml_document(self):
645 # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
646 # XHTML documents in any particular way.
647 pass
648
649 def test_html_tags_have_namespace(self):
650 markup = "<a>"
651 soup = self.soup(markup)
652 self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
653
654 def test_svg_tags_have_namespace(self):
655 markup = '<svg><circle/></svg>'
656 soup = self.soup(markup)
657 namespace = "http://www.w3.org/2000/svg"
658 self.assertEqual(namespace, soup.svg.namespace)
659 self.assertEqual(namespace, soup.circle.namespace)
660
661
662 def test_mathml_tags_have_namespace(self):
663 markup = '<math><msqrt>5</msqrt></math>'
664 soup = self.soup(markup)
665 namespace = 'http://www.w3.org/1998/Math/MathML'
666 self.assertEqual(namespace, soup.math.namespace)
667 self.assertEqual(namespace, soup.msqrt.namespace)
668
669 def test_xml_declaration_becomes_comment(self):
670 markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
671 soup = self.soup(markup)
672 self.assertTrue(isinstance(soup.contents[0], Comment))
673 self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
674 self.assertEqual("html", soup.contents[0].next_element.name)
675
676def skipIf(condition, reason):
677 def nothing(test, *args, **kwargs):
678 return None
679
680 def decorator(test_item):
681 if condition:
682 return nothing
683 else:
684 return test_item
685
686 return decorator
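
Before their removal, these helpers were composed per parser: a concrete case mixed SoupTest with a smoke-test class and supplied its builder. A sketch mirroring the deleted test modules below:

    import unittest
    from bs4.builder import HTMLParserTreeBuilder
    from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest

    class HTMLParserSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        # Each concrete case only declares which builder it exercises;
        # the inherited smoke tests do the rest.
        @property
        def default_builder(self):
            return HTMLParserTreeBuilder()

    if __name__ == "__main__":
        unittest.main()
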
diff --git a/bitbake/lib/bs4/tests/__init__.py b/bitbake/lib/bs4/tests/__init__.py
deleted file mode 100644
index 142c8cc3f1..0000000000
--- a/bitbake/lib/bs4/tests/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
1"The beautifulsoup tests."
diff --git a/bitbake/lib/bs4/tests/test_builder_registry.py b/bitbake/lib/bs4/tests/test_builder_registry.py
deleted file mode 100644
index 90cad82933..0000000000
--- a/bitbake/lib/bs4/tests/test_builder_registry.py
+++ /dev/null
@@ -1,147 +0,0 @@
1"""Tests of the builder registry."""
2
3import unittest
4import warnings
5
6from bs4 import BeautifulSoup
7from bs4.builder import (
8 builder_registry as registry,
9 HTMLParserTreeBuilder,
10 TreeBuilderRegistry,
11)
12
13try:
14 from bs4.builder import HTML5TreeBuilder
15 HTML5LIB_PRESENT = True
16except ImportError:
17 HTML5LIB_PRESENT = False
18
19try:
20 from bs4.builder import (
21 LXMLTreeBuilderForXML,
22 LXMLTreeBuilder,
23 )
24 LXML_PRESENT = True
25except ImportError:
26 LXML_PRESENT = False
27
28
29class BuiltInRegistryTest(unittest.TestCase):
30 """Test the built-in registry with the default builders registered."""
31
32 def test_combination(self):
33 if LXML_PRESENT:
34 self.assertEqual(registry.lookup('fast', 'html'),
35 LXMLTreeBuilder)
36
37 if LXML_PRESENT:
38 self.assertEqual(registry.lookup('permissive', 'xml'),
39 LXMLTreeBuilderForXML)
40 self.assertEqual(registry.lookup('strict', 'html'),
41 HTMLParserTreeBuilder)
42 if HTML5LIB_PRESENT:
43 self.assertEqual(registry.lookup('html5lib', 'html'),
44 HTML5TreeBuilder)
45
46 def test_lookup_by_markup_type(self):
47 if LXML_PRESENT:
48 self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
49 self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
50 else:
51 self.assertEqual(registry.lookup('xml'), None)
52 if HTML5LIB_PRESENT:
53 self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
54 else:
55 self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
56
57 def test_named_library(self):
58 if LXML_PRESENT:
59 self.assertEqual(registry.lookup('lxml', 'xml'),
60 LXMLTreeBuilderForXML)
61 self.assertEqual(registry.lookup('lxml', 'html'),
62 LXMLTreeBuilder)
63 if HTML5LIB_PRESENT:
64 self.assertEqual(registry.lookup('html5lib'),
65 HTML5TreeBuilder)
66
67 self.assertEqual(registry.lookup('html.parser'),
68 HTMLParserTreeBuilder)
69
70 def test_beautifulsoup_constructor_does_lookup(self):
71
72 with warnings.catch_warnings(record=True) as w:
73 # This will create a warning about not explicitly
74 # specifying a parser, but we'll ignore it.
75
76 # You can pass in a string.
77 BeautifulSoup("", features="html")
78 # Or a list of strings.
79 BeautifulSoup("", features=["html", "fast"])
80
81 # You'll get an exception if BS can't find an appropriate
82 # builder.
83 self.assertRaises(ValueError, BeautifulSoup,
84 "", features="no-such-feature")
85
86class RegistryTest(unittest.TestCase):
87 """Test the TreeBuilderRegistry class in general."""
88
89 def setUp(self):
90 self.registry = TreeBuilderRegistry()
91
92 def builder_for_features(self, *feature_list):
93 cls = type('Builder_' + '_'.join(feature_list),
94 (object,), {'features' : feature_list})
95
96 self.registry.register(cls)
97 return cls
98
99 def test_register_with_no_features(self):
100 builder = self.builder_for_features()
101
102 # Since the builder advertises no features, you can't find it
103 # by looking up features.
104 self.assertEqual(self.registry.lookup('foo'), None)
105
106 # But you can find it by doing a lookup with no features, if
107 # this happens to be the only registered builder.
108 self.assertEqual(self.registry.lookup(), builder)
109
110 def test_register_with_features_makes_lookup_succeed(self):
111 builder = self.builder_for_features('foo', 'bar')
112 self.assertEqual(self.registry.lookup('foo'), builder)
113 self.assertEqual(self.registry.lookup('bar'), builder)
114
115 def test_lookup_fails_when_no_builder_implements_feature(self):
116 builder = self.builder_for_features('foo', 'bar')
117 self.assertEqual(self.registry.lookup('baz'), None)
118
119 def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
120 builder1 = self.builder_for_features('foo')
121 builder2 = self.builder_for_features('bar')
122 self.assertEqual(self.registry.lookup(), builder2)
123
124 def test_lookup_fails_when_no_tree_builders_registered(self):
125 self.assertEqual(self.registry.lookup(), None)
126
127 def test_lookup_gets_most_recent_builder_supporting_all_features(self):
128 has_one = self.builder_for_features('foo')
129 has_the_other = self.builder_for_features('bar')
130 has_both_early = self.builder_for_features('foo', 'bar', 'baz')
131 has_both_late = self.builder_for_features('foo', 'bar', 'quux')
132 lacks_one = self.builder_for_features('bar')
133 has_the_other = self.builder_for_features('foo')
134
135 # There are two builders featuring 'foo' and 'bar', but
136 # the one that also features 'quux' was registered later.
137 self.assertEqual(self.registry.lookup('foo', 'bar'),
138 has_both_late)
139
140 # There is only one builder featuring 'foo', 'bar', and 'baz'.
141 self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
142 has_both_early)
143
144 def test_lookup_fails_when_cannot_reconcile_requested_features(self):
145 builder1 = self.builder_for_features('foo', 'bar')
146 builder2 = self.builder_for_features('foo', 'baz')
147 self.assertEqual(self.registry.lookup('bar', 'baz'), None)
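
The lookup rules these deleted tests pin down can be tried directly (a sketch; which class wins the bare lookup depends on whether lxml or html5lib is installed):

    from bs4.builder import builder_registry

    # An explicit feature request: the builder must advertise every feature.
    print(builder_registry.lookup("html.parser").__name__)
    # -> HTMLParserTreeBuilder

    # A bare markup-type lookup returns the most recently registered
    # candidate; with lxml installed this is an lxml-backed class.
    print(builder_registry.lookup("html").__name__)
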
diff --git a/bitbake/lib/bs4/tests/test_docs.py b/bitbake/lib/bs4/tests/test_docs.py
deleted file mode 100644
index d1d76a33bf..0000000000
--- a/bitbake/lib/bs4/tests/test_docs.py
+++ /dev/null
@@ -1,32 +0,0 @@
1"Test harness for doctests."
2
3# pylint: disable-msg=E0611,W0142
4
5__metaclass__ = type
6__all__ = [
7 'additional_tests',
8 ]
9
10import doctest
11#from pkg_resources import (
12# resource_filename, resource_exists, resource_listdir, cleanup_resources)
13
14DOCTEST_FLAGS = (
15 doctest.ELLIPSIS |
16 doctest.NORMALIZE_WHITESPACE |
17 doctest.REPORT_NDIFF)
18
19# def additional_tests():
20# "Run the doc tests (README.txt and docs/*, if any exist)"
21# doctest_files = [
22# os.path.abspath(resource_filename('bs4', 'README.txt'))]
23# if resource_exists('bs4', 'docs'):
24# for name in resource_listdir('bs4', 'docs'):
25# if name.endswith('.txt'):
26# doctest_files.append(
27# os.path.abspath(
28# resource_filename('bs4', 'docs/%s' % name)))
29# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
30# atexit.register(cleanup_resources)
31# return unittest.TestSuite((
32# doctest.DocFileSuite(*doctest_files, **kwargs)))
diff --git a/bitbake/lib/bs4/tests/test_html5lib.py b/bitbake/lib/bs4/tests/test_html5lib.py
deleted file mode 100644
index a7494ca5ba..0000000000
--- a/bitbake/lib/bs4/tests/test_html5lib.py
+++ /dev/null
@@ -1,98 +0,0 @@
1"""Tests to ensure that the html5lib tree builder generates good trees."""
2
3import warnings
4
5try:
6 from bs4.builder import HTML5TreeBuilder
7 HTML5LIB_PRESENT = True
8except ImportError as e:
9 HTML5LIB_PRESENT = False
10from bs4.element import SoupStrainer
11from bs4.testing import (
12 HTML5TreeBuilderSmokeTest,
13 SoupTest,
14 skipIf,
15)
16
17@skipIf(
18 not HTML5LIB_PRESENT,
19 "html5lib seems not to be present, not testing its tree builder.")
20class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
21 """See ``HTML5TreeBuilderSmokeTest``."""
22
23 @property
24 def default_builder(self):
25 return HTML5TreeBuilder()
26
27 def test_soupstrainer(self):
28 # The html5lib tree builder does not support SoupStrainers.
29 strainer = SoupStrainer("b")
30 markup = "<p>A <b>bold</b> statement.</p>"
31 with warnings.catch_warnings(record=True) as w:
32 soup = self.soup(markup, parse_only=strainer)
33 self.assertEqual(
34 soup.decode(), self.document_for(markup))
35
36 self.assertTrue(
37 "the html5lib tree builder doesn't support parse_only" in
38 str(w[0].message))
39
40 def test_correctly_nested_tables(self):
41 """html5lib inserts <tbody> tags where other parsers don't."""
42 markup = ('<table id="1">'
43 '<tr>'
44 "<td>Here's another table:"
45 '<table id="2">'
46 '<tr><td>foo</td></tr>'
47 '</table></td>')
48
49 self.assertSoupEquals(
50 markup,
51 '<table id="1"><tbody><tr><td>Here\'s another table:'
52 '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
53 '</td></tr></tbody></table>')
54
55 self.assertSoupEquals(
56 "<table><thead><tr><td>Foo</td></tr></thead>"
57 "<tbody><tr><td>Bar</td></tr></tbody>"
58 "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
59
60 def test_xml_declaration_followed_by_doctype(self):
61 markup = '''<?xml version="1.0" encoding="utf-8"?>
62<!DOCTYPE html>
63<html>
64 <head>
65 </head>
66 <body>
67 <p>foo</p>
68 </body>
69</html>'''
70 soup = self.soup(markup)
71 # Verify that we can reach the <p> tag; this means the tree is connected.
72 self.assertEqual(b"<p>foo</p>", soup.p.encode())
73
74 def test_reparented_markup(self):
75 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
76 soup = self.soup(markup)
77 self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
78 self.assertEqual(2, len(soup.find_all('p')))
79
80
81 def test_reparented_markup_ends_with_whitespace(self):
82 markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
83 soup = self.soup(markup)
84 self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
85 self.assertEqual(2, len(soup.find_all('p')))
86
87 def test_processing_instruction(self):
88 """Processing instructions become comments."""
89 markup = b"""<?PITarget PIContent?>"""
90 soup = self.soup(markup)
91 assert str(soup).startswith("<!--?PITarget PIContent?-->")
92
93 def test_cloned_multivalue_node(self):
94 markup = b"""<a class="my_class"><p></a>"""
95 soup = self.soup(markup)
96 a1, a2 = soup.find_all('a')
97 self.assertEqual(a1, a2)
98 assert a1 is not a2
diff --git a/bitbake/lib/bs4/tests/test_htmlparser.py b/bitbake/lib/bs4/tests/test_htmlparser.py
deleted file mode 100644
index 30a25e6709..0000000000
--- a/bitbake/lib/bs4/tests/test_htmlparser.py
+++ /dev/null
@@ -1,31 +0,0 @@
1"""Tests to ensure that the html.parser tree builder generates good
2trees."""
3
4import pickle
5from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
6from bs4.builder import HTMLParserTreeBuilder
7
8class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
9
10 @property
11 def default_builder(self):
12 return HTMLParserTreeBuilder()
13
14 def test_namespaced_system_doctype(self):
15 # html.parser can't handle namespaced doctypes, so skip this one.
16 pass
17
18 def test_namespaced_public_doctype(self):
19 # html.parser can't handle namespaced doctypes, so skip this one.
20 pass
21
22 def test_builder_is_pickled(self):
23 """Unlike most tree builders, HTMLParserTreeBuilder and will
24 be restored after pickling.
25 """
26 tree = self.soup("<a><b>foo</a>")
27 dumped = pickle.dumps(tree, 2)
28 loaded = pickle.loads(dumped)
29 self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
30
31
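
The pickling test above is the one html.parser-specific behaviour worth keeping in mind: its tree builder survives a pickle round trip, so a parsed soup can be cached. A short sketch, assuming only the bundled bs4 and the standard library:

    import pickle
    from bs4 import BeautifulSoup

    tree = BeautifulSoup("<a><b>foo</b></a>", "html.parser")
    loaded = pickle.loads(pickle.dumps(tree, 2))
    # The builder comes back as the same type after unpickling.
    assert isinstance(loaded.builder, type(tree.builder))
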
diff --git a/bitbake/lib/bs4/tests/test_lxml.py b/bitbake/lib/bs4/tests/test_lxml.py
deleted file mode 100644
index 6b6cdd07cb..0000000000
--- a/bitbake/lib/bs4/tests/test_lxml.py
+++ /dev/null
@@ -1,70 +0,0 @@
1"""Tests to ensure that the lxml tree builder generates good trees."""
2
3import warnings
4
5try:
6 import lxml.etree
7 LXML_PRESENT = True
8 LXML_VERSION = lxml.etree.LXML_VERSION
9except ImportError as e:
10 LXML_PRESENT = False
11 LXML_VERSION = (0,)
12
13if LXML_PRESENT:
14 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
15
16from bs4 import BeautifulStoneSoup
17
18from bs4.testing import (
19 HTMLTreeBuilderSmokeTest,
20 XMLTreeBuilderSmokeTest,
21 SoupTest,
22 skipIf,
23)
24
25@skipIf(
26 not LXML_PRESENT,
27 "lxml seems not to be present, not testing its tree builder.")
28class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
29 """See ``HTMLTreeBuilderSmokeTest``."""
30
31 @property
32 def default_builder(self):
33 return LXMLTreeBuilder()
34
35 def test_out_of_range_entity(self):
36 self.assertSoupEquals(
37 "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
38 self.assertSoupEquals(
39 "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
40 self.assertSoupEquals(
41 "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
42
43 # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
44 # test if an old version of lxml is installed.
45
46 @skipIf(
47 not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
48 "Skipping doctype test for old version of lxml to avoid segfault.")
49 def test_empty_doctype(self):
50 soup = self.soup("<!DOCTYPE>")
51 doctype = soup.contents[0]
52 self.assertEqual("", doctype.strip())
53
54 def test_beautifulstonesoup_is_xml_parser(self):
55 # Make sure that the deprecated BSS class uses an xml builder
56 # if one is installed.
57 with warnings.catch_warnings(record=True) as w:
58 soup = BeautifulStoneSoup("<b />")
59 self.assertEqual("<b/>", str(soup.b))
60 self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
61
62@skipIf(
63 not LXML_PRESENT,
64 "lxml seems not to be present, not testing its XML tree builder.")
65class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
66 """See ``HTMLTreeBuilderSmokeTest``."""
67
68 @property
69 def default_builder(self):
70 return LXMLTreeBuilderForXML()
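
The guarded-import pattern used above (probe for lxml once at module level, record a version tuple, and skip whole test classes when the library is missing) keeps optional-parser tests from failing on hosts without lxml. A condensed sketch of the same pattern, assuming only the standard library:

    import unittest

    try:
        import lxml.etree
        LXML_PRESENT = True
        LXML_VERSION = lxml.etree.LXML_VERSION
    except ImportError:
        LXML_PRESENT = False
        LXML_VERSION = (0,)

    @unittest.skipIf(not LXML_PRESENT, "lxml is not installed")
    class LxmlOnlyTest(unittest.TestCase):
        def test_version_gate(self):
            # LXML_VERSION is a tuple such as (2, 3, 5, 0), so plain tuple
            # comparison is enough to skip around old-version bugs.
            if LXML_VERSION < (2, 3, 5, 0):
                self.skipTest("lxml too old; empty doctype would segfault")
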
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py
deleted file mode 100644
index 6ad3cb3765..0000000000
--- a/bitbake/lib/bs4/tests/test_soup.py
+++ /dev/null
@@ -1,479 +0,0 @@
1# -*- coding: utf-8 -*-
2"""Tests of Beautiful Soup as a whole."""
3
4import logging
5import unittest
6import sys
7import tempfile
8
9from bs4 import BeautifulSoup
10from bs4.element import (
11 CharsetMetaAttributeValue,
12 ContentMetaAttributeValue,
13 SoupStrainer,
14 NamespacedAttribute,
15 )
16import bs4.dammit
17from bs4.dammit import (
18 EntitySubstitution,
19 UnicodeDammit,
20 EncodingDetector,
21)
22from bs4.testing import (
23 SoupTest,
24 skipIf,
25)
26import warnings
27
28try:
29 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
30 LXML_PRESENT = True
31except ImportError as e:
32 LXML_PRESENT = False
33
34PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
35PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
36
37class TestConstructor(SoupTest):
38
39 def test_short_unicode_input(self):
40 data = "<h1>éé</h1>"
41 soup = self.soup(data)
42 self.assertEqual("éé", soup.h1.string)
43
44 def test_embedded_null(self):
45 data = "<h1>foo\0bar</h1>"
46 soup = self.soup(data)
47 self.assertEqual("foo\0bar", soup.h1.string)
48
49 def test_exclude_encodings(self):
50 utf8_data = "Räksmörgås".encode("utf-8")
51 soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
52 self.assertEqual("windows-1252", soup.original_encoding)
53
54
55class TestWarnings(SoupTest):
56
57 def _assert_no_parser_specified(self, s, is_there=True):
58 v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
59 self.assertTrue(v)
60
61 def test_warning_if_no_parser_specified(self):
62 with warnings.catch_warnings(record=True) as w:
63 soup = self.soup("<a><b></b></a>")
64 msg = str(w[0].message)
65 self._assert_no_parser_specified(msg)
66
67 def test_warning_if_parser_specified_too_vague(self):
68 with warnings.catch_warnings(record=True) as w:
69 soup = self.soup("<a><b></b></a>", "html")
70 msg = str(w[0].message)
71 self._assert_no_parser_specified(msg)
72
73 def test_no_warning_if_explicit_parser_specified(self):
74 with warnings.catch_warnings(record=True) as w:
75 soup = self.soup("<a><b></b></a>", "html.parser")
76 self.assertEqual([], w)
77
78 def test_parseOnlyThese_renamed_to_parse_only(self):
79 with warnings.catch_warnings(record=True) as w:
80 soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
81 msg = str(w[0].message)
82 self.assertTrue("parseOnlyThese" in msg)
83 self.assertTrue("parse_only" in msg)
84 self.assertEqual(b"<b></b>", soup.encode())
85
86 def test_fromEncoding_renamed_to_from_encoding(self):
87 with warnings.catch_warnings(record=True) as w:
88 utf8 = b"\xc3\xa9"
89 soup = self.soup(utf8, fromEncoding="utf8")
90 msg = str(w[0].message)
91 self.assertTrue("fromEncoding" in msg)
92 self.assertTrue("from_encoding" in msg)
93 self.assertEqual("utf8", soup.original_encoding)
94
95 def test_unrecognized_keyword_argument(self):
96 self.assertRaises(
97 TypeError, self.soup, "<a>", no_such_argument=True)
98
99class TestFilenameOrURLWarnings(SoupTest):
100
101 def test_disk_file_warning(self):
102 filehandle = tempfile.NamedTemporaryFile()
103 filename = filehandle.name
104 try:
105 with warnings.catch_warnings(record=True) as w:
106 soup = self.soup(filename)
107 msg = str(w[0].message)
108 self.assertTrue("looks like a filename" in msg)
109 finally:
110 filehandle.close()
111
112 # The file no longer exists, so Beautiful Soup will no longer issue the warning.
113 with warnings.catch_warnings(record=True) as w:
114 soup = self.soup(filename)
115 self.assertEqual(0, len(w))
116
117 def test_url_warning(self):
118 with warnings.catch_warnings(record=True) as w:
119 soup = self.soup("http://www.crummy.com/")
120 msg = str(w[0].message)
121 self.assertTrue("looks like a URL" in msg)
122
123 with warnings.catch_warnings(record=True) as w:
124 soup = self.soup("http://www.crummy.com/ is great")
125 self.assertEqual(0, len(w))
126
127class TestSelectiveParsing(SoupTest):
128
129 def test_parse_with_soupstrainer(self):
130 markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
131 strainer = SoupStrainer("b")
132 soup = self.soup(markup, parse_only=strainer)
133 self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
134
135
136class TestEntitySubstitution(unittest.TestCase):
137 """Standalone tests of the EntitySubstitution class."""
138 def setUp(self):
139 self.sub = EntitySubstitution
140
141 def test_simple_html_substitution(self):
142 # Unicode characters corresponding to named HTML entities
143 # are substituted, and no others.
144 s = "foo\u2200\N{SNOWMAN}\u00f5bar"
145 self.assertEqual(self.sub.substitute_html(s),
146 "foo&forall;\N{SNOWMAN}&otilde;bar")
147
148 def test_smart_quote_substitution(self):
149 # MS smart quotes are a common source of frustration, so we
150 # give them a special test.
151 quotes = b"\x91\x92foo\x93\x94"
152 dammit = UnicodeDammit(quotes)
153 self.assertEqual(self.sub.substitute_html(dammit.markup),
154 "&lsquo;&rsquo;foo&ldquo;&rdquo;")
155
156 def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
157 s = 'Welcome to "my bar"'
158 self.assertEqual(self.sub.substitute_xml(s, False), s)
159
160 def test_xml_attribute_quoting_normally_uses_double_quotes(self):
161 self.assertEqual(self.sub.substitute_xml("Welcome", True),
162 '"Welcome"')
163 self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
164 '"Bob\'s Bar"')
165
166 def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
167 s = 'Welcome to "my bar"'
168 self.assertEqual(self.sub.substitute_xml(s, True),
169 "'Welcome to \"my bar\"'")
170
171 def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
172 s = 'Welcome to "Bob\'s Bar"'
173 self.assertEqual(
174 self.sub.substitute_xml(s, True),
175 '"Welcome to &quot;Bob\'s Bar&quot;"')
176
177 def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
178 quoted = 'Welcome to "Bob\'s Bar"'
179 self.assertEqual(self.sub.substitute_xml(quoted), quoted)
180
181 def test_xml_quoting_handles_angle_brackets(self):
182 self.assertEqual(
183 self.sub.substitute_xml("foo<bar>"),
184 "foo&lt;bar&gt;")
185
186 def test_xml_quoting_handles_ampersands(self):
187 self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
188
189 def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
190 self.assertEqual(
191 self.sub.substitute_xml("&Aacute;T&T"),
192 "&amp;Aacute;T&amp;T")
193
194 def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
195 self.assertEqual(
196 self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
197 "&Aacute;T&amp;T")
198
199 def test_quotes_not_html_substituted(self):
200 """There's no need to do this except inside attribute values."""
201 text = 'Bob\'s "bar"'
202 self.assertEqual(self.sub.substitute_html(text), text)
203
204
205class TestEncodingConversion(SoupTest):
206 # Test Beautiful Soup's ability to decode and encode from various
207 # encodings.
208
209 def setUp(self):
210 super(TestEncodingConversion, self).setUp()
211 self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
212 self.utf8_data = self.unicode_data.encode("utf-8")
213 # Just so you know what it looks like.
214 self.assertEqual(
215 self.utf8_data,
216 b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
217
218 def test_ascii_in_unicode_out(self):
219 # ASCII input is converted to Unicode. The original_encoding
220 # attribute is set to 'utf-8', a superset of ASCII.
221 chardet = bs4.dammit.chardet_dammit
222 logging.disable(logging.WARNING)
223 try:
224 def noop(str):
225 return None
226 # Disable chardet, which will realize that the ASCII is ASCII.
227 bs4.dammit.chardet_dammit = noop
228 ascii = b"<foo>a</foo>"
229 soup_from_ascii = self.soup(ascii)
230 unicode_output = soup_from_ascii.decode()
231 self.assertTrue(isinstance(unicode_output, str))
232 self.assertEqual(unicode_output, self.document_for(ascii.decode()))
233 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
234 finally:
235 logging.disable(logging.NOTSET)
236 bs4.dammit.chardet_dammit = chardet
237
238 def test_unicode_in_unicode_out(self):
239 # Unicode input is left alone. The original_encoding attribute
240 # is not set.
241 soup_from_unicode = self.soup(self.unicode_data)
242 self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
243 self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
244 self.assertEqual(soup_from_unicode.original_encoding, None)
245
246 def test_utf8_in_unicode_out(self):
247 # UTF-8 input is converted to Unicode. The original_encoding
248 # attribute is set.
249 soup_from_utf8 = self.soup(self.utf8_data)
250 self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
251 self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
252
253 def test_utf8_out(self):
254 # The internal data structures can be encoded as UTF-8.
255 soup_from_unicode = self.soup(self.unicode_data)
256 self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
257
258 @skipIf(
259 PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
260 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
261 def test_attribute_name_containing_unicode_characters(self):
262 markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
263 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
264
265class TestUnicodeDammit(unittest.TestCase):
266 """Standalone tests of UnicodeDammit."""
267
268 def test_unicode_input(self):
269 markup = "I'm already Unicode! \N{SNOWMAN}"
270 dammit = UnicodeDammit(markup)
271 self.assertEqual(dammit.unicode_markup, markup)
272
273 def test_smart_quotes_to_unicode(self):
274 markup = b"<foo>\x91\x92\x93\x94</foo>"
275 dammit = UnicodeDammit(markup)
276 self.assertEqual(
277 dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
278
279 def test_smart_quotes_to_xml_entities(self):
280 markup = b"<foo>\x91\x92\x93\x94</foo>"
281 dammit = UnicodeDammit(markup, smart_quotes_to="xml")
282 self.assertEqual(
283 dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
284
285 def test_smart_quotes_to_html_entities(self):
286 markup = b"<foo>\x91\x92\x93\x94</foo>"
287 dammit = UnicodeDammit(markup, smart_quotes_to="html")
288 self.assertEqual(
289 dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
290
291 def test_smart_quotes_to_ascii(self):
292 markup = b"<foo>\x91\x92\x93\x94</foo>"
293 dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
294 self.assertEqual(
295 dammit.unicode_markup, """<foo>''""</foo>""")
296
297 def test_detect_utf8(self):
298 utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
299 dammit = UnicodeDammit(utf8)
300 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
301 self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
302
303
304 def test_convert_hebrew(self):
305 hebrew = b"\xed\xe5\xec\xf9"
306 dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
307 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
308 self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
309
310 def test_dont_see_smart_quotes_where_there_are_none(self):
311 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
312 dammit = UnicodeDammit(utf_8)
313 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
314 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
315
316 def test_ignore_inappropriate_codecs(self):
317 utf8_data = "Räksmörgås".encode("utf-8")
318 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
319 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
320
321 def test_ignore_invalid_codecs(self):
322 utf8_data = "Räksmörgås".encode("utf-8")
323 for bad_encoding in ['.utf8', '...', 'utF---16.!']:
324 dammit = UnicodeDammit(utf8_data, [bad_encoding])
325 self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
326
327 def test_exclude_encodings(self):
328 # This is UTF-8.
329 utf8_data = "Räksmörgås".encode("utf-8")
330
331 # But if we exclude UTF-8 from consideration, the guess is
332 # Windows-1252.
333 dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
334 self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
335
336 # And if we exclude that, there is no valid guess at all.
337 dammit = UnicodeDammit(
338 utf8_data, exclude_encodings=["utf-8", "windows-1252"])
339 self.assertEqual(dammit.original_encoding, None)
340
341 def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
342 detected = EncodingDetector(
343 b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
344 encodings = list(detected.encodings)
345 assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
346
347 def test_detect_html5_style_meta_tag(self):
348
349 for data in (
350 b'<html><meta charset="euc-jp" /></html>',
351 b"<html><meta charset='euc-jp' /></html>",
352 b"<html><meta charset=euc-jp /></html>",
353 b"<html><meta charset=euc-jp/></html>"):
354 dammit = UnicodeDammit(data, is_html=True)
355 self.assertEqual(
356 "euc-jp", dammit.original_encoding)
357
358 def test_last_ditch_entity_replacement(self):
359 # This is a UTF-8 document that contains bytestrings
360 # completely incompatible with UTF-8 (ie. encoded with some other
361 # encoding).
362 #
363 # Since there is no consistent encoding for the document,
364 # Unicode, Dammit will eventually encode the document as UTF-8
365 # and encode the incompatible characters as REPLACEMENT
366 # CHARACTER.
367 #
368 # If chardet is installed, it will detect that the document
369 # can be converted into ISO-8859-1 without errors. This happens
370 # to be the wrong encoding, but it is a consistent encoding, so the
371 # code we're testing here won't run.
372 #
373 # So we temporarily disable chardet if it's present.
374 doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
375<html><b>\330\250\330\252\330\261</b>
376<i>\310\322\321\220\312\321\355\344</i></html>"""
377 chardet = bs4.dammit.chardet_dammit
378 logging.disable(logging.WARNING)
379 try:
380 def noop(str):
381 return None
382 bs4.dammit.chardet_dammit = noop
383 dammit = UnicodeDammit(doc)
384 self.assertEqual(True, dammit.contains_replacement_characters)
385 self.assertTrue("\ufffd" in dammit.unicode_markup)
386
387 soup = BeautifulSoup(doc, "html.parser")
388 self.assertTrue(soup.contains_replacement_characters)
389 finally:
390 logging.disable(logging.NOTSET)
391 bs4.dammit.chardet_dammit = chardet
392
393 def test_byte_order_mark_removed(self):
394 # A document written in UTF-16LE will have its byte order marker stripped.
395 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
396 dammit = UnicodeDammit(data)
397 self.assertEqual("<a>áé</a>", dammit.unicode_markup)
398 self.assertEqual("utf-16le", dammit.original_encoding)
399
400 def test_detwingle(self):
401 # Here's a UTF8 document.
402 utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
403
404 # Here's a Windows-1252 document.
405 windows_1252 = (
406 "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
407 "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
408
409 # Through some unholy alchemy, they've been stuck together.
410 doc = utf8 + windows_1252 + utf8
411
412 # The document can't be turned into UTF-8:
413 self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
414
415 # Unicode, Dammit thinks the whole document is Windows-1252,
416 # and decodes it into "☃☃☃“Hi, I like Windows!â€â˜ƒâ˜ƒâ˜ƒ"
417
418 # But if we run it through fix_embedded_windows_1252, it's fixed:
419
420 fixed = UnicodeDammit.detwingle(doc)
421 self.assertEqual(
422 "☃☃☃“Hi, I like Windows!â€â˜ƒâ˜ƒâ˜ƒ", fixed.decode("utf8"))
423
424 def test_detwingle_ignores_multibyte_characters(self):
425 # Each of these characters has a UTF-8 representation ending
426 # in \x93. \x93 is a smart quote if interpreted as
427 # Windows-1252. But our code knows to skip over multibyte
428 # UTF-8 characters, so they'll survive the process unscathed.
429 for tricky_unicode_char in (
430 "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
431 "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
432 "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
433 ):
434 input = tricky_unicode_char.encode("utf8")
435 self.assertTrue(input.endswith(b'\x93'))
436 output = UnicodeDammit.detwingle(input)
437 self.assertEqual(output, input)
438
439class TestNamespacedAttribute(SoupTest):
440
441 def test_name_may_be_none(self):
442 a = NamespacedAttribute("xmlns", None)
443 self.assertEqual(a, "xmlns")
444
445 def test_attribute_is_equivalent_to_colon_separated_string(self):
446 a = NamespacedAttribute("a", "b")
447 self.assertEqual("a:b", a)
448
449 def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
450 a = NamespacedAttribute("a", "b", "c")
451 b = NamespacedAttribute("a", "b", "c")
452 self.assertEqual(a, b)
453
454 # The actual namespace is not considered.
455 c = NamespacedAttribute("a", "b", None)
456 self.assertEqual(a, c)
457
458 # But name and prefix are important.
459 d = NamespacedAttribute("a", "z", "c")
460 self.assertNotEqual(a, d)
461
462 e = NamespacedAttribute("z", "b", "c")
463 self.assertNotEqual(a, e)
464
465
466class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
467
468 def test_charset_meta_attribute_value(self):
469 value = CharsetMetaAttributeValue("euc-jp")
470 self.assertEqual("euc-jp", value)
471 self.assertEqual("euc-jp", value.original_value)
472 self.assertEqual("utf8", value.encode("utf8"))
473
474
475 def test_content_meta_attribute_value(self):
476 value = ContentMetaAttributeValue("text/html; charset=euc-jp")
477 self.assertEqual("text/html; charset=euc-jp", value)
478 self.assertEqual("text/html; charset=euc-jp", value.original_value)
479 self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/bitbake/lib/bs4/tests/test_tree.py b/bitbake/lib/bs4/tests/test_tree.py
deleted file mode 100644
index cf0f1abe0c..0000000000
--- a/bitbake/lib/bs4/tests/test_tree.py
+++ /dev/null
@@ -1,2004 +0,0 @@
1# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12import copy
13import pickle
14import re
15import warnings
16from bs4 import BeautifulSoup
17from bs4.builder import builder_registry
18from bs4.element import (
19 PY3K,
20 CData,
21 Comment,
22 Declaration,
23 Doctype,
24 NavigableString,
25 SoupStrainer,
26 Tag,
27)
28from bs4.testing import SoupTest
29
30XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
31LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
32
33class TreeTest(SoupTest):
34
35 def assertSelects(self, tags, should_match):
36 """Make sure that the given tags have the correct text.
37
38 This is used in tests that define a bunch of tags, each
39 containing a single string, and then select certain strings by
40 some mechanism.
41 """
42 self.assertEqual([tag.string for tag in tags], should_match)
43
44 def assertSelectsIDs(self, tags, should_match):
45 """Make sure that the given tags have the correct IDs.
46
47 This is used in tests that define a bunch of tags, each
48 containing a single string, and then select certain strings by
49 some mechanism.
50 """
51 self.assertEqual([tag['id'] for tag in tags], should_match)
52
53
54class TestFind(TreeTest):
55 """Basic tests of the find() method.
56
57 find() just calls find_all() with limit=1, so it's not tested all
58 that thoroughly here.
59 """
60
61 def test_find_tag(self):
62 soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
63 self.assertEqual(soup.find("b").string, "2")
64
65 def test_unicode_text_find(self):
66 soup = self.soup('<h1>Räksmörgås</h1>')
67 self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
68
69 def test_unicode_attribute_find(self):
70 soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
71 str(soup)
72 self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
73
74
75 def test_find_everything(self):
76 """Test an optimization that finds all tags."""
77 soup = self.soup("<a>foo</a><b>bar</b>")
78 self.assertEqual(2, len(soup.find_all()))
79
80 def test_find_everything_with_name(self):
81 """Test an optimization that finds all tags with a given name."""
82 soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
83 self.assertEqual(2, len(soup.find_all('a')))
84
85class TestFindAll(TreeTest):
86 """Basic tests of the find_all() method."""
87
88 def test_find_all_text_nodes(self):
89 """You can search the tree for text nodes."""
90 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
91 # Exact match.
92 self.assertEqual(soup.find_all(string="bar"), ["bar"])
93 self.assertEqual(soup.find_all(text="bar"), ["bar"])
94 # Match any of a number of strings.
95 self.assertEqual(
96 soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
97 # Match a regular expression.
98 self.assertEqual(soup.find_all(text=re.compile('.*')),
99 ["Foo", "bar", '\xbb'])
100 # Match anything.
101 self.assertEqual(soup.find_all(text=True),
102 ["Foo", "bar", '\xbb'])
103
104 def test_find_all_limit(self):
105 """You can limit the number of items returned by find_all."""
106 soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
107 self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
108 self.assertSelects(soup.find_all('a', limit=1), ["1"])
109 self.assertSelects(
110 soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
111
112 # A limit of 0 means no limit.
113 self.assertSelects(
114 soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
115
116 def test_calling_a_tag_is_calling_findall(self):
117 soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
118 self.assertSelects(soup('a', limit=1), ["1"])
119 self.assertSelects(soup.b(id="foo"), ["3"])
120
121 def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
122 soup = self.soup("<a></a>")
123 # Create a self-referential list.
124 l = []
125 l.append(l)
126
127 # Without special code in _normalize_search_value, this would cause infinite
128 # recursion.
129 self.assertEqual([], soup.find_all(l))
130
131 def test_find_all_resultset(self):
132 """All find_all calls return a ResultSet"""
133 soup = self.soup("<a></a>")
134 result = soup.find_all("a")
135 self.assertTrue(hasattr(result, "source"))
136
137 result = soup.find_all(True)
138 self.assertTrue(hasattr(result, "source"))
139
140 result = soup.find_all(text="foo")
141 self.assertTrue(hasattr(result, "source"))
142
143
144class TestFindAllBasicNamespaces(TreeTest):
145
146 def test_find_by_namespaced_name(self):
147 soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
148 self.assertEqual("4", soup.find("mathml:msqrt").string)
149 self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
150
151
152class TestFindAllByName(TreeTest):
153 """Test ways of finding tags by tag name."""
154
155 def setUp(self):
156 super(TestFindAllByName, self).setUp()
157 self.tree = self.soup("""<a>First tag.</a>
158 <b>Second tag.</b>
159 <c>Third <a>Nested tag.</a> tag.</c>""")
160
161 def test_find_all_by_tag_name(self):
162 # Find all the <a> tags.
163 self.assertSelects(
164 self.tree.find_all('a'), ['First tag.', 'Nested tag.'])
165
166 def test_find_all_by_name_and_text(self):
167 self.assertSelects(
168 self.tree.find_all('a', text='First tag.'), ['First tag.'])
169
170 self.assertSelects(
171 self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])
172
173 self.assertSelects(
174 self.tree.find_all('a', text=re.compile("tag")),
175 ['First tag.', 'Nested tag.'])
176
177
178 def test_find_all_on_non_root_element(self):
179 # You can call find_all on any node, not just the root.
180 self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])
181
182 def test_calling_element_invokes_find_all(self):
183 self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])
184
185 def test_find_all_by_tag_strainer(self):
186 self.assertSelects(
187 self.tree.find_all(SoupStrainer('a')),
188 ['First tag.', 'Nested tag.'])
189
190 def test_find_all_by_tag_names(self):
191 self.assertSelects(
192 self.tree.find_all(['a', 'b']),
193 ['First tag.', 'Second tag.', 'Nested tag.'])
194
195 def test_find_all_by_tag_dict(self):
196 self.assertSelects(
197 self.tree.find_all({'a' : True, 'b' : True}),
198 ['First tag.', 'Second tag.', 'Nested tag.'])
199
200 def test_find_all_by_tag_re(self):
201 self.assertSelects(
202 self.tree.find_all(re.compile('^[ab]$')),
203 ['First tag.', 'Second tag.', 'Nested tag.'])
204
205 def test_find_all_with_tags_matching_method(self):
206 # You can define an oracle method that determines whether
207 # a tag matches the search.
208 def id_matches_name(tag):
209 return tag.name == tag.get('id')
210
211 tree = self.soup("""<a id="a">Match 1.</a>
212 <a id="1">Does not match.</a>
213 <b id="b">Match 2.</a>""")
214
215 self.assertSelects(
216 tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
217
218
219class TestFindAllByAttribute(TreeTest):
220
221 def test_find_all_by_attribute_name(self):
222 # You can pass in keyword arguments to find_all to search by
223 # attribute.
224 tree = self.soup("""
225 <a id="first">Matching a.</a>
226 <a id="second">
227 Non-matching <b id="first">Matching b.</b>a.
228 </a>""")
229 self.assertSelects(tree.find_all(id='first'),
230 ["Matching a.", "Matching b."])
231
232 def test_find_all_by_utf8_attribute_value(self):
233 peace = "×ולש".encode("utf8")
234 data = '<a title="×ולש"></a>'.encode("utf8")
235 soup = self.soup(data)
236 self.assertEqual([soup.a], soup.find_all(title=peace))
237 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
238 self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))
239
240 def test_find_all_by_attribute_dict(self):
241 # You can pass in a dictionary as the argument 'attrs'. This
242 # lets you search for attributes like 'name' (a fixed argument
243 # to find_all) and 'class' (a reserved word in Python.)
244 tree = self.soup("""
245 <a name="name1" class="class1">Name match.</a>
246 <a name="name2" class="class2">Class match.</a>
247 <a name="name3" class="class3">Non-match.</a>
248 <name1>A tag called 'name1'.</name1>
249 """)
250
251 # This doesn't do what you want.
252 self.assertSelects(tree.find_all(name='name1'),
253 ["A tag called 'name1'."])
254 # This does what you want.
255 self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
256 ["Name match."])
257
258 self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
259 ["Class match."])
260
261 def test_find_all_by_class(self):
262 tree = self.soup("""
263 <a class="1">Class 1.</a>
264 <a class="2">Class 2.</a>
265 <b class="1">Class 1.</b>
266 <c class="3 4">Class 3 and 4.</c>
267 """)
268
269 # Passing in the class_ keyword argument will search against
270 # the 'class' attribute.
271 self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
272 self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
273 self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
274
275 # Passing in a string to 'attrs' will also search the CSS class.
276 self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
277 self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
278 self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
279 self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
280
281 def test_find_by_class_when_multiple_classes_present(self):
282 tree = self.soup("<gar class='foo bar'>Found it</gar>")
283
284 f = tree.find_all("gar", class_=re.compile("o"))
285 self.assertSelects(f, ["Found it"])
286
287 f = tree.find_all("gar", class_=re.compile("a"))
288 self.assertSelects(f, ["Found it"])
289
290 # Since the class is not the string "foo bar", but the two
291 # strings "foo" and "bar", this will not find anything.
292 f = tree.find_all("gar", class_=re.compile("o b"))
293 self.assertSelects(f, [])
294
295 def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
296 soup = self.soup("<a class='bar'>Found it</a>")
297
298 self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
299
300 def big_attribute_value(value):
301 return len(value) > 3
302
303 self.assertSelects(soup.find_all("a", big_attribute_value), [])
304
305 def small_attribute_value(value):
306 return len(value) <= 3
307
308 self.assertSelects(
309 soup.find_all("a", small_attribute_value), ["Found it"])
310
311 def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
312 soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
313 a, a2 = soup.find_all("a")
314 self.assertEqual([a, a2], soup.find_all("a", "foo"))
315 self.assertEqual([a], soup.find_all("a", "bar"))
316
317 # If you specify the class as a string that contains a
318 # space, only that specific value will be found.
319 self.assertEqual([a], soup.find_all("a", class_="foo bar"))
320 self.assertEqual([a], soup.find_all("a", "foo bar"))
321 self.assertEqual([], soup.find_all("a", "bar foo"))
322
323 def test_find_all_by_attribute_soupstrainer(self):
324 tree = self.soup("""
325 <a id="first">Match.</a>
326 <a id="second">Non-match.</a>""")
327
328 strainer = SoupStrainer(attrs={'id' : 'first'})
329 self.assertSelects(tree.find_all(strainer), ['Match.'])
330
331 def test_find_all_with_missing_attribute(self):
332 # You can pass in None as the value of an attribute to find_all.
333 # This will match tags that do not have that attribute set.
334 tree = self.soup("""<a id="1">ID present.</a>
335 <a>No ID present.</a>
336 <a id="">ID is empty.</a>""")
337 self.assertSelects(tree.find_all('a', id=None), ["No ID present."])
338
339 def test_find_all_with_defined_attribute(self):
340 # You can pass in None as the value of an attribute to find_all.
341 # This will match tags that have that attribute set to any value.
342 tree = self.soup("""<a id="1">ID present.</a>
343 <a>No ID present.</a>
344 <a id="">ID is empty.</a>""")
345 self.assertSelects(
346 tree.find_all(id=True), ["ID present.", "ID is empty."])
347
348 def test_find_all_with_numeric_attribute(self):
349 # If you search for a number, it's treated as a string.
350 tree = self.soup("""<a id=1>Unquoted attribute.</a>
351 <a id="1">Quoted attribute.</a>""")
352
353 expected = ["Unquoted attribute.", "Quoted attribute."]
354 self.assertSelects(tree.find_all(id=1), expected)
355 self.assertSelects(tree.find_all(id="1"), expected)
356
357 def test_find_all_with_list_attribute_values(self):
358 # You can pass a list of attribute values instead of just one,
359 # and you'll get tags that match any of the values.
360 tree = self.soup("""<a id="1">1</a>
361 <a id="2">2</a>
362 <a id="3">3</a>
363 <a>No ID.</a>""")
364 self.assertSelects(tree.find_all(id=["1", "3", "4"]),
365 ["1", "3"])
366
367 def test_find_all_with_regular_expression_attribute_value(self):
368 # You can pass a regular expression as an attribute value, and
369 # you'll get tags whose values for that attribute match the
370 # regular expression.
371 tree = self.soup("""<a id="a">One a.</a>
372 <a id="aa">Two as.</a>
373 <a id="ab">Mixed as and bs.</a>
374 <a id="b">One b.</a>
375 <a>No ID.</a>""")
376
377 self.assertSelects(tree.find_all(id=re.compile("^a+$")),
378 ["One a.", "Two as."])
379
380 def test_find_by_name_and_containing_string(self):
381 soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
382 a = soup.a
383
384 self.assertEqual([a], soup.find_all("a", text="foo"))
385 self.assertEqual([], soup.find_all("a", text="bar"))
386
387
388 def test_find_by_name_and_containing_string_when_string_is_buried(self):
389 soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
390 self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))
391
392 def test_find_by_attribute_and_containing_string(self):
393 soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
394 a = soup.a
395
396 self.assertEqual([a], soup.find_all(id=2, text="foo"))
397 self.assertEqual([], soup.find_all(id=1, text="bar"))
398
399
400
401
402class TestIndex(TreeTest):
403 """Test Tag.index"""
404 def test_index(self):
405 tree = self.soup("""<div>
406 <a>Identical</a>
407 <b>Not identical</b>
408 <a>Identical</a>
409
410 <c><d>Identical with child</d></c>
411 <b>Also not identical</b>
412 <c><d>Identical with child</d></c>
413 </div>""")
414 div = tree.div
415 for i, element in enumerate(div.contents):
416 self.assertEqual(i, div.index(element))
417 self.assertRaises(ValueError, tree.index, 1)
418
419
420class TestParentOperations(TreeTest):
421 """Test navigation and searching through an element's parents."""
422
423 def setUp(self):
424 super(TestParentOperations, self).setUp()
425 self.tree = self.soup('''<ul id="empty"></ul>
426 <ul id="top">
427 <ul id="middle">
428 <ul id="bottom">
429 <b>Start here</b>
430 </ul>
431 </ul>''')
432 self.start = self.tree.b
433
434
435 def test_parent(self):
436 self.assertEqual(self.start.parent['id'], 'bottom')
437 self.assertEqual(self.start.parent.parent['id'], 'middle')
438 self.assertEqual(self.start.parent.parent.parent['id'], 'top')
439
440 def test_parent_of_top_tag_is_soup_object(self):
441 top_tag = self.tree.contents[0]
442 self.assertEqual(top_tag.parent, self.tree)
443
444 def test_soup_object_has_no_parent(self):
445 self.assertEqual(None, self.tree.parent)
446
447 def test_find_parents(self):
448 self.assertSelectsIDs(
449 self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
450 self.assertSelectsIDs(
451 self.start.find_parents('ul', id="middle"), ['middle'])
452
453 def test_find_parent(self):
454 self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
455 self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
456
457 def test_parent_of_text_element(self):
458 text = self.tree.find(text="Start here")
459 self.assertEqual(text.parent.name, 'b')
460
461 def test_text_element_find_parent(self):
462 text = self.tree.find(text="Start here")
463 self.assertEqual(text.find_parent('ul')['id'], 'bottom')
464
465 def test_parent_generator(self):
466 parents = [parent['id'] for parent in self.start.parents
467 if parent is not None and 'id' in parent.attrs]
468 self.assertEqual(parents, ['bottom', 'middle', 'top'])
469
470
471class ProximityTest(TreeTest):
472
473 def setUp(self):
474 super(ProximityTest, self).setUp()
475 self.tree = self.soup(
476 '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
477
478
479class TestNextOperations(ProximityTest):
480
481 def setUp(self):
482 super(TestNextOperations, self).setUp()
483 self.start = self.tree.b
484
485 def test_next(self):
486 self.assertEqual(self.start.next_element, "One")
487 self.assertEqual(self.start.next_element.next_element['id'], "2")
488
489 def test_next_of_last_item_is_none(self):
490 last = self.tree.find(text="Three")
491 self.assertEqual(last.next_element, None)
492
493 def test_next_of_root_is_none(self):
494 # The document root is outside the next/previous chain.
495 self.assertEqual(self.tree.next_element, None)
496
497 def test_find_all_next(self):
498 self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
499 self.start.find_all_next(id=3)
500 self.assertSelects(self.start.find_all_next(id=3), ["Three"])
501
502 def test_find_next(self):
503 self.assertEqual(self.start.find_next('b')['id'], '2')
504 self.assertEqual(self.start.find_next(text="Three"), "Three")
505
506 def test_find_next_for_text_element(self):
507 text = self.tree.find(text="One")
508 self.assertEqual(text.find_next("b").string, "Two")
509 self.assertSelects(text.find_all_next("b"), ["Two", "Three"])
510
511 def test_next_generator(self):
512 start = self.tree.find(text="Two")
513 successors = [node for node in start.next_elements]
514 # There are two successors: the final <b> tag and its text contents.
515 tag, contents = successors
516 self.assertEqual(tag['id'], '3')
517 self.assertEqual(contents, "Three")
518
519class TestPreviousOperations(ProximityTest):
520
521 def setUp(self):
522 super(TestPreviousOperations, self).setUp()
523 self.end = self.tree.find(text="Three")
524
525 def test_previous(self):
526 self.assertEqual(self.end.previous_element['id'], "3")
527 self.assertEqual(self.end.previous_element.previous_element, "Two")
528
529 def test_previous_of_first_item_is_none(self):
530 first = self.tree.find('html')
531 self.assertEqual(first.previous_element, None)
532
533 def test_previous_of_root_is_none(self):
534 # The document root is outside the next/previous chain.
535 # XXX This is broken!
536 #self.assertEqual(self.tree.previous_element, None)
537 pass
538
539 def test_find_all_previous(self):
540 # The <b> tag containing the "Three" node is the predecessor
541 # of the "Three" node itself, which is why "Three" shows up
542 # here.
543 self.assertSelects(
544 self.end.find_all_previous('b'), ["Three", "Two", "One"])
545 self.assertSelects(self.end.find_all_previous(id=1), ["One"])
546
547 def test_find_previous(self):
548 self.assertEqual(self.end.find_previous('b')['id'], '3')
549 self.assertEqual(self.end.find_previous(text="One"), "One")
550
551 def test_find_previous_for_text_element(self):
552 text = self.tree.find(text="Three")
553 self.assertEqual(text.find_previous("b").string, "Three")
554 self.assertSelects(
555 text.find_all_previous("b"), ["Three", "Two", "One"])
556
557 def test_previous_generator(self):
558 start = self.tree.find(text="One")
559 predecessors = [node for node in start.previous_elements]
560
561 # There are four predecessors: the <b> tag containing "One"
562 # the <body> tag, the <head> tag, and the <html> tag.
563 b, body, head, html = predecessors
564 self.assertEqual(b['id'], '1')
565 self.assertEqual(body.name, "body")
566 self.assertEqual(head.name, "head")
567 self.assertEqual(html.name, "html")
568
569
570class SiblingTest(TreeTest):
571
572 def setUp(self):
573 super(SiblingTest, self).setUp()
574 markup = '''<html>
575 <span id="1">
576 <span id="1.1"></span>
577 </span>
578 <span id="2">
579 <span id="2.1"></span>
580 </span>
581 <span id="3">
582 <span id="3.1"></span>
583 </span>
584 <span id="4"></span>
585 </html>'''
586 # All that whitespace looks good but makes the tests more
587 # difficult. Get rid of it.
588 markup = re.compile(r"\n\s*").sub("", markup)
589 self.tree = self.soup(markup)
590
591
592class TestNextSibling(SiblingTest):
593
594 def setUp(self):
595 super(TestNextSibling, self).setUp()
596 self.start = self.tree.find(id="1")
597
598 def test_next_sibling_of_root_is_none(self):
599 self.assertEqual(self.tree.next_sibling, None)
600
601 def test_next_sibling(self):
602 self.assertEqual(self.start.next_sibling['id'], '2')
603 self.assertEqual(self.start.next_sibling.next_sibling['id'], '3')
604
605 # Note the difference between next_sibling and next_element.
606 self.assertEqual(self.start.next_element['id'], '1.1')
607
608 def test_next_sibling_may_not_exist(self):
609 self.assertEqual(self.tree.html.next_sibling, None)
610
611 nested_span = self.tree.find(id="1.1")
612 self.assertEqual(nested_span.next_sibling, None)
613
614 last_span = self.tree.find(id="4")
615 self.assertEqual(last_span.next_sibling, None)
616
617 def test_find_next_sibling(self):
618 self.assertEqual(self.start.find_next_sibling('span')['id'], '2')
619
620 def test_next_siblings(self):
621 self.assertSelectsIDs(self.start.find_next_siblings("span"),
622 ['2', '3', '4'])
623
624 self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])
625
626 def test_next_sibling_for_text_element(self):
627 soup = self.soup("Foo<b>bar</b>baz")
628 start = soup.find(text="Foo")
629 self.assertEqual(start.next_sibling.name, 'b')
630 self.assertEqual(start.next_sibling.next_sibling, 'baz')
631
632 self.assertSelects(start.find_next_siblings('b'), ['bar'])
633 self.assertEqual(start.find_next_sibling(text="baz"), "baz")
634 self.assertEqual(start.find_next_sibling(text="nonesuch"), None)
635
636
637class TestPreviousSibling(SiblingTest):
638
639 def setUp(self):
640 super(TestPreviousSibling, self).setUp()
641 self.end = self.tree.find(id="4")
642
643 def test_previous_sibling_of_root_is_none(self):
644 self.assertEqual(self.tree.previous_sibling, None)
645
646 def test_previous_sibling(self):
647 self.assertEqual(self.end.previous_sibling['id'], '3')
648 self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2')
649
650 # Note the difference between previous_sibling and previous_element.
651 self.assertEqual(self.end.previous_element['id'], '3.1')
652
653 def test_previous_sibling_may_not_exist(self):
654 self.assertEqual(self.tree.html.previous_sibling, None)
655
656 nested_span = self.tree.find(id="1.1")
657 self.assertEqual(nested_span.previous_sibling, None)
658
659 first_span = self.tree.find(id="1")
660 self.assertEqual(first_span.previous_sibling, None)
661
662 def test_find_previous_sibling(self):
663 self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')
664
665 def test_previous_siblings(self):
666 self.assertSelectsIDs(self.end.find_previous_siblings("span"),
667 ['3', '2', '1'])
668
669 self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])
670
671 def test_previous_sibling_for_text_element(self):
672 soup = self.soup("Foo<b>bar</b>baz")
673 start = soup.find(text="baz")
674 self.assertEqual(start.previous_sibling.name, 'b')
675 self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')
676
677 self.assertSelects(start.find_previous_siblings('b'), ['bar'])
678 self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
679 self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)
680
681
682class TestTagCreation(SoupTest):
683 """Test the ability to create new tags."""
684 def test_new_tag(self):
685 soup = self.soup("")
686 new_tag = soup.new_tag("foo", bar="baz")
687 self.assertTrue(isinstance(new_tag, Tag))
688 self.assertEqual("foo", new_tag.name)
689 self.assertEqual(dict(bar="baz"), new_tag.attrs)
690 self.assertEqual(None, new_tag.parent)
691
692 def test_tag_inherits_self_closing_rules_from_builder(self):
693 if XML_BUILDER_PRESENT:
694 xml_soup = BeautifulSoup("", "lxml-xml")
695 xml_br = xml_soup.new_tag("br")
696 xml_p = xml_soup.new_tag("p")
697
698 # Both the <br> and <p> tag are empty-element, just because
699 # they have no contents.
700 self.assertEqual(b"<br/>", xml_br.encode())
701 self.assertEqual(b"<p/>", xml_p.encode())
702
703 html_soup = BeautifulSoup("", "html.parser")
704 html_br = html_soup.new_tag("br")
705 html_p = html_soup.new_tag("p")
706
707 # The HTML builder uses HTML's rules about which tags are
708 # empty-element tags, and the new tags reflect these rules.
709 self.assertEqual(b"<br/>", html_br.encode())
710 self.assertEqual(b"<p></p>", html_p.encode())
711
712 def test_new_string_creates_navigablestring(self):
713 soup = self.soup("")
714 s = soup.new_string("foo")
715 self.assertEqual("foo", s)
716 self.assertTrue(isinstance(s, NavigableString))
717
718 def test_new_string_can_create_navigablestring_subclass(self):
719 soup = self.soup("")
720 s = soup.new_string("foo", Comment)
721 self.assertEqual("foo", s)
722 self.assertTrue(isinstance(s, Comment))
723
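# A minimal sketch of the factory API exercised by TestTagCreation above,
# assuming the html.parser builder; detached nodes are created through the
# soup so they inherit its builder rules:
#
#     soup = BeautifulSoup("", "html.parser")
#     tag = soup.new_tag("a", href="http://example.com/")  # illustrative URL
#     tag.append(soup.new_string("link text"))
#     assert str(tag) == '<a href="http://example.com/">link text</a>'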
724class TestTreeModification(SoupTest):
725
726 def test_attribute_modification(self):
727 soup = self.soup('<a id="1"></a>')
728 soup.a['id'] = 2
729 self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
730 del(soup.a['id'])
731 self.assertEqual(soup.decode(), self.document_for('<a></a>'))
732 soup.a['id2'] = 'foo'
733 self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
734
735 def test_new_tag_creation(self):
736 builder = builder_registry.lookup('html')()
737 soup = self.soup("<body></body>", builder=builder)
738 a = Tag(soup, builder, 'a')
739 ol = Tag(soup, builder, 'ol')
740 a['href'] = 'http://foo.com/'
741 soup.body.insert(0, a)
742 soup.body.insert(1, ol)
743 self.assertEqual(
744 soup.body.encode(),
745 b'<body><a href="http://foo.com/"></a><ol></ol></body>')
746
747 def test_append_to_contents_moves_tag(self):
748 doc = """<p id="1">Don't leave me <b>here</b>.</p>
749 <p id="2">Don\'t leave!</p>"""
750 soup = self.soup(doc)
751 second_para = soup.find(id='2')
752 bold = soup.b
753
754 # Move the <b> tag to the end of the second paragraph.
755 soup.find(id='2').append(soup.b)
756
757 # The <b> tag is now a child of the second paragraph.
758 self.assertEqual(bold.parent, second_para)
759
760 self.assertEqual(
761 soup.decode(), self.document_for(
762 '<p id="1">Don\'t leave me .</p>\n'
763 '<p id="2">Don\'t leave!<b>here</b></p>'))
764
765 def test_replace_with_returns_thing_that_was_replaced(self):
766 text = "<a></a><b><c></c></b>"
767 soup = self.soup(text)
768 a = soup.a
769 new_a = a.replace_with(soup.c)
770 self.assertEqual(a, new_a)
771
772 def test_unwrap_returns_thing_that_was_replaced(self):
773 text = "<a><b></b><c></c></a>"
774 soup = self.soup(text)
775 a = soup.a
776 new_a = a.unwrap()
777 self.assertEqual(a, new_a)
778
779 def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
780 soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
781 a = soup.a
782 a.extract()
783 self.assertEqual(None, a.parent)
784 self.assertRaises(ValueError, a.unwrap)
785 self.assertRaises(ValueError, a.replace_with, soup.c)
786
787 def test_replace_tag_with_itself(self):
788 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
789 soup = self.soup(text)
790 c = soup.c
791 soup.c.replace_with(c)
792 self.assertEqual(soup.decode(), self.document_for(text))
793
794 def test_replace_tag_with_its_parent_raises_exception(self):
795 text = "<a><b></b></a>"
796 soup = self.soup(text)
797 self.assertRaises(ValueError, soup.b.replace_with, soup.a)
798
799 def test_insert_tag_into_itself_raises_exception(self):
800 text = "<a><b></b></a>"
801 soup = self.soup(text)
802 self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
803
804 def test_replace_with_maintains_next_element_throughout(self):
805 soup = self.soup('<p><a>one</a><b>three</b></p>')
806 a = soup.a
807 b = a.contents[0]
808 # Make it so the <a> tag has two text children.
809 a.insert(1, "two")
810
811 # Now replace each one with the empty string.
812 left, right = a.contents
813 left.replaceWith('')
814 right.replaceWith('')
815
816 # The <b> tag is still connected to the tree.
817 self.assertEqual("three", soup.b.string)
818
819 def test_replace_final_node(self):
820 soup = self.soup("<b>Argh!</b>")
821 soup.find(text="Argh!").replace_with("Hooray!")
822 new_text = soup.find(text="Hooray!")
823 b = soup.b
824 self.assertEqual(new_text.previous_element, b)
825 self.assertEqual(new_text.parent, b)
826 self.assertEqual(new_text.previous_element.next_element, new_text)
827 self.assertEqual(new_text.next_element, None)
828
829 def test_consecutive_text_nodes(self):
830 # A builder should never create two consecutive text nodes,
831 # but if you insert one next to another, Beautiful Soup will
832 # handle it correctly.
833 soup = self.soup("<a><b>Argh!</b><c></c></a>")
834 soup.b.insert(1, "Hooray!")
835
836 self.assertEqual(
837 soup.decode(), self.document_for(
838 "<a><b>Argh!Hooray!</b><c></c></a>"))
839
840 new_text = soup.find(text="Hooray!")
841 self.assertEqual(new_text.previous_element, "Argh!")
842 self.assertEqual(new_text.previous_element.next_element, new_text)
843
844 self.assertEqual(new_text.previous_sibling, "Argh!")
845 self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
846
847 self.assertEqual(new_text.next_sibling, None)
848 self.assertEqual(new_text.next_element, soup.c)
849
850 def test_insert_string(self):
851 soup = self.soup("<a></a>")
852 soup.a.insert(0, "bar")
853 soup.a.insert(0, "foo")
854 # The strings were added to the tag.
855 self.assertEqual(["foo", "bar"], soup.a.contents)
856 # And they were converted to NavigableStrings.
857 self.assertEqual(soup.a.contents[0].next_element, "bar")
858
859 def test_insert_tag(self):
860 builder = self.default_builder
861 soup = self.soup(
862 "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
863 magic_tag = Tag(soup, builder, 'magictag')
864 magic_tag.insert(0, "the")
865 soup.a.insert(1, magic_tag)
866
867 self.assertEqual(
868 soup.decode(), self.document_for(
869 "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
870
871 # Make sure all the relationships are hooked up correctly.
872 b_tag = soup.b
873 self.assertEqual(b_tag.next_sibling, magic_tag)
874 self.assertEqual(magic_tag.previous_sibling, b_tag)
875
876 find = b_tag.find(text="Find")
877 self.assertEqual(find.next_element, magic_tag)
878 self.assertEqual(magic_tag.previous_element, find)
879
880 c_tag = soup.c
881 self.assertEqual(magic_tag.next_sibling, c_tag)
882 self.assertEqual(c_tag.previous_sibling, magic_tag)
883
884 the = magic_tag.find(text="the")
885 self.assertEqual(the.parent, magic_tag)
886 self.assertEqual(the.next_element, c_tag)
887 self.assertEqual(c_tag.previous_element, the)
888
889 def test_append_child_thats_already_at_the_end(self):
890 data = "<a><b></b></a>"
891 soup = self.soup(data)
892 soup.a.append(soup.b)
893 self.assertEqual(data, soup.decode())
894
895 def test_move_tag_to_beginning_of_parent(self):
896 data = "<a><b></b><c></c><d></d></a>"
897 soup = self.soup(data)
898 soup.a.insert(0, soup.d)
899 self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
900
901 def test_insert_works_on_empty_element_tag(self):
902 # This is a little strange, since most HTML parsers don't allow
903 # markup like this to come through. But in general, we don't
904 # know what the parser would or wouldn't have allowed, so
905 # I'm letting this succeed for now.
906 soup = self.soup("<br/>")
907 soup.br.insert(1, "Contents")
908 self.assertEqual(str(soup.br), "<br>Contents</br>")
909
910 def test_insert_before(self):
911 soup = self.soup("<a>foo</a><b>bar</b>")
912 soup.b.insert_before("BAZ")
913 soup.a.insert_before("QUUX")
914 self.assertEqual(
915 soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))
916
917 soup.a.insert_before(soup.b)
918 self.assertEqual(
919 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
920
921 def test_insert_after(self):
922 soup = self.soup("<a>foo</a><b>bar</b>")
923 soup.b.insert_after("BAZ")
924 soup.a.insert_after("QUUX")
925 self.assertEqual(
926 soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
927 soup.b.insert_after(soup.a)
928 self.assertEqual(
929 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
930
931 def test_insert_after_raises_exception_if_after_has_no_meaning(self):
932 soup = self.soup("")
933 tag = soup.new_tag("a")
934 string = soup.new_string("")
935 self.assertRaises(ValueError, string.insert_after, tag)
936 self.assertRaises(NotImplementedError, soup.insert_after, tag)
937 self.assertRaises(ValueError, tag.insert_after, tag)
938
939 def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
940 soup = self.soup("")
941 tag = soup.new_tag("a")
942 string = soup.new_string("")
943 self.assertRaises(ValueError, string.insert_before, tag)
944 self.assertRaises(NotImplementedError, soup.insert_before, tag)
945 self.assertRaises(ValueError, tag.insert_before, tag)
946
947 def test_replace_with(self):
948 soup = self.soup(
949 "<p>There's <b>no</b> business like <b>show</b> business</p>")
950 no, show = soup.find_all('b')
951 show.replace_with(no)
952 self.assertEqual(
953 soup.decode(),
954 self.document_for(
955 "<p>There's business like <b>no</b> business</p>"))
956
957 self.assertEqual(show.parent, None)
958 self.assertEqual(no.parent, soup.p)
959 self.assertEqual(no.next_element, "no")
960 self.assertEqual(no.next_sibling, " business")
961
962 def test_replace_first_child(self):
963 data = "<a><b></b><c></c></a>"
964 soup = self.soup(data)
965 soup.b.replace_with(soup.c)
966 self.assertEqual("<a><c></c></a>", soup.decode())
967
968 def test_replace_last_child(self):
969 data = "<a><b></b><c></c></a>"
970 soup = self.soup(data)
971 soup.c.replace_with(soup.b)
972 self.assertEqual("<a><b></b></a>", soup.decode())
973
974 def test_nested_tag_replace_with(self):
975 soup = self.soup(
976 """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
977
978 # Replace the entire <b> tag and its contents ("reserve the
979 # right") with the <f> tag ("refuse").
980 remove_tag = soup.b
981 move_tag = soup.f
982 remove_tag.replace_with(move_tag)
983
984 self.assertEqual(
985 soup.decode(), self.document_for(
986 "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))
987
988 # The <b> tag is now an orphan.
989 self.assertEqual(remove_tag.parent, None)
990 self.assertEqual(remove_tag.find(text="right").next_element, None)
991 self.assertEqual(remove_tag.previous_element, None)
992 self.assertEqual(remove_tag.next_sibling, None)
993 self.assertEqual(remove_tag.previous_sibling, None)
994
995 # The <f> tag is now connected to the <a> tag.
996 self.assertEqual(move_tag.parent, soup.a)
997 self.assertEqual(move_tag.previous_element, "We")
998 self.assertEqual(move_tag.next_element.next_element, soup.e)
999 self.assertEqual(move_tag.next_sibling, None)
1000
1001 # The gap where the <f> tag used to be has been mended, and
1002 # the word "to" is now connected to the <g> tag.
1003 to_text = soup.find(text="to")
1004 g_tag = soup.g
1005 self.assertEqual(to_text.next_element, g_tag)
1006 self.assertEqual(to_text.next_sibling, g_tag)
1007 self.assertEqual(g_tag.previous_element, to_text)
1008 self.assertEqual(g_tag.previous_sibling, to_text)
1009
1010 def test_unwrap(self):
1011 tree = self.soup("""
1012 <p>Unneeded <em>formatting</em> is unneeded</p>
1013 """)
1014 tree.em.unwrap()
1015 self.assertEqual(tree.em, None)
1016 self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
1017
1018 def test_wrap(self):
1019 soup = self.soup("I wish I was bold.")
1020 value = soup.string.wrap(soup.new_tag("b"))
1021 self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
1022 self.assertEqual(
1023 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1024
1025 def test_wrap_extracts_tag_from_elsewhere(self):
1026 soup = self.soup("<b></b>I wish I was bold.")
1027 soup.b.next_sibling.wrap(soup.b)
1028 self.assertEqual(
1029 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1030
1031 def test_wrap_puts_new_contents_at_the_end(self):
1032 soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
1033 soup.b.next_sibling.wrap(soup.b)
1034 self.assertEqual(2, len(soup.b.contents))
1035 self.assertEqual(
1036 soup.decode(), self.document_for(
1037 "<b>I like being bold.I wish I was bold.</b>"))
1038
1039 def test_extract(self):
1040 soup = self.soup(
1041 '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
1042
1043 self.assertEqual(len(soup.body.contents), 3)
1044 extracted = soup.find(id="nav").extract()
1045
1046 self.assertEqual(
1047 soup.decode(), "<html><body>Some content. More content.</body></html>")
1048 self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
1049
1050 # The extracted tag is now an orphan.
1051 self.assertEqual(len(soup.body.contents), 2)
1052 self.assertEqual(extracted.parent, None)
1053 self.assertEqual(extracted.previous_element, None)
1054 self.assertEqual(extracted.next_element.next_element, None)
1055
1056 # The gap where the extracted tag used to be has been mended.
1057 content_1 = soup.find(text="Some content. ")
1058 content_2 = soup.find(text=" More content.")
1059 self.assertEqual(content_1.next_element, content_2)
1060 self.assertEqual(content_1.next_sibling, content_2)
1061 self.assertEqual(content_2.previous_element, content_1)
1062 self.assertEqual(content_2.previous_sibling, content_1)
1063
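    def _sketch_extract_semantics(self):
        # Not a test: a minimal sketch of the extract() semantics covered
        # above, reusing the self.soup() helper the surrounding tests use.
        soup = self.soup("<p>keep <b>drop</b> keep</p>")
        b = soup.b.extract()
        assert b.parent is None               # the tag is now an orphan
        assert "drop" not in soup.p.decode()  # and the gap is mended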
1064 def test_extract_distinguishes_between_identical_strings(self):
1065 soup = self.soup("<a>foo</a><b>bar</b>")
1066 foo_1 = soup.a.string
1067 bar_1 = soup.b.string
1068 foo_2 = soup.new_string("foo")
1069 bar_2 = soup.new_string("bar")
1070 soup.a.append(foo_2)
1071 soup.b.append(bar_2)
1072
1073 # Now there are two identical strings in the <a> tag, and two
1074 # in the <b> tag. Let's remove the first "foo" and the second
1075 # "bar".
1076 foo_1.extract()
1077 bar_2.extract()
1078 self.assertEqual(foo_2, soup.a.string)
1079 self.assertEqual(bar_2, soup.b.string)
1080
1081 def test_extract_multiples_of_same_tag(self):
1082 soup = self.soup("""
1083<html>
1084<head>
1085<script>foo</script>
1086</head>
1087<body>
1088 <script>bar</script>
1089 <a></a>
1090</body>
1091<script>baz</script>
1092</html>""")
1093 [soup.script.extract() for i in soup.find_all("script")]
1094 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
1095
1096
1097 def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
1098 soup = self.soup(
1099 '<html>\n'
1100 '<body>hi</body>\n'
1101 '</html>')
1102 soup.find('body').extract()
1103 self.assertEqual(None, soup.find('body'))
1104
1105
1106 def test_clear(self):
1107 """Tag.clear()"""
1108 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
1109 # clear using extract()
1110 a = soup.a
1111 soup.p.clear()
1112 self.assertEqual(len(soup.p.contents), 0)
1113 self.assertTrue(hasattr(a, "contents"))
1114
1115 # clear using decompose()
1116 em = a.em
1117 a.clear(decompose=True)
1118 self.assertEqual(0, len(em.contents))
1119
1120 def test_string_set(self):
1121 """Tag.string = 'string'"""
1122 soup = self.soup("<a></a> <b><c></c></b>")
1123 soup.a.string = "foo"
1124 self.assertEqual(soup.a.contents, ["foo"])
1125 soup.b.string = "bar"
1126 self.assertEqual(soup.b.contents, ["bar"])
1127
1128 def test_string_set_does_not_affect_original_string(self):
1129 soup = self.soup("<a><b>foo</b><c>bar</c>")
1130 soup.b.string = soup.c.string
1131 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
1132
1133 def test_set_string_preserves_class_of_string(self):
1134 soup = self.soup("<a></a>")
1135 cdata = CData("foo")
1136 soup.a.string = cdata
1137 self.assertTrue(isinstance(soup.a.string, CData))
1138
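    def _sketch_string_setter(self):
        # Not a test: a minimal sketch of the .string setter semantics
        # covered above. Assigning replaces all children with a single
        # string, and NavigableString subclasses such as CData survive.
        soup = self.soup("<a><b>foo</b></a>")
        soup.b.string = "bar"
        assert soup.b.contents == ["bar"]
        soup.b.string = CData("baz")
        assert isinstance(soup.b.string, CData)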
1139class TestElementObjects(SoupTest):
1140 """Test various features of element objects."""
1141
1142 def test_len(self):
1143 """The length of an element is its number of children."""
1144 soup = self.soup("<top>1<b>2</b>3</top>")
1145
1146 # The BeautifulSoup object itself contains one element: the
1147 # <top> tag.
1148 self.assertEqual(len(soup.contents), 1)
1149 self.assertEqual(len(soup), 1)
1150
1151 # The <top> tag contains three elements: the text node "1", the
1152 # <b> tag, and the text node "3".
1153 self.assertEqual(len(soup.top), 3)
1154 self.assertEqual(len(soup.top.contents), 3)
1155
1156 def test_member_access_invokes_find(self):
1157 """Accessing a Python member .foo invokes find('foo')"""
1158 soup = self.soup('<b><i></i></b>')
1159 self.assertEqual(soup.b, soup.find('b'))
1160 self.assertEqual(soup.b.i, soup.find('b').find('i'))
1161 self.assertEqual(soup.a, None)
1162
1163 def test_deprecated_member_access(self):
1164 soup = self.soup('<b><i></i></b>')
1165 with warnings.catch_warnings(record=True) as w:
1166 tag = soup.bTag
1167 self.assertEqual(soup.b, tag)
1168 self.assertEqual(
1169 '.bTag is deprecated, use .find("b") instead.',
1170 str(w[0].message))
1171
1172 def test_has_attr(self):
1173 """has_attr() checks for the presence of an attribute.
1174
1175 Please note: has_attr() is different from the "in"
1176 operator. has_attr() checks the tag's attributes, while "in"
1177 checks the tag's children.
1178 """
1179 soup = self.soup("<foo attr='bar'>")
1180 self.assertTrue(soup.foo.has_attr('attr'))
1181 self.assertFalse(soup.foo.has_attr('attr2'))
1182
1183
1184 def test_attributes_come_out_in_alphabetical_order(self):
1185 markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
1186 self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
1187
1188 def test_string(self):
1189 # A tag that contains only a text node makes that node
1190 # available as .string.
1191 soup = self.soup("<b>foo</b>")
1192 self.assertEqual(soup.b.string, 'foo')
1193
1194 def test_empty_tag_has_no_string(self):
1195 # A tag with no children has no .string.
1196 soup = self.soup("<b></b>")
1197 self.assertEqual(soup.b.string, None)
1198
1199 def test_tag_with_multiple_children_has_no_string(self):
1200 # A tag with multiple children has no .string.
1201 soup = self.soup("<a>foo<b></b><b></b></b>")
1202 self.assertEqual(soup.b.string, None)
1203
1204 soup = self.soup("<a>foo<b></b>bar</b>")
1205 self.assertEqual(soup.b.string, None)
1206
1207 # Even when every child is a string, .string is still None once
1208 # there is more than one child--though merging them would be a good optimization.
1209 soup = self.soup("<a>foo</b>")
1210 soup.a.insert(1, "bar")
1211 self.assertEqual(soup.a.string, None)
1212
1213 def test_tag_with_recursive_string_has_string(self):
1214 # A tag with a single child which has a .string inherits that
1215 # .string.
1216 soup = self.soup("<a><b>foo</b></a>")
1217 self.assertEqual(soup.a.string, "foo")
1218 self.assertEqual(soup.string, "foo")
1219
1220 def test_lack_of_string(self):
1221 """Only a tag containing a single text node has a .string."""
1222 soup = self.soup("<b>f<i>e</i>o</b>")
1223 self.assertFalse(soup.b.string)
1224
1225 soup = self.soup("<b></b>")
1226 self.assertFalse(soup.b.string)
1227
1228 def test_all_text(self):
1229 """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
1230 soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
1231 self.assertEqual(soup.a.text, "ar t ")
1232 self.assertEqual(soup.a.get_text(strip=True), "art")
1233 self.assertEqual(soup.a.get_text(","), "a,r, , t ")
1234 self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
1235
1236 def test_get_text_ignores_comments(self):
1237 soup = self.soup("foo<!--IGNORE-->bar")
1238 self.assertEqual(soup.get_text(), "foobar")
1239
1240 self.assertEqual(
1241 soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
1242 self.assertEqual(
1243 soup.get_text(types=None), "fooIGNOREbar")
1244
1245 def test_all_strings_ignores_comments(self):
1246 soup = self.soup("foo<!--IGNORE-->bar")
1247 self.assertEqual(['foo', 'bar'], list(soup.strings))
1248
1249class TestCDAtaListAttributes(SoupTest):
1250
1251 """Testing cdata-list attributes like 'class'.
1252 """
1253 def test_single_value_becomes_list(self):
1254 soup = self.soup("<a class='foo'>")
1255 self.assertEqual(["foo"],soup.a['class'])
1256
1257 def test_multiple_values_becomes_list(self):
1258 soup = self.soup("<a class='foo bar'>")
1259 self.assertEqual(["foo", "bar"], soup.a['class'])
1260
1261 def test_multiple_values_separated_by_weird_whitespace(self):
1262 soup = self.soup("<a class='foo\tbar\nbaz'>")
1263 self.assertEqual(["foo", "bar", "baz"],soup.a['class'])
1264
1265 def test_attributes_joined_into_string_on_output(self):
1266 soup = self.soup("<a class='foo\tbar'>")
1267 self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
1268
1269 def test_accept_charset(self):
1270 soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
1271 self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
1272
1273 def test_cdata_attribute_applying_only_to_one_tag(self):
1274 data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
1275 soup = self.soup(data)
1276 # We saw in another test that accept-charset is a cdata-list
1277 # attribute for the <form> tag. But it's not a cdata-list
1278 # attribute for any other tag.
1279 self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
1280
1281 def test_string_has_immutable_name_property(self):
1282 string = self.soup("s").string
1283 self.assertEqual(None, string.name)
1284 def t():
1285 string.name = 'foo'
1286 self.assertRaises(AttributeError, t)
1287
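    def _sketch_cdata_list_round_trip(self):
        # Not a test: a minimal sketch of the cdata-list handling above.
        # Multi-valued attributes parse into lists and are re-joined with
        # single spaces on output.
        soup = self.soup("<a class='foo\tbar'>")
        assert soup.a["class"] == ["foo", "bar"]
        assert soup.a.encode() == b'<a class="foo bar"></a>'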
1288class TestPersistence(SoupTest):
1289 "Testing features like pickle and deepcopy."
1290
1291 def setUp(self):
1292 super(TestPersistence, self).setUp()
1293 self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
1294"http://www.w3.org/TR/REC-html40/transitional.dtd">
1295<html>
1296<head>
1297<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
1298<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
1299<link rev="made" href="mailto:leonardr@segfault.org">
1300<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
1301<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
1302<meta name="author" content="Leonard Richardson">
1303</head>
1304<body>
1305<a href="foo">foo</a>
1306<a href="foo"><b>bar</b></a>
1307</body>
1308</html>"""
1309 self.tree = self.soup(self.page)
1310
1311 def test_pickle_and_unpickle_identity(self):
1312 # Pickling a tree, then unpickling it, yields a tree identical
1313 # to the original.
1314 dumped = pickle.dumps(self.tree, 2)
1315 loaded = pickle.loads(dumped)
1316 self.assertEqual(loaded.__class__, BeautifulSoup)
1317 self.assertEqual(loaded.decode(), self.tree.decode())
1318
1319 def test_deepcopy_identity(self):
1320 # Making a deepcopy of a tree yields an identical tree.
1321 copied = copy.deepcopy(self.tree)
1322 self.assertEqual(copied.decode(), self.tree.decode())
1323
1324 def test_unicode_pickle(self):
1325 # A tree containing Unicode characters can be pickled.
1326 html = "<b>\N{SNOWMAN}</b>"
1327 soup = self.soup(html)
1328 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
1329 loaded = pickle.loads(dumped)
1330 self.assertEqual(loaded.decode(), soup.decode())
1331
1332 def test_copy_navigablestring_is_not_attached_to_tree(self):
1333 html = "<b>Foo<a></a></b><b>Bar</b>"
1334 soup = self.soup(html)
1335 s1 = soup.find(string="Foo")
1336 s2 = copy.copy(s1)
1337 self.assertEqual(s1, s2)
1338 self.assertEqual(None, s2.parent)
1339 self.assertEqual(None, s2.next_element)
1340 self.assertNotEqual(None, s1.next_sibling)
1341 self.assertEqual(None, s2.next_sibling)
1342 self.assertEqual(None, s2.previous_element)
1343
1344 def test_copy_navigablestring_subclass_has_same_type(self):
1345 html = "<b><!--Foo--></b>"
1346 soup = self.soup(html)
1347 s1 = soup.string
1348 s2 = copy.copy(s1)
1349 self.assertEqual(s1, s2)
1350 self.assertTrue(isinstance(s2, Comment))
1351
1352 def test_copy_entire_soup(self):
1353 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
1354 soup = self.soup(html)
1355 soup_copy = copy.copy(soup)
1356 self.assertEqual(soup, soup_copy)
1357
1358 def test_copy_tag_copies_contents(self):
1359 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
1360 soup = self.soup(html)
1361 div = soup.div
1362 div_copy = copy.copy(div)
1363
1364 # The two tags look the same, and evaluate to equal.
1365 self.assertEqual(str(div), str(div_copy))
1366 self.assertEqual(div, div_copy)
1367
1368 # But they're not the same object.
1369 self.assertFalse(div is div_copy)
1370
1371 # And they don't have the same relation to the parse tree. The
1372 # copy is not associated with a parse tree at all.
1373 self.assertEqual(None, div_copy.parent)
1374 self.assertEqual(None, div_copy.previous_element)
1375 self.assertEqual(None, div_copy.find(string='Bar').next_element)
1376 self.assertNotEqual(None, div.find(string='Bar').next_element)
1377
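    def _sketch_copy_is_detached(self):
        # Not a test: a minimal sketch of the copy semantics above. A
        # copied tag compares equal to the original but has no place in
        # the parse tree.
        div = self.soup("<div><b>Foo</b></div>").div
        div_copy = copy.copy(div)
        assert div_copy == div and div_copy is not div
        assert div_copy.parent is None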
1378class TestSubstitutions(SoupTest):
1379
1380 def test_default_formatter_is_minimal(self):
1381 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1382 soup = self.soup(markup)
1383 decoded = soup.decode(formatter="minimal")
1384 # The < is converted back into &lt; but the e-with-acute is left alone.
1385 self.assertEqual(
1386 decoded,
1387 self.document_for(
1388 "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1389
1390 def test_formatter_html(self):
1391 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1392 soup = self.soup(markup)
1393 decoded = soup.decode(formatter="html")
1394 self.assertEqual(
1395 decoded,
1396 self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
1397
1398 def test_formatter_minimal(self):
1399 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1400 soup = self.soup(markup)
1401 decoded = soup.decode(formatter="minimal")
1402 # The < is converted back into &lt; but the e-with-acute is left alone.
1403 self.assertEqual(
1404 decoded,
1405 self.document_for(
1406 "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
1407
1408 def test_formatter_null(self):
1409 markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
1410 soup = self.soup(markup)
1411 decoded = soup.decode(formatter=None)
1412 # Neither the angle brackets nor the e-with-acute are converted.
1413 # This is not valid HTML, but it's what the user wanted.
1414 self.assertEqual(decoded,
1415 self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
1416
1417 def test_formatter_custom(self):
1418 markup = "<b>&lt;foo&gt;</b><b>bar</b>"
1419 soup = self.soup(markup)
1420 decoded = soup.decode(formatter = lambda x: x.upper())
1421 # Instead of normal entity conversion code, the custom
1422 # callable is called on every string.
1423 self.assertEqual(
1424 decoded,
1425 self.document_for("<b><FOO></b><b>BAR</b>"))
1426
1427 def test_formatter_is_run_on_attribute_values(self):
1428 markup = '<a href="http://a.com?a=b&c=é">e</a>'
1429 soup = self.soup(markup)
1430 a = soup.a
1431
1432 expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
1433
1434 self.assertEqual(expect_minimal, a.decode())
1435 self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
1436
1437 expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
1438 self.assertEqual(expect_html, a.decode(formatter="html"))
1439
1440 self.assertEqual(markup, a.decode(formatter=None))
1441 expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
1442 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
1443
1444 def test_formatter_skips_script_tag_for_html_documents(self):
1445 doc = """
1446 <script type="text/javascript">
1447 console.log("< < hey > > ");
1448 </script>
1449"""
1450 encoded = BeautifulSoup(doc, 'html.parser').encode()
1451 self.assertTrue(b"< < hey > >" in encoded)
1452
1453 def test_formatter_skips_style_tag_for_html_documents(self):
1454 doc = """
1455 <style type="text/css">
1456 console.log("< < hey > > ");
1457 </style>
1458"""
1459 encoded = BeautifulSoup(doc, 'html.parser').encode()
1460 self.assertTrue(b"< < hey > >" in encoded)
1461
1462 def test_prettify_leaves_preformatted_text_alone(self):
1463 soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
1464 # Everything outside the <pre> tag is reformatted, but everything
1465 # inside is left alone.
1466 self.assertEqual(
1467 '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
1468 soup.div.prettify())
1469
1470 def test_prettify_accepts_formatter(self):
1471 soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
1472 pretty = soup.prettify(formatter = lambda x: x.upper())
1473 self.assertTrue("FOO" in pretty)
1474
1475 def test_prettify_outputs_unicode_by_default(self):
1476 soup = self.soup("<a></a>")
1477 self.assertEqual(str, type(soup.prettify()))
1478
1479 def test_prettify_can_encode_data(self):
1480 soup = self.soup("<a></a>")
1481 self.assertEqual(bytes, type(soup.prettify("utf-8")))
1482
1483 def test_html_entity_substitution_off_by_default(self):
1484 markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
1485 soup = self.soup(markup)
1486 encoded = soup.b.encode("utf-8")
1487 self.assertEqual(encoded, markup.encode('utf-8'))
1488
1489 def test_encoding_substitution(self):
1490 # Here's the <meta> tag saying that a document is
1491 # encoded in Shift-JIS.
1492 meta_tag = ('<meta content="text/html; charset=x-sjis" '
1493 'http-equiv="Content-type"/>')
1494 soup = self.soup(meta_tag)
1495
1496 # Parse the document, and the charset appears unchanged.
1497 self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
1498
1499 # Encode the document into some encoding, and the encoding is
1500 # substituted into the meta tag.
1501 utf_8 = soup.encode("utf-8")
1502 self.assertTrue(b"charset=utf-8" in utf_8)
1503
1504 euc_jp = soup.encode("euc_jp")
1505 self.assertTrue(b"charset=euc_jp" in euc_jp)
1506
1507 shift_jis = soup.encode("shift-jis")
1508 self.assertTrue(b"charset=shift-jis" in shift_jis)
1509
1510 utf_16_u = soup.encode("utf-16").decode("utf-16")
1511 self.assertTrue("charset=utf-16" in utf_16_u)
1512
1513 def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
1514 markup = ('<head><meta content="text/html; charset=x-sjis" '
1515 'http-equiv="Content-type"/></head><pre>foo</pre>')
1516
1517 # Beautiful Soup used to try to rewrite the meta tag even if the
1518 # meta tag got filtered out by the strainer. This test makes
1519 # sure that doesn't happen.
1520 strainer = SoupStrainer('pre')
1521 soup = self.soup(markup, parse_only=strainer)
1522 self.assertEqual(soup.contents[0].name, 'pre')
1523
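    def _sketch_formatter_choices(self):
        # Not a test: a minimal sketch of the formatter choices above.
        # "minimal" escapes only markup-significant characters, "html"
        # also emits named entities, and None escapes nothing at all.
        soup = self.soup("<b>&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE}&gt;</b>")
        assert soup.b.decode(formatter="html") == "<b>&lt;Sacr&eacute;&gt;</b>"
        assert soup.b.decode(formatter=None) == "<b><Sacr\N{LATIN SMALL LETTER E WITH ACUTE}></b>"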
1524class TestEncoding(SoupTest):
1525 """Test the ability to encode objects into strings."""
1526
1527 def test_unicode_string_can_be_encoded(self):
1528 html = "<b>\N{SNOWMAN}</b>"
1529 soup = self.soup(html)
1530 self.assertEqual(soup.b.string.encode("utf-8"),
1531 "\N{SNOWMAN}".encode("utf-8"))
1532
1533 def test_tag_containing_unicode_string_can_be_encoded(self):
1534 html = "<b>\N{SNOWMAN}</b>"
1535 soup = self.soup(html)
1536 self.assertEqual(
1537 soup.b.encode("utf-8"), html.encode("utf-8"))
1538
1539 def test_encoding_substitutes_unrecognized_characters_by_default(self):
1540 html = "<b>\N{SNOWMAN}</b>"
1541 soup = self.soup(html)
1542 self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
1543
1544 def test_encoding_can_be_made_strict(self):
1545 html = "<b>\N{SNOWMAN}</b>"
1546 soup = self.soup(html)
1547 self.assertRaises(
1548 UnicodeEncodeError, soup.encode, "ascii", errors="strict")
1549
1550 def test_decode_contents(self):
1551 html = "<b>\N{SNOWMAN}</b>"
1552 soup = self.soup(html)
1553 self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
1554
1555 def test_encode_contents(self):
1556 html = "<b>\N{SNOWMAN}</b>"
1557 soup = self.soup(html)
1558 self.assertEqual(
1559 "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
1560 encoding="utf8"))
1561
1562 def test_deprecated_renderContents(self):
1563 html = "<b>\N{SNOWMAN}</b>"
1564 soup = self.soup(html)
1565 self.assertEqual(
1566 "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
1567
1568 def test_repr(self):
1569 html = "<b>\N{SNOWMAN}</b>"
1570 soup = self.soup(html)
1571 if PY3K:
1572 self.assertEqual(html, repr(soup))
1573 else:
1574 self.assertEqual(b'<b>\\u2603</b>', repr(soup))
1575
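    def _sketch_encoding_fallback(self):
        # Not a test: a minimal sketch of the encoding behaviour above.
        # Characters the target codec cannot represent become numeric
        # entities unless errors="strict" is requested.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        assert soup.b.encode("utf-8") == "<b>\N{SNOWMAN}</b>".encode("utf-8")
        assert soup.b.encode("ascii") == b"<b>&#9731;</b>"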
1576class TestNavigableStringSubclasses(SoupTest):
1577
1578 def test_cdata(self):
1579 # None of the current builders turn CDATA sections into CData
1580 # objects, but you can create them manually.
1581 soup = self.soup("")
1582 cdata = CData("foo")
1583 soup.insert(1, cdata)
1584 self.assertEqual(str(soup), "<![CDATA[foo]]>")
1585 self.assertEqual(soup.find(text="foo"), "foo")
1586 self.assertEqual(soup.contents[0], "foo")
1587
1588 def test_cdata_is_never_formatted(self):
1589 """Text inside a CData object is passed into the formatter.
1590
1591 But the return value is ignored.
1592 """
1593
1594 self.count = 0
1595 def increment(*args):
1596 self.count += 1
1597 return "BITTER FAILURE"
1598
1599 soup = self.soup("")
1600 cdata = CData("<><><>")
1601 soup.insert(1, cdata)
1602 self.assertEqual(
1603 b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
1604 self.assertEqual(1, self.count)
1605
1606 def test_doctype_ends_in_newline(self):
1607 # Unlike other NavigableString subclasses, a DOCTYPE always ends
1608 # in a newline.
1609 doctype = Doctype("foo")
1610 soup = self.soup("")
1611 soup.insert(1, doctype)
1612 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
1613
1614 def test_declaration(self):
1615 d = Declaration("foo")
1616 self.assertEqual("<?foo?>", d.output_ready())
1617
1618class TestSoupSelector(TreeTest):
1619
1620 HTML = """
1621<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
1622"http://www.w3.org/TR/html4/strict.dtd">
1623<html>
1624<head>
1625<title>The title</title>
1626<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
1627</head>
1628<body>
1629<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
1630<div id="main" class="fancy">
1631<div id="inner">
1632<h1 id="header1">An H1</h1>
1633<p>Some text</p>
1634<p class="onep" id="p1">Some more text</p>
1635<h2 id="header2">An H2</h2>
1636<p class="class1 class2 class3" id="pmulti">Another</p>
1637<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
1638<h2 id="header3">Another H2</h2>
1639<a id="me" href="http://simonwillison.net/" rel="me">me</a>
1640<span class="s1">
1641<a href="#" id="s1a1">span1a1</a>
1642<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
1643<span class="span2">
1644<a href="#" id="s2a1">span2a1</a>
1645</span>
1646<span class="span3"></span>
1647<custom-dashed-tag class="dashed" id="dash2"/>
1648<div data-tag="dashedvalue" id="data1"/>
1649</span>
1650</div>
1651<x id="xid">
1652<z id="zida"/>
1653<z id="zidab"/>
1654<z id="zidac"/>
1655</x>
1656<y id="yid">
1657<z id="zidb"/>
1658</y>
1659<p lang="en" id="lang-en">English</p>
1660<p lang="en-gb" id="lang-en-gb">English UK</p>
1661<p lang="en-us" id="lang-en-us">English US</p>
1662<p lang="fr" id="lang-fr">French</p>
1663</div>
1664
1665<div id="footer">
1666</div>
1667"""
1668
1669 def setUp(self):
1670 self.soup = BeautifulSoup(self.HTML, 'html.parser')
1671
1672 def assertSelects(self, selector, expected_ids):
1673 el_ids = [el['id'] for el in self.soup.select(selector)]
1674 el_ids.sort()
1675 expected_ids.sort()
1676 self.assertEqual(expected_ids, el_ids,
1677 "Selector %s, expected [%s], got [%s]" % (
1678 selector, ', '.join(expected_ids), ', '.join(el_ids)
1679 )
1680 )
1681
1682 assertSelect = assertSelects
1683
1684 def assertSelectMultiple(self, *tests):
1685 for selector, expected_ids in tests:
1686 self.assertSelect(selector, expected_ids)
1687
1688 def test_one_tag_one(self):
1689 els = self.soup.select('title')
1690 self.assertEqual(len(els), 1)
1691 self.assertEqual(els[0].name, 'title')
1692 self.assertEqual(els[0].contents, ['The title'])
1693
1694 def test_one_tag_many(self):
1695 els = self.soup.select('div')
1696 self.assertEqual(len(els), 4)
1697 for div in els:
1698 self.assertEqual(div.name, 'div')
1699
1700 el = self.soup.select_one('div')
1701 self.assertEqual('main', el['id'])
1702
1703 def test_select_one_returns_none_if_no_match(self):
1704 match = self.soup.select_one('nonexistenttag')
1705 self.assertEqual(None, match)
1706
1707
1708 def test_tag_in_tag_one(self):
1709 els = self.soup.select('div div')
1710 self.assertSelects('div div', ['inner', 'data1'])
1711
1712 def test_tag_in_tag_many(self):
1713 for selector in ('html div', 'html body div', 'body div'):
1714 self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
1715
1716 def test_tag_no_match(self):
1717 self.assertEqual(len(self.soup.select('del')), 0)
1718
1719 def test_invalid_tag(self):
1720 self.assertRaises(ValueError, self.soup.select, 'tag%t')
1721
1722 def test_select_dashed_tag_ids(self):
1723 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
1724
1725 def test_select_dashed_by_id(self):
1726 dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
1727 self.assertEqual(dashed[0].name, 'custom-dashed-tag')
1728 self.assertEqual(dashed[0]['id'], 'dash2')
1729
1730 def test_dashed_tag_text(self):
1731 self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
1732
1733 def test_select_dashed_matches_find_all(self):
1734 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
1735
1736 def test_header_tags(self):
1737 self.assertSelectMultiple(
1738 ('h1', ['header1']),
1739 ('h2', ['header2', 'header3']),
1740 )
1741
1742 def test_class_one(self):
1743 for selector in ('.onep', 'p.onep', 'html p.onep'):
1744 els = self.soup.select(selector)
1745 self.assertEqual(len(els), 1)
1746 self.assertEqual(els[0].name, 'p')
1747 self.assertEqual(els[0]['class'], ['onep'])
1748
1749 def test_class_mismatched_tag(self):
1750 els = self.soup.select('div.onep')
1751 self.assertEqual(len(els), 0)
1752
1753 def test_one_id(self):
1754 for selector in ('div#inner', '#inner', 'div div#inner'):
1755 self.assertSelects(selector, ['inner'])
1756
1757 def test_bad_id(self):
1758 els = self.soup.select('#doesnotexist')
1759 self.assertEqual(len(els), 0)
1760
1761 def test_items_in_id(self):
1762 els = self.soup.select('div#inner p')
1763 self.assertEqual(len(els), 3)
1764 for el in els:
1765 self.assertEqual(el.name, 'p')
1766 self.assertEqual(els[1]['class'], ['onep'])
1767 self.assertFalse(els[0].has_attr('class'))
1768
1769 def test_a_bunch_of_emptys(self):
1770 for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
1771 self.assertEqual(len(self.soup.select(selector)), 0)
1772
1773 def test_multi_class_support(self):
1774 for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
1775 '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
1776 self.assertSelects(selector, ['pmulti'])
1777
1778 def test_multi_class_selection(self):
1779 for selector in ('.class1.class3', '.class3.class2',
1780 '.class1.class2.class3'):
1781 self.assertSelects(selector, ['pmulti'])
1782
1783 def test_child_selector(self):
1784 self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
1785 self.assertSelects('.s1 > a span', ['s1a2s1'])
1786
1787 def test_child_selector_id(self):
1788 self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
1789
1790 def test_attribute_equals(self):
1791 self.assertSelectMultiple(
1792 ('p[class="onep"]', ['p1']),
1793 ('p[id="p1"]', ['p1']),
1794 ('[class="onep"]', ['p1']),
1795 ('[id="p1"]', ['p1']),
1796 ('link[rel="stylesheet"]', ['l1']),
1797 ('link[type="text/css"]', ['l1']),
1798 ('link[href="blah.css"]', ['l1']),
1799 ('link[href="no-blah.css"]', []),
1800 ('[rel="stylesheet"]', ['l1']),
1801 ('[type="text/css"]', ['l1']),
1802 ('[href="blah.css"]', ['l1']),
1803 ('[href="no-blah.css"]', []),
1804 ('p[href="no-blah.css"]', []),
1805 ('[href="no-blah.css"]', []),
1806 )
1807
1808 def test_attribute_tilde(self):
1809 self.assertSelectMultiple(
1810 ('p[class~="class1"]', ['pmulti']),
1811 ('p[class~="class2"]', ['pmulti']),
1812 ('p[class~="class3"]', ['pmulti']),
1813 ('[class~="class1"]', ['pmulti']),
1814 ('[class~="class2"]', ['pmulti']),
1815 ('[class~="class3"]', ['pmulti']),
1816 ('a[rel~="friend"]', ['bob']),
1817 ('a[rel~="met"]', ['bob']),
1818 ('[rel~="friend"]', ['bob']),
1819 ('[rel~="met"]', ['bob']),
1820 )
1821
1822 def test_attribute_startswith(self):
1823 self.assertSelectMultiple(
1824 ('[rel^="style"]', ['l1']),
1825 ('link[rel^="style"]', ['l1']),
1826 ('notlink[rel^="notstyle"]', []),
1827 ('[rel^="notstyle"]', []),
1828 ('link[rel^="notstyle"]', []),
1829 ('link[href^="bla"]', ['l1']),
1830 ('a[href^="http://"]', ['bob', 'me']),
1831 ('[href^="http://"]', ['bob', 'me']),
1832 ('[id^="p"]', ['pmulti', 'p1']),
1833 ('[id^="m"]', ['me', 'main']),
1834 ('div[id^="m"]', ['main']),
1835 ('a[id^="m"]', ['me']),
1836 ('div[data-tag^="dashed"]', ['data1'])
1837 )
1838
1839 def test_attribute_endswith(self):
1840 self.assertSelectMultiple(
1841 ('[href$=".css"]', ['l1']),
1842 ('link[href$=".css"]', ['l1']),
1843 ('link[id$="1"]', ['l1']),
1844 ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
1845 ('div[id$="1"]', ['data1']),
1846 ('[id$="noending"]', []),
1847 )
1848
1849 def test_attribute_contains(self):
1850 self.assertSelectMultiple(
1851 # From test_attribute_startswith
1852 ('[rel*="style"]', ['l1']),
1853 ('link[rel*="style"]', ['l1']),
1854 ('notlink[rel*="notstyle"]', []),
1855 ('[rel*="notstyle"]', []),
1856 ('link[rel*="notstyle"]', []),
1857 ('link[href*="bla"]', ['l1']),
1858 ('[href*="http://"]', ['bob', 'me']),
1859 ('[id*="p"]', ['pmulti', 'p1']),
1860 ('div[id*="m"]', ['main']),
1861 ('a[id*="m"]', ['me']),
1862 # From test_attribute_endswith
1863 ('[href*=".css"]', ['l1']),
1864 ('link[href*=".css"]', ['l1']),
1865 ('link[id*="1"]', ['l1']),
1866 ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
1867 ('div[id*="1"]', ['data1']),
1868 ('[id*="noending"]', []),
1869 # New for this test
1870 ('[href*="."]', ['bob', 'me', 'l1']),
1871 ('a[href*="."]', ['bob', 'me']),
1872 ('link[href*="."]', ['l1']),
1873 ('div[id*="n"]', ['main', 'inner']),
1874 ('div[id*="nn"]', ['inner']),
1875 ('div[data-tag*="edval"]', ['data1'])
1876 )
1877
1878 def test_attribute_exact_or_hypen(self):
1879 self.assertSelectMultiple(
1880 ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
1881 ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
1882 ('p[lang|="fr"]', ['lang-fr']),
1883 ('p[lang|="gb"]', []),
1884 )
1885
1886 def test_attribute_exists(self):
1887 self.assertSelectMultiple(
1888 ('[rel]', ['l1', 'bob', 'me']),
1889 ('link[rel]', ['l1']),
1890 ('a[rel]', ['bob', 'me']),
1891 ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
1892 ('p[class]', ['p1', 'pmulti']),
1893 ('[blah]', []),
1894 ('p[blah]', []),
1895 ('div[data-tag]', ['data1'])
1896 )
1897
1898 def test_unsupported_pseudoclass(self):
1899 self.assertRaises(
1900 NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
1901
1902 self.assertRaises(
1903 NotImplementedError, self.soup.select, "a:nth-of-type(a)")
1904
1905
1906 def test_nth_of_type(self):
1907 # Try to select first paragraph
1908 els = self.soup.select('div#inner p:nth-of-type(1)')
1909 self.assertEqual(len(els), 1)
1910 self.assertEqual(els[0].string, 'Some text')
1911
1912 # Try to select third paragraph
1913 els = self.soup.select('div#inner p:nth-of-type(3)')
1914 self.assertEqual(len(els), 1)
1915 self.assertEqual(els[0].string, 'Another')
1916
1917 # Try to select (non-existent!) fourth paragraph
1918 els = self.soup.select('div#inner p:nth-of-type(4)')
1919 self.assertEqual(len(els), 0)
1920
1921 # Pass in an invalid value.
1922 self.assertRaises(
1923 ValueError, self.soup.select, 'div p:nth-of-type(0)')
1924
1925 def test_nth_of_type_direct_descendant(self):
1926 els = self.soup.select('div#inner > p:nth-of-type(1)')
1927 self.assertEqual(len(els), 1)
1928 self.assertEqual(els[0].string, 'Some text')
1929
1930 def test_id_child_selector_nth_of_type(self):
1931 self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
1932
1933 def test_select_on_element(self):
1934 # Other tests operate on the tree; this operates on an element
1935 # within the tree.
1936 inner = self.soup.find("div", id="main")
1937 selected = inner.select("div")
1938 # The <div id="inner"> tag was selected. The <div id="footer">
1939 # tag was not.
1940 self.assertSelectsIDs(selected, ['inner', 'data1'])
1941
1942 def test_overspecified_child_id(self):
1943 self.assertSelects(".fancy #inner", ['inner'])
1944 self.assertSelects(".normal #inner", [])
1945
1946 def test_adjacent_sibling_selector(self):
1947 self.assertSelects('#p1 + h2', ['header2'])
1948 self.assertSelects('#p1 + h2 + p', ['pmulti'])
1949 self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
1950 self.assertEqual([], self.soup.select('#p1 + p'))
1951
1952 def test_general_sibling_selector(self):
1953 self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
1954 self.assertSelects('#p1 ~ #header2', ['header2'])
1955 self.assertSelects('#p1 ~ h2 + a', ['me'])
1956 self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
1957 self.assertEqual([], self.soup.select('#inner ~ h2'))
1958
1959 def test_dangling_combinator(self):
1960 self.assertRaises(ValueError, self.soup.select, 'h1 >')
1961
1962 def test_sibling_combinator_wont_select_same_tag_twice(self):
1963 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
1964
1965 # Test the selector grouping operator (the comma)
1966 def test_multiple_select(self):
1967 self.assertSelects('x, y', ['xid', 'yid'])
1968
1969 def test_multiple_select_with_no_space(self):
1970 self.assertSelects('x,y', ['xid', 'yid'])
1971
1972 def test_multiple_select_with_more_space(self):
1973 self.assertSelects('x, y', ['xid', 'yid'])
1974
1975 def test_multiple_select_duplicated(self):
1976 self.assertSelects('x, x', ['xid'])
1977
1978 def test_multiple_select_sibling(self):
1979 self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
1980
1981 def test_multiple_select_tag_and_direct_descendant(self):
1982 self.assertSelects('x, y > z', ['xid', 'zidb'])
1983
1984 def test_multiple_select_direct_descendant_and_tags(self):
1985 self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
1986
1987 def test_multiple_select_indirect_descendant(self):
1988 self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
1989
1990 def test_invalid_multiple_select(self):
1991 self.assertRaises(ValueError, self.soup.select, ',x, y')
1992 self.assertRaises(ValueError, self.soup.select, 'x,,y')
1993
1994 def test_multiple_select_attrs(self):
1995 self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
1996
1997 def test_multiple_select_ids(self):
1998 self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
1999
2000 def test_multiple_select_nested(self):
2001 self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
2002
2003
2004
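The selector tests above all go through the select()/select_one() API; a minimal runnable sketch of the combinators and grouping they cover, assuming bs4 with the built-in html.parser builder:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<div id="main"><p class="onep">one</p><p>two</p></div>', "html.parser")

    assert [p.text for p in soup.select("div p")] == ["one", "two"]  # descendants
    assert soup.select_one("p.onep").text == "one"     # first match only
    assert soup.select("p.onep + p")[0].text == "two"  # adjacent sibling
    assert soup.select("#main, p.onep")                # grouping via comma
    assert soup.select_one("p.missing") is None        # no match -> None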
diff --git a/bitbake/lib/hashserv/__init__.py b/bitbake/lib/hashserv/__init__.py
index 552a33278f..ac891e0174 100644
--- a/bitbake/lib/hashserv/__init__.py
+++ b/bitbake/lib/hashserv/__init__.py
@@ -5,39 +5,15 @@
5 5
6import asyncio 6import asyncio
7from contextlib import closing 7from contextlib import closing
8import re
9import itertools 8import itertools
10import json 9import json
11from collections import namedtuple 10from collections import namedtuple
12from urllib.parse import urlparse 11from urllib.parse import urlparse
13 12from bb.asyncrpc.client import parse_address, ADDR_TYPE_UNIX, ADDR_TYPE_WS
14UNIX_PREFIX = "unix://"
15WS_PREFIX = "ws://"
16WSS_PREFIX = "wss://"
17
18ADDR_TYPE_UNIX = 0
19ADDR_TYPE_TCP = 1
20ADDR_TYPE_WS = 2
21 13
22User = namedtuple("User", ("username", "permissions")) 14User = namedtuple("User", ("username", "permissions"))
23 15
24 16
25def parse_address(addr):
26 if addr.startswith(UNIX_PREFIX):
27 return (ADDR_TYPE_UNIX, (addr[len(UNIX_PREFIX) :],))
28 elif addr.startswith(WS_PREFIX) or addr.startswith(WSS_PREFIX):
29 return (ADDR_TYPE_WS, (addr,))
30 else:
31 m = re.match(r"\[(?P<host>[^\]]*)\]:(?P<port>\d+)$", addr)
32 if m is not None:
33 host = m.group("host")
34 port = m.group("port")
35 else:
36 host, port = addr.split(":")
37
38 return (ADDR_TYPE_TCP, (host, int(port)))
39
40
41def create_server( 17def create_server(
42 addr, 18 addr,
43 dbname, 19 dbname,
@@ -50,6 +26,7 @@ def create_server(
50 anon_perms=None, 26 anon_perms=None,
51 admin_username=None, 27 admin_username=None,
52 admin_password=None, 28 admin_password=None,
29 reuseport=False,
53): 30):
54 def sqlite_engine(): 31 def sqlite_engine():
55 from .sqlite import DatabaseEngine 32 from .sqlite import DatabaseEngine
@@ -85,9 +62,9 @@ def create_server(
85 s.start_unix_server(*a) 62 s.start_unix_server(*a)
86 elif typ == ADDR_TYPE_WS: 63 elif typ == ADDR_TYPE_WS:
87 url = urlparse(a[0]) 64 url = urlparse(a[0])
88 s.start_websocket_server(url.hostname, url.port) 65 s.start_websocket_server(url.hostname, url.port, reuseport=reuseport)
89 else: 66 else:
90 s.start_tcp_server(*a) 67 s.start_tcp_server(*a, reuseport=reuseport)
91 68
92 return s 69 return s
93 70
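The address helpers deleted above now come from bb.asyncrpc.client. For reference, a minimal sketch of the parsing rules, mirroring the removed code (constant names as in the old module):

    import re

    UNIX_PREFIX, WS_PREFIX, WSS_PREFIX = "unix://", "ws://", "wss://"
    ADDR_TYPE_UNIX, ADDR_TYPE_TCP, ADDR_TYPE_WS = 0, 1, 2

    def parse_address(addr):
        # unix:// yields a socket path, ws:// and wss:// stay whole URLs,
        # and everything else is host:port with [bracketed] IPv6 hosts.
        if addr.startswith(UNIX_PREFIX):
            return (ADDR_TYPE_UNIX, (addr[len(UNIX_PREFIX):],))
        if addr.startswith((WS_PREFIX, WSS_PREFIX)):
            return (ADDR_TYPE_WS, (addr,))
        m = re.match(r"\[(?P<host>[^\]]*)\]:(?P<port>\d+)$", addr)
        if m is not None:
            return (ADDR_TYPE_TCP, (m.group("host"), int(m.group("port"))))
        host, port = addr.split(":")
        return (ADDR_TYPE_TCP, (host, int(port)))

    assert parse_address("unix:///run/hash.sock") == (ADDR_TYPE_UNIX, ("/run/hash.sock",))
    assert parse_address("[::1]:8686") == (ADDR_TYPE_TCP, ("::1", 8686))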
diff --git a/bitbake/lib/hashserv/client.py b/bitbake/lib/hashserv/client.py
index b269879ecf..a510f3284f 100644
--- a/bitbake/lib/hashserv/client.py
+++ b/bitbake/lib/hashserv/client.py
@@ -5,6 +5,7 @@
5 5
6import logging 6import logging
7import socket 7import socket
8import asyncio
8import bb.asyncrpc 9import bb.asyncrpc
9import json 10import json
10from . import create_async_client 11from . import create_async_client
@@ -13,6 +14,66 @@ from . import create_async_client
13logger = logging.getLogger("hashserv.client") 14logger = logging.getLogger("hashserv.client")
14 15
15 16
17class Batch(object):
18 def __init__(self):
19 self.done = False
20 self.cond = asyncio.Condition()
21 self.pending = []
22 self.results = []
23 self.sent_count = 0
24
25 async def recv(self, socket):
26 while True:
27 async with self.cond:
28 await self.cond.wait_for(lambda: self.pending or self.done)
29
30 if not self.pending:
31 if self.done:
32 return
33 continue
34
35 r = await socket.recv()
36 self.results.append(r)
37
38 async with self.cond:
39 self.pending.pop(0)
40
41 async def send(self, socket, msgs):
42 try:
43 # In the event of a restart due to a reconnect, all in-flight
44 # messages need to be resent first to keep the result count in sync
45 for m in self.pending:
46 await socket.send(m)
47
48 for m in msgs:
49 # Add the message to the pending list before attempting to send
50 # it so that if the send fails it will be retried
51 async with self.cond:
52 self.pending.append(m)
53 self.cond.notify()
54 self.sent_count += 1
55
56 await socket.send(m)
57
58 finally:
59 async with self.cond:
60 self.done = True
61 self.cond.notify()
62
63 async def process(self, socket, msgs):
64 await asyncio.gather(
65 self.recv(socket),
66 self.send(socket, msgs),
67 )
68
69 if len(self.results) != self.sent_count:
70 raise ValueError(
71 f"Unexpected result count {len(self.results)}, expected {self.sent_count}"
72 )
73
74 return self.results
75
76
16class AsyncClient(bb.asyncrpc.AsyncClient): 77class AsyncClient(bb.asyncrpc.AsyncClient):
17 MODE_NORMAL = 0 78 MODE_NORMAL = 0
18 MODE_GET_STREAM = 1 79 MODE_GET_STREAM = 1
@@ -27,9 +88,7 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
27 88
28 async def setup_connection(self): 89 async def setup_connection(self):
29 await super().setup_connection() 90 await super().setup_connection()
30 cur_mode = self.mode
31 self.mode = self.MODE_NORMAL 91 self.mode = self.MODE_NORMAL
32 await self._set_mode(cur_mode)
33 if self.username: 92 if self.username:
34 # Save off become user temporarily because auth() resets it 93 # Save off become user temporarily because auth() resets it
35 become = self.saved_become_user 94 become = self.saved_become_user
@@ -38,25 +97,52 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
38 if become: 97 if become:
39 await self.become_user(become) 98 await self.become_user(become)
40 99
41 async def send_stream(self, msg): 100 async def send_stream_batch(self, mode, msgs):
101 """
102 Processes a batch of stream messages. This sends the query
103 messages as fast as possible while simultaneously reading the
104 responses back. This helps to mitigate the effects of latency to
105 the hash equivalence server by allowing multiple queries to be
106 "in-flight" at once.
107
108 The implementation tracks the count of sent messages so that
109 `msgs` can be a generator (i.e. its length is not known in
110 advance).
111
112 """
113
114 b = Batch()
115
42 async def proc(): 116 async def proc():
43 await self.socket.send(msg) 117 nonlocal b
44 return await self.socket.recv() 118
119 await self._set_mode(mode)
120 return await b.process(self.socket, msgs)
45 121
46 return await self._send_wrapper(proc) 122 return await self._send_wrapper(proc)
47 123
124 async def invoke(self, *args, skip_mode=False, **kwargs):
125 # It's OK if connection errors cause a failure here, because the mode
126 # is also reset to normal on a new connection
127 if not skip_mode:
128 await self._set_mode(self.MODE_NORMAL)
129 return await super().invoke(*args, **kwargs)
130
48 async def _set_mode(self, new_mode): 131 async def _set_mode(self, new_mode):
49 async def stream_to_normal(): 132 async def stream_to_normal():
133 # Check if already in normal mode (e.g. due to a connection reset)
134 if self.mode == self.MODE_NORMAL:
135 return "ok"
50 await self.socket.send("END") 136 await self.socket.send("END")
51 return await self.socket.recv() 137 return await self.socket.recv()
52 138
53 async def normal_to_stream(command): 139 async def normal_to_stream(command):
54 r = await self.invoke({command: None}) 140 r = await self.invoke({command: None}, skip_mode=True)
55 if r != "ok": 141 if r != "ok":
142 self.check_invoke_error(r)
56 raise ConnectionError( 143 raise ConnectionError(
57 f"Unable to transition to stream mode: Bad response from server {r!r}" 144 f"Unable to transition to stream mode: Bad response from server {r!r}"
58 ) 145 )
59
60 self.logger.debug("Mode is now %s", command) 146 self.logger.debug("Mode is now %s", command)
61 147
62 if new_mode == self.mode: 148 if new_mode == self.mode:
@@ -84,14 +170,17 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
84 self.mode = new_mode 170 self.mode = new_mode
85 171
86 async def get_unihash(self, method, taskhash): 172 async def get_unihash(self, method, taskhash):
87 await self._set_mode(self.MODE_GET_STREAM) 173 r = await self.get_unihash_batch([(method, taskhash)])
88 r = await self.send_stream("%s %s" % (method, taskhash)) 174 return r[0]
89 if not r: 175
90 return None 176 async def get_unihash_batch(self, args):
91 return r 177 result = await self.send_stream_batch(
178 self.MODE_GET_STREAM,
179 (f"{method} {taskhash}" for method, taskhash in args),
180 )
181 return [r if r else None for r in result]
92 182
93 async def report_unihash(self, taskhash, method, outhash, unihash, extra={}): 183 async def report_unihash(self, taskhash, method, outhash, unihash, extra={}):
94 await self._set_mode(self.MODE_NORMAL)
95 m = extra.copy() 184 m = extra.copy()
96 m["taskhash"] = taskhash 185 m["taskhash"] = taskhash
97 m["method"] = method 186 m["method"] = method
@@ -100,7 +189,6 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
100 return await self.invoke({"report": m}) 189 return await self.invoke({"report": m})
101 190
102 async def report_unihash_equiv(self, taskhash, method, unihash, extra={}): 191 async def report_unihash_equiv(self, taskhash, method, unihash, extra={}):
103 await self._set_mode(self.MODE_NORMAL)
104 m = extra.copy() 192 m = extra.copy()
105 m["taskhash"] = taskhash 193 m["taskhash"] = taskhash
106 m["method"] = method 194 m["method"] = method
@@ -108,18 +196,19 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
108 return await self.invoke({"report-equiv": m}) 196 return await self.invoke({"report-equiv": m})
109 197
110 async def get_taskhash(self, method, taskhash, all_properties=False): 198 async def get_taskhash(self, method, taskhash, all_properties=False):
111 await self._set_mode(self.MODE_NORMAL)
112 return await self.invoke( 199 return await self.invoke(
113 {"get": {"taskhash": taskhash, "method": method, "all": all_properties}} 200 {"get": {"taskhash": taskhash, "method": method, "all": all_properties}}
114 ) 201 )
115 202
116 async def unihash_exists(self, unihash): 203 async def unihash_exists(self, unihash):
117 await self._set_mode(self.MODE_EXIST_STREAM) 204 r = await self.unihash_exists_batch([unihash])
118 r = await self.send_stream(unihash) 205 return r[0]
119 return r == "true" 206
207 async def unihash_exists_batch(self, unihashes):
208 result = await self.send_stream_batch(self.MODE_EXIST_STREAM, unihashes)
209 return [r == "true" for r in result]
120 210
121 async def get_outhash(self, method, outhash, taskhash, with_unihash=True): 211 async def get_outhash(self, method, outhash, taskhash, with_unihash=True):
122 await self._set_mode(self.MODE_NORMAL)
123 return await self.invoke( 212 return await self.invoke(
124 { 213 {
125 "get-outhash": { 214 "get-outhash": {
@@ -132,27 +221,21 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
132 ) 221 )
133 222
134 async def get_stats(self): 223 async def get_stats(self):
135 await self._set_mode(self.MODE_NORMAL)
136 return await self.invoke({"get-stats": None}) 224 return await self.invoke({"get-stats": None})
137 225
138 async def reset_stats(self): 226 async def reset_stats(self):
139 await self._set_mode(self.MODE_NORMAL)
140 return await self.invoke({"reset-stats": None}) 227 return await self.invoke({"reset-stats": None})
141 228
142 async def backfill_wait(self): 229 async def backfill_wait(self):
143 await self._set_mode(self.MODE_NORMAL)
144 return (await self.invoke({"backfill-wait": None}))["tasks"] 230 return (await self.invoke({"backfill-wait": None}))["tasks"]
145 231
146 async def remove(self, where): 232 async def remove(self, where):
147 await self._set_mode(self.MODE_NORMAL)
148 return await self.invoke({"remove": {"where": where}}) 233 return await self.invoke({"remove": {"where": where}})
149 234
150 async def clean_unused(self, max_age): 235 async def clean_unused(self, max_age):
151 await self._set_mode(self.MODE_NORMAL)
152 return await self.invoke({"clean-unused": {"max_age_seconds": max_age}}) 236 return await self.invoke({"clean-unused": {"max_age_seconds": max_age}})
153 237
154 async def auth(self, username, token): 238 async def auth(self, username, token):
155 await self._set_mode(self.MODE_NORMAL)
156 result = await self.invoke({"auth": {"username": username, "token": token}}) 239 result = await self.invoke({"auth": {"username": username, "token": token}})
157 self.username = username 240 self.username = username
158 self.password = token 241 self.password = token
@@ -160,7 +243,6 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
160 return result 243 return result
161 244
162 async def refresh_token(self, username=None): 245 async def refresh_token(self, username=None):
163 await self._set_mode(self.MODE_NORMAL)
164 m = {} 246 m = {}
165 if username: 247 if username:
166 m["username"] = username 248 m["username"] = username
@@ -174,34 +256,28 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
174 return result 256 return result
175 257
176 async def set_user_perms(self, username, permissions): 258 async def set_user_perms(self, username, permissions):
177 await self._set_mode(self.MODE_NORMAL)
178 return await self.invoke( 259 return await self.invoke(
179 {"set-user-perms": {"username": username, "permissions": permissions}} 260 {"set-user-perms": {"username": username, "permissions": permissions}}
180 ) 261 )
181 262
182 async def get_user(self, username=None): 263 async def get_user(self, username=None):
183 await self._set_mode(self.MODE_NORMAL)
184 m = {} 264 m = {}
185 if username: 265 if username:
186 m["username"] = username 266 m["username"] = username
187 return await self.invoke({"get-user": m}) 267 return await self.invoke({"get-user": m})
188 268
189 async def get_all_users(self): 269 async def get_all_users(self):
190 await self._set_mode(self.MODE_NORMAL)
191 return (await self.invoke({"get-all-users": {}}))["users"] 270 return (await self.invoke({"get-all-users": {}}))["users"]
192 271
193 async def new_user(self, username, permissions): 272 async def new_user(self, username, permissions):
194 await self._set_mode(self.MODE_NORMAL)
195 return await self.invoke( 273 return await self.invoke(
196 {"new-user": {"username": username, "permissions": permissions}} 274 {"new-user": {"username": username, "permissions": permissions}}
197 ) 275 )
198 276
199 async def delete_user(self, username): 277 async def delete_user(self, username):
200 await self._set_mode(self.MODE_NORMAL)
201 return await self.invoke({"delete-user": {"username": username}}) 278 return await self.invoke({"delete-user": {"username": username}})
202 279
203 async def become_user(self, username): 280 async def become_user(self, username):
204 await self._set_mode(self.MODE_NORMAL)
205 result = await self.invoke({"become-user": {"username": username}}) 281 result = await self.invoke({"become-user": {"username": username}})
206 if username == self.username: 282 if username == self.username:
207 self.saved_become_user = None 283 self.saved_become_user = None
@@ -210,15 +286,12 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
210 return result 286 return result
211 287
212 async def get_db_usage(self): 288 async def get_db_usage(self):
213 await self._set_mode(self.MODE_NORMAL)
214 return (await self.invoke({"get-db-usage": {}}))["usage"] 289 return (await self.invoke({"get-db-usage": {}}))["usage"]
215 290
216 async def get_db_query_columns(self): 291 async def get_db_query_columns(self):
217 await self._set_mode(self.MODE_NORMAL)
218 return (await self.invoke({"get-db-query-columns": {}}))["columns"] 292 return (await self.invoke({"get-db-query-columns": {}}))["columns"]
219 293
220 async def gc_status(self): 294 async def gc_status(self):
221 await self._set_mode(self.MODE_NORMAL)
222 return await self.invoke({"gc-status": {}}) 295 return await self.invoke({"gc-status": {}})
223 296
224 async def gc_mark(self, mark, where): 297 async def gc_mark(self, mark, where):
@@ -231,7 +304,6 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
231 kept. In addition, any new entries added to the database after this 304 kept. In addition, any new entries added to the database after this
232 command will be automatically marked with "mark" 305 command will be automatically marked with "mark"
233 """ 306 """
234 await self._set_mode(self.MODE_NORMAL)
235 return await self.invoke({"gc-mark": {"mark": mark, "where": where}}) 307 return await self.invoke({"gc-mark": {"mark": mark, "where": where}})
236 308
237 async def gc_sweep(self, mark): 309 async def gc_sweep(self, mark):
@@ -242,7 +314,6 @@ class AsyncClient(bb.asyncrpc.AsyncClient):
242 It is recommended to clean unused outhash entries after running this to 314 It is recommended to clean unused outhash entries after running this to
243 clean up any dangling outhashes 315 clean up any dangling outhashes
244 """ 316 """
245 await self._set_mode(self.MODE_NORMAL)
246 return await self.invoke({"gc-sweep": {"mark": mark}}) 317 return await self.invoke({"gc-sweep": {"mark": mark}})
247 318
248 319
@@ -256,10 +327,12 @@ class Client(bb.asyncrpc.Client):
256 "connect_tcp", 327 "connect_tcp",
257 "connect_websocket", 328 "connect_websocket",
258 "get_unihash", 329 "get_unihash",
330 "get_unihash_batch",
259 "report_unihash", 331 "report_unihash",
260 "report_unihash_equiv", 332 "report_unihash_equiv",
261 "get_taskhash", 333 "get_taskhash",
262 "unihash_exists", 334 "unihash_exists",
335 "unihash_exists_batch",
263 "get_outhash", 336 "get_outhash",
264 "get_stats", 337 "get_stats",
265 "reset_stats", 338 "reset_stats",
@@ -283,83 +356,3 @@ class Client(bb.asyncrpc.Client):
283 356
284 def _get_async_client(self): 357 def _get_async_client(self):
285 return AsyncClient(self.username, self.password) 358 return AsyncClient(self.username, self.password)
286
287
288class ClientPool(bb.asyncrpc.ClientPool):
289 def __init__(
290 self,
291 address,
292 max_clients,
293 *,
294 username=None,
295 password=None,
296 become=None,
297 ):
298 super().__init__(max_clients)
299 self.address = address
300 self.username = username
301 self.password = password
302 self.become = become
303
304 async def _new_client(self):
305 client = await create_async_client(
306 self.address,
307 username=self.username,
308 password=self.password,
309 )
310 if self.become:
311 await client.become_user(self.become)
312 return client
313
314 def _run_key_tasks(self, queries, call):
315 results = {key: None for key in queries.keys()}
316
317 def make_task(key, args):
318 async def task(client):
319 nonlocal results
320 unihash = await call(client, args)
321 results[key] = unihash
322
323 return task
324
325 def gen_tasks():
326 for key, args in queries.items():
327 yield make_task(key, args)
328
329 self.run_tasks(gen_tasks())
330 return results
331
332 def get_unihashes(self, queries):
333 """
334 Query multiple unihashes in parallel.
335
336 The queries argument is a dictionary with arbitrary key. The values
337 must be a tuple of (method, taskhash).
338
339 Returns a dictionary with a corresponding key for each input key, and
340 the value is the queried unihash (which might be none if the query
341 failed)
342 """
343
344 async def call(client, args):
345 method, taskhash = args
346 return await client.get_unihash(method, taskhash)
347
348 return self._run_key_tasks(queries, call)
349
350 def unihashes_exist(self, queries):
351 """
352 Query multiple unihash existence checks in parallel.
353
354 The queries argument is a dictionary with arbitrary key. The values
355 must be a unihash.
356
357 Returns a dictionary with a corresponding key for each input key, and
358 the value is True or False if the unihash is known by the server (or
359 None if there was a failure)
360 """
361
362 async def call(client, unihash):
363 return await client.unihash_exists(unihash)
364
365 return self._run_key_tasks(queries, call)
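With ClientPool removed, callers batch queries through the client itself. A hypothetical sketch of the new batch calls (the address and hashes below are placeholders; create_client is this package's existing factory):

    from hashserv import create_client

    client = create_client("unix:///run/hashserv.sock")  # placeholder address

    # Both batch calls keep many queries in flight over one streaming
    # connection; a miss comes back as None / False instead of raising.
    unihashes = client.get_unihash_batch([
        ("TestMethod", "8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a"),
        ("TestMethod", "0000000000000000000000000000000000000000"),
    ])
    known = client.unihash_exists_batch([u for u in unihashes if u is not None])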
diff --git a/bitbake/lib/hashserv/tests.py b/bitbake/lib/hashserv/tests.py
index 0809453cf8..13ccb20ebf 100644
--- a/bitbake/lib/hashserv/tests.py
+++ b/bitbake/lib/hashserv/tests.py
@@ -8,7 +8,6 @@
8from . import create_server, create_client 8from . import create_server, create_client
9from .server import DEFAULT_ANON_PERMS, ALL_PERMISSIONS 9from .server import DEFAULT_ANON_PERMS, ALL_PERMISSIONS
10from bb.asyncrpc import InvokeError 10from bb.asyncrpc import InvokeError
11from .client import ClientPool
12import hashlib 11import hashlib
13import logging 12import logging
14import multiprocessing 13import multiprocessing
@@ -94,9 +93,6 @@ class HashEquivalenceTestSetup(object):
94 return self.start_client(self.auth_server_address, user["username"], user["token"]) 93 return self.start_client(self.auth_server_address, user["username"], user["token"])
95 94
96 def setUp(self): 95 def setUp(self):
97 if sys.version_info < (3, 5, 0):
98 self.skipTest('Python 3.5 or later required')
99
100 self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-hashserv') 96 self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-hashserv')
101 self.addCleanup(self.temp_dir.cleanup) 97 self.addCleanup(self.temp_dir.cleanup)
102 98
@@ -555,8 +551,7 @@ class HashEquivalenceCommonTests(object):
555 # shares a taskhash with Task 2 551 # shares a taskhash with Task 2
556 self.assertClientGetHash(self.client, taskhash2, unihash2) 552 self.assertClientGetHash(self.client, taskhash2, unihash2)
557 553
558 554 def test_get_unihash_batch(self):
559 def test_client_pool_get_unihashes(self):
560 TEST_INPUT = ( 555 TEST_INPUT = (
561 # taskhash outhash unihash 556 # taskhash outhash unihash
562 ('8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a', 'afe240a439959ce86f5e322f8c208e1fedefea9e813f2140c81af866cc9edf7e','218e57509998197d570e2c98512d0105985dffc9'), 557 ('8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a', 'afe240a439959ce86f5e322f8c208e1fedefea9e813f2140c81af866cc9edf7e','218e57509998197d570e2c98512d0105985dffc9'),
@@ -573,28 +568,27 @@ class HashEquivalenceCommonTests(object):
573 "6b6be7a84ab179b4240c4302518dc3f6", 568 "6b6be7a84ab179b4240c4302518dc3f6",
574 ) 569 )
575 570
576 with ClientPool(self.server_address, 10) as client_pool: 571 for taskhash, outhash, unihash in TEST_INPUT:
577 for taskhash, outhash, unihash in TEST_INPUT: 572 self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
578 self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
579
580 query = {idx: (self.METHOD, data[0]) for idx, data in enumerate(TEST_INPUT)}
581 for idx, taskhash in enumerate(EXTRA_QUERIES):
582 query[idx + len(TEST_INPUT)] = (self.METHOD, taskhash)
583
584 result = client_pool.get_unihashes(query)
585
586 self.assertDictEqual(result, {
587 0: "218e57509998197d570e2c98512d0105985dffc9",
588 1: "218e57509998197d570e2c98512d0105985dffc9",
589 2: "218e57509998197d570e2c98512d0105985dffc9",
590 3: "3b5d3d83f07f259e9086fcb422c855286e18a57d",
591 4: "f46d3fbb439bd9b921095da657a4de906510d2cd",
592 5: "f46d3fbb439bd9b921095da657a4de906510d2cd",
593 6: "05d2a63c81e32f0a36542ca677e8ad852365c538",
594 7: None,
595 })
596 573
597 def test_client_pool_unihash_exists(self): 574
575 result = self.client.get_unihash_batch(
576 [(self.METHOD, data[0]) for data in TEST_INPUT] +
577 [(self.METHOD, e) for e in EXTRA_QUERIES]
578 )
579
580 self.assertListEqual(result, [
581 "218e57509998197d570e2c98512d0105985dffc9",
582 "218e57509998197d570e2c98512d0105985dffc9",
583 "218e57509998197d570e2c98512d0105985dffc9",
584 "3b5d3d83f07f259e9086fcb422c855286e18a57d",
585 "f46d3fbb439bd9b921095da657a4de906510d2cd",
586 "f46d3fbb439bd9b921095da657a4de906510d2cd",
587 "05d2a63c81e32f0a36542ca677e8ad852365c538",
588 None,
589 ])
590
591 def test_unihash_exists_batch(self):
598 TEST_INPUT = ( 592 TEST_INPUT = (
599 # taskhash outhash unihash 593 # taskhash outhash unihash
600 ('8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a', 'afe240a439959ce86f5e322f8c208e1fedefea9e813f2140c81af866cc9edf7e','218e57509998197d570e2c98512d0105985dffc9'), 594 ('8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a', 'afe240a439959ce86f5e322f8c208e1fedefea9e813f2140c81af866cc9edf7e','218e57509998197d570e2c98512d0105985dffc9'),
@@ -614,28 +608,24 @@ class HashEquivalenceCommonTests(object):
614 result_unihashes = set() 608 result_unihashes = set()
615 609
616 610
617 with ClientPool(self.server_address, 10) as client_pool: 611 for taskhash, outhash, unihash in TEST_INPUT:
618 for taskhash, outhash, unihash in TEST_INPUT: 612 result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
619 result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash) 613 result_unihashes.add(result["unihash"])
620 result_unihashes.add(result["unihash"])
621
622 query = {}
623 expected = {}
624 614
625 for _, _, unihash in TEST_INPUT: 615 query = []
626 idx = len(query) 616 expected = []
627 query[idx] = unihash
628 expected[idx] = unihash in result_unihashes
629 617
618 for _, _, unihash in TEST_INPUT:
619 query.append(unihash)
620 expected.append(unihash in result_unihashes)
630 621
631 for unihash in EXTRA_QUERIES:
632 idx = len(query)
633 query[idx] = unihash
634 expected[idx] = False
635 622
636 result = client_pool.unihashes_exist(query) 623 for unihash in EXTRA_QUERIES:
637 self.assertDictEqual(result, expected) 624 query.append(unihash)
625 expected.append(False)
638 626
627 result = self.client.unihash_exists_batch(query)
628 self.assertListEqual(result, expected)
639 629
640 def test_auth_read_perms(self): 630 def test_auth_read_perms(self):
641 admin_client = self.start_auth_server() 631 admin_client = self.start_auth_server()
diff --git a/bitbake/lib/prserv/__init__.py b/bitbake/lib/prserv/__init__.py
index 38ced818ad..a817b03c1e 100644
--- a/bitbake/lib/prserv/__init__.py
+++ b/bitbake/lib/prserv/__init__.py
@@ -4,17 +4,92 @@
4# SPDX-License-Identifier: GPL-2.0-only 4# SPDX-License-Identifier: GPL-2.0-only
5# 5#
6 6
7__version__ = "1.0.0"
8 7
9import os, time 8__version__ = "2.0.0"
10import sys,logging
11 9
12def init_logger(logfile, loglevel): 10import logging
13 numeric_level = getattr(logging, loglevel.upper(), None) 11logger = logging.getLogger("BitBake.PRserv")
14 if not isinstance(numeric_level, int):
15 raise ValueError('Invalid log level: %s' % loglevel)
16 FORMAT = '%(asctime)-15s %(message)s'
17 logging.basicConfig(level=numeric_level, filename=logfile, format=FORMAT)
18 12
19class NotFoundError(Exception): 13from bb.asyncrpc.client import parse_address, ADDR_TYPE_UNIX, ADDR_TYPE_WS
20 pass 14
15def create_server(addr, dbpath, upstream=None, read_only=False):
16 from . import serv
17
18 s = serv.PRServer(dbpath, upstream=upstream, read_only=read_only)
19 host, port = addr.split(":")
20 s.start_tcp_server(host, int(port))
21
22 return s
23
24def increase_revision(ver):
25 """Take a revision string such as "1" or "1.2.3" or even a number and increase its last number.
26 This fails if the last number is not an integer"""
27
28 fields=str(ver).split('.')
29 last = fields[-1]
30
31 try:
32 val = int(last)
33 except Exception as e:
34 logger.critical("Unable to increase revision value %s: %s" % (ver, e))
35 raise e
36
37 return ".".join(fields[0:-1] + [str(val + 1)])
38
39def _revision_greater_or_equal(rev1, rev2):
40 """Compares x.y.z revision numbers, using integer comparison
41 Returns True if rev1 is greater or equal to rev2"""
42
43 fields1 = rev1.split(".")
44 fields2 = rev2.split(".")
45 l1 = len(fields1)
46 l2 = len(fields2)
47
48 for i in range(l1):
49 val1 = int(fields1[i])
50 if i < l2:
51 val2 = int(fields2[i])
52 if val2 < val1:
53 return True
54 elif val2 > val1:
55 return False
56 else:
57 return True
58 return True
59
60def revision_smaller(rev1, rev2):
61 """Compares x.y.z revision numbers, using integer comparison
62 Returns True if rev1 is strictly smaller than rev2"""
63 return not(_revision_greater_or_equal(rev1, rev2))
64
65def revision_greater(rev1, rev2):
66 """Compares x.y.z revision numbers, using integer comparison
67 Returns True if rev1 is strictly greater than rev2"""
68 return _revision_greater_or_equal(rev1, rev2) and (rev1 != rev2)
69
70def create_client(addr):
71 from . import client
72
73 c = client.PRClient()
74
75 try:
76 (typ, a) = parse_address(addr)
77 c.connect_tcp(*a)
78 return c
79 except Exception as e:
80 c.close()
81 raise e
82
83async def create_async_client(addr):
84 from . import client
85
86 c = client.PRAsyncClient()
87
88 try:
89 (typ, a) = parse_address(addr)
90 await c.connect_tcp(*a)
91 return c
92
93 except Exception as e:
94 await c.close()
95 raise e
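The revision helpers above treat a revision as dot-separated integer fields. For reference, a few expected results, consistent with the assertions in the new test suite at the end of this series (assumes BitBake's lib/ directory is on sys.path):

    from prserv import increase_revision, revision_greater, revision_smaller

    assert increase_revision("1") == "2"
    assert increase_revision("1.2.3") == "1.2.4"
    assert increase_revision("1.9") == "1.10"   # fields compare as integers
    assert revision_greater("1.20", "1.3")      # 20 > 3, unlike string order
    assert revision_smaller("1.3", "1.20")
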
diff --git a/bitbake/lib/prserv/client.py b/bitbake/lib/prserv/client.py
index 6b81356fac..9f5794c433 100644
--- a/bitbake/lib/prserv/client.py
+++ b/bitbake/lib/prserv/client.py
@@ -6,45 +6,67 @@
6 6
7import logging 7import logging
8import bb.asyncrpc 8import bb.asyncrpc
9from . import create_async_client
9 10
10logger = logging.getLogger("BitBake.PRserv") 11logger = logging.getLogger("BitBake.PRserv")
11 12
12class PRAsyncClient(bb.asyncrpc.AsyncClient): 13class PRAsyncClient(bb.asyncrpc.AsyncClient):
13 def __init__(self): 14 def __init__(self):
14 super().__init__('PRSERVICE', '1.0', logger) 15 super().__init__("PRSERVICE", "1.0", logger)
15 16
16 async def getPR(self, version, pkgarch, checksum): 17 async def getPR(self, version, pkgarch, checksum, history=False):
17 response = await self.invoke( 18 response = await self.invoke(
18 {'get-pr': {'version': version, 'pkgarch': pkgarch, 'checksum': checksum}} 19 {"get-pr": {"version": version, "pkgarch": pkgarch, "checksum": checksum, "history": history}}
19 ) 20 )
20 if response: 21 if response:
21 return response['value'] 22 return response["value"]
23
24 async def test_pr(self, version, pkgarch, checksum, history=False):
25 response = await self.invoke(
26 {"test-pr": {"version": version, "pkgarch": pkgarch, "checksum": checksum, "history": history}}
27 )
28 if response:
29 return response["value"]
30
31 async def test_package(self, version, pkgarch):
32 response = await self.invoke(
33 {"test-package": {"version": version, "pkgarch": pkgarch}}
34 )
35 if response:
36 return response["value"]
37
38 async def max_package_pr(self, version, pkgarch):
39 response = await self.invoke(
40 {"max-package-pr": {"version": version, "pkgarch": pkgarch}}
41 )
42 if response:
43 return response["value"]
22 44
23 async def importone(self, version, pkgarch, checksum, value): 45 async def importone(self, version, pkgarch, checksum, value):
24 response = await self.invoke( 46 response = await self.invoke(
25 {'import-one': {'version': version, 'pkgarch': pkgarch, 'checksum': checksum, 'value': value}} 47 {"import-one": {"version": version, "pkgarch": pkgarch, "checksum": checksum, "value": value}}
26 ) 48 )
27 if response: 49 if response:
28 return response['value'] 50 return response["value"]
29 51
30 async def export(self, version, pkgarch, checksum, colinfo): 52 async def export(self, version, pkgarch, checksum, colinfo, history=False):
31 response = await self.invoke( 53 response = await self.invoke(
32 {'export': {'version': version, 'pkgarch': pkgarch, 'checksum': checksum, 'colinfo': colinfo}} 54 {"export": {"version": version, "pkgarch": pkgarch, "checksum": checksum, "colinfo": colinfo, "history": history}}
33 ) 55 )
34 if response: 56 if response:
35 return (response['metainfo'], response['datainfo']) 57 return (response["metainfo"], response["datainfo"])
36 58
37 async def is_readonly(self): 59 async def is_readonly(self):
38 response = await self.invoke( 60 response = await self.invoke(
39 {'is-readonly': {}} 61 {"is-readonly": {}}
40 ) 62 )
41 if response: 63 if response:
42 return response['readonly'] 64 return response["readonly"]
43 65
44class PRClient(bb.asyncrpc.Client): 66class PRClient(bb.asyncrpc.Client):
45 def __init__(self): 67 def __init__(self):
46 super().__init__() 68 super().__init__()
47 self._add_methods('getPR', 'importone', 'export', 'is_readonly') 69 self._add_methods("getPR", "test_pr", "test_package", "max_package_pr", "importone", "export", "is_readonly")
48 70
49 def _get_async_client(self): 71 def _get_async_client(self):
50 return PRAsyncClient() 72 return PRAsyncClient()
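A short sketch of driving the extended synchronous client (the server address is illustrative; version, pkgarch and checksum reuse fixtures from the new test suite below):

    import prserv

    version = "dummy-1.0-r0"
    pkgarch = "core2-64"
    checksum = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a0"

    client = prserv.create_client("localhost:8585")  # illustrative address
    try:
        # Probe without allocating anything...
        if not client.test_package(version, pkgarch):
            print("package not yet known to the PR server")
        # ...then allocate/fetch a value. history=True returns the first
        # value ever recorded for this (version, pkgarch, checksum).
        pr = client.getPR(version, pkgarch, checksum)
        first_pr = client.getPR(version, pkgarch, checksum, True)
    finally:
        client.close()
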
diff --git a/bitbake/lib/prserv/db.py b/bitbake/lib/prserv/db.py
index b4bda7078c..2da493ddf5 100644
--- a/bitbake/lib/prserv/db.py
+++ b/bitbake/lib/prserv/db.py
@@ -8,19 +8,13 @@ import logging
8import os.path 8import os.path
9import errno 9import errno
10import prserv 10import prserv
11import time 11import sqlite3
12 12
13try: 13from contextlib import closing
14 import sqlite3 14from . import increase_revision, revision_greater, revision_smaller
15except ImportError:
16 from pysqlite2 import dbapi2 as sqlite3
17 15
18logger = logging.getLogger("BitBake.PRserv") 16logger = logging.getLogger("BitBake.PRserv")
19 17
20sqlversion = sqlite3.sqlite_version_info
21if sqlversion[0] < 3 or (sqlversion[0] == 3 and sqlversion[1] < 3):
22 raise Exception("sqlite3 version 3.3.0 or later is required.")
23
24# 18#
25# "No History" mode - for a given query tuple (version, pkgarch, checksum), 19# "No History" mode - for a given query tuple (version, pkgarch, checksum),
26# the returned value will be the largest among all the values of the same 20# the returned value will be the largest among all the values of the same
@@ -29,245 +23,232 @@ if sqlversion[0] < 3 or (sqlversion[0] == 3 and sqlversion[1] < 3):
29# "History" mode - Return a new higher value for previously unseen query 23# "History" mode - Return a new higher value for previously unseen query
30# tuple (version, pkgarch, checksum), otherwise return historical value. 24# tuple (version, pkgarch, checksum), otherwise return historical value.
31# Value can decrement if returning to a previous build. 25# Value can decrement if returning to a previous build.
32#
33 26
34class PRTable(object): 27class PRTable(object):
35 def __init__(self, conn, table, nohist, read_only): 28 def __init__(self, conn, table, read_only):
36 self.conn = conn 29 self.conn = conn
37 self.nohist = nohist
38 self.read_only = read_only 30 self.read_only = read_only
39 self.dirty = False 31 self.table = table
40 if nohist: 32
41 self.table = "%s_nohist" % table 33 # Creating the table even if the server is read-only.
42 else: 34 # This avoids a race condition if a shared database
43 self.table = "%s_hist" % table 35 # is accessed by a read-only server first.
44 36
45 if self.read_only: 37 with closing(self.conn.cursor()) as cursor:
46 table_exists = self._execute( 38 cursor.execute("CREATE TABLE IF NOT EXISTS %s \
47 "SELECT count(*) FROM sqlite_master \
48 WHERE type='table' AND name='%s'" % (self.table))
49 if not table_exists:
50 raise prserv.NotFoundError
51 else:
52 self._execute("CREATE TABLE IF NOT EXISTS %s \
53 (version TEXT NOT NULL, \ 39 (version TEXT NOT NULL, \
54 pkgarch TEXT NOT NULL, \ 40 pkgarch TEXT NOT NULL, \
55 checksum TEXT NOT NULL, \ 41 checksum TEXT NOT NULL, \
56 value INTEGER, \ 42 value TEXT, \
57 PRIMARY KEY (version, pkgarch, checksum));" % self.table) 43 PRIMARY KEY (version, pkgarch, checksum, value));" % self.table)
58
59 def _execute(self, *query):
60 """Execute a query, waiting to acquire a lock if necessary"""
61 start = time.time()
62 end = start + 20
63 while True:
64 try:
65 return self.conn.execute(*query)
66 except sqlite3.OperationalError as exc:
67 if 'is locked' in str(exc) and end > time.time():
68 continue
69 raise exc
70
71 def sync(self):
72 if not self.read_only:
73 self.conn.commit() 44 self.conn.commit()
74 self._execute("BEGIN EXCLUSIVE TRANSACTION") 45
75 46 def _extremum_value(self, rows, is_max):
76 def sync_if_dirty(self): 47 value = None
77 if self.dirty: 48
78 self.sync() 49 for row in rows:
79 self.dirty = False 50 current_value = row[0]
80 51 if value is None:
81 def _getValueHist(self, version, pkgarch, checksum): 52 value = current_value
82 data=self._execute("SELECT value FROM %s WHERE version=? AND pkgarch=? AND checksum=?;" % self.table, 53 else:
83 (version, pkgarch, checksum)) 54 if is_max:
84 row=data.fetchone() 55 is_new_extremum = revision_greater(current_value, value)
85 if row is not None:
86 return row[0]
87 else:
88 #no value found, try to insert
89 if self.read_only:
90 data = self._execute("SELECT ifnull(max(value)+1,0) FROM %s where version=? AND pkgarch=?;" % (self.table),
91 (version, pkgarch))
92 row = data.fetchone()
93 if row is not None:
94 return row[0]
95 else: 56 else:
96 return 0 57 is_new_extremum = revision_smaller(current_value, value)
58 if is_new_extremum:
59 value = current_value
60 return value
61
62 def _max_value(self, rows):
63 return self._extremum_value(rows, True)
97 64
98 try: 65 def _min_value(self, rows):
99 self._execute("INSERT INTO %s VALUES (?, ?, ?, (select ifnull(max(value)+1,0) from %s where version=? AND pkgarch=?));" 66 return self._extremum_value(rows, False)
100 % (self.table,self.table),
101 (version,pkgarch, checksum,version, pkgarch))
102 except sqlite3.IntegrityError as exc:
103 logger.error(str(exc))
104 67
105 self.dirty = True 68 def test_package(self, version, pkgarch):
69 """Returns whether the specified package version is found in the database for the specified architecture"""
106 70
107 data=self._execute("SELECT value FROM %s WHERE version=? AND pkgarch=? AND checksum=?;" % self.table, 71 # Returns True if a row exists, False otherwise
108 (version, pkgarch, checksum)) 72 with closing(self.conn.cursor()) as cursor:
73 data=cursor.execute("SELECT value FROM %s WHERE version=? AND pkgarch=?;" % self.table,
74 (version, pkgarch))
109 row=data.fetchone() 75 row=data.fetchone()
110 if row is not None: 76 if row is not None:
111 return row[0] 77 return True
112 else: 78 else:
113 raise prserv.NotFoundError 79 return False
114 80
115 def _getValueNohist(self, version, pkgarch, checksum): 81 def test_checksum_value(self, version, pkgarch, checksum, value):
116 data=self._execute("SELECT value FROM %s \ 82 """Returns whether the specified value is found in the database for the specified package, architecture and checksum"""
117 WHERE version=? AND pkgarch=? AND checksum=? AND \
118 value >= (select max(value) from %s where version=? AND pkgarch=?);"
119 % (self.table, self.table),
120 (version, pkgarch, checksum, version, pkgarch))
121 row=data.fetchone()
122 if row is not None:
123 return row[0]
124 else:
125 #no value found, try to insert
126 if self.read_only:
127 data = self._execute("SELECT ifnull(max(value)+1,0) FROM %s where version=? AND pkgarch=?;" % (self.table),
128 (version, pkgarch))
129 row = data.fetchone()
130 if row is not None:
131 return row[0]
132 else:
133 return 0
134 83
135 try: 84 with closing(self.conn.cursor()) as cursor:
136 self._execute("INSERT OR REPLACE INTO %s VALUES (?, ?, ?, (select ifnull(max(value)+1,0) from %s where version=? AND pkgarch=?));" 85 data=cursor.execute("SELECT value FROM %s WHERE version=? AND pkgarch=? and checksum=? and value=?;" % self.table,
137 % (self.table,self.table), 86 (version, pkgarch, checksum, value))
138 (version, pkgarch, checksum, version, pkgarch)) 87 row=data.fetchone()
139 except sqlite3.IntegrityError as exc: 88 if row is not None:
140 logger.error(str(exc)) 89 return True
141 self.conn.rollback() 90 else:
91 return False
142 92
143 self.dirty = True 93 def test_value(self, version, pkgarch, value):
94 """Returns whether the specified value is found in the database for the specified package and architecture"""
144 95
145 data=self._execute("SELECT value FROM %s WHERE version=? AND pkgarch=? AND checksum=?;" % self.table, 96 # Returns True if a row exists, False otherwise
146 (version, pkgarch, checksum)) 97 with closing(self.conn.cursor()) as cursor:
98 data=cursor.execute("SELECT value FROM %s WHERE version=? AND pkgarch=? and value=?;" % self.table,
99 (version, pkgarch, value))
147 row=data.fetchone() 100 row=data.fetchone()
148 if row is not None: 101 if row is not None:
149 return row[0] 102 return True
150 else: 103 else:
151 raise prserv.NotFoundError 104 return False
152 105
153 def getValue(self, version, pkgarch, checksum): 106
154 if self.nohist: 107 def find_package_max_value(self, version, pkgarch):
155 return self._getValueNohist(version, pkgarch, checksum) 108 """Returns the greatest value for (version, pkgarch), or None if not found. Doesn't create a new value"""
156 else: 109
157 return self._getValueHist(version, pkgarch, checksum) 110 with closing(self.conn.cursor()) as cursor:
158 111 data = cursor.execute("SELECT value FROM %s where version=? AND pkgarch=?;" % (self.table),
159 def _importHist(self, version, pkgarch, checksum, value): 112 (version, pkgarch))
160 if self.read_only: 113 rows = data.fetchall()
161 return None 114 value = self._max_value(rows)
162 115 return value
163 val = None 116
164 data = self._execute("SELECT value FROM %s WHERE version=? AND pkgarch=? AND checksum=?;" % self.table, 117 def find_value(self, version, pkgarch, checksum, history=False):
165 (version, pkgarch, checksum)) 118 """Returns the value for the specified checksum if found or None otherwise."""
166 row = data.fetchone() 119
167 if row is not None: 120 if history:
168 val=row[0] 121 return self.find_min_value(version, pkgarch, checksum)
169 else: 122 else:
170 #no value found, try to insert 123 return self.find_max_value(version, pkgarch, checksum)
171 try: 124
172 self._execute("INSERT INTO %s VALUES (?, ?, ?, ?);" % (self.table), 125
126 def _find_extremum_value(self, version, pkgarch, checksum, is_max):
127 """Returns the maximum (if is_max is True) or minimum (if is_max is False) value
128 for (version, pkgarch, checksum), or None if not found. Doesn't create a new value"""
129
130 with closing(self.conn.cursor()) as cursor:
131 data = cursor.execute("SELECT value FROM %s where version=? AND pkgarch=? AND checksum=?;" % (self.table),
132 (version, pkgarch, checksum))
133 rows = data.fetchall()
134 return self._extremum_value(rows, is_max)
135
136 def find_max_value(self, version, pkgarch, checksum):
137 return self._find_extremum_value(version, pkgarch, checksum, True)
138
139 def find_min_value(self, version, pkgarch, checksum):
140 return self._find_extremum_value(version, pkgarch, checksum, False)
141
142 def find_new_subvalue(self, version, pkgarch, base):
143 """Take and increase the greatest "<base>.y" value for (version, pkgarch), or return "<base>.0" if not found.
144 This doesn't store a new value."""
145
146 with closing(self.conn.cursor()) as cursor:
147 data = cursor.execute("SELECT value FROM %s where version=? AND pkgarch=? AND value LIKE '%s.%%';" % (self.table, base),
148 (version, pkgarch))
149 rows = data.fetchall()
150 value = self._max_value(rows)
151
152 if value is not None:
153 return increase_revision(value)
154 else:
155 return base + ".0"
156
157 def store_value(self, version, pkgarch, checksum, value):
158 """Store value in the database"""
159
160 if not self.read_only and not self.test_checksum_value(version, pkgarch, checksum, value):
161 with closing(self.conn.cursor()) as cursor:
162 cursor.execute("INSERT INTO %s VALUES (?, ?, ?, ?);" % (self.table),
173 (version, pkgarch, checksum, value)) 163 (version, pkgarch, checksum, value))
174 except sqlite3.IntegrityError as exc: 164 self.conn.commit()
175 logger.error(str(exc))
176 165
177 self.dirty = True 166 def _get_value(self, version, pkgarch, checksum, history):
178 167
179 data = self._execute("SELECT value FROM %s WHERE version=? AND pkgarch=? AND checksum=?;" % self.table, 168 max_value = self.find_package_max_value(version, pkgarch)
180 (version, pkgarch, checksum))
181 row = data.fetchone()
182 if row is not None:
183 val = row[0]
184 return val
185 169
186 def _importNohist(self, version, pkgarch, checksum, value): 170 if max_value is None:
187 if self.read_only: 171 # version, pkgarch completely unknown. Return initial value.
188 return None 172 return "0"
189 173
190 try: 174 value = self.find_value(version, pkgarch, checksum, history)
191 #try to insert 175
192 self._execute("INSERT INTO %s VALUES (?, ?, ?, ?);" % (self.table), 176 if value is None:
193 (version, pkgarch, checksum,value)) 177 # version, pkgarch found but not checksum. Create a new value from the maximum one
194 except sqlite3.IntegrityError as exc: 178 return increase_revision(max_value)
195 #already have the record, try to update 179
196 try: 180 if history:
197 self._execute("UPDATE %s SET value=? WHERE version=? AND pkgarch=? AND checksum=? AND value<?" 181 return value
198 % (self.table), 182
199 (value,version,pkgarch,checksum,value)) 183 # "no history" mode - If the value is not the maximum value for the package, we need to increase it.
200 except sqlite3.IntegrityError as exc: 184 if revision_greater(max_value, value):
201 logger.error(str(exc)) 185 return increase_revision(max_value)
202
203 self.dirty = True
204
205 data = self._execute("SELECT value FROM %s WHERE version=? AND pkgarch=? AND checksum=? AND value>=?;" % self.table,
206 (version,pkgarch,checksum,value))
207 row=data.fetchone()
208 if row is not None:
209 return row[0]
210 else: 186 else:
211 return None 187 return value
188
189 def get_value(self, version, pkgarch, checksum, history):
190 value = self._get_value(version, pkgarch, checksum, history)
191 if not self.read_only:
192 self.store_value(version, pkgarch, checksum, value)
193 return value
212 194
213 def importone(self, version, pkgarch, checksum, value): 195 def importone(self, version, pkgarch, checksum, value):
214 if self.nohist: 196 self.store_value(version, pkgarch, checksum, value)
215 return self._importNohist(version, pkgarch, checksum, value) 197 return value
216 else:
217 return self._importHist(version, pkgarch, checksum, value)
218 198
219 def export(self, version, pkgarch, checksum, colinfo): 199 def export(self, version, pkgarch, checksum, colinfo, history=False):
220 metainfo = {} 200 metainfo = {}
221 #column info 201 with closing(self.conn.cursor()) as cursor:
222 if colinfo: 202 #column info
223 metainfo['tbl_name'] = self.table 203 if colinfo:
224 metainfo['core_ver'] = prserv.__version__ 204 metainfo["tbl_name"] = self.table
225 metainfo['col_info'] = [] 205 metainfo["core_ver"] = prserv.__version__
226 data = self._execute("PRAGMA table_info(%s);" % self.table) 206 metainfo["col_info"] = []
207 data = cursor.execute("PRAGMA table_info(%s);" % self.table)
208 for row in data:
209 col = {}
210 col["name"] = row["name"]
211 col["type"] = row["type"]
212 col["notnull"] = row["notnull"]
213 col["dflt_value"] = row["dflt_value"]
214 col["pk"] = row["pk"]
215 metainfo["col_info"].append(col)
216
217 #data info
218 datainfo = []
219
220 if history:
221 sqlstmt = "SELECT * FROM %s as T1 WHERE 1=1 " % self.table
222 else:
223 sqlstmt = "SELECT T1.version, T1.pkgarch, T1.checksum, T1.value FROM %s as T1, \
224 (SELECT version, pkgarch, max(value) as maxvalue FROM %s GROUP BY version, pkgarch) as T2 \
225 WHERE T1.version=T2.version AND T1.pkgarch=T2.pkgarch AND T1.value=T2.maxvalue " % (self.table, self.table)
226 sqlarg = []
227 where = ""
228 if version:
229 where += "AND T1.version=? "
230 sqlarg.append(str(version))
231 if pkgarch:
232 where += "AND T1.pkgarch=? "
233 sqlarg.append(str(pkgarch))
234 if checksum:
235 where += "AND T1.checksum=? "
236 sqlarg.append(str(checksum))
237
238 sqlstmt += where + ";"
239
240 if len(sqlarg):
241 data = cursor.execute(sqlstmt, tuple(sqlarg))
242 else:
243 data = cursor.execute(sqlstmt)
227 for row in data: 244 for row in data:
228 col = {} 245 if row["version"]:
229 col['name'] = row['name'] 246 col = {}
230 col['type'] = row['type'] 247 col["version"] = row["version"]
231 col['notnull'] = row['notnull'] 248 col["pkgarch"] = row["pkgarch"]
232 col['dflt_value'] = row['dflt_value'] 249 col["checksum"] = row["checksum"]
233 col['pk'] = row['pk'] 250 col["value"] = row["value"]
234 metainfo['col_info'].append(col) 251 datainfo.append(col)
235
236 #data info
237 datainfo = []
238
239 if self.nohist:
240 sqlstmt = "SELECT T1.version, T1.pkgarch, T1.checksum, T1.value FROM %s as T1, \
241 (SELECT version,pkgarch,max(value) as maxvalue FROM %s GROUP BY version,pkgarch) as T2 \
242 WHERE T1.version=T2.version AND T1.pkgarch=T2.pkgarch AND T1.value=T2.maxvalue " % (self.table, self.table)
243 else:
244 sqlstmt = "SELECT * FROM %s as T1 WHERE 1=1 " % self.table
245 sqlarg = []
246 where = ""
247 if version:
248 where += "AND T1.version=? "
249 sqlarg.append(str(version))
250 if pkgarch:
251 where += "AND T1.pkgarch=? "
252 sqlarg.append(str(pkgarch))
253 if checksum:
254 where += "AND T1.checksum=? "
255 sqlarg.append(str(checksum))
256
257 sqlstmt += where + ";"
258
259 if len(sqlarg):
260 data = self._execute(sqlstmt, tuple(sqlarg))
261 else:
262 data = self._execute(sqlstmt)
263 for row in data:
264 if row['version']:
265 col = {}
266 col['version'] = row['version']
267 col['pkgarch'] = row['pkgarch']
268 col['checksum'] = row['checksum']
269 col['value'] = row['value']
270 datainfo.append(col)
271 return (metainfo, datainfo) 252 return (metainfo, datainfo)
272 253
273 def dump_db(self, fd): 254 def dump_db(self, fd):
@@ -275,14 +256,13 @@ class PRTable(object):
275 for line in self.conn.iterdump(): 256 for line in self.conn.iterdump():
276 writeCount = writeCount + len(line) + 1 257 writeCount = writeCount + len(line) + 1
277 fd.write(line) 258 fd.write(line)
278 fd.write('\n') 259 fd.write("\n")
279 return writeCount 260 return writeCount
280 261
281class PRData(object): 262class PRData(object):
282 """Object representing the PR database""" 263 """Object representing the PR database"""
283 def __init__(self, filename, nohist=True, read_only=False): 264 def __init__(self, filename, read_only=False):
284 self.filename=os.path.abspath(filename) 265 self.filename=os.path.abspath(filename)
285 self.nohist=nohist
286 self.read_only = read_only 266 self.read_only = read_only
287 #build directory hierarchy 267 #build directory hierarchy
288 try: 268 try:
@@ -292,28 +272,30 @@ class PRData(object):
292 raise e 272 raise e
293 uri = "file:%s%s" % (self.filename, "?mode=ro" if self.read_only else "") 273 uri = "file:%s%s" % (self.filename, "?mode=ro" if self.read_only else "")
294 logger.debug("Opening PRServ database '%s'" % (uri)) 274 logger.debug("Opening PRServ database '%s'" % (uri))
295 self.connection=sqlite3.connect(uri, uri=True, isolation_level="EXCLUSIVE", check_same_thread = False) 275 self.connection=sqlite3.connect(uri, uri=True)
296 self.connection.row_factory=sqlite3.Row 276 self.connection.row_factory=sqlite3.Row
297 if not self.read_only: 277 self.connection.execute("PRAGMA synchronous = OFF;")
298 self.connection.execute("pragma synchronous = off;") 278 self.connection.execute("PRAGMA journal_mode = WAL;")
299 self.connection.execute("PRAGMA journal_mode = MEMORY;") 279 self.connection.commit()
300 self._tables={} 280 self._tables={}
301 281
302 def disconnect(self): 282 def disconnect(self):
283 self.connection.commit()
303 self.connection.close() 284 self.connection.close()
304 285
305 def __getitem__(self,tblname): 286 def __getitem__(self, tblname):
306 if not isinstance(tblname, str): 287 if not isinstance(tblname, str):
307 raise TypeError("tblname argument must be a string, not '%s'" % 288 raise TypeError("tblname argument must be a string, not '%s'" %
308 type(tblname)) 289 type(tblname))
309 if tblname in self._tables: 290 if tblname in self._tables:
310 return self._tables[tblname] 291 return self._tables[tblname]
311 else: 292 else:
312 tableobj = self._tables[tblname] = PRTable(self.connection, tblname, self.nohist, self.read_only) 293 tableobj = self._tables[tblname] = PRTable(self.connection, tblname, self.read_only)
313 return tableobj 294 return tableobj
314 295
315 def __delitem__(self, tblname): 296 def __delitem__(self, tblname):
316 if tblname in self._tables: 297 if tblname in self._tables:
317 del self._tables[tblname] 298 del self._tables[tblname]
318 logger.info("drop table %s" % (tblname)) 299 logger.info("drop table %s" % (tblname))
319 self.connection.execute("DROP TABLE IF EXISTS %s;" % tblname) 300 self.connection.execute("DROP TABLE IF EXISTS %s;" % tblname)
301 self.connection.commit()
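To make the two modes concrete, here is a small sketch against the reworked PRTable, mirroring the new test_db test further below (paths are illustrative):

    import os
    import tempfile
    import prserv.db as db

    version, pkgarch = "dummy-1.0-r0", "core2-64"
    checksum = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a0"

    with tempfile.TemporaryDirectory() as tmp:
        prdata = db.PRData(os.path.join(tmp, "example.sqlite3"))
        table = prdata["PRMAIN"]

        # A checksum may accumulate several values; all of them are kept.
        table.store_value(version, pkgarch, checksum, "0")
        table.store_value(version, pkgarch, checksum, "2")

        # "history" mode returns the oldest (smallest) recorded value,
        # "no history" mode the newest (largest) one.
        assert table.find_value(version, pkgarch, checksum, True) == "0"
        assert table.find_value(version, pkgarch, checksum, False) == "2"

        prdata.disconnect()
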
diff --git a/bitbake/lib/prserv/serv.py b/bitbake/lib/prserv/serv.py
index 5fc8863f70..e175886308 100644
--- a/bitbake/lib/prserv/serv.py
+++ b/bitbake/lib/prserv/serv.py
@@ -12,6 +12,7 @@ import sqlite3
12import prserv 12import prserv
13import prserv.db 13import prserv.db
14import errno 14import errno
15from . import create_async_client, revision_smaller, increase_revision
15import bb.asyncrpc 16import bb.asyncrpc
16 17
17logger = logging.getLogger("BitBake.PRserv") 18logger = logging.getLogger("BitBake.PRserv")
@@ -20,16 +21,19 @@ PIDPREFIX = "/tmp/PRServer_%s_%s.pid"
20singleton = None 21singleton = None
21 22
22class PRServerClient(bb.asyncrpc.AsyncServerConnection): 23class PRServerClient(bb.asyncrpc.AsyncServerConnection):
23 def __init__(self, socket, table, read_only): 24 def __init__(self, socket, server):
24 super().__init__(socket, 'PRSERVICE', logger) 25 super().__init__(socket, "PRSERVICE", server.logger)
26 self.server = server
27
25 self.handlers.update({ 28 self.handlers.update({
26 'get-pr': self.handle_get_pr, 29 "get-pr": self.handle_get_pr,
27 'import-one': self.handle_import_one, 30 "test-pr": self.handle_test_pr,
28 'export': self.handle_export, 31 "test-package": self.handle_test_package,
29 'is-readonly': self.handle_is_readonly, 32 "max-package-pr": self.handle_max_package_pr,
33 "import-one": self.handle_import_one,
34 "export": self.handle_export,
35 "is-readonly": self.handle_is_readonly,
30 }) 36 })
31 self.table = table
32 self.read_only = read_only
33 37
34 def validate_proto_version(self): 38 def validate_proto_version(self):
35 return (self.proto_version == (1, 0)) 39 return (self.proto_version == (1, 0))
@@ -38,104 +42,213 @@ class PRServerClient(bb.asyncrpc.AsyncServerConnection):
38 try: 42 try:
39 return await super().dispatch_message(msg) 43 return await super().dispatch_message(msg)
40 except: 44 except:
41 self.table.sync()
42 raise 45 raise
43 else: 46
44 self.table.sync_if_dirty() 47 async def handle_test_pr(self, request):
48 '''Finds the PR value corresponding to the request. If not found, returns None and doesn't insert a new value'''
49 version = request["version"]
50 pkgarch = request["pkgarch"]
51 checksum = request["checksum"]
52 history = request["history"]
53
54 value = self.server.table.find_value(version, pkgarch, checksum, history)
55 return {"value": value}
56
57 async def handle_test_package(self, request):
58 '''Tells whether there are entries for (version, pkgarch) in the db. Returns True or False'''
59 version = request["version"]
60 pkgarch = request["pkgarch"]
61
62 value = self.server.table.test_package(version, pkgarch)
63 return {"value": value}
64
65 async def handle_max_package_pr(self, request):
66 '''Finds the greatest PR value for (version, pkgarch) in the db. Returns None if no entry was found'''
67 version = request["version"]
68 pkgarch = request["pkgarch"]
69
70 value = self.server.table.find_package_max_value(version, pkgarch)
71 return {"value": value}
45 72
46 async def handle_get_pr(self, request): 73 async def handle_get_pr(self, request):
47 version = request['version'] 74 version = request["version"]
48 pkgarch = request['pkgarch'] 75 pkgarch = request["pkgarch"]
49 checksum = request['checksum'] 76 checksum = request["checksum"]
77 history = request["history"]
50 78
51 response = None 79 if self.upstream_client is None:
52 try: 80 value = self.server.table.get_value(version, pkgarch, checksum, history)
53 value = self.table.getValue(version, pkgarch, checksum) 81 return {"value": value}
54 response = {'value': value}
55 except prserv.NotFoundError:
56 logger.error("can not find value for (%s, %s)",version, checksum)
57 except sqlite3.Error as exc:
58 logger.error(str(exc))
59 82
60 return response 83 # We have an upstream server.
84 # Check whether the local server already knows the requested configuration.
85 # If the configuration is a new one, the generated value we will add will
86 # depend on what's on the upstream server. That's why we're calling find_value()
87 # instead of get_value() directly.
88
89 value = self.server.table.find_value(version, pkgarch, checksum, history)
90 upstream_max = await self.upstream_client.max_package_pr(version, pkgarch)
91
92 if value is not None:
93
94 # The configuration is already known locally.
95
96 if history:
97 value = self.server.table.get_value(version, pkgarch, checksum, history)
98 else:
99 existing_value = value
100 # In "no history" mode, we need to make sure the value doesn't decrease
101 # and is no smaller than the maximum upstream value
102 # and the maximum local value
103
104 local_max = self.server.table.find_package_max_value(version, pkgarch)
105 if revision_smaller(value, local_max):
106 value = increase_revision(local_max)
107
108 if revision_smaller(value, upstream_max):
109 # Ask upstream whether it knows the checksum
110 upstream_value = await self.upstream_client.test_pr(version, pkgarch, checksum)
111 if upstream_value is None:
112 # Upstream doesn't have our checksum, let's create a new one
113 value = upstream_max + ".0"
114 else:
115 # Fine to take the same value as upstream
116 value = upstream_max
117
118 if not value == existing_value and not self.server.read_only:
119 self.server.table.store_value(version, pkgarch, checksum, value)
120
121 return {"value": value}
122
123 # The configuration is a new one for the local server
124 # Let's ask the upstream server whether it knows it
125
126 known_upstream = await self.upstream_client.test_package(version, pkgarch)
127
128 if not known_upstream:
129
130 # The package is not known upstream, must be a local-only package
131 # Let's compute the PR number using the local-only method
132
133 value = self.server.table.get_value(version, pkgarch, checksum, history)
134 return {"value": value}
135
136 # The package is known upstream, let's ask the upstream server
137 # whether it knows our new output hash
138
139 value = await self.upstream_client.test_pr(version, pkgarch, checksum)
140
141 if value is not None:
142
143 # Upstream knows this output hash, let's store it and use it too.
144
145 if not self.server.read_only:
146 self.server.table.store_value(version, pkgarch, checksum, value)
147 # If the local server is read-only, it won't be able to store the new
148 # value in the database and will have to keep asking the upstream server
149 return {"value": value}
150
151 # The output hash doesn't exist upstream, get the most recent number from upstream (x)
152 # Then, we want to have a new PR value for the local server: x.y
153
154 upstream_max = await self.upstream_client.max_package_pr(version, pkgarch)
155 # Here we know that the package is known upstream, so upstream_max can't be None
156 subvalue = self.server.table.find_new_subvalue(version, pkgarch, upstream_max)
157
158 if not self.server.read_only:
159 self.server.table.store_value(version, pkgarch, checksum, subvalue)
160
161 return {"value": subvalue}
162
163 async def process_requests(self):
164 if self.server.upstream is not None:
165 self.upstream_client = await create_async_client(self.server.upstream)
166 else:
167 self.upstream_client = None
168
169 try:
170 await super().process_requests()
171 finally:
172 if self.upstream_client is not None:
173 await self.upstream_client.close()
61 174
62 async def handle_import_one(self, request): 175 async def handle_import_one(self, request):
63 response = None 176 response = None
64 if not self.read_only: 177 if not self.server.read_only:
65 version = request['version'] 178 version = request["version"]
66 pkgarch = request['pkgarch'] 179 pkgarch = request["pkgarch"]
67 checksum = request['checksum'] 180 checksum = request["checksum"]
68 value = request['value'] 181 value = request["value"]
69 182
70 value = self.table.importone(version, pkgarch, checksum, value) 183 value = self.server.table.importone(version, pkgarch, checksum, value)
71 if value is not None: 184 if value is not None:
72 response = {'value': value} 185 response = {"value": value}
73 186
74 return response 187 return response
75 188
76 async def handle_export(self, request): 189 async def handle_export(self, request):
77 version = request['version'] 190 version = request["version"]
78 pkgarch = request['pkgarch'] 191 pkgarch = request["pkgarch"]
79 checksum = request['checksum'] 192 checksum = request["checksum"]
80 colinfo = request['colinfo'] 193 colinfo = request["colinfo"]
194 history = request["history"]
81 195
82 try: 196 try:
83 (metainfo, datainfo) = self.table.export(version, pkgarch, checksum, colinfo) 197 (metainfo, datainfo) = self.server.table.export(version, pkgarch, checksum, colinfo, history)
84 except sqlite3.Error as exc: 198 except sqlite3.Error as exc:
85 logger.error(str(exc)) 199 self.logger.error(str(exc))
86 metainfo = datainfo = None 200 metainfo = datainfo = None
87 201
88 return {'metainfo': metainfo, 'datainfo': datainfo} 202 return {"metainfo": metainfo, "datainfo": datainfo}
89 203
90 async def handle_is_readonly(self, request): 204 async def handle_is_readonly(self, request):
91 return {'readonly': self.read_only} 205 return {"readonly": self.server.read_only}
92 206
93class PRServer(bb.asyncrpc.AsyncServer): 207class PRServer(bb.asyncrpc.AsyncServer):
94 def __init__(self, dbfile, read_only=False): 208 def __init__(self, dbfile, read_only=False, upstream=None):
95 super().__init__(logger) 209 super().__init__(logger)
96 self.dbfile = dbfile 210 self.dbfile = dbfile
97 self.table = None 211 self.table = None
98 self.read_only = read_only 212 self.read_only = read_only
213 self.upstream = upstream
99 214
100 def accept_client(self, socket): 215 def accept_client(self, socket):
101 return PRServerClient(socket, self.table, self.read_only) 216 return PRServerClient(socket, self)
102 217
103 def start(self): 218 def start(self):
104 tasks = super().start() 219 tasks = super().start()
105 self.db = prserv.db.PRData(self.dbfile, read_only=self.read_only) 220 self.db = prserv.db.PRData(self.dbfile, read_only=self.read_only)
106 self.table = self.db["PRMAIN"] 221 self.table = self.db["PRMAIN"]
107 222
108 logger.info("Started PRServer with DBfile: %s, Address: %s, PID: %s" % 223 self.logger.info("Started PRServer with DBfile: %s, Address: %s, PID: %s" %
109 (self.dbfile, self.address, str(os.getpid()))) 224 (self.dbfile, self.address, str(os.getpid())))
110 225
226 if self.upstream is not None:
227 self.logger.info("And upstream PRServer: %s " % (self.upstream))
228
111 return tasks 229 return tasks
112 230
113 async def stop(self): 231 async def stop(self):
114 self.table.sync_if_dirty()
115 self.db.disconnect() 232 self.db.disconnect()
116 await super().stop() 233 await super().stop()
117 234
118 def signal_handler(self):
119 super().signal_handler()
120 if self.table:
121 self.table.sync()
122
123class PRServSingleton(object): 235class PRServSingleton(object):
124 def __init__(self, dbfile, logfile, host, port): 236 def __init__(self, dbfile, logfile, host, port, upstream):
125 self.dbfile = dbfile 237 self.dbfile = dbfile
126 self.logfile = logfile 238 self.logfile = logfile
127 self.host = host 239 self.host = host
128 self.port = port 240 self.port = port
241 self.upstream = upstream
129 242
130 def start(self): 243 def start(self):
131 self.prserv = PRServer(self.dbfile) 244 self.prserv = PRServer(self.dbfile, upstream=self.upstream)
132 self.prserv.start_tcp_server(socket.gethostbyname(self.host), self.port) 245 self.prserv.start_tcp_server(socket.gethostbyname(self.host), self.port)
133 self.process = self.prserv.serve_as_process(log_level=logging.WARNING) 246 self.process = self.prserv.serve_as_process(log_level=logging.WARNING)
134 247
135 if not self.prserv.address: 248 if not self.prserv.address:
136 raise PRServiceConfigError 249 raise PRServiceConfigError
137 if not self.port: 250 if not self.port:
138 self.port = int(self.prserv.address.rsplit(':', 1)[1]) 251 self.port = int(self.prserv.address.rsplit(":", 1)[1])
139 252
140def run_as_daemon(func, pidfile, logfile): 253def run_as_daemon(func, pidfile, logfile):
141 """ 254 """
@@ -171,18 +284,18 @@ def run_as_daemon(func, pidfile, logfile):
171 # stdout/stderr or it could be 'real' unix fd forking where we need 284 # stdout/stderr or it could be 'real' unix fd forking where we need
172 # to physically close the fds to prevent the program launching us from 285 # to physically close the fds to prevent the program launching us from
173 # potentially hanging on a pipe. Handle both cases. 286 # potentially hanging on a pipe. Handle both cases.
174 si = open('/dev/null', 'r') 287 si = open("/dev/null", "r")
175 try: 288 try:
176 os.dup2(si.fileno(),sys.stdin.fileno()) 289 os.dup2(si.fileno(), sys.stdin.fileno())
177 except (AttributeError, io.UnsupportedOperation): 290 except (AttributeError, io.UnsupportedOperation):
178 sys.stdin = si 291 sys.stdin = si
179 so = open(logfile, 'a+') 292 so = open(logfile, "a+")
180 try: 293 try:
181 os.dup2(so.fileno(),sys.stdout.fileno()) 294 os.dup2(so.fileno(), sys.stdout.fileno())
182 except (AttributeError, io.UnsupportedOperation): 295 except (AttributeError, io.UnsupportedOperation):
183 sys.stdout = so 296 sys.stdout = so
184 try: 297 try:
185 os.dup2(so.fileno(),sys.stderr.fileno()) 298 os.dup2(so.fileno(), sys.stderr.fileno())
186 except (AttributeError, io.UnsupportedOperation): 299 except (AttributeError, io.UnsupportedOperation):
187 sys.stderr = so 300 sys.stderr = so
188 301
@@ -200,14 +313,14 @@ def run_as_daemon(func, pidfile, logfile):
200 313
201 # write pidfile 314 # write pidfile
202 pid = str(os.getpid()) 315 pid = str(os.getpid())
203 with open(pidfile, 'w') as pf: 316 with open(pidfile, "w") as pf:
204 pf.write("%s\n" % pid) 317 pf.write("%s\n" % pid)
205 318
206 func() 319 func()
207 os.remove(pidfile) 320 os.remove(pidfile)
208 os._exit(0) 321 os._exit(0)
209 322
210def start_daemon(dbfile, host, port, logfile, read_only=False): 323def start_daemon(dbfile, host, port, logfile, read_only=False, upstream=None):
211 ip = socket.gethostbyname(host) 324 ip = socket.gethostbyname(host)
212 pidfile = PIDPREFIX % (ip, port) 325 pidfile = PIDPREFIX % (ip, port)
213 try: 326 try:
@@ -223,7 +336,7 @@ def start_daemon(dbfile, host, port, logfile, read_only=False):
223 336
224 dbfile = os.path.abspath(dbfile) 337 dbfile = os.path.abspath(dbfile)
225 def daemon_main(): 338 def daemon_main():
226 server = PRServer(dbfile, read_only=read_only) 339 server = PRServer(dbfile, read_only=read_only, upstream=upstream)
227 server.start_tcp_server(ip, port) 340 server.start_tcp_server(ip, port)
228 server.serve_forever() 341 server.serve_forever()
229 342
@@ -245,15 +358,15 @@ def stop_daemon(host, port):
245 # so at least advise the user which ports the corresponding server is listening 358 # so at least advise the user which ports the corresponding server is listening
246 ports = [] 359 ports = []
247 portstr = "" 360 portstr = ""
248 for pf in glob.glob(PIDPREFIX % (ip,'*')): 361 for pf in glob.glob(PIDPREFIX % (ip, "*")):
249 bn = os.path.basename(pf) 362 bn = os.path.basename(pf)
250 root, _ = os.path.splitext(bn) 363 root, _ = os.path.splitext(bn)
251 ports.append(root.split('_')[-1]) 364 ports.append(root.split("_")[-1])
252 if len(ports): 365 if len(ports):
253 portstr = "Wrong port? Other ports listening at %s: %s" % (host, ' '.join(ports)) 366 portstr = "Wrong port? Other ports listening at %s: %s" % (host, " ".join(ports))
254 367
255 sys.stderr.write("pidfile %s does not exist. Daemon not running? %s\n" 368 sys.stderr.write("pidfile %s does not exist. Daemon not running? %s\n"
256 % (pidfile,portstr)) 369 % (pidfile, portstr))
257 return 1 370 return 1
258 371
259 try: 372 try:
@@ -284,7 +397,7 @@ def is_running(pid):
284 return True 397 return True
285 398
286def is_local_special(host, port): 399def is_local_special(host, port):
287 if (host == 'localhost' or host == '127.0.0.1') and not port: 400 if (host == "localhost" or host == "127.0.0.1") and not port:
288 return True 401 return True
289 else: 402 else:
290 return False 403 return False
@@ -295,7 +408,7 @@ class PRServiceConfigError(Exception):
295def auto_start(d): 408def auto_start(d):
296 global singleton 409 global singleton
297 410
298 host_params = list(filter(None, (d.getVar('PRSERV_HOST') or '').split(':'))) 411 host_params = list(filter(None, (d.getVar("PRSERV_HOST") or "").split(":")))
299 if not host_params: 412 if not host_params:
300 # Shutdown any existing PR Server 413 # Shutdown any existing PR Server
301 auto_shutdown() 414 auto_shutdown()
@@ -304,12 +417,15 @@ def auto_start(d):
304 if len(host_params) != 2: 417 if len(host_params) != 2:
305 # Shutdown any existing PR Server 418 # Shutdown any existing PR Server
306 auto_shutdown() 419 auto_shutdown()
307 logger.critical('\n'.join(['PRSERV_HOST: incorrect format', 420 logger.critical("\n".join(["PRSERV_HOST: incorrect format",
308 'Usage: PRSERV_HOST = "<hostname>:<port>"'])) 421 'Usage: PRSERV_HOST = "<hostname>:<port>"']))
309 raise PRServiceConfigError 422 raise PRServiceConfigError
310 423
311 host = host_params[0].strip().lower() 424 host = host_params[0].strip().lower()
312 port = int(host_params[1]) 425 port = int(host_params[1])
426
427 upstream = d.getVar("PRSERV_UPSTREAM") or None
428
313 if is_local_special(host, port): 429 if is_local_special(host, port):
314 import bb.utils 430 import bb.utils
315 cachedir = (d.getVar("PERSISTENT_DIR") or d.getVar("CACHE")) 431 cachedir = (d.getVar("PERSISTENT_DIR") or d.getVar("CACHE"))
@@ -324,7 +440,7 @@ def auto_start(d):
324 auto_shutdown() 440 auto_shutdown()
325 if not singleton: 441 if not singleton:
326 bb.utils.mkdirhier(cachedir) 442 bb.utils.mkdirhier(cachedir)
327 singleton = PRServSingleton(os.path.abspath(dbfile), os.path.abspath(logfile), host, port) 443 singleton = PRServSingleton(os.path.abspath(dbfile), os.path.abspath(logfile), host, port, upstream)
328 singleton.start() 444 singleton.start()
329 if singleton: 445 if singleton:
330 host = singleton.host 446 host = singleton.host
@@ -357,8 +473,8 @@ def connect(host, port):
357 473
358 global singleton 474 global singleton
359 475
360 if host.strip().lower() == 'localhost' and not port: 476 if host.strip().lower() == "localhost" and not port:
361 host = 'localhost' 477 host = "localhost"
362 port = singleton.port 478 port = singleton.port
363 479
364 conn = client.PRClient() 480 conn = client.PRClient()
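In a build, the new chaining is driven by setting PRSERV_UPSTREAM next to PRSERV_HOST (see auto_start() above). Outside a build, a local server can be chained by hand through the daemon entry point; everything in this sketch (paths, host, ports, upstream address) is illustrative:

    from prserv.serv import start_daemon

    # Local server deferring to a central one: values known upstream are
    # reused locally, and output hashes the upstream has never seen get an
    # "x.y" subvalue derived from the upstream maximum "x".
    start_daemon(
        dbfile="/tmp/prserv-local.sqlite3",
        host="localhost",
        port=8585,
        logfile="/tmp/prserv-local.log",
        read_only=False,
        upstream="central.example.com:8585",
    )
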
diff --git a/bitbake/lib/prserv/tests.py b/bitbake/lib/prserv/tests.py
new file mode 100644
index 0000000000..8765b129f2
--- /dev/null
+++ b/bitbake/lib/prserv/tests.py
@@ -0,0 +1,386 @@
1#! /usr/bin/env python3
2#
3# Copyright (C) 2024 BitBake Contributors
4#
5# SPDX-License-Identifier: GPL-2.0-only
6#
7
8from . import create_server, create_client, increase_revision, revision_greater, revision_smaller, _revision_greater_or_equal
9import prserv.db as db
10from bb.asyncrpc import InvokeError
11import logging
12import os
13import sys
14import tempfile
15import unittest
16import socket
17import subprocess
18from pathlib import Path
19
20THIS_DIR = Path(__file__).parent
21BIN_DIR = THIS_DIR.parent.parent / "bin"
22
23version = "dummy-1.0-r0"
24pkgarch = "core2-64"
25other_arch = "aarch64"
26
27checksumX = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4f0"
28checksum0 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a0"
29checksum1 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a1"
30checksum2 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a2"
31checksum3 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a3"
32checksum4 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a4"
33checksum5 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a5"
34checksum6 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a6"
35checksum7 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a7"
36checksum8 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a8"
37checksum9 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4a9"
38checksum10 = "51bf8189dbe9ea81fa6dd89608bf19380c437a9cf12f6c6239887801ba4ab4aa"
39
40def server_prefunc(server, name):
41 logging.basicConfig(level=logging.DEBUG, filename='prserv-%s.log' % name, filemode='w',
42 format='%(levelname)s %(filename)s:%(lineno)d %(message)s')
43 server.logger.debug("Running server %s" % name)
44 sys.stdout = open('prserv-stdout-%s.log' % name, 'w')
45 sys.stderr = sys.stdout
46
47class PRTestSetup(object):
48
49 def start_server(self, name, dbfile, upstream=None, read_only=False, prefunc=server_prefunc):
50
51 def cleanup_server(server):
52 if server.process.exitcode is not None:
53 return
54 server.process.terminate()
55 server.process.join()
56
57 server = create_server(socket.gethostbyname("localhost") + ":0",
58 dbfile,
59 upstream=upstream,
60 read_only=read_only)
61
62 server.serve_as_process(prefunc=prefunc, args=(name,))
63 self.addCleanup(cleanup_server, server)
64
65 return server
66
67 def start_client(self, server_address):
68 def cleanup_client(client):
69 client.close()
70
71 client = create_client(server_address)
72 self.addCleanup(cleanup_client, client)
73
74 return client
75
76class FunctionTests(unittest.TestCase):
77
78 def setUp(self):
79 self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-prserv')
80 self.addCleanup(self.temp_dir.cleanup)
81
82 def test_increase_revision(self):
83 self.assertEqual(increase_revision("1"), "2")
84 self.assertEqual(increase_revision("1.0"), "1.1")
85 self.assertEqual(increase_revision("1.1.1"), "1.1.2")
86 self.assertEqual(increase_revision("1.1.1.3"), "1.1.1.4")
87 self.assertRaises(ValueError, increase_revision, "1.a")
88 self.assertRaises(ValueError, increase_revision, "1.")
89 self.assertRaises(ValueError, increase_revision, "")
90
91 def test_revision_greater_or_equal(self):
92 self.assertTrue(_revision_greater_or_equal("2", "2"))
93 self.assertTrue(_revision_greater_or_equal("2", "1"))
94 self.assertTrue(_revision_greater_or_equal("10", "2"))
95 self.assertTrue(_revision_greater_or_equal("1.10", "1.2"))
96 self.assertFalse(_revision_greater_or_equal("1.2", "1.10"))
97 self.assertTrue(_revision_greater_or_equal("1.10", "1"))
98 self.assertTrue(_revision_greater_or_equal("1.10.1", "1.10"))
99 self.assertFalse(_revision_greater_or_equal("1.10.1", "1.10.2"))
100 self.assertTrue(_revision_greater_or_equal("1.10.1", "1.10.1"))
101 self.assertTrue(_revision_greater_or_equal("1.10.1", "1"))
102 self.assertTrue(revision_greater("1.20", "1.3"))
103 self.assertTrue(revision_smaller("1.3", "1.20"))
104
105 # DB tests
106
107 def test_db(self):
108 dbfile = os.path.join(self.temp_dir.name, "testtable.sqlite3")
109
110 self.db = db.PRData(dbfile)
111 self.table = self.db["PRMAIN"]
112
113 self.table.store_value(version, pkgarch, checksum0, "0")
114 self.table.store_value(version, pkgarch, checksum1, "1")
115 # "No history" mode supports multiple PRs for the same checksum
116 self.table.store_value(version, pkgarch, checksum0, "2")
117 self.table.store_value(version, pkgarch, checksum2, "1.0")
118
119 self.assertTrue(self.table.test_package(version, pkgarch))
120 self.assertFalse(self.table.test_package(version, other_arch))
121
122 self.assertTrue(self.table.test_value(version, pkgarch, "0"))
123 self.assertTrue(self.table.test_value(version, pkgarch, "1"))
124 self.assertTrue(self.table.test_value(version, pkgarch, "2"))
125
126 self.assertEqual(self.table.find_package_max_value(version, pkgarch), "2")
127
128 self.assertEqual(self.table.find_min_value(version, pkgarch, checksum0), "0")
129 self.assertEqual(self.table.find_max_value(version, pkgarch, checksum0), "2")
130
131 # Test history modes
132 self.assertEqual(self.table.find_value(version, pkgarch, checksum0, True), "0")
133 self.assertEqual(self.table.find_value(version, pkgarch, checksum0, False), "2")
134
135 self.assertEqual(self.table.find_new_subvalue(version, pkgarch, "3"), "3.0")
136 self.assertEqual(self.table.find_new_subvalue(version, pkgarch, "1"), "1.1")
137
138 # Revision comparison tests
139 self.table.store_value(version, pkgarch, checksum1, "1.3")
140 self.table.store_value(version, pkgarch, checksum1, "1.20")
141 self.assertEqual(self.table.find_min_value(version, pkgarch, checksum1), "1")
142 self.assertEqual(self.table.find_max_value(version, pkgarch, checksum1), "1.20")
143
144class PRBasicTests(PRTestSetup, unittest.TestCase):
145
146 def setUp(self):
147 self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-prserv')
148 self.addCleanup(self.temp_dir.cleanup)
149
150 dbfile = os.path.join(self.temp_dir.name, "prtest-basic.sqlite3")
151
152 self.server1 = self.start_server("basic", dbfile)
153 self.client1 = self.start_client(self.server1.address)
154
155 def test_basic(self):
156
157 # Checks on a non-existing configuration
158
159 result = self.client1.test_pr(version, pkgarch, checksum0)
160 self.assertIsNone(result, "test_pr should return 'None' for a non existing PR")
161
162 result = self.client1.test_package(version, pkgarch)
163 self.assertFalse(result, "test_package should return 'False' for a non existing PR")
164
165 result = self.client1.max_package_pr(version, pkgarch)
166 self.assertIsNone(result, "max_package_pr should return 'None' for a non existing PR")
167
168 # Add a first configuration
169
170 result = self.client1.getPR(version, pkgarch, checksum0)
171 self.assertEqual(result, "0", "getPR: initial PR of a package should be '0'")
172
173 result = self.client1.test_pr(version, pkgarch, checksum0)
174 self.assertEqual(result, "0", "test_pr should return '0' here, matching the result of getPR")
175
176 result = self.client1.test_package(version, pkgarch)
177 self.assertTrue(result, "test_package should return 'True' for an existing PR")
178
179 result = self.client1.max_package_pr(version, pkgarch)
180 self.assertEqual(result, "0", "max_package_pr should return '0' in the current test series")
181
182 # Check that the same request gets the same value
183
184 result = self.client1.getPR(version, pkgarch, checksum0)
185 self.assertEqual(result, "0", "getPR: asking for the same PR a second time in a row should return the same value.")
186
187 # Add new configurations
188
189 result = self.client1.getPR(version, pkgarch, checksum1)
190 self.assertEqual(result, "1", "getPR: second PR of a package should be '1'")
191
192 result = self.client1.test_pr(version, pkgarch, checksum1)
193 self.assertEqual(result, "1", "test_pr should return '1' here, matching the result of getPR")
194
195 result = self.client1.max_package_pr(version, pkgarch)
196 self.assertEqual(result, "1", "max_package_pr should return '1' in the current test series")
197
198 result = self.client1.getPR(version, pkgarch, checksum2)
199 self.assertEqual(result, "2", "getPR: third PR of a package should be '2'")
200
201 result = self.client1.test_pr(version, pkgarch, checksum2)
202 self.assertEqual(result, "2", "test_pr should return '2' here, matching the result of getPR")
203
204 result = self.client1.max_package_pr(version, pkgarch)
205 self.assertEqual(result, "2", "max_package_pr should return '2' in the current test series")
206
207 result = self.client1.getPR(version, pkgarch, checksum3)
208 self.assertEqual(result, "3", "getPR: fourth PR of a package should be '3'")
209
210 result = self.client1.test_pr(version, pkgarch, checksum3)
211 self.assertEqual(result, "3", "test_pr should return '3' here, matching the result of getPR")
212
213 result = self.client1.max_package_pr(version, pkgarch)
214 self.assertEqual(result, "3", "max_package_pr should return '3' in the current test series")
215
216 # Ask again for the first configuration
217
218 result = self.client1.getPR(version, pkgarch, checksum0)
219 self.assertEqual(result, "4", "getPR: should return '4' in this configuration")
220
221 # Ask again with explicit "no history" mode
222
223 result = self.client1.getPR(version, pkgarch, checksum0, False)
224 self.assertEqual(result, "4", "getPR: should return '4' in this configuration")
225
226 # Ask again with explicit "history" mode. This should return the first recorded PR for checksum0
227
228 result = self.client1.getPR(version, pkgarch, checksum0, True)
229 self.assertEqual(result, "0", "getPR: should return '0' in this configuration")
230
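# A sketch of the two getPR modes exercised above (inferred from the
# assertions, not the actual prserv/db.py logic); rows maps each checksum
# to the PRs recorded for it so far, in insertion order:
def _get_pr_sketch(rows, checksum, history=False):
    if history:
        return rows[checksum][0]            # oldest recorded PR wins
    current_max = max(int(p) for prs in rows.values() for p in prs)
    if str(current_max) in rows[checksum]:
        return str(current_max)             # already at the max: reuse it
    return str(current_max + 1)             # otherwise bump the package max
rows = {checksum0: ["0", "4"], checksum1: ["1"], checksum2: ["2"], checksum3: ["3"]}
self.assertEqual(_get_pr_sketch(rows, checksum0), "4")
self.assertEqual(_get_pr_sketch(rows, checksum0, history=True), "0")
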
231 # Check that another pkgarch starts with fresh counters
232
233 result = self.client1.test_pr(version, other_arch, checksum0)
234 self.assertIsNone(result, "test_pr should return 'None' for a non-existing PR")
235
236 result = self.client1.test_package(version, other_arch)
237 self.assertFalse(result, "test_package should return 'False' for a non-existing PR")
238
239 result = self.client1.max_package_pr(version, other_arch)
240 self.assertIsNone(result, "max_package_pr should return 'None' for a non-existing PR")
241
242 # Now add the configuration
243
244 result = self.client1.getPR(version, other_arch, checksum0)
245 self.assertEqual(result, "0", "getPR: initial PR of a package should be '0'")
246
247 result = self.client1.test_pr(version, other_arch, checksum0)
248 self.assertEqual(result, "0", "test_pr should return '0' here, matching the result of getPR")
249
250 result = self.client1.test_package(version, other_arch)
251 self.assertTrue(result, "test_package should return 'True' for an existing PR")
252
253 result = self.client1.max_package_pr(version, other_arch)
254 self.assertEqual(result, "0", "max_package_pr should return '0' in the current test series")
255
256 result = self.client1.is_readonly()
257 self.assertFalse(result, "Server should not be described as 'read-only'")
258
259class PRUpstreamTests(PRTestSetup, unittest.TestCase):
260
261 def setUp(self):
262
263 self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-prserv')
264 self.addCleanup(self.temp_dir.cleanup)
265
266 dbfile2 = os.path.join(self.temp_dir.name, "prtest-upstream2.sqlite3")
267 self.server2 = self.start_server("upstream2", dbfile2)
268 self.client2 = self.start_client(self.server2.address)
269
270 dbfile1 = os.path.join(self.temp_dir.name, "prtest-upstream1.sqlite3")
271 self.server1 = self.start_server("upstream1", dbfile1, upstream=self.server2.address)
272 self.client1 = self.start_client(self.server1.address)
273
274 dbfile0 = os.path.join(self.temp_dir.name, "prtest-local.sqlite3")
275 self.server0 = self.start_server("local", dbfile0, upstream=self.server1.address)
276 self.client0 = self.start_client(self.server0.address)
277 self.shared_db = dbfile0
278
279 def test_upstream_and_readonly(self):
280
281 # For identical checksums, all servers should return the same PR
282
283 result = self.client2.getPR(version, pkgarch, checksum0)
284 self.assertEqual(result, "0", "getPR: initial PR of a package should be '0'")
285
286 result = self.client1.getPR(version, pkgarch, checksum0)
287 self.assertEqual(result, "0", "getPR: initial PR of a package should be '0' (same as upstream)")
288
289 result = self.client0.getPR(version, pkgarch, checksum0)
290 self.assertEqual(result, "0", "getPR: initial PR of a package should be '0' (same as upstream)")
291
292 # Now introduce new checksums on server1 for the same version
293
294 result = self.client1.getPR(version, pkgarch, checksum1)
295 self.assertEqual(result, "0.0", "getPR: first PR of a package which has a different checksum upstream should be '0.0'")
296
297 result = self.client1.getPR(version, pkgarch, checksum2)
298 self.assertEqual(result, "0.1", "getPR: second PR of a package that has a different checksum upstream should be '0.1'")
299
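# Derivation rule exercised above (inferred from the assertions): a
# checksum that is unknown upstream receives a subvalue of the upstream
# maximum, so with upstream at "0" the local server hands out "0.0",
# then "0.1", and so on.
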
300 # Now introduce checksums on server0 for the same version
301
302 result = self.client1.getPR(version, pkgarch, checksum1)
303 self.assertEqual(result, "0.2", "getPR: the PR for a known checksum must not decrease")
304
305 result = self.client1.getPR(version, pkgarch, checksum2)
306 self.assertEqual(result, "0.3")
307
308 result = self.client1.max_package_pr(version, pkgarch)
309 self.assertEqual(result, "0.3")
310
311 result = self.client0.getPR(version, pkgarch, checksum3)
312 self.assertEqual(result, "0.3.0", "getPR: first PR of a package that doesn't exist upstream should be '0.3.0'")
313
314 result = self.client0.getPR(version, pkgarch, checksum4)
315 self.assertEqual(result, "0.3.1", "getPR: second PR of a package that doesn't exist upstream should be '0.3.1'")
316
317 result = self.client0.getPR(version, pkgarch, checksum3)
318 self.assertEqual(result, "0.3.2")
319
320 # More upstream updates
321 # Here, we assume no communication between server2 and server0. server2 only impacts server0
322 # after impacting server1
323
324 self.assertEqual(self.client2.getPR(version, pkgarch, checksum5), "1")
325 self.assertEqual(self.client1.getPR(version, pkgarch, checksum6), "1.0")
326 self.assertEqual(self.client1.getPR(version, pkgarch, checksum7), "1.1")
327 self.assertEqual(self.client0.getPR(version, pkgarch, checksum8), "1.1.0")
328 self.assertEqual(self.client0.getPR(version, pkgarch, checksum9), "1.1.1")
329
330 # "history" mode tests
331
332 self.assertEqual(self.client2.getPR(version, pkgarch, checksum0, True), "0")
333 self.assertEqual(self.client1.getPR(version, pkgarch, checksum2, True), "0.1")
334 self.assertEqual(self.client0.getPR(version, pkgarch, checksum3, True), "0.3.0")
335
336 # More "no history" mode tests
337
338 self.assertEqual(self.client2.getPR(version, pkgarch, checksum0), "2")
339 self.assertEqual(self.client1.getPR(version, pkgarch, checksum0), "2") # Same as upstream
340 self.assertEqual(self.client0.getPR(version, pkgarch, checksum0), "2") # Same as upstream
341 self.assertEqual(self.client1.getPR(version, pkgarch, checksum7), "3") # This could be surprising, but since the previous revision was "2", increasing it yields "3".
342 # We don't know how many upstream servers we have
343 # Start read-only server with server1 as upstream
344 self.server_ro = self.start_server("local-ro", self.shared_db, upstream=self.server1.address, read_only=True)
345 self.client_ro = self.start_client(self.server_ro.address)
346
347 self.assertTrue(self.client_ro.is_readonly(), "Server should be described as 'read-only'")
348
349 # Checks on non-existing configurations
350 self.assertIsNone(self.client_ro.test_pr(version, pkgarch, checksumX))
351 self.assertFalse(self.client_ro.test_package("unknown", pkgarch))
352
353 # Look up existing configurations
354 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum0), "3") # "no history" mode
355 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum0, True), "0") # "history" mode
356 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum3), "3")
357 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum3, True), "0.3.0")
358 self.assertEqual(self.client_ro.max_package_pr(version, pkgarch), "2") # expected, as "3" was never actually stored
359
360 # Try to insert a new value. This one is already known upstream.
361 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum7), "3")
362 # Try to insert a completely new value. As the max upstream value is already "3", it should be "3.0"
363 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum10), "3.0")
364 # Same with another value which only exists in the upstream's own upstream server.
365 # This time, as the direct upstream doesn't know it, it asks its own upstream server, where the value is already known.
366 self.assertEqual(self.client_ro.getPR(version, pkgarch, checksum9), "3")
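
# Summary of the read-only behaviour exercised above (inferred): the
# server never writes to its database, answers from local data when it
# can, and otherwise forwards the request upstream, so derived values
# such as "3.0" are computed but never stored.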
367
368class ScriptTests(unittest.TestCase):
369
370 def setUp(self):
371
372 self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-prserv')
373 self.addCleanup(self.temp_dir.cleanup)
374 self.dbfile = os.path.join(self.temp_dir.name, "prtest.sqlite3")
375
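# Note: unittest runs test methods in alphabetical order by default, so
# the numeric prefixes below guarantee --start runs before --stop.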
376 def test_1_start_bitbake_prserv(self):
377 try:
378 subprocess.check_call([BIN_DIR / "bitbake-prserv", "--start", "-f", self.dbfile])
379 except subprocess.CalledProcessError as e:
380 self.fail("Failed to start bitbake-prserv: %s" % e.returncode)
381
382 def test_2_stop_bitbake_prserv(self):
383 try:
384 subprocess.check_call([BIN_DIR / "bitbake-prserv", "--stop"])
385 except subprocess.CalledProcessError as e:
386 self.fail("Failed to stop bitbake-prserv: %s" % e.returncode)
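
# The two tests above exercise the command-line interface end to end:
# "--start -f <dbfile>" launches a server daemon backed by the given
# database file, and "--stop" shuts it down again.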