Diffstat (limited to 'scripts/sstate-cache-management.py')
-rwxr-xr-x  scripts/sstate-cache-management.py | 329
1 file changed, 329 insertions, 0 deletions
diff --git a/scripts/sstate-cache-management.py b/scripts/sstate-cache-management.py
new file mode 100755
index 0000000000..d3f600bd28
--- /dev/null
+++ b/scripts/sstate-cache-management.py
@@ -0,0 +1,329 @@
#!/usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: MIT
#

import argparse
import os
import re
import sys

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path

if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")

SSTATE_PREFIX = "sstate:"
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"
# .siginfo.done files are mentioned in the original script?
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
        (?P<package_target>[^:]*):
        (?P<pv>[^:]*):
        (?P<pr>[^:]*):
        (?P<sstate_pkgarch>[^:]*):
        (?P<sstate_version>[^_]*):
        (?P<bb_unihash>[^_]*)_
        (?P<bb_task>[^:]*)
        (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)
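# e.g. a (hypothetical) cache filename such as
#   sstate:zlib:core2-64-poky-linux:1.3.1:r0:core2-64:10:0123abcd..._populate_sysroot.tar.zst
# parses into pn="zlib", package_target="core2-64-poky-linux", pv="1.3.1",
# pr="r0", sstate_pkgarch="core2-64", sstate_version="10",
# bb_unihash="0123abcd...", bb_task="populate_sysroot", ext=".tar.zst"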


# Really we'd like something like a Path subclass which implements a stat
# cache here, unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    path: Path
    match: re.Match
    stat_result: os.stat_result = None

    def __hash__(self):
        return self.path.__hash__()

    def __getattr__(self, name):
        return self.match.group(name)
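    # __getattr__ lets callers read regex match groups as attributes, e.g.
    # entry.pn or entry.bb_task instead of entry.match.group("pn"); the real
    # dataclass fields (path, match, stat_result) are found first and never
    # reach __getattr__.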


# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    # all_archs
    builder_arch = os.uname().machine

    # FIXME
    layer_paths = [Path("../..")]

    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)


# again, not needed?
def find_tasks(paths):
    print(set([p.bb_task for p in paths]))


def collect_sstate_paths(args):
    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
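                    # str.endswith() accepts a tuple, so every sstate suffix
                    # is tested in a single call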
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(p.parts[-1])
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)
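            # Note: map() submits every path_stat() call up front, but any
            # exception raised inside it is only re-raised when the returned
            # iterator is consumed; as the iterator is discarded here, a
            # failed lstat() silently leaves that entry's stat_result as None.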

    return paths


def remove_by_stamps(args, paths):
    all_sums = set()
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
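        # Stamp files are expected to look roughly like (hypothetical example)
        #   <arch>/<pn>/<stamp>.do_compile.sigdata.<hash>
        # so the two globs below pick out the hash trailing the task name.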
        re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
        all_sums |= set(
            [
                re_sigdata.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
            ]
        )
        re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
        all_sums |= set(
            [
                re_setscene.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
            ]
        )
    return [p for p in paths if p.bb_unihash not in all_sums]


def remove_duplicated(args, paths):
    # Skip populate_lic as it produces duplicates in a normal build
    #
    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]

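    # Entries sharing (pn, sstate_pkgarch, bb_task, ext) are duplicates of
    # one another: the loop below keeps whichever has the newest mtime and
    # queues the rest for removal. E.g. (hypothetical) two populate_sysroot
    # archives for the same pn and arch but different unihashes - only the
    # more recently written one survives.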
    keep = dict()
    remove = list()
    for p in valid_paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
        if sstate_sig not in keep:
            keep[sstate_sig] = p
        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
            remove.append(keep[sstate_sig])
            keep[sstate_sig] = p
        else:
            remove.append(p)

    return remove


def remove_orphans(args, paths):
    remove = list()
    pathsigs = defaultdict(list)
    for p in paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
        pathsigs[sstate_sig].append(p)
    for k, v in pathsigs.items():
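        # A group with no actual archive (SSTATE_EXTENSION) consists solely
        # of .siginfo/.done tracking files, i.e. orphans: remove them all.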
        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
            remove.extend(v)
    return remove


def parse_arguments():
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify the sstate cache directory; defaults to the
             SSTATE_CACHE_DIR environment variable if not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify a list of architectures which should be tested; this
    #          list will be extended with the native arch, allarch and the
    #          empty arch. The script won't try to generate the list of
    #          available archs from AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs;
    #          it will search the meta and meta-* layers in the top dir by
    #          default, and will search meta, meta-*, <layer1>, <layer2>,
    #          ...<layern> when specified. Use "," as the separator.
    #
    #          This has no effect with --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove duplicated sstate cache files for a package; only the
             newest one will be kept. Duplicates must have the same arch,
             so sstate cache files for different archs are never treated as
             duplicates of each other.

             Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
             where there is no {SSTATE_EXTENSION} file but there are associated
             tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories; the sstate
             cache files which ARE USED by these build directories will be
             KEPT, other sstate cache files in cache-dir will be removed.
             Can be specified multiple times for several directories.

             Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symbolic link and the destination file, default: no.",
    # )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
             and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()
    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args

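# Example invocations (hypothetical paths):
#   sstate-cache-management.py --cache-dir=/srv/sstate --remove-duplicated --yes
#   sstate-cache-management.py --cache-dir=/srv/sstate \
#       --stamps-dir=build/tmp/stamps --remove-orphans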

def main():
    args = parse_arguments()

    paths = collect_sstate_paths(args)
    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))
    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True
    if confirm:
        # TODO: parallelise remove
        for p in remove:
            p.path.unlink()


if __name__ == "__main__":
    main()