diff options
Diffstat (limited to 'scripts/sstate-cache-management.py')
-rwxr-xr-x | scripts/sstate-cache-management.py | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/scripts/sstate-cache-management.py b/scripts/sstate-cache-management.py new file mode 100755 index 0000000000..d3f600bd28 --- /dev/null +++ b/scripts/sstate-cache-management.py | |||
@@ -0,0 +1,329 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | # | ||
3 | # Copyright OpenEmbedded Contributors | ||
4 | # | ||
5 | # SPDX-License-Identifier: MIT | ||
6 | # | ||
7 | |||
8 | import argparse | ||
9 | import os | ||
10 | import re | ||
11 | import sys | ||
12 | |||
13 | from collections import defaultdict | ||
14 | from concurrent.futures import ThreadPoolExecutor | ||
15 | from dataclasses import dataclass | ||
16 | from pathlib import Path | ||
17 | |||
# Fail fast on interpreters older than what this script was written against
# (dataclasses with defaults, f-strings, ThreadPoolExecutor usage below).
if not sys.version_info >= (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")
# Every sstate object's filename starts with this prefix.
SSTATE_PREFIX = "sstate:"
# Archive extension produced by current bitbake builds.
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"
# NOTE(review): .siginfo.done files are mentioned in the original script —
# confirm whether those still occur and need to be listed here.
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

# Parses an sstate object filename of the form
#   sstate:<pn>:<package_target>:<pv>:<pr>:<sstate_pkgarch>:<sstate_version>:<bb_unihash>_<bb_task><ext>
# into named groups, where <ext> is one of SSTATE_SUFFIXES. re.X (verbose)
# mode means the layout whitespace below is not part of the pattern.
RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
    (?P<package_target>[^:]*):
    (?P<pv>[^:]*):
    (?P<pr>[^:]*):
    (?P<sstate_pkgarch>[^:]*):
    (?P<sstate_version>[^_]*):
    (?P<bb_unihash>[^_]*)_
    (?P<bb_task>[^:]*)
    (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)
43 | |||
44 | |||
# Really we'd like something like a Path subclass which implements a stat
# cache here, unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    # Filesystem location of the sstate object.
    path: Path
    # Match of RE_SSTATE_PKGSPEC against the filename; its named groups
    # (pn, pv, bb_task, ext, ...) are exposed as attributes via __getattr__.
    match: re.Match
    # Populated on demand (see collect_sstate_paths) only when mtimes are
    # needed for duplicate detection; None until then.
    stat_result: "os.stat_result | None" = None

    def __hash__(self):
        # Entries are kept in sets; hash by path only. The dataclass-generated
        # __eq__ compares all fields, but equal entries necessarily share a
        # path, so the hash/eq contract still holds.
        return self.path.__hash__()

    def __getattr__(self, name):
        # Delegate unknown attribute reads to the regex's named groups so
        # callers can write e.g. entry.bb_task. Unknown names raise
        # IndexError (from re.Match.group), not AttributeError.
        return self.match.group(name)
65 | |||
66 | # this is what's in the original script; as far as I can tell, it's an | ||
67 | # implementation artefact which we don't need? | ||
def find_archs(layer_paths=None):
    """Print and return the set of arch names a build could create sstate for.

    The result is the union of:
      * AVAILTUNES values scraped from the layers' tune include files,
      * machine names from the layers' conf/machine/*.conf files,
      * "allarch" and the build host's machine (os.uname().machine),
    with "-" replaced by "_" to match sstate path components.

    layer_paths: iterable of Path objects to scan for meta*/ layers.
        Defaults to the original hard-coded [Path("../..")], which assumes
        the script runs from scripts/ inside a poky-style checkout.
    """
    builder_arch = os.uname().machine

    if layer_paths is None:
        # FIXME: should really be derived from bblayers.conf instead of
        # assuming a fixed location relative to the current directory.
        layer_paths = [Path("../..")]

    # Harvest AVAILTUNES assignments from every tune include file.
    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines: one arch per machine config, name taken from the
    # filename with the trailing ".conf" (5 chars) stripped.
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)
    return all_archs
104 | |||
# again, probably not needed? (kept for parity with the original script)
def find_tasks(paths=()):
    """Print and return the set of distinct bb_task names seen in *paths*.

    The previous version read a module-level global ``paths`` that was never
    defined, so any call raised NameError; the entries are now passed in
    explicitly (defaulting to an empty iterable).
    """
    tasks = set(p.bb_task for p in paths)
    print(tasks)
    return tasks
109 | |||
def collect_sstate_paths(args):
    """Recursively scan args.cache_dir and return a set of SstateEntry.

    Only filenames starting with SSTATE_PREFIX and ending in one of
    SSTATE_SUFFIXES are collected; everything else (lockfiles etc.) is
    ignored. When args.remove_duplicated is set, each entry's stat_result
    is also populated (needed later for mtime comparisons), using a thread
    pool of args.jobs workers.

    Raises ValueError if a file matches the prefix/suffix checks but not
    RE_SSTATE_PKGSPEC.
    """

    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(filename)
                        if m is None:
                            # Not a bare assert: asserts vanish under
                            # `python -O`, and the filename helps debugging.
                            raise ValueError(
                                f"Malformed sstate filename: {filename}"
                            )
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        # lstat so symlinked entries are examined themselves, not their targets.
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)

    return paths
143 | |||
144 | |||
def remove_by_stamps(args, paths):
    """Return the entries in *paths* not referenced by any stamp directory.

    For every directory in args.stamps_dir, collect the unihash suffixes of
    all do_*.sigdata.* and do_*_setscene.* stamp files; entries whose
    bb_unihash is not among them are candidates for removal (i.e. entries
    that ARE referenced are kept).

    Raises NotADirectoryError if a stamps directory does not exist.
    """
    # Compile once, outside the per-directory loop.
    re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
    re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")

    all_sums = set()
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        if not stamps_path.is_dir():
            # Explicit raise instead of assert: survives `python -O` and
            # names the offending argument.
            raise NotADirectoryError(stamps_dir)
        # The glob patterns guarantee the regexes match, so .group(1) is safe.
        all_sums |= {
            re_sigdata.search(x.parts[-1]).group(1)
            for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
        }
        all_sums |= {
            re_setscene.search(x.parts[-1]).group(1)
            for x in stamps_path.glob("*/*/*.do_*_setscene.*")
        }
    return [p for p in paths if p.bb_unihash not in all_sums]
165 | |||
166 | |||
def remove_duplicated(args, paths):
    """Return the entries in *paths* that are superseded duplicates.

    Entries are grouped by (pn, sstate_pkgarch, bb_task, ext); within each
    group only the entry with the newest mtime survives, and everything
    else is returned for removal. populate_lic entries are exempt, since a
    normal build legitimately produces duplicates of those.

    9ae16469e707 sstate-cache-management: skip populate_lic archives when
    removing duplicates
    """
    newest = dict()
    duplicates = list()
    for entry in paths:
        # populate_lic duplicates are expected; never flag them.
        if entry.bb_task == "populate_lic":
            continue
        key = ":".join([entry.pn, entry.sstate_pkgarch, entry.bb_task, entry.ext])
        incumbent = newest.get(key)
        if incumbent is None:
            newest[key] = entry
        elif entry.stat_result.st_mtime > incumbent.stat_result.st_mtime:
            # Strictly newer: the previous winner becomes a duplicate.
            duplicates.append(incumbent)
            newest[key] = entry
        else:
            # Older (or tied): keep the incumbent.
            duplicates.append(entry)

    return duplicates
186 | |||
187 | |||
def remove_orphans(args, paths):
    """Return the tracking files in *paths* whose archive is missing.

    Entries are grouped by (pn, sstate_pkgarch, bb_task); any group that
    contains no SSTATE_EXTENSION archive consists purely of orphaned
    tracking files (.siginfo/.done), and all of its members are returned.
    """
    groups = defaultdict(list)
    for entry in paths:
        key = ":".join([entry.pn, entry.sstate_pkgarch, entry.bb_task])
        groups[key].append(entry)

    orphans = list()
    for members in groups.values():
        if not any(m.ext == SSTATE_EXTENSION for m in members):
            orphans.extend(members)
    return orphans
198 | |||
199 | |||
200 | def parse_arguments(): | ||
201 | parser = argparse.ArgumentParser(description="sstate cache management utility.") | ||
202 | |||
203 | parser.add_argument( | ||
204 | "--cache-dir", | ||
205 | default=os.environ.get("SSTATE_CACHE_DIR"), | ||
206 | help="""Specify sstate cache directory, will use the environment | ||
207 | variable SSTATE_CACHE_DIR if it is not specified.""", | ||
208 | ) | ||
209 | |||
210 | # parser.add_argument( | ||
211 | # "--extra-archs", | ||
212 | # help="""Specify list of architectures which should be tested, this list | ||
213 | # will be extended with native arch, allarch and empty arch. The | ||
214 | # script won't be trying to generate list of available archs from | ||
215 | # AVAILTUNES in tune files.""", | ||
216 | # ) | ||
217 | |||
218 | # parser.add_argument( | ||
219 | # "--extra-layer", | ||
220 | # help="""Specify the layer which will be used for searching the archs, | ||
221 | # it will search the meta and meta-* layers in the top dir by | ||
222 | # default, and will search meta, meta-*, <layer1>, <layer2>, | ||
223 | # ...<layern> when specified. Use "," as the separator. | ||
224 | # | ||
225 | # This is useless for --stamps-dir or when --extra-archs is used.""", | ||
226 | # ) | ||
227 | |||
228 | parser.add_argument( | ||
229 | "-d", | ||
230 | "--remove-duplicated", | ||
231 | action="store_true", | ||
232 | help="""Remove the duplicated sstate cache files of one package, only | ||
233 | the newest one will be kept. The duplicated sstate cache files | ||
234 | of one package must have the same arch, which means sstate cache | ||
235 | files with multiple archs are not considered duplicate. | ||
236 | |||
237 | Conflicts with --stamps-dir.""", | ||
238 | ) | ||
239 | |||
240 | parser.add_argument( | ||
241 | "--remove-orphans", | ||
242 | action="store_true", | ||
243 | help=f"""Remove orphan siginfo files from the sstate cache, i.e. those | ||
244 | where this is no {SSTATE_EXTENSION} file but there are associated | ||
245 | tracking files.""", | ||
246 | ) | ||
247 | |||
248 | parser.add_argument( | ||
249 | "--stamps-dir", | ||
250 | action="append", | ||
251 | help="""Specify the build directory's stamps directories, the sstate | ||
252 | cache file which IS USED by these build diretories will be KEPT, | ||
253 | other sstate cache files in cache-dir will be removed. Can be | ||
254 | specified multiple times for several directories. | ||
255 | |||
256 | Conflicts with --remove-duplicated.""", | ||
257 | ) | ||
258 | |||
259 | parser.add_argument( | ||
260 | "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel." | ||
261 | ) | ||
262 | |||
263 | # parser.add_argument( | ||
264 | # "-L", | ||
265 | # "--follow-symlink", | ||
266 | # action="store_true", | ||
267 | # help="Remove both the symbol link and the destination file, default: no.", | ||
268 | # ) | ||
269 | |||
270 | parser.add_argument( | ||
271 | "-y", | ||
272 | "--yes", | ||
273 | action="store_true", | ||
274 | help="""Automatic yes to prompts; assume "yes" as answer to all prompts | ||
275 | and run non-interactively.""", | ||
276 | ) | ||
277 | |||
278 | parser.add_argument( | ||
279 | "-v", "--verbose", action="store_true", help="Explain what is being done." | ||
280 | ) | ||
281 | |||
282 | parser.add_argument( | ||
283 | "-D", | ||
284 | "--debug", | ||
285 | action="count", | ||
286 | default=0, | ||
287 | help="Show debug info, repeat for more debug info.", | ||
288 | ) | ||
289 | |||
290 | args = parser.parse_args() | ||
291 | if args.cache_dir is None or ( | ||
292 | not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans | ||
293 | ): | ||
294 | parser.print_usage() | ||
295 | sys.exit(1) | ||
296 | |||
297 | return args | ||
298 | |||
299 | |||
def main():
    """Entry point: gather sstate entries, pick victims, confirm, delete."""
    args = parse_arguments()

    entries = collect_sstate_paths(args)

    # parse_arguments guarantees at least one action flag; -d takes
    # precedence over --stamps-dir.
    if args.remove_duplicated:
        doomed = remove_duplicated(args, entries)
    elif args.stamps_dir:
        doomed = remove_by_stamps(args, entries)
    else:
        doomed = list()

    if args.remove_orphans:
        # Union with orphaned tracking files; the set also deduplicates.
        doomed = set(doomed) | set(remove_orphans(args, entries))

    if args.debug >= 1:
        print("\n".join(str(entry.path) for entry in doomed))
    print(f"{len(doomed)} out of {len(entries)} files will be removed!")

    if not args.yes:
        print("Do you want to continue (y/n)?")
        proceed = input() in ("y", "Y")
    else:
        proceed = True
    if proceed:
        # TODO: parallelise remove
        for entry in doomed:
            entry.path.unlink()


if __name__ == "__main__":
    main()