diff options
| -rwxr-xr-x | scripts/sstate-cache-management.py | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/scripts/sstate-cache-management.py b/scripts/sstate-cache-management.py new file mode 100755 index 0000000000..09b7aa2aef --- /dev/null +++ b/scripts/sstate-cache-management.py | |||
| @@ -0,0 +1,329 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | # | ||
| 3 | # Copyright OpenEmbedded Contributors | ||
| 4 | # | ||
| 5 | # SPDX-License-Identifier: MIT | ||
| 6 | # | ||
| 7 | |||
| 8 | import argparse | ||
| 9 | import os | ||
| 10 | import re | ||
| 11 | import sys | ||
| 12 | |||
| 13 | from collections import defaultdict | ||
| 14 | from concurrent.futures import ThreadPoolExecutor | ||
| 15 | from dataclasses import dataclass | ||
| 16 | from pathlib import Path | ||
| 17 | |||
# Fail fast on old interpreters; the script assumes Python 3.8+ stdlib
# behaviour throughout — TODO confirm the exact minimum actually needed.
if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")
| 20 | |||
# Every sstate artefact file name starts with this prefix.
SSTATE_PREFIX = "sstate:"
# Archive extension produced by current bitbake sstate caches.
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"  # NOTE(review): older caches used .tgz — confirm before enabling
# NOTE(review): the original script also mentions .siginfo.done files —
# confirm whether those need covering here.
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,  # the archive itself
    f"{SSTATE_EXTENSION}.siginfo",  # signature metadata alongside the archive
    f"{SSTATE_EXTENSION}.done",  # completion stamp alongside the archive
)

# Parses an sstate file name of the form:
#   sstate:<pn>:<package_target>:<pv>:<pr>:<sstate_pkgarch>:<sstate_version>:<unihash>_<task><suffix>
# bb_unihash uses [^_]* so it stops at the first "_" separating it from the
# task name; the suffix alternation is anchored at end-of-string and built
# from SSTATE_SUFFIXES (re.escape'd, since the extensions contain dots).
RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
         (?P<package_target>[^:]*):
         (?P<pv>[^:]*):
         (?P<pr>[^:]*):
         (?P<sstate_pkgarch>[^:]*):
         (?P<sstate_version>[^_]*):
         (?P<bb_unihash>[^_]*)_
         (?P<bb_task>[^:]*)
         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)
| 43 | |||
| 44 | |||
# Really we'd like something like a Path subclass which implements a stat
# cache here, unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache.

    Attributes:
        path: location of the artefact inside the cache.
        match: RE_SSTATE_PKGSPEC match for the file name; its named groups
            (pn, pv, sstate_pkgarch, bb_task, ext, ...) are exposed as
            attributes via __getattr__.
        stat_result: lazily populated lstat() result — filled in by
            collect_sstate_paths() only when mtimes will be compared.
    """

    path: Path
    match: re.Match
    stat_result: "os.stat_result | None" = None

    def __hash__(self):
        # Hash by path only; an entry is unique per file within the cache.
        return self.path.__hash__()

    def __getattr__(self, name):
        # Forward unknown attributes to the regex's named groups.  Translate
        # the IndexError that re.Match.group() raises for an unknown group
        # into AttributeError so generic protocols (hasattr, copy, pickle)
        # behave correctly instead of seeing a spurious "no such group".
        try:
            return self.match.group(name)
        except IndexError:
            raise AttributeError(name) from None
| 64 | |||
| 65 | |||
# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    """Print the set of architectures known to the surrounding layers.

    Scans AVAILTUNES assignments in the layers' tune files, collects
    machine names from machine .conf files, merges in the builder's own
    architecture and "allarch", and prints the result with "-" normalised
    to "_".  Nothing in this file calls it; kept for reference.
    """
    builder_arch = os.uname().machine

    # FIXME: hard-coded relative location of the layers
    layer_paths = [Path("../..")]

    avail_tunes_re = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    tune_archs = set()
    for layer in layer_paths:
        for candidate in layer.glob("meta*/conf/machine/include/**/*"):
            if not candidate.is_file():
                continue
            with open(candidate) as fh:
                for line in fh:
                    hit = avail_tunes_re.match(line)
                    if hit:
                        tune_archs.update(hit.group(1).split())

    # Machine names are the .conf basenames with the extension dropped.
    machine_archs = set()
    for layer in layer_paths:
        for conf in layer.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(conf.parts[-1][:-5])

    extra_archs = set()
    normalised = {
        arch.replace("-", "_")
        for arch in machine_archs | tune_archs | {"allarch", builder_arch}
    }
    print(normalised | extra_archs)
| 103 | |||
| 104 | |||
# again, not needed?
def find_tasks(paths=()):
    """Print (and return) the set of distinct bb_task names in *paths*.

    The original version read an undefined global ``paths`` and therefore
    always raised NameError when called; the iterable of SstateEntry-like
    objects is now an explicit parameter (defaulting to empty).  The set is
    returned as well so callers can use it programmatically.
    """
    tasks = {p.bb_task for p in paths}
    print(tasks)
    return tasks
| 108 | |||
| 109 | |||
def collect_sstate_paths(args):
    """Walk args.cache_dir and return a set of SstateEntry objects.

    Every file whose name starts with SSTATE_PREFIX and ends with one of
    SSTATE_SUFFIXES becomes an entry; other sstate-prefixed files (e.g.
    lockfiles) are ignored.  When duplicates will later be compared by
    mtime (args.remove_duplicated), each entry's lstat() result is
    pre-fetched using a thread pool of args.jobs workers.
    """

    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(filename)
                        # A file that looks like an sstate artefact but does
                        # not parse indicates a naming-scheme change — surface
                        # it rather than skipping silently.
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            # iterdir() on a regular file lands here; nothing to recurse into.
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            # Drain the iterator: Executor.map() is lazy about *results*, so
            # without this any lstat() exception would be silently discarded
            # and the entry left with stat_result=None, crashing much later
            # in remove_duplicated().
            for _ in executor.map(path_stat, paths):
                pass

    return paths
| 143 | |||
| 144 | |||
def remove_by_stamps(args, paths):
    """Return the entries in *paths* not referenced by any stamps directory.

    Collects every signature hash appearing in the given build directories'
    stamps (regular sigdata stamps and _setscene stamps) and keeps only
    entries whose bb_unihash is among them; everything else is a removal
    candidate.  Raises AssertionError if a stamps dir does not exist.
    """
    # Compiled once, hoisted out of the per-directory loop.  Note the
    # escaped dots: the glob patterns below guarantee a literal ".sigdata."
    # / "_setscene." follows the task name.
    re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
    re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")

    all_sums = set()
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
        all_sums.update(
            re_sigdata.search(x.parts[-1]).group(1)
            for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
        )
        all_sums.update(
            re_setscene.search(x.parts[-1]).group(1)
            for x in stamps_path.glob("*/*/*.do_*_setscene.*")
        )
    return [p for p in paths if p.bb_unihash not in all_sums]
| 165 | |||
| 166 | |||
def remove_duplicated(args, paths):
    """Return all but the newest entry of each (pn, arch, task, ext) group.

    populate_lic archives legitimately occur multiple times in a normal
    build, so they are never treated as duplicates (see commit
    9ae16469e707 "sstate-cache-management: skip populate_lic archives when
    removing duplicates").
    """
    groups = defaultdict(list)
    for entry in paths:
        # Skip populate_lic as it produces duplicates in a normal build.
        if entry.bb_task == "populate_lic":
            continue
        key = ":".join([entry.pn, entry.sstate_pkgarch, entry.bb_task, entry.ext])
        groups[key].append(entry)

    doomed = []
    for entries in groups.values():
        # max() returns the first maximal element, matching the original
        # first-seen-wins behaviour on mtime ties.
        newest = max(entries, key=lambda e: e.stat_result.st_mtime)
        doomed.extend(e for e in entries if e is not newest)
    return doomed
| 186 | |||
| 187 | |||
def remove_orphans(args, paths):
    """Return tracking files whose main archive is missing.

    Entries are bucketed by (pn, arch, task); a bucket containing no actual
    SSTATE_EXTENSION archive consists purely of orphaned .siginfo / .done
    files, all of which become removal candidates.
    """
    buckets = defaultdict(list)
    for entry in paths:
        key = ":".join([entry.pn, entry.sstate_pkgarch, entry.bb_task])
        buckets[key].append(entry)

    orphans = []
    for members in buckets.values():
        if not any(m.ext == SSTATE_EXTENSION for m in members):
            orphans.extend(members)
    return orphans
| 198 | |||
| 199 | |||
def parse_arguments():
    """Build the CLI parser, parse sys.argv and validate the combination.

    Returns the parsed argparse.Namespace.  Exits with status 1 when no
    cache directory is available or no action was requested, and (via
    parser.error, status 2) when the mutually exclusive
    --remove-duplicated and --stamps-dir are combined.
    """
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify sstate cache directory, will use the environment
             variable SSTATE_CACHE_DIR if it is not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify list of architectures which should be tested, this list
    #          will be extended with native arch, allarch and empty arch. The
    #          script won't be trying to generate list of available archs from
    #          AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs,
    #          it will search the meta and meta-* layers in the top dir by
    #          default, and will search meta, meta-*, <layer1>, <layer2>,
    #          ...<layern> when specified. Use "," as the separator.
    #
    #          This is useless for --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove the duplicated sstate cache files of one package, only
             the newest one will be kept. The duplicated sstate cache files
             of one package must have the same arch, which means sstate cache
             files with multiple archs are not considered duplicate.

             Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
             where there is no {SSTATE_EXTENSION} file but there are associated
             tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories, the sstate
             cache file which IS USED by these build directories will be KEPT,
             other sstate cache files in cache-dir will be removed. Can be
             specified multiple times for several directories.

             Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symbol link and the destination file, default: no.",
    # )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
             and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()

    # The help text promises these two modes conflict; enforce that instead
    # of letting main() silently prefer --remove-duplicated.
    if args.remove_duplicated and args.stamps_dir:
        parser.error("--remove-duplicated conflicts with --stamps-dir")

    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args
| 298 | |||
| 299 | |||
def main():
    """Entry point: gather cache entries, compute removals, confirm, delete."""
    args = parse_arguments()

    paths = collect_sstate_paths(args)
    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        # Orphan removal composes with either mode; union via sets so a file
        # selected by both is only removed once.
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))
    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True
    if confirm:
        # TODO: parallelise remove
        for p in remove:
            # missing_ok (3.8+, which this script requires) guards against a
            # racing build or another instance of this script having already
            # deleted the file.
            p.path.unlink(missing_ok=True)


if __name__ == "__main__":
    main()
