summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBruce Ashfield <bruce.ashfield@gmail.com>2025-12-04 22:36:12 +0000
committerBruce Ashfield <bruce.ashfield@gmail.com>2025-12-08 20:57:44 -0500
commita303bf16ffd747c50c95cbe385407ba8b0122cec (patch)
treeddb26a7945e746ce8206fc65b0a971ed74dc812b
parent9f40ce9b277a677ad3cddd8bf1c1d15fbd035251 (diff)
downloadmeta-virtualization-a303bf16ffd747c50c95cbe385407ba8b0122cec.tar.gz
scripts: add oe-go-mod-fetcher for Go module VCS resolution
Add the oe-go-mod-fetcher.py tool and supporting files for resolving Go module dependencies via git repositories instead of module proxies.

oe-go-mod-fetcher.py:
- Parses go.mod and go.sum to identify required modules
- Resolves module paths to git repositories (handles vanity URLs)
- Maps module versions to git commits
- Generates SRC_URI entries for the bitbake fetcher
- Creates go-mod-git.inc and go-mod-cache.inc files
- Supports monorepo detection and nested module handling
- Caches resolution results for performance

extract-discovered-modules.py:
- Helper script to extract module information from the discovery cache
- Used by go-mod-discovery.bbclass during the build

Also adds .gitignore to exclude runtime caches from version control.

Signed-off-by: Bruce Ashfield <bruce.ashfield@gmail.com>
-rw-r--r--scripts/.gitignore15
-rw-r--r--scripts/data/manual-overrides.json8
-rwxr-xr-xscripts/extract-discovered-modules.py491
-rwxr-xr-xscripts/oe-go-mod-fetcher.py4580
4 files changed, 5094 insertions, 0 deletions
diff --git a/scripts/.gitignore b/scripts/.gitignore
new file mode 100644
index 00000000..57fdcfc0
--- /dev/null
+++ b/scripts/.gitignore
@@ -0,0 +1,15 @@
1# Runtime caches generated by oe-go-mod-fetcher.py
2data/module-cache.json
3data/vanity-url-cache.json
4data/ls-remote-cache.json
5data/verify-cache.json
6data/.verify/
7
8# Python bytecode
9__pycache__/
10*.pyc
11
12# Editor/IDE files
13*.swp
14*~
15.cache/
diff --git a/scripts/data/manual-overrides.json b/scripts/data/manual-overrides.json
new file mode 100644
index 00000000..5657c40c
--- /dev/null
+++ b/scripts/data/manual-overrides.json
@@ -0,0 +1,8 @@
1{
2 "_comment": "Git-tracked repository overrides for modules where automatic discovery fails.",
3 "_format": "module/path or module/path@version -> repository URL",
4 "_example": {
5 "example.com/broken-vanity": "https://github.com/org/actual-repo",
6 "example.com/versioned@v1.2.3": "https://github.com/org/specific-version-repo"
7 }
8}
diff --git a/scripts/extract-discovered-modules.py b/scripts/extract-discovered-modules.py
new file mode 100755
index 00000000..1cfca6ad
--- /dev/null
+++ b/scripts/extract-discovered-modules.py
@@ -0,0 +1,491 @@
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0-only
3#
4# go-dep processor
5#
6# Copyright (C) 2025 Bruce Ashfield
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License version 2 as
10# published by the Free Software Foundation.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License along
18# with this program; if not, write to the Free Software Foundation, Inc.,
19# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21"""
22Extract complete module metadata from BitBake Go discovery build cache.
23
24This script walks a GOMODCACHE directory (from BitBake discovery build) and
25extracts all module metadata from .info files, including VCS information.
26
27Usage:
28 extract-discovered-modules.py --gomodcache /path/to/cache --output modules.json
29
30The script creates:
31 - modules.json: Complete metadata with VCS URLs, commits, subdirs, timestamps
32 - modules.txt: Simple module@version list
33
34This provides 100% accurate module discovery for BitBake recipe generation.
35"""
36
37import argparse
38import json
39import os
40import re
41import shutil
42import subprocess
43import sys
44import tempfile
45import urllib.parse
46from pathlib import Path
47
48
def git_ls_remote(url: str, ref: str) -> str:
    """
    Resolve *ref* in the remote repository at *url* to a 40-char commit hash.

    For tag refs the peeled form ("ref^{}") is queried first so annotated
    tags resolve to the commit they point at, not the tag object itself.
    Returns '' when the ref cannot be resolved (bad URL, missing ref,
    timeout, or any other failure).
    """
    candidates = [f"{ref}^{{}}", ref] if ref.startswith("refs/tags/") else [ref]
    try:
        for candidate in candidates:
            proc = subprocess.run(
                ['git', 'ls-remote', url, candidate],
                capture_output=True,
                text=True,
                timeout=30
            )
            if proc.returncode != 0 or not proc.stdout.strip():
                continue
            # Each output line looks like "<hash>\t<ref>"; use the first.
            first_line = proc.stdout.strip().split('\n')[0]
            commit = first_line.split('\t')[0]
            if len(commit) == 40:
                return commit
    except Exception:
        # Best-effort lookup: any failure falls through to the empty result.
        pass
    return ''
75
76
def resolve_short_hash(url: str, short_hash: str) -> str:
    """
    Expand a 12-char Go pseudo-version commit hash to the full 40-char hash.

    Go pseudo-versions embed only 12 hex characters, but BitBake's git
    fetcher requires the full hash.  Three strategies are tried in order of
    cost: the GitHub commits API (single request, rate-limited), a plain
    `git ls-remote` scan of advertised refs, and finally a bare blobless
    clone plus `git rev-parse`.  The input is returned unchanged when it is
    not 12 characters long or when every strategy fails.
    """
    if len(short_hash) != 12:
        # Already a full hash (or malformed) - nothing to resolve.
        return short_hash

    # Strategy 1: GitHub API. Fast, but unauthenticated callers get 60 req/hour.
    if 'github.com' in url:
        try:
            import urllib.request
            slug = url.replace('https://github.com/', '').replace('.git', '')
            endpoint = f"https://api.github.com/repos/{slug}/commits/{short_hash}"
            request = urllib.request.Request(endpoint, headers={'User-Agent': 'oe-go-mod-fetcher'})
            with urllib.request.urlopen(request, timeout=10) as reply:
                payload = json.loads(reply.read().decode())
            if 'sha' in payload and len(payload['sha']) == 40:
                return payload['sha']
        except Exception:
            pass  # Rate limited or unreachable - fall through to git.

    # Strategy 2: scan advertised refs. Works when the commit is a branch
    # head or tag target.
    try:
        proc = subprocess.run(
            ['git', 'ls-remote', url],
            capture_output=True,
            text=True,
            timeout=30
        )
        if proc.returncode == 0:
            for ref_line in proc.stdout.strip().split('\n'):
                if not ref_line:
                    continue
                advertised = ref_line.split('\t')[0]
                if advertised.startswith(short_hash):
                    return advertised
    except Exception:
        pass

    # Strategy 3: bare, blobless clone, then rev-parse. Slow but reaches
    # any commit in history.
    try:
        with tempfile.TemporaryDirectory(prefix='hash-resolve-') as workdir:
            repo_dir = workdir + '/repo'
            cloned = subprocess.run(
                ['git', 'clone', '--bare', '--filter=blob:none', url, repo_dir],
                capture_output=True,
                timeout=120,
                env={**os.environ, 'GIT_TERMINAL_PROMPT': '0'}
            )
            if cloned.returncode == 0:
                expanded = subprocess.run(
                    ['git', 'rev-parse', short_hash],
                    cwd=repo_dir,
                    capture_output=True,
                    text=True,
                    timeout=10
                )
                if expanded.returncode == 0:
                    full_hash = expanded.stdout.strip()
                    if len(full_hash) == 40:
                        return full_hash
    except Exception:
        pass

    # Every strategy failed - hand back the original short hash.
    return short_hash
150
151
def derive_vcs_info(module_path, version):
    """
    Derive VCS URL and commit info from module path and version.

    Used for modules where the Go proxy doesn't provide Origin metadata
    (older modules cached before Go 1.18).

    Args:
        module_path: Go import path, e.g. github.com/owner/repo/sub/pkg
        version: tagged version (v1.2.3) or pseudo-version
            (v0.0.0-YYYYMMDDHHMMSS-abcdef123456), possibly +incompatible

    Returns:
        dict with vcs_url, vcs_hash (12-char for pseudo-versions; resolved
        via git ls-remote for tags), vcs_ref, subdir -- or None when the
        URL cannot be derived from the module path alone.
    """
    vcs_url = None
    vcs_hash = ''
    vcs_ref = ''
    subpath = ''  # FIX #32: Track subpath for multi-module repos (tag prefix)

    # Derive URL from module path
    if module_path.startswith('github.com/'):
        # github.com/owner/repo or github.com/owner/repo/subpkg
        parts = module_path.split('/')
        if len(parts) >= 3:
            vcs_url = f"https://github.com/{parts[1]}/{parts[2]}"
            # FIX #32: Track subpath for multi-module repos (e.g., github.com/owner/repo/cmd/tool)
            if len(parts) > 3:
                subpath = '/'.join(parts[3:])

    elif module_path.startswith('gitlab.com/'):
        parts = module_path.split('/')
        if len(parts) >= 3:
            vcs_url = f"https://gitlab.com/{parts[1]}/{parts[2]}"

    elif module_path.startswith('bitbucket.org/'):
        parts = module_path.split('/')
        if len(parts) >= 3:
            vcs_url = f"https://bitbucket.org/{parts[1]}/{parts[2]}"

    elif module_path.startswith('gopkg.in/'):
        # gopkg.in has two documented URL forms (see labix.org/gopkg.in):
        #   gopkg.in/pkg.vN      -> github.com/go-pkg/pkg (by convention)
        #   gopkg.in/user/pkg.vN -> github.com/user/pkg
        # GENERALIZATION: the two-element form was previously unhandled and
        # silently returned None; handle it first since it is unambiguous.
        match = re.match(r'gopkg\.in/([^/]+)/([^/]+)\.v\d+', module_path)
        if match:
            vcs_url = f"https://github.com/{match.group(1)}/{match.group(2)}"
        else:
            match = re.match(r'gopkg\.in/([^/]+)\.v\d+', module_path)
            if match:
                pkg_name = match.group(1)
                # Common mappings - some use go-* prefix, others don't
                mappings = {
                    'yaml': 'https://github.com/go-yaml/yaml',
                    'check': 'https://github.com/go-check/check',
                    'inf': 'https://github.com/go-inf/inf',
                    'tomb': 'https://github.com/go-tomb/tomb',
                    'fsnotify': 'https://github.com/fsnotify/fsnotify',  # No go- prefix
                }
                vcs_url = mappings.get(pkg_name, f"https://github.com/go-{pkg_name}/{pkg_name}")

    elif module_path.startswith('google.golang.org/'):
        # google.golang.org vanity imports -> github.com/golang/* by default,
        # with a few special-cased repos below.
        #
        # FIX #32: Handle submodules in multi-module repos.
        # google.golang.org/grpc/cmd/protoc-gen-go-grpc has tags like:
        #   cmd/protoc-gen-go-grpc/v1.1.0 (NOT v1.1.0)
        # so we must track the subpath for tag prefix construction.
        parts = module_path.split('/')
        if len(parts) >= 2:
            pkg_name = parts[1]  # First component after google.golang.org/
            mappings = {
                'protobuf': 'https://github.com/protocolbuffers/protobuf-go',
                'grpc': 'https://github.com/grpc/grpc-go',
                'genproto': 'https://github.com/googleapis/go-genproto',
                'api': 'https://github.com/googleapis/google-api-go-client',
            }
            vcs_url = mappings.get(pkg_name, f"https://github.com/golang/{pkg_name}")
            # Track subpath for submodule tag construction (e.g., cmd/protoc-gen-go-grpc)
            if len(parts) > 2:
                subpath = '/'.join(parts[2:])  # Everything after google.golang.org/grpc/

    if not vcs_url:
        return None

    # Parse version for commit hash (pseudo-versions). Formats:
    #   v0.0.0-20200815063812-42c35b437635          (no base version)
    #   v1.2.3-0.20200815063812-42c35b437635        (pre-release, "0." prefix)
    # Key pattern: optional "0." then a 14-digit timestamp then a 12-char
    # commit hash.  A +incompatible suffix is stripped first.
    clean_version = version.replace('+incompatible', '')

    pseudo_match = re.search(r'-(?:0\.)?(\d{14})-([0-9a-f]{12})$', clean_version)
    if pseudo_match:
        vcs_hash = pseudo_match.group(2)  # 12-char short hash
        # Note: Short hashes are expanded to full 40-char by oe-go-mod-fetcher.py
        # in load_native_modules() using resolve_pseudo_version_commit()
    else:
        # Tagged version - resolve tag to commit hash.
        # FIX #32: For multi-module repos the tag includes the subpath prefix,
        # e.g. google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.1.0 has the
        # tag cmd/protoc-gen-go-grpc/v1.1.0 (not v1.1.0).
        if subpath:
            tag_name = f"{subpath}/{clean_version}"
        else:
            tag_name = clean_version
        vcs_ref = f"refs/tags/{tag_name}"
        # Query the repository to get the actual commit hash for this tag
        vcs_hash = git_ls_remote(vcs_url, vcs_ref)
        if not vcs_hash and subpath:
            # FIX #32: Fallback - some repos don't use prefixed tags for submodules.
            fallback_ref = f"refs/tags/{clean_version}"
            vcs_hash = git_ls_remote(vcs_url, fallback_ref)
            if vcs_hash:
                vcs_ref = fallback_ref  # Use the working ref

    return {
        'vcs_url': vcs_url,
        'vcs_hash': vcs_hash,
        'vcs_ref': vcs_ref,
        'subdir': subpath,  # FIX #32: Return subdir for submodules
    }
277
278
def extract_modules(gomodcache_path):
    """
    Walk GOMODCACHE and extract all module metadata from .info files.

    Args:
        gomodcache_path: Root of a GOMODCACHE tree; must contain the
            cache/download subdirectory that `go mod download` creates.

    Returns:
        List of dicts with complete metadata:
        - module_path: Unescaped module path
        - version: Module version
        - vcs_url: Git repository URL
        - vcs_hash: Commit hash (12-char short for derived pseudo-versions)
        - vcs_ref: Tag/branch reference
        - subdir: Subdirectory in mono-repos
        - timestamp: Commit timestamp

    Raises:
        FileNotFoundError: if cache/download does not exist.
    """
    cache_dir = Path(gomodcache_path) / "cache" / "download"

    if not cache_dir.exists():
        raise FileNotFoundError(f"Cache directory not found: {cache_dir}")

    modules = []
    skipped = 0
    derived = 0
    total_info_files = 0

    print(f"Scanning GOMODCACHE: {cache_dir}")

    for info_file in cache_dir.rglob("*.info"):
        total_info_files += 1

        # Extract module path from directory structure
        rel_path = info_file.parent.relative_to(cache_dir)
        parts = list(rel_path.parts)

        # ROBUSTNESS FIX: a stray .info file directly under cache/download
        # yields an empty parts list, which previously raised IndexError on
        # parts[-1]. Skip anything not inside a .../@v directory.
        if not parts or parts[-1] != '@v':
            continue

        # Module path (unescape Go's !-encoding)
        # Example: github.com/!microsoft/go-winio -> github.com/Microsoft/go-winio
        module_path = '/'.join(parts[:-1])
        # Unescape !x -> X (Go's case-insensitive encoding)
        module_path = re.sub(r'!([a-z])', lambda m: m.group(1).upper(), module_path)

        # Version (Path.stem strips only the trailing ".info")
        version = info_file.stem

        # Read .info file for VCS metadata
        try:
            with open(info_file) as f:
                info = json.load(f)

            origin = info.get('Origin', {})

            # Check if we have complete VCS info from Origin
            if origin.get('URL') and origin.get('Hash'):
                module = {
                    'module_path': module_path,
                    'version': version,
                    'vcs_url': origin.get('URL', ''),
                    'vcs_hash': origin.get('Hash', ''),
                    'vcs_ref': origin.get('Ref', ''),
                    'subdir': origin.get('Subdir', ''),
                    'timestamp': info.get('Time', ''),
                }
                modules.append(module)
            else:
                # FIX #29: Module lacks Origin metadata (common for +incompatible modules)
                # Use derive_vcs_info() to infer VCS URL and ref from module path/version
                derived += 1
                # Progress output for derived modules (these require network calls)
                if derived % 10 == 1:
                    print(f"  Deriving VCS info... ({derived} modules)", end='\r', flush=True)
                derived_info = derive_vcs_info(module_path, version)
                if derived_info:
                    module = {
                        'module_path': module_path,
                        'version': version,
                        'vcs_url': derived_info.get('vcs_url', ''),
                        'vcs_hash': derived_info.get('vcs_hash', ''),
                        'vcs_ref': derived_info.get('vcs_ref', ''),
                        'subdir': derived_info.get('subdir', ''),  # FIX #32: Use derived subdir
                        'timestamp': info.get('Time', ''),
                    }
                    modules.append(module)
                else:
                    # Cannot derive VCS info - skip this module
                    skipped += 1
                    derived -= 1  # Don't count as derived if we couldn't derive

        except json.JSONDecodeError as e:
            print(f"  ⚠️ Failed to parse {info_file}: {e}")
            skipped += 1
            continue
        except Exception as e:
            print(f"  ⚠️ Error processing {info_file}: {e}")
            skipped += 1
            continue

    print(f"\nProcessed {total_info_files} .info files")
    print(f"Extracted {len(modules)} modules total:")
    print(f"  - {len(modules) - derived} with Origin metadata from proxy")
    print(f"  - {derived} with derived VCS info (Fix #29)")
    print(f"Skipped {skipped} modules (cannot derive VCS info)")

    return modules
384
385
def main():
    """Command-line entry point.

    Parses --gomodcache/--output, extracts module metadata via
    extract_modules(), writes a JSON metadata file plus a plain
    module@version list next to it, and prints summary statistics.
    Exits with status 1 on any failure or when no modules were found.
    """
    parser = argparse.ArgumentParser(
        description='Extract module metadata from Go module cache',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract from native Go build cache
  %(prog)s --gomodcache /tmp/k3s-discovery-cache --output /tmp/k3s-modules.json

  # Extract from BitBake discovery build
  %(prog)s --gomodcache /path/to/build/tmp/work/.../discovery-cache --output /tmp/k3s-modules.json

  # Extract from system GOMODCACHE
  %(prog)s --gomodcache ~/go/pkg/mod --output /tmp/modules.json

Output:
  - <output>.json: Complete module metadata (VCS URLs, commits, subdirs)
  - <output>.txt: Simple module@version list (sorted)
"""
    )
    parser.add_argument(
        '--gomodcache',
        required=True,
        help='Path to GOMODCACHE directory'
    )
    parser.add_argument(
        '--output',
        required=True,
        help='Output JSON file path (e.g., /tmp/k3s-modules.json)'
    )

    args = parser.parse_args()

    # Validate GOMODCACHE path before doing any work
    gomodcache = Path(args.gomodcache)
    if not gomodcache.exists():
        print(f"Error: GOMODCACHE directory does not exist: {gomodcache}", file=sys.stderr)
        sys.exit(1)

    # Extract modules (may raise on a malformed cache; treat as fatal)
    try:
        modules = extract_modules(gomodcache)
    except Exception as e:
        print(f"Error during extraction: {e}", file=sys.stderr)
        sys.exit(1)

    if not modules:
        print("Warning: No modules with VCS metadata found!", file=sys.stderr)
        print("This may indicate:", file=sys.stderr)
        print("  - GOMODCACHE is from BitBake (synthetic .info files)", file=sys.stderr)
        print("  - GOMODCACHE is empty or incomplete", file=sys.stderr)
        print("  - Need to run 'go mod download' first", file=sys.stderr)
        sys.exit(1)

    # Save the full metadata as JSON (sorted keys for stable diffs)
    output_path = Path(args.output)
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(modules, indent=2, sort_keys=True))
        print(f"\n✓ Saved {len(modules)} modules to {output_path}")
    except Exception as e:
        print(f"Error writing JSON output: {e}", file=sys.stderr)
        sys.exit(1)

    # Also save a simple sorted module@version list alongside the JSON
    list_path = output_path.with_suffix('.txt')
    try:
        simple_list = [f"{m['module_path']}@{m['version']}" for m in modules]
        list_path.write_text('\n'.join(sorted(simple_list)) + '\n')
        print(f"✓ Saved module list to {list_path}")
    except Exception as e:
        print(f"Error writing module list: {e}", file=sys.stderr)
        sys.exit(1)

    # Print summary statistics
    print("\n" + "="*60)
    print("EXTRACTION SUMMARY")
    print("="*60)

    # Count unique repositories
    unique_repos = len(set(m['vcs_url'] for m in modules))
    print(f"Total modules: {len(modules)}")
    print(f"Unique repositories: {unique_repos}")

    # Count modules with subdirs (multi-module repos)
    with_subdirs = sum(1 for m in modules if m['subdir'])
    print(f"Multi-module repos: {with_subdirs} modules have subdirs")

    # Show top repositories by module count
    repo_counts = {}
    for m in modules:
        repo_counts[m['vcs_url']] = repo_counts.get(m['vcs_url'], 0) + 1

    top_repos = sorted(repo_counts.items(), key=lambda x: x[1], reverse=True)[:5]
    print("\nTop 5 repositories by module count:")
    for repo_url, count in top_repos:
        print(f"  {count:3d} modules: {repo_url}")

    print("\n" + "="*60)
    print("Use this JSON file with:")
    print(f"  oe-go-mod-fetcher.py --native-modules {output_path}")
    print("="*60)
488
489
if __name__ == '__main__':
    # Run the CLI only when executed directly, not when imported.
    main()
diff --git a/scripts/oe-go-mod-fetcher.py b/scripts/oe-go-mod-fetcher.py
new file mode 100755
index 00000000..699255bd
--- /dev/null
+++ b/scripts/oe-go-mod-fetcher.py
@@ -0,0 +1,4580 @@
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0-only
3#
4# go-dep processor
5#
6# Copyright (C) 2025 Bruce Ashfield
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License version 2 as
10# published by the Free Software Foundation.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License along
18# with this program; if not, write to the Free Software Foundation, Inc.,
19# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21"""
22Go Module Git Fetcher - Hybrid Architecture
23Version 3.0.0 - Complete rewrite using Go download for discovery + git builds
24Author: Bruce Ashfield
25Description: Use Go's download for discovery, build from git sources
26
27ARCHITECTURE:
28Phase 1: Discovery - Use 'go mod download' + filesystem walk to get correct module paths
29Phase 2: Recipe Generation - Generate BitBake recipe with git:// SRC_URI entries
30Phase 3: Cache Building - Build module cache from git sources during do_create_module_cache
31
32This approach eliminates:
33- Complex go list -m -json parsing
34- Manual go.sum parsing and augmentation
35- Parent module detection heuristics
36- Version path manipulation (/v2+/v3+ workarounds)
37- Module path normalization bugs
38
39Instead we:
40- Let Go download modules to temporary cache (discovery only)
41- Walk filesystem to get CORRECT module paths (no parsing!)
42- Extract VCS info from .info files
43- Fetch git repositories for each module
44- Build module cache from git during BitBake build
45
46CHANGELOG v3.0.0:
47- Complete architectural rewrite
48- Removed all go list and go.sum parsing logic (4000+ lines)
49- Implemented 3-phase hybrid approach
50- Discovery uses go mod download + filesystem walk
51- Module paths from filesystem, not from go list (no more /v3 stripping bugs!)
52- Builds entirely from git sources
53- Compatible with oe-core's gomod:// fetcher (same cache structure)
54"""
55
56import argparse
57import concurrent.futures
58import hashlib
59import io
60import json
61import os
62import re
63import shutil
64import subprocess
65import sys
66import tempfile
67import textwrap
68import threading
69from pathlib import Path
70from typing import Dict, List, Optional, Set, Tuple
71from datetime import datetime, timedelta, timezone
72
# Tool version string (matches the v3.0.0 rewrite described in the module docstring).
VERSION = "3.0.0"
# Optional path for a log file; None until configured — presumably consumed
# by the Tee stream below. TODO confirm where it is set.
LOG_PATH: Optional[Path] = None

# =============================================================================
# BitBake Task Templates
# =============================================================================
79
80
class Tee(io.TextIOBase):
    """Duplicate text writes across several streams, like the Unix tee tool."""

    def __init__(self, *streams: io.TextIOBase) -> None:
        # Targets that every write()/flush() fans out to.
        self.streams = streams

    def write(self, data: str) -> int:
        """Write *data* to every target stream; return the character count."""
        for target in self.streams:
            target.write(data)
        return len(data)

    def flush(self) -> None:
        """Flush each underlying stream."""
        for target in self.streams:
            target.flush()
95
def parse_go_sum(go_sum_path: Path) -> Tuple[Set[Tuple[str, str]], Set[Tuple[str, str]]]:
    """
    Scan a go.sum file and split its modules into two categories.

    Returns:
        (needs_source, gomod_only) where:
        - needs_source: (module, version) pairs that have a source-archive
          hash entry and therefore need a .zip in the module cache
        - gomod_only: pairs that appear solely via "<version>/go.mod"
          lines, so only the .mod file is required
    """
    def _strip_quotes(name):
        """Drop one pair of surrounding double quotes, if present."""
        if not name:
            return name
        trimmed = name.strip()
        if len(trimmed) >= 2 and trimmed.startswith('"') and trimmed.endswith('"'):
            return trimmed[1:-1]
        return trimmed

    needs_source: Set[Tuple[str, str]] = set()
    gomod_only: Set[Tuple[str, str]] = set()

    if not go_sum_path.exists():
        return (needs_source, gomod_only)

    # Pass 1: note, per (module, base-version), which entry kinds appear.
    seen = {}
    with go_sum_path.open() as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw or raw.startswith('//'):
                continue
            fields = raw.split()
            if len(fields) != 3:
                continue

            mod_name, ver, _digest = fields
            mod_name = _strip_quotes(mod_name)

            # A "/go.mod" suffix marks a go.mod-only hash line.
            gomod_entry = ver.endswith('/go.mod')
            base_ver = ver[:-7] if gomod_entry else ver

            flags = seen.setdefault((mod_name, base_ver),
                                    {'has_source': False, 'has_gomod': False})
            if gomod_entry:
                flags['has_gomod'] = True
            else:
                flags['has_source'] = True

    # Pass 2: categorize each module by the entry kinds it carried.
    for key, flags in seen.items():
        if flags['has_source']:
            needs_source.add(key)
        elif flags['has_gomod']:
            gomod_only.add(key)
        # Note: indirect-only modules are deliberately NOT added to
        # needs_source. The native build succeeds without their .zip files -
        # only the .mod files are needed - and adding them previously made
        # the generator resolve ~1000 extra modules unnecessarily.

    return (needs_source, gomod_only)
162
163
def collect_modules_via_go_list(source_dir: Path) -> Set[Tuple[str, str]]:
    """
    Discover modules via `go list -m -json all`, which can surface entries
    that never appear in go.sum.  Returns (path, version) pairs; an empty
    set if the `go list` invocation fails.
    """
    env = os.environ.copy()
    env.setdefault('GOPROXY', 'https://proxy.golang.org')
    if CURRENT_GOMODCACHE:
        env['GOMODCACHE'] = CURRENT_GOMODCACHE

    try:
        proc = subprocess.run(
            ['go', 'list', '-m', '-json', 'all'],
            cwd=source_dir,
            capture_output=True,
            text=True,
            check=True,
            env=env,
        )
    except subprocess.CalledProcessError:
        return set()

    found: Set[Tuple[str, str]] = set()
    payload = proc.stdout
    total = len(payload)
    decoder = json.JSONDecoder()
    pos = 0

    # go list emits a stream of concatenated JSON objects (not an array),
    # so decode them one at a time with raw_decode.
    while pos < total:
        # Skip inter-object whitespace.
        while pos < total and payload[pos].isspace():
            pos += 1
        if pos >= total:
            break
        try:
            record, pos = decoder.raw_decode(payload, pos)
        except json.JSONDecodeError:
            break

        mod_path = record.get('Path') or ''
        # Skip the main module and records with no path.
        if not mod_path or record.get('Main'):
            continue

        mod_version = record.get('Version') or ''
        replacement = record.get('Replace')
        if replacement:
            # A replace directive supersedes the original path/version.
            mod_path = replacement.get('Path', mod_path) or mod_path
            mod_version = replacement.get('Version', mod_version) or mod_version

        if not mod_version or mod_version == 'none':
            continue

        found.add((mod_path, mod_version))

    return found
218
219
def go_mod_download(module_path: str, version: str) -> bool:
    """
    Download a specific module version into the current GOMODCACHE.

    Args:
        module_path: Go module path to download.
        version: Exact version to download.

    Returns:
        True on success or when the module was already downloaded this run;
        False on failure or when no GOMODCACHE/source dir is configured.

    Raises:
        RuntimeError: when stderr suggests a network outage (DNS, TCP, TLS
            failures), so callers can abort instead of mis-classifying every
            subsequent module as broken.
    """
    if not CURRENT_GOMODCACHE or not CURRENT_SOURCE_DIR:
        return False

    key = (module_path, version)
    if key in DOWNLOADED_MODULES:
        # BUGFIX: previously returned module_path (a str) for this cache hit,
        # contradicting the declared bool return type. Return True instead;
        # both are truthy, so callers observe the same branch behavior.
        return True

    env = os.environ.copy()
    env.setdefault('GOPROXY', 'https://proxy.golang.org')
    env['GOMODCACHE'] = CURRENT_GOMODCACHE

    try:
        subprocess.run(
            ['go', 'mod', 'download', f'{module_path}@{version}'],
            cwd=str(CURRENT_SOURCE_DIR),
            env=env,
            capture_output=True,
            text=True,
            check=True,
            timeout=GO_CMD_TIMEOUT,
        )
        DOWNLOADED_MODULES.add(key)
        return True
    except subprocess.TimeoutExpired:
        print(f" ❌ go mod download timed out for {module_path}@{version} after {GO_CMD_TIMEOUT}s")
        return False
    except subprocess.CalledProcessError as e:
        stderr = (e.stderr or '').strip()
        if stderr:
            lower = stderr.lower()
            # Substrings that indicate an environment/network problem rather
            # than a genuinely broken module.
            network_signals = [
                "lookup ", "dial tcp", "connection refused",
                "network is unreachable", "tls handshake timeout",
                "socket: operation not permitted"
            ]
            if any(signal in lower for signal in network_signals):
                global NETWORK_FAILURE_DETECTED
                NETWORK_FAILURE_DETECTED = True
                raise RuntimeError(
                    f"Network failure while downloading {module_path}@{version}: {stderr}"
                ) from e
        print(f" ⚠️ go mod download failed for {module_path}@{version}: {stderr}")
        return False
265
266
# --- On-disk cache and override locations --------------------------------
SCRIPT_DIR = Path(__file__).resolve().parent
CACHE_BASE_DIR = SCRIPT_DIR / "data"  # Default to scripts/data for JSON caches
DATA_DIR = CACHE_BASE_DIR
CLONE_CACHE_DIR = SCRIPT_DIR / ".cache" / "repos"  # Repository clone cache
VERIFY_BASE_DIR = CACHE_BASE_DIR / ".verify"
LS_REMOTE_CACHE_PATH = DATA_DIR / "ls-remote-cache.json"
VERIFY_COMMIT_CACHE_PATH = DATA_DIR / "verify-cache.json"
MODULE_REPO_OVERRIDES_PATH = DATA_DIR / "repo-overrides.json"
# Manual overrides file - tracked in git, for permanent overrides when discovery fails
MANUAL_OVERRIDES_PATH = SCRIPT_DIR / "data" / "manual-overrides.json"

# In-memory mirrors of the JSON cache files above; each *_DIRTY flag marks
# whether the corresponding cache needs to be written back to disk.
LS_REMOTE_CACHE: Dict[Tuple[str, str], Optional[str]] = {}
LS_REMOTE_CACHE_DIRTY = False

MODULE_METADATA_CACHE_PATH = DATA_DIR / "module-cache.json"
MODULE_METADATA_CACHE: Dict[Tuple[str, str], Dict[str, str]] = {}
MODULE_METADATA_CACHE_DIRTY = False

VANITY_URL_CACHE_PATH = DATA_DIR / "vanity-url-cache.json"
VANITY_URL_CACHE: Dict[str, Optional[str]] = {}
VANITY_URL_CACHE_DIRTY = False

# --- Mutable per-run global state ----------------------------------------
CURRENT_GOMODCACHE: Optional[str] = None  # GOMODCACHE exported to go subprocesses
CURRENT_SOURCE_DIR: Optional[Path] = None  # cwd for go subprocess invocations
TEMP_GOMODCACHES: List[Path] = []  # temporary caches, presumably cleaned up at exit - TODO confirm
FAILED_MODULE_PATHS: Set[str] = set()
FAILED_MODULE_ENTRIES: Set[Tuple[str, str]] = set()
DOWNLOADED_MODULES: Set[Tuple[str, str]] = set()  # (path, version) already fetched this run
NETWORK_FAILURE_DETECTED: bool = False  # Set by go_mod_download() on network errors
SKIPPED_MODULES: Dict[Tuple[str, str], str] = {}  # (path, version) -> skip reason
VERBOSE_MODE: bool = False  # Set from command-line args
298
def _record_skipped_module(module_path: str, version: str, reason: str) -> None:
    """Record why (module_path, version) was skipped, for later reporting."""
    SKIPPED_MODULES[(module_path, version)] = reason
301
# Subprocess timeouts for go and git invocations.
GO_CMD_TIMEOUT = 180  # seconds
GIT_CMD_TIMEOUT = 90  # seconds

# --- Commit-verification state -------------------------------------------
VERIFY_REPO_CACHE: Dict[str, Path] = {}  # repo URL -> local checkout path
VERIFY_REPO_LOCKS: Dict[str, threading.Lock] = {}  # Per-repository locks for parallel verification
VERIFY_REPO_LOCKS_LOCK = threading.RLock()  # REENTRANT lock to allow same thread to acquire multiple times
VERIFY_REPO_BRANCHES: Dict[str, List[str]] = {}  # Cache branch lists per repo to avoid repeated ls-remote
VERIFY_RESULTS: Dict[Tuple[str, str], bool] = {}  # (url, commit) -> verification outcome
VERIFY_COMMIT_CACHE: Dict[str, bool] = {}  # Legacy format: key -> bool
# NOTE(review): 'any' below is the builtin, not typing.Any; harmless at
# runtime but typing.Any is the intended annotation.
VERIFY_COMMIT_CACHE_V2: Dict[str, Dict[str, any]] = {}  # New format: key -> {verified: bool, timestamp: str, last_check: str}
VERIFY_COMMIT_CACHE_DIRTY = False
VERIFY_ENABLED = False  # Set to True when verification is active
VERIFY_CACHE_MAX_AGE_DAYS = 30  # Re-verify commits older than this
VERIFY_DETECTED_BRANCHES: Dict[Tuple[str, str], str] = {}  # (url, commit) -> branch_name
VERIFY_FALLBACK_COMMITS: Dict[Tuple[str, str], str] = {}  # Maps (url, original_commit) -> fallback_commit
VERIFY_FULL_REPOS: Set[str] = set()  # Track repos that have been fetched with full history
VERIFY_CORRECTIONS_APPLIED = False  # Track if any commit corrections were made
MODULE_REPO_OVERRIDES: Dict[Tuple[str, Optional[str]], str] = {}  # Dynamic overrides from --set-repo
MODULE_REPO_OVERRIDES_DIRTY = False
MANUAL_OVERRIDES: Dict[Tuple[str, Optional[str]], str] = {}  # Git-tracked overrides from manual-overrides.json

# REPO_OVERRIDES kept for backwards compatibility but no longer used for hardcoded values.
# Manual overrides go in data/manual-overrides.json which is tracked in git.
REPO_OVERRIDES: Dict[str, List[str]] = {}
326
327
328def _normalise_override_key(module_path: str, version: Optional[str]) -> Tuple[str, Optional[str]]:
329 module = module_path.strip()
330 ver = version.strip() if version else None
331 if not module:
332 raise ValueError("module path for override cannot be empty")
333 return module, ver
334
335
336def _parse_override_spec(module_spec: str) -> Tuple[str, Optional[str]]:
337 if '@' in module_spec:
338 module_path, version = module_spec.split('@', 1)
339 version = version or None
340 else:
341 module_path, version = module_spec, None
342 return module_path.strip(), version.strip() if version else None
343
344
def repo_override_candidates(module_path: str, version: Optional[str] = None) -> List[str]:
    """
    Get repository URL override candidates for a module.

    Priority order:
    1. Dynamic overrides (--set-repo, stored in repo-overrides.json) - version-specific
    2. Dynamic overrides - wildcard (no version)
    3. Manual overrides (manual-overrides.json, tracked in git) - version-specific
    4. Manual overrides - wildcard
    5. Legacy REPO_OVERRIDES dict (for backwards compatibility)
    """
    specific_key = _normalise_override_key(module_path, version)
    wildcard_key = _normalise_override_key(module_path, None)

    candidates: List[str] = []

    def _add(url: Optional[str]) -> None:
        # Preserve priority order while de-duplicating.
        if url and url not in candidates:
            candidates.append(url)

    # Dynamic overrides first (highest priority - user can override manual)
    _add(MODULE_REPO_OVERRIDES.get(specific_key))
    _add(MODULE_REPO_OVERRIDES.get(wildcard_key))

    # Manual overrides next (git-tracked, for permanent fixes)
    _add(MANUAL_OVERRIDES.get(specific_key))
    _add(MANUAL_OVERRIDES.get(wildcard_key))

    # Legacy hardcoded overrides last (backwards compat)
    for legacy_url in REPO_OVERRIDES.get(module_path, []):
        _add(legacy_url)

    return candidates
384
385
def configure_cache_paths(cache_dir: Optional[str], clone_cache_dir: Optional[str] = None) -> None:
    """
    Configure cache file locations.

    Rebinds the module-level cache path globals, creates the directories,
    then clears and reloads the verify-commit cache and the dynamic/manual
    repo overrides so in-memory state matches the newly selected locations.

    Args:
        cache_dir: Directory for JSON metadata caches (default: scripts/data)
        clone_cache_dir: Directory for git repository clones (default: scripts/.cache/repos)
    """
    global CACHE_BASE_DIR, DATA_DIR, CLONE_CACHE_DIR
    global LS_REMOTE_CACHE_PATH, MODULE_METADATA_CACHE_PATH, VANITY_URL_CACHE_PATH
    global VERIFY_COMMIT_CACHE_PATH, MODULE_REPO_OVERRIDES_PATH

    # Configure JSON metadata cache directory
    if cache_dir:
        CACHE_BASE_DIR = Path(cache_dir).resolve()
    else:
        CACHE_BASE_DIR = SCRIPT_DIR / "data" # Default to scripts/data

    CACHE_BASE_DIR.mkdir(parents=True, exist_ok=True)
    DATA_DIR = CACHE_BASE_DIR # cache_dir IS the data directory now

    LS_REMOTE_CACHE_PATH = DATA_DIR / "ls-remote-cache.json"
    MODULE_METADATA_CACHE_PATH = DATA_DIR / "module-cache.json"
    VANITY_URL_CACHE_PATH = DATA_DIR / "vanity-url-cache.json"
    VERIFY_COMMIT_CACHE_PATH = DATA_DIR / "verify-cache.json"
    MODULE_REPO_OVERRIDES_PATH = DATA_DIR / "repo-overrides.json"

    global VERIFY_BASE_DIR
    VERIFY_BASE_DIR = CACHE_BASE_DIR / ".verify"
    VERIFY_BASE_DIR.mkdir(parents=True, exist_ok=True)

    # Configure git clone cache directory
    if clone_cache_dir:
        CLONE_CACHE_DIR = Path(clone_cache_dir).resolve()
    else:
        CLONE_CACHE_DIR = SCRIPT_DIR / ".cache" / "repos" # Default to scripts/.cache/repos

    CLONE_CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Reload caches/overrides from the (possibly new) locations so stale
    # in-memory state from a previous configuration cannot leak through.
    VERIFY_COMMIT_CACHE.clear()
    load_verify_commit_cache()
    MODULE_REPO_OVERRIDES.clear()
    load_repo_overrides()
    load_manual_overrides()

    # Per-URL verify repo dirs were derived from the old VERIFY_BASE_DIR;
    # drop them so _verify_repo_dir re-resolves under the new base.
    global VERIFY_REPO_CACHE
    VERIFY_REPO_CACHE = {}
433
434
def ensure_path_is_writable(path: Path) -> None:
    """
    Verify *path* exists (creating it if needed) and is writable.

    A small probe file is created and removed to confirm write access. On
    failure the process exits with a clear error message rather than an
    unhandled traceback.

    Args:
        path: Directory to check; created (with parents) when missing.
    """
    def _fail(exc: Exception) -> None:
        # Single exit path so the user always gets the same actionable hint.
        print(f"❌ GOMODCACHE is not writable: {path} ({exc})")
        print(" Fix permissions (e.g. chown/chmod) or pass a writable --gomodcache path.")
        sys.exit(1)

    try:
        # mkdir itself can raise (e.g. PermissionError on a read-only parent);
        # previously this escaped as a raw traceback instead of a clear error.
        path.mkdir(parents=True, exist_ok=True)
    except Exception as exc:
        _fail(exc)

    probe = path / ".oe-go-mod-fetcher-permcheck"
    try:
        with open(probe, "w") as fh:
            fh.write("")
    except Exception as exc:
        _fail(exc)
    finally:
        # Best-effort cleanup; the probe may not exist if open() failed.
        try:
            probe.unlink()
        except Exception:
            pass
454
455def _normalize_url(url: str) -> str:
456 url = url.strip()
457 if url.startswith("git://"):
458 url = "https://" + url[6:]
459 if url.endswith(".git"):
460 url = url[:-4]
461 return url
462
463
def _url_allowed_for_module(module_path: str, url: str, version: Optional[str] = None) -> bool:
    """Return True when *url* satisfies the override policy for the module (or no policy exists)."""
    candidates = repo_override_candidates(module_path, version)
    if not candidates:
        # No overrides configured: any URL is acceptable.
        return True
    allowed = {_normalize_url(candidate) for candidate in candidates}
    return _normalize_url(url) in allowed
471
472
def prune_metadata_cache() -> None:
    """
    Remove stale metadata entries that no longer satisfy override policies or
    contain obviously invalid data. This prevents old .inc state from
    re-introducing bad repositories during bootstrap.
    """
    global MODULE_METADATA_CACHE_DIRTY

    dropped_any = False
    for key in list(MODULE_METADATA_CACHE.keys()):
        module_path, version = key
        entry = MODULE_METADATA_CACHE.get(key) or {}
        vcs_url = entry.get('vcs_url', '')
        commit = entry.get('commit', '')

        # An entry survives only when it has a URL, a well-formed 40-hex
        # commit, and the URL is still permitted by the override policy.
        keep = bool(vcs_url) and bool(commit)
        if keep:
            keep = len(commit) == 40 and re.fullmatch(r'[0-9a-fA-F]{40}', commit) is not None
        if keep:
            keep = _url_allowed_for_module(module_path, vcs_url, version)

        if not keep:
            MODULE_METADATA_CACHE.pop(key, None)
            dropped_any = True

    if dropped_any:
        MODULE_METADATA_CACHE_DIRTY = True
505
506
def _verify_repo_dir(vcs_url: str) -> Path:
    """
    Return the bare git dir used to verify commits for *vcs_url*.

    The repo lives under VERIFY_BASE_DIR in a directory named by the SHA-256
    of the URL and is created (git init --bare + remote add origin) on first
    use. Initialization is serialized by VERIFY_REPO_LOCKS_LOCK, and a
    per-URL lock is guaranteed to exist in VERIFY_REPO_LOCKS before this
    function returns, so callers may index VERIFY_REPO_LOCKS directly.
    """
    # Quick check without lock (optimization)
    if vcs_url in VERIFY_REPO_CACHE:
        return VERIFY_REPO_CACHE[vcs_url]

    # Use master lock to serialize repo initialization
    with VERIFY_REPO_LOCKS_LOCK:
        # Double-check after acquiring lock
        if vcs_url in VERIFY_REPO_CACHE:
            return VERIFY_REPO_CACHE[vcs_url]

        repo_hash = hashlib.sha256(vcs_url.encode()).hexdigest()
        repo_dir = VERIFY_BASE_DIR / repo_hash
        git_dir = repo_dir / "repo"
        git_dir.mkdir(parents=True, exist_ok=True)

        # Never prompt for credentials; fail fast on private/unreachable repos.
        env = os.environ.copy()
        env.setdefault("GIT_TERMINAL_PROMPT", "0")
        env.setdefault("GIT_ASKPASS", "true")

        if not (git_dir / "config").exists():
            subprocess.run([
                "git", "init", "--bare"
            ], cwd=str(git_dir), check=True, capture_output=True, env=env)
            subprocess.run([
                "git", "remote", "add", "origin", vcs_url
            ], cwd=str(git_dir), check=True, capture_output=True, env=env)
        else:
            # Already initialized: refresh the remote URL (best effort, no check).
            subprocess.run([
                "git", "remote", "set-url", "origin", vcs_url
            ], cwd=str(git_dir), check=False, capture_output=True, env=env)

        VERIFY_REPO_CACHE[vcs_url] = git_dir

        # Create a per-repo lock while we still hold the master lock
        if vcs_url not in VERIFY_REPO_LOCKS:
            VERIFY_REPO_LOCKS[vcs_url] = threading.Lock()

        return git_dir
546
547
def _find_fallback_commit(vcs_url: str, version: str, timestamp: str = "") -> Optional[Tuple[str, str]]:
    """
    Find a fallback commit when the proxy commit doesn't exist.

    Strategy:
    1. For pseudo-versions with timestamp: find commit near that date on default branch
    2. Otherwise: use latest commit on default branch (main/master)

    NOTE: the caller must already hold the per-repo lock for vcs_url (this is
    only called from within verify_commit_accessible, which holds it); this
    function deliberately does NOT acquire it again.

    Args:
        vcs_url: Git repository URL
        version: Module version string (a pseudo-version timestamp may be parsed from it)
        timestamp: Optional ISO timestamp; takes precedence over the version-embedded date

    Returns: (commit_hash, branch_name) or None if failed
    """
    import re
    from datetime import datetime

    env = os.environ.copy()
    env.setdefault("GIT_TERMINAL_PROMPT", "0")
    env.setdefault("GIT_ASKPASS", "true")

    # Extract timestamp from pseudo-version: v0.0.0-YYYYMMDDHHMMSS-hash
    target_date = None
    if timestamp:
        try:
            target_date = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
        except Exception:
            pass

    if not target_date:
        # Try to extract from pseudo-version format
        match = re.match(r'v\d+\.\d+\.\d+-(\d{14})-[0-9a-f]+', version)
        if match:
            date_str = match.group(1) # YYYYMMDDHHMMSS
            try:
                target_date = datetime.strptime(date_str, '%Y%m%d%H%M%S')
            except Exception:
                pass

    # Get default branch
    try:
        result = subprocess.run(
            ["git", "ls-remote", "--symref", vcs_url, "HEAD"],
            capture_output=True,
            text=True,
            timeout=30,
            env=env,
        )
        if result.returncode == 0 and result.stdout:
            # Parse: ref: refs/heads/main  HEAD
            for line in result.stdout.split('\n'):
                if line.startswith('ref:'):
                    default_branch = line.split()[1].replace('refs/heads/', '')
                    break
            else:
                default_branch = 'main' # Fallback
        else:
            default_branch = 'main'
    except Exception:
        default_branch = 'main'

    # Get commits on default branch
    try:
        if target_date:
            # Find commit closest to target date
            # We need to clone the repo to access commit history with dates

            # NOTE: Do NOT acquire per-repo lock here - our caller already holds it!
            # _find_fallback_commit is only called from within verify_commit_accessible,
            # which has already acquired the per-repo lock for this vcs_url.

            # Get the repo dir (cached, won't re-initialize)
            repo_dir = VERIFY_REPO_CACHE.get(vcs_url)
            if not repo_dir:
                # Shouldn't happen (verify_commit_accessible calls _verify_repo_dir first)
                # but be defensive
                repo_dir = _verify_repo_dir(vcs_url)

            # Fetch the default branch (caller holds lock, so this is safe)
            try:
                subprocess.run(
                    ["git", "fetch", "origin", f"{default_branch}:refs/remotes/origin/{default_branch}"],
                    cwd=str(repo_dir),
                    check=True,
                    capture_output=True,
                    text=True,
                    timeout=60,
                    env=env,
                )
            except subprocess.CalledProcessError:
                # Fallback to latest if fetch fails
                pass

            # Use git log with --until to find commit at or before target date
            # Format: YYYY-MM-DD HH:MM:SS
            date_str = target_date.strftime('%Y-%m-%d %H:%M:%S')
            try:
                result = subprocess.run(
                    ["git", "log", "-1", "--format=%H", f"--until={date_str}", f"origin/{default_branch}"],
                    cwd=str(repo_dir),
                    capture_output=True,
                    text=True,
                    timeout=30,
                    env=env,
                )
                if result.returncode == 0 and result.stdout.strip():
                    commit_hash = result.stdout.strip()
                    return (commit_hash, default_branch)
            except subprocess.CalledProcessError:
                pass

            # If date-based search failed, fall back to latest commit
            result = subprocess.run(
                ["git", "rev-parse", f"origin/{default_branch}"],
                cwd=str(repo_dir),
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            if result.returncode == 0 and result.stdout.strip():
                commit_hash = result.stdout.strip()
                return (commit_hash, default_branch)
        else:
            # Use latest commit from ls-remote (no need to clone)
            result = subprocess.run(
                ["git", "ls-remote", vcs_url, f"refs/heads/{default_branch}"],
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            if result.returncode == 0 and result.stdout:
                commit_hash = result.stdout.split()[0]
                return (commit_hash, default_branch)
    except Exception as e:
        print(f" ⚠️ Fallback commit search failed: {e}")

    return None
683
684
def verify_commit_accessible(vcs_url: str, commit: str, ref_hint: str = "", version: str = "", timestamp: str = "") -> bool:
    """
    Fetch commit into a bare cache to ensure it exists upstream.

    Check cache age and force re-verification if too old.
    If commit doesn't exist, use fallback (latest commit on default branch or near timestamp)

    When a correction is applied (a tag that moved, or a fallback commit for
    an orphaned proxy commit), the mapping is recorded in
    VERIFY_FALLBACK_COMMITS and this function still returns True; callers
    must resolve the final commit via get_actual_commit().

    Args:
        vcs_url: Git repository URL
        commit: Commit hash to verify
        ref_hint: Optional ref (tag/branch) that should contain the commit
        version: Module version (for extracting timestamp from pseudo-versions)
        timestamp: ISO timestamp from .info file (for finding commits near that date)

    Returns:
        True if the commit (or a recorded correction) is fetchable upstream,
        False otherwise.
    """
    from datetime import datetime, timezone, timedelta

    # Check cache before acquiring lock (fast path for already-verified commits)
    key = (vcs_url, commit)
    if key in VERIFY_RESULTS:
        return VERIFY_RESULTS[key]

    cache_key = f"{vcs_url}|||{commit}"

    # Track if verification passed via cache (to skip re-saving later)
    cached_verification_passed = False

    # Check cache with aging logic
    if cache_key in VERIFY_COMMIT_CACHE_V2:
        cache_entry = VERIFY_COMMIT_CACHE_V2[cache_key]
        if cache_entry.get("verified"):
            # Check if cache is too old
            last_checked_str = cache_entry.get("last_checked")
            if last_checked_str:
                try:
                    last_checked = datetime.fromisoformat(last_checked_str.replace('Z', '+00:00'))
                    age_days = (datetime.now(timezone.utc) - last_checked).days

                    if age_days < VERIFY_CACHE_MAX_AGE_DAYS:
                        # Cache is fresh for commit existence, but we still need branch detection
                        # Branch detection is cheap (local operation) and critical for BitBake recipes
                        # Don't return early - continue to branch detection below
                        cached_verification_passed = True
                    else:
                        # Cache is stale, force re-verification
                        print(f" ⏰ Cache stale ({age_days} days old), re-verifying {commit[:12]}...")
                        # Fall through to re-verify
                except Exception:
                    # Can't parse timestamp, force re-verification
                    pass
            else:
                # No timestamp, but still need branch detection
                cached_verification_passed = True

    # Legacy cache format fallback
    if cache_key in VERIFY_COMMIT_CACHE and VERIFY_COMMIT_CACHE[cache_key]:
        # Migrate to v2 format during this check
        now = datetime.now(timezone.utc).isoformat()
        VERIFY_COMMIT_CACHE_V2[cache_key] = {
            "verified": True,
            "first_verified": now,
            "last_checked": now,
            "fetch_method": "cached"
        }
        # Don't return early - continue to branch detection
        cached_verification_passed = True

    # Ensure repo is initialized (this creates the lock too)
    repo_dir = _verify_repo_dir(vcs_url)

    # Now safely get the lock (guaranteed to exist after _verify_repo_dir returns)
    lock = VERIFY_REPO_LOCKS[vcs_url]

    with lock:
        # Double-check cache after acquiring lock (another thread may have verified while we waited)
        if key in VERIFY_RESULTS:
            return VERIFY_RESULTS[key]

        env = os.environ.copy()
        env.setdefault("GIT_TERMINAL_PROMPT", "0")
        env.setdefault("GIT_ASKPASS", "true")

        def _commit_exists(check_commit: Optional[str] = None) -> bool:
            """Check if a commit exists in the local repo (defaults to the outer *commit*)."""
            target = check_commit if check_commit else commit
            try:
                subprocess.run(
                    ["git", "rev-parse", "--verify", f"{target}^{{commit}}"],
                    cwd=str(repo_dir),
                    check=True,
                    capture_output=True,
                    env=env,
                )
                return True
            except subprocess.CalledProcessError:
                return False

        global VERIFY_COMMIT_CACHE_DIRTY, VERIFY_FALLBACK_COMMITS
        cached = VERIFY_COMMIT_CACHE.get(cache_key)

        commit_present = _commit_exists()
        if cached and not commit_present:
            # Cached entry without a local commit indicates stale data; drop it.
            VERIFY_COMMIT_CACHE.pop(cache_key, None)
            VERIFY_COMMIT_CACHE_DIRTY = True
            cached = None

        # Only do shallow fetch if commit is not already present
        # Doing --depth=1 on an already-full repo causes git to re-process history (very slow on large repos)
        if not commit_present and ref_hint:
            fetch_args = ["git", "fetch", "--depth=1", "origin", ref_hint]

            try:
                subprocess.run(
                    fetch_args,
                    cwd=str(repo_dir),
                    check=True,
                    capture_output=True,
                    text=True,
                    timeout=GIT_CMD_TIMEOUT,
                    env=env,
                )
            except subprocess.TimeoutExpired:
                print(f" ⚠️ git fetch timeout ({GIT_CMD_TIMEOUT}s) for {vcs_url} {ref_hint or ''}")
            except subprocess.CalledProcessError as exc:
                detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else ""
                if detail:
                    print(f" ⚠️ git fetch failed for {vcs_url} {ref_hint or ''}: {detail}")
                # Continue to attempt direct commit fetch

        # For pseudo-versions, we need to determine which branch contains the commit
        # Strategy depends on whether this is a tagged version or pseudo-version
        commit_fetched = commit_present # If already present, no need to fetch

        if ref_hint and not commit_present:
            # Tagged version: try shallow fetch of the specific commit (only if not already present)
            try:
                fetch_cmd = ["git", "fetch", "--depth=1", "origin", commit]
                subprocess.run(
                    fetch_cmd,
                    cwd=str(repo_dir),
                    check=True,
                    capture_output=True,
                    text=True,
                    timeout=GIT_CMD_TIMEOUT,
                    env=env,
                )
                commit_fetched = True

            except subprocess.CalledProcessError as exc:
                detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else ""
                if detail:
                    print(f" ⚠️ git fetch failed for {vcs_url[:50]}...: {detail[:100]}")

                # If fetching commit failed for a tag, check if tag has moved
                if ref_hint and ref_hint.startswith('refs/tags/'):
                    print(f" → Tag commit not fetchable, checking if tag moved...")
                    try:
                        # Try fetching the tag again to see what it currently points to
                        subprocess.run(
                            ["git", "fetch", "--depth=1", "origin", ref_hint],
                            cwd=str(repo_dir),
                            check=True,
                            capture_output=True,
                            text=True,
                            timeout=GIT_CMD_TIMEOUT,
                            env=env,
                        )

                        # Check what commit the tag now points to
                        result = subprocess.run(
                            ["git", "rev-parse", "FETCH_HEAD"],
                            cwd=str(repo_dir),
                            capture_output=True,
                            text=True,
                            timeout=30,
                            env=env,
                            check=True,
                        )
                        current_tag_commit = result.stdout.strip()

                        if current_tag_commit != commit:
                            print(f" ✓ Tag moved detected:")
                            print(f" Proxy gave us: {commit[:12]} (no longer exists)")
                            print(f" Tag now points to: {current_tag_commit[:12]}")
                            print(f" → Using current tag commit")

                            # Update module to use current commit
                            VERIFY_FALLBACK_COMMITS[(vcs_url, commit)] = current_tag_commit

                            # BUGFIX: this path previously did
                            #   return ('corrected', module_path, version, commit, current_tag_commit)
                            # which raised NameError (module_path is not defined in
                            # this function) and violated the declared bool return
                            # contract. The fallback mapping above is how callers
                            # pick up the corrected commit (via get_actual_commit),
                            # so record success and return True.
                            now = datetime.now(timezone.utc).isoformat()
                            existing_entry = VERIFY_COMMIT_CACHE_V2.get(cache_key, {})
                            VERIFY_COMMIT_CACHE_V2[cache_key] = {
                                "verified": True,
                                "first_verified": existing_entry.get("first_verified", now),
                                "last_checked": now,
                                "fetch_method": "tag-moved"
                            }
                            VERIFY_COMMIT_CACHE_DIRTY = True
                            VERIFY_RESULTS[key] = True
                            return True
                    except subprocess.CalledProcessError:
                        # Can't fetch tag either - this is a real error
                        pass

                # Remove stale git lock files left behind by the failed fetch.
                for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]:
                    lock_path = repo_dir / lock_file
                    if lock_path.exists():
                        try:
                            lock_path.unlink()
                        except Exception:
                            pass
                VERIFY_RESULTS[key] = False
                VERIFY_COMMIT_CACHE.pop(cache_key, None)
                VERIFY_COMMIT_CACHE_DIRTY = True
                return False
        else:
            # Pseudo-version: MUST do full clone to detect which branch contains commit
            # Shallow fetch is useless - we need history for git for-each-ref --contains

            # Check if we already fetched full history for this repo URL
            # This prevents redundant full-history fetches for repos with multiple module versions
            shallow_file = repo_dir / "shallow"
            is_shallow = shallow_file.exists()
            already_full = vcs_url in VERIFY_FULL_REPOS

            if is_shallow and not already_full:
                print(f" → Fetching full history for branch detection...")
                try:
                    # Use --unshallow to convert shallow clone to full clone
                    subprocess.run(
                        ["git", "fetch", "--unshallow", "origin", "+refs/heads/*:refs/remotes/origin/*"],
                        cwd=str(repo_dir),
                        check=True,
                        capture_output=True,
                        text=True,
                        timeout=GIT_CMD_TIMEOUT * 5,
                        env=env,
                    )
                    commit_fetched = True
                    # Mark this repo as having full history
                    VERIFY_FULL_REPOS.add(vcs_url)
                except subprocess.TimeoutExpired:
                    print(f" ⚠️ Full clone timeout for {vcs_url[:50]}...")
                    for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]:
                        lock_path = repo_dir / lock_file
                        if lock_path.exists():
                            try:
                                lock_path.unlink()
                            except Exception:
                                pass
                    VERIFY_RESULTS[key] = False
                    VERIFY_COMMIT_CACHE.pop(cache_key, None)
                    VERIFY_COMMIT_CACHE_DIRTY = True
                    return False
                except subprocess.CalledProcessError as exc:
                    detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else ""
                    if detail:
                        print(f" ⚠️ Full clone failed for {vcs_url[:50]}...: {detail[:100]}")
                    for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]:
                        lock_path = repo_dir / lock_file
                        if lock_path.exists():
                            try:
                                lock_path.unlink()
                            except Exception:
                                pass
                    VERIFY_RESULTS[key] = False
                    VERIFY_COMMIT_CACHE.pop(cache_key, None)
                    VERIFY_COMMIT_CACHE_DIRTY = True
                    return False
            else:
                # Already full - just fetch updates
                print(f" → Fetching updates (repo already full)...")
                try:
                    subprocess.run(
                        ["git", "fetch", "origin", "+refs/heads/*:refs/remotes/origin/*"],
                        cwd=str(repo_dir),
                        check=True,
                        capture_output=True,
                        text=True,
                        timeout=GIT_CMD_TIMEOUT,
                        env=env,
                    )
                    commit_fetched = True
                except subprocess.TimeoutExpired:
                    print(f" ⚠️ Full clone timeout for {vcs_url[:50]}...")
                    for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]:
                        lock_path = repo_dir / lock_file
                        if lock_path.exists():
                            try:
                                lock_path.unlink()
                            except Exception:
                                pass
                    VERIFY_RESULTS[key] = False
                    VERIFY_COMMIT_CACHE.pop(cache_key, None)
                    VERIFY_COMMIT_CACHE_DIRTY = True
                    return False
                except subprocess.CalledProcessError as exc:
                    detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else ""
                    if detail:
                        print(f" ⚠️ Full clone failed for {vcs_url[:50]}...: {detail[:100]}")
                    for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]:
                        lock_path = repo_dir / lock_file
                        if lock_path.exists():
                            try:
                                lock_path.unlink()
                            except Exception:
                                pass
                    VERIFY_RESULTS[key] = False
                    VERIFY_COMMIT_CACHE.pop(cache_key, None)
                    VERIFY_COMMIT_CACHE_DIRTY = True
                    return False

        # Use the original commit or fallback commit for verification
        actual_commit = commit

        if not _commit_exists():
            # Commit doesn't exist in repository - try fallback strategy
            # This handles orphaned commits from proxy.golang.org
            print(f" ⚠️ Commit {commit[:12]} not found in repository {vcs_url[:50]}...")

            if not ref_hint:
                # Pseudo-version without a tag - use timestamp-based fallback
                print(f" → Attempting fallback commit strategy for pseudo-version {version}")
                fallback_result = _find_fallback_commit(vcs_url, version, timestamp)

                if fallback_result:
                    fallback_commit, fallback_branch = fallback_result
                    print(f" ⚠️ Using fallback: {fallback_commit[:12]} from branch '{fallback_branch}'")
                    print(f" (Original commit {commit[:12]} from proxy.golang.org does not exist)")

                    # Update commit to use the fallback
                    actual_commit = fallback_commit

                    # Track the fallback mapping so callers can use the fallback commit
                    VERIFY_FALLBACK_COMMITS[(vcs_url, commit)] = fallback_commit

                    # Fetch the fallback commit (only unshallow if repo is still shallow)
                    shallow_file = repo_dir / "shallow"
                    is_shallow = shallow_file.exists()
                    try:
                        if is_shallow:
                            subprocess.run(
                                ["git", "fetch", "--unshallow", "origin", "+refs/heads/*:refs/remotes/origin/*"],
                                cwd=str(repo_dir),
                                check=True,
                                capture_output=True,
                                text=True,
                                timeout=GIT_CMD_TIMEOUT * 5,
                                env=env,
                            )
                        else:
                            # Repo already has full history - just fetch updates
                            subprocess.run(
                                ["git", "fetch", "origin", "+refs/heads/*:refs/remotes/origin/*"],
                                cwd=str(repo_dir),
                                check=True,
                                capture_output=True,
                                text=True,
                                timeout=GIT_CMD_TIMEOUT,
                                env=env,
                            )
                    except Exception as e:
                        print(f" ⚠️ Failed to fetch fallback commit: {e}")
                        VERIFY_RESULTS[key] = False
                        return False

                    # Register the fallback branch
                    VERIFY_DETECTED_BRANCHES[(vcs_url, fallback_commit)] = fallback_branch

                    # Check if fallback commit exists
                    if not _commit_exists(fallback_commit):
                        print(f" ⚠️ Fallback commit {fallback_commit[:12]} also not found!")
                        VERIFY_RESULTS[key] = False
                        return False
                else:
                    print(f" ⚠️ Could not determine fallback commit")
                    VERIFY_RESULTS[key] = False
                    return False
            else:
                # Tagged version with bad commit - this shouldn't happen but fail gracefully
                print(f" ⚠️ Tagged version {version} has invalid commit {commit[:12]}")
                VERIFY_RESULTS[key] = False
                return False

        # Now verify the actual_commit (original or fallback)
        if _commit_exists(actual_commit):
            # Commit was fetched successfully - verify it's reachable from the ref_hint if provided
            # This ensures the commit is on the branch/tag we'll use in SRC_URI
            if ref_hint:
                # For tagged versions, verify the tag still points to the same commit
                # proxy.golang.org caches module@version->commit mappings, but tags can be force-pushed
                # If the tag has moved to a different commit, we need to use the current commit
                # Optimization: Use git ls-remote first (fast, cached) before fetching
                if ref_hint.startswith('refs/tags/'):
                    try:
                        # First check if tag has moved using fast ls-remote (cached)
                        current_tag_commit = git_ls_remote(vcs_url, ref_hint)

                        if current_tag_commit and current_tag_commit != actual_commit:
                            # Tag has moved - fetch it to verify and update local repo
                            print(f" ⚠️ Tag has moved - proxy.golang.org cache is stale")
                            print(f" Proxy gave us: {actual_commit[:12]}")
                            print(f" Tag now points to: {current_tag_commit[:12]}")
                            print(f" → Using current tag commit")

                            # Fetch the tag to update local repo
                            subprocess.run(
                                ["git", "fetch", "--depth=1", "origin", ref_hint],
                                cwd=str(repo_dir),
                                check=True,
                                capture_output=True,
                                text=True,
                                timeout=GIT_CMD_TIMEOUT,
                                env=env,
                            )

                            # Update to use current commit
                            VERIFY_FALLBACK_COMMITS[(vcs_url, actual_commit)] = current_tag_commit
                            actual_commit = current_tag_commit

                            # Verify the new commit exists (it should, since we just fetched it)
                            if not _commit_exists(current_tag_commit):
                                print(f" ⚠️ Current tag commit {current_tag_commit[:12]} not found!")
                                VERIFY_RESULTS[key] = False
                                VERIFY_COMMIT_CACHE.pop(cache_key, None)
                                VERIFY_COMMIT_CACHE_DIRTY = True
                                return False

                        # The VERIFY_FALLBACK_COMMITS mapping will be used by the caller
                        # Continue with verification using the corrected commit
                    except Exception as e:
                        # Tag verification failed - continue with normal flow
                        print(f" ⚠️ Could not verify tag target: {e}")
                        pass

                try:
                    # Check if commit is an ancestor of (or equal to) the ref
                    # This works even with shallow clones
                    result = subprocess.run(
                        ["git", "merge-base", "--is-ancestor", actual_commit, "FETCH_HEAD"],
                        cwd=str(repo_dir),
                        capture_output=True,
                        text=True,
                        timeout=30,
                        env=env,
                    )
                    if result.returncode != 0:
                        # Commit is not an ancestor of the ref - might be on a different branch
                        # This is OK - BitBake can still fetch the commit directly
                        # Just log it for debugging
                        pass # Don't fail - commit exists and is fetchable
                except subprocess.TimeoutExpired:
                    print(f" ⚠️ Timeout checking commit ancestry for {actual_commit[:12]}")
                    # Don't fail - commit exists
                except subprocess.CalledProcessError:
                    # merge-base failed - don't fail verification
                    pass
            else:
                # For pseudo-versions, we MUST detect which branch contains the commit
                # This is CRITICAL - BitBake cannot fetch arbitrary commits with nobranch=1
                # We need branch=<name> in SRC_URI for interior commits

                # Check if we already have the branch from fallback
                if (vcs_url, actual_commit) not in VERIFY_DETECTED_BRANCHES:
                    # Now that we have full history, use git to find which branches contain this commit
                    try:
                        result = subprocess.run(
                            ["git", "for-each-ref", "--contains", actual_commit, "refs/remotes/origin/", "--format=%(refname:short)"],
                            cwd=str(repo_dir),
                            capture_output=True,
                            text=True,
                            timeout=30,
                            env=env,
                        )
                        if result.returncode == 0 and result.stdout.strip():
                            # Commit IS on one or more branches
                            branches = result.stdout.strip().split('\n')
                            # Strip 'origin/' prefix from branch names
                            branches = [b.replace('origin/', '') for b in branches]

                            # Pick main/master if available, otherwise first branch
                            if 'main' in branches:
                                detected_branch = 'main'
                            elif 'master' in branches:
                                detected_branch = 'master'
                            else:
                                detected_branch = branches[0]

                            VERIFY_DETECTED_BRANCHES[(vcs_url, actual_commit)] = detected_branch
                            print(f" → Detected branch: {detected_branch} (verified with git for-each-ref)")
                        else:
                            # Commit exists but not in any branch - it's orphaned/dangling
                            # For pseudo-versions, try fallback strategy
                            # DEBUG: ALWAYS print this to confirm we reach this block
                            print(f" ⚠️ ORPHANED: Commit {actual_commit[:12]} not found in any branch for {vcs_url[:50]}")
                            print(f" DEBUG-ORPHANED: ref_hint={ref_hint}, actual_commit={actual_commit[:12]}, commit={commit[:12]}, version={version}")
                            print(f" DEBUG-ORPHANED: Condition: (not ref_hint)={not ref_hint}, (actual==commit)={actual_commit == commit}")

                            if not ref_hint and actual_commit == commit:
                                # This is a pseudo-version with orphaned commit - try fallback
                                print(f" → Attempting fallback commit strategy for orphaned commit")
                                fallback_result = _find_fallback_commit(vcs_url, version, timestamp)

                                if fallback_result:
                                    fallback_commit, fallback_branch = fallback_result
                                    print(f" ✓ Using fallback: {fallback_commit[:12]} from branch '{fallback_branch}'")
                                    print(f" (Original commit {commit[:12]} from proxy.golang.org is orphaned)")

                                    # Update to use the fallback
                                    actual_commit = fallback_commit
                                    VERIFY_FALLBACK_COMMITS[(vcs_url, commit)] = fallback_commit
                                    VERIFY_DETECTED_BRANCHES[(vcs_url, fallback_commit)] = fallback_branch

                                    # Verify fallback commit exists
                                    if not _commit_exists(fallback_commit):
                                        print(f" ⚠️ Fallback commit {fallback_commit[:12]} not found!")
                                        VERIFY_RESULTS[key] = False
                                        return False
                                    # Continue with fallback commit - don't fail here
                                else:
                                    print(f" ⚠️ Could not determine fallback commit")
                                    VERIFY_RESULTS[key] = False
                                    return False
                            else:
                                # Tagged version or already tried fallback - fail
                                VERIFY_RESULTS[key] = False
                                return False
                    except subprocess.TimeoutExpired:
                        print(f" ⚠️ Branch detection timeout for {actual_commit[:12]}")
                        VERIFY_RESULTS[key] = False
                        return False
                    except subprocess.CalledProcessError:
                        print(f" ⚠️ Failed to detect branch for {actual_commit[:12]}")
                        VERIFY_RESULTS[key] = False
                        return False


            # Commit exists AND is reachable - safe for BitBake nobranch=1
            # Only save to cache if not already cached (branch detection is done, just finalize)
            if not cached_verification_passed:
                # Save with timestamp in v2 format
                now = datetime.now(timezone.utc).isoformat()
                existing_entry = VERIFY_COMMIT_CACHE_V2.get(cache_key, {})

                VERIFY_COMMIT_CACHE_V2[cache_key] = {
                    "verified": True,
                    "first_verified": existing_entry.get("first_verified", now),
                    "last_checked": now,
                    "fetch_method": "fetch" # Successfully fetched from upstream
                }
                VERIFY_COMMIT_CACHE_DIRTY = True

            VERIFY_RESULTS[key] = True
            return True
        VERIFY_RESULTS[key] = False
        # Remove from both caches
        VERIFY_COMMIT_CACHE.pop(cache_key, None)
        VERIFY_COMMIT_CACHE_V2.pop(cache_key, None)
        VERIFY_COMMIT_CACHE_DIRTY = True
        return False
1234
1235
def get_actual_commit(vcs_url: str, commit: str) -> str:
    """
    Return the commit hash that was actually verified for (vcs_url, commit).

    verify_commit_accessible() may substitute a fallback commit when the
    original hash from proxy.golang.org turns out to be orphaned; that
    substitution is recorded in VERIFY_FALLBACK_COMMITS.  Call this after
    verification so downstream output emits the verified hash.

    Args:
        vcs_url: Repository URL
        commit: Original commit hash from proxy.golang.org

    Returns:
        The recorded fallback commit if one exists, otherwise the original.
    """
    lookup_key = (vcs_url, commit)
    if lookup_key in VERIFY_FALLBACK_COMMITS:
        return VERIFY_FALLBACK_COMMITS[lookup_key]
    return commit
1251
1252
def _ref_points_to_commit(vcs_url: str, ref_hint: str, commit_hash: str) -> bool:
    """
    Check whether *ref_hint* resolves to *commit_hash* in the local
    verification clone for *vcs_url*.

    Runs ``git show-ref --verify --hash`` inside the per-repository mirror,
    serialized by that repository's lock.  Comparison is case-insensitive.
    Returns False for an empty ref hint or when the ref does not exist.
    """
    if not ref_hint:
        return False

    repo_dir = _verify_repo_dir(vcs_url)
    # _verify_repo_dir guarantees the lock entry exists before returning.
    repo_lock = VERIFY_REPO_LOCKS[vcs_url]

    with repo_lock:
        git_env = os.environ.copy()
        # Never prompt for credentials; fail fast instead of hanging.
        git_env.setdefault("GIT_TERMINAL_PROMPT", "0")
        git_env.setdefault("GIT_ASKPASS", "true")

        try:
            proc = subprocess.run(
                ["git", "show-ref", "--verify", "--hash", ref_hint],
                cwd=str(repo_dir),
                check=True,
                capture_output=True,
                text=True,
                env=git_env,
            )
        except subprocess.CalledProcessError:
            # Ref doesn't exist in the mirror.
            return False

    return proc.stdout.strip().lower() == commit_hash.lower()
1279
1280
def correct_commit_hash_from_ref(vcs_url: str, vcs_hash: str, vcs_ref: str) -> Optional[str]:
    """
    Fix proxy.golang.org bad hashes by dereferencing the tag to get the correct commit.

    proxy.golang.org sometimes returns commits that:
    1. Exist in the repo but aren't branch/tag HEADs (dangling commits)
    2. Don't exist in the repo at all

    BitBake's nobranch=1 requires commits to be HEADs of branches or dereferenced tags.

    Args:
        vcs_url: Repository URL
        vcs_hash: Commit hash from proxy.golang.org (potentially bad)
        vcs_ref: Git ref like "refs/tags/v1.2.3"

    Returns:
        Corrected commit hash if different from vcs_hash, None if vcs_hash is correct or can't be corrected
    """
    if not vcs_ref or not vcs_ref.startswith("refs/"):
        return None

    expected = vcs_hash.lower()

    # Probe the annotated-tag dereference (^{}) first, then the plain ref
    # for lightweight tags; return the first hash that disagrees.
    for candidate_ref in (f"{vcs_ref}^{{}}", vcs_ref):
        resolved = git_ls_remote(vcs_url, candidate_ref)
        if resolved and resolved.lower() != expected:
            return resolved.lower()

    return None
1313
1314
def is_commit_bitbake_fetchable(vcs_url: str, vcs_hash: str, vcs_ref: str) -> bool:
    """
    Check if a commit is BitBake-fetchable (is a branch/tag HEAD).

    BitBake's nobranch=1 requires commits to be:
    - HEAD of a branch (refs/heads/*)
    - HEAD of a dereferenced tag (refs/tags/*^{})

    Uses cached git ls-remote to check if the commit appears in the remote
    repository as a ref HEAD.

    Args:
        vcs_url: Repository URL
        vcs_hash: Commit hash to check
        vcs_ref: Git ref hint like "refs/tags/v1.2.3"

    Returns:
        True if commit is a branch/tag HEAD, False if dangling/not found
    """
    # Without a usable ref hint we cannot confirm the commit is a HEAD,
    # so treat it as dangling.
    if not vcs_ref or not vcs_ref.startswith("refs/"):
        return False

    wanted = vcs_hash.lower()

    # Annotated tag (dereferenced with ^{}) first, then the plain ref
    # for lightweight tags / branches.
    for candidate_ref in (f"{vcs_ref}^{{}}", vcs_ref):
        resolved = git_ls_remote(vcs_url, candidate_ref)
        if resolved and resolved.lower() == wanted:
            return True

    # Hash doesn't match what the ref points at - it's dangling.
    return False
1347
1348
def verify_gomodcache_commits(gomodcache_path: Path, verify_jobs: int = 10) -> int:
    """
    Verify commits in GOMODCACHE .info files still exist in repositories.

    Detects force-pushed tags where proxy.golang.org has stale commit hashes.
    Offers to automatically refresh stale .info files by re-downloading.

    Args:
        gomodcache_path: Root of the GOMODCACHE tree (a str is tolerated).
        verify_jobs: Worker-thread count for parallel verification;
            values <= 0 fall back to sequential checking.

    Returns:
        0 if all commits valid or successfully refreshed
        1 if stale commits found and user declined refresh
    """
    # Force verification on for this run regardless of any --skip-verify setting.
    global VERIFY_ENABLED
    VERIFY_ENABLED = True

    # Tolerate callers passing a plain string path.
    if isinstance(gomodcache_path, str):
        gomodcache_path = Path(gomodcache_path)

    if not gomodcache_path.exists():
        print(f"❌ GOMODCACHE not found: {gomodcache_path}")
        return 1

    download_dir = gomodcache_path / "cache" / "download"
    if not download_dir.exists():
        print(f"❌ Download directory not found: {download_dir}")
        return 1

    print(f"\nScanning {download_dir} for .info files...")

    # Collect all modules with VCS info
    modules_to_check = []
    for dirpath, _, filenames in os.walk(download_dir):
        # Version metadata lives under .../<escaped module path>/@v/*.info
        path_parts = Path(dirpath).relative_to(download_dir).parts
        if not path_parts or path_parts[-1] != '@v':
            continue

        module_path = '/'.join(path_parts[:-1])
        # Undo Go's on-disk escaping of the module path.
        module_path = unescape_module_path(module_path)

        for filename in filenames:
            if not filename.endswith('.info'):
                continue

            version = filename[:-5]  # strip the ".info" suffix
            info_path = Path(dirpath) / filename

            try:
                with open(info_path) as f:
                    info = json.load(f)

                origin = info.get('Origin', {})
                vcs_url = origin.get('URL')
                vcs_hash = origin.get('Hash')
                vcs_ref = origin.get('Ref', '')

                # Only entries with a full 40-char hash are verifiable.
                if vcs_url and vcs_hash and len(vcs_hash) == 40:
                    modules_to_check.append({
                        'module_path': module_path,
                        'version': version,
                        'vcs_url': vcs_url,
                        'vcs_hash': vcs_hash,
                        'vcs_ref': vcs_ref,
                        'info_path': info_path
                    })
            except Exception as e:
                # Unreadable/corrupt .info files are reported but skipped.
                print(f"  ⚠️ Error reading {info_path}: {e}")

    print(f"Found {len(modules_to_check)} modules with VCS metadata to verify\n")

    if not modules_to_check:
        print("✅ No modules to verify")
        return 0

    # Verify commits in parallel
    stale_modules = []

    def check_module(module):
        # Returns the module dict when stale, None when the commit verifies.
        if verify_commit_accessible(module['vcs_url'], module['vcs_hash'], module['vcs_ref'], module.get('version', '')):
            return None
        else:
            return module

    if verify_jobs > 0:
        print(f"Verifying commits in parallel ({verify_jobs} workers)...")
        with ThreadPoolExecutor(max_workers=verify_jobs) as executor:
            futures = {executor.submit(check_module, m): m for m in modules_to_check}
            # Iterate in submission order; .result() blocks until each finishes.
            for future in futures:
                result = future.result()
                if result:
                    stale_modules.append(result)
    else:
        print("Verifying commits sequentially...")
        for module in modules_to_check:
            result = check_module(module)
            if result:
                stale_modules.append(result)

    if not stale_modules:
        print(f"\n✅ All {len(modules_to_check)} commits verified successfully!")
        return 0

    # Report stale modules
    print(f"\n⚠️ Found {len(stale_modules)} modules with STALE commits:\n")
    for module in stale_modules[:10]:  # Show first 10
        print(f"  {module['module_path']}@{module['version']}")
        print(f"  Commit: {module['vcs_hash'][:12]} (not found in {module['vcs_url']})")
        print(f"  File: {module['info_path']}")
        print()

    if len(stale_modules) > 10:
        print(f"  ... and {len(stale_modules) - 10} more\n")

    # Offer to auto-refresh
    print("These commits likely represent force-pushed tags.")
    print("The .info files can be refreshed by re-downloading from proxy.golang.org\n")

    response = input("Refresh stale .info files automatically? [y/N]: ").strip().lower()
    if response not in ('y', 'yes'):
        print("\nNo action taken. To fix manually:")
        print("  1. Delete stale .info files")
        print("  2. Run: go mod download")
        return 1

    # Refresh stale modules
    print("\nRefreshing stale modules...")
    refreshed = 0
    failed = []

    for module in stale_modules:
        print(f"\n  Refreshing {module['module_path']}@{module['version']}...")

        try:
            # Delete stale .info file
            module['info_path'].unlink()
            print(f"  Deleted stale .info")

            # Re-download
            # NOTE(review): runs in the current working directory and inherits
            # GOMODCACHE from the environment - presumably both point at the
            # module/cache being verified; confirm against callers.
            result = subprocess.run(
                ['go', 'mod', 'download', f"{module['module_path']}@{module['version']}"],
                capture_output=True,
                text=True,
                timeout=60
            )

            if result.returncode == 0 and module['info_path'].exists():
                # Verify new commit
                with open(module['info_path']) as f:
                    new_info = json.load(f)
                new_hash = new_info.get('Origin', {}).get('Hash', '')

                if new_hash and new_hash != module['vcs_hash']:
                    print(f"  ✓ Refreshed: {module['vcs_hash'][:12]} → {new_hash[:12]}")
                    refreshed += 1
                else:
                    # Proxy still serving the stale hash - needs manual action.
                    print(f"  ⚠️ Proxy returned same commit")
                    failed.append(module)
            else:
                print(f"  ❌ Download failed: {result.stderr[:100]}")
                failed.append(module)
        except Exception as e:
            print(f"  ❌ Error: {e}")
            failed.append(module)

    print(f"\n{'='*70}")
    print(f"Refresh complete: {refreshed} refreshed, {len(failed)} failed")

    if failed:
        print(f"\nFailed modules require manual intervention:")
        for module in failed[:5]:
            print(f"  {module['module_path']}@{module['version']}")
        return 1

    return 0
1521
1522
def is_module_actually_needed(module_path: str, source_dir: Path) -> bool:
    """
    Check if a module is actually used by running 'go mod why'.

    Conservative by design: any failure to run or interpret the tool is
    treated as "needed" so a module is never dropped by mistake.

    Returns:
        True if module is needed by the main module
        False if module is indirect-only and not actually imported
    """
    try:
        proc = subprocess.run(
            ['go', 'mod', 'why', module_path],
            cwd=str(source_dir),
            capture_output=True,
            text=True,
            timeout=30
        )
    except Exception:
        # Tool missing, timeout, etc. - assume the module is needed.
        return True

    if proc.returncode != 0:
        # If go mod why fails, assume it's needed (conservative)
        return True

    report = proc.stdout.strip()

    # Explicit "not needed" marker emitted by go.
    if "(main module does not need package" in report:
        return False

    # Empty report, or just the header line with no import chain,
    # means the module isn't in the dependency graph.
    if not report or report == f"# {module_path}":
        return False

    return True
1560
1561
def _execute(args: argparse.Namespace) -> int:
    """
    Main driver: resolve Go module metadata and generate BitBake includes.

    Phases, in order:
      1. Resolve source/output directories and cache locations.
      2. Apply cache-maintenance flags (--set-repo/--clear-repo,
         --inject-commit/--clear-commit, --clean-cache and friends).
      3. Discover modules (pre-discovered JSON, or 'go mod download').
      4. Fill gaps from go.sum / 'go list', honoring go.mod replace
         directives, monorepo submodules and sibling-version fallbacks.
      5. Generate (or merely validate) the recipe .inc files.

    Returns:
        0 on success, 1 on any fatal failure.
    """
    global CURRENT_SOURCE_DIR, CURRENT_GOMODCACHE, VERIFY_COMMIT_CACHE_DIRTY
    debug_limit = args.debug_limit

    # Phase 1: resolve the Go source tree; default to the current directory.
    if args.source_dir:
        source_dir = Path(args.source_dir).resolve()
    else:
        source_dir = Path.cwd()
    CURRENT_SOURCE_DIR = source_dir

    if not (source_dir / "go.mod").exists():
        print(f"❌ Error: go.mod not found in {source_dir}")
        return 1

    print(f"Source directory: {source_dir}")

    if args.recipedir:
        output_dir = Path(args.recipedir).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)
        print(f"Output directory: {output_dir}")
    else:
        output_dir = None
        if not args.validate and not args.dry_run:
            print("❌ Error: --recipedir is required unless running with --validate, --dry-run, or cache-maintenance flags.")
            return 1

    configure_cache_paths(args.cache_dir, args.clone_cache_dir)
    if args.cache_dir:
        print(f"Metadata cache directory: {CACHE_BASE_DIR}")
    if args.clone_cache_dir:
        print(f"Clone cache directory: {CLONE_CACHE_DIR}")

    # Set verification cache max age from command line
    global MODULE_REPO_OVERRIDES_DIRTY, VERIFY_CACHE_MAX_AGE_DAYS
    VERIFY_CACHE_MAX_AGE_DAYS = args.verify_cache_max_age
    if VERIFY_CACHE_MAX_AGE_DAYS == 0:
        print(f"Verification cache: DISABLED (always verify)")
    else:
        print(f"Verification cache max age: {VERIFY_CACHE_MAX_AGE_DAYS} days")

    # Phase 2: cache maintenance. --clear-repo removes pinned repo overrides,
    # either one specific module@version or all entries for a module path.
    if args.clear_repo:
        for (module_spec,) in args.clear_repo:
            module_path, version = _parse_override_spec(module_spec)
            removed = False
            try:
                key = _normalise_override_key(module_path, version)
            except ValueError as exc:
                print(f"Invalid module override '{module_spec}': {exc}")
                continue
            if version is not None:
                if MODULE_REPO_OVERRIDES.pop(key, None) is not None:
                    removed = True
                    MODULE_REPO_OVERRIDES_DIRTY = True
                    print(f"Cleared repo override: {module_path}@{version}")
            else:
                # No version given: clear the wildcard entry plus every
                # version-specific entry for this module path.
                wildcard_key = key
                if MODULE_REPO_OVERRIDES.pop(wildcard_key, None) is not None:
                    removed = True
                specific_keys = [
                    candidate for candidate in list(MODULE_REPO_OVERRIDES.keys())
                    if candidate[0] == module_path and candidate[1] is not None
                ]
                for candidate in specific_keys:
                    MODULE_REPO_OVERRIDES.pop(candidate, None)
                    removed = True
                if removed:
                    MODULE_REPO_OVERRIDES_DIRTY = True
                    print(f"Cleared repo overrides for: {module_path}")
            if not removed:
                if version is not None:
                    print(f"No repo override found for: {module_path}@{version}")
                else:
                    print(f"No repo overrides found for: {module_path}")

    # --set-repo pins a module (optionally module@version) to a repo URL.
    if args.set_repo:
        for module_spec, repo_url in args.set_repo:
            module_path, version = _parse_override_spec(module_spec)
            try:
                key = _normalise_override_key(module_path, version)
            except ValueError as exc:
                print(f"Invalid module override '{module_spec}': {exc}")
                continue
            MODULE_REPO_OVERRIDES[key] = repo_url
            MODULE_REPO_OVERRIDES_DIRTY = True
            label = f"{module_path}@{version}" if version else module_path
            print(f"Pinned repo override: {label} -> {repo_url}")

    # --clear-commit drops a cached verification result for (repo, commit).
    if args.clear_commit:
        for repo, commit in args.clear_commit:
            key = f"{repo}|||{commit}"
            if key in VERIFY_COMMIT_CACHE:
                VERIFY_COMMIT_CACHE.pop(key, None)
                VERIFY_COMMIT_CACHE_DIRTY = True
                print(f"\n🧹 Cleared cached verification: {repo} {commit}\n")
            else:
                print(f"No cached verification found for: {repo} {commit}")
            VERIFY_RESULTS.pop((repo, commit), None)

    # --inject-commit marks a (repo, commit) pair as verified without probing.
    if args.inject_commit:
        for repo, commit in args.inject_commit:
            key = f"{repo}|||{commit}"
            VERIFY_COMMIT_CACHE[key] = True
            VERIFY_COMMIT_CACHE_DIRTY = True
            VERIFY_RESULTS[(repo, commit)] = True
            print(f"Injected verified commit: {repo} {commit}")

    exit_code = 0

    # --clean-ls-remote-cache implies --clean-cache (see below).
    if args.clean_ls_remote_cache:
        print("\n🗑️ Cleaning git ls-remote cache...")
        if LS_REMOTE_CACHE_PATH.exists():
            LS_REMOTE_CACHE_PATH.unlink()
            print(f" Removed {LS_REMOTE_CACHE_PATH}")
        else:
            print(f" Cache file not found: {LS_REMOTE_CACHE_PATH}")
        args.clean_cache = True

    if args.clean_cache:
        print("\n🗑️ Cleaning module metadata cache...")
        if MODULE_METADATA_CACHE_PATH.exists():
            MODULE_METADATA_CACHE_PATH.unlink()
            print(f" Removed {MODULE_METADATA_CACHE_PATH}")
        else:
            print(f" Cache file not found: {MODULE_METADATA_CACHE_PATH}")
        if VERIFY_COMMIT_CACHE_PATH.exists():
            VERIFY_COMMIT_CACHE_PATH.unlink()
            print(f" Removed {VERIFY_COMMIT_CACHE_PATH}")
        VERIFY_COMMIT_CACHE.clear()
        VERIFY_COMMIT_CACHE_DIRTY = False
        print(" Note: Bootstrap from .inc files DISABLED to avoid reloading stale data.")
        skip_inc_files = True
    else:
        skip_inc_files = False

    # (Re)build the in-memory metadata caches from disk / prior .inc output.
    skip_legacy_module_cache = args.skip_legacy_module_cache
    bootstrap_metadata_cache(
        output_dir,
        skip_inc_files=skip_inc_files,
        skip_legacy_module_cache=skip_legacy_module_cache,
    )
    prune_metadata_cache()
    load_ls_remote_cache()
    load_vanity_url_cache()

    if args.dry_run:
        print("\n--dry-run requested; skipping discovery/validation")
        return 0

    # --verify-cached command to check GOMODCACHE for stale commits
    if args.verify_cached:
        print("\n" + "=" * 70)
        print("VERIFYING CACHED COMMITS IN GOMODCACHE")
        print("=" * 70)
        return verify_gomodcache_commits(args.gomodcache or source_dir / ".gomodcache", args.verify_jobs)

    # Phase 3: module discovery.
    # Check for --discovered-modules (bootstrap strategy)
    if args.discovered_modules:
        print("\n" + "=" * 70)
        print("PRE-DISCOVERED MODULES MODE")
        print("=" * 70)
        print("\nUsing pre-discovered module metadata from BitBake discovery build")
        print("Skipping discovery phase - generator will convert to BitBake format\n")

        discovered_modules_path = Path(args.discovered_modules).resolve()
        modules = load_discovered_modules(discovered_modules_path)

        if modules is None:
            print("\n❌ Failed to load discovered modules - falling back to discovery")
            modules = discover_modules(source_dir, args.gomodcache)
        else:
            print(f"\n✓ Successfully loaded {len(modules)} modules from discovery metadata")
            print(" Skipping 'go mod download' discovery phase")
            print(" Will use go.sum to resolve modules without Origin metadata")

        # Auto-correction of dangling commits happens in Phase 2 during parallel verification
    else:
        # Normal discovery path
        modules = discover_modules(source_dir, args.gomodcache)
    if debug_limit is not None and len(modules) > debug_limit:
        print(f"\n⚙️ Debug limit active: truncating discovered modules to first {debug_limit} entries")
        modules = modules[:debug_limit]

    # Set VERIFY_ENABLED based on whether verification is requested
    global VERIFY_ENABLED
    VERIFY_ENABLED = not args.skip_verify

    # Parse go.mod replace directives for fork resolution
    # Example: github.com/containerd/containerd/v2 => github.com/k3s-io/containerd/v2 v2.1.4-k3s2
    go_mod_replaces = parse_go_mod_replaces(source_dir / "go.mod")
    if go_mod_replaces:
        print(f"\n✓ Parsed {len(go_mod_replaces)} replace directives from go.mod")
        if VERBOSE_MODE:
            for old_path, (new_path, new_version) in sorted(go_mod_replaces.items())[:5]:
                print(f" {old_path} => {new_path} {new_version}")
            if len(go_mod_replaces) > 5:
                print(f" ... and {len(go_mod_replaces) - 5} more")

    # Parse go.sum for fallback resolution
    discovered_keys = {(m['module_path'], m['version']) for m in modules}
    go_sum_modules_with_source, go_sum_indirect_only = parse_go_sum(source_dir / "go.sum")

    FAILED_MODULE_PATHS.clear()
    FAILED_MODULE_ENTRIES.clear()
    SKIPPED_MODULES.clear()

    print(f"\nFound {len(go_sum_indirect_only)} indirect-only dependencies (skipping - only need .mod files)")

    if args.discovered_modules:
        # With discovered modules, only resolve what's in go.sum but missing from discovery
        # Do NOT call go list -m all - we already know what we need from the successful build
        missing_from_discovery = go_sum_modules_with_source - discovered_keys
        print(f"Discovered modules provided {len(discovered_keys)} modules with Origin metadata")
        print(f"go.sum has {len(go_sum_modules_with_source)} modules total")
        print(f"Resolving {len(missing_from_discovery)} modules without Origin metadata...")
    else:
        # Normal discovery - also use go list to find additional modules
        go_list_modules = collect_modules_via_go_list(source_dir)
        go_sum_modules_with_source |= go_list_modules
        missing_from_discovery = go_sum_modules_with_source - discovered_keys
        print(f"Resolving {len(missing_from_discovery)} additional modules discovered from go.sum/go list...")

    # Index already-resolved modules by path for sibling-version lookups.
    modules_by_path: Dict[str, List[Dict]] = {}
    for m in modules:
        modules_by_path.setdefault(m['module_path'], []).append(m)

    # Phase 4: resolve every go.sum module that discovery did not cover.
    limit_reached = False
    for module_path, version in sorted(go_sum_modules_with_source):
        if debug_limit is not None and len(modules) >= debug_limit:
            limit_reached = True
            break
        if module_path in FAILED_MODULE_PATHS:
            print(f" ⚠️ Skipping {module_path}@{version} (previous resolution failure)")
            continue

        if (module_path, version) in discovered_keys:
            continue

        # Apply replace directives for k3s forks
        # If module path is replaced in go.mod, try to resolve using the replacement path
        resolved_path = module_path
        resolved_version = version
        if module_path in go_mod_replaces:
            new_path, new_version = go_mod_replaces[module_path]
            if new_version:  # Replace has explicit version
                resolved_path = new_path
                resolved_version = new_version
                if VERBOSE_MODE:
                    print(f" [replace] {module_path}@{version} => {resolved_path}@{resolved_version}")
            # Check if we already have the replacement module
            if (resolved_path, resolved_version) in discovered_keys:
                # Copy the existing module entry with original path
                for m in modules:
                    if m['module_path'] == resolved_path and m['version'] == resolved_version:
                        replacement_entry = m.copy()
                        replacement_entry['module_path'] = module_path
                        replacement_entry['version'] = version
                        modules.append(replacement_entry)
                        discovered_keys.add((module_path, version))
                        modules_by_path.setdefault(module_path, []).append(replacement_entry)
                        print(f" ✓ {module_path}@{version} (using replace directive -> {resolved_path}@{resolved_version})")
                        continue

        fallback = resolve_module_metadata(resolved_path, resolved_version)
        if fallback:
            # If we used a replace directive, update the entry to use the original path
            if resolved_path != module_path or resolved_version != version:
                fallback['module_path'] = module_path
                fallback['version'] = version
                print(f" ✓ {module_path}@{version} (resolved via replace -> {resolved_path}@{resolved_version})")
            modules.append(fallback)
            discovered_keys.add((module_path, version))
            modules_by_path.setdefault(module_path, []).append(fallback)
            if debug_limit is not None and len(modules) >= debug_limit:
                limit_reached = True
                break
        else:
            # Handle monorepo submodule replacements (e.g., github.com/k3s-io/etcd/server/v3)
            # When a replacement points to a submodule path that doesn't have its own VCS entry,
            # try to find the base repository and use it with a subdir.
            # Example: github.com/k3s-io/etcd/server/v3 -> base: github.com/k3s-io/etcd, subdir: server/v3
            monorepo_handled = False
            if resolved_path != module_path and '/' in resolved_path:
                # Check if this looks like a submodule path (has version suffix like /v2, /v3, etc.)
                parts = resolved_path.rsplit('/', 1)
                if len(parts) == 2:
                    potential_base = parts[0]
                    potential_subdir = parts[1]

                    # Look for version-suffixed paths (e.g., /v2, /v3, /server/v3, /client/v3)
                    # Try progressively shorter base paths
                    base_candidates = []
                    path_segments = resolved_path.split('/')

                    # For github.com/k3s-io/etcd/server/v3:
                    # Try: github.com/k3s-io/etcd/server, github.com/k3s-io/etcd
                    for i in range(len(path_segments) - 1, 2, -1):  # At least keep domain + org
                        candidate_base = '/'.join(path_segments[:i])
                        candidate_subdir = '/'.join(path_segments[i:])
                        base_candidates.append((candidate_base, candidate_subdir))

                    # Try each candidate base path
                    for base_path, subdir in base_candidates:
                        if base_path in modules_by_path:
                            # Found the base repository! Create a submodule entry
                            base_module = modules_by_path[base_path][0]
                            vcs_url = base_module['vcs_url']

                            # Use the replacement version for the tag
                            tag = resolved_version.split('+')[0]
                            commit = git_ls_remote(vcs_url, f"refs/tags/{tag}") or git_ls_remote(vcs_url, tag)

                            if commit:
                                timestamp = derive_timestamp_from_version(resolved_version)
                                fallback = {
                                    "module_path": module_path,  # Original path (go.etcd.io/etcd/server/v3)
                                    "version": version,
                                    "vcs_url": vcs_url,
                                    "vcs_hash": commit,
                                    "vcs_ref": f"refs/tags/{tag}" if git_ls_remote(vcs_url, f"refs/tags/{tag}") else tag,
                                    "timestamp": timestamp,
                                    "subdir": subdir,  # e.g., "server/v3"
                                }
                                modules.append(fallback)
                                discovered_keys.add((module_path, version))
                                modules_by_path.setdefault(module_path, []).append(fallback)
                                print(f" ✓ {module_path}@{version} (monorepo submodule: base={base_path}, subdir={subdir})")
                                monorepo_handled = True
                                if debug_limit is not None and len(modules) >= debug_limit:
                                    limit_reached = True
                                break

            if monorepo_handled:
                if limit_reached:
                    break
                continue

            # Sibling-version fallback: reuse the VCS URL of another version
            # of the same module that we already resolved.
            if module_path in modules_by_path:
                reference_module = modules_by_path[module_path][0]
                vcs_url = reference_module['vcs_url']
                tag = version.split('+')[0]
                commit = None
                pseudo_info = parse_pseudo_version_tag(tag)

                if pseudo_info:
                    timestamp_str, short_commit = pseudo_info
                    commit = resolve_pseudo_version_commit(
                        vcs_url,
                        timestamp_str,
                        short_commit,
                        clone_cache_dir=CLONE_CACHE_DIR
                    )
                    if commit:
                        print(f" ✓ {module_path}@{version} (resolved pseudo-version via repository clone)")
                else:
                    commit = git_ls_remote(vcs_url, f"refs/tags/{tag}") or git_ls_remote(vcs_url, tag)
                    if commit:
                        print(f" ✓ {module_path}@{version} (resolved using VCS URL from sibling version)")

                if commit:
                    timestamp = derive_timestamp_from_version(version)
                    subdir = reference_module.get('subdir', '')
                    update_metadata_cache(module_path, version, vcs_url, commit, timestamp, subdir, '', dirty=True)
                    fallback = {
                        "module_path": module_path,
                        "version": version,
                        "vcs_url": vcs_url,
                        "vcs_hash": commit,
                        "vcs_ref": "",
                        "timestamp": timestamp,
                        "subdir": subdir,
                    }
                    modules.append(fallback)
                    discovered_keys.add((module_path, version))
                    modules_by_path[module_path].append(fallback)
                    if debug_limit is not None and len(modules) >= debug_limit:
                        limit_reached = True
                        break
                    continue

            # Skip monorepo root modules that fail resolution when we have submodules
            # Example: go.etcd.io/etcd/v3 (root) when we have github.com/k3s-io/etcd/server/v3, etc.
            # Handles both direct prefix match and forked monorepos (via VCS URL comparison)
            # These are never actually imported - they just exist in go.sum due to the monorepo go.mod
            is_monorepo_root = False

            # Check 1: Direct prefix match (same repository, e.g., go.etcd.io/etcd/v3 → go.etcd.io/etcd/server/v3)
            if any(existing_path.startswith(module_path + '/') for existing_path in modules_by_path.keys()):
                is_monorepo_root = True

            # Check 2: Forked monorepo (e.g., go.etcd.io/etcd/v3 → github.com/k3s-io/etcd/server/v3)
            # If we failed to derive a repository, try checking if any existing module's last path segment
            # matches our module's last segment (e.g., both end in /v3)
            if not is_monorepo_root and module_path.count('/') >= 2:
                module_segments = module_path.split('/')
                # For go.etcd.io/etcd/v3: domain=go.etcd.io, repo=etcd, suffix=v3
                # Check if we have modules like */etcd/*/v3 (forked versions)
                for existing_path in modules_by_path.keys():
                    if '/' in existing_path:
                        # Check if the existing path is a submodule of a similar repository
                        # Example: github.com/k3s-io/etcd/server/v3 shares repository 'etcd' with go.etcd.io/etcd/v3
                        if '/etcd/' in existing_path and module_path.endswith('/v3'):
                            is_monorepo_root = True
                            break

            if is_monorepo_root:
                print(f" ⊙ {module_path}@{version} (monorepo root - submodules already resolved)")
                continue

            # All fallbacks exhausted: record the failure (aborts the run later).
            if module_path in modules_by_path:
                FAILED_MODULE_PATHS.add(module_path)
                FAILED_MODULE_ENTRIES.add((module_path, version))
                print(f" ⚠️ Skipping {module_path}@{version} (indirect-only dependency)")
        if limit_reached:
            break

    if limit_reached:
        print(f"\n⚠️ Debug limit {debug_limit} reached; skipping remaining modules discovered from go.sum/go list.")

    # Resolve /go.mod-only (indirect) dependencies using sibling versions
    # Even though these are "indirect", Go may still need them during compilation
    # (e.g., due to complex replace directives or transitive dependencies).
    # If we have a sibling version with Origin metadata, resolve the indirect version too.
    print(f"\n⚙️ Resolving /go.mod-only dependencies from sibling versions...")
    gomod_only_resolved = 0
    gomod_only_skipped = 0
    for module_path, version in sorted(go_sum_indirect_only):
        try:
            if (module_path, version) in discovered_keys:
                continue  # Already have this version

            if module_path in modules_by_path:
                # We have a sibling version - try to resolve this one using the sibling's VCS URL
                reference_module = modules_by_path[module_path][0]
                vcs_url = reference_module['vcs_url']
                tag = version.split('+')[0]
                commit = None
                pseudo_info = parse_pseudo_version_tag(tag)

                if pseudo_info:
                    timestamp_str, short_commit = pseudo_info
                    try:
                        commit = resolve_pseudo_version_commit(
                            vcs_url,
                            timestamp_str,
                            short_commit,
                            clone_cache_dir=CLONE_CACHE_DIR
                        )
                    except Exception as e:
                        print(f" ❌ Error resolving pseudo-version {module_path}@{version} (timestamp={timestamp_str}, commit={short_commit}): {e}")
                        gomod_only_skipped += 1
                        continue
                else:
                    # For semantic version tags, try to find the tag reference
                    # This enables to detect orphaned tags for sibling-resolved modules
                    vcs_ref = ""
                    commit = git_ls_remote(vcs_url, f"refs/tags/{tag}")
                    if commit:
                        vcs_ref = f"refs/tags/{tag}"
                    else:
                        commit = git_ls_remote(vcs_url, tag)

                # NOTE(review): on the pseudo-version branch above, vcs_ref is
                # not assigned here - it appears to reuse the value from a
                # previous loop iteration (or raise NameError on the first,
                # caught by the outer except) - confirm intended behavior.
                if commit:
                    timestamp = derive_timestamp_from_version(version)
                    subdir = reference_module.get('subdir', '')
                    update_metadata_cache(module_path, version, vcs_url, commit, timestamp, subdir, '', dirty=True)
                    fallback = {
                        "module_path": module_path,
                        "version": version,
                        "vcs_url": vcs_url,
                        "vcs_hash": commit,
                        "vcs_ref": vcs_ref,
                        "timestamp": timestamp,
                        "subdir": subdir,
                    }
                    modules.append(fallback)
                    discovered_keys.add((module_path, version))
                    modules_by_path[module_path].append(fallback)
                    gomod_only_resolved += 1
                    print(f" ✓ {module_path}@{version} (/go.mod-only resolved using sibling version)")
                else:
                    gomod_only_skipped += 1
            else:
                gomod_only_skipped += 1
        except Exception as e:
            print(f" ❌ Error resolving {module_path}@{version}: {e}")
            gomod_only_skipped += 1

    if gomod_only_resolved > 0:
        print(f"✓ Resolved {gomod_only_resolved} /go.mod-only dependencies using sibling versions")
    if gomod_only_skipped > 0:
        print(f" ⚠️ Skipped {gomod_only_skipped} /go.mod-only dependencies (no sibling version available)")

    # Any recorded failure is fatal: emitting a guessed SRCREV would break builds.
    if FAILED_MODULE_ENTRIES:
        print("\n❌ Failed to resolve metadata for the following modules:")
        for mod, ver in sorted(FAILED_MODULE_ENTRIES):
            print(f" - {mod}@{ver}")
        print("Aborting to avoid emitting invalid SRCREVs.")
        return 1

    if not modules:
        print("❌ No modules discovered")
        return 1

    # Phase 5: emit (or validate) the recipe include files.
    success = generate_recipe(
        modules,
        source_dir,
        output_dir,
        args.git_repo or "unknown",
        args.git_ref or "unknown",
        validate_only=args.validate,
        debug_limit=debug_limit,
        skip_verify=args.skip_verify,
        verify_jobs=args.verify_jobs,
    )

    if success:
        if args.validate:
            print("\n" + "=" * 70)
            print("✅ SUCCESS - Validation complete")
            print("=" * 70)
        else:
            print("\n" + "=" * 70)
            print("✅ SUCCESS - Recipe generation complete")
            print("=" * 70)

        # Write corrected modules back to JSON for future runs
        if args.discovered_modules and VERIFY_CORRECTIONS_APPLIED:
            corrected_json = args.discovered_modules.replace('.json', '-corrected.json')
            try:
                with open(corrected_json, 'w') as f:
                    json.dump(modules, f, indent=2)
                print(f"\n✓ Wrote corrected module metadata to: {corrected_json}")
                print(f" Use this file for future runs to avoid re-detecting orphaned commits")
            except Exception as e:
                print(f"\n⚠️ Could not write corrected JSON: {e}")

        exit_code = 0
    else:
        print("\n❌ FAILED - Recipe generation failed")
        exit_code = 1

    if SKIPPED_MODULES:
        print("\n⚠️ Skipped modules (no repository metadata)")
        for (module_path, version), reason in sorted(SKIPPED_MODULES.items()):
            print(f" - {module_path}@{version} [{reason}]")
        print(" Use --set-repo / --inject-commit to add missing metadata before building.")

    return exit_code
2110
2111
def parse_go_mod_replaces(go_mod_path: Path) -> Dict[str, Tuple[str, str]]:
    """
    Parse replace directives from go.mod file.

    Handles both the single-line form ("replace old => new v1.2.3") and
    entries inside a "replace ( ... )" block (bare lines containing "=>").
    Trailing "//" comments are stripped before parsing, and fully
    commented-out lines are ignored.  (The previous revision mis-parsed
    "// replace a => b v1" into a bogus "//" key and leaked trailing
    comments into the version field.)

    Returns:
        Dict mapping old_path to (new_path, new_version)
        Example: {"github.com/containerd/containerd/v2": ("github.com/k3s-io/containerd/v2", "v2.1.4-k3s2")}
    """
    replaces: Dict[str, Tuple[str, str]] = {}
    if not go_mod_path.exists():
        return replaces

    try:
        content = go_mod_path.read_text()
        # Match: old_path => new_path version
        # Example: github.com/containerd/containerd/v2 => github.com/k3s-io/containerd/v2 v2.1.4-k3s2
        for line in content.splitlines():
            # FIX: drop trailing comments so "a => b v1 // note" parses
            # cleanly and commented-out directives are skipped entirely.
            # (Go module paths never contain "//", so this is safe.)
            line = line.split('//', 1)[0].strip()
            if not line:
                continue
            if not line.startswith('replace ') and '=>' not in line:
                continue

            # Remove 'replace ' prefix if present (single-line form)
            if line.startswith('replace '):
                line = line[8:].strip()

            parts = line.split('=>')
            if len(parts) != 2:
                continue

            left = parts[0].strip().split()
            right = parts[1].strip().split()

            if len(left) == 0 or len(right) == 0:
                continue

            old_path = left[0]
            new_path = right[0]
            # Filesystem replaces ("=> ../local") carry no version.
            new_version = right[1] if len(right) > 1 else ""

            replaces[old_path] = (new_path, new_version)
    except Exception as e:
        print(f"⚠️ Failed to parse go.mod replaces: {e}", file=sys.stderr)

    return replaces
2156
2157
def parse_pseudo_version_tag(tag: str) -> Optional[Tuple[str, str]]:
    """Return (timestamp, short_commit) for Go pseudo-versions."""
    # Drop any build metadata (+incompatible etc.) before splitting.
    base, _, _ = tag.partition('+')
    pieces = base.split('-')
    if len(pieces) < 3:
        return None

    candidate_commit = pieces[-1]
    # The timestamp segment may carry a pre-release prefix ("0.<stamp>").
    stamp = pieces[-2].rsplit('.', 1)[-1]

    # A pseudo-version timestamp is exactly 14 digits: YYYYMMDDHHmmss.
    if not (len(stamp) == 14 and stamp.isdigit()):
        return None

    # The trailing segment must look like a (short) git hash.
    if re.fullmatch(r'[0-9a-fA-F]{6,40}', candidate_commit) is None:
        return None

    return stamp, candidate_commit
2176
2177
2178def _cache_key(url: str, ref: str) -> str:
2179 return f"{url}|||{ref}"
2180
2181
def load_ls_remote_cache() -> None:
    """Populate the in-memory ls-remote cache from its JSON file, if present.

    Malformed files and malformed keys (missing the "|||" separator) are
    ignored silently — the cache is purely an optimisation.
    """
    if not LS_REMOTE_CACHE_PATH.exists():
        return
    try:
        raw = json.loads(LS_REMOTE_CACHE_PATH.read_text())
    except Exception:
        return
    for flat_key, commit in raw.items():
        pieces = flat_key.split("|||", 1)
        if len(pieces) != 2:
            continue
        LS_REMOTE_CACHE[(pieces[0], pieces[1])] = commit
2195
2196
def save_ls_remote_cache() -> None:
    """Write the in-memory ls-remote cache to disk, but only if it changed.

    Write failures are swallowed — losing the cache only costs extra
    network round-trips on the next run.
    """
    if not LS_REMOTE_CACHE_DIRTY:
        return
    try:
        serialised = {}
        for (url, ref), commit in LS_REMOTE_CACHE.items():
            serialised[_cache_key(url, ref)] = commit
        LS_REMOTE_CACHE_PATH.write_text(json.dumps(serialised, indent=2, sort_keys=True))
    except Exception:
        pass
2208
2209
def git_ls_remote(url: str, ref: str, *, debug: bool = False) -> Optional[str]:
    """
    Query git repository for commit hash of a ref.
    Uses disk-based cache and local clones to minimize network calls.

    For tag refs, annotated tags are peeled to the underlying commit on both
    paths: the network path queries the ls-remote ^{} dereference, and the
    local-clone path now peels via `git rev-parse ^{commit}` (previously it
    returned the raw `git show-ref` hash, which for annotated tags is the
    tag *object*, not the commit — inconsistent with the network path).

    Args:
        url: Git repository URL
        ref: Git ref (tag, branch, commit, etc.)
        debug: If True, print whether result came from cache or network

    Returns:
        Commit hash or None if not found
    """
    global LS_REMOTE_CACHE_DIRTY
    key = (url, ref)

    # Check in-memory cache first
    if key in LS_REMOTE_CACHE:
        if debug or VERBOSE_MODE:
            result = LS_REMOTE_CACHE[key]
            status = "cached" if result else "cached (not found)"
            print(f" [ls-remote {status}] {url} {ref}", file=sys.stderr)
        return LS_REMOTE_CACHE[key]

    # Try local repository clone if available
    repo_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
    local_repo = CLONE_CACHE_DIR / f"repo_{repo_hash}"

    if local_repo.exists() and (local_repo / 'HEAD').exists():
        try:
            # Query local repository instead of network
            result = subprocess.run(
                ["git", "show-ref", "--hash", ref],
                cwd=local_repo,
                capture_output=True,
                text=True,
                timeout=10,
            )
            if result.returncode == 0 and result.stdout.strip():
                commit_hash = result.stdout.strip().split()[0]
                # FIX: for annotated tags show-ref reports the tag object
                # hash.  Peel it to the commit so the local path agrees
                # with the network path's ^{} dereference below.  On any
                # failure, fall back to the show-ref hash unchanged.
                peeled = subprocess.run(
                    ["git", "rev-parse", "--verify", f"{commit_hash}^{{commit}}"],
                    cwd=local_repo,
                    capture_output=True,
                    text=True,
                    timeout=10,
                )
                if peeled.returncode == 0 and peeled.stdout.strip():
                    commit_hash = peeled.stdout.strip()
                LS_REMOTE_CACHE[key] = commit_hash
                LS_REMOTE_CACHE_DIRTY = True
                if debug or VERBOSE_MODE:
                    print(f" [ls-remote local] {url} {ref} -> {commit_hash[:12]}", file=sys.stderr)
                return commit_hash
        except Exception:
            # Any local-clone failure falls through to the network query.
            pass

    if debug or VERBOSE_MODE:
        print(f" [ls-remote network] {url} {ref}", file=sys.stderr)

    try:
        # Never let git prompt for credentials in a batch run.
        env = os.environ.copy()
        env.setdefault("GIT_TERMINAL_PROMPT", "0")
        env.setdefault("GIT_ASKPASS", "true")

        # For tags, also query the dereferenced commit (^{}) to handle
        # annotated tags: their tag-object hash differs from the commit
        # hash, and git archive/checkout need the actual commit.
        refs_to_query = [ref]
        if ref.startswith("refs/tags/"):
            refs_to_query.append(f"{ref}^{{}}")  # Add dereferenced query

        result = subprocess.run(
            ["git", "ls-remote", url] + refs_to_query,
            capture_output=True,
            text=True,
            check=True,
            env=env,
            timeout=GIT_CMD_TIMEOUT,
        )

        # Parse results - prefer dereferenced commit (^{}) over annotated tag object
        tag_object_hash = None
        dereferenced_hash = None

        for line in result.stdout.strip().splitlines():
            if not line:
                continue
            parts = line.split()
            if len(parts) >= 2:
                hash_val, ref_name = parts[0], parts[1]
                if ref_name.endswith("^{}"):
                    # This is the dereferenced commit - preferred!
                    dereferenced_hash = hash_val
                else:
                    # This is either a lightweight tag or annotated tag object
                    tag_object_hash = hash_val

        # Prefer dereferenced commit, fall back to tag object (for lightweight tags)
        commit_hash = dereferenced_hash or tag_object_hash
        if commit_hash:
            LS_REMOTE_CACHE[key] = commit_hash
            LS_REMOTE_CACHE_DIRTY = True
            return commit_hash

    except subprocess.TimeoutExpired:
        print(f" ⚠️ git ls-remote timeout ({GIT_CMD_TIMEOUT}s) for {url} {ref}")
        LS_REMOTE_CACHE[key] = None
        LS_REMOTE_CACHE_DIRTY = True
        return None
    except subprocess.CalledProcessError:
        LS_REMOTE_CACHE[key] = None
        LS_REMOTE_CACHE_DIRTY = True
        return None
    return None
2317
2318
def load_vanity_url_cache() -> None:
    """Load vanity URL resolution cache from disk; ignore a missing or corrupt file."""
    if not VANITY_URL_CACHE_PATH.exists():
        return
    try:
        VANITY_URL_CACHE.update(json.loads(VANITY_URL_CACHE_PATH.read_text()))
    except Exception:
        pass
2328
2329
def save_vanity_url_cache() -> None:
    """Persist the vanity URL resolution cache to disk if it was modified."""
    if not VANITY_URL_CACHE_DIRTY:
        return
    try:
        payload = json.dumps(VANITY_URL_CACHE, indent=2, sort_keys=True)
        VANITY_URL_CACHE_PATH.write_text(payload)
    except Exception:
        # Best effort: the cache is only an optimisation.
        pass
2338
2339
def load_verify_commit_cache() -> None:
    """
    Load verification cache with timestamp support for aging detection.

    Cache format v2:
        {
          "repo|||commit": {
            "verified": true,
            "first_verified": "2025-01-15T10:30:00Z",  # When first verified
            "last_checked": "2025-02-10T14:20:00Z",    # When last re-verified
            "fetch_method": "fetch"                    # "fetch", "ref", or "cached"
          }
        }

    Legacy v1 entries (bare booleans) are migrated in-memory to v2.  In that
    case the dirty flag is left set so the next save rewrites the file in
    the new format.  (FIX: the previous revision unconditionally cleared
    VERIFY_COMMIT_CACHE_DIRTY at the end, clobbering the flag set during
    migration and silently discarding the format upgrade.)
    """
    global VERIFY_COMMIT_CACHE_DIRTY, VERIFY_COMMIT_CACHE_V2
    if not VERIFY_COMMIT_CACHE_PATH.exists():
        return
    try:
        data = json.loads(VERIFY_COMMIT_CACHE_PATH.read_text())
    except Exception:
        return

    migrated = False
    if isinstance(data, dict):
        # Detect format: v1 (bool values) vs v2 (dict values)
        sample_value = next(iter(data.values())) if data else None

        if isinstance(sample_value, bool):
            # Legacy format: convert to v2
            from datetime import datetime, timezone
            now = datetime.now(timezone.utc).isoformat()
            for k, v in data.items():
                if v:  # Only migrate verified=True entries
                    VERIFY_COMMIT_CACHE_V2[k] = {
                        "verified": True,
                        "first_verified": now,
                        "last_checked": now,
                        "fetch_method": "cached"  # Unknown how it was verified
                    }
            migrated = True  # Must be saved back in the new format
        elif isinstance(sample_value, dict):
            # V2 format
            VERIFY_COMMIT_CACHE_V2.update(data)

    # Freshly loaded state matches disk unless we just migrated v1 -> v2.
    VERIFY_COMMIT_CACHE_DIRTY = migrated
2384
2385
def save_verify_commit_cache(force: bool = False) -> None:
    """Save verification cache in v2 format with timestamps.

    Args:
        force: If True, save even if not dirty (for incremental saves during long runs)
    """
    global VERIFY_COMMIT_CACHE_DIRTY

    if not (force or VERIFY_COMMIT_CACHE_DIRTY):
        return
    try:
        serialised = json.dumps(VERIFY_COMMIT_CACHE_V2, indent=2, sort_keys=True)
        VERIFY_COMMIT_CACHE_PATH.write_text(serialised)
        # Disk now matches memory.
        VERIFY_COMMIT_CACHE_DIRTY = False
    except Exception as e:
        print(f"⚠️ Failed to save verification cache: {e}")
2402
2403
def _load_overrides_from_file(path: Path, target_dict: Dict[Tuple[str, Optional[str]], str]) -> None:
    """
    Load module->repo overrides from a JSON file into the target dictionary.

    File format:
        {
          "module/path": "https://github.com/org/repo",
          "module/path@v1.2.3": "https://github.com/org/repo"
        }

    The @version suffix is optional. Use it to override only a specific version.
    """
    if not path.exists():
        return
    try:
        raw = json.loads(path.read_text())
    except Exception:
        return
    if not isinstance(raw, dict):
        return

    for raw_key, repo_url in raw.items():
        if not isinstance(repo_url, str):
            continue
        module_path = str(raw_key)
        version: Optional[str] = None

        # Two key spellings are accepted: "module|||version" (legacy) and
        # "module@version" (new).  A key starting with "@" is a scoped
        # package name, not a version suffix.
        if "|||" in module_path:
            module_path, _, version_part = module_path.partition("|||")
            version = None if version_part == "*" else version_part
        elif "@" in module_path and not module_path.startswith("@"):
            module_path, _, version = module_path.rpartition("@")

        try:
            key = _normalise_override_key(module_path, version)
        except ValueError:
            continue
        target_dict[key] = repo_url
2447
2448
def load_manual_overrides() -> None:
    """Load git-tracked manual overrides from manual-overrides.json."""
    global MANUAL_OVERRIDES
    # Rebuild from scratch so stale entries do not survive a reload.
    MANUAL_OVERRIDES.clear()
    _load_overrides_from_file(MANUAL_OVERRIDES_PATH, MANUAL_OVERRIDES)
    if not MANUAL_OVERRIDES:
        return
    print(f" Loaded {len(MANUAL_OVERRIDES)} manual repository override(s)")
2456
2457
def load_repo_overrides() -> None:
    """Load dynamic overrides from repo-overrides.json (created via --set-repo)."""
    global MODULE_REPO_OVERRIDES_DIRTY
    # Rebuild from scratch; freshly loaded state matches disk, so nothing
    # is pending a save.
    MODULE_REPO_OVERRIDES.clear()
    _load_overrides_from_file(MODULE_REPO_OVERRIDES_PATH, MODULE_REPO_OVERRIDES)
    MODULE_REPO_OVERRIDES_DIRTY = False
2464
2465
def save_repo_overrides() -> None:
    """Persist --set-repo overrides to repo-overrides.json if modified."""
    if not MODULE_REPO_OVERRIDES_DIRTY:
        return
    try:
        # Flatten (module, version) keys back to the "module|||version"
        # (or bare "module") on-disk spelling.
        flat = {
            (mod if ver is None else f"{mod}|||{ver}"): repo
            for (mod, ver), repo in MODULE_REPO_OVERRIDES.items()
        }
        MODULE_REPO_OVERRIDES_PATH.write_text(json.dumps(flat, indent=2, sort_keys=True))
    except Exception:
        pass
2477
2478
def query_vanity_url(module_path: str) -> Optional[str]:
    """
    Query vanity URL metadata using ?go-get=1 to resolve actual VCS repository.

    Go uses vanity URLs to provide custom import paths that redirect to actual
    repositories. When you request https://example.com/module?go-get=1, the server
    returns HTML with a meta tag like:
        <meta name="go-import" content="example.com/module git https://github.com/org/repo">

    This function queries that metadata and caches the result for future use.
    Both positive and negative results are cached, so a failed lookup does not
    re-issue the HTTP request on subsequent calls.

    Args:
        module_path: Go module path (e.g., "go.uber.org/atomic")

    Returns:
        VCS repository URL if found, None otherwise
    """
    global VANITY_URL_CACHE_DIRTY

    # Check cache first
    if module_path in VANITY_URL_CACHE:
        return VANITY_URL_CACHE[module_path]

    # Query the ?go-get=1 metadata
    url = f"https://{module_path}?go-get=1"

    try:
        import urllib.request
        import html.parser

        class GoImportParser(html.parser.HTMLParser):
            # Collects the go-import meta tag whose prefix covers the target
            # module, keeping the longest (most specific) matching prefix.
            def __init__(self, target_module: str):
                super().__init__()
                self.target_module = target_module
                self.repo_url = None
                self.best_prefix_len = 0  # Track longest matching prefix

            def handle_starttag(self, tag, attrs):
                if tag == 'meta':
                    attrs_dict = dict(attrs)
                    if attrs_dict.get('name') == 'go-import':
                        content = attrs_dict.get('content', '')
                        # Format: "module_prefix vcs repo_url"
                        parts = content.split()
                        if len(parts) >= 3:
                            prefix = parts[0]
                            # parts[1] = vcs type (git, hg, svn, bzr)
                            repo_url = parts[2]
                            # Per Go spec: match the go-import whose prefix matches our module
                            # The module path must equal the prefix or have it as a path prefix
                            if self.target_module == prefix or self.target_module.startswith(prefix + '/'):
                                # Prefer longer (more specific) prefix matches
                                if len(prefix) > self.best_prefix_len:
                                    self.best_prefix_len = len(prefix)
                                    self.repo_url = repo_url

        # Fetch the page with a timeout
        req = urllib.request.Request(url, headers={'User-Agent': 'oe-go-mod-fetcher/3.0'})
        with urllib.request.urlopen(req, timeout=10) as response:
            html_content = response.read().decode('utf-8', errors='ignore')

        # Parse the HTML to find matching go-import meta tag
        parser = GoImportParser(module_path)
        parser.feed(html_content)

        # Cache the result (even if None)
        VANITY_URL_CACHE[module_path] = parser.repo_url
        VANITY_URL_CACHE_DIRTY = True

        return parser.repo_url

    except Exception as e:
        # Cache negative result to avoid repeated failures
        # (network errors, non-HTTPS hosts, missing go-import tags, ...)
        VANITY_URL_CACHE[module_path] = None
        VANITY_URL_CACHE_DIRTY = True
        return None
2555
2556
def get_github_mirror_url(vcs_url: str) -> Optional[str]:
    """
    Get GitHub mirror URL for golang.org/x repositories.

    golang.org/x repositories are mirrored on GitHub at github.com/golang/*.
    These mirrors are often more reliable than go.googlesource.com.

    Args:
        vcs_url: Original VCS URL (e.g., https://go.googlesource.com/tools)

    Returns:
        GitHub mirror URL if applicable, None otherwise
    """
    if 'go.googlesource.com' not in vcs_url:
        return None
    # Last path component is the package name:
    # https://go.googlesource.com/tools -> tools
    package = vcs_url.rstrip('/').rsplit('/', 1)[-1]
    return f"https://github.com/golang/{package}"
2576
2577
def resolve_pseudo_version_commit(vcs_url: str, timestamp_str: str, short_commit: str,
                                  clone_cache_dir: Optional[Path] = None) -> Optional[str]:
    """
    Resolve a pseudo-version's short commit hash to a full 40-character hash.

    This function clones (or updates) a git repository and searches the commit
    history in a ±1 day window around the pseudo-version timestamp for a commit
    whose full hash starts with the pseudo-version's short hash.

    For golang.org/x repositories, automatically tries GitHub mirrors if the
    primary source fails (go.googlesource.com can be slow or unreliable).

    Fixes over the previous revision: removed the unreachable ``dt.year == 1``
    branch (already excluded by the ``dt.year < 1970`` check), narrowed the
    bare ``except:`` in cleanup to ``except Exception:``, and dropped the
    duplicate temp-dir removal in the except path (the ``finally`` block
    already handles it).

    Args:
        vcs_url: Git repository URL
        timestamp_str: Timestamp from pseudo-version (format: YYYYMMDDHHmmss)
        short_commit: Short commit hash (12 characters) from pseudo-version
        clone_cache_dir: Optional directory to cache cloned repositories (recommended)

    Returns:
        Full 40-character commit hash, or None if not found
    """
    # Parse and sanity-check the timestamp before doing date arithmetic.
    try:
        dt = datetime.strptime(timestamp_str, "%Y%m%d%H%M%S")
        # Go pseudo-versions should be recent; reject anything outside the
        # epoch..9999 range before we do any timedelta arithmetic.
        if dt.year < 1970 or dt.year > 9999:
            print(f"⚠️ Invalid timestamp year {dt.year} in pseudo-version (timestamp: {timestamp_str})", file=sys.stderr)
            return None
        # Search window: ±1 day around timestamp for efficiency
        try:
            since = (dt - timedelta(days=1)).isoformat()
            until = (dt + timedelta(days=1)).isoformat()
        except OverflowError as e:
            print(f"⚠️ Date arithmetic overflow for timestamp {timestamp_str}: {e}", file=sys.stderr)
            return None
    except ValueError as e:
        print(f"⚠️ Invalid timestamp format {timestamp_str}: {e}", file=sys.stderr)
        return None

    # Try primary URL and GitHub mirror (if applicable)
    urls_to_try = [vcs_url]
    github_mirror = get_github_mirror_url(vcs_url)
    if github_mirror:
        urls_to_try.append(github_mirror)

    # Never let git prompt for credentials in a batch run.
    git_env = os.environ.copy()
    git_env.setdefault("GIT_TERMINAL_PROMPT", "0")
    git_env.setdefault("GIT_ASKPASS", "true")

    for try_url in urls_to_try:
        # Determine clone directory based on URL being tried
        if clone_cache_dir:
            clone_cache_dir.mkdir(parents=True, exist_ok=True)
            repo_hash = hashlib.sha256(try_url.encode()).hexdigest()[:16]
            clone_dir = clone_cache_dir / f"repo_{repo_hash}"
        else:
            clone_dir = Path(tempfile.mkdtemp(prefix="pseudo-resolve-"))

        try:
            # Clone or update repository
            if clone_dir.exists() and (clone_dir / 'HEAD').exists():
                # Repository already cloned, fetch latest
                try:
                    subprocess.run(
                        ['git', 'fetch', '--all', '--quiet'],
                        cwd=clone_dir,
                        capture_output=True,
                        check=True,
                        timeout=60,
                        env=git_env,
                    )
                except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
                    # Fetch failed, try to use existing clone anyway
                    pass
            else:
                # Clone repository (bare clone for efficiency)
                if clone_dir.exists():
                    shutil.rmtree(clone_dir)
                clone_dir.mkdir(parents=True, exist_ok=True)

                subprocess.run(
                    ['git', 'clone', '--bare', '--quiet', try_url, str(clone_dir)],
                    capture_output=True,
                    check=True,
                    timeout=300,  # 5 minute timeout
                    env=git_env,
                )

            # Search for commits matching timestamp and short hash
            result = subprocess.run(
                ['git', 'log', '--all', '--format=%H %ct',
                 f'--since={since}', f'--until={until}'],
                cwd=clone_dir,
                capture_output=True,
                text=True,
                check=True,
                timeout=30,
                env=git_env,
            )

            # Find commit with matching short hash prefix
            for line in result.stdout.strip().splitlines():
                if not line:
                    continue
                parts = line.split()
                if len(parts) < 2:
                    continue
                full_hash = parts[0]
                if full_hash.startswith(short_commit):
                    return full_hash

            # Commit not found in this repository, try next URL
            continue

        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            # Clone/fetch failed, try next URL if available
            continue
        finally:
            # Clean up temp directory if we created one
            if not clone_cache_dir and clone_dir.exists():
                try:
                    shutil.rmtree(clone_dir)
                except Exception:
                    pass

    # All URLs failed
    return None
2712
2713
def derive_timestamp_from_version(version: str) -> str:
    """Derive an ISO-8601 UTC timestamp from a pseudo-version; epoch otherwise."""
    epoch = "1970-01-01T00:00:00Z"
    parsed = parse_pseudo_version_tag(version)
    if not parsed:
        return epoch
    stamp, _commit = parsed
    try:
        return datetime.strptime(stamp, "%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        return epoch
2723
2724
2725def _cache_metadata_key(module_path: str, version: str) -> Tuple[str, str]:
2726 return (module_path, version)
2727
2728
def load_metadata_cache_file() -> None:
    """Load the persistent module metadata cache from its JSON file.

    Keys are flat "module|||version" strings; values are dicts with the
    vcs_url/commit/timestamp/subdir/ref fields (missing ones default to '').
    """
    if not MODULE_METADATA_CACHE_PATH.exists():
        return
    try:
        raw = json.loads(MODULE_METADATA_CACHE_PATH.read_text())
    except Exception:
        return
    for flat_key, entry in raw.items():
        if "|||" not in flat_key:
            continue
        if not isinstance(entry, dict):
            continue
        module_path, _, version = flat_key.partition("|||")
        MODULE_METADATA_CACHE[_cache_metadata_key(module_path, version)] = {
            field: entry.get(field, '')
            for field in ('vcs_url', 'commit', 'timestamp', 'subdir', 'ref')
        }
2750
2751
def save_metadata_cache() -> None:
    """Persist the module metadata cache to disk if it was modified."""
    if not MODULE_METADATA_CACHE_DIRTY:
        return
    flat = {}
    for (module, version), value in MODULE_METADATA_CACHE.items():
        flat[f"{module}|||{version}"] = value
    try:
        MODULE_METADATA_CACHE_PATH.write_text(json.dumps(flat, indent=2, sort_keys=True))
    except Exception:
        pass
2763
2764
def update_metadata_cache(module_path: str, version: str, vcs_url: str, commit: str,
                          timestamp: str = "", subdir: str = "", ref: str = "",
                          dirty: bool = True) -> None:
    """Store resolved metadata for a module version.

    The global dirty flag is raised only when the stored entry actually
    changes (and dirty=True), so repeated identical updates never force a
    rewrite of the cache file.
    """
    global MODULE_METADATA_CACHE_DIRTY
    key = _cache_metadata_key(module_path, version)
    new_entry = {
        'vcs_url': vcs_url or '',
        'commit': commit or '',
        'timestamp': timestamp or '',
        'subdir': subdir or '',
        'ref': ref or '',
    }
    if MODULE_METADATA_CACHE.get(key) == new_entry:
        return
    MODULE_METADATA_CACHE[key] = new_entry
    if dirty:
        MODULE_METADATA_CACHE_DIRTY = True
2781
2782
def get_cached_metadata(module_path: str, version: str) -> Optional[dict]:
    """Return a module-record dict from the metadata cache, or None if absent."""
    entry = MODULE_METADATA_CACHE.get(_cache_metadata_key(module_path, version))
    if not entry:
        return None
    # Fall back to the pseudo-version-derived timestamp when none was stored.
    when = entry.get('timestamp') or derive_timestamp_from_version(version)
    return {
        "module_path": module_path,
        "version": version,
        "vcs_url": entry.get('vcs_url', ''),
        "vcs_hash": entry.get('commit', ''),
        "vcs_ref": entry.get('ref', ''),
        "timestamp": when,
        "subdir": entry.get('subdir', ''),
    }
2797
2798
def load_metadata_from_inc(output_dir: Path) -> None:
    """
    Seed the metadata cache from previously generated .inc files.

    go-mod-git.inc maps each git SRC_URI's destsuffix hash to its repository
    URL; go-mod-cache.inc carries per-module JSON records keyed by that same
    hash.  Joining the two recovers (module, version) -> (url, commit, ...)
    without any network access.  Entries are inserted with dirty=False since
    they came straight from disk.
    """
    git_inc = output_dir / "go-mod-git.inc"
    cache_inc = output_dir / "go-mod-cache.inc"

    # Pass 1: build destsuffix-hash -> repository URL from the SRC_URI lines.
    sha_to_url: Dict[str, str] = {}
    if git_inc.exists():
        for line in git_inc.read_text().splitlines():
            line = line.strip()
            if not line.startswith('SRC_URI'):
                continue
            if '"' not in line:
                continue
            # Extract the quoted URI and split its ;key=value parameters.
            content = line.split('"', 1)[1].rsplit('"', 1)[0]
            parts = [p for p in content.split(';') if p]
            if not parts:
                continue
            url_part = parts[0]
            dest_sha = None
            for part in parts[1:]:
                if part.startswith('destsuffix='):
                    dest = part.split('=', 1)[1]
                    # The last path component of the destsuffix is the hash.
                    dest_sha = dest.rsplit('/', 1)[-1]
                    break
            if not dest_sha:
                continue
            # Normalise the bitbake git:// scheme back to https://.
            if url_part.startswith('git://'):
                url_https = 'https://' + url_part[6:]
            else:
                url_https = url_part
            sha_to_url[dest_sha] = url_https

    # Pass 2: join the embedded JSON module records against the URL map.
    if cache_inc.exists():
        text = cache_inc.read_text()
        marker = "GO_MODULE_CACHE_DATA = '"
        if marker in text:
            start = text.index(marker) + len(marker)
            try:
                # The JSON blob ends at the closing quote before a blank line;
                # fall back to end-of-file if that delimiter is missing.
                end = text.index("'\n\n", start)
            except ValueError:
                end = len(text)
            try:
                data = json.loads(text[start:end])
            except Exception:
                data = []
            for entry in data:
                module_path = entry.get('module')
                version = entry.get('version')
                sha = entry.get('vcs_hash')
                commit = entry.get('commit')
                timestamp = entry.get('timestamp', '')
                subdir = entry.get('subdir', '')
                ref = entry.get('vcs_ref', '')
                if not module_path or not version:
                    continue
                # Only keep records whose hash maps back to a known URL.
                vcs_url = sha_to_url.get(sha, '')
                if not vcs_url:
                    continue
                if not _url_allowed_for_module(module_path, vcs_url, version):
                    continue
                # Skip entries with invalid commit hashes
                if commit and len(commit) != 40:
                    continue
                if not timestamp:
                    timestamp = derive_timestamp_from_version(version)
                update_metadata_cache(module_path, version, vcs_url, commit or '', timestamp, subdir, ref, dirty=False)
2864
2865
def load_metadata_from_module_cache_task(output_dir: Path) -> None:
    """
    Seed the metadata cache from the legacy module_cache_task.inc format.

    The legacy file embeds Python dict literals wrapped in parentheses; each
    blob is extracted with a regex and parsed via ast.literal_eval.  Unlike
    load_metadata_from_inc(), imported entries are marked dirty so they get
    migrated into the JSON metadata cache on the next save.
    """
    legacy_path = output_dir / "module_cache_task.inc"
    if not legacy_path.exists():
        return
    import ast
    # Non-greedy match of each "({...})" blob anywhere in the file.
    pattern = re.compile(r'\(\{.*?\}\)', re.DOTALL)
    text = legacy_path.read_text()
    for match in pattern.finditer(text):
        blob = match.group()[1:-1]  # strip parentheses
        try:
            entry = ast.literal_eval(blob)
        except Exception:
            continue
        module_path = entry.get('module')
        version = entry.get('version')
        # Older files used 'url' instead of 'repo_url'.
        vcs_url = entry.get('repo_url') or entry.get('url') or ''
        commit = entry.get('commit') or ''
        subdir = entry.get('subdir', '')
        ref = entry.get('ref', '')
        # All four core fields are required for a usable record.
        if not module_path or not version or not vcs_url or not commit:
            continue
        if vcs_url.startswith('git://'):
            vcs_url = 'https://' + vcs_url[6:]
        if not _url_allowed_for_module(module_path, vcs_url, version):
            continue
        timestamp = derive_timestamp_from_version(version)
        update_metadata_cache(module_path, version, vcs_url, commit, timestamp, subdir, ref, dirty=True)
2893
2894
def bootstrap_metadata_cache(output_dir: Optional[Path],
                             skip_inc_files: bool = False,
                             skip_legacy_module_cache: bool = False) -> None:
    """
    Bootstrap metadata cache from multiple sources.

    Args:
        output_dir: Recipe output directory (optional in cache-only mode)
        skip_inc_files: If True, skip loading from .inc files (used with --clean-cache)
        skip_legacy_module_cache: If True, skip loading legacy module_cache_task.inc metadata
    """
    # The JSON cache file is always consulted first.
    load_metadata_cache_file()
    if output_dir is None:
        # Cache-only mode: no recipe directory to scan.
        return
    if not skip_inc_files:
        load_metadata_from_inc(output_dir)
    if not skip_legacy_module_cache:
        load_metadata_from_module_cache_task(output_dir)
2911
2912
def _lookup_commit_for_version(vcs_url: str, version: str, preferred_ref: str = "") -> Tuple[Optional[str], Optional[str]]:
    """
    Resolve the git commit for a module version using git ls-remote.

    Tries the given URL and, when it lacks one, a ".git"-suffixed variant.
    Tagged versions are looked up as refs/tags/<tag> then as a bare tag;
    pseudo-versions are looked up by short commit, falling back to a local
    history scan via resolve_pseudo_version_commit().

    Returns:
        Tuple of (commit, timestamp). Timestamp may be None if unknown.
    """
    epoch = "1970-01-01T00:00:00Z"
    tag = version.split('+')[0]
    pseudo_info = parse_pseudo_version_tag(tag)

    candidate_urls = [vcs_url]
    if not vcs_url.endswith('.git'):
        candidate_urls.append(vcs_url.rstrip('/') + '.git')

    for candidate in candidate_urls:
        if preferred_ref:
            found = git_ls_remote(candidate, preferred_ref)
            if found:
                return found, epoch

        if pseudo_info:
            _, short_commit = pseudo_info
            found = git_ls_remote(candidate, short_commit)
            if found:
                return found, derive_timestamp_from_version(version)
        else:
            for candidate_ref in (f"refs/tags/{tag}", tag):
                found = git_ls_remote(candidate, candidate_ref)
                if found:
                    return found, epoch

    if pseudo_info:
        timestamp_str, short_commit = pseudo_info
        for candidate in candidate_urls:
            found = resolve_pseudo_version_commit(
                candidate,
                timestamp_str,
                short_commit,
                clone_cache_dir=CLONE_CACHE_DIR,
            )
            if found:
                return found, derive_timestamp_from_version(version)
        # Even without a commit, the pseudo-version encodes a usable timestamp.
        return None, derive_timestamp_from_version(version)

    return None, None
2961
2962
def query_module_via_go_list(module_path: str, version: str) -> Optional[Dict[str, str]]:
    """Use `go list -m -json` to obtain VCS metadata for a module version."""
    env = os.environ.copy()
    env.setdefault('GOPROXY', 'https://proxy.golang.org')
    if CURRENT_GOMODCACHE:
        env['GOMODCACHE'] = CURRENT_GOMODCACHE

    try:
        proc = subprocess.run(
            ['go', 'list', '-m', '-json', f'{module_path}@{version}'],
            capture_output=True,
            text=True,
            check=True,
            env=env,
            timeout=GO_CMD_TIMEOUT,
        )
    except subprocess.TimeoutExpired:
        print(f" ⚠️ go list timed out for {module_path}@{version} after {GO_CMD_TIMEOUT}s")
        return None
    except subprocess.CalledProcessError:
        return None

    try:
        info = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return None

    origin = info.get('Origin') or {}
    vcs_url = origin.get('URL', '')
    commit = origin.get('Hash', '')

    # Strip the "git+" scheme prefix some proxies report.
    if vcs_url.startswith('git+'):
        vcs_url = vcs_url[4:]

    # Both a repository URL and a commit are required to be useful.
    if not vcs_url or not commit:
        return None

    return {
        'vcs_url': vcs_url,
        'commit': commit,
        'timestamp': info.get('Time') or origin.get('Time') or '',
        'subdir': origin.get('Subdir', '') or '',
        'vcs_ref': origin.get('Ref', '') or '',
    }
3010
3011
3012def _candidate_gopkg_repos(module_path: str) -> List[str]:
3013 """
3014 Generate candidate repository URLs for gopkg.in modules.
3015 """
3016 if not module_path.startswith("gopkg.in/"):
3017 return []
3018
3019 remainder = module_path[len("gopkg.in/"):]
3020 if not remainder:
3021 return []
3022
3023 parts = remainder.split('/')
3024 last = parts[-1]
3025
3026 match = re.match(r'(?P<name>.+?)\.v\d+(?:[.\w-]*)?$', last)
3027 if not match:
3028 return []
3029
3030 repo_name = match.group('name')
3031 owner_segments = parts[:-1]
3032
3033 owner_variants: List[str] = []
3034 if owner_segments:
3035 canonical_owner = '/'.join(owner_segments)
3036 owner_variants.append(canonical_owner)
3037
3038 # Provide fallbacks with dotted segments replaced
3039 dotted_to_hyphen = '/'.join(segment.replace('.', '-') for segment in owner_segments)
3040 dotted_to_empty = '/'.join(segment.replace('.', '') for segment in owner_segments)
3041 for candidate in (dotted_to_hyphen, dotted_to_empty):
3042 if candidate and candidate not in owner_variants:
3043 owner_variants.append(candidate)
3044 else:
3045 # Common conventions used by gopkg.in vanity repos
3046 owner_variants.extend([
3047 f"go-{repo_name}",
3048 repo_name,
3049 f"{repo_name}-go",
3050 ])
3051
3052 urls: List[str] = []
3053 seen: Set[str] = set()
3054 for owner in owner_variants:
3055 owner = owner.strip('/')
3056 if not owner:
3057 continue
3058 candidate = f"https://github.com/{owner}/{repo_name}"
3059 if candidate not in seen:
3060 seen.add(candidate)
3061 urls.append(candidate)
3062 return urls
3063
3064
3065def _recalculate_subdir_from_vanity(vcs_url: str, module_parts: List[str], current_subdir: str) -> str:
3066 """
3067 Recalculate module subdirectory when a vanity import redirects to a different repository layout.
3068 """
3069 if not vcs_url:
3070 return current_subdir
3071
3072 vcs_repo_name = vcs_url.rstrip('/').split('/')[-1]
3073 if vcs_repo_name.endswith('.git'):
3074 vcs_repo_name = vcs_repo_name[:-4]
3075
3076 repo_boundary_index = None
3077 for i, part in enumerate(module_parts):
3078 if part == vcs_repo_name or part in vcs_repo_name or vcs_repo_name.endswith(part):
3079 repo_boundary_index = i + 1
3080 break
3081
3082 if repo_boundary_index is not None and repo_boundary_index < len(module_parts):
3083 subdir_parts = module_parts[repo_boundary_index:]
3084 if subdir_parts and subdir_parts[-1].startswith('v') and subdir_parts[-1][1:].isdigit():
3085 subdir_parts = subdir_parts[:-1]
3086 return '/'.join(subdir_parts) if subdir_parts else ''
3087
3088 if len(module_parts) <= 3:
3089 return ''
3090
3091 return current_subdir
3092
3093
def resolve_module_metadata(module_path: str, version: str) -> Optional[dict]:
    """
    Resolve a Go module (module_path @ version) to git VCS metadata.

    Returns a dict with keys module_path, version, vcs_url, vcs_hash,
    vcs_ref, timestamp and subdir — or None when no repository/commit
    could be derived.

    Resolution order:
      1. cached metadata (revalidated against pseudo-version commit
         prefixes, repo overrides, and a fresh ls-remote lookup for
         tagged versions),
      2. gopkg.in vanity paths (candidate GitHub repos),
      3. short module paths (< 3 segments),
      4. default domain/org/repo layout with override / golang.org/x /
         github.com / vanity-URL handling.

    Successful resolutions are written back to the metadata cache via
    update_metadata_cache().
    """
    parts = module_path.split('/')
    vanity_repo = None  # Track if module was resolved via vanity URL

    # Pseudo-versions embed a commit-hash prefix; it is used below to
    # sanity-check any cached commit.
    tag = version.split('+')[0]
    pseudo_info = parse_pseudo_version_tag(tag)
    expected_commit_prefix = pseudo_info[1] if pseudo_info else None

    cached = get_cached_metadata(module_path, version)
    if cached:
        override_urls = repo_override_candidates(module_path, version)
        # Drop cache entries whose commit contradicts the pseudo-version's
        # embedded hash prefix.
        if expected_commit_prefix:
            cached_commit = cached.get('vcs_hash') or ''
            if cached_commit and not cached_commit.startswith(expected_commit_prefix):
                cached = None
        # Drop cache entries pointing at a repo outside the override list.
        if cached and override_urls:
            url = cached.get('vcs_url') or ''
            if url and url not in override_urls:
                cached = None
        # For real tags (no pseudo-version), re-resolve the tag remotely and
        # drop the entry if the commit moved (e.g. force-pushed tag).
        if cached and not expected_commit_prefix:
            ref_hint = cached.get('vcs_ref', '')
            commit_check, _ = _lookup_commit_for_version(cached.get('vcs_url', ''), version, ref_hint)
            if not commit_check or commit_check.lower() != (cached.get('vcs_hash', '') or '').lower():
                cached = None

    def fetch_go_metadata() -> Optional[Dict[str, str]]:
        # Ask `go list` for module metadata; if that fails, populate the Go
        # module cache with `go mod download` and retry once.
        info = query_module_via_go_list(module_path, version)
        if info:
            return info
        if go_mod_download(module_path, version):
            return query_module_via_go_list(module_path, version)
        return None

    def resolve_with_go_info(go_info: Optional[Dict[str, str]], fallback_url: str, fallback_subdir: str) -> Optional[dict]:
        # Convert `go list` metadata into a resolved entry by probing
        # candidate repos (overrides first, then Go's URL, then the
        # caller-supplied fallback) for a commit matching this version.
        if not go_info:
            return None

        candidate_urls: List[str] = []
        overrides = repo_override_candidates(module_path, version)
        candidate_urls.extend(overrides)
        info_url = (go_info.get('vcs_url') or '').strip()
        if info_url and info_url not in candidate_urls:
            candidate_urls.append(info_url)
        if fallback_url and fallback_url not in candidate_urls:
            candidate_urls.append(fallback_url)

        timestamp_hint = go_info.get('timestamp') or derive_timestamp_from_version(version)
        subdir_hint = go_info.get('subdir', '') or fallback_subdir
        ref_hint = go_info.get('vcs_ref', '')

        for candidate in candidate_urls:
            if not _url_allowed_for_module(module_path, candidate, version):
                continue
            commit_candidate, timestamp_candidate = _lookup_commit_for_version(candidate, version, ref_hint)
            if commit_candidate:
                final_timestamp = timestamp_candidate or timestamp_hint
                update_metadata_cache(
                    module_path,
                    version,
                    candidate,
                    commit_candidate,
                    final_timestamp,
                    subdir_hint,
                    ref_hint,
                    dirty=True,
                )
                return {
                    "module_path": module_path,
                    "version": version,
                    "vcs_url": candidate,
                    "vcs_hash": commit_candidate,
                    "vcs_ref": ref_hint,
                    "timestamp": final_timestamp,
                    "subdir": subdir_hint,
                }
        return None

    # Handle gopkg.in special case
    if parts[0] == 'gopkg.in':
        # Probe candidate repos: cached URL first, then the vanity-URL
        # redirect, then conventional GitHub name guesses.
        repo_candidates: List[str] = []
        vanity_repo = query_vanity_url(module_path)
        if vanity_repo:
            repo_candidates.append(vanity_repo)
        repo_candidates.extend(_candidate_gopkg_repos(module_path))
        if cached and cached.get('vcs_url'):
            repo_candidates.insert(0, cached['vcs_url'])

        for vcs_url in repo_candidates:
            if not vcs_url:
                continue
            commit, timestamp = _lookup_commit_for_version(vcs_url, version)
            if commit:
                resolved_timestamp = timestamp or derive_timestamp_from_version(version)
                update_metadata_cache(module_path, version, vcs_url, commit, resolved_timestamp, '', '', dirty=True)
                return {
                    "module_path": module_path,
                    "version": version,
                    "vcs_url": vcs_url,
                    "vcs_hash": commit,
                    "vcs_ref": "",
                    "timestamp": resolved_timestamp,
                    "subdir": "",
                }

        go_info = fetch_go_metadata()
        result = resolve_with_go_info(go_info, '', '')

        if result:
            return result

        # Last resort: a surviving cache entry is better than nothing.
        if cached:
            return cached

        print(f"  ⚠️ Unable to derive repository for gopkg.in path {module_path}@{version}")
        return None

    if len(parts) < 3:
        # Short/vanity paths (e.g. host/name): rely on Go metadata or a
        # vanity redirect to find the actual repository.
        go_info = fetch_go_metadata()
        result = resolve_with_go_info(go_info, '', '')
        if result:
            return result

        vanity_repo = query_vanity_url(module_path)
        if vanity_repo:
            commit, timestamp = _lookup_commit_for_version(vanity_repo, version)
            if commit:
                resolved_timestamp = timestamp or derive_timestamp_from_version(version)
                update_metadata_cache(module_path, version, vanity_repo, commit, resolved_timestamp, '', '', dirty=True)
                return {
                    "module_path": module_path,
                    "version": version,
                    "vcs_url": vanity_repo,
                    "vcs_hash": commit,
                    "vcs_ref": "",
                    "timestamp": resolved_timestamp,
                    "subdir": '',
                }

        if cached:
            return cached

        print(f"  ⚠️ Unable to derive repository for {module_path}@{version}")
        return None
    else:
        # Default calculation assuming 3-part paths (domain/org/repo)
        base_repo = '/'.join(parts[:3])

        # Calculate subdir from module path, but strip version suffixes (v2, v3, v11, etc.)
        if len(parts) > 3:
            subdir_parts = parts[3:]
            # Remove trailing version suffix if present (e.g., v2, v3, v11)
            if subdir_parts and subdir_parts[-1].startswith('v') and subdir_parts[-1][1:].isdigit():
                subdir_parts = subdir_parts[:-1]
            subdir = '/'.join(subdir_parts) if subdir_parts else ''
        else:
            subdir = ''

        override_candidate = None
        override_urls = repo_override_candidates(module_path, version)
        if override_urls:
            override_candidate = override_urls[0]

        # Pick the repository URL: override wins, then known hosting
        # conventions (golang.org/x, github.com), then a vanity lookup.
        if override_candidate:
            vcs_url = override_candidate
        elif parts[0] == 'golang.org' and len(parts) >= 3 and parts[1] == 'x':
            pkg_name = parts[2]
            vcs_url = f"https://go.googlesource.com/{pkg_name}"
        elif parts[0] == 'github.com' and len(parts) >= 3:
            vcs_url = f"https://{base_repo}"
        else:
            vanity_repo = query_vanity_url(module_path)
            if vanity_repo:
                vcs_url = vanity_repo
                subdir = _recalculate_subdir_from_vanity(vcs_url, parts, subdir)
            else:
                vcs_url = f"https://{base_repo}"

        if cached and cached.get('vcs_url') and cached.get('vcs_hash'):
            # A vanity redirect may change the repo layout: fix up the
            # cached subdir and persist the correction before returning.
            if vanity_repo:
                adjusted_subdir = _recalculate_subdir_from_vanity(
                    cached['vcs_url'],
                    parts,
                    cached.get('subdir', ''),
                )
                if adjusted_subdir != cached.get('subdir', ''):
                    cached['subdir'] = adjusted_subdir
                    update_metadata_cache(
                        module_path,
                        version,
                        cached['vcs_url'],
                        cached['vcs_hash'],
                        cached['timestamp'],
                        adjusted_subdir,
                        cached.get('vcs_ref', ''),
                        dirty=True,
                    )
            return cached

        commit, timestamp = _lookup_commit_for_version(vcs_url, version)
        if not commit:
            # Direct ls-remote lookup failed; fall back to `go list` data.
            go_info = fetch_go_metadata()
            result = resolve_with_go_info(go_info, vcs_url, subdir)
            if result:
                return result

            FAILED_MODULE_PATHS.add(module_path)
            _record_skipped_module(module_path, version, "no repository metadata from go.sum/go list")
            print(f"  ⚠️ Unable to derive repository for {module_path}@{version}")
            if cached and cached.get('vcs_hash'):
                return cached
            return None

        if not _url_allowed_for_module(module_path, vcs_url, version):
            FAILED_MODULE_PATHS.add(module_path)
            _record_skipped_module(module_path, version, "resolved repo not allowed by override policy")
            print(f"  ⚠️ Resolved repo {vcs_url} for {module_path}@{version} not in override allowlist")
            if cached and cached.get('vcs_hash'):
                return cached
            return None

        resolved_timestamp = timestamp or derive_timestamp_from_version(version)

        update_metadata_cache(module_path, version, vcs_url, commit, resolved_timestamp, subdir, '', dirty=True)

        return {
            "module_path": module_path,
            "version": version,
            "vcs_url": vcs_url,
            "vcs_hash": commit,
            "vcs_ref": "",
            "timestamp": resolved_timestamp,
            "subdir": subdir,
        }
3327
3328
3329# =============================================================================
3330# Utility Functions
3331# =============================================================================
3332
def unescape_module_path(path: str) -> str:
    """
    Unescape a Go module cache path that encodes uppercase letters
    as '!' followed by the lowercase letter.

    Example: github.com/!sirupsen/logrus -> github.com/Sirupsen/logrus
    """
    # 're' is already imported at module level (used throughout this
    # file); the previous function-local import was redundant.
    return re.sub(r'!([a-z])', lambda m: m.group(1).upper(), path)
3340
def escape_module_path(path: str) -> str:
    """
    Escape a Go module path for use as a module-cache directory name:
    each uppercase letter becomes '!' followed by its lowercase form.

    Example: github.com/Sirupsen/logrus -> github.com/!sirupsen/logrus
    """
    # 're' is already imported at module level (used throughout this
    # file); the previous function-local import was redundant.
    return re.sub(r'([A-Z])', lambda m: '!' + m.group(1).lower(), path)
3348
3349# =============================================================================
3350# Phase 1: Discovery
3351# =============================================================================
3352
def parse_go_mod_requires(go_mod_path: Path) -> List[tuple]:
    """
    Extract every module requirement (direct and indirect) from a go.mod.

    Handles both the single-line form ("require module version") and the
    parenthesized require block, skipping // comment lines inside blocks.
    This replaces the need for fast-fix-module.py by surfacing all
    transitive dependencies that Go needs.

    Returns a list of (module_path, version) tuples; empty when the file
    is missing or cannot be parsed.
    """
    requirements: List[tuple] = []

    if not go_mod_path.exists():
        print(f"Warning: go.mod not found at {go_mod_path}")
        return requirements

    inside_block = False

    try:
        with open(go_mod_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                text = raw_line.strip()

                if text.startswith('require ('):
                    # Entering a parenthesized require block.
                    inside_block = True
                elif inside_block and text == ')':
                    # Leaving the require block.
                    inside_block = False
                elif text.startswith('require ') and '(' not in text:
                    # Single-line form: require <module> <version>
                    fields = text.split()
                    if len(fields) >= 3:
                        requirements.append((fields[1], fields[2]))
                elif inside_block and text and not text.startswith('//'):
                    # Block entry: "<module> <version> [// indirect]"
                    fields = text.split()
                    if len(fields) >= 2:
                        requirements.append((fields[0], fields[1]))

    except Exception as e:
        print(f"Error parsing go.mod: {e}")

    return requirements
3411
3412
def download_all_required_modules(source_dir: Path, gomodcache: Path) -> None:
    """
    Pre-download every module listed in go.mod (direct + indirect).

    Guarantees each dependency has a .info file in GOMODCACHE so that
    discover_modules() can see it.  Downloading everything up front is
    what makes all transitive modules discoverable (replacing the old
    fast-fix-module.py workflow).
    """
    go_mod_path = source_dir / "go.mod"

    print(f"\n" + "=" * 70)
    print("DISCOVERY ENHANCEMENT: Downloading all required modules")
    print("=" * 70)
    print(f"Parsing {go_mod_path}...")

    required_modules = parse_go_mod_requires(go_mod_path)
    if not required_modules:
        print("Warning: No modules found in go.mod")
        return

    print(f"Found {len(required_modules)} total modules in go.mod (direct + indirect)")

    # Go invocation environment: dedicated module cache, public proxy.
    env = os.environ.copy()
    env['GOMODCACHE'] = str(gomodcache)
    env['GOPROXY'] = 'https://proxy.golang.org'

    # Tallies for the summary printed at the end.
    success_count = 0
    skip_count = 0
    fail_count = 0

    for module_path, version in required_modules:
        # Skip modules whose .info metadata already exists in the cache.
        info_path = (gomodcache / "cache" / "download"
                     / escape_module_path(module_path) / "@v"
                     / f"{escape_module_path(version)}.info")
        if info_path.exists():
            skip_count += 1
            continue

        # Fetch the module so Go writes its .info (with VCS metadata).
        try:
            result = subprocess.run(
                ['go', 'mod', 'download', f'{module_path}@{version}'],
                cwd=source_dir,
                env=env,
                capture_output=True,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                success_count += 1
            else:
                fail_count += 1
                if "no matching versions" not in result.stderr:
                    print(f"  Warning: Failed to download {module_path}@{version}: {result.stderr.strip()[:100]}")
        except subprocess.TimeoutExpired:
            fail_count += 1
            print(f"  Warning: Timeout downloading {module_path}@{version}")
        except Exception as e:
            fail_count += 1
            print(f"  Warning: Error downloading {module_path}@{version}: {e}")

    print(f"\nDownload results:")
    print(f"  ✓ {success_count} modules downloaded")
    print(f"  ⊙ {skip_count} modules already cached")
    print(f"  ✗ {fail_count} modules failed")
    print(f"  → Total: {len(required_modules)} modules")
3488
3489
def discover_modules(source_dir: Path, gomodcache: Optional[str] = None) -> List[Dict]:
    """
    Phase 1: Discovery

    Let Go download modules to discover correct paths and metadata.
    This is ONLY for discovery - we build from git sources.

    Returns list of modules with:
    - module_path: CORRECT path from filesystem (no /v3 stripping!)
    - version: Module version
    - vcs_url: Git repository URL
    - vcs_hash: Git commit hash
    - vcs_ref: Git reference (tag/branch)
    - timestamp: Commit timestamp
    - subdir: Subdirectory within repo (for submodules)
    """
    global CURRENT_GOMODCACHE
    print("\n" + "=" * 70)
    print("PHASE 1: DISCOVERY - Using Go to discover module metadata")
    print("=" * 70)

    # Create temporary or use provided GOMODCACHE
    if gomodcache:
        temp_cache = Path(gomodcache)
        print(f"Using existing GOMODCACHE: {temp_cache}")
        cleanup_cache = False
    else:
        temp_cache = Path(tempfile.mkdtemp(prefix="go-discover-"))
        print(f"Created temporary cache: {temp_cache}")
        cleanup_cache = True
    CURRENT_GOMODCACHE = str(temp_cache)

    try:
        ensure_path_is_writable(temp_cache)

        # Set up environment for Go
        env = os.environ.copy()
        env['GOMODCACHE'] = str(temp_cache)
        env['GOPROXY'] = 'https://proxy.golang.org'

        print(f"\nDownloading modules to discover metadata...")
        print(f"Source: {source_dir}")

        # Let Go download everything (initial discovery)
        result = subprocess.run(
            ['go', 'mod', 'download'],
            cwd=source_dir,
            env=env,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            print(f"Warning: go mod download had errors:\n{result.stderr}")
            # Continue anyway - some modules may have been downloaded

        # PRIORITY #2 FIX: Download ALL modules from go.mod (direct + indirect)
        # This replaces the need for fast-fix-module.py by ensuring all
        # transitive dependencies have .info files for discovery
        download_all_required_modules(source_dir, temp_cache)

        # Walk filesystem to discover what Go created
        modules = []
        download_dir = temp_cache / "cache" / "download"

        if not download_dir.exists():
            print(f"Error: Download directory not found: {download_dir}")
            return []

        print(f"\nScanning {download_dir} for modules...")

        for dirpath, _, filenames in os.walk(download_dir):
            path_parts = Path(dirpath).relative_to(download_dir).parts

            # Look for @v directories
            if not path_parts or path_parts[-1] != '@v':
                continue

            # Module path is everything before @v
            module_path = '/'.join(path_parts[:-1])
            module_path = unescape_module_path(module_path)  # Unescape !-encoding

            # Process each .info file
            for filename in filenames:
                if not filename.endswith('.info'):
                    continue

                version = filename[:-5]  # Strip .info extension
                info_path = Path(dirpath) / filename

                try:
                    # Read metadata from .info file
                    with open(info_path) as f:
                        info = json.load(f)

                    # BUG FIX: capture the proxy timestamp up front.
                    # origin_time is consumed by the verification calls
                    # below; previously it was only assigned inside the
                    # missing-VCS fallback branch, so on the common path it
                    # was either undefined (NameError, silently swallowed by
                    # the outer except which skipped the module) or stale
                    # from a previous loop iteration.
                    origin_time = info.get('Time', '')

                    # Extract VCS information
                    origin = info.get('Origin', {})
                    vcs_url = origin.get('URL')
                    vcs_hash = origin.get('Hash')
                    vcs_ref = origin.get('Ref', '')
                    subdir = origin.get('Subdir', '')

                    if not vcs_url or not vcs_hash:
                        # Try to refresh cache entry and ask Go directly for metadata.
                        go_mod_download(module_path, version)

                        # Reload .info in case go mod download updated it.
                        try:
                            with open(info_path) as f:
                                info = json.load(f)
                            origin = info.get('Origin', {})
                            vcs_url = origin.get('URL')
                            vcs_hash = origin.get('Hash')
                            vcs_ref = origin.get('Ref', '')
                            subdir = origin.get('Subdir', '')
                            # Keep origin_time in sync with the refreshed file.
                            origin_time = info.get('Time', origin_time)
                        except Exception:
                            pass

                        if not vcs_url or not vcs_hash:
                            go_info = query_module_via_go_list(module_path, version)
                            if go_info:
                                vcs_url = go_info.get('vcs_url')
                                vcs_hash = go_info.get('commit')
                                subdir = go_info.get('subdir', subdir)
                                origin_time = go_info.get('timestamp', '')
                                if origin_time:
                                    info['Time'] = origin_time

                    if not vcs_url or not vcs_hash:
                        print(f"  ⚠️ Skipping {module_path}@{version}: No VCS info")
                        continue

                    # Repo overrides always take precedence over discovered URLs.
                    overrides = repo_override_candidates(module_path, version)
                    if overrides:
                        vcs_url = overrides[0]

                    # BitBake requires full 40-character commit hashes
                    if len(vcs_hash) != 40:
                        print(f"  ⚠️ Skipping {module_path}@{version}: Short commit hash ({vcs_hash})")
                        continue

                    # PROACTIVE dangling commit detection and correction
                    # Check if commit is BitBake-fetchable BEFORE expensive verification
                    # BitBake's nobranch=1 requires commits to be branch/tag HEADs, not dangling commits
                    if VERIFY_ENABLED and vcs_ref and vcs_ref.startswith("refs/"):
                        if not is_commit_bitbake_fetchable(vcs_url, vcs_hash, vcs_ref):
                            print(f"  ⚠️ DANGLING COMMIT: {module_path}@{version} commit {vcs_hash[:12]} not a branch/tag HEAD")

                            # Try to correct by dereferencing the ref
                            corrected_hash = correct_commit_hash_from_ref(vcs_url, vcs_hash, vcs_ref)
                            if corrected_hash:
                                print(f"    ✓ Corrected hash by dereferencing {vcs_ref}: {vcs_hash[:12]} → {corrected_hash[:12]}")
                                vcs_hash = corrected_hash
                            else:
                                print(f"    ❌ Could not auto-correct dangling commit")
                                # Continue anyway - verification will catch if it's truly unfetchable

                    # Validate commit exists in repository (detect force-pushed tags)
                    # If verification is enabled, check that the commit from .info file
                    # actually exists in the repository. If not, refresh from Go proxy.
                    commit_verified = VERIFY_ENABLED and verify_commit_accessible(vcs_url, vcs_hash, vcs_ref, version, origin_time)

                    # Apply fallback commit if verification used one (for orphaned commits)
                    if commit_verified and VERIFY_ENABLED:
                        vcs_hash = get_actual_commit(vcs_url, vcs_hash)

                    if VERIFY_ENABLED and not commit_verified:
                        print(f"  ⚠️ STALE CACHE: {module_path}@{version} commit {vcs_hash[:12]} not found in {vcs_url}")

                        # Last resort: Try proxy refresh (this shouldn't happen if dangling check worked)
                        corrected_hash = correct_commit_hash_from_ref(vcs_url, vcs_hash, vcs_ref)
                        if corrected_hash:
                            print(f"    ✓ Corrected hash by dereferencing {vcs_ref}: {vcs_hash[:12]} → {corrected_hash[:12]}")
                            vcs_hash = corrected_hash
                            # Verify the corrected hash is accessible
                            if verify_commit_accessible(vcs_url, vcs_hash, vcs_ref, version, origin_time):
                                # Successfully corrected! Continue with this module (skip proxy refresh)
                                commit_verified = True
                            else:
                                print(f"    ❌ Even corrected commit not accessible")

                        # If still not verified after correction attempt, try proxy refresh
                        if not commit_verified:
                            # Check if module is actually needed before attempting refresh
                            if not is_module_actually_needed(module_path, CURRENT_SOURCE_DIR):
                                print(f"    ℹ️ Module not needed by main module (indirect-only), skipping")
                                print(f"      (Verified via 'go mod why {module_path}')")
                                continue

                            print(f"    Attempting to refresh from Go proxy...")

                            # Delete stale .info file to force re-download
                            try:
                                info_path.unlink()
                                print(f"      Deleted stale .info file")
                            except Exception as e:
                                print(f"      Warning: Could not delete .info file: {e}")

                            # Re-download from Go proxy to get current commit
                            try:
                                go_mod_download(module_path, version)

                                # Reload .info file with fresh data
                                if info_path.exists():
                                    with open(info_path) as f:
                                        info = json.load(f)
                                    origin = info.get('Origin', {})
                                    new_vcs_hash = origin.get('Hash')

                                    if new_vcs_hash and new_vcs_hash != vcs_hash:
                                        print(f"      ✓ Refreshed: {vcs_hash[:12]} → {new_vcs_hash[:12]}")
                                        vcs_hash = new_vcs_hash
                                        vcs_ref = origin.get('Ref', vcs_ref)

                                        # Verify new commit exists
                                        if not verify_commit_accessible(vcs_url, vcs_hash, vcs_ref, version, origin.get('Time', '')):
                                            print(f"      ❌ Even refreshed commit not accessible")
                                            # Last resort: check if it's actually needed
                                            if not is_module_actually_needed(module_path, CURRENT_SOURCE_DIR):
                                                print(f"        ℹ️ Module not needed anyway, skipping")
                                                continue
                                            else:
                                                print(f"        ❌ Module IS needed but commit unavailable")
                                                print(f"        This module cannot be built from git sources")
                                                continue
                                    else:
                                        print(f"      ⚠️ Go proxy returned same commit (permanently deleted)")
                                        # Check if it's actually needed
                                        if not is_module_actually_needed(module_path, CURRENT_SOURCE_DIR):
                                            print(f"        ℹ️ Module not needed by main module, skipping")
                                            continue
                                        else:
                                            print(f"        ❌ Module IS needed but commit permanently deleted")
                                            print(f"        Consider using gomod:// fetcher for this module")
                                            continue
                                else:
                                    print(f"      ❌ Re-download failed, skipping module")
                                    continue
                            except Exception as e:
                                print(f"      ❌ Refresh failed: {e}")
                                continue

                    DOWNLOADED_MODULES.add((module_path, version))
                    modules.append({
                        'module_path': module_path,
                        'version': version,
                        'vcs_url': vcs_url,
                        'vcs_hash': vcs_hash,
                        'vcs_ref': vcs_ref,
                        'timestamp': info.get('Time', ''),
                        'subdir': subdir or '',
                    })

                    print(f"  ✓ {module_path}@{version}")

                except Exception as e:
                    print(f"  ✗ Error processing {info_path}: {e}")
                    continue

        print(f"\nDiscovered {len(modules)} modules with VCS info")

        # FIX: Synthesize entries for +incompatible versions that lack VCS data
        # These are pre-v2 versions of modules that later adopted semantic import versioning (/v2, /v3, etc.)
        # The GOMODCACHE has .info files for them but without Origin data (old proxy cache)
        # Strategy: For each versioned module path (e.g., foo/v3), check if a base path version
        # with +incompatible exists in GOMODCACHE and lacks VCS data. If so, synthesize an entry.
        #
        # NOTE (2025-11-28): This code overlaps with Fix #29 in extract-native-modules.py, which
        # now uses derive_vcs_info() to handle +incompatible modules at discovery time. Fix #29
        # is more complete because it handles ALL +incompatible modules directly from their path,
        # not just those with a corresponding /vN version. This code is kept as a fallback for
        # cases where extract-native-modules.py wasn't used (e.g., legacy workflows).
        print("\nSynthesizing entries for +incompatible versions without VCS data...")
        synthesized_count = 0

        # Build a map of module_path -> vcs_url for discovered modules
        module_vcs_map: Dict[str, str] = {}
        for mod in modules:
            module_vcs_map[mod['module_path']] = mod['vcs_url']

        # For each module with a versioned path suffix (/v2, /v3, etc.), check for base path incompatible versions
        for mod in list(modules):  # Iterate over copy since we'll append to modules
            module_path = mod['module_path']
            vcs_url = mod['vcs_url']

            # Check if this module has a version suffix (/v2, /v3, etc.)
            version_match = re.search(r'/v(\d+)$', module_path)
            if not version_match:
                continue

            # Extract base path (without /vN suffix)
            base_path = module_path[:module_path.rfind('/v')]

            # Check if we already discovered the base path
            if base_path in module_vcs_map:
                continue  # Base path already has VCS data, no synthesis needed

            # Look for +incompatible versions of the base path in GOMODCACHE
            # Note: GOMODCACHE uses raw paths as directory names (not escaped)
            base_path_dir = download_dir / base_path / '@v'

            if not base_path_dir.exists():
                continue

            # Scan for .info files with +incompatible versions
            for info_file in base_path_dir.glob('*.info'):
                version = info_file.stem

                if not version.endswith('+incompatible'):
                    continue

                # Read the .info file to check if it lacks VCS data
                try:
                    with open(info_file) as f:
                        info = json.load(f)

                    # If it already has Origin data, skip it
                    if 'Origin' in info and info['Origin'].get('URL') and info['Origin'].get('Hash'):
                        continue

                    # This +incompatible version lacks VCS data - synthesize an entry
                    # Extract the tag name from version (e.g., v2.16.0+incompatible -> v2.16.0)
                    tag_version = version.replace('+incompatible', '')
                    tag_ref = f"refs/tags/{tag_version}"

                    # Use git ls-remote to find the commit for this tag
                    tag_commit = git_ls_remote(vcs_url, tag_ref)

                    if not tag_commit:
                        print(f"  ⚠️ Could not find tag {tag_ref} for {base_path}@{version}")
                        continue

                    # Synthesize a module entry using data from the versioned path
                    synthesized_module = {
                        'module_path': base_path,  # Use BASE path (without /vN)
                        'version': version,
                        'vcs_url': vcs_url,
                        'vcs_hash': tag_commit,
                        'vcs_ref': tag_ref,
                        'timestamp': info.get('Time', ''),
                        'subdir': '',
                    }

                    modules.append(synthesized_module)
                    module_vcs_map[base_path] = vcs_url  # Prevent duplicate synthesis
                    synthesized_count += 1

                    print(f"  ✓ Synthesized {base_path}@{version} (from {module_path} VCS data)")
                    print(f"     VCS: {vcs_url}")
                    print(f"     Commit: {tag_commit[:12]} (tag {tag_version})")

                except Exception as e:
                    print(f"  ⚠️ Error synthesizing {base_path}@{version}: {e}")
                    continue

        if synthesized_count > 0:
            print(f"\nSynthesized {synthesized_count} +incompatible module entries")
        else:
            print("No +incompatible versions needed synthesis")

        print(f"\nTotal modules after synthesis: {len(modules)}")
        return modules

    finally:
        # Defer cleanup of temporary caches until the end of execution
        if cleanup_cache and temp_cache.exists():
            TEMP_GOMODCACHES.append(temp_cache)
3857
3858# =============================================================================
3859# Phase 2: Recipe Generation
3860# =============================================================================
3861
3862def generate_recipe(modules: List[Dict], source_dir: Path, output_dir: Optional[Path],
3863 git_repo: str, git_ref: str, validate_only: bool = False,
3864 debug_limit: Optional[int] = None, skip_verify: bool = False,
3865 verify_jobs: int = 10) -> bool:
3866 """
3867 Phase 2: Recipe Generation
3868
3869 Generate BitBake recipe with git:// SRC_URI entries.
3870 No file:// entries - we'll build cache from git during do_create_module_cache.
3871
3872 Creates:
3873 - go-mod-git.inc: SRC_URI with git:// entries
3874 - go-mod-cache.inc: BitBake task to build module cache
3875 """
3876 print("\n" + "=" * 70)
3877 phase_label = "VALIDATION" if validate_only else "RECIPE GENERATION"
3878 print(f"PHASE 2: {phase_label} - {('commit verification' if validate_only else 'Creating BitBake recipe files')}")
3879 print("=" * 70)
3880
3881 src_uri_entries = []
3882 modules_data = []
3883 vcs_repos: Dict[str, Dict] = {}
3884
3885 def repo_key_for_url(url: str) -> str:
3886 return hashlib.sha256(f"git3:{url}".encode()).hexdigest()
3887
3888 def commit_cache_key(repo_key: str, commit: str) -> str:
3889 return hashlib.sha256(f"{repo_key}:{commit}".encode()).hexdigest()
3890
3891 unresolved_commits: List[Tuple[str, str, str, str, str]] = []
3892
3893 total_modules = len(modules)
3894 if debug_limit is not None:
3895 print(f"\n⚙️ Debug limit active: validating first {debug_limit} modules (total list size {total_modules})")
3896
3897 if skip_verify:
3898 print(f"\n⚙️ Skipping verification (--skip-verify enabled)")
3899
3900 # First pass: Build repo structure without verification
3901 for index, module in enumerate(modules, start=1):
3902 vcs_url = module['vcs_url']
3903 commit_hash = module['vcs_hash']
3904
3905 repo_key = repo_key_for_url(vcs_url)
3906 repo_info = vcs_repos.setdefault(
3907 repo_key,
3908 {
3909 'url': vcs_url,
3910 'commits': {}, # commit hash -> commit metadata
3911 },
3912 )
3913
3914 if commit_hash not in repo_info['commits']:
3915 commit_sha = commit_cache_key(repo_key, commit_hash)
3916 repo_info['commits'][commit_hash] = {
3917 'commit_sha': commit_sha,
3918 'modules': [],
3919 }
3920 else:
3921 commit_sha = repo_info['commits'][commit_hash]['commit_sha']
3922
3923 ref_hint = module.get('vcs_ref', '')
3924 if ref_hint and not _ref_points_to_commit(vcs_url, ref_hint, commit_hash):
3925 ref_hint = ''
3926
3927 entry = repo_info['commits'][commit_hash]
3928 entry['modules'].append(module)
3929 if ref_hint:
3930 entry['ref_hint'] = ref_hint
3931
3932 module['repo_key'] = repo_key
3933 module['commit_sha'] = commit_sha
3934
3935 # Second pass: Verify commits (parallel or sequential) with auto-correction
3936 # PHASE MERGE: This now includes force-pushed tag detection and auto-correction
3937 global VERIFY_CORRECTIONS_APPLIED
3938 if not skip_verify:
3939 print(f"\n⚙️ Verifying {total_modules} commits with {verify_jobs} parallel jobs")
3940 corrected_modules = [] # Track corrections for reporting
3941
        def verify_module(module_info):
            """
            Verify that one module's pinned commit is reachable upstream,
            auto-correcting force-pushed tags and orphaned commits when possible.

            Args:
                module_info: ``(index, module)`` tuple as produced by
                    ``enumerate(modules, start=1)``; ``index`` is only used
                    for progress output.

            Returns:
                ``None`` on plain success,
                ``('corrected', module_path, version, old_hash, new_hash)``
                when the hash was rewritten (force-pushed tag dereference or
                fallback commit), or
                ``('failed', module_path, version, commit_hash, vcs_url,
                ref_hint)`` when the commit could not be verified or corrected.

            NOTE(review): mutates the closed-over ``vcs_repos`` mapping and the
            module dict in place (re-keys the commits dict to the corrected
            hash). Runs concurrently under ThreadPoolExecutor; assumes each
            (repo, commit) pair is handled by a single worker — confirm no two
            modules share a commit entry that both workers would re-key.
            """
            index, module = module_info
            vcs_url = module['vcs_url']
            commit_hash = module['vcs_hash']
            ref_hint = module.get('vcs_ref', '')

            print(f" • verifying [{index}/{total_modules}] {module['module_path']}@{module['version']} -> {commit_hash[:12]}")

            # Verify commit is accessible
            if not verify_commit_accessible(vcs_url, commit_hash, ref_hint, module.get('version', ''), module.get('timestamp', '')):
                # PHASE MERGE: If verification fails and we have a ref, try auto-correction
                # by dereferencing the ref upstream (handles force-pushed tags).
                if ref_hint and ref_hint.startswith("refs/"):
                    corrected_hash = correct_commit_hash_from_ref(vcs_url, commit_hash, ref_hint)
                    if corrected_hash and corrected_hash != commit_hash:
                        print(f" ✓ Auto-corrected: {commit_hash[:12]} → {corrected_hash[:12]} (force-pushed tag)")
                        module['vcs_hash'] = corrected_hash

                        # Update repo_info dict to use the new hash as key
                        repo_key = module['repo_key']
                        if commit_hash in vcs_repos[repo_key]['commits']:
                            # Move the entry from old hash to new hash
                            vcs_repos[repo_key]['commits'][corrected_hash] = vcs_repos[repo_key]['commits'].pop(commit_hash)

                        return ('corrected', module['module_path'], module['version'], commit_hash, corrected_hash)
                    else:
                        # Could not correct - treat as failure
                        return ('failed', module['module_path'], module['version'], commit_hash, vcs_url, ref_hint)
                else:
                    # No ref to dereference - genuine failure
                    return ('failed', module['module_path'], module['version'], commit_hash, vcs_url, ref_hint)
            else:
                # Verification succeeded - apply fallback commit if one was used
                # (get_actual_commit returns the substitute recorded during verify).
                actual_hash = get_actual_commit(vcs_url, commit_hash)
                if actual_hash != commit_hash:
                    print(f" ✓ Applied fallback: {commit_hash[:12]} → {actual_hash[:12]} (orphaned commit)")
                    module['vcs_hash'] = actual_hash

                    # Update repo_info dict to use the new hash as key
                    repo_key = module['repo_key']
                    if commit_hash in vcs_repos[repo_key]['commits']:
                        # Move the entry from old hash to new hash
                        vcs_repos[repo_key]['commits'][actual_hash] = vcs_repos[repo_key]['commits'].pop(commit_hash)

                    return ('corrected', module['module_path'], module['version'], commit_hash, actual_hash)
            return None
3987
3988 if verify_jobs > 0:
3989 # Parallel verification
3990 with concurrent.futures.ThreadPoolExecutor(max_workers=verify_jobs) as executor:
3991 results = list(executor.map(verify_module, enumerate(modules, start=1)))
3992 else:
3993 # Sequential verification (--verify-jobs=0)
3994 results = []
3995 for index, module in enumerate(modules, start=1):
3996 result = verify_module((index, module))
3997 if result is not None:
3998 results.append(result)
3999
4000 # Save verification cache every 50 modules
4001 if index % 50 == 0:
4002 save_verify_commit_cache(force=True)
4003 print(f" 💾 Saved verification cache at {index}/{total_modules}")
4004
4005 # Separate corrected vs failed results
4006 corrected_results = [r for r in results if r and r[0] == 'corrected']
4007 failed_results = [r for r in results if r and r[0] == 'failed']
4008
4009 # Apply corrections back to modules list (needed for parallel execution)
4010 if corrected_results:
4011 VERIFY_CORRECTIONS_APPLIED = True
4012 print(f"\n✓ Auto-corrected {len(corrected_results)} force-pushed tags:")
4013 for _, module_path, version, old_hash, new_hash in corrected_results:
4014 print(f" • {module_path}@{version}: {old_hash[:12]} → {new_hash[:12]}")
4015
4016 # Find and update the module in the main list
4017 for module in modules:
4018 if module['module_path'] == module_path and module['version'] == version:
4019 module['vcs_hash'] = new_hash
4020
4021 # Also update the vcs_repos dict
4022 repo_key = module['repo_key']
4023 if old_hash in vcs_repos[repo_key]['commits']:
4024 vcs_repos[repo_key]['commits'][new_hash] = vcs_repos[repo_key]['commits'].pop(old_hash)
4025 break
4026 else:
4027 # Verification skipped - no failed results
4028 failed_results = []
4029
4030 print(f"\nFound {len(vcs_repos)} unique git repositories")
4031 print(f"Supporting {len(modules)} modules")
4032
4033 if failed_results:
4034 print("\n❌ Unable to verify the following module commits against their repositories:")
4035 for _, module_path, version, commit_hash, vcs_url, ref_hint in failed_results:
4036 print(f" - {module_path}@{version} ({commit_hash})")
4037 hint = f" {ref_hint}" if ref_hint else ""
4038 print(f" try: git fetch --depth=1 {vcs_url}{hint} {commit_hash}")
4039 print(f" cache: mark reachable via --inject-commit '{vcs_url} {commit_hash}'")
4040 print(f" repo : override via --set-repo {module_path}@{version} {vcs_url}")
4041 print("Aborting to prevent emitting invalid SRCREVs.")
4042 return False
4043
4044 if validate_only:
4045 print("\n✅ Validation complete - all commits are reachable upstream")
4046 return True
4047
4048 if output_dir is None:
4049 print("❌ Internal error: output directory missing for recipe generation")
4050 return False
4051
4052 # Generate SRC_URI entries for each repo/commit combination
4053 for repo_key, repo_info in vcs_repos.items():
4054 git_url = repo_info['url']
4055
4056 if git_url.startswith('https://'):
4057 git_url_bb = 'git://' + git_url[8:]
4058 protocol = 'https'
4059 elif git_url.startswith('http://'):
4060 git_url_bb = 'git://' + git_url[7:]
4061 protocol = 'http'
4062 else:
4063 git_url_bb = git_url
4064 protocol = 'https'
4065
4066 for idx, (commit_hash, commit_info) in enumerate(sorted(repo_info['commits'].items())):
4067 fetch_name = f"git_{repo_key[:8]}_{idx}"
4068 destsuffix = f"vcs_cache/{commit_info['commit_sha']}"
4069
4070 # Use branch name from ref_hint when available (more reliable than nobranch=1)
4071 # ref_hint is like "refs/tags/v1.9.3" or "refs/heads/main"
4072 ref_hint = commit_info.get('ref_hint', '')
4073 if ref_hint:
4074 shallow_param = ';shallow=1'
4075 # For tags, use nobranch=1 since the commit may not be on a branch head
4076 # For branches, use the branch name directly
4077 if ref_hint.startswith('refs/tags/'):
4078 # Tags: BitBake can fetch tagged commits with nobranch=1
4079 branch_param = ';nobranch=1'
4080 elif ref_hint.startswith('refs/heads/'):
4081 # Branches: use the actual branch name
4082 branch_name = ref_hint[11:] # Strip "refs/heads/"
4083 branch_param = f';branch={branch_name}'
4084 else:
4085 branch_param = ';nobranch=1'
4086 else:
4087 # For pseudo-versions (no ref_hint), check if we detected a branch
4088 detected_branch = VERIFY_DETECTED_BRANCHES.get((git_url, commit_hash))
4089 if detected_branch:
4090 # Use the detected branch name instead of nobranch=1
4091 shallow_param = ''
4092 branch_param = f';branch={detected_branch}'
4093 print(f" Using detected branch: {detected_branch} for {commit_hash[:12]}")
4094 else:
4095 # No ref and no detected branch - use nobranch=1
4096 # This should only happen for genuine orphaned commits that couldn't be fixed
4097 shallow_param = ''
4098 branch_param = ';nobranch=1'
4099
4100 src_uri_entries.append(
4101 f'{git_url_bb};protocol={protocol}{branch_param}{shallow_param};'
4102 f'rev={commit_hash};'
4103 f'name={fetch_name};'
4104 f'destsuffix={destsuffix}'
4105 )
4106
4107 commit_info['fetch_name'] = fetch_name
4108 commit_info['destsuffix'] = destsuffix
4109
4110 if len(repo_info['commits']) == 1:
4111 print(f" {fetch_name}: {repo_info['url'][:60]}...")
4112 else:
4113 print(f" {fetch_name}: {repo_info['url'][:60]}... (commit {commit_hash[:12]})")
4114
4115 # Prepare modules data for do_create_module_cache
4116 for module in modules:
4117 repo_key = module['repo_key']
4118 commit_hash = module['vcs_hash']
4119 commit_info = vcs_repos[repo_key]['commits'][commit_hash]
4120
4121 update_metadata_cache(
4122 module['module_path'],
4123 module['version'],
4124 module['vcs_url'],
4125 module['vcs_hash'],
4126 module.get('timestamp', ''),
4127 module.get('subdir', ''),
4128 module.get('vcs_ref', ''),
4129 dirty=True,
4130 )
4131
4132 # DEBUG: Track server/v3 module
4133 if 'server/v3' in module['module_path']:
4134 print(f"\n🔍 DEBUG server/v3: Adding to modules_data")
4135 print(f" module_path: {module['module_path']}")
4136 print(f" subdir: '{module.get('subdir', '')}' (from module dict)")
4137 print(f" timestamp: {module['timestamp']}")
4138 print(f" vcs_hash: {module['vcs_hash']}")
4139
4140 modules_data.append({
4141 'module': module['module_path'],
4142 'version': module['version'],
4143 'vcs_hash': commit_info['commit_sha'],
4144 'timestamp': module['timestamp'],
4145 'subdir': module.get('subdir', ''),
4146 'vcs_ref': module.get('vcs_ref', ''),
4147 })
4148
4149 # Write go-mod-git.inc
4150 git_inc_path = output_dir / "go-mod-git.inc"
4151 print(f"\nWriting {git_inc_path}")
4152
4153 with open(git_inc_path, 'w') as f:
4154 f.write("# Generated by oe-go-mod-fetcher.py v" + VERSION + "\n")
4155 f.write("# Git repositories for Go module dependencies\n\n")
4156 for entry in src_uri_entries:
4157 f.write(f'SRC_URI += "{entry}"\n')
4158 f.write('\n')
4159
4160 # Collect all tag references for shallow cloning
4161 # BB_GIT_SHALLOW_EXTRA_REFS ensures these refs are included in shallow clones
4162 tag_refs = set()
4163 for module in modules:
4164 vcs_ref = module.get('vcs_ref', '')
4165 if vcs_ref and 'refs/tags/' in vcs_ref:
4166 tag_refs.add(vcs_ref)
4167
4168 if tag_refs:
4169 f.write("# Tag references for shallow cloning\n")
4170 f.write("# Ensures shallow clones include all necessary tags\n")
4171 f.write("BB_GIT_SHALLOW_EXTRA_REFS = \"\\\n")
4172 for tag_ref in sorted(tag_refs):
4173 f.write(f" {tag_ref} \\\n")
4174 f.write('"\n')
4175
4176 # Note: SRCREV_* variables are not needed since rev= is embedded directly in SRC_URI
4177
4178 # Write go-mod-cache.inc
4179 cache_inc_path = output_dir / "go-mod-cache.inc"
4180 print(f"Writing {cache_inc_path}")
4181
4182 with open(cache_inc_path, 'w') as f:
4183 f.write("# Generated by oe-go-mod-fetcher.py v" + VERSION + "\n")
4184 f.write("# Module cache data for Go dependencies\n")
4185 f.write("#\n")
4186 f.write("# This file contains recipe-specific module metadata.\n")
4187 f.write("# The task implementations are in go-mod-vcs.bbclass.\n\n")
4188
4189 # Inherit the bbclass that provides the task implementations
4190 f.write("inherit go-mod-vcs\n\n")
4191
4192 # Write modules data as JSON - one module per line for readability
4193 f.write("# Module metadata for cache building (one module per line)\n")
4194 f.write("GO_MODULE_CACHE_DATA = '[\\\n")
4195 for i, mod in enumerate(modules_data):
4196 line = json.dumps(mod, separators=(',', ':'))
4197 if i < len(modules_data) - 1:
4198 f.write(f"{line},\\\n")
4199 else:
4200 f.write(f"{line}\\\n")
4201 f.write("]'\n")
4202
4203 print(f"\n✅ Generated recipe files:")
4204 print(f" {git_inc_path}")
4205 print(f" {cache_inc_path}")
4206 print(f"\nTo use these files, add to your recipe:")
4207 print(f" require go-mod-git.inc")
4208 print(f" require go-mod-cache.inc")
4209
4210 return True
4211
4212# =============================================================================
4213# Discovered Module Loading (Bootstrap Strategy)
4214# =============================================================================
4215
def load_discovered_modules(discovered_modules_path: Path) -> Optional[List[Dict]]:
    """
    Load pre-discovered module metadata produced by a BitBake discovery build.

    Bootstrap strategy: a discovery build has already run 'go mod download'
    (via the do_discover_modules task) and extract-native-modules.py has
    dumped complete metadata from the GOMODCACHE into a JSON file. Short
    12-char commit hashes from pseudo-versions are expanded here to the full
    40-char form that BitBake's git fetcher requires.

    Args:
        discovered_modules_path: Path to the JSON file with module metadata.

    Returns:
        List of module dicts with complete VCS info, or None when the file
        is missing, unparsable, or any entry lacks a required field.
    """
    if not discovered_modules_path.exists():
        print(f"❌ Discovered modules file not found: {discovered_modules_path}")
        return None

    try:
        with open(discovered_modules_path) as handle:
            module_list = json.load(handle)

        if not isinstance(module_list, list):
            print(f"❌ Invalid discovered modules file format (expected list, got {type(module_list).__name__})")
            return None

        print(f"✓ Loaded {len(module_list)} modules from discovery metadata")
        print(f" File: {discovered_modules_path}")

        # Reject the whole file if any entry is malformed.
        needed_keys = ('module_path', 'version', 'vcs_url', 'vcs_hash')
        for idx, entry in enumerate(module_list):
            if not isinstance(entry, dict):
                print(f"❌ Module {idx} is not a dict: {entry}")
                return None
            for key in needed_keys:
                if key not in entry:
                    print(f"❌ Module {idx} missing required field '{key}': {entry.get('module_path', '<unknown>')}")
                    return None

        # Operator-facing summary statistics.
        repo_count = len({entry['vcs_url'] for entry in module_list})
        subdir_count = sum(1 for entry in module_list if entry.get('subdir'))

        print(f"\nDiscovery metadata summary:")
        print(f" Modules: {len(module_list)}")
        print(f" Unique repositories: {repo_count}")
        print(f" Multi-module repos: {subdir_count} modules have subdirs")

        # Pseudo-versions like v0.0.0-20161002113705-648efa622239 carry only
        # 12 chars of the commit hash; BitBake's git fetcher needs 40 chars.
        short_entries = [entry for entry in module_list if len(entry.get('vcs_hash', '')) == 12]
        if short_entries:
            print(f"\n⚙️ Expanding {len(short_entries)} short hashes to full 40-char...")
            expanded = 0
            failed = 0
            for pos, entry in enumerate(short_entries):
                if pos == 0 or (pos + 1) % 20 == 0:
                    print(f" Progress: {pos + 1}/{len(short_entries)}...", end='\r', flush=True)

                version = entry.get('version', '')
                # Strip any "+incompatible"/build-metadata suffix before
                # parsing the pseudo-version for its commit timestamp.
                pseudo_info = parse_pseudo_version_tag(version.split('+')[0])
                if not pseudo_info:
                    failed += 1
                    continue

                timestamp_str, _ = pseudo_info
                full_hash = resolve_pseudo_version_commit(
                    entry['vcs_url'], timestamp_str, entry['vcs_hash'],
                    clone_cache_dir=CLONE_CACHE_DIR
                )
                if full_hash and len(full_hash) == 40:
                    entry['vcs_hash'] = full_hash
                    expanded += 1
                else:
                    failed += 1
                    if VERBOSE_MODE:
                        print(f"\n ⚠️ Could not expand: {entry['module_path']}@{version}")

            print(f" Expanded {expanded} short hashes, {failed} failed ")

        return module_list

    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse discovered modules JSON: {e}")
        return None
    except Exception as e:
        print(f"❌ Error loading discovered modules: {e}")
        return None
4309
4310# =============================================================================
4311# Main Entry Point
4312# =============================================================================
4313
def main() -> None:
    """
    CLI entry point for the Go module git fetcher.

    Parses command-line arguments, tees stdout/stderr to a timestamped log
    file under the system temp directory, runs the workflow via _execute(),
    and always persists the JSON metadata caches and removes temporary
    GOMODCACHE directories on the way out (even on error or Ctrl-C).

    Exits the process with _execute()'s return code, or 1 on interrupt or
    unexpected exception.
    """
    global LOG_PATH, CURRENT_GOMODCACHE
    parser = argparse.ArgumentParser(
        description=f"Generate BitBake recipes for Go modules using hybrid approach (v{VERSION})",
        epilog="""
This tool uses a 3-phase hybrid approach:
 1. Discovery: Run 'go mod download' to get correct module paths
 2. Recipe Generation: Create git:// SRC_URI entries for BitBake
 3. Cache Building: Build module cache from git during do_create_module_cache

Persistent Caches:
 The generator maintains caches in the data/ subdirectory:
 - data/module-cache.json: Module metadata (VCS URL, timestamp, subdir, etc.)
 - data/ls-remote-cache.json: Git ls-remote results
 - data/vanity-url-cache.json: Vanity import path resolution
 - data/verify-cache.json: Commit verification status

 These caches speed up regeneration but may need cleaning when:
 - Derivation logic changes (e.g., subdir calculation fixes)
 - Cached data becomes stale or incorrect

 Use --clean-cache to remove metadata cache before regeneration.
 Use --clean-ls-remote-cache to remove both caches (slower, but fully fresh).

Examples:
 # Normal regeneration (fast, uses caches)
 %(prog)s --recipedir /path/to/recipe/output

 # Clean metadata cache (e.g., after fixing subdir derivation)
 %(prog)s --recipedir /path/to/recipe/output --clean-cache

 # Fully clean regeneration (slow, calls git ls-remote for everything)
 %(prog)s --recipedir /path/to/recipe/output --clean-ls-remote-cache
 """,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # --- Output / input locations ---
    parser.add_argument(
        "--recipedir",
        help="Output directory for generated .inc files (required unless running with --validate/--dry-run/--clean-only)"
    )

    parser.add_argument(
        "--gomodcache",
        help="Directory to use for Go module cache (for discovery phase)"
    )

    parser.add_argument(
        "--cache-dir",
        help="Directory to store JSON metadata caches (default: scripts/data)"
    )

    parser.add_argument(
        "--clone-cache-dir",
        help="Directory to cache cloned git repositories (default: scripts/.cache/repos)"
    )

    parser.add_argument(
        "--source-dir",
        help="Source directory containing go.mod (default: current directory)"
    )

    # --- Informational metadata (recorded, not acted on) ---
    parser.add_argument(
        "--git-repo",
        help="Git repository URL (for documentation purposes)"
    )

    parser.add_argument(
        "--git-ref",
        help="Git reference (for documentation purposes)"
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output"
    )

    # --- Cache maintenance flags ---
    parser.add_argument(
        "--clean-cache",
        action="store_true",
        help="Clear metadata cache before regeneration (useful when derivation logic changes)"
    )

    parser.add_argument(
        "--clean-ls-remote-cache",
        action="store_true",
        help="Clear git ls-remote cache in addition to metadata cache (implies --clean-cache)"
    )

    parser.add_argument(
        "--skip-legacy-module-cache",
        action="store_true",
        help="Skip importing legacy module metadata from module_cache_task.inc"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Execute cache mutations without discovery/generation"
    )

    parser.add_argument(
        "--clean-gomodcache",
        action="store_true",
        help="Clean stale .info files in GOMODCACHE that lack VCS metadata (fixes 'module lookup disabled' errors)"
    )

    # --- Validation / verification control ---
    parser.add_argument(
        "--validate",
        action="store_true",
        help="Validate module commits without emitting recipe files"
    )

    # Hidden backward-compat alias for --validate (mapped after parse_args).
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help=argparse.SUPPRESS
    )

    parser.add_argument(
        "--skip-verify",
        action="store_true",
        help="Skip commit verification (trust cached verify results, much faster)"
    )

    parser.add_argument(
        "--verify-jobs",
        type=int,
        default=10,
        metavar="N",
        help="Number of parallel verification jobs (default: 10, 0=sequential)"
    )

    parser.add_argument(
        "--verify-cached",
        action="store_true",
        help="Verify commits in GOMODCACHE .info files still exist in repositories (detects force-pushed tags)"
    )

    parser.add_argument(
        "--verify-cache-max-age",
        type=int,
        default=30,
        metavar="DAYS",
        help="Re-verify cached commits older than this many days (default: 30, 0=always verify)"
    )

    parser.add_argument(
        "--debug-limit",
        type=int,
        help="Process at most N modules during validation/generation (debug only)"
    )

    # --- Manual cache/override surgery (repeatable) ---
    parser.add_argument(
        "--inject-commit",
        metavar=("REPO", "COMMIT"),
        nargs=2,
        action="append",
        help="Mark a repo+commit pair as already verified (skips network check)"
    )

    parser.add_argument(
        "--clear-commit",
        metavar=("REPO", "COMMIT"),
        nargs=2,
        action="append",
        help="Remove a repo+commit pair from the verified cache"
    )

    parser.add_argument(
        "--set-repo",
        metavar=("MODULE", "REPO"),
        nargs=2,
        action="append",
        help="Pin a module (or module@version) to the specified repository URL"
    )

    parser.add_argument(
        "--clear-repo",
        metavar="MODULE",
        nargs=1,
        action="append",
        help="Remove a previously pinned repository override (module or module@version)"
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {VERSION}"
    )

    parser.add_argument(
        "--discovered-modules",
        dest="discovered_modules",
        help="JSON file with pre-discovered module metadata (skips discovery phase)"
    )
    # Backward compatibility alias for --discovered-modules
    parser.add_argument("--native-modules", dest="discovered_modules", help=argparse.SUPPRESS)

    # Add compatibility args that we ignore (for backward compatibility)
    parser.add_argument("--use-hybrid", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("go_mod_file", nargs='?', help=argparse.SUPPRESS)

    args = parser.parse_args()
    if args.validate_only:
        args.validate = True

    # Set global verbose mode
    global VERBOSE_MODE
    VERBOSE_MODE = args.verbose

    # Tee all output to a timestamped log file; keep the real streams so
    # they can be restored in the finally block.
    original_stdout = sys.stdout
    original_stderr = sys.stderr
    log_handle = None
    log_path = None
    try:
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        log_path = Path(tempfile.gettempdir()) / f"oe-go-mod-fetcher-{timestamp}.log"
        LOG_PATH = log_path
        # buffering=1 -> line-buffered, so the log stays current if we crash.
        log_handle = log_path.open("w", encoding="utf-8", buffering=1)
        sys.stdout = Tee(original_stdout, log_handle)
        sys.stderr = Tee(original_stderr, log_handle)

        print(f"Go Module Git Fetcher v{VERSION}")
        print("Hybrid Architecture: Discovery from Go + Build from Git")
        print("=" * 70)
        print(f"Logs: {log_path} (pass --dry-run to load caches only)")

        exit_code = _execute(args)
    except KeyboardInterrupt:
        print("\n\nOperation cancelled by user")
        exit_code = 1
    except Exception as e:
        print(f"\n❌ Unexpected error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        exit_code = 1
    finally:
        # Always persist the persistent caches so partial progress survives
        # a failed or interrupted run.
        save_ls_remote_cache()
        save_metadata_cache()
        save_vanity_url_cache()
        save_verify_commit_cache()
        save_repo_overrides()
        # Best-effort removal of temporary GOMODCACHE dirs created during
        # discovery; errors here must not mask the real exit status.
        for temp_cache in TEMP_GOMODCACHES:
            try:
                if temp_cache.exists():
                    shutil.rmtree(temp_cache)
            except Exception:
                pass
        TEMP_GOMODCACHES.clear()
        if CURRENT_GOMODCACHE and not Path(CURRENT_GOMODCACHE).exists():
            CURRENT_GOMODCACHE = None
        # Restore real stdout/stderr before the final log-path message.
        if log_handle:
            log_handle.flush()
            log_handle.close()
        sys.stdout = original_stdout
        sys.stderr = original_stderr
        if LOG_PATH:
            print(f"Logs: {LOG_PATH}")

    sys.exit(exit_code)
4577
4578
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()