#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
#
# go-dep processor
#
# Copyright (C) 2025 Bruce Ashfield
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""
Go Module Git Fetcher - Hybrid Architecture
Version 3.0.0 - Complete rewrite using Go download for discovery + git builds
Author: Bruce Ashfield

Description: Use Go's download for discovery, build from git sources

ARCHITECTURE:
  Phase 1: Discovery - Use 'go mod download' + filesystem walk to get correct module paths
  Phase 2: Recipe Generation - Generate BitBake recipe with git:// SRC_URI entries
  Phase 3: Cache Building - Build module cache from git sources during do_create_module_cache

This approach eliminates:
  - Complex go list -m -json parsing
  - Manual go.sum parsing and augmentation
  - Parent module detection heuristics
  - Version path manipulation (/v2+/v3+ workarounds)
  - Module path normalization bugs

Instead we:
  - Let Go download modules to temporary cache (discovery only)
  - Walk filesystem to get CORRECT module paths (no parsing!)
  - Extract VCS info from .info files
  - Fetch git repositories for each module
  - Build module cache from git during BitBake build

CHANGELOG v3.0.0:
  - Complete architectural rewrite
  - Removed all go list and go.sum parsing logic (4000+ lines)
  - Implemented 3-phase hybrid approach
  - Discovery uses go mod download + filesystem walk
  - Module paths from filesystem, not from go list (no more /v3 stripping bugs!)
  - Builds entirely from git sources
  - Compatible with oe-core's gomod:// fetcher (same cache structure)
"""

import argparse
import concurrent.futures
import hashlib
import io
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import textwrap
import threading
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from datetime import datetime, timedelta, timezone

VERSION = "3.0.0"
LOG_PATH: Optional[Path] = None

# =============================================================================
# BitBake Task Templates
# =============================================================================


class Tee(io.TextIOBase):
    """Write data to multiple text streams."""

    def __init__(self, *streams: io.TextIOBase) -> None:
        self.streams = streams

    def write(self, data: str) -> int:
        for stream in self.streams:
            stream.write(data)
        return len(data)

    def flush(self) -> None:
        for stream in self.streams:
            stream.flush()


def parse_go_sum(go_sum_path: Path) -> Tuple[Set[Tuple[str, str]], Set[Tuple[str, str]]]:
    """
    Parse go.sum to find modules that need source code.

    Returns:
        Tuple of (modules_needing_source, indirect_only_modules)
        - modules_needing_source: Modules with source code entries (need .zip files)
        - indirect_only_modules: Modules that only have /go.mod entries (only need .mod files)
    """
    def sanitize_module_name(name):
        """Remove quotes from module names"""
        if not name:
            return name
        stripped = name.strip()
        if len(stripped) >= 2 and stripped[0] == '"' and stripped[-1] == '"':
            return stripped[1:-1]
        return stripped

    modules_with_source: Set[Tuple[str, str]] = set()
    modules_with_gomod_only: Set[Tuple[str, str]] = set()

    if not go_sum_path.exists():
        return (modules_with_source, modules_with_gomod_only)

    # First pass: collect all entries
    all_entries = {}
    with go_sum_path.open() as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('//'):
                continue
            parts = line.split()
            if len(parts) != 3:
                continue
            module_path, version, _ = parts
            module_path = sanitize_module_name(module_path)
            # Track whether this entry is for go.mod or source
            is_gomod_entry = version.endswith('/go.mod')
            # Strip /go.mod suffix for key
            base_version = version[:-7] if is_gomod_entry else version
            key = (module_path, base_version)
            if key not in all_entries:
                all_entries[key] = {'has_source': False, 'has_gomod': False}
            if is_gomod_entry:
                all_entries[key]['has_gomod'] = True
            else:
                all_entries[key]['has_source'] = True

    # Second pass: categorize modules
    for key, entry_types in all_entries.items():
        if entry_types['has_source']:
            modules_with_source.add(key)
            continue
        if entry_types['has_gomod']:
            modules_with_gomod_only.add(key)

    # Note: We no longer add indirect-only modules to modules_with_source.
    # The native build succeeds without their .zip files - only .mod files are needed.
    # Adding them caused the generator to resolve ~1000 extra modules unnecessarily.
    return (modules_with_source, modules_with_gomod_only)


def collect_modules_via_go_list(source_dir: Path) -> Set[Tuple[str, str]]:
    """
    Use `go list -m -json all` to discover modules that may not appear in go.sum.
""" env = os.environ.copy() env.setdefault('GOPROXY', 'https://proxy.golang.org') if CURRENT_GOMODCACHE: env['GOMODCACHE'] = CURRENT_GOMODCACHE try: result = subprocess.run( ['go', 'list', '-m', '-json', 'all'], cwd=source_dir, capture_output=True, text=True, check=True, env=env, ) except subprocess.CalledProcessError: return set() data = result.stdout modules: Set[Tuple[str, str]] = set() decoder = json.JSONDecoder() idx = 0 length = len(data) while idx < length: while idx < length and data[idx].isspace(): idx += 1 if idx >= length: break try: obj, end = decoder.raw_decode(data, idx) except json.JSONDecodeError: break idx = end path = obj.get('Path') or '' if not path or obj.get('Main'): continue version = obj.get('Version') or '' replace = obj.get('Replace') if replace: path = replace.get('Path', path) or path version = replace.get('Version', version) or version if not version or version == 'none': continue modules.add((path, version)) return modules def go_mod_download(module_path: str, version: str) -> bool: """Download a specific module version into the current GOMODCACHE.""" if not CURRENT_GOMODCACHE or not CURRENT_SOURCE_DIR: return False key = (module_path, version) if key in DOWNLOADED_MODULES: return module_path env = os.environ.copy() env.setdefault('GOPROXY', 'https://proxy.golang.org') env['GOMODCACHE'] = CURRENT_GOMODCACHE try: subprocess.run( ['go', 'mod', 'download', f'{module_path}@{version}'], cwd=str(CURRENT_SOURCE_DIR), env=env, capture_output=True, text=True, check=True, timeout=GO_CMD_TIMEOUT, ) DOWNLOADED_MODULES.add(key) return True except subprocess.TimeoutExpired as e: print(f" ❌ go mod download timed out for {module_path}@{version} after {GO_CMD_TIMEOUT}s") return False except subprocess.CalledProcessError as e: stderr = (e.stderr or '').strip() if stderr: lower = stderr.lower() network_signals = [ "lookup ", "dial tcp", "connection refused", "network is unreachable", "tls handshake timeout", "socket: operation not permitted" ] if any(signal in lower for signal in network_signals): global NETWORK_FAILURE_DETECTED NETWORK_FAILURE_DETECTED = True raise RuntimeError( f"Network failure while downloading {module_path}@{version}: {stderr}" ) from e print(f" ⚠️ go mod download failed for {module_path}@{version}: {stderr}") return False SCRIPT_DIR = Path(__file__).resolve().parent CACHE_BASE_DIR = SCRIPT_DIR / "data" # Default to scripts/data for JSON caches DATA_DIR = CACHE_BASE_DIR CLONE_CACHE_DIR = SCRIPT_DIR / ".cache" / "repos" # Repository clone cache VERIFY_BASE_DIR = CACHE_BASE_DIR / ".verify" LS_REMOTE_CACHE_PATH = DATA_DIR / "ls-remote-cache.json" VERIFY_COMMIT_CACHE_PATH = DATA_DIR / "verify-cache.json" MODULE_REPO_OVERRIDES_PATH = DATA_DIR / "repo-overrides.json" # Manual overrides file - tracked in git, for permanent overrides when discovery fails MANUAL_OVERRIDES_PATH = SCRIPT_DIR / "data" / "manual-overrides.json" LS_REMOTE_CACHE: Dict[Tuple[str, str], Optional[str]] = {} LS_REMOTE_CACHE_DIRTY = False MODULE_METADATA_CACHE_PATH = DATA_DIR / "module-cache.json" MODULE_METADATA_CACHE: Dict[Tuple[str, str], Dict[str, str]] = {} MODULE_METADATA_CACHE_DIRTY = False VANITY_URL_CACHE_PATH = DATA_DIR / "vanity-url-cache.json" VANITY_URL_CACHE: Dict[str, Optional[str]] = {} VANITY_URL_CACHE_DIRTY = False CURRENT_GOMODCACHE: Optional[str] = None CURRENT_SOURCE_DIR: Optional[Path] = None TEMP_GOMODCACHES: List[Path] = [] FAILED_MODULE_PATHS: Set[str] = set() FAILED_MODULE_ENTRIES: Set[Tuple[str, str]] = set() DOWNLOADED_MODULES: Set[Tuple[str, str]] = set() 
NETWORK_FAILURE_DETECTED: bool = False SKIPPED_MODULES: Dict[Tuple[str, str], str] = {} VERBOSE_MODE: bool = False # Set from command-line args def _record_skipped_module(module_path: str, version: str, reason: str) -> None: SKIPPED_MODULES[(module_path, version)] = reason GO_CMD_TIMEOUT = 180 # seconds GIT_CMD_TIMEOUT = 90 # seconds VERIFY_REPO_CACHE: Dict[str, Path] = {} VERIFY_REPO_LOCKS: Dict[str, threading.Lock] = {} # Per-repository locks for parallel verification VERIFY_REPO_LOCKS_LOCK = threading.RLock() # REENTRANT lock to allow same thread to acquire multiple times VERIFY_REPO_BRANCHES: Dict[str, List[str]] = {} # Cache branch lists per repo to avoid repeated ls-remote VERIFY_RESULTS: Dict[Tuple[str, str], bool] = {} VERIFY_COMMIT_CACHE: Dict[str, bool] = {} # Legacy format: key -> bool VERIFY_COMMIT_CACHE_V2: Dict[str, Dict[str, any]] = {} # New format: key -> {verified: bool, timestamp: str, last_check: str} VERIFY_COMMIT_CACHE_DIRTY = False VERIFY_ENABLED = False # Set to True when verification is active VERIFY_CACHE_MAX_AGE_DAYS = 30 # Re-verify commits older than this VERIFY_DETECTED_BRANCHES: Dict[Tuple[str, str], str] = {} # (url, commit) -> branch_name VERIFY_FALLBACK_COMMITS: Dict[Tuple[str, str], str] = {} # Maps (url, original_commit) -> fallback_commit VERIFY_FULL_REPOS: Set[str] = set() # Track repos that have been fetched with full history VERIFY_CORRECTIONS_APPLIED = False # Track if any commit corrections were made MODULE_REPO_OVERRIDES: Dict[Tuple[str, Optional[str]], str] = {} # Dynamic overrides from --set-repo MODULE_REPO_OVERRIDES_DIRTY = False MANUAL_OVERRIDES: Dict[Tuple[str, Optional[str]], str] = {} # Git-tracked overrides from manual-overrides.json # REPO_OVERRIDES kept for backwards compatibility but no longer used for hardcoded values. # Manual overrides go in data/manual-overrides.json which is tracked in git. REPO_OVERRIDES: Dict[str, List[str]] = {} def _normalise_override_key(module_path: str, version: Optional[str]) -> Tuple[str, Optional[str]]: module = module_path.strip() ver = version.strip() if version else None if not module: raise ValueError("module path for override cannot be empty") return module, ver def _parse_override_spec(module_spec: str) -> Tuple[str, Optional[str]]: if '@' in module_spec: module_path, version = module_spec.split('@', 1) version = version or None else: module_path, version = module_spec, None return module_path.strip(), version.strip() if version else None def repo_override_candidates(module_path: str, version: Optional[str] = None) -> List[str]: """ Get repository URL override candidates for a module. Priority order: 1. Dynamic overrides (--set-repo, stored in repo-overrides.json) - version-specific 2. Dynamic overrides - wildcard (no version) 3. Manual overrides (manual-overrides.json, tracked in git) - version-specific 4. Manual overrides - wildcard 5. 
Legacy REPO_OVERRIDES dict (for backwards compatibility) """ overrides: List[str] = [] key = _normalise_override_key(module_path, version) wildcard_key = _normalise_override_key(module_path, None) # Dynamic overrides first (highest priority - user can override manual) dynamic_specific = MODULE_REPO_OVERRIDES.get(key) if dynamic_specific: overrides.append(dynamic_specific) dynamic_default = MODULE_REPO_OVERRIDES.get(wildcard_key) if dynamic_default and dynamic_default not in overrides: overrides.append(dynamic_default) # Manual overrides next (git-tracked, for permanent fixes) manual_specific = MANUAL_OVERRIDES.get(key) if manual_specific and manual_specific not in overrides: overrides.append(manual_specific) manual_default = MANUAL_OVERRIDES.get(wildcard_key) if manual_default and manual_default not in overrides: overrides.append(manual_default) # Legacy hardcoded overrides last (backwards compat) for candidate in REPO_OVERRIDES.get(module_path, []): if candidate not in overrides: overrides.append(candidate) return overrides def configure_cache_paths(cache_dir: Optional[str], clone_cache_dir: Optional[str] = None) -> None: """ Configure cache file locations. Args: cache_dir: Directory for JSON metadata caches (default: scripts/data) clone_cache_dir: Directory for git repository clones (default: scripts/.cache/repos) """ global CACHE_BASE_DIR, DATA_DIR, CLONE_CACHE_DIR global LS_REMOTE_CACHE_PATH, MODULE_METADATA_CACHE_PATH, VANITY_URL_CACHE_PATH global VERIFY_COMMIT_CACHE_PATH, MODULE_REPO_OVERRIDES_PATH # Configure JSON metadata cache directory if cache_dir: CACHE_BASE_DIR = Path(cache_dir).resolve() else: CACHE_BASE_DIR = SCRIPT_DIR / "data" # Default to scripts/data CACHE_BASE_DIR.mkdir(parents=True, exist_ok=True) DATA_DIR = CACHE_BASE_DIR # cache_dir IS the data directory now LS_REMOTE_CACHE_PATH = DATA_DIR / "ls-remote-cache.json" MODULE_METADATA_CACHE_PATH = DATA_DIR / "module-cache.json" VANITY_URL_CACHE_PATH = DATA_DIR / "vanity-url-cache.json" VERIFY_COMMIT_CACHE_PATH = DATA_DIR / "verify-cache.json" MODULE_REPO_OVERRIDES_PATH = DATA_DIR / "repo-overrides.json" global VERIFY_BASE_DIR VERIFY_BASE_DIR = CACHE_BASE_DIR / ".verify" VERIFY_BASE_DIR.mkdir(parents=True, exist_ok=True) # Configure git clone cache directory if clone_cache_dir: CLONE_CACHE_DIR = Path(clone_cache_dir).resolve() else: CLONE_CACHE_DIR = SCRIPT_DIR / ".cache" / "repos" # Default to scripts/.cache/repos CLONE_CACHE_DIR.mkdir(parents=True, exist_ok=True) VERIFY_COMMIT_CACHE.clear() load_verify_commit_cache() MODULE_REPO_OVERRIDES.clear() load_repo_overrides() load_manual_overrides() global VERIFY_REPO_CACHE VERIFY_REPO_CACHE = {} def ensure_path_is_writable(path: Path) -> None: """ Attempt to create and delete a small file to verify write access. Exit with a clear error if the path is not writable. """ path.mkdir(parents=True, exist_ok=True) probe = path / ".oe-go-mod-fetcher-permcheck" try: with open(probe, "w") as fh: fh.write("") except Exception as exc: print(f"❌ GOMODCACHE is not writable: {path} ({exc})") print(" Fix permissions (e.g. 
chown/chmod) or pass a writable --gomodcache path.") sys.exit(1) finally: try: probe.unlink() except Exception: pass def _normalize_url(url: str) -> str: url = url.strip() if url.startswith("git://"): url = "https://" + url[6:] if url.endswith(".git"): url = url[:-4] return url def _url_allowed_for_module(module_path: str, url: str, version: Optional[str] = None) -> bool: url = _normalize_url(url) overrides = repo_override_candidates(module_path, version) if not overrides: return True normalized_overrides = {_normalize_url(o) for o in overrides} return url in normalized_overrides def prune_metadata_cache() -> None: """ Remove stale metadata entries that no longer satisfy override policies or contain obviously invalid data. This prevents old .inc state from re-introducing bad repositories during bootstrap. """ global MODULE_METADATA_CACHE_DIRTY removed = False for key in list(MODULE_METADATA_CACHE.keys()): module_path, version = key entry = MODULE_METADATA_CACHE.get(key) or {} vcs_url = entry.get('vcs_url', '') commit = entry.get('commit', '') if not vcs_url or not commit: MODULE_METADATA_CACHE.pop(key, None) removed = True continue if len(commit) != 40 or not re.fullmatch(r'[0-9a-fA-F]{40}', commit): MODULE_METADATA_CACHE.pop(key, None) removed = True continue if not _url_allowed_for_module(module_path, vcs_url, version): MODULE_METADATA_CACHE.pop(key, None) removed = True continue if removed: MODULE_METADATA_CACHE_DIRTY = True def _verify_repo_dir(vcs_url: str) -> Path: # Quick check without lock (optimization) if vcs_url in VERIFY_REPO_CACHE: return VERIFY_REPO_CACHE[vcs_url] # Use master lock to serialize repo initialization with VERIFY_REPO_LOCKS_LOCK: # Double-check after acquiring lock if vcs_url in VERIFY_REPO_CACHE: return VERIFY_REPO_CACHE[vcs_url] repo_hash = hashlib.sha256(vcs_url.encode()).hexdigest() repo_dir = VERIFY_BASE_DIR / repo_hash git_dir = repo_dir / "repo" git_dir.mkdir(parents=True, exist_ok=True) env = os.environ.copy() env.setdefault("GIT_TERMINAL_PROMPT", "0") env.setdefault("GIT_ASKPASS", "true") if not (git_dir / "config").exists(): subprocess.run([ "git", "init", "--bare" ], cwd=str(git_dir), check=True, capture_output=True, env=env) subprocess.run([ "git", "remote", "add", "origin", vcs_url ], cwd=str(git_dir), check=True, capture_output=True, env=env) else: subprocess.run([ "git", "remote", "set-url", "origin", vcs_url ], cwd=str(git_dir), check=False, capture_output=True, env=env) VERIFY_REPO_CACHE[vcs_url] = git_dir # Create a per-repo lock while we still hold the master lock if vcs_url not in VERIFY_REPO_LOCKS: VERIFY_REPO_LOCKS[vcs_url] = threading.Lock() return git_dir def _find_fallback_commit(vcs_url: str, version: str, timestamp: str = "") -> Optional[Tuple[str, str]]: """ Find a fallback commit when the proxy commit doesn't exist. Strategy: 1. For pseudo-versions with timestamp: find commit near that date on default branch 2. 
Otherwise: use latest commit on default branch (main/master) Returns: (commit_hash, branch_name) or None if failed """ import re from datetime import datetime env = os.environ.copy() env.setdefault("GIT_TERMINAL_PROMPT", "0") env.setdefault("GIT_ASKPASS", "true") # Extract timestamp from pseudo-version: v0.0.0-YYYYMMDDHHMMSS-hash target_date = None if timestamp: try: target_date = datetime.fromisoformat(timestamp.replace('Z', '+00:00')) except Exception: pass if not target_date: # Try to extract from pseudo-version format match = re.match(r'v\d+\.\d+\.\d+-(\d{14})-[0-9a-f]+', version) if match: date_str = match.group(1) # YYYYMMDDHHMMSS try: target_date = datetime.strptime(date_str, '%Y%m%d%H%M%S') except Exception: pass # Get default branch try: result = subprocess.run( ["git", "ls-remote", "--symref", vcs_url, "HEAD"], capture_output=True, text=True, timeout=30, env=env, ) if result.returncode == 0 and result.stdout: # Parse: ref: refs/heads/main HEAD for line in result.stdout.split('\n'): if line.startswith('ref:'): default_branch = line.split()[1].replace('refs/heads/', '') break else: default_branch = 'main' # Fallback else: default_branch = 'main' except Exception: default_branch = 'main' # Get commits on default branch try: if target_date: # Find commit closest to target date # We need to clone the repo to access commit history with dates # NOTE: Do NOT acquire per-repo lock here - our caller already holds it! # _find_fallback_commit is only called from within verify_commit_accessible, # which has already acquired the per-repo lock for this vcs_url. # Get the repo dir (cached, won't re-initialize) repo_dir = VERIFY_REPO_CACHE.get(vcs_url) if not repo_dir: # Shouldn't happen (verify_commit_accessible calls _verify_repo_dir first) # but be defensive repo_dir = _verify_repo_dir(vcs_url) # Fetch the default branch (caller holds lock, so this is safe) try: subprocess.run( ["git", "fetch", "origin", f"{default_branch}:refs/remotes/origin/{default_branch}"], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=60, env=env, ) except subprocess.CalledProcessError: # Fallback to latest if fetch fails pass # Use git log with --until to find commit at or before target date # Format: YYYY-MM-DD HH:MM:SS date_str = target_date.strftime('%Y-%m-%d %H:%M:%S') try: result = subprocess.run( ["git", "log", "-1", "--format=%H", f"--until={date_str}", f"origin/{default_branch}"], cwd=str(repo_dir), capture_output=True, text=True, timeout=30, env=env, ) if result.returncode == 0 and result.stdout.strip(): commit_hash = result.stdout.strip() return (commit_hash, default_branch) except subprocess.CalledProcessError: pass # If date-based search failed, fall back to latest commit result = subprocess.run( ["git", "rev-parse", f"origin/{default_branch}"], cwd=str(repo_dir), capture_output=True, text=True, timeout=30, env=env, ) if result.returncode == 0 and result.stdout.strip(): commit_hash = result.stdout.strip() return (commit_hash, default_branch) else: # Use latest commit from ls-remote (no need to clone) result = subprocess.run( ["git", "ls-remote", vcs_url, f"refs/heads/{default_branch}"], capture_output=True, text=True, timeout=30, env=env, ) if result.returncode == 0 and result.stdout: commit_hash = result.stdout.split()[0] return (commit_hash, default_branch) except Exception as e: print(f" ⚠️ Fallback commit search failed: {e}") return None def verify_commit_accessible(vcs_url: str, commit: str, ref_hint: str = "", version: str = "", timestamp: str = "") -> bool: """ Fetch commit 
into a bare cache to ensure it exists upstream. Check cache age and force re-verification if too old. If commit doesn't exist, use fallback (latest commit on default branch or near timestamp) Args: vcs_url: Git repository URL commit: Commit hash to verify ref_hint: Optional ref (tag/branch) that should contain the commit version: Module version (for extracting timestamp from pseudo-versions) timestamp: ISO timestamp from .info file (for finding commits near that date) """ from datetime import datetime, timezone, timedelta # Check cache before acquiring lock (fast path for already-verified commits) key = (vcs_url, commit) if key in VERIFY_RESULTS: return VERIFY_RESULTS[key] cache_key = f"{vcs_url}|||{commit}" # Track if verification passed via cache (to skip re-saving later) cached_verification_passed = False # Check cache with aging logic if cache_key in VERIFY_COMMIT_CACHE_V2: cache_entry = VERIFY_COMMIT_CACHE_V2[cache_key] if cache_entry.get("verified"): # Check if cache is too old last_checked_str = cache_entry.get("last_checked") if last_checked_str: try: last_checked = datetime.fromisoformat(last_checked_str.replace('Z', '+00:00')) age_days = (datetime.now(timezone.utc) - last_checked).days if age_days < VERIFY_CACHE_MAX_AGE_DAYS: # Cache is fresh for commit existence, but we still need branch detection # Branch detection is cheap (local operation) and critical for BitBake recipes # Don't return early - continue to branch detection below cached_verification_passed = True else: # Cache is stale, force re-verification print(f" ⏰ Cache stale ({age_days} days old), re-verifying {commit[:12]}...") # Fall through to re-verify except Exception: # Can't parse timestamp, force re-verification pass else: # No timestamp, but still need branch detection cached_verification_passed = True # Legacy cache format fallback if cache_key in VERIFY_COMMIT_CACHE and VERIFY_COMMIT_CACHE[cache_key]: # Migrate to v2 format during this check now = datetime.now(timezone.utc).isoformat() VERIFY_COMMIT_CACHE_V2[cache_key] = { "verified": True, "first_verified": now, "last_checked": now, "fetch_method": "cached" } # Don't return early - continue to branch detection cached_verification_passed = True # Ensure repo is initialized (this creates the lock too) repo_dir = _verify_repo_dir(vcs_url) # Now safely get the lock (guaranteed to exist after _verify_repo_dir returns) lock = VERIFY_REPO_LOCKS[vcs_url] with lock: # Double-check cache after acquiring lock (another thread may have verified while we waited) if key in VERIFY_RESULTS: return VERIFY_RESULTS[key] env = os.environ.copy() env.setdefault("GIT_TERMINAL_PROMPT", "0") env.setdefault("GIT_ASKPASS", "true") def _commit_exists(check_commit: str = None) -> bool: """Check if a commit exists in the local repo.""" target = check_commit if check_commit else commit try: subprocess.run( ["git", "rev-parse", "--verify", f"{target}^{{commit}}"], cwd=str(repo_dir), check=True, capture_output=True, env=env, ) return True except subprocess.CalledProcessError: return False global VERIFY_COMMIT_CACHE_DIRTY, VERIFY_FALLBACK_COMMITS cached = VERIFY_COMMIT_CACHE.get(cache_key) commit_present = _commit_exists() if cached and not commit_present: # Cached entry without a local commit indicates stale data; drop it. 
VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True cached = None # Only do shallow fetch if commit is not already present # Doing --depth=1 on an already-full repo causes git to re-process history (very slow on large repos) if not commit_present and ref_hint: fetch_args = ["git", "fetch", "--depth=1", "origin", ref_hint] try: subprocess.run( fetch_args, cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT, env=env, ) except subprocess.TimeoutExpired: print(f" ⚠️ git fetch timeout ({GIT_CMD_TIMEOUT}s) for {vcs_url} {ref_hint or ''}") except subprocess.CalledProcessError as exc: detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else "" if detail: print(f" ⚠️ git fetch failed for {vcs_url} {ref_hint or ''}: {detail}") # Continue to attempt direct commit fetch # For pseudo-versions, we need to determine which branch contains the commit # Strategy depends on whether this is a tagged version or pseudo-version commit_fetched = commit_present # If already present, no need to fetch if ref_hint and not commit_present: # Tagged version: try shallow fetch of the specific commit (only if not already present) try: fetch_cmd = ["git", "fetch", "--depth=1", "origin", commit] subprocess.run( fetch_cmd, cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT, env=env, ) commit_fetched = True except subprocess.CalledProcessError as exc: detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else "" if detail: print(f" ⚠️ git fetch failed for {vcs_url[:50]}...: {detail[:100]}") # If fetching commit failed for a tag, check if tag has moved if ref_hint and ref_hint.startswith('refs/tags/'): print(f" → Tag commit not fetchable, checking if tag moved...") try: # Try fetching the tag again to see what it currently points to subprocess.run( ["git", "fetch", "--depth=1", "origin", ref_hint], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT, env=env, ) # Check what commit the tag now points to result = subprocess.run( ["git", "rev-parse", "FETCH_HEAD"], cwd=str(repo_dir), capture_output=True, text=True, timeout=30, env=env, check=True, ) current_tag_commit = result.stdout.strip() if current_tag_commit != commit: print(f" ✓ Tag moved detected:") print(f" Proxy gave us: {commit[:12]} (no longer exists)") print(f" Tag now points to: {current_tag_commit[:12]}") print(f" → Using current tag commit") # Update module to use current commit VERIFY_FALLBACK_COMMITS[(vcs_url, commit)] = current_tag_commit return ('corrected', module_path, version, commit, current_tag_commit) except subprocess.CalledProcessError: # Can't fetch tag either - this is a real error pass for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]: lock_path = repo_dir / lock_file if lock_path.exists(): try: lock_path.unlink() except Exception: pass VERIFY_RESULTS[key] = False VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False else: # Pseudo-version: MUST do full clone to detect which branch contains commit # Shallow fetch is useless - we need history for git for-each-ref --contains # Check if we already fetched full history for this repo URL # This prevents redundant full-history fetches for repos with multiple module versions shallow_file = repo_dir / "shallow" is_shallow = shallow_file.exists() already_full = vcs_url in VERIFY_FULL_REPOS if is_shallow and not already_full: 
print(f" → Fetching full history for branch detection...") try: # Use --unshallow to convert shallow clone to full clone subprocess.run( ["git", "fetch", "--unshallow", "origin", "+refs/heads/*:refs/remotes/origin/*"], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT * 5, env=env, ) commit_fetched = True # Mark this repo as having full history VERIFY_FULL_REPOS.add(vcs_url) except subprocess.TimeoutExpired: print(f" ⚠️ Full clone timeout for {vcs_url[:50]}...") for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]: lock_path = repo_dir / lock_file if lock_path.exists(): try: lock_path.unlink() except Exception: pass VERIFY_RESULTS[key] = False VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False except subprocess.CalledProcessError as exc: detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else "" if detail: print(f" ⚠️ Full clone failed for {vcs_url[:50]}...: {detail[:100]}") for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]: lock_path = repo_dir / lock_file if lock_path.exists(): try: lock_path.unlink() except Exception: pass VERIFY_RESULTS[key] = False VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False else: # Already full - just fetch updates print(f" → Fetching updates (repo already full)...") try: subprocess.run( ["git", "fetch", "origin", "+refs/heads/*:refs/remotes/origin/*"], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT, env=env, ) commit_fetched = True except subprocess.TimeoutExpired: print(f" ⚠️ Full clone timeout for {vcs_url[:50]}...") for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]: lock_path = repo_dir / lock_file if lock_path.exists(): try: lock_path.unlink() except Exception: pass VERIFY_RESULTS[key] = False VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False except subprocess.CalledProcessError as exc: detail = (exc.stderr or exc.stdout or "").strip() if isinstance(exc.stderr, str) or isinstance(exc.stdout, str) else "" if detail: print(f" ⚠️ Full clone failed for {vcs_url[:50]}...: {detail[:100]}") for lock_file in ["shallow.lock", "index.lock", "HEAD.lock"]: lock_path = repo_dir / lock_file if lock_path.exists(): try: lock_path.unlink() except Exception: pass VERIFY_RESULTS[key] = False VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False # Use the original commit or fallback commit for verification actual_commit = commit if not _commit_exists(): # Commit doesn't exist in repository - try fallback strategy # This handles orphaned commits from proxy.golang.org print(f" ⚠️ Commit {commit[:12]} not found in repository {vcs_url[:50]}...") if not ref_hint: # Pseudo-version without a tag - use timestamp-based fallback print(f" → Attempting fallback commit strategy for pseudo-version {version}") fallback_result = _find_fallback_commit(vcs_url, version, timestamp) if fallback_result: fallback_commit, fallback_branch = fallback_result print(f" ⚠️ Using fallback: {fallback_commit[:12]} from branch '{fallback_branch}'") print(f" (Original commit {commit[:12]} from proxy.golang.org does not exist)") # Update commit to use the fallback actual_commit = fallback_commit # Track the fallback mapping so callers can use the fallback commit VERIFY_FALLBACK_COMMITS[(vcs_url, commit)] = fallback_commit # Fetch the fallback commit (only unshallow if repo is still shallow) shallow_file = 
repo_dir / "shallow" is_shallow = shallow_file.exists() try: if is_shallow: subprocess.run( ["git", "fetch", "--unshallow", "origin", "+refs/heads/*:refs/remotes/origin/*"], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT * 5, env=env, ) else: # Repo already has full history - just fetch updates subprocess.run( ["git", "fetch", "origin", "+refs/heads/*:refs/remotes/origin/*"], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT, env=env, ) except Exception as e: print(f" ⚠️ Failed to fetch fallback commit: {e}") VERIFY_RESULTS[key] = False return False # Register the fallback branch VERIFY_DETECTED_BRANCHES[(vcs_url, fallback_commit)] = fallback_branch # Check if fallback commit exists if not _commit_exists(fallback_commit): print(f" ⚠️ Fallback commit {fallback_commit[:12]} also not found!") VERIFY_RESULTS[key] = False return False else: print(f" ⚠️ Could not determine fallback commit") VERIFY_RESULTS[key] = False return False else: # Tagged version with bad commit - this shouldn't happen but fail gracefully print(f" ⚠️ Tagged version {version} has invalid commit {commit[:12]}") VERIFY_RESULTS[key] = False return False # Now verify the actual_commit (original or fallback) if _commit_exists(actual_commit): # Commit was fetched successfully - verify it's reachable from the ref_hint if provided # This ensures the commit is on the branch/tag we'll use in SRC_URI if ref_hint: # For tagged versions, verify the tag still points to the same commit # proxy.golang.org caches module@version->commit mappings, but tags can be force-pushed # If the tag has moved to a different commit, we need to use the current commit # Optimization: Use git ls-remote first (fast, cached) before fetching if ref_hint.startswith('refs/tags/'): try: # First check if tag has moved using fast ls-remote (cached) # FIX #37: Use dereferenced tag (^{}) to get the actual commit hash # For annotated tags, ref_hint returns the tag object hash, not the commit # Example: refs/tags/v1.0.1 -> c49ff274 (tag object) # refs/tags/v1.0.1^{} -> 37c8de36 (actual commit) current_tag_commit = git_ls_remote(vcs_url, f"{ref_hint}^{{}}") if current_tag_commit and current_tag_commit != actual_commit: # Tag has moved - fetch it to verify and update local repo print(f" ⚠️ Tag has moved - proxy.golang.org cache is stale") print(f" Proxy gave us: {actual_commit[:12]}") print(f" Tag now points to: {current_tag_commit[:12]}") print(f" → Using current tag commit") # Fetch the tag to update local repo subprocess.run( ["git", "fetch", "--depth=1", "origin", ref_hint], cwd=str(repo_dir), check=True, capture_output=True, text=True, timeout=GIT_CMD_TIMEOUT, env=env, ) # Update to use current commit VERIFY_FALLBACK_COMMITS[(vcs_url, actual_commit)] = current_tag_commit actual_commit = current_tag_commit # Verify the new commit exists (it should, since we just fetched it) if not _commit_exists(current_tag_commit): print(f" ⚠️ Current tag commit {current_tag_commit[:12]} not found!") VERIFY_RESULTS[key] = False VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False # The VERIFY_FALLBACK_COMMITS mapping will be used by the caller # Continue with verification using the corrected commit except Exception as e: # Tag verification failed - continue with normal flow print(f" ⚠️ Could not verify tag target: {e}") pass try: # Check if commit is an ancestor of (or equal to) the ref # This works even with shallow clones result = subprocess.run( ["git", "merge-base", 
"--is-ancestor", actual_commit, "FETCH_HEAD"], cwd=str(repo_dir), capture_output=True, text=True, timeout=30, env=env, ) if result.returncode != 0: # Commit is not an ancestor of the ref - might be on a different branch # This is OK - BitBake can still fetch the commit directly # Just log it for debugging pass # Don't fail - commit exists and is fetchable except subprocess.TimeoutExpired: print(f" ⚠️ Timeout checking commit ancestry for {actual_commit[:12]}") # Don't fail - commit exists except subprocess.CalledProcessError: # merge-base failed - don't fail verification pass else: # For pseudo-versions, we MUST detect which branch contains the commit # This is CRITICAL - BitBake cannot fetch arbitrary commits with nobranch=1 # We need branch= in SRC_URI for interior commits # Check if we already have the branch from fallback if (vcs_url, actual_commit) not in VERIFY_DETECTED_BRANCHES: # Now that we have full history, use git to find which branches contain this commit try: result = subprocess.run( ["git", "for-each-ref", "--contains", actual_commit, "refs/remotes/origin/", "--format=%(refname:short)"], cwd=str(repo_dir), capture_output=True, text=True, timeout=30, env=env, ) if result.returncode == 0 and result.stdout.strip(): # Commit IS on one or more branches branches = result.stdout.strip().split('\n') # Strip 'origin/' prefix from branch names branches = [b.replace('origin/', '') for b in branches] # Pick main/master if available, otherwise first branch if 'main' in branches: detected_branch = 'main' elif 'master' in branches: detected_branch = 'master' else: detected_branch = branches[0] VERIFY_DETECTED_BRANCHES[(vcs_url, actual_commit)] = detected_branch print(f" → Detected branch: {detected_branch} (verified with git for-each-ref)") else: # Commit exists but not in any branch - it's orphaned/dangling # For pseudo-versions, try fallback strategy # DEBUG: ALWAYS print this to confirm we reach this block print(f" ⚠️ ORPHANED: Commit {actual_commit[:12]} not found in any branch for {vcs_url[:50]}") print(f" DEBUG-ORPHANED: ref_hint={ref_hint}, actual_commit={actual_commit[:12]}, commit={commit[:12]}, version={version}") print(f" DEBUG-ORPHANED: Condition: (not ref_hint)={not ref_hint}, (actual==commit)={actual_commit == commit}") if not ref_hint and actual_commit == commit: # This is a pseudo-version with orphaned commit - try fallback print(f" → Attempting fallback commit strategy for orphaned commit") fallback_result = _find_fallback_commit(vcs_url, version, timestamp) if fallback_result: fallback_commit, fallback_branch = fallback_result print(f" ✓ Using fallback: {fallback_commit[:12]} from branch '{fallback_branch}'") print(f" (Original commit {commit[:12]} from proxy.golang.org is orphaned)") # Update to use the fallback actual_commit = fallback_commit VERIFY_FALLBACK_COMMITS[(vcs_url, commit)] = fallback_commit VERIFY_DETECTED_BRANCHES[(vcs_url, fallback_commit)] = fallback_branch # Verify fallback commit exists if not _commit_exists(fallback_commit): print(f" ⚠️ Fallback commit {fallback_commit[:12]} not found!") VERIFY_RESULTS[key] = False return False # Continue with fallback commit - don't fail here else: print(f" ⚠️ Could not determine fallback commit") VERIFY_RESULTS[key] = False return False else: # Tagged version or already tried fallback - fail VERIFY_RESULTS[key] = False return False except subprocess.TimeoutExpired: print(f" ⚠️ Branch detection timeout for {actual_commit[:12]}") VERIFY_RESULTS[key] = False return False except subprocess.CalledProcessError: print(f" ⚠️ 
Failed to detect branch for {actual_commit[:12]}") VERIFY_RESULTS[key] = False return False # Commit exists AND is reachable - safe for BitBake nobranch=1 # Only save to cache if not already cached (branch detection is done, just finalize) if not cached_verification_passed: # Save with timestamp in v2 format now = datetime.now(timezone.utc).isoformat() existing_entry = VERIFY_COMMIT_CACHE_V2.get(cache_key, {}) VERIFY_COMMIT_CACHE_V2[cache_key] = { "verified": True, "first_verified": existing_entry.get("first_verified", now), "last_checked": now, "fetch_method": "fetch" # Successfully fetched from upstream } VERIFY_COMMIT_CACHE_DIRTY = True VERIFY_RESULTS[key] = True return True VERIFY_RESULTS[key] = False # Remove from both caches VERIFY_COMMIT_CACHE.pop(cache_key, None) VERIFY_COMMIT_CACHE_V2.pop(cache_key, None) VERIFY_COMMIT_CACHE_DIRTY = True return False def get_actual_commit(vcs_url: str, commit: str) -> str: """ Get the actual commit to use, applying fallback if original commit doesn't exist. This should be called after verify_commit_accessible() to get the commit that was actually verified (which may be a fallback if the original didn't exist). Args: vcs_url: Repository URL commit: Original commit hash from proxy.golang.org Returns: Fallback commit if one was used, otherwise the original commit """ return VERIFY_FALLBACK_COMMITS.get((vcs_url, commit), commit) def _ref_points_to_commit(vcs_url: str, ref_hint: str, commit_hash: str) -> bool: if not ref_hint: return False repo_dir = _verify_repo_dir(vcs_url) # Lock is guaranteed to exist after _verify_repo_dir returns lock = VERIFY_REPO_LOCKS[vcs_url] with lock: env = os.environ.copy() env.setdefault("GIT_TERMINAL_PROMPT", "0") env.setdefault("GIT_ASKPASS", "true") try: result = subprocess.run( ["git", "show-ref", "--verify", "--hash", ref_hint], cwd=str(repo_dir), check=True, capture_output=True, text=True, env=env, ) resolved = result.stdout.strip().lower() return resolved == commit_hash.lower() except subprocess.CalledProcessError: return False def correct_commit_hash_from_ref(vcs_url: str, vcs_hash: str, vcs_ref: str) -> Optional[str]: """ Fix proxy.golang.org bad hashes by dereferencing the tag to get the correct commit. proxy.golang.org sometimes returns commits that: 1. Exist in the repo but aren't branch/tag HEADs (dangling commits) 2. Don't exist in the repo at all BitBake's nobranch=1 requires commits to be HEADs of branches or dereferenced tags. Args: vcs_url: Repository URL vcs_hash: Commit hash from proxy.golang.org (potentially bad) vcs_ref: Git ref like "refs/tags/v1.2.3" Returns: Corrected commit hash if different from vcs_hash, None if vcs_hash is correct or can't be corrected """ if not vcs_ref or not vcs_ref.startswith("refs/"): return None # Try dereferenced tag first (annotated tags) dereferenced_hash = git_ls_remote(vcs_url, f"{vcs_ref}^{{}}") if dereferenced_hash and dereferenced_hash.lower() != vcs_hash.lower(): return dereferenced_hash.lower() # Try without ^{} for lightweight tags commit_hash = git_ls_remote(vcs_url, vcs_ref) if commit_hash and commit_hash.lower() != vcs_hash.lower(): return commit_hash.lower() return None def is_commit_bitbake_fetchable(vcs_url: str, vcs_hash: str, vcs_ref: str) -> bool: """ Check if a commit is BitBake-fetchable (is a branch/tag HEAD). BitBake's nobranch=1 requires commits to be: - HEAD of a branch (refs/heads/*) - HEAD of a dereferenced tag (refs/tags/*^{}) Uses cached git ls-remote to check if the commit appears in the remote repository as a ref HEAD. 
Args: vcs_url: Repository URL vcs_hash: Commit hash to check vcs_ref: Git ref hint like "refs/tags/v1.2.3" Returns: True if commit is a branch/tag HEAD, False if dangling/not found """ # Quick check: Does the ref point to this commit? if vcs_ref and vcs_ref.startswith("refs/"): # Try dereferenced tag (annotated) ref_commit = git_ls_remote(vcs_url, f"{vcs_ref}^{{}}") if ref_commit and ref_commit.lower() == vcs_hash.lower(): return True # Try without ^{} for lightweight tags ref_commit = git_ls_remote(vcs_url, vcs_ref) if ref_commit and ref_commit.lower() == vcs_hash.lower(): return True # If we get here, the vcs_hash doesn't match the ref, so it's dangling return False def verify_gomodcache_commits(gomodcache_path: Path, verify_jobs: int = 10) -> int: """ Verify commits in GOMODCACHE .info files still exist in repositories. Detects force-pushed tags where proxy.golang.org has stale commit hashes. Offers to automatically refresh stale .info files by re-downloading. Returns: 0 if all commits valid or successfully refreshed 1 if stale commits found and user declined refresh """ global VERIFY_ENABLED VERIFY_ENABLED = True if isinstance(gomodcache_path, str): gomodcache_path = Path(gomodcache_path) if not gomodcache_path.exists(): print(f"❌ GOMODCACHE not found: {gomodcache_path}") return 1 download_dir = gomodcache_path / "cache" / "download" if not download_dir.exists(): print(f"❌ Download directory not found: {download_dir}") return 1 print(f"\nScanning {download_dir} for .info files...") # Collect all modules with VCS info modules_to_check = [] for dirpath, _, filenames in os.walk(download_dir): path_parts = Path(dirpath).relative_to(download_dir).parts if not path_parts or path_parts[-1] != '@v': continue module_path = '/'.join(path_parts[:-1]) module_path = unescape_module_path(module_path) for filename in filenames: if not filename.endswith('.info'): continue version = filename[:-5] info_path = Path(dirpath) / filename try: with open(info_path) as f: info = json.load(f) origin = info.get('Origin', {}) vcs_url = origin.get('URL') vcs_hash = origin.get('Hash') vcs_ref = origin.get('Ref', '') if vcs_url and vcs_hash and len(vcs_hash) == 40: modules_to_check.append({ 'module_path': module_path, 'version': version, 'vcs_url': vcs_url, 'vcs_hash': vcs_hash, 'vcs_ref': vcs_ref, 'info_path': info_path }) except Exception as e: print(f" ⚠️ Error reading {info_path}: {e}") print(f"Found {len(modules_to_check)} modules with VCS metadata to verify\n") if not modules_to_check: print("✅ No modules to verify") return 0 # Verify commits in parallel stale_modules = [] def check_module(module): if verify_commit_accessible(module['vcs_url'], module['vcs_hash'], module['vcs_ref'], module.get('version', '')): return None else: return module if verify_jobs > 0: print(f"Verifying commits in parallel ({verify_jobs} workers)...") with ThreadPoolExecutor(max_workers=verify_jobs) as executor: futures = {executor.submit(check_module, m): m for m in modules_to_check} for future in futures: result = future.result() if result: stale_modules.append(result) else: print("Verifying commits sequentially...") for module in modules_to_check: result = check_module(module) if result: stale_modules.append(result) if not stale_modules: print(f"\n✅ All {len(modules_to_check)} commits verified successfully!") return 0 # Report stale modules print(f"\n⚠️ Found {len(stale_modules)} modules with STALE commits:\n") for module in stale_modules[:10]: # Show first 10 print(f" {module['module_path']}@{module['version']}") print(f" Commit: 
{module['vcs_hash'][:12]} (not found in {module['vcs_url']})") print(f" File: {module['info_path']}") print() if len(stale_modules) > 10: print(f" ... and {len(stale_modules) - 10} more\n") # Offer to auto-refresh print("These commits likely represent force-pushed tags.") print("The .info files can be refreshed by re-downloading from proxy.golang.org\n") response = input("Refresh stale .info files automatically? [y/N]: ").strip().lower() if response not in ('y', 'yes'): print("\nNo action taken. To fix manually:") print(" 1. Delete stale .info files") print(" 2. Run: go mod download") return 1 # Refresh stale modules print("\nRefreshing stale modules...") refreshed = 0 failed = [] for module in stale_modules: print(f"\n Refreshing {module['module_path']}@{module['version']}...") try: # Delete stale .info file module['info_path'].unlink() print(f" Deleted stale .info") # Re-download result = subprocess.run( ['go', 'mod', 'download', f"{module['module_path']}@{module['version']}"], capture_output=True, text=True, timeout=60 ) if result.returncode == 0 and module['info_path'].exists(): # Verify new commit with open(module['info_path']) as f: new_info = json.load(f) new_hash = new_info.get('Origin', {}).get('Hash', '') if new_hash and new_hash != module['vcs_hash']: print(f" ✓ Refreshed: {module['vcs_hash'][:12]} → {new_hash[:12]}") refreshed += 1 else: print(f" ⚠️ Proxy returned same commit") failed.append(module) else: print(f" ❌ Download failed: {result.stderr[:100]}") failed.append(module) except Exception as e: print(f" ❌ Error: {e}") failed.append(module) print(f"\n{'='*70}") print(f"Refresh complete: {refreshed} refreshed, {len(failed)} failed") if failed: print(f"\nFailed modules require manual intervention:") for module in failed[:5]: print(f" {module['module_path']}@{module['version']}") return 1 return 0 def is_module_actually_needed(module_path: str, source_dir: Path) -> bool: """ Check if a module is actually used by running 'go mod why'. 
Returns: True if module is needed by the main module False if module is indirect-only and not actually imported """ try: result = subprocess.run( ['go', 'mod', 'why', module_path], cwd=str(source_dir), capture_output=True, text=True, timeout=30 ) if result.returncode != 0: # If go mod why fails, assume it's needed (conservative) return True output = result.stdout.strip() # Check for the telltale sign that module is not needed if "(main module does not need package" in output: return False # Also check for completely empty output (module not in graph) if not output or output == f"# {module_path}": return False # Module is needed return True except Exception: # On error, assume needed (conservative) return True def _execute(args: argparse.Namespace) -> int: global CURRENT_SOURCE_DIR, CURRENT_GOMODCACHE, VERIFY_COMMIT_CACHE_DIRTY debug_limit = args.debug_limit if args.source_dir: source_dir = Path(args.source_dir).resolve() else: source_dir = Path.cwd() CURRENT_SOURCE_DIR = source_dir if not (source_dir / "go.mod").exists(): print(f"❌ Error: go.mod not found in {source_dir}") return 1 print(f"Source directory: {source_dir}") if args.recipedir: output_dir = Path(args.recipedir).resolve() output_dir.mkdir(parents=True, exist_ok=True) print(f"Output directory: {output_dir}") else: output_dir = None if not args.validate and not args.dry_run: print("❌ Error: --recipedir is required unless running with --validate, --dry-run, or cache-maintenance flags.") return 1 configure_cache_paths(args.cache_dir, args.clone_cache_dir) if args.cache_dir: print(f"Metadata cache directory: {CACHE_BASE_DIR}") if args.clone_cache_dir: print(f"Clone cache directory: {CLONE_CACHE_DIR}") # Set verification cache max age from command line global MODULE_REPO_OVERRIDES_DIRTY, VERIFY_CACHE_MAX_AGE_DAYS VERIFY_CACHE_MAX_AGE_DAYS = args.verify_cache_max_age if VERIFY_CACHE_MAX_AGE_DAYS == 0: print(f"Verification cache: DISABLED (always verify)") else: print(f"Verification cache max age: {VERIFY_CACHE_MAX_AGE_DAYS} days") if args.clear_repo: for (module_spec,) in args.clear_repo: module_path, version = _parse_override_spec(module_spec) removed = False try: key = _normalise_override_key(module_path, version) except ValueError as exc: print(f"Invalid module override '{module_spec}': {exc}") continue if version is not None: if MODULE_REPO_OVERRIDES.pop(key, None) is not None: removed = True MODULE_REPO_OVERRIDES_DIRTY = True print(f"Cleared repo override: {module_path}@{version}") else: wildcard_key = key if MODULE_REPO_OVERRIDES.pop(wildcard_key, None) is not None: removed = True specific_keys = [ candidate for candidate in list(MODULE_REPO_OVERRIDES.keys()) if candidate[0] == module_path and candidate[1] is not None ] for candidate in specific_keys: MODULE_REPO_OVERRIDES.pop(candidate, None) removed = True if removed: MODULE_REPO_OVERRIDES_DIRTY = True print(f"Cleared repo overrides for: {module_path}") if not removed: if version is not None: print(f"No repo override found for: {module_path}@{version}") else: print(f"No repo overrides found for: {module_path}") if args.set_repo: for module_spec, repo_url in args.set_repo: module_path, version = _parse_override_spec(module_spec) try: key = _normalise_override_key(module_path, version) except ValueError as exc: print(f"Invalid module override '{module_spec}': {exc}") continue MODULE_REPO_OVERRIDES[key] = repo_url MODULE_REPO_OVERRIDES_DIRTY = True label = f"{module_path}@{version}" if version else module_path print(f"Pinned repo override: {label} -> {repo_url}") if 
args.clear_commit: for repo, commit in args.clear_commit: key = f"{repo}|||{commit}" if key in VERIFY_COMMIT_CACHE: VERIFY_COMMIT_CACHE.pop(key, None) VERIFY_COMMIT_CACHE_DIRTY = True print(f"\n🧹 Cleared cached verification: {repo} {commit}\n") else: print(f"No cached verification found for: {repo} {commit}") VERIFY_RESULTS.pop((repo, commit), None) if args.inject_commit: for repo, commit in args.inject_commit: key = f"{repo}|||{commit}" VERIFY_COMMIT_CACHE[key] = True VERIFY_COMMIT_CACHE_DIRTY = True VERIFY_RESULTS[(repo, commit)] = True print(f"Injected verified commit: {repo} {commit}") exit_code = 0 if args.clean_ls_remote_cache: print("\n🗑️ Cleaning git ls-remote cache...") if LS_REMOTE_CACHE_PATH.exists(): LS_REMOTE_CACHE_PATH.unlink() print(f" Removed {LS_REMOTE_CACHE_PATH}") else: print(f" Cache file not found: {LS_REMOTE_CACHE_PATH}") args.clean_cache = True if args.clean_cache: print("\n🗑️ Cleaning module metadata cache...") if MODULE_METADATA_CACHE_PATH.exists(): MODULE_METADATA_CACHE_PATH.unlink() print(f" Removed {MODULE_METADATA_CACHE_PATH}") else: print(f" Cache file not found: {MODULE_METADATA_CACHE_PATH}") if VERIFY_COMMIT_CACHE_PATH.exists(): VERIFY_COMMIT_CACHE_PATH.unlink() print(f" Removed {VERIFY_COMMIT_CACHE_PATH}") VERIFY_COMMIT_CACHE.clear() VERIFY_COMMIT_CACHE_DIRTY = False print(" Note: Bootstrap from .inc files DISABLED to avoid reloading stale data.") skip_inc_files = True else: skip_inc_files = False skip_legacy_module_cache = args.skip_legacy_module_cache bootstrap_metadata_cache( output_dir, skip_inc_files=skip_inc_files, skip_legacy_module_cache=skip_legacy_module_cache, ) prune_metadata_cache() load_ls_remote_cache() load_vanity_url_cache() if args.dry_run: print("\n--dry-run requested; skipping discovery/validation") return 0 # --verify-cached command to check GOMODCACHE for stale commits if args.verify_cached: print("\n" + "=" * 70) print("VERIFYING CACHED COMMITS IN GOMODCACHE") print("=" * 70) return verify_gomodcache_commits(args.gomodcache or source_dir / ".gomodcache", args.verify_jobs) # Check for --discovered-modules (bootstrap strategy) if args.discovered_modules: print("\n" + "=" * 70) print("PRE-DISCOVERED MODULES MODE") print("=" * 70) print("\nUsing pre-discovered module metadata from BitBake discovery build") print("Skipping discovery phase - generator will convert to BitBake format\n") discovered_modules_path = Path(args.discovered_modules).resolve() modules = load_discovered_modules(discovered_modules_path) if modules is None: print("\n❌ Failed to load discovered modules - falling back to discovery") modules = discover_modules(source_dir, args.gomodcache) else: print(f"\n✓ Successfully loaded {len(modules)} modules from discovery metadata") print(" Skipping 'go mod download' discovery phase") print(" Will use go.sum to resolve modules without Origin metadata") # Auto-correction of dangling commits happens in Phase 2 during parallel verification else: # Normal discovery path modules = discover_modules(source_dir, args.gomodcache) if debug_limit is not None and len(modules) > debug_limit: print(f"\n⚙️ Debug limit active: truncating discovered modules to first {debug_limit} entries") modules = modules[:debug_limit] # Set VERIFY_ENABLED based on whether verification is requested global VERIFY_ENABLED VERIFY_ENABLED = not args.skip_verify # Parse go.mod replace directives for fork resolution # Example: github.com/containerd/containerd/v2 => github.com/k3s-io/containerd/v2 v2.1.4-k3s2 go_mod_replaces = parse_go_mod_replaces(source_dir / 
"go.mod") if go_mod_replaces: print(f"\n✓ Parsed {len(go_mod_replaces)} replace directives from go.mod") if VERBOSE_MODE: for old_path, (new_path, new_version) in sorted(go_mod_replaces.items())[:5]: print(f" {old_path} => {new_path} {new_version}") if len(go_mod_replaces) > 5: print(f" ... and {len(go_mod_replaces) - 5} more") # Parse go.sum for fallback resolution discovered_keys = {(m['module_path'], m['version']) for m in modules} go_sum_modules_with_source, go_sum_indirect_only = parse_go_sum(source_dir / "go.sum") FAILED_MODULE_PATHS.clear() FAILED_MODULE_ENTRIES.clear() SKIPPED_MODULES.clear() print(f"\nFound {len(go_sum_indirect_only)} indirect-only dependencies (skipping - only need .mod files)") if args.discovered_modules: # With discovered modules, only resolve what's in go.sum but missing from discovery # Do NOT call go list -m all - we already know what we need from the successful build missing_from_discovery = go_sum_modules_with_source - discovered_keys print(f"Discovered modules provided {len(discovered_keys)} modules with Origin metadata") print(f"go.sum has {len(go_sum_modules_with_source)} modules total") print(f"Resolving {len(missing_from_discovery)} modules without Origin metadata...") else: # Normal discovery - also use go list to find additional modules go_list_modules = collect_modules_via_go_list(source_dir) go_sum_modules_with_source |= go_list_modules missing_from_discovery = go_sum_modules_with_source - discovered_keys print(f"Resolving {len(missing_from_discovery)} additional modules discovered from go.sum/go list...") modules_by_path: Dict[str, List[Dict]] = {} for m in modules: modules_by_path.setdefault(m['module_path'], []).append(m) limit_reached = False for module_path, version in sorted(go_sum_modules_with_source): if debug_limit is not None and len(modules) >= debug_limit: limit_reached = True break if module_path in FAILED_MODULE_PATHS: print(f" ⚠️ Skipping {module_path}@{version} (previous resolution failure)") continue if (module_path, version) in discovered_keys: continue # Apply replace directives for k3s forks # If module path is replaced in go.mod, try to resolve using the replacement path resolved_path = module_path resolved_version = version if module_path in go_mod_replaces: new_path, new_version = go_mod_replaces[module_path] if new_version: # Replace has explicit version resolved_path = new_path resolved_version = new_version if VERBOSE_MODE: print(f" [replace] {module_path}@{version} => {resolved_path}@{resolved_version}") # Check if we already have the replacement module if (resolved_path, resolved_version) in discovered_keys: # Copy the existing module entry with original path for m in modules: if m['module_path'] == resolved_path and m['version'] == resolved_version: replacement_entry = m.copy() replacement_entry['module_path'] = module_path replacement_entry['version'] = version modules.append(replacement_entry) discovered_keys.add((module_path, version)) modules_by_path.setdefault(module_path, []).append(replacement_entry) print(f" ✓ {module_path}@{version} (using replace directive -> {resolved_path}@{resolved_version})") continue fallback = resolve_module_metadata(resolved_path, resolved_version) if fallback: # If we used a replace directive, update the entry to use the original path if resolved_path != module_path or resolved_version != version: fallback['module_path'] = module_path fallback['version'] = version print(f" ✓ {module_path}@{version} (resolved via replace -> {resolved_path}@{resolved_version})") modules.append(fallback) 
discovered_keys.add((module_path, version)) modules_by_path.setdefault(module_path, []).append(fallback) if debug_limit is not None and len(modules) >= debug_limit: limit_reached = True break else: # Handle monorepo submodule replacements (e.g., github.com/k3s-io/etcd/server/v3) # When a replacement points to a submodule path that doesn't have its own VCS entry, # try to find the base repository and use it with a subdir. # Example: github.com/k3s-io/etcd/server/v3 -> base: github.com/k3s-io/etcd, subdir: server/v3 monorepo_handled = False if resolved_path != module_path and '/' in resolved_path: # Check if this looks like a submodule path (has version suffix like /v2, /v3, etc.) parts = resolved_path.rsplit('/', 1) if len(parts) == 2: potential_base = parts[0] potential_subdir = parts[1] # Look for version-suffixed paths (e.g., /v2, /v3, /server/v3, /client/v3) # Try progressively shorter base paths base_candidates = [] path_segments = resolved_path.split('/') # For github.com/k3s-io/etcd/server/v3: # Try: github.com/k3s-io/etcd/server, github.com/k3s-io/etcd for i in range(len(path_segments) - 1, 2, -1): # At least keep domain + org candidate_base = '/'.join(path_segments[:i]) candidate_subdir = '/'.join(path_segments[i:]) base_candidates.append((candidate_base, candidate_subdir)) # Try each candidate base path for base_path, subdir in base_candidates: if base_path in modules_by_path: # Found the base repository! Create a submodule entry base_module = modules_by_path[base_path][0] vcs_url = base_module['vcs_url'] # Use the replacement version for the tag tag = resolved_version.split('+')[0] commit = git_ls_remote(vcs_url, f"refs/tags/{tag}") or git_ls_remote(vcs_url, tag) if commit: timestamp = derive_timestamp_from_version(resolved_version) fallback = { "module_path": module_path, # Original path (go.etcd.io/etcd/server/v3) "version": version, "vcs_url": vcs_url, "vcs_hash": commit, "vcs_ref": f"refs/tags/{tag}" if git_ls_remote(vcs_url, f"refs/tags/{tag}") else tag, "timestamp": timestamp, "subdir": subdir, # e.g., "server/v3" } modules.append(fallback) discovered_keys.add((module_path, version)) modules_by_path.setdefault(module_path, []).append(fallback) print(f" ✓ {module_path}@{version} (monorepo submodule: base={base_path}, subdir={subdir})") monorepo_handled = True if debug_limit is not None and len(modules) >= debug_limit: limit_reached = True break if monorepo_handled: if limit_reached: break continue if module_path in modules_by_path: reference_module = modules_by_path[module_path][0] vcs_url = reference_module['vcs_url'] tag = version.split('+')[0] commit = None pseudo_info = parse_pseudo_version_tag(tag) if pseudo_info: timestamp_str, short_commit = pseudo_info commit = resolve_pseudo_version_commit( vcs_url, timestamp_str, short_commit, clone_cache_dir=CLONE_CACHE_DIR ) if commit: print(f" ✓ {module_path}@{version} (resolved pseudo-version via repository clone)") else: commit = git_ls_remote(vcs_url, f"refs/tags/{tag}") or git_ls_remote(vcs_url, tag) if commit: print(f" ✓ {module_path}@{version} (resolved using VCS URL from sibling version)") if commit: timestamp = derive_timestamp_from_version(version) subdir = reference_module.get('subdir', '') update_metadata_cache(module_path, version, vcs_url, commit, timestamp, subdir, '', dirty=True) fallback = { "module_path": module_path, "version": version, "vcs_url": vcs_url, "vcs_hash": commit, "vcs_ref": "", "timestamp": timestamp, "subdir": subdir, } modules.append(fallback) discovered_keys.add((module_path, version)) 
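# --- Illustrative sketch: monorepo (base, subdir) candidates ------------------
# Mirrors the base-candidate generation above; `_monorepo_base_candidates` is a
# hypothetical helper name used only for this sketch. A replaced submodule path
# is split into progressively shorter (base_repo, subdir) pairs, always keeping
# at least domain + org, so an already-resolved base repository entry can be
# reused with a subdir.
def _monorepo_base_candidates(module_path: str) -> list:
    """
    >>> _monorepo_base_candidates("github.com/k3s-io/etcd/server/v3")
    [('github.com/k3s-io/etcd/server', 'v3'), ('github.com/k3s-io/etcd', 'server/v3')]
    """
    segments = module_path.split('/')
    candidates = []
    for i in range(len(segments) - 1, 2, -1):  # keep at least domain + org
        candidates.append(('/'.join(segments[:i]), '/'.join(segments[i:])))
    return candidates
# ------------------------------------------------------------------------------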
modules_by_path[module_path].append(fallback) if debug_limit is not None and len(modules) >= debug_limit: limit_reached = True break continue # Skip monorepo root modules that fail resolution when we have submodules # Example: go.etcd.io/etcd/v3 (root) when we have github.com/k3s-io/etcd/server/v3, etc. # Handles both direct prefix match and forked monorepos (via VCS URL comparison) # These are never actually imported - they just exist in go.sum due to the monorepo go.mod is_monorepo_root = False # Check 1: Direct prefix match (same repository, e.g., go.etcd.io/etcd/v3 → go.etcd.io/etcd/server/v3) if any(existing_path.startswith(module_path + '/') for existing_path in modules_by_path.keys()): is_monorepo_root = True # Check 2: Forked monorepo (e.g., go.etcd.io/etcd/v3 → github.com/k3s-io/etcd/server/v3) # If we failed to derive a repository, try checking if any existing module's last path segment # matches our module's last segment (e.g., both end in /v3) if not is_monorepo_root and module_path.count('/') >= 2: module_segments = module_path.split('/') # For go.etcd.io/etcd/v3: domain=go.etcd.io, repo=etcd, suffix=v3 # Check if we have modules like */etcd/*/v3 (forked versions) for existing_path in modules_by_path.keys(): if '/' in existing_path: # Check if the existing path is a submodule of a similar repository # Example: github.com/k3s-io/etcd/server/v3 shares repository 'etcd' with go.etcd.io/etcd/v3 if '/etcd/' in existing_path and module_path.endswith('/v3'): is_monorepo_root = True break if is_monorepo_root: print(f" ⊙ {module_path}@{version} (monorepo root - submodules already resolved)") continue if module_path in modules_by_path: FAILED_MODULE_PATHS.add(module_path) FAILED_MODULE_ENTRIES.add((module_path, version)) print(f" ⚠️ Skipping {module_path}@{version} (indirect-only dependency)") if limit_reached: break if limit_reached: print(f"\n⚠️ Debug limit {debug_limit} reached; skipping remaining modules discovered from go.sum/go list.") # Resolve /go.mod-only (indirect) dependencies using sibling versions # Even though these are "indirect", Go may still need them during compilation # (e.g., due to complex replace directives or transitive dependencies). # If we have a sibling version with Origin metadata, resolve the indirect version too. 
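# --- Illustrative sketch: monorepo-root prefix check --------------------------
# Demonstrates "Check 1" above with made-up module paths; `_has_known_submodule`
# is a hypothetical helper name. A go.sum entry is treated as a monorepo root
# when some already-resolved module path lives underneath it. (The fork-aware
# "Check 2" heuristic above is deliberately not reproduced here.)
def _has_known_submodule(module_path: str, known_paths: set) -> bool:
    """
    >>> _has_known_submodule("example.com/org/mono",
    ...                      {"example.com/org/mono/server", "example.com/org/other"})
    True
    >>> _has_known_submodule("example.com/org/standalone",
    ...                      {"example.com/org/mono/server"})
    False
    """
    prefix = module_path + '/'
    return any(existing.startswith(prefix) for existing in known_paths)
# ------------------------------------------------------------------------------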
print(f"\n⚙️ Resolving /go.mod-only dependencies from sibling versions...") gomod_only_resolved = 0 gomod_only_skipped = 0 for module_path, version in sorted(go_sum_indirect_only): try: if (module_path, version) in discovered_keys: continue # Already have this version if module_path in modules_by_path: # We have a sibling version - try to resolve this one using the sibling's VCS URL reference_module = modules_by_path[module_path][0] vcs_url = reference_module['vcs_url'] tag = version.split('+')[0] commit = None pseudo_info = parse_pseudo_version_tag(tag) if pseudo_info: timestamp_str, short_commit = pseudo_info try: commit = resolve_pseudo_version_commit( vcs_url, timestamp_str, short_commit, clone_cache_dir=CLONE_CACHE_DIR ) except Exception as e: print(f" ❌ Error resolving pseudo-version {module_path}@{version} (timestamp={timestamp_str}, commit={short_commit}): {e}") gomod_only_skipped += 1 continue else: # For semantic version tags, try to find the tag reference # This enables to detect orphaned tags for sibling-resolved modules vcs_ref = "" commit = git_ls_remote(vcs_url, f"refs/tags/{tag}") if commit: vcs_ref = f"refs/tags/{tag}" else: commit = git_ls_remote(vcs_url, tag) if commit: timestamp = derive_timestamp_from_version(version) subdir = reference_module.get('subdir', '') update_metadata_cache(module_path, version, vcs_url, commit, timestamp, subdir, '', dirty=True) fallback = { "module_path": module_path, "version": version, "vcs_url": vcs_url, "vcs_hash": commit, "vcs_ref": vcs_ref, "timestamp": timestamp, "subdir": subdir, } modules.append(fallback) discovered_keys.add((module_path, version)) modules_by_path[module_path].append(fallback) gomod_only_resolved += 1 print(f" ✓ {module_path}@{version} (/go.mod-only resolved using sibling version)") else: gomod_only_skipped += 1 else: gomod_only_skipped += 1 except Exception as e: print(f" ❌ Error resolving {module_path}@{version}: {e}") gomod_only_skipped += 1 if gomod_only_resolved > 0: print(f"✓ Resolved {gomod_only_resolved} /go.mod-only dependencies using sibling versions") if gomod_only_skipped > 0: print(f" ⚠️ Skipped {gomod_only_skipped} /go.mod-only dependencies (no sibling version available)") if FAILED_MODULE_ENTRIES: print("\n❌ Failed to resolve metadata for the following modules:") for mod, ver in sorted(FAILED_MODULE_ENTRIES): print(f" - {mod}@{ver}") print("Aborting to avoid emitting invalid SRCREVs.") return 1 if not modules: print("❌ No modules discovered") return 1 success = generate_recipe( modules, source_dir, output_dir, args.git_repo or "unknown", args.git_ref or "unknown", validate_only=args.validate, debug_limit=debug_limit, skip_verify=args.skip_verify, verify_jobs=args.verify_jobs, ) if success: if args.validate: print("\n" + "=" * 70) print("✅ SUCCESS - Validation complete") print("=" * 70) else: print("\n" + "=" * 70) print("✅ SUCCESS - Recipe generation complete") print("=" * 70) # Write corrected modules back to JSON for future runs if args.discovered_modules and VERIFY_CORRECTIONS_APPLIED: corrected_json = args.discovered_modules.replace('.json', '-corrected.json') try: with open(corrected_json, 'w') as f: json.dump(modules, f, indent=2) print(f"\n✓ Wrote corrected module metadata to: {corrected_json}") print(f" Use this file for future runs to avoid re-detecting orphaned commits") except Exception as e: print(f"\n⚠️ Could not write corrected JSON: {e}") exit_code = 0 else: print("\n❌ FAILED - Recipe generation failed") exit_code = 1 if SKIPPED_MODULES: print("\n⚠️ Skipped modules (no repository 
metadata)") for (module_path, version), reason in sorted(SKIPPED_MODULES.items()): print(f" - {module_path}@{version} [{reason}]") print(" Use --set-repo / --inject-commit to add missing metadata before building.") return exit_code def parse_go_mod_replaces(go_mod_path: Path) -> Dict[str, Tuple[str, str]]: """ Parse replace directives from go.mod file. Returns: Dict mapping old_path to (new_path, new_version) Example: {"github.com/containerd/containerd/v2": ("github.com/k3s-io/containerd/v2", "v2.1.4-k3s2")} """ replaces = {} if not go_mod_path.exists(): return replaces try: content = go_mod_path.read_text() # Match: old_path => new_path version # Example: github.com/containerd/containerd/v2 => github.com/k3s-io/containerd/v2 v2.1.4-k3s2 for line in content.splitlines(): line = line.strip() if not line.startswith('replace ') and '=>' not in line: continue # Remove 'replace ' prefix if present if line.startswith('replace '): line = line[8:].strip() parts = line.split('=>') if len(parts) != 2: continue left = parts[0].strip().split() right = parts[1].strip().split() if len(left) == 0 or len(right) == 0: continue old_path = left[0] new_path = right[0] new_version = right[1] if len(right) > 1 else "" replaces[old_path] = (new_path, new_version) except Exception as e: print(f"⚠️ Failed to parse go.mod replaces: {e}", file=sys.stderr) return replaces def parse_pseudo_version_tag(tag: str) -> Optional[Tuple[str, str]]: """Return (timestamp, short_commit) for Go pseudo-versions.""" tag = tag.split('+', 1)[0] parts = tag.split('-') if len(parts) < 3: return None short_commit = parts[-1] timestamp_part = parts[-2] timestamp_str = timestamp_part.split('.')[-1] if len(timestamp_str) != 14 or not timestamp_str.isdigit(): return None if not re.fullmatch(r'[0-9a-fA-F]{6,40}', short_commit): return None return timestamp_str, short_commit def _cache_key(url: str, ref: str) -> str: return f"{url}|||{ref}" def load_ls_remote_cache() -> None: if not LS_REMOTE_CACHE_PATH.exists(): return try: data = json.loads(LS_REMOTE_CACHE_PATH.read_text()) except Exception: return for key, value in data.items(): try: url, ref = key.split("|||", 1) except ValueError: continue LS_REMOTE_CACHE[(url, ref)] = value def save_ls_remote_cache() -> None: if not LS_REMOTE_CACHE_DIRTY: return try: payload = { _cache_key(url, ref): value for (url, ref), value in LS_REMOTE_CACHE.items() } LS_REMOTE_CACHE_PATH.write_text(json.dumps(payload, indent=2, sort_keys=True)) except Exception: pass def git_ls_remote(url: str, ref: str, *, debug: bool = False) -> Optional[str]: """ Query git repository for commit hash of a ref. Uses disk-based cache and local clones to minimize network calls. Args: url: Git repository URL ref: Git ref (tag, branch, commit, etc.) 
debug: If True, print whether result came from cache or network Returns: Commit hash or None if not found """ global LS_REMOTE_CACHE_DIRTY key = (url, ref) # Check in-memory cache first if key in LS_REMOTE_CACHE: if debug or VERBOSE_MODE: result = LS_REMOTE_CACHE[key] status = "cached" if result else "cached (not found)" print(f" [ls-remote {status}] {url} {ref}", file=sys.stderr) return LS_REMOTE_CACHE[key] # Try local repository clone if available repo_hash = hashlib.sha256(url.encode()).hexdigest()[:16] local_repo = CLONE_CACHE_DIR / f"repo_{repo_hash}" if local_repo.exists() and (local_repo / 'HEAD').exists(): try: # Query local repository instead of network result = subprocess.run( ["git", "show-ref", "--hash", ref], cwd=local_repo, capture_output=True, text=True, timeout=10, ) if result.returncode == 0 and result.stdout.strip(): commit_hash = result.stdout.strip().split()[0] LS_REMOTE_CACHE[key] = commit_hash LS_REMOTE_CACHE_DIRTY = True if debug or VERBOSE_MODE: print(f" [ls-remote local] {url} {ref} -> {commit_hash[:12]}", file=sys.stderr) return commit_hash except (subprocess.TimeoutExpired, subprocess.CalledProcessError, Exception): # Fall through to network query pass if debug or VERBOSE_MODE: print(f" [ls-remote network] {url} {ref}", file=sys.stderr) try: env = os.environ.copy() env.setdefault("GIT_TERMINAL_PROMPT", "0") env.setdefault("GIT_ASKPASS", "true") # FIX: For tags, also query the dereferenced commit (^{}) to handle annotated tags # Annotated tags have a tag object hash that differs from the commit hash. # We need the actual commit hash for git archive/checkout operations. refs_to_query = [ref] if ref.startswith("refs/tags/"): refs_to_query.append(f"{ref}^{{}}") # Add dereferenced query result = subprocess.run( ["git", "ls-remote", url] + refs_to_query, capture_output=True, text=True, check=True, env=env, timeout=GIT_CMD_TIMEOUT, ) # Parse results - prefer dereferenced commit (^{}) over annotated tag object tag_object_hash = None dereferenced_hash = None for line in result.stdout.strip().splitlines(): if not line: continue parts = line.split() if len(parts) >= 2: hash_val, ref_name = parts[0], parts[1] if ref_name.endswith("^{}"): # This is the dereferenced commit - preferred! dereferenced_hash = hash_val else: # This is either a lightweight tag or annotated tag object tag_object_hash = hash_val # Prefer dereferenced commit, fall back to tag object (for lightweight tags) commit_hash = dereferenced_hash or tag_object_hash if commit_hash: LS_REMOTE_CACHE[key] = commit_hash LS_REMOTE_CACHE_DIRTY = True return commit_hash except subprocess.TimeoutExpired: print(f" ⚠️ git ls-remote timeout ({GIT_CMD_TIMEOUT}s) for {url} {ref}") LS_REMOTE_CACHE[key] = None LS_REMOTE_CACHE_DIRTY = True return None except subprocess.CalledProcessError: LS_REMOTE_CACHE[key] = None LS_REMOTE_CACHE_DIRTY = True return None return None def load_vanity_url_cache() -> None: """Load vanity URL resolution cache from disk.""" if not VANITY_URL_CACHE_PATH.exists(): return try: data = json.loads(VANITY_URL_CACHE_PATH.read_text()) VANITY_URL_CACHE.update(data) except Exception: pass def save_vanity_url_cache() -> None: """Save vanity URL resolution cache to disk.""" if not VANITY_URL_CACHE_DIRTY: return try: VANITY_URL_CACHE_PATH.write_text(json.dumps(VANITY_URL_CACHE, indent=2, sort_keys=True)) except Exception: pass def load_verify_commit_cache() -> None: """ Load verification cache with timestamp support for aging detection. 
Cache format v2: { "repo|||commit": { "verified": true, "first_verified": "2025-01-15T10:30:00Z", # When first verified "last_checked": "2025-02-10T14:20:00Z", # When last re-verified "fetch_method": "fetch" # "fetch", "ref", or "cached" } } """ global VERIFY_COMMIT_CACHE_DIRTY, VERIFY_COMMIT_CACHE_V2 if not VERIFY_COMMIT_CACHE_PATH.exists(): return try: data = json.loads(VERIFY_COMMIT_CACHE_PATH.read_text()) except Exception: return if isinstance(data, dict): # Detect format: v1 (bool values) vs v2 (dict values) sample_value = next(iter(data.values())) if data else None if isinstance(sample_value, bool): # Legacy format: convert to v2 from datetime import datetime, timezone now = datetime.now(timezone.utc).isoformat() for k, v in data.items(): if v: # Only migrate verified=True entries VERIFY_COMMIT_CACHE_V2[k] = { "verified": True, "first_verified": now, "last_checked": now, "fetch_method": "cached" # Unknown how it was verified } VERIFY_COMMIT_CACHE_DIRTY = True # Mark dirty to save in new format elif isinstance(sample_value, dict): # V2 format VERIFY_COMMIT_CACHE_V2.update(data) VERIFY_COMMIT_CACHE_DIRTY = False def save_verify_commit_cache(force: bool = False) -> None: """Save verification cache in v2 format with timestamps. Args: force: If True, save even if not dirty (for incremental saves during long runs) """ global VERIFY_COMMIT_CACHE_DIRTY if not force and not VERIFY_COMMIT_CACHE_DIRTY: return try: VERIFY_COMMIT_CACHE_PATH.write_text(json.dumps(VERIFY_COMMIT_CACHE_V2, indent=2, sort_keys=True)) VERIFY_COMMIT_CACHE_DIRTY = False except Exception as e: print(f"⚠️ Failed to save verification cache: {e}") pass def _load_overrides_from_file(path: Path, target_dict: Dict[Tuple[str, Optional[str]], str]) -> None: """ Load module->repo overrides from a JSON file into the target dictionary. File format: { "module/path": "https://github.com/org/repo", "module/path@v1.2.3": "https://github.com/org/repo" } The @version suffix is optional. Use it to override only a specific version. 
""" if not path.exists(): return try: data = json.loads(path.read_text()) except Exception: return if not isinstance(data, dict): return for raw_key, repo_url in data.items(): if not isinstance(repo_url, str): continue module_path = str(raw_key) version: Optional[str] = None # Support both "module|||version" (legacy) and "module@version" (new) formats if "|||" in module_path: module_part, version_part = module_path.split("|||", 1) version = None if version_part == "*" else version_part module_path = module_part elif "@" in module_path and not module_path.startswith("@"): # Handle module@version format (but not @org/pkg scoped packages) at_pos = module_path.rfind("@") version = module_path[at_pos + 1:] module_path = module_path[:at_pos] try: key = _normalise_override_key(module_path, version) except ValueError: continue target_dict[key] = repo_url def load_manual_overrides() -> None: """Load git-tracked manual overrides from manual-overrides.json.""" global MANUAL_OVERRIDES MANUAL_OVERRIDES.clear() _load_overrides_from_file(MANUAL_OVERRIDES_PATH, MANUAL_OVERRIDES) if MANUAL_OVERRIDES: print(f" Loaded {len(MANUAL_OVERRIDES)} manual repository override(s)") def load_repo_overrides() -> None: """Load dynamic overrides from repo-overrides.json (created via --set-repo).""" global MODULE_REPO_OVERRIDES_DIRTY MODULE_REPO_OVERRIDES.clear() _load_overrides_from_file(MODULE_REPO_OVERRIDES_PATH, MODULE_REPO_OVERRIDES) MODULE_REPO_OVERRIDES_DIRTY = False def save_repo_overrides() -> None: if not MODULE_REPO_OVERRIDES_DIRTY: return try: payload: Dict[str, str] = {} for (module_path, version), repo_url in MODULE_REPO_OVERRIDES.items(): key = module_path if version is None else f"{module_path}|||{version}" payload[key] = repo_url MODULE_REPO_OVERRIDES_PATH.write_text(json.dumps(payload, indent=2, sort_keys=True)) except Exception: pass def query_vanity_url(module_path: str) -> Optional[str]: """ Query vanity URL metadata using ?go-get=1 to resolve actual VCS repository. Go uses vanity URLs to provide custom import paths that redirect to actual repositories. When you request https://example.com/module?go-get=1, the server returns HTML with a meta tag like: This function queries that metadata and caches the result for future use. 
Args: module_path: Go module path (e.g., "go.uber.org/atomic") Returns: VCS repository URL if found, None otherwise """ global VANITY_URL_CACHE_DIRTY # Check cache first if module_path in VANITY_URL_CACHE: return VANITY_URL_CACHE[module_path] # Query the ?go-get=1 metadata url = f"https://{module_path}?go-get=1" try: import urllib.request import html.parser class GoImportParser(html.parser.HTMLParser): def __init__(self, target_module: str): super().__init__() self.target_module = target_module self.repo_url = None self.best_prefix_len = 0 # Track longest matching prefix def handle_starttag(self, tag, attrs): if tag == 'meta': attrs_dict = dict(attrs) if attrs_dict.get('name') == 'go-import': content = attrs_dict.get('content', '') # Format: "module_prefix vcs repo_url" parts = content.split() if len(parts) >= 3: prefix = parts[0] # parts[1] = vcs type (git, hg, svn, bzr) repo_url = parts[2] # Per Go spec: match the go-import whose prefix matches our module # The module path must equal the prefix or have it as a path prefix if self.target_module == prefix or self.target_module.startswith(prefix + '/'): # Prefer longer (more specific) prefix matches if len(prefix) > self.best_prefix_len: self.best_prefix_len = len(prefix) self.repo_url = repo_url # Fetch the page with a timeout req = urllib.request.Request(url, headers={'User-Agent': 'oe-go-mod-fetcher/3.0'}) with urllib.request.urlopen(req, timeout=10) as response: html_content = response.read().decode('utf-8', errors='ignore') # Parse the HTML to find matching go-import meta tag parser = GoImportParser(module_path) parser.feed(html_content) # Cache the result (even if None) VANITY_URL_CACHE[module_path] = parser.repo_url VANITY_URL_CACHE_DIRTY = True return parser.repo_url except Exception as e: # Cache negative result to avoid repeated failures VANITY_URL_CACHE[module_path] = None VANITY_URL_CACHE_DIRTY = True return None def get_github_mirror_url(vcs_url: str) -> Optional[str]: """ Get GitHub mirror URL for golang.org/x repositories. golang.org/x repositories are mirrored on GitHub at github.com/golang/*. These mirrors are often more reliable than go.googlesource.com. Args: vcs_url: Original VCS URL (e.g., https://go.googlesource.com/tools) Returns: GitHub mirror URL if applicable, None otherwise """ if 'go.googlesource.com' in vcs_url: # Extract package name from URL # https://go.googlesource.com/tools -> tools pkg_name = vcs_url.rstrip('/').split('/')[-1] return f"https://github.com/golang/{pkg_name}" return None def resolve_pseudo_version_commit(vcs_url: str, timestamp_str: str, short_commit: str, clone_cache_dir: Optional[Path] = None) -> Optional[str]: """ Resolve a pseudo-version's short commit hash to a full 40-character hash. This function clones (or updates) a git repository and searches the commit history for a commit that matches both the timestamp and short commit hash from a pseudo-version. For golang.org/x repositories, automatically tries GitHub mirrors if the primary source fails (go.googlesource.com can be slow or unreliable). 
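    Example (illustrative values): for the pseudo-version
    v0.0.0-20240102030405-abcdef123456, timestamp_str is "20240102030405" and
    short_commit is "abcdef123456"; the search below looks for a commit whose
    hash starts with that prefix within roughly one day of 2024-01-02 03:04:05 UTC.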
Args: vcs_url: Git repository URL timestamp_str: Timestamp from pseudo-version (format: YYYYMMDDHHmmss) short_commit: Short commit hash (12 characters) from pseudo-version clone_cache_dir: Optional directory to cache cloned repositories (recommended) Returns: Full 40-character commit hash, or None if not found """ # Parse timestamp try: dt = datetime.strptime(timestamp_str, "%Y%m%d%H%M%S") # Validate the date is within a reasonable range before doing arithmetic # Python datetime supports years 1-9999, but Go pseudo-versions should be recent # Also ensure year > 1 to avoid overflow when subtracting 1 day if dt.year < 1970 or dt.year > 9999: print(f"⚠️ Invalid timestamp year {dt.year} in pseudo-version (timestamp: {timestamp_str})", file=sys.stderr) return None if dt.year == 1: # Special case: year 1 would overflow when subtracting 1 day print(f"⚠️ Invalid timestamp year 1 in pseudo-version (timestamp: {timestamp_str})", file=sys.stderr) return None # Search window: ±1 day around timestamp for efficiency try: since = (dt - timedelta(days=1)).isoformat() until = (dt + timedelta(days=1)).isoformat() except OverflowError as e: print(f"⚠️ Date arithmetic overflow for timestamp {timestamp_str}: {e}", file=sys.stderr) return None except ValueError as e: print(f"⚠️ Invalid timestamp format {timestamp_str}: {e}", file=sys.stderr) return None # Try primary URL and GitHub mirror (if applicable) urls_to_try = [vcs_url] github_mirror = get_github_mirror_url(vcs_url) if github_mirror: urls_to_try.append(github_mirror) git_env = os.environ.copy() git_env.setdefault("GIT_TERMINAL_PROMPT", "0") git_env.setdefault("GIT_ASKPASS", "true") for try_url in urls_to_try: # Determine clone directory based on URL being tried if clone_cache_dir: clone_cache_dir.mkdir(parents=True, exist_ok=True) repo_hash = hashlib.sha256(try_url.encode()).hexdigest()[:16] clone_dir = clone_cache_dir / f"repo_{repo_hash}" else: clone_dir = Path(tempfile.mkdtemp(prefix="pseudo-resolve-")) try: # Clone or update repository if clone_dir.exists() and (clone_dir / 'HEAD').exists(): # Repository already cloned, fetch latest try: subprocess.run( ['git', 'fetch', '--all', '--quiet'], cwd=clone_dir, capture_output=True, check=True, timeout=60, env=git_env, ) except (subprocess.CalledProcessError, subprocess.TimeoutExpired): # Fetch failed, try to use existing clone anyway pass else: # Clone repository (bare clone for efficiency) if clone_dir.exists(): shutil.rmtree(clone_dir) clone_dir.mkdir(parents=True, exist_ok=True) subprocess.run( ['git', 'clone', '--bare', '--quiet', try_url, str(clone_dir)], capture_output=True, check=True, timeout=300, # 5 minute timeout env=git_env, ) # Search for commits matching timestamp and short hash result = subprocess.run( ['git', 'log', '--all', '--format=%H %ct', f'--since={since}', f'--until={until}'], cwd=clone_dir, capture_output=True, text=True, check=True, timeout=30, env=git_env, ) # Find commit with matching short hash prefix for line in result.stdout.strip().splitlines(): if not line: continue parts = line.split() if len(parts) < 2: continue full_hash = parts[0] if full_hash.startswith(short_commit): return full_hash # Commit not found in this repository, try next URL continue except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: # Clone/fetch failed, try next URL if available if not clone_cache_dir and clone_dir.exists(): shutil.rmtree(clone_dir) continue finally: # Clean up temp directory if we created one if not clone_cache_dir and clone_dir.exists(): try: shutil.rmtree(clone_dir) 
except: pass # All URLs failed return None def derive_timestamp_from_version(version: str) -> str: parsed = parse_pseudo_version_tag(version) if parsed: timestamp_str, _ = parsed try: return datetime.strptime(timestamp_str, "%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%SZ") except ValueError: return "1970-01-01T00:00:00Z" return "1970-01-01T00:00:00Z" def _cache_metadata_key(module_path: str, version: str) -> Tuple[str, str]: return (module_path, version) def load_metadata_cache_file() -> None: if not MODULE_METADATA_CACHE_PATH.exists(): return try: data = json.loads(MODULE_METADATA_CACHE_PATH.read_text()) except Exception: return for key, value in data.items(): try: module_path, version = key.split("|||", 1) except ValueError: continue if not isinstance(value, dict): continue MODULE_METADATA_CACHE[_cache_metadata_key(module_path, version)] = { 'vcs_url': value.get('vcs_url', ''), 'commit': value.get('commit', ''), 'timestamp': value.get('timestamp', ''), 'subdir': value.get('subdir', ''), 'ref': value.get('ref', ''), } def save_metadata_cache() -> None: if not MODULE_METADATA_CACHE_DIRTY: return payload = { f"{module}|||{version}": value for (module, version), value in MODULE_METADATA_CACHE.items() } try: MODULE_METADATA_CACHE_PATH.write_text(json.dumps(payload, indent=2, sort_keys=True)) except Exception: pass def update_metadata_cache(module_path: str, version: str, vcs_url: str, commit: str, timestamp: str = "", subdir: str = "", ref: str = "", dirty: bool = True) -> None: global MODULE_METADATA_CACHE_DIRTY key = _cache_metadata_key(module_path, version) value = { 'vcs_url': vcs_url or '', 'commit': commit or '', 'timestamp': timestamp or '', 'subdir': subdir or '', 'ref': ref or '', } if MODULE_METADATA_CACHE.get(key) != value: MODULE_METADATA_CACHE[key] = value if dirty: MODULE_METADATA_CACHE_DIRTY = True def get_cached_metadata(module_path: str, version: str) -> Optional[dict]: entry = MODULE_METADATA_CACHE.get(_cache_metadata_key(module_path, version)) if not entry: return None timestamp = entry.get('timestamp') or derive_timestamp_from_version(version) return { "module_path": module_path, "version": version, "vcs_url": entry.get('vcs_url', ''), "vcs_hash": entry.get('commit', ''), "vcs_ref": entry.get('ref', ''), "timestamp": timestamp, "subdir": entry.get('subdir', ''), } def load_metadata_from_inc(output_dir: Path) -> None: git_inc = output_dir / "go-mod-git.inc" cache_inc = output_dir / "go-mod-cache.inc" sha_to_url: Dict[str, str] = {} if git_inc.exists(): for line in git_inc.read_text().splitlines(): line = line.strip() if not line.startswith('SRC_URI'): continue if '"' not in line: continue content = line.split('"', 1)[1].rsplit('"', 1)[0] parts = [p for p in content.split(';') if p] if not parts: continue url_part = parts[0] dest_sha = None for part in parts[1:]: if part.startswith('destsuffix='): dest = part.split('=', 1)[1] dest_sha = dest.rsplit('/', 1)[-1] break if not dest_sha: continue if url_part.startswith('git://'): url_https = 'https://' + url_part[6:] else: url_https = url_part sha_to_url[dest_sha] = url_https if cache_inc.exists(): text = cache_inc.read_text() marker = "GO_MODULE_CACHE_DATA = '" if marker in text: start = text.index(marker) + len(marker) try: end = text.index("'\n\n", start) except ValueError: end = len(text) try: data = json.loads(text[start:end]) except Exception: data = [] for entry in data: module_path = entry.get('module') version = entry.get('version') sha = entry.get('vcs_hash') commit = entry.get('commit') timestamp = entry.get('timestamp', 
'') subdir = entry.get('subdir', '') ref = entry.get('vcs_ref', '') if not module_path or not version: continue vcs_url = sha_to_url.get(sha, '') if not vcs_url: continue if not _url_allowed_for_module(module_path, vcs_url, version): continue # Skip entries with invalid commit hashes if commit and len(commit) != 40: continue if not timestamp: timestamp = derive_timestamp_from_version(version) update_metadata_cache(module_path, version, vcs_url, commit or '', timestamp, subdir, ref, dirty=False) def load_metadata_from_module_cache_task(output_dir: Path) -> None: legacy_path = output_dir / "module_cache_task.inc" if not legacy_path.exists(): return import ast pattern = re.compile(r'\(\{.*?\}\)', re.DOTALL) text = legacy_path.read_text() for match in pattern.finditer(text): blob = match.group()[1:-1] # strip parentheses try: entry = ast.literal_eval(blob) except Exception: continue module_path = entry.get('module') version = entry.get('version') vcs_url = entry.get('repo_url') or entry.get('url') or '' commit = entry.get('commit') or '' subdir = entry.get('subdir', '') ref = entry.get('ref', '') if not module_path or not version or not vcs_url or not commit: continue if vcs_url.startswith('git://'): vcs_url = 'https://' + vcs_url[6:] if not _url_allowed_for_module(module_path, vcs_url, version): continue timestamp = derive_timestamp_from_version(version) update_metadata_cache(module_path, version, vcs_url, commit, timestamp, subdir, ref, dirty=True) def bootstrap_metadata_cache(output_dir: Optional[Path], skip_inc_files: bool = False, skip_legacy_module_cache: bool = False) -> None: """ Bootstrap metadata cache from multiple sources. Args: output_dir: Recipe output directory (optional in cache-only mode) skip_inc_files: If True, skip loading from .inc files (used with --clean-cache) skip_legacy_module_cache: If True, skip loading legacy module_cache_task.inc metadata """ load_metadata_cache_file() if not skip_inc_files and output_dir is not None: load_metadata_from_inc(output_dir) if not skip_legacy_module_cache and output_dir is not None: load_metadata_from_module_cache_task(output_dir) def _lookup_commit_for_version(vcs_url: str, version: str, preferred_ref: str = "") -> Tuple[Optional[str], Optional[str]]: """ Resolve the git commit for a module version using git ls-remote. Returns: Tuple of (commit, timestamp). Timestamp may be None if unknown. 
""" tag = version.split('+')[0] pseudo_info = parse_pseudo_version_tag(tag) candidate_urls = [vcs_url] if not vcs_url.endswith('.git'): candidate_urls.append(vcs_url.rstrip('/') + '.git') for url in candidate_urls: if preferred_ref: commit = git_ls_remote(url, preferred_ref) if commit: return commit, "1970-01-01T00:00:00Z" if pseudo_info: timestamp_str, short_commit = pseudo_info commit = git_ls_remote(url, short_commit) if commit: timestamp = derive_timestamp_from_version(version) return commit, timestamp else: for ref in (f"refs/tags/{tag}", tag): commit = git_ls_remote(url, ref) if commit: return commit, "1970-01-01T00:00:00Z" if pseudo_info: timestamp_str, short_commit = pseudo_info for url in candidate_urls: commit = resolve_pseudo_version_commit( url, timestamp_str, short_commit, clone_cache_dir=CLONE_CACHE_DIR, ) if commit: timestamp = derive_timestamp_from_version(version) return commit, timestamp if pseudo_info: # Even if we couldn't resolve the commit, return derived timestamp return None, derive_timestamp_from_version(version) return None, None def query_module_via_go_list(module_path: str, version: str) -> Optional[Dict[str, str]]: """Use `go list -m -json` to obtain VCS metadata for a module version.""" env = os.environ.copy() env.setdefault('GOPROXY', 'https://proxy.golang.org') if CURRENT_GOMODCACHE: env['GOMODCACHE'] = CURRENT_GOMODCACHE try: result = subprocess.run( ['go', 'list', '-m', '-json', f'{module_path}@{version}'], capture_output=True, text=True, check=True, env=env, timeout=GO_CMD_TIMEOUT, ) except subprocess.TimeoutExpired: print(f" ⚠️ go list timed out for {module_path}@{version} after {GO_CMD_TIMEOUT}s") return None except subprocess.CalledProcessError: return None try: data = json.loads(result.stdout) except json.JSONDecodeError: return None origin = data.get('Origin') or {} vcs_url = origin.get('URL', '') commit = origin.get('Hash', '') subdir = origin.get('Subdir', '') ref = origin.get('Ref', '') timestamp = data.get('Time') or origin.get('Time') or '' if vcs_url.startswith('git+'): vcs_url = vcs_url[4:] if not vcs_url or not commit: return None return { 'vcs_url': vcs_url, 'commit': commit, 'timestamp': timestamp, 'subdir': subdir or '', 'vcs_ref': ref or '', } def _candidate_gopkg_repos(module_path: str) -> List[str]: """ Generate candidate repository URLs for gopkg.in modules. 
""" if not module_path.startswith("gopkg.in/"): return [] remainder = module_path[len("gopkg.in/"):] if not remainder: return [] parts = remainder.split('/') last = parts[-1] match = re.match(r'(?P.+?)\.v\d+(?:[.\w-]*)?$', last) if not match: return [] repo_name = match.group('name') owner_segments = parts[:-1] owner_variants: List[str] = [] if owner_segments: canonical_owner = '/'.join(owner_segments) owner_variants.append(canonical_owner) # Provide fallbacks with dotted segments replaced dotted_to_hyphen = '/'.join(segment.replace('.', '-') for segment in owner_segments) dotted_to_empty = '/'.join(segment.replace('.', '') for segment in owner_segments) for candidate in (dotted_to_hyphen, dotted_to_empty): if candidate and candidate not in owner_variants: owner_variants.append(candidate) else: # Common conventions used by gopkg.in vanity repos owner_variants.extend([ f"go-{repo_name}", repo_name, f"{repo_name}-go", ]) urls: List[str] = [] seen: Set[str] = set() for owner in owner_variants: owner = owner.strip('/') if not owner: continue candidate = f"https://github.com/{owner}/{repo_name}" if candidate not in seen: seen.add(candidate) urls.append(candidate) return urls def _recalculate_subdir_from_vanity(vcs_url: str, module_parts: List[str], current_subdir: str) -> str: """ Recalculate module subdirectory when a vanity import redirects to a different repository layout. """ if not vcs_url: return current_subdir vcs_repo_name = vcs_url.rstrip('/').split('/')[-1] if vcs_repo_name.endswith('.git'): vcs_repo_name = vcs_repo_name[:-4] repo_boundary_index = None for i, part in enumerate(module_parts): if part == vcs_repo_name or part in vcs_repo_name or vcs_repo_name.endswith(part): repo_boundary_index = i + 1 break if repo_boundary_index is not None and repo_boundary_index < len(module_parts): subdir_parts = module_parts[repo_boundary_index:] if subdir_parts and subdir_parts[-1].startswith('v') and subdir_parts[-1][1:].isdigit(): subdir_parts = subdir_parts[:-1] return '/'.join(subdir_parts) if subdir_parts else '' if len(module_parts) <= 3: return '' return current_subdir def resolve_module_metadata(module_path: str, version: str) -> Optional[dict]: parts = module_path.split('/') vanity_repo = None # Track if module was resolved via vanity URL tag = version.split('+')[0] pseudo_info = parse_pseudo_version_tag(tag) expected_commit_prefix = pseudo_info[1] if pseudo_info else None cached = get_cached_metadata(module_path, version) if cached: override_urls = repo_override_candidates(module_path, version) if expected_commit_prefix: cached_commit = cached.get('vcs_hash') or '' if cached_commit and not cached_commit.startswith(expected_commit_prefix): cached = None if cached and override_urls: url = cached.get('vcs_url') or '' if url and url not in override_urls: cached = None if cached and not expected_commit_prefix: ref_hint = cached.get('vcs_ref', '') commit_check, _ = _lookup_commit_for_version(cached.get('vcs_url', ''), version, ref_hint) if not commit_check or commit_check.lower() != (cached.get('vcs_hash', '') or '').lower(): cached = None def fetch_go_metadata() -> Optional[Dict[str, str]]: info = query_module_via_go_list(module_path, version) if info: return info if go_mod_download(module_path, version): return query_module_via_go_list(module_path, version) return None def resolve_with_go_info(go_info: Optional[Dict[str, str]], fallback_url: str, fallback_subdir: str) -> Optional[dict]: if not go_info: return None candidate_urls: List[str] = [] overrides = 
repo_override_candidates(module_path, version) candidate_urls.extend(overrides) info_url = (go_info.get('vcs_url') or '').strip() if info_url and info_url not in candidate_urls: candidate_urls.append(info_url) if fallback_url and fallback_url not in candidate_urls: candidate_urls.append(fallback_url) timestamp_hint = go_info.get('timestamp') or derive_timestamp_from_version(version) subdir_hint = go_info.get('subdir', '') or fallback_subdir ref_hint = go_info.get('vcs_ref', '') for candidate in candidate_urls: if not _url_allowed_for_module(module_path, candidate, version): continue commit_candidate, timestamp_candidate = _lookup_commit_for_version(candidate, version, ref_hint) if commit_candidate: final_timestamp = timestamp_candidate or timestamp_hint update_metadata_cache( module_path, version, candidate, commit_candidate, final_timestamp, subdir_hint, ref_hint, dirty=True, ) return { "module_path": module_path, "version": version, "vcs_url": candidate, "vcs_hash": commit_candidate, "vcs_ref": ref_hint, "timestamp": final_timestamp, "subdir": subdir_hint, } return None # Handle gopkg.in special case if parts[0] == 'gopkg.in': repo_candidates: List[str] = [] vanity_repo = query_vanity_url(module_path) if vanity_repo: repo_candidates.append(vanity_repo) repo_candidates.extend(_candidate_gopkg_repos(module_path)) if cached and cached.get('vcs_url'): repo_candidates.insert(0, cached['vcs_url']) for vcs_url in repo_candidates: if not vcs_url: continue commit, timestamp = _lookup_commit_for_version(vcs_url, version) if commit: resolved_timestamp = timestamp or derive_timestamp_from_version(version) update_metadata_cache(module_path, version, vcs_url, commit, resolved_timestamp, '', '', dirty=True) return { "module_path": module_path, "version": version, "vcs_url": vcs_url, "vcs_hash": commit, "vcs_ref": "", "timestamp": resolved_timestamp, "subdir": "", } go_info = fetch_go_metadata() result = resolve_with_go_info(go_info, '', '') if result: return result if cached: return cached print(f" ⚠️ Unable to derive repository for gopkg.in path {module_path}@{version}") return None if len(parts) < 3: go_info = fetch_go_metadata() result = resolve_with_go_info(go_info, '', '') if result: return result vanity_repo = query_vanity_url(module_path) if vanity_repo: commit, timestamp = _lookup_commit_for_version(vanity_repo, version) if commit: resolved_timestamp = timestamp or derive_timestamp_from_version(version) update_metadata_cache(module_path, version, vanity_repo, commit, resolved_timestamp, '', '', dirty=True) return { "module_path": module_path, "version": version, "vcs_url": vanity_repo, "vcs_hash": commit, "vcs_ref": "", "timestamp": resolved_timestamp, "subdir": '', } if cached: return cached print(f" ⚠️ Unable to derive repository for {module_path}@{version}") return None else: # Default calculation assuming 3-part paths (domain/org/repo) base_repo = '/'.join(parts[:3]) # Calculate subdir from module path, but strip version suffixes (v2, v3, v11, etc.) 
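# --- Illustrative sketch: default repo/subdir split ---------------------------
# Shows the domain/org/repo heuristic used in this branch with made-up paths;
# `_default_repo_and_subdir` is a hypothetical helper name. The repository is
# assumed to be the first three path segments and anything after that becomes
# the subdir, minus a trailing /vN major-version suffix.
def _default_repo_and_subdir(module_path: str) -> tuple:
    """
    >>> _default_repo_and_subdir("github.com/foo/bar/pkg/client/v2")
    ('github.com/foo/bar', 'pkg/client')
    >>> _default_repo_and_subdir("github.com/foo/bar/v3")
    ('github.com/foo/bar', '')
    """
    parts = module_path.split('/')
    base_repo = '/'.join(parts[:3])
    subdir_parts = parts[3:]
    if subdir_parts and subdir_parts[-1].startswith('v') and subdir_parts[-1][1:].isdigit():
        subdir_parts = subdir_parts[:-1]
    return base_repo, '/'.join(subdir_parts)
# ------------------------------------------------------------------------------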
if len(parts) > 3: subdir_parts = parts[3:] # Remove trailing version suffix if present (e.g., v2, v3, v11) if subdir_parts and subdir_parts[-1].startswith('v') and subdir_parts[-1][1:].isdigit(): subdir_parts = subdir_parts[:-1] subdir = '/'.join(subdir_parts) if subdir_parts else '' else: subdir = '' override_candidate = None override_urls = repo_override_candidates(module_path, version) if override_urls: override_candidate = override_urls[0] if override_candidate: vcs_url = override_candidate elif parts[0] == 'golang.org' and len(parts) >= 3 and parts[1] == 'x': pkg_name = parts[2] vcs_url = f"https://go.googlesource.com/{pkg_name}" elif parts[0] == 'github.com' and len(parts) >= 3: vcs_url = f"https://{base_repo}" else: vanity_repo = query_vanity_url(module_path) if vanity_repo: vcs_url = vanity_repo subdir = _recalculate_subdir_from_vanity(vcs_url, parts, subdir) else: vcs_url = f"https://{base_repo}" if cached and cached.get('vcs_url') and cached.get('vcs_hash'): if vanity_repo: adjusted_subdir = _recalculate_subdir_from_vanity( cached['vcs_url'], parts, cached.get('subdir', ''), ) if adjusted_subdir != cached.get('subdir', ''): cached['subdir'] = adjusted_subdir update_metadata_cache( module_path, version, cached['vcs_url'], cached['vcs_hash'], cached['timestamp'], adjusted_subdir, cached.get('vcs_ref', ''), dirty=True, ) return cached commit, timestamp = _lookup_commit_for_version(vcs_url, version) if not commit: go_info = fetch_go_metadata() result = resolve_with_go_info(go_info, vcs_url, subdir) if result: return result FAILED_MODULE_PATHS.add(module_path) _record_skipped_module(module_path, version, "no repository metadata from go.sum/go list") print(f" ⚠️ Unable to derive repository for {module_path}@{version}") if cached and cached.get('vcs_hash'): return cached return None if not _url_allowed_for_module(module_path, vcs_url, version): FAILED_MODULE_PATHS.add(module_path) _record_skipped_module(module_path, version, "resolved repo not allowed by override policy") print(f" ⚠️ Resolved repo {vcs_url} for {module_path}@{version} not in override allowlist") if cached and cached.get('vcs_hash'): return cached return None resolved_timestamp = timestamp or derive_timestamp_from_version(version) update_metadata_cache(module_path, version, vcs_url, commit, resolved_timestamp, subdir, '', dirty=True) return { "module_path": module_path, "version": version, "vcs_url": vcs_url, "vcs_hash": commit, "vcs_ref": "", "timestamp": resolved_timestamp, "subdir": subdir, } # ============================================================================= # Utility Functions # ============================================================================= def unescape_module_path(path: str) -> str: """ Unescape Go module paths that use ! for uppercase letters. Example: github.com/!sirupsen/logrus -> github.com/Sirupsen/logrus """ import re return re.sub(r'!([a-z])', lambda m: m.group(1).upper(), path) def escape_module_path(path: str) -> str: """ Escape Go module paths by converting uppercase to !lowercase. Example: github.com/Sirupsen/logrus -> github.com/!sirupsen/logrus """ import re return re.sub(r'([A-Z])', lambda m: '!' + m.group(1).lower(), path) # ============================================================================= # Phase 1: Discovery # ============================================================================= def parse_go_mod_requires(go_mod_path: Path) -> List[tuple]: """ Extract ALL module requirements from go.mod (direct + indirect). 
This replaces the need for fast-fix-module.py by discovering all transitive dependencies that Go needs. Returns list of (module_path, version) tuples. """ modules = [] if not go_mod_path.exists(): print(f"Warning: go.mod not found at {go_mod_path}") return modules in_require = False try: with open(go_mod_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() # Start of require block if line.startswith('require ('): in_require = True continue # End of require block if in_require and line == ')': in_require = False continue # Single-line require if line.startswith('require ') and '(' not in line: parts = line.split() if len(parts) >= 3: # require module version module = parts[1] version = parts[2] modules.append((module, version)) continue # Multi-line require block entry if in_require and line: # Skip comments if line.startswith('//'): continue # Parse: "module version // indirect" or just "module version" parts = line.split() if len(parts) >= 2: module = parts[0] version = parts[1] modules.append((module, version)) except Exception as e: print(f"Error parsing go.mod: {e}") return modules def download_all_required_modules(source_dir: Path, gomodcache: Path) -> None: """ Download ALL modules required by go.mod (direct + indirect). This ensures that indirect/transitive dependencies have .info files in the GOMODCACHE, which allows discover_modules() to find them. This is the key to replacing fast-fix-module.py - by downloading everything upfront, we make all modules discoverable. """ go_mod_path = source_dir / "go.mod" print(f"\n" + "=" * 70) print("DISCOVERY ENHANCEMENT: Downloading all required modules") print("=" * 70) print(f"Parsing {go_mod_path}...") required_modules = parse_go_mod_requires(go_mod_path) if not required_modules: print("Warning: No modules found in go.mod") return print(f"Found {len(required_modules)} total modules in go.mod (direct + indirect)") # Set up environment for Go env = os.environ.copy() env['GOMODCACHE'] = str(gomodcache) env['GOPROXY'] = 'https://proxy.golang.org' # Download each module to ensure .info files exist success_count = 0 skip_count = 0 fail_count = 0 for module_path, version in required_modules: # Check if .info file already exists escaped_module = escape_module_path(module_path) escaped_version = escape_module_path(version) info_path = gomodcache / "cache" / "download" / escaped_module / "@v" / f"{escaped_version}.info" if info_path.exists(): skip_count += 1 continue # Download to get .info file with VCS metadata try: result = subprocess.run( ['go', 'mod', 'download', f'{module_path}@{version}'], cwd=source_dir, env=env, capture_output=True, text=True, timeout=30 ) if result.returncode == 0: success_count += 1 else: fail_count += 1 if "no matching versions" not in result.stderr: print(f" Warning: Failed to download {module_path}@{version}: {result.stderr.strip()[:100]}") except subprocess.TimeoutExpired: fail_count += 1 print(f" Warning: Timeout downloading {module_path}@{version}") except Exception as e: fail_count += 1 print(f" Warning: Error downloading {module_path}@{version}: {e}") print(f"\nDownload results:") print(f" ✓ {success_count} modules downloaded") print(f" ⊙ {skip_count} modules already cached") print(f" ✗ {fail_count} modules failed") print(f" → Total: {len(required_modules)} modules") def discover_modules(source_dir: Path, gomodcache: Optional[str] = None) -> List[Dict]: """ Phase 1: Discovery Let Go download modules to discover correct paths and metadata. This is ONLY for discovery - we build from git sources. 
Returns list of modules with: - module_path: CORRECT path from filesystem (no /v3 stripping!) - version: Module version - vcs_url: Git repository URL - vcs_hash: Git commit hash - vcs_ref: Git reference (tag/branch) - timestamp: Commit timestamp - subdir: Subdirectory within repo (for submodules) """ global CURRENT_GOMODCACHE print("\n" + "=" * 70) print("PHASE 1: DISCOVERY - Using Go to discover module metadata") print("=" * 70) # Create temporary or use provided GOMODCACHE if gomodcache: temp_cache = Path(gomodcache) print(f"Using existing GOMODCACHE: {temp_cache}") cleanup_cache = False else: temp_cache = Path(tempfile.mkdtemp(prefix="go-discover-")) print(f"Created temporary cache: {temp_cache}") cleanup_cache = True CURRENT_GOMODCACHE = str(temp_cache) try: ensure_path_is_writable(temp_cache) # Set up environment for Go env = os.environ.copy() env['GOMODCACHE'] = str(temp_cache) env['GOPROXY'] = 'https://proxy.golang.org' print(f"\nDownloading modules to discover metadata...") print(f"Source: {source_dir}") # Let Go download everything (initial discovery) result = subprocess.run( ['go', 'mod', 'download'], cwd=source_dir, env=env, capture_output=True, text=True ) if result.returncode != 0: print(f"Warning: go mod download had errors:\n{result.stderr}") # Continue anyway - some modules may have been downloaded # PRIORITY #2 FIX: Download ALL modules from go.mod (direct + indirect) # This replaces the need for fast-fix-module.py by ensuring all # transitive dependencies have .info files for discovery download_all_required_modules(source_dir, temp_cache) # Walk filesystem to discover what Go created modules = [] download_dir = temp_cache / "cache" / "download" if not download_dir.exists(): print(f"Error: Download directory not found: {download_dir}") return [] print(f"\nScanning {download_dir} for modules...") for dirpath, _, filenames in os.walk(download_dir): path_parts = Path(dirpath).relative_to(download_dir).parts # Look for @v directories if not path_parts or path_parts[-1] != '@v': continue # Module path is everything before @v module_path = '/'.join(path_parts[:-1]) module_path = unescape_module_path(module_path) # Unescape !-encoding # Process each .info file for filename in filenames: if not filename.endswith('.info'): continue version = filename[:-5] # Strip .info extension info_path = Path(dirpath) / filename try: # Read metadata from .info file with open(info_path) as f: info = json.load(f) # Extract VCS information origin = info.get('Origin', {}) vcs_url = origin.get('URL') vcs_hash = origin.get('Hash') vcs_ref = origin.get('Ref', '') subdir = origin.get('Subdir', '') if not vcs_url or not vcs_hash: # Try to refresh cache entry and ask Go directly for metadata. go_mod_download(module_path, version) # Reload .info in case go mod download updated it. 
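# --- Illustrative sketch: the Origin block read from a .info file -------------
# The concrete values below are made up; the JSON shape (URL / Hash / Ref /
# Subdir under "Origin", plus "Time") is what the surrounding discovery code
# expects. `_read_origin` is a hypothetical helper name for this sketch only;
# `json` is already imported at module level.
def _read_origin(info_json: str) -> tuple:
    """Return (vcs_url, vcs_hash, vcs_ref, subdir) from a .info JSON document.

    >>> sample = ('{"Version":"v1.2.3","Time":"2024-01-02T03:04:05Z",'
    ...           '"Origin":{"VCS":"git","URL":"https://github.com/org/repo",'
    ...           '"Hash":"0123456789abcdef0123456789abcdef01234567",'
    ...           '"Ref":"refs/tags/v1.2.3"}}')
    >>> _read_origin(sample)
    ('https://github.com/org/repo', '0123456789abcdef0123456789abcdef01234567', 'refs/tags/v1.2.3', '')
    """
    info = json.loads(info_json)
    origin = info.get('Origin', {})
    return (origin.get('URL', ''), origin.get('Hash', ''),
            origin.get('Ref', ''), origin.get('Subdir', ''))
# ------------------------------------------------------------------------------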
try: with open(info_path) as f: info = json.load(f) origin = info.get('Origin', {}) vcs_url = origin.get('URL') vcs_hash = origin.get('Hash') vcs_ref = origin.get('Ref', '') subdir = origin.get('Subdir', '') except Exception: pass if not vcs_url or not vcs_hash: go_info = query_module_via_go_list(module_path, version) if go_info: vcs_url = go_info.get('vcs_url') vcs_hash = go_info.get('commit') subdir = go_info.get('subdir', subdir) origin_time = go_info.get('timestamp', '') if origin_time: info['Time'] = origin_time if not vcs_url or not vcs_hash: print(f" ⚠️ Skipping {module_path}@{version}: No VCS info") continue overrides = repo_override_candidates(module_path, version) if overrides: vcs_url = overrides[0] # BitBake requires full 40-character commit hashes if len(vcs_hash) != 40: print(f" ⚠️ Skipping {module_path}@{version}: Short commit hash ({vcs_hash})") continue # PROACTIVE dangling commit detection and correction # Check if commit is BitBake-fetchable BEFORE expensive verification # BitBake's nobranch=1 requires commits to be branch/tag HEADs, not dangling commits if VERIFY_ENABLED and vcs_ref and vcs_ref.startswith("refs/"): if not is_commit_bitbake_fetchable(vcs_url, vcs_hash, vcs_ref): print(f" ⚠️ DANGLING COMMIT: {module_path}@{version} commit {vcs_hash[:12]} not a branch/tag HEAD") # Try to correct by dereferencing the ref corrected_hash = correct_commit_hash_from_ref(vcs_url, vcs_hash, vcs_ref) if corrected_hash: print(f" ✓ Corrected hash by dereferencing {vcs_ref}: {vcs_hash[:12]} → {corrected_hash[:12]}") vcs_hash = corrected_hash else: print(f" ❌ Could not auto-correct dangling commit") # Continue anyway - verification will catch if it's truly unfetchable # Validate commit exists in repository (detect force-pushed tags) # If verification is enabled, check that the commit from .info file # actually exists in the repository. If not, refresh from Go proxy. commit_verified = VERIFY_ENABLED and verify_commit_accessible(vcs_url, vcs_hash, vcs_ref, version, origin_time) # Apply fallback commit if verification used one (for orphaned commits) if commit_verified and VERIFY_ENABLED: vcs_hash = get_actual_commit(vcs_url, vcs_hash) if VERIFY_ENABLED and not commit_verified: print(f" ⚠️ STALE CACHE: {module_path}@{version} commit {vcs_hash[:12]} not found in {vcs_url}") # Last resort: Try proxy refresh (this shouldn't happen if dangling check worked) corrected_hash = correct_commit_hash_from_ref(vcs_url, vcs_hash, vcs_ref) if corrected_hash: print(f" ✓ Corrected hash by dereferencing {vcs_ref}: {vcs_hash[:12]} → {corrected_hash[:12]}") vcs_hash = corrected_hash # Verify the corrected hash is accessible if verify_commit_accessible(vcs_url, vcs_hash, vcs_ref, version, origin_time): # Successfully corrected! 
Continue with this module (skip proxy refresh) commit_verified = True else: print(f" ❌ Even corrected commit not accessible") # If still not verified after correction attempt, try proxy refresh if not commit_verified: # Check if module is actually needed before attempting refresh if not is_module_actually_needed(module_path, CURRENT_SOURCE_DIR): print(f" ℹ️ Module not needed by main module (indirect-only), skipping") print(f" (Verified via 'go mod why {module_path}')") continue print(f" Attempting to refresh from Go proxy...") # Delete stale .info file to force re-download try: info_path.unlink() print(f" Deleted stale .info file") except Exception as e: print(f" Warning: Could not delete .info file: {e}") # Re-download from Go proxy to get current commit try: go_mod_download(module_path, version) # Reload .info file with fresh data if info_path.exists(): with open(info_path) as f: info = json.load(f) origin = info.get('Origin', {}) new_vcs_hash = origin.get('Hash') if new_vcs_hash and new_vcs_hash != vcs_hash: print(f" ✓ Refreshed: {vcs_hash[:12]} → {new_vcs_hash[:12]}") vcs_hash = new_vcs_hash vcs_ref = origin.get('Ref', vcs_ref) # Verify new commit exists if not verify_commit_accessible(vcs_url, vcs_hash, vcs_ref, version, origin.get('Time', '')): print(f" ❌ Even refreshed commit not accessible") # Last resort: check if it's actually needed if not is_module_actually_needed(module_path, CURRENT_SOURCE_DIR): print(f" ℹ️ Module not needed anyway, skipping") continue else: print(f" ❌ Module IS needed but commit unavailable") print(f" This module cannot be built from git sources") continue else: print(f" ⚠️ Go proxy returned same commit (permanently deleted)") # Check if it's actually needed if not is_module_actually_needed(module_path, CURRENT_SOURCE_DIR): print(f" ℹ️ Module not needed by main module, skipping") continue else: print(f" ❌ Module IS needed but commit permanently deleted") print(f" Consider using gomod:// fetcher for this module") continue else: print(f" ❌ Re-download failed, skipping module") continue except Exception as e: print(f" ❌ Refresh failed: {e}") continue DOWNLOADED_MODULES.add((module_path, version)) modules.append({ 'module_path': module_path, 'version': version, 'vcs_url': vcs_url, 'vcs_hash': vcs_hash, 'vcs_ref': vcs_ref, 'timestamp': info.get('Time', ''), 'subdir': subdir or '', }) print(f" ✓ {module_path}@{version}") except Exception as e: print(f" ✗ Error processing {info_path}: {e}") continue print(f"\nDiscovered {len(modules)} modules with VCS info") # FIX: Synthesize entries for +incompatible versions that lack VCS data # These are pre-v2 versions of modules that later adopted semantic import versioning (/v2, /v3, etc.) # The GOMODCACHE has .info files for them but without Origin data (old proxy cache) # Strategy: For each versioned module path (e.g., foo/v3), check if a base path version # with +incompatible exists in GOMODCACHE and lacks VCS data. If so, synthesize an entry. # # NOTE (2025-11-28): This code overlaps with Fix #29 in extract-native-modules.py, which # now uses derive_vcs_info() to handle +incompatible modules at discovery time. Fix #29 # is more complete because it handles ALL +incompatible modules directly from their path, # not just those with a corresponding /vN version. This code is kept as a fallback for # cases where extract-native-modules.py wasn't used (e.g., legacy workflows). 
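# --- Illustrative sketch: +incompatible synthesis inputs ----------------------
# Mirrors the synthesis step below with made-up values; `_incompatible_tag_and_base`
# is a hypothetical helper name. A /vN module path donates its VCS URL to the
# pre-v2 base path, and the +incompatible version maps to a plain release tag
# that can be resolved with git ls-remote.
def _incompatible_tag_and_base(versioned_path: str, version: str) -> tuple:
    """
    >>> _incompatible_tag_and_base("github.com/foo/bar/v3", "v2.16.0+incompatible")
    ('github.com/foo/bar', 'refs/tags/v2.16.0')
    """
    base_path = versioned_path[:versioned_path.rfind('/v')]
    tag = version.replace('+incompatible', '')
    return base_path, f"refs/tags/{tag}"
# ------------------------------------------------------------------------------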
print("\nSynthesizing entries for +incompatible versions without VCS data...") synthesized_count = 0 # Build a map of module_path -> vcs_url for discovered modules module_vcs_map: Dict[str, str] = {} for mod in modules: module_vcs_map[mod['module_path']] = mod['vcs_url'] # For each module with a versioned path suffix (/v2, /v3, etc.), check for base path incompatible versions for mod in list(modules): # Iterate over copy since we'll append to modules module_path = mod['module_path'] vcs_url = mod['vcs_url'] # Check if this module has a version suffix (/v2, /v3, etc.) version_match = re.search(r'/v(\d+)$', module_path) if not version_match: continue # Extract base path (without /vN suffix) base_path = module_path[:module_path.rfind('/v')] # Check if we already discovered the base path if base_path in module_vcs_map: continue # Base path already has VCS data, no synthesis needed # Look for +incompatible versions of the base path in GOMODCACHE # Note: GOMODCACHE uses raw paths as directory names (not escaped) base_path_dir = download_dir / base_path / '@v' if not base_path_dir.exists(): continue # Scan for .info files with +incompatible versions for info_file in base_path_dir.glob('*.info'): version = info_file.stem if not version.endswith('+incompatible'): continue # Read the .info file to check if it lacks VCS data try: with open(info_file) as f: info = json.load(f) # If it already has Origin data, skip it if 'Origin' in info and info['Origin'].get('URL') and info['Origin'].get('Hash'): continue # This +incompatible version lacks VCS data - synthesize an entry # Extract the tag name from version (e.g., v2.16.0+incompatible -> v2.16.0) tag_version = version.replace('+incompatible', '') tag_ref = f"refs/tags/{tag_version}" # Use git ls-remote to find the commit for this tag tag_commit = git_ls_remote(vcs_url, tag_ref) if not tag_commit: print(f" ⚠️ Could not find tag {tag_ref} for {base_path}@{version}") continue # Synthesize a module entry using data from the versioned path synthesized_module = { 'module_path': base_path, # Use BASE path (without /vN) 'version': version, 'vcs_url': vcs_url, 'vcs_hash': tag_commit, 'vcs_ref': tag_ref, 'timestamp': info.get('Time', ''), 'subdir': '', } modules.append(synthesized_module) module_vcs_map[base_path] = vcs_url # Prevent duplicate synthesis synthesized_count += 1 print(f" ✓ Synthesized {base_path}@{version} (from {module_path} VCS data)") print(f" VCS: {vcs_url}") print(f" Commit: {tag_commit[:12]} (tag {tag_version})") except Exception as e: print(f" ⚠️ Error synthesizing {base_path}@{version}: {e}") continue if synthesized_count > 0: print(f"\nSynthesized {synthesized_count} +incompatible module entries") else: print("No +incompatible versions needed synthesis") print(f"\nTotal modules after synthesis: {len(modules)}") return modules finally: # Defer cleanup of temporary caches until the end of execution if cleanup_cache and temp_cache.exists(): TEMP_GOMODCACHES.append(temp_cache) # ============================================================================= # Phase 2: Recipe Generation # ============================================================================= def generate_recipe(modules: List[Dict], source_dir: Path, output_dir: Optional[Path], git_repo: str, git_ref: str, validate_only: bool = False, debug_limit: Optional[int] = None, skip_verify: bool = False, verify_jobs: int = 10) -> bool: """ Phase 2: Recipe Generation Generate BitBake recipe with git:// SRC_URI entries. 
def generate_recipe(modules: List[Dict], source_dir: Path, output_dir: Optional[Path],
                    git_repo: str, git_ref: str, validate_only: bool = False,
                    debug_limit: Optional[int] = None, skip_verify: bool = False,
                    verify_jobs: int = 10) -> bool:
    """
    Phase 2: Recipe Generation

    Generate BitBake recipe with git:// SRC_URI entries.
    No file:// entries - we'll build cache from git during do_create_module_cache.

    Creates:
    - go-mod-git.inc: SRC_URI with git:// entries
    - go-mod-cache.inc: BitBake task to build module cache
    """
    print("\n" + "=" * 70)
    phase_label = "VALIDATION" if validate_only else "RECIPE GENERATION"
    print(f"PHASE 2: {phase_label} - {('commit verification' if validate_only else 'Creating BitBake recipe files')}")
    print("=" * 70)

    src_uri_entries = []
    modules_data = []

    vcs_repos: Dict[str, Dict] = {}

    def repo_key_for_url(url: str) -> str:
        return hashlib.sha256(f"git3:{url}".encode()).hexdigest()

    def commit_cache_key(repo_key: str, commit: str) -> str:
        return hashlib.sha256(f"{repo_key}:{commit}".encode()).hexdigest()

    unresolved_commits: List[Tuple[str, str, str, str, str]] = []

    total_modules = len(modules)
    if debug_limit is not None:
        print(f"\n⚙️ Debug limit active: validating first {debug_limit} modules (total list size {total_modules})")
    if skip_verify:
        print(f"\n⚙️ Skipping verification (--skip-verify enabled)")

    # First pass: Build repo structure without verification
    for index, module in enumerate(modules, start=1):
        vcs_url = module['vcs_url']
        commit_hash = module['vcs_hash']

        repo_key = repo_key_for_url(vcs_url)
        repo_info = vcs_repos.setdefault(
            repo_key,
            {
                'url': vcs_url,
                'commits': {},  # commit hash -> commit metadata
            },
        )

        if commit_hash not in repo_info['commits']:
            commit_sha = commit_cache_key(repo_key, commit_hash)
            repo_info['commits'][commit_hash] = {
                'commit_sha': commit_sha,
                'modules': [],
            }
        else:
            commit_sha = repo_info['commits'][commit_hash]['commit_sha']

        ref_hint = module.get('vcs_ref', '')
        if ref_hint and not _ref_points_to_commit(vcs_url, ref_hint, commit_hash):
            ref_hint = ''

        entry = repo_info['commits'][commit_hash]
        entry['modules'].append(module)
        if ref_hint:
            entry['ref_hint'] = ref_hint

        module['repo_key'] = repo_key
        module['commit_sha'] = commit_sha
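    # After the first pass, vcs_repos is keyed by a sha256 of the repository URL and
    # looks roughly like this (illustrative sketch, hypothetical values):
    #
    #   {
    #     '<repo_key>': {
    #         'url': 'https://github.com/example/project',
    #         'commits': {
    #             '<40-char commit sha>': {
    #                 'commit_sha': '<sha256(repo_key:commit)>',
    #                 'modules': [<module dicts sharing this commit>],
    #                 'ref_hint': 'refs/tags/v1.9.3',  # only when the ref still points at the commit
    #             },
    #         },
    #     },
    #   }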
    # Second pass: Verify commits (parallel or sequential) with auto-correction
    # PHASE MERGE: This now includes force-pushed tag detection and auto-correction
    global VERIFY_CORRECTIONS_APPLIED

    if not skip_verify:
        print(f"\n⚙️ Verifying {total_modules} commits with {verify_jobs} parallel jobs")
        corrected_modules = []  # Track corrections for reporting

        def verify_module(module_info):
            index, module = module_info
            vcs_url = module['vcs_url']
            commit_hash = module['vcs_hash']
            ref_hint = module.get('vcs_ref', '')

            print(f" • verifying [{index}/{total_modules}] {module['module_path']}@{module['version']} -> {commit_hash[:12]}")

            # Verify commit is accessible
            if not verify_commit_accessible(vcs_url, commit_hash, ref_hint, module.get('version', ''), module.get('timestamp', '')):
                # PHASE MERGE: If verification fails and we have a ref, try auto-correction
                if ref_hint and ref_hint.startswith("refs/"):
                    corrected_hash = correct_commit_hash_from_ref(vcs_url, commit_hash, ref_hint)
                    if corrected_hash and corrected_hash != commit_hash:
                        print(f" ✓ Auto-corrected: {commit_hash[:12]} → {corrected_hash[:12]} (force-pushed tag)")
                        module['vcs_hash'] = corrected_hash
                        # Update repo_info dict to use the new hash as key
                        repo_key = module['repo_key']
                        if commit_hash in vcs_repos[repo_key]['commits']:
                            # Move the entry from old hash to new hash
                            vcs_repos[repo_key]['commits'][corrected_hash] = vcs_repos[repo_key]['commits'].pop(commit_hash)
                        return ('corrected', module['module_path'], module['version'], commit_hash, corrected_hash)
                    else:
                        # Could not correct - treat as failure
                        return ('failed', module['module_path'], module['version'], commit_hash, vcs_url, ref_hint)
                else:
                    # No ref to dereference - genuine failure
                    return ('failed', module['module_path'], module['version'], commit_hash, vcs_url, ref_hint)
            else:
                # Verification succeeded - apply fallback commit if one was used
                actual_hash = get_actual_commit(vcs_url, commit_hash)
                if actual_hash != commit_hash:
                    print(f" ✓ Applied fallback: {commit_hash[:12]} → {actual_hash[:12]} (orphaned commit)")
                    module['vcs_hash'] = actual_hash
                    # Update repo_info dict to use the new hash as key
                    repo_key = module['repo_key']
                    if commit_hash in vcs_repos[repo_key]['commits']:
                        # Move the entry from old hash to new hash
                        vcs_repos[repo_key]['commits'][actual_hash] = vcs_repos[repo_key]['commits'].pop(commit_hash)
                    return ('corrected', module['module_path'], module['version'], commit_hash, actual_hash)

            return None

        if verify_jobs > 0:
            # Parallel verification
            with concurrent.futures.ThreadPoolExecutor(max_workers=verify_jobs) as executor:
                results = list(executor.map(verify_module, enumerate(modules, start=1)))
        else:
            # Sequential verification (--verify-jobs=0)
            results = []
            for index, module in enumerate(modules, start=1):
                result = verify_module((index, module))
                if result is not None:
                    results.append(result)

                # Save verification cache every 50 modules
                if index % 50 == 0:
                    save_verify_commit_cache(force=True)
                    print(f" 💾 Saved verification cache at {index}/{total_modules}")

        # Separate corrected vs failed results
        corrected_results = [r for r in results if r and r[0] == 'corrected']
        failed_results = [r for r in results if r and r[0] == 'failed']

        # Apply corrections back to modules list (needed for parallel execution)
        if corrected_results:
            VERIFY_CORRECTIONS_APPLIED = True
            print(f"\n✓ Auto-corrected {len(corrected_results)} force-pushed tags:")
            for _, module_path, version, old_hash, new_hash in corrected_results:
                print(f" • {module_path}@{version}: {old_hash[:12]} → {new_hash[:12]}")
                # Find and update the module in the main list
                for module in modules:
                    if module['module_path'] == module_path and module['version'] == version:
                        module['vcs_hash'] = new_hash
                        # Also update the vcs_repos dict
                        repo_key = module['repo_key']
                        if old_hash in vcs_repos[repo_key]['commits']:
                            vcs_repos[repo_key]['commits'][new_hash] = vcs_repos[repo_key]['commits'].pop(old_hash)
                        break
    else:
        # Verification skipped - no failed results
        failed_results = []

    print(f"\nFound {len(vcs_repos)} unique git repositories")
    print(f"Supporting {len(modules)} modules")

    if failed_results:
        print("\n❌ Unable to verify the following module commits against their repositories:")
        for _, module_path, version, commit_hash, vcs_url, ref_hint in failed_results:
            print(f" - {module_path}@{version} ({commit_hash})")
            hint = f" {ref_hint}" if ref_hint else ""
            print(f" try: git fetch --depth=1 {vcs_url}{hint} {commit_hash}")
            print(f" cache: mark reachable via --inject-commit '{vcs_url} {commit_hash}'")
            print(f" repo : override via --set-repo {module_path}@{version} {vcs_url}")
        print("Aborting to prevent emitting invalid SRCREVs.")
        return False

    if validate_only:
        print("\n✅ Validation complete - all commits are reachable upstream")
        return True

    if output_dir is None:
        print("❌ Internal error: output directory missing for recipe generation")
        return False
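    # The loop below emits one SRC_URI entry per unique (repository, commit) pair.
    # An illustrative (hypothetical) entry in go-mod-git.inc looks like:
    #
    #   SRC_URI += "git://github.com/example/project;protocol=https;nobranch=1;shallow=1;rev=<40-char sha>;name=git_1a2b3c4d_0;destsuffix=vcs_cache/<commit cache key>"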
    # Generate SRC_URI entries for each repo/commit combination
    for repo_key, repo_info in vcs_repos.items():
        git_url = repo_info['url']

        if git_url.startswith('https://'):
            git_url_bb = 'git://' + git_url[8:]
            protocol = 'https'
        elif git_url.startswith('http://'):
            git_url_bb = 'git://' + git_url[7:]
            protocol = 'http'
        else:
            git_url_bb = git_url
            protocol = 'https'

        for idx, (commit_hash, commit_info) in enumerate(sorted(repo_info['commits'].items())):
            fetch_name = f"git_{repo_key[:8]}_{idx}"
            destsuffix = f"vcs_cache/{commit_info['commit_sha']}"

            # Use branch name from ref_hint when available (more reliable than nobranch=1)
            # ref_hint is like "refs/tags/v1.9.3" or "refs/heads/main"
            ref_hint = commit_info.get('ref_hint', '')
            if ref_hint:
                shallow_param = ';shallow=1'
                # For tags, use nobranch=1 since the commit may not be on a branch head
                # For branches, use the branch name directly
                if ref_hint.startswith('refs/tags/'):
                    # Tags: BitBake can fetch tagged commits with nobranch=1
                    branch_param = ';nobranch=1'
                elif ref_hint.startswith('refs/heads/'):
                    # Branches: use the actual branch name
                    branch_name = ref_hint[11:]  # Strip "refs/heads/"
                    branch_param = f';branch={branch_name}'
                else:
                    branch_param = ';nobranch=1'
            else:
                # For pseudo-versions (no ref_hint), check if we detected a branch
                detected_branch = VERIFY_DETECTED_BRANCHES.get((git_url, commit_hash))
                if detected_branch:
                    # Use the detected branch name instead of nobranch=1
                    shallow_param = ''
                    branch_param = f';branch={detected_branch}'
                    print(f" Using detected branch: {detected_branch} for {commit_hash[:12]}")
                else:
                    # No ref and no detected branch - use nobranch=1
                    # This should only happen for genuine orphaned commits that couldn't be fixed
                    shallow_param = ''
                    branch_param = ';nobranch=1'

            src_uri_entries.append(
                f'{git_url_bb};protocol={protocol}{branch_param}{shallow_param};'
                f'rev={commit_hash};'
                f'name={fetch_name};'
                f'destsuffix={destsuffix}'
            )

            commit_info['fetch_name'] = fetch_name
            commit_info['destsuffix'] = destsuffix

            if len(repo_info['commits']) == 1:
                print(f" {fetch_name}: {repo_info['url'][:60]}...")
            else:
                print(f" {fetch_name}: {repo_info['url'][:60]}... (commit {commit_hash[:12]})")

    # Prepare modules data for do_create_module_cache
    for module in modules:
        repo_key = module['repo_key']
        commit_hash = module['vcs_hash']
        commit_info = vcs_repos[repo_key]['commits'][commit_hash]

        update_metadata_cache(
            module['module_path'],
            module['version'],
            module['vcs_url'],
            module['vcs_hash'],
            module.get('timestamp', ''),
            module.get('subdir', ''),
            module.get('vcs_ref', ''),
            dirty=True,
        )

        # DEBUG: Track server/v3 module
        if 'server/v3' in module['module_path']:
            print(f"\n🔍 DEBUG server/v3: Adding to modules_data")
            print(f" module_path: {module['module_path']}")
            print(f" subdir: '{module.get('subdir', '')}' (from module dict)")
            print(f" timestamp: {module['timestamp']}")
            print(f" vcs_hash: {module['vcs_hash']}")

        modules_data.append({
            'module': module['module_path'],
            'version': module['version'],
            'vcs_hash': commit_info['commit_sha'],
            'timestamp': module['timestamp'],
            'subdir': module.get('subdir', ''),
            'vcs_ref': module.get('vcs_ref', ''),
        })

    # Write go-mod-git.inc
    git_inc_path = output_dir / "go-mod-git.inc"
    print(f"\nWriting {git_inc_path}")

    with open(git_inc_path, 'w') as f:
        f.write("# Generated by oe-go-mod-fetcher.py v" + VERSION + "\n")
        f.write("# Git repositories for Go module dependencies\n\n")

        for entry in src_uri_entries:
            f.write(f'SRC_URI += "{entry}"\n')
        f.write('\n')

        # Collect all tag references for shallow cloning
        # BB_GIT_SHALLOW_EXTRA_REFS ensures these refs are included in shallow clones
        tag_refs = set()
        for module in modules:
            vcs_ref = module.get('vcs_ref', '')
            if vcs_ref and 'refs/tags/' in vcs_ref:
                tag_refs.add(vcs_ref)

        if tag_refs:
            f.write("# Tag references for shallow cloning\n")
            f.write("# Ensures shallow clones include all necessary tags\n")
            f.write("BB_GIT_SHALLOW_EXTRA_REFS = \"\\\n")
            for tag_ref in sorted(tag_refs):
                f.write(f" {tag_ref} \\\n")
            f.write('"\n')
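    # go-mod-cache.inc (written below) embeds the per-module metadata as a JSON list,
    # one object per line. An illustrative (hypothetical) line looks like:
    #
    #   {"module":"github.com/example/project","version":"v1.9.3","vcs_hash":"<commit cache key>","timestamp":"2023-01-02T15:04:05Z","subdir":"","vcs_ref":"refs/tags/v1.9.3"}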
    # Note: SRCREV_* variables are not needed since rev= is embedded directly in SRC_URI

    # Write go-mod-cache.inc
    cache_inc_path = output_dir / "go-mod-cache.inc"
    print(f"Writing {cache_inc_path}")

    with open(cache_inc_path, 'w') as f:
        f.write("# Generated by oe-go-mod-fetcher.py v" + VERSION + "\n")
        f.write("# Module cache data for Go dependencies\n")
        f.write("#\n")
        f.write("# This file contains recipe-specific module metadata.\n")
        f.write("# The task implementations are in go-mod-vcs.bbclass.\n\n")

        # Inherit the bbclass that provides the task implementations
        f.write("inherit go-mod-vcs\n\n")

        # Write modules data as JSON - one module per line for readability
        f.write("# Module metadata for cache building (one module per line)\n")
        f.write("GO_MODULE_CACHE_DATA = '[\\\n")
        for i, mod in enumerate(modules_data):
            line = json.dumps(mod, separators=(',', ':'))
            if i < len(modules_data) - 1:
                f.write(f"{line},\\\n")
            else:
                f.write(f"{line}\\\n")
        f.write("]'\n")

    print(f"\n✅ Generated recipe files:")
    print(f" {git_inc_path}")
    print(f" {cache_inc_path}")
    print(f"\nTo use these files, add to your recipe:")
    print(f" require go-mod-git.inc")
    print(f" require go-mod-cache.inc")

    return True


# =============================================================================
# Discovered Module Loading (Bootstrap Strategy)
# =============================================================================

def load_discovered_modules(discovered_modules_path: Path) -> Optional[List[Dict]]:
    """
    Load pre-discovered module metadata from BitBake discovery build.

    This implements the bootstrap strategy where a BitBake discovery build has
    already run 'go mod download' (via do_discover_modules task) and
    extract-native-modules.py has extracted complete metadata from the GOMODCACHE.

    Args:
        discovered_modules_path: Path to JSON file with module metadata

    Returns:
        List of module dicts with complete VCS info, or None if load fails
    """
    if not discovered_modules_path.exists():
        print(f"❌ Discovered modules file not found: {discovered_modules_path}")
        return None

    try:
        with open(discovered_modules_path) as f:
            modules = json.load(f)

        if not isinstance(modules, list):
            print(f"❌ Invalid discovered modules file format (expected list, got {type(modules).__name__})")
            return None

        print(f"✓ Loaded {len(modules)} modules from discovery metadata")
        print(f" File: {discovered_modules_path}")

        # Validate module format
        required_fields = ['module_path', 'version', 'vcs_url', 'vcs_hash']
        for i, module in enumerate(modules):
            if not isinstance(module, dict):
                print(f"❌ Module {i} is not a dict: {module}")
                return None
            for field in required_fields:
                if field not in module:
                    print(f"❌ Module {i} missing required field '{field}': {module.get('module_path', '')}")
                    return None

        # Show statistics
        unique_repos = len(set(m['vcs_url'] for m in modules))
        with_subdirs = sum(1 for m in modules if m.get('subdir'))

        print(f"\nDiscovery metadata summary:")
        print(f" Modules: {len(modules)}")
        print(f" Unique repositories: {unique_repos}")
        print(f" Multi-module repos: {with_subdirs} modules have subdirs")

        # Expand 12-char short hashes to full 40-char hashes.
        # Pseudo-versions like v0.0.0-20161002113705-648efa622239 only contain
        # 12 chars of the commit hash. BitBake's git fetcher needs full 40-char.
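        # Worked example (using the pseudo-version quoted above; commit values are
        # whatever the repository actually contains): for
        # v0.0.0-20161002113705-648efa622239, the tail 648efa622239 is the 12-char
        # commit prefix and 20161002113705 is the commit timestamp;
        # resolve_pseudo_version_commit() uses both to look up the full 40-char sha
        # in the repository (via the clone cache) before handing it to BitBake.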
        short_hash_modules = [m for m in modules if len(m.get('vcs_hash', '')) == 12]
        if short_hash_modules:
            print(f"\n⚙️ Expanding {len(short_hash_modules)} short hashes to full 40-char...")
            expanded = 0
            failed = 0
            for i, module in enumerate(short_hash_modules):
                if (i + 1) % 20 == 0 or i == 0:
                    print(f" Progress: {i + 1}/{len(short_hash_modules)}...", end='\r', flush=True)

                version = module.get('version', '')
                vcs_url = module['vcs_url']
                short_hash = module['vcs_hash']

                # Parse pseudo-version to get timestamp
                pseudo_info = parse_pseudo_version_tag(version.split('+')[0])
                if pseudo_info:
                    timestamp_str, _ = pseudo_info
                    full_hash = resolve_pseudo_version_commit(
                        vcs_url, timestamp_str, short_hash,
                        clone_cache_dir=CLONE_CACHE_DIR
                    )
                    if full_hash and len(full_hash) == 40:
                        module['vcs_hash'] = full_hash
                        expanded += 1
                    else:
                        failed += 1
                        if VERBOSE_MODE:
                            print(f"\n ⚠️ Could not expand: {module['module_path']}@{version}")
                else:
                    failed += 1

            print(f" Expanded {expanded} short hashes, {failed} failed ")

        # Filter out modules with empty vcs_hash - these are typically pre-Go 1.18
        # modules lacking Origin metadata (e.g. pre-release pseudo-versions) that
        # cannot be fetched from git. They are usually transitive dependencies that
        # aren't actually needed by the build.
        empty_hash_modules = [m for m in modules if not m.get('vcs_hash')]
        if empty_hash_modules:
            print(f"\n⚠️ Filtering out {len(empty_hash_modules)} modules with empty vcs_hash:")
            for m in empty_hash_modules:
                print(f" - {m['module_path']}@{m['version']}")
            modules = [m for m in modules if m.get('vcs_hash')]
            print(f" Remaining modules: {len(modules)}")

        return modules

    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse discovered modules JSON: {e}")
        return None
    except Exception as e:
        print(f"❌ Error loading discovered modules: {e}")
        return None
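
# For reference, the --discovered-modules file consumed by load_discovered_modules()
# above is a JSON list of objects; a minimal illustrative entry (hypothetical values):
#
#   [
#     {
#       "module_path": "github.com/example/project",
#       "version": "v1.9.3",
#       "vcs_url": "https://github.com/example/project",
#       "vcs_hash": "<full 40-character commit sha>",
#       "vcs_ref": "refs/tags/v1.9.3",
#       "timestamp": "2023-01-02T15:04:05Z",
#       "subdir": ""
#     }
#   ]
#
# Only module_path, version, vcs_url and vcs_hash are validated as required above;
# the remaining keys are optional metadata.
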
# =============================================================================
# Main Entry Point
# =============================================================================

def main():
    global LOG_PATH, CURRENT_GOMODCACHE

    parser = argparse.ArgumentParser(
        description=f"Generate BitBake recipes for Go modules using hybrid approach (v{VERSION})",
        epilog="""
This tool uses a 3-phase hybrid approach:
  1. Discovery: Run 'go mod download' to get correct module paths
  2. Recipe Generation: Create git:// SRC_URI entries for BitBake
  3. Cache Building: Build module cache from git during do_create_module_cache

Persistent Caches:
  The generator maintains caches in the data/ subdirectory:
  - data/module-cache.json: Module metadata (VCS URL, timestamp, subdir, etc.)
  - data/ls-remote-cache.json: Git ls-remote results
  - data/vanity-url-cache.json: Vanity import path resolution
  - data/verify-cache.json: Commit verification status

  These caches speed up regeneration but may need cleaning when:
  - Derivation logic changes (e.g., subdir calculation fixes)
  - Cached data becomes stale or incorrect

  Use --clean-cache to remove metadata cache before regeneration.
  Use --clean-ls-remote-cache to remove both caches (slower, but fully fresh).

Examples:
  # Normal regeneration (fast, uses caches)
  %(prog)s --recipedir /path/to/recipe/output

  # Clean metadata cache (e.g., after fixing subdir derivation)
  %(prog)s --recipedir /path/to/recipe/output --clean-cache

  # Fully clean regeneration (slow, calls git ls-remote for everything)
  %(prog)s --recipedir /path/to/recipe/output --clean-ls-remote-cache
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--recipedir",
        help="Output directory for generated .inc files (required unless running with --validate/--dry-run/--clean-only)"
    )
    parser.add_argument(
        "--gomodcache",
        help="Directory to use for Go module cache (for discovery phase)"
    )
    parser.add_argument(
        "--cache-dir",
        help="Directory to store JSON metadata caches (default: scripts/data)"
    )
    parser.add_argument(
        "--clone-cache-dir",
        help="Directory to cache cloned git repositories (default: scripts/.cache/repos)"
    )
    parser.add_argument(
        "--source-dir",
        help="Source directory containing go.mod (default: current directory)"
    )
    parser.add_argument(
        "--git-repo",
        help="Git repository URL (for documentation purposes)"
    )
    parser.add_argument(
        "--git-ref",
        help="Git reference (for documentation purposes)"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output"
    )
    parser.add_argument(
        "--clean-cache",
        action="store_true",
        help="Clear metadata cache before regeneration (useful when derivation logic changes)"
    )
    parser.add_argument(
        "--clean-ls-remote-cache",
        action="store_true",
        help="Clear git ls-remote cache in addition to metadata cache (implies --clean-cache)"
    )
    parser.add_argument(
        "--skip-legacy-module-cache",
        action="store_true",
        help="Skip importing legacy module metadata from module_cache_task.inc"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Execute cache mutations without discovery/generation"
    )
    parser.add_argument(
        "--clean-gomodcache",
        action="store_true",
        help="Clean stale .info files in GOMODCACHE that lack VCS metadata (fixes 'module lookup disabled' errors)"
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help="Validate module commits without emitting recipe files"
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help=argparse.SUPPRESS
    )
    parser.add_argument(
        "--skip-verify",
        action="store_true",
        help="Skip commit verification (trust cached verify results, much faster)"
    )
    parser.add_argument(
        "--verify-jobs",
        type=int,
        default=10,
        metavar="N",
        help="Number of parallel verification jobs (default: 10, 0=sequential)"
    )
    parser.add_argument(
        "--verify-cached",
        action="store_true",
        help="Verify commits in GOMODCACHE .info files still exist in repositories (detects force-pushed tags)"
    )
    parser.add_argument(
        "--verify-cache-max-age",
        type=int,
        default=30,
        metavar="DAYS",
        help="Re-verify cached commits older than this many days (default: 30, 0=always verify)"
    )
    parser.add_argument(
        "--debug-limit",
        type=int,
        help="Process at most N modules during validation/generation (debug only)"
    )
    parser.add_argument(
        "--inject-commit",
        metavar=("REPO", "COMMIT"),
        nargs=2,
        action="append",
        help="Mark a repo+commit pair as already verified (skips network check)"
    )
    parser.add_argument(
        "--clear-commit",
        metavar=("REPO", "COMMIT"),
        nargs=2,
        action="append",
        help="Remove a repo+commit pair from the verified cache"
    )
    parser.add_argument(
        "--set-repo",
        metavar=("MODULE", "REPO"),
        nargs=2,
        action="append",
        help="Pin a module (or module@version) to the specified repository URL"
    )
help="Remove a previously pinned repository override (module or module@version)" ) parser.add_argument( "--version", action="version", version=f"%(prog)s {VERSION}" ) parser.add_argument( "--discovered-modules", dest="discovered_modules", help="JSON file with pre-discovered module metadata (skips discovery phase)" ) # Backward compatibility alias for --discovered-modules parser.add_argument("--native-modules", dest="discovered_modules", help=argparse.SUPPRESS) # Add compatibility args that we ignore (for backward compatibility) parser.add_argument("--use-hybrid", action="store_true", help=argparse.SUPPRESS) parser.add_argument("go_mod_file", nargs='?', help=argparse.SUPPRESS) args = parser.parse_args() if args.validate_only: args.validate = True # Set global verbose mode global VERBOSE_MODE VERBOSE_MODE = args.verbose original_stdout = sys.stdout original_stderr = sys.stderr log_handle = None log_path = None try: timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") log_path = Path(tempfile.gettempdir()) / f"oe-go-mod-fetcher-{timestamp}.log" LOG_PATH = log_path log_handle = log_path.open("w", encoding="utf-8", buffering=1) sys.stdout = Tee(original_stdout, log_handle) sys.stderr = Tee(original_stderr, log_handle) print(f"Go Module Git Fetcher v{VERSION}") print("Hybrid Architecture: Discovery from Go + Build from Git") print("=" * 70) print(f"Logs: {log_path} (pass --dry-run to load caches only)") exit_code = _execute(args) except KeyboardInterrupt: print("\n\nOperation cancelled by user") exit_code = 1 except Exception as e: print(f"\n❌ Unexpected error: {e}") if args.verbose: import traceback traceback.print_exc() exit_code = 1 finally: save_ls_remote_cache() save_metadata_cache() save_vanity_url_cache() save_verify_commit_cache() save_repo_overrides() for temp_cache in TEMP_GOMODCACHES: try: if temp_cache.exists(): shutil.rmtree(temp_cache) except Exception: pass TEMP_GOMODCACHES.clear() if CURRENT_GOMODCACHE and not Path(CURRENT_GOMODCACHE).exists(): CURRENT_GOMODCACHE = None if log_handle: log_handle.flush() log_handle.close() sys.stdout = original_stdout sys.stderr = original_stderr if LOG_PATH: print(f"Logs: {LOG_PATH}") sys.exit(exit_code) if __name__ == "__main__": main()