summaryrefslogtreecommitdiffstats
path: root/scripts/extract-discovered-modules.py
diff options
context:
space:
mode:
authorBruce Ashfield <bruce.ashfield@gmail.com>2025-12-04 22:36:12 +0000
committerBruce Ashfield <bruce.ashfield@gmail.com>2025-12-08 20:57:44 -0500
commita303bf16ffd747c50c95cbe385407ba8b0122cec (patch)
treeddb26a7945e746ce8206fc65b0a971ed74dc812b /scripts/extract-discovered-modules.py
parent9f40ce9b277a677ad3cddd8bf1c1d15fbd035251 (diff)
downloadmeta-virtualization-a303bf16ffd747c50c95cbe385407ba8b0122cec.tar.gz
scripts: add oe-go-mod-fetcher for Go module VCS resolution
Add the oe-go-mod-fetcher.py tool and supporting files for resolving Go
module dependencies via git repositories instead of module proxies.

oe-go-mod-fetcher.py:
- Parses go.mod and go.sum to identify required modules
- Resolves module paths to git repositories (handles vanity URLs)
- Maps module versions to git commits
- Generates SRC_URI entries for bitbake fetcher
- Creates go-mod-git.inc and go-mod-cache.inc files
- Supports monorepo detection and nested module handling
- Caches resolution results for performance

extract-discovered-modules.py:
- Helper script to extract module information from discovery cache
- Used by go-mod-discovery.bbclass during build

Also adds .gitignore to exclude runtime caches from version control.

Signed-off-by: Bruce Ashfield <bruce.ashfield@gmail.com>
Diffstat (limited to 'scripts/extract-discovered-modules.py')
-rwxr-xr-xscripts/extract-discovered-modules.py491
1 files changed, 491 insertions, 0 deletions
diff --git a/scripts/extract-discovered-modules.py b/scripts/extract-discovered-modules.py
new file mode 100755
index 00000000..1cfca6ad
--- /dev/null
+++ b/scripts/extract-discovered-modules.py
@@ -0,0 +1,491 @@
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0-only
3#
4# go-dep processor
5#
6# Copyright (C) 2025 Bruce Ashfield
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License version 2 as
10# published by the Free Software Foundation.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License along
18# with this program; if not, write to the Free Software Foundation, Inc.,
19# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21"""
22Extract complete module metadata from BitBake Go discovery build cache.
23
24This script walks a GOMODCACHE directory (from BitBake discovery build) and
25extracts all module metadata from .info files, including VCS information.
26
27Usage:
28 extract-discovered-modules.py --gomodcache /path/to/cache --output modules.json
29
30The script creates:
31 - modules.json: Complete metadata with VCS URLs, commits, subdirs, timestamps
32 - modules.txt: Simple module@version list
33
34This provides 100% accurate module discovery for BitBake recipe generation.
35"""
36
37import argparse
38import json
39import os
40import re
41import shutil
42import subprocess
43import sys
44import tempfile
45import urllib.parse
46from pathlib import Path
47
48
def git_ls_remote(url: str, ref: str) -> str:
    """
    Query a git repository for a ref and return the commit hash.

    For tags (refs starting with ``refs/tags/``) the dereferenced form
    (``<ref>^{}``) is queried first, so an annotated tag resolves to the
    commit it points at rather than to the tag object.

    Args:
        url: Repository URL (or local path) handed to ``git ls-remote``.
        ref: Ref to look up, e.g. ``refs/tags/v1.2.3``.

    Returns:
        The 40-character hash from the first line git reports, or ``''`` if
        nothing matched, git failed, or the query timed out.
    """
    if ref.startswith("refs/tags/"):
        refs_to_try = [f"{ref}^{{}}", ref]
    else:
        refs_to_try = [ref]

    # Never wait on an interactive username/password prompt: guessed or
    # private URLs would otherwise stall until the timeout expires.
    env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}

    try:
        for query_ref in refs_to_try:
            result = subprocess.run(
                ['git', 'ls-remote', url, query_ref],
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            output = result.stdout.strip()
            if result.returncode != 0 or not output:
                continue
            # Each output line is "<hash>\t<refname>"; only the first is used.
            candidate = output.split('\n')[0].split('\t')[0]
            if len(candidate) == 40:
                return candidate
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best-effort lookup: git missing, timeout or undecodable output all
        # mean "could not resolve", which callers see as an empty string.
        pass
    return ''
75
76
def resolve_short_hash(url: str, short_hash: str) -> str:
    """
    Resolve a 12-char short hash to the full 40-char hash.

    Go pseudo-versions only contain 12 characters of the commit hash, while
    BitBake's git fetcher needs the full 40-char hash.

    Strategy, cheapest first:
      1. GitHub REST API (single HTTP request, only for https://github.com/ URLs)
      2. ``git ls-remote`` (only finds commits that are a branch head or tag)
      3. Blobless bare clone plus ``git rev-parse`` (works for any commit)

    Args:
        url: Repository URL (or local path).
        short_hash: Abbreviated commit hash taken from a pseudo-version.

    Returns:
        The full 40-char hash when one of the strategies succeeds; otherwise
        ``short_hash`` unchanged. Input that is not exactly 12 characters long
        is returned untouched (already full, or invalid).
    """
    if len(short_hash) != 12:
        return short_hash  # Already full or invalid

    # Never block on an interactive credential prompt.
    git_env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}

    # First try: GitHub API (fast - single HTTP request)
    # Note: Rate limited to 60/hour without auth token
    github_prefix = 'https://github.com/'
    if url.startswith(github_prefix):
        repo_path = url[len(github_prefix):].rstrip('/')
        # Strip only a trailing ".git"; removing it anywhere would mangle
        # repository names such as "user/user.github.io".
        if repo_path.endswith('.git'):
            repo_path = repo_path[:-len('.git')]
        try:
            import urllib.request
            api_url = f"https://api.github.com/repos/{repo_path}/commits/{short_hash}"
            req = urllib.request.Request(api_url, headers={'User-Agent': 'oe-go-mod-fetcher'})
            with urllib.request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode())
            if 'sha' in data and len(data['sha']) == 40:
                return data['sha']
        except Exception:
            # Rate limited, offline, unexpected payload... fall through to git.
            pass

    # Second try: git ls-remote (downloads all refs, checks if any match)
    # This works if the commit is a branch head or tag
    try:
        result = subprocess.run(
            ['git', 'ls-remote', url],
            capture_output=True,
            text=True,
            timeout=30,
            env=git_env,
        )
        if result.returncode == 0:
            for line in result.stdout.splitlines():
                full_hash = line.split('\t')[0]
                if len(full_hash) == 40 and full_hash.startswith(short_hash):
                    return full_hash
    except (OSError, subprocess.SubprocessError, ValueError):
        pass

    # Third try: blobless bare clone and rev-parse (slower, works for any commit)
    try:
        with tempfile.TemporaryDirectory(prefix='hash-resolve-') as tmpdir:
            repo_dir = os.path.join(tmpdir, 'repo')
            # --filter=blob:none fetches commits and trees but no file contents.
            clone_result = subprocess.run(
                ['git', 'clone', '--bare', '--filter=blob:none', url, repo_dir],
                capture_output=True,
                timeout=120,
                env=git_env,
            )
            if clone_result.returncode == 0:
                # Ask explicitly for a commit so an abbreviation shared with a
                # tree object does not make the lookup ambiguous.
                parse_result = subprocess.run(
                    ['git', 'rev-parse', '--verify', f'{short_hash}^{{commit}}'],
                    cwd=repo_dir,
                    capture_output=True,
                    text=True,
                    timeout=10,
                    env=git_env,
                )
                if parse_result.returncode == 0:
                    full_hash = parse_result.stdout.strip()
                    if len(full_hash) == 40:
                        return full_hash
    except (OSError, subprocess.SubprocessError, ValueError):
        pass

    # Could not resolve - return original short hash
    return short_hash
150
151
def derive_vcs_info(module_path, version):
    """
    Derive VCS URL and commit info from a module path and version.

    Used for modules whose .info file carries no usable Origin metadata, so
    the repository has to be inferred from hosting conventions.

    Recognised module path prefixes:
      - github.com/owner/repo[/subdir...]
      - gitlab.com/owner/repo and bitbucket.org/owner/repo
      - gopkg.in/pkg.vN and gopkg.in/user/pkg.vN
      - google.golang.org/name[/subdir...]

    Args:
        module_path: Unescaped Go module path.
        version: Module version, optionally with a "+incompatible" suffix.

    Returns:
        dict with keys:
          - vcs_url: inferred https repository URL
          - vcs_hash: 12-char short hash for pseudo-versions; for tagged
            versions whatever git_ls_remote() returns ('' if unresolved)
          - vcs_ref: 'refs/tags/...' for tagged versions, '' for pseudo-versions
          - subdir: path components after the repository root ('' if none)
        or None if no repository URL can be derived.
    """
    vcs_url = None
    vcs_hash = ''
    vcs_ref = ''
    subpath = ''  # Path inside the repo, for multi-module repositories
    parts = module_path.split('/')

    if module_path.startswith('github.com/'):
        # github.com/owner/repo or github.com/owner/repo/subpkg
        if len(parts) >= 3:
            vcs_url = f"https://github.com/{parts[1]}/{parts[2]}"
            subpath = '/'.join(parts[3:])

    elif module_path.startswith('gitlab.com/'):
        if len(parts) >= 3:
            vcs_url = f"https://gitlab.com/{parts[1]}/{parts[2]}"

    elif module_path.startswith('bitbucket.org/'):
        if len(parts) >= 3:
            vcs_url = f"https://bitbucket.org/{parts[1]}/{parts[2]}"

    elif module_path.startswith('gopkg.in/'):
        # gopkg.in/pkg.v3      -> github.com/go-pkg/pkg
        # gopkg.in/user/pkg.v3 -> github.com/user/pkg
        match = re.match(r'gopkg\.in/(?:([^/]+)/)?([^/]+)\.v\d+', module_path)
        if match:
            owner, pkg_name = match.groups()
            if owner:
                vcs_url = f"https://github.com/{owner}/{pkg_name}"
            else:
                # Most packages live under a "go-<pkg>" owner; list the
                # known ones explicitly together with the exceptions.
                mappings = {
                    'yaml': 'https://github.com/go-yaml/yaml',
                    'check': 'https://github.com/go-check/check',
                    'inf': 'https://github.com/go-inf/inf',
                    'tomb': 'https://github.com/go-tomb/tomb',
                    'fsnotify': 'https://github.com/fsnotify/fsnotify',  # No go- prefix
                }
                vcs_url = mappings.get(pkg_name, f"https://github.com/go-{pkg_name}/{pkg_name}")

    elif module_path.startswith('google.golang.org/'):
        # Vanity imports: default to github.com/golang/<name>, with a few
        # projects hosted elsewhere.
        if len(parts) >= 2:
            pkg_name = parts[1]  # First component after google.golang.org/
            mappings = {
                'protobuf': 'https://github.com/protocolbuffers/protobuf-go',
                'grpc': 'https://github.com/grpc/grpc-go',
                'genproto': 'https://github.com/googleapis/go-genproto',
                'api': 'https://github.com/googleapis/google-api-go-client',
            }
            vcs_url = mappings.get(pkg_name, f"https://github.com/golang/{pkg_name}")
            # e.g. google.golang.org/grpc/cmd/protoc-gen-go-grpc -> cmd/protoc-gen-go-grpc
            subpath = '/'.join(parts[2:])

    if not vcs_url:
        return None

    clean_version = version.replace('+incompatible', '')

    # Go pseudo-version forms (all end in <14-digit timestamp>-<12-hex hash>):
    #   vX.0.0-yyyymmddhhmmss-abcdefabcdef
    #   vX.Y.Z-pre.0.yyyymmddhhmmss-abcdefabcdef
    #   vX.Y.(Z+1)-0.yyyymmddhhmmss-abcdefabcdef
    pseudo_match = re.search(r'(?:-|[-.]0\.)(\d{14})-([0-9a-f]{12})$', clean_version)
    if pseudo_match:
        # Short hash only; the original author's note says oe-go-mod-fetcher.py
        # expands it to 40 chars later (not verifiable from this file).
        vcs_hash = pseudo_match.group(2)
    else:
        # Tagged version. In a multi-module repo the tag is prefixed with the
        # module subdirectory, which excludes any major-version suffix:
        #   repo/cmd/tool    @ v1.1.0 -> tag cmd/tool/v1.1.0
        #   repo/sub/v2      @ v2.1.0 -> tag sub/v2.1.0
        #   repo/v2          @ v2.1.0 -> tag v2.1.0
        tag_prefix = re.sub(r'(?:^|/)v(?:[2-9]|[1-9]\d+)$', '', subpath)
        if tag_prefix:
            tag_name = f"{tag_prefix}/{clean_version}"
        else:
            tag_name = clean_version
        vcs_ref = f"refs/tags/{tag_name}"
        # Query the repository to get the actual commit hash for this tag
        vcs_hash = git_ls_remote(vcs_url, vcs_ref)
        if not vcs_hash and tag_prefix:
            # Some repos don't use prefixed tags for submodules
            fallback_ref = f"refs/tags/{clean_version}"
            vcs_hash = git_ls_remote(vcs_url, fallback_ref)
            if vcs_hash:
                vcs_ref = fallback_ref  # Use the working ref

    return {
        'vcs_url': vcs_url,
        'vcs_hash': vcs_hash,
        'vcs_ref': vcs_ref,
        # NOTE(review): kept as the full path after the repo root (including
        # any /vN component) for backward compatibility with consumers.
        'subdir': subpath,
    }
277
278
def extract_modules(gomodcache_path):
    """
    Walk GOMODCACHE and extract all module metadata from .info files.

    Only files located at ``<gomodcache>/cache/download/<module path>/@v/*.info``
    are considered. When a file carries Origin URL and Hash those are used
    directly; otherwise derive_vcs_info() is asked to infer the repository.
    Files are visited in sorted path order so the result is reproducible.

    Args:
        gomodcache_path: Path to the GOMODCACHE directory.

    Returns:
        list of dicts with:
          - module_path: Unescaped module path
          - version: Unescaped module version
          - vcs_url: Git repository URL
          - vcs_hash: Commit hash (may be short or empty for derived entries)
          - vcs_ref: Tag/branch reference
          - subdir: Subdirectory in mono-repos
          - timestamp: Commit timestamp

    Raises:
        FileNotFoundError: if ``<gomodcache>/cache/download`` does not exist.
    """
    cache_dir = Path(gomodcache_path) / "cache" / "download"

    if not cache_dir.exists():
        raise FileNotFoundError(f"Cache directory not found: {cache_dir}")

    def unescape(text):
        # Go's case-encoding: "!x" stands for an upper-case "X". It applies
        # to both module paths and versions in the download cache.
        return re.sub(r'!([a-z])', lambda m: m.group(1).upper(), text)

    modules = []
    skipped = 0
    derived = 0
    derive_attempts = 0
    total_info_files = 0

    print(f"Scanning GOMODCACHE: {cache_dir}")

    for info_file in sorted(cache_dir.rglob("*.info")):
        total_info_files += 1

        # Expect <module path components...>/@v/<version>.info
        parts = info_file.parent.relative_to(cache_dir).parts
        if len(parts) < 2 or parts[-1] != '@v':
            continue

        # Example: github.com/!microsoft/go-winio -> github.com/Microsoft/go-winio
        module_path = unescape('/'.join(parts[:-1]))
        version = unescape(info_file.stem)

        try:
            with open(info_file, encoding='utf-8') as f:
                info = json.load(f)

            # "Origin" may be missing or null; treat both as "no metadata".
            origin = info.get('Origin') or {}

            if origin.get('URL') and origin.get('Hash'):
                modules.append({
                    'module_path': module_path,
                    'version': version,
                    'vcs_url': origin.get('URL', ''),
                    'vcs_hash': origin.get('Hash', ''),
                    'vcs_ref': origin.get('Ref', ''),
                    'subdir': origin.get('Subdir', ''),
                    'timestamp': info.get('Time', ''),
                })
                continue

            # No Origin metadata: infer VCS URL and ref from path/version.
            # This may need network calls, so show occasional progress.
            derive_attempts += 1
            if derive_attempts % 10 == 1:
                print(f" Deriving VCS info... ({derive_attempts} modules)", end='\r', flush=True)

            derived_info = derive_vcs_info(module_path, version)
            if not derived_info:
                # Cannot derive VCS info - skip this module
                skipped += 1
                continue

            modules.append({
                'module_path': module_path,
                'version': version,
                'vcs_url': derived_info.get('vcs_url', ''),
                'vcs_hash': derived_info.get('vcs_hash', ''),
                'vcs_ref': derived_info.get('vcs_ref', ''),
                'subdir': derived_info.get('subdir', ''),
                'timestamp': info.get('Time', ''),
            })
            # Count only successful derivations so the summary adds up.
            derived += 1

        except json.JSONDecodeError as e:
            print(f" ⚠️ Failed to parse {info_file}: {e}")
            skipped += 1
        except Exception as e:
            # Per-file boundary: one bad entry must not abort the whole scan.
            print(f" ⚠️ Error processing {info_file}: {e}")
            skipped += 1

    print(f"\nProcessed {total_info_files} .info files")
    print(f"Extracted {len(modules)} modules total:")
    print(f" - {len(modules) - derived} with Origin metadata from proxy")
    print(f" - {derived} with derived VCS info (Fix #29)")
    print(f"Skipped {skipped} modules (cannot derive VCS info)")

    return modules
384
385
def main():
    """
    Command-line entry point.

    Parses --gomodcache and --output, runs extract_modules(), writes the
    result as JSON to --output plus a sorted "module@version" list next to
    it, and prints summary statistics.

    Exits with status 1 when the GOMODCACHE path is not a directory, when
    extraction fails, when no modules are found, or when an output file
    cannot be written.
    """
    parser = argparse.ArgumentParser(
        description='Extract module metadata from Go module cache',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract from native Go build cache
  %(prog)s --gomodcache /tmp/k3s-discovery-cache --output /tmp/k3s-modules.json

  # Extract from BitBake discovery build
  %(prog)s --gomodcache /path/to/build/tmp/work/.../discovery-cache --output /tmp/k3s-modules.json

  # Extract from system GOMODCACHE
  %(prog)s --gomodcache ~/go/pkg/mod --output /tmp/modules.json

Output:
  - <output>.json: Complete module metadata (VCS URLs, commits, subdirs)
  - <output>.txt: Simple module@version list (sorted)
"""
    )
    parser.add_argument(
        '--gomodcache',
        required=True,
        help='Path to GOMODCACHE directory'
    )
    parser.add_argument(
        '--output',
        required=True,
        help='Output JSON file path (e.g., /tmp/k3s-modules.json)'
    )

    args = parser.parse_args()

    # Validate GOMODCACHE path (a regular file is not a usable cache either)
    gomodcache = Path(args.gomodcache)
    if not gomodcache.is_dir():
        print(f"Error: GOMODCACHE directory does not exist: {gomodcache}", file=sys.stderr)
        sys.exit(1)

    # Extract modules
    try:
        modules = extract_modules(gomodcache)
    except Exception as e:
        print(f"Error during extraction: {e}", file=sys.stderr)
        sys.exit(1)

    if not modules:
        print("Warning: No modules with VCS metadata found!", file=sys.stderr)
        print("This may indicate:", file=sys.stderr)
        print(" - GOMODCACHE is from BitBake (synthetic .info files)", file=sys.stderr)
        print(" - GOMODCACHE is empty or incomplete", file=sys.stderr)
        print(" - Need to run 'go mod download' first", file=sys.stderr)
        sys.exit(1)

    # Save as JSON
    output_path = Path(args.output)
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(modules, indent=2, sort_keys=True), encoding='utf-8')
        print(f"\n✓ Saved {len(modules)} modules to {output_path}")
    except Exception as e:
        print(f"Error writing JSON output: {e}", file=sys.stderr)
        sys.exit(1)

    # Also save simple list
    try:
        list_path = output_path.with_suffix('.txt')
        if list_path == output_path:
            # --output already ends in .txt: don't overwrite the JSON file
            # that was just written with the plain list.
            list_path = output_path.with_name(output_path.name + '.txt')
        simple_list = sorted(f"{m['module_path']}@{m['version']}" for m in modules)
        list_path.write_text('\n'.join(simple_list) + '\n', encoding='utf-8')
        print(f"✓ Saved module list to {list_path}")
    except Exception as e:
        print(f"Error writing module list: {e}", file=sys.stderr)
        sys.exit(1)

    # Print summary statistics
    from collections import Counter

    print("\n" + "="*60)
    print("EXTRACTION SUMMARY")
    print("="*60)

    # One count per repository URL; its size is the number of unique repos
    repo_counts = Counter(m['vcs_url'] for m in modules)
    print(f"Total modules: {len(modules)}")
    print(f"Unique repositories: {len(repo_counts)}")

    # Count modules with subdirs (multi-module repos)
    with_subdirs = sum(1 for m in modules if m['subdir'])
    print(f"Multi-module repos: {with_subdirs} modules have subdirs")

    # Show top repositories by module count
    print("\nTop 5 repositories by module count:")
    for repo_url, count in repo_counts.most_common(5):
        print(f" {count:3d} modules: {repo_url}")

    print("\n" + "="*60)
    print("Use this JSON file with:")
    print(f" oe-go-mod-fetcher.py --native-modules {output_path}")
    print("="*60)
488
489
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()