summaryrefslogtreecommitdiffstats
path: root/scripts/extract-discovered-modules.py
blob: 1cfca6ad8a18e15ea2a974602a2ee806bdc24ea2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
#
# go-dep processor
#
# Copyright (C) 2025 Bruce Ashfield
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""
Extract complete module metadata from BitBake Go discovery build cache.

This script walks a GOMODCACHE directory (from BitBake discovery build) and
extracts all module metadata from .info files, including VCS information.

Usage:
    extract-discovered-modules.py --gomodcache /path/to/cache --output modules.json

The script creates:
    - modules.json: Complete metadata with VCS URLs, commits, subdirs, timestamps
    - modules.txt: Simple module@version list

This provides 100% accurate module discovery for BitBake recipe generation.
"""

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import urllib.parse
from pathlib import Path


def git_ls_remote(url: str, ref: str) -> str:
    """
    Query a git repository for a ref and return the commit hash.

    For tags, the dereferenced form (``<ref>^{}``) is tried before the plain
    ref so that an annotated tag resolves to the commit it points at.

    Args:
        url: Repository URL handed to ``git ls-remote``.
        ref: Ref to look up, e.g. ``refs/tags/v1.2.3``.

    Returns:
        The 40-character hash from the first line of git's output, or ''
        when the ref is not found or git fails for any reason (best effort).
    """
    # Try dereferenced form first (handles annotated tags)
    refs_to_try = [f"{ref}^{{}}", ref] if ref.startswith("refs/tags/") else [ref]

    # Never let git wait on an interactive credential prompt (guessed or
    # private HTTPS repositories); fail immediately instead of stalling
    # until the timeout.
    env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}

    try:
        for query_ref in refs_to_try:
            result = subprocess.run(
                ['git', 'ls-remote', url, query_ref],
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            output = result.stdout.strip()
            if result.returncode != 0 or not output:
                continue
            # Parse: "hash<tab>ref" - only the first line is considered
            commit = output.split('\n')[0].split('\t')[0]
            if len(commit) == 40:
                return commit
    except (OSError, subprocess.SubprocessError, ValueError):
        # git missing, timeout, undecodable output, ...: this lookup is
        # deliberately best effort, the caller treats '' as "unknown".
        pass
    return ''


def _github_repo_path(url):
    """Return ``owner/repo`` for a github.com repository URL, or None otherwise."""
    try:
        parsed = urllib.parse.urlsplit(url)
        host = (parsed.hostname or '').lower()
    except ValueError:
        return None
    if host not in ('github.com', 'www.github.com'):
        return None
    segments = [segment for segment in parsed.path.split('/') if segment]
    if len(segments) < 2:
        return None
    owner, repo = segments[0], segments[1]
    # Strip only a trailing ".git"; a blanket replace would also mangle
    # repository names such as "owner.github.io".
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    if not repo:
        return None
    return f"{owner}/{repo}"


def resolve_short_hash(url: str, short_hash: str) -> str:
    """
    Resolve a 12-char short hash to full 40-char hash.

    Go pseudo-versions only contain 12 characters of the commit hash.
    BitBake's git fetcher needs the full 40-char hash.

    Strategy: Try GitHub API first (fast), then git ls-remote, then a
    blobless bare clone followed by ``git rev-parse``.

    Args:
        url: Repository URL.
        short_hash: Abbreviated commit hash; anything that is not exactly
            12 characters long is returned unchanged.

    Returns:
        The full 40-char hash when one of the strategies succeeds, otherwise
        the original ``short_hash``. Every strategy is best effort; failures
        fall through to the next one and are never raised.
    """
    if len(short_hash) != 12:
        return short_hash  # Already full or invalid

    # Never let git wait on an interactive credential prompt.
    git_env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}

    # First try: GitHub API (fast - single HTTP request)
    # Note: Rate limited to 60/hour without auth token
    repo_path = _github_repo_path(url)
    if repo_path:
        try:
            import urllib.request
            api_url = f"https://api.github.com/repos/{repo_path}/commits/{short_hash}"
            req = urllib.request.Request(api_url, headers={'User-Agent': 'oe-go-mod-fetcher'})
            with urllib.request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode())
                if 'sha' in data and len(data['sha']) == 40:
                    return data['sha']
        except Exception:
            # Rate limited, offline, unexpected payload, ... - this is only
            # a shortcut, so any failure simply moves on to the next method.
            pass

    # Second try: git ls-remote (downloads all refs, checks if any match)
    # This works if the commit is a branch head or tag
    try:
        result = subprocess.run(
            ['git', 'ls-remote', url],
            capture_output=True,
            text=True,
            timeout=30,
            env=git_env,
        )
        if result.returncode == 0:
            for line in result.stdout.strip().split('\n'):
                if line:
                    full_hash = line.split('\t')[0]
                    if full_hash.startswith(short_hash):
                        return full_hash
    except (OSError, subprocess.SubprocessError, ValueError):
        pass

    # Third try: bare clone and rev-parse (slower but works for any commit)
    try:
        with tempfile.TemporaryDirectory(prefix='hash-resolve-') as tmpdir:
            repo_dir = os.path.join(tmpdir, 'repo')
            # Blobless partial clone: the commit history is fetched, file
            # contents are not
            clone_result = subprocess.run(
                ['git', 'clone', '--bare', '--filter=blob:none', url, repo_dir],
                capture_output=True,
                timeout=120,
                env=git_env,
            )
            if clone_result.returncode == 0:
                # Use rev-parse to expand short hash
                parse_result = subprocess.run(
                    ['git', 'rev-parse', short_hash],
                    cwd=repo_dir,
                    capture_output=True,
                    text=True,
                    timeout=10
                )
                if parse_result.returncode == 0:
                    full_hash = parse_result.stdout.strip()
                    if len(full_hash) == 40:
                        return full_hash
    except (OSError, subprocess.SubprocessError, ValueError):
        pass

    # Could not resolve - return original short hash
    return short_hash


def derive_vcs_info(module_path, version):
    """
    Derive VCS URL and commit info from module path and version.

    This is used for modules where the Go proxy doesn't provide Origin metadata
    (older modules cached before Go 1.18).

    Pseudo-versions are handled offline: the 12-char commit hash embedded in
    the version is returned as-is. Tagged versions are resolved to a commit
    by querying the remote with git_ls_remote().

    Args:
        module_path: Unescaped Go module path, e.g. github.com/owner/repo/cmd/tool
        version: Module version, optionally carrying a +incompatible suffix

    Returns:
        dict with vcs_url, vcs_hash (12-char for pseudo-versions, full hash or
        '' for tags), vcs_ref ('' for pseudo-versions) and subdir (the part of
        the module path below the repository root, '' if none),
        or None if cannot derive
    """
    vcs_url = None
    vcs_hash = ''
    vcs_ref = ''
    subpath = ''  # FIX #32: Track subpath for multi-module repos (tag prefix)

    # Derive URL from module path
    if module_path.startswith('github.com/'):
        # github.com/owner/repo or github.com/owner/repo/subpkg
        parts = module_path.split('/')
        if len(parts) >= 3:
            vcs_url = f"https://github.com/{parts[1]}/{parts[2]}"
            # FIX #32: Track subpath for multi-module repos (e.g., github.com/owner/repo/cmd/tool)
            if len(parts) > 3:
                subpath = '/'.join(parts[3:])

    elif module_path.startswith('gitlab.com/'):
        parts = module_path.split('/')
        if len(parts) >= 3:
            vcs_url = f"https://gitlab.com/{parts[1]}/{parts[2]}"

    elif module_path.startswith('bitbucket.org/'):
        parts = module_path.split('/')
        if len(parts) >= 3:
            vcs_url = f"https://bitbucket.org/{parts[1]}/{parts[2]}"

    elif module_path.startswith('gopkg.in/'):
        # gopkg.in/yaml.v2 -> github.com/go-yaml/yaml
        # gopkg.in/check.v1 -> github.com/go-check/check
        # gopkg.in/pkg.v3 -> github.com/go-pkg/pkg (convention)
        # gopkg.in/fsnotify.v1 -> github.com/fsnotify/fsnotify (no go- prefix)
        # gopkg.in/user/pkg.v3 -> github.com/user/pkg (two-segment form)
        match = re.match(r'gopkg\.in/(?:([^/]+)/)?([^/]+)\.v\d+', module_path)
        if match:
            user, pkg_name = match.groups()
            if user:
                vcs_url = f"https://github.com/{user}/{pkg_name}"
            else:
                # Common mappings - some use go-* prefix, others don't
                mappings = {
                    'yaml': 'https://github.com/go-yaml/yaml',
                    'check': 'https://github.com/go-check/check',
                    'inf': 'https://github.com/go-inf/inf',
                    'tomb': 'https://github.com/go-tomb/tomb',
                    'fsnotify': 'https://github.com/fsnotify/fsnotify',  # No go- prefix
                }
                vcs_url = mappings.get(pkg_name, f"https://github.com/go-{pkg_name}/{pkg_name}")

    elif module_path.startswith('google.golang.org/'):
        # google.golang.org vanity imports -> github.com/golang/*
        # google.golang.org/appengine -> github.com/golang/appengine
        # google.golang.org/protobuf -> github.com/protocolbuffers/protobuf-go (special case)
        # google.golang.org/grpc -> github.com/grpc/grpc-go (special case)
        # google.golang.org/genproto -> github.com/googleapis/go-genproto (special case)
        #
        # FIX #32: Handle submodules in multi-module repos
        # google.golang.org/grpc/cmd/protoc-gen-go-grpc has tags like:
        #   cmd/protoc-gen-go-grpc/v1.1.0 (NOT v1.1.0)
        # We need to track the subpath for tag prefix construction
        parts = module_path.split('/')
        if len(parts) >= 2:
            pkg_name = parts[1]  # First component after google.golang.org/
            mappings = {
                'protobuf': 'https://github.com/protocolbuffers/protobuf-go',
                'grpc': 'https://github.com/grpc/grpc-go',
                'genproto': 'https://github.com/googleapis/go-genproto',
                'api': 'https://github.com/googleapis/google-api-go-client',
            }
            vcs_url = mappings.get(pkg_name, f"https://github.com/golang/{pkg_name}")
            # Track subpath for submodule tag construction (e.g., cmd/protoc-gen-go-grpc)
            if len(parts) > 2:
                subpath = '/'.join(parts[2:])  # Everything after google.golang.org/grpc/

    if not vcs_url:
        return None

    # Parse version for commit hash (pseudo-versions)
    # Go pseudo-version formats:
    #   v0.0.0-20200815063812-42c35b437635           (no base version)
    #   v1.2.4-0.20200815063812-42c35b437635         (base is release v1.2.3)
    #   v1.2.3-pre.0.20200815063812-42c35b437635     (base is pre-release v1.2.3-pre)
    # The key pattern: "-", "-0." or ".0." then YYYYMMDDHHMMSS (14 digits)
    # then the 12-char commit hash.
    # Also handle +incompatible suffix
    clean_version = version.replace('+incompatible', '')

    pseudo_match = re.search(
        r'(?:-(?:0\.)?|\.0\.)(\d{14})-([0-9a-f]{12})$', clean_version)
    if pseudo_match:
        vcs_hash = pseudo_match.group(2)  # 12-char short hash
        # Note: Short hashes are expanded to full 40-char by oe-go-mod-fetcher.py
        # in load_native_modules() using resolve_pseudo_version_commit()
    else:
        # Tagged version - resolve tag to commit hash
        # FIX #32: For multi-module repos, the tag includes the subpath prefix
        # e.g., google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.1.0
        #       has tag: cmd/protoc-gen-go-grpc/v1.1.0 (not v1.1.0)
        #
        # A trailing major-version suffix (/v2, /v3, ...) is part of the
        # module path but not of the tag prefix:
        #   github.com/owner/repo/v2@v2.1.0     -> tag v2.1.0
        #   github.com/owner/repo/sub/v2@v2.1.0 -> tag sub/v2.1.0
        prefix_parts = subpath.split('/') if subpath else []
        if prefix_parts and re.fullmatch(r'v(?:[2-9]|[1-9]\d+)', prefix_parts[-1]):
            prefix_parts = prefix_parts[:-1]
        tag_prefix = '/'.join(prefix_parts)

        # Candidate refs, most likely first: suffix-stripped prefix, the full
        # subpath as prefix, then no prefix at all (some repos don't use
        # prefixed tags for submodules).
        candidate_refs = []
        for prefix in (tag_prefix, subpath, ''):
            tag_name = f"{prefix}/{clean_version}" if prefix else clean_version
            candidate = f"refs/tags/{tag_name}"
            if candidate not in candidate_refs:
                candidate_refs.append(candidate)

        # If nothing resolves, report the most likely ref with an empty hash
        vcs_ref = candidate_refs[0]
        for candidate in candidate_refs:
            # Query the repository to get the actual commit hash for this tag
            vcs_hash = git_ls_remote(vcs_url, candidate)
            if vcs_hash:
                vcs_ref = candidate  # Use the working ref
                break

    return {
        'vcs_url': vcs_url,
        'vcs_hash': vcs_hash,
        'vcs_ref': vcs_ref,
        # FIX #32: Return subdir for submodules. This is the raw module-path
        # remainder and still contains any /vN major-version suffix.
        'subdir': subpath,
    }


def extract_modules(gomodcache_path):
    """
    Walk GOMODCACHE and extract all module metadata from .info files.

    Only ``<module path>/@v/<version>.info`` files below
    ``<gomodcache_path>/cache/download`` are considered. Files are visited in
    sorted order so the returned list is reproducible between runs.

    Args:
        gomodcache_path: Path (str or Path) to the GOMODCACHE directory.

    Returns list of dicts with complete metadata:
    - module_path: Unescaped module path
    - version: Unescaped module version
    - vcs_url: Git repository URL
    - vcs_hash: Commit hash (may be short or empty for derived modules)
    - vcs_ref: Tag/branch reference
    - subdir: Subdirectory in mono-repos
    - timestamp: Commit timestamp

    Raises:
        FileNotFoundError: if ``<gomodcache_path>/cache/download`` is missing.
    """
    cache_dir = Path(gomodcache_path) / "cache" / "download"

    if not cache_dir.exists():
        raise FileNotFoundError(f"Cache directory not found: {cache_dir}")

    # Go case-encodes both module paths and versions in the cache: every
    # uppercase letter is stored as '!' followed by its lowercase form.
    escaped_upper = re.compile(r'!([a-z])')

    def unescape(text):
        return escaped_upper.sub(lambda m: m.group(1).upper(), text)

    modules = []
    skipped = 0
    derived = 0
    total_info_files = 0

    print(f"Scanning GOMODCACHE: {cache_dir}")

    for info_file in sorted(cache_dir.rglob("*.info")):
        total_info_files += 1

        # Extract module path from directory structure
        rel_path = info_file.parent.relative_to(cache_dir)
        parts = list(rel_path.parts)

        # Ignore .info files that are not inside a module's @v directory
        # (this includes stray files directly in cache/download, for which
        # parts is empty).
        if not parts or parts[-1] != '@v':
            continue

        # Module path (unescape Go's !-encoding)
        # Example: github.com/!microsoft/go-winio -> github.com/Microsoft/go-winio
        module_path = unescape('/'.join(parts[:-1]))

        # Version (same !-encoding, e.g. v1.0.0-!r!c1 -> v1.0.0-RC1)
        version = unescape(info_file.stem)

        # Read .info file for VCS metadata
        try:
            with open(info_file, encoding='utf-8') as f:
                info = json.load(f)

            # "Origin" may be absent or null; both mean "no VCS metadata"
            origin = info.get('Origin') or {}

            # Check if we have complete VCS info from Origin
            if origin.get('URL') and origin.get('Hash'):
                modules.append({
                    'module_path': module_path,
                    'version': version,
                    'vcs_url': origin.get('URL', ''),
                    'vcs_hash': origin.get('Hash', ''),
                    'vcs_ref': origin.get('Ref', ''),
                    'subdir': origin.get('Subdir', ''),
                    'timestamp': info.get('Time', ''),
                })
                continue

            # FIX #29: Module lacks Origin metadata (common for +incompatible modules)
            # Use derive_vcs_info() to infer VCS URL and ref from module path/version
            attempt = derived + 1
            # Progress output for derived modules (these require network calls)
            if attempt % 10 == 1:
                print(f"  Deriving VCS info... ({attempt} modules)", end='\r', flush=True)
            derived_info = derive_vcs_info(module_path, version)
            if derived_info:
                # Count only after success so that an exception raised while
                # deriving cannot inflate the "derived" statistic.
                derived += 1
                modules.append({
                    'module_path': module_path,
                    'version': version,
                    'vcs_url': derived_info.get('vcs_url', ''),
                    'vcs_hash': derived_info.get('vcs_hash', ''),
                    'vcs_ref': derived_info.get('vcs_ref', ''),
                    'subdir': derived_info.get('subdir', ''),  # FIX #32: Use derived subdir
                    'timestamp': info.get('Time', ''),
                })
            else:
                # Cannot derive VCS info - skip this module
                skipped += 1

        except json.JSONDecodeError as e:
            print(f"  ⚠️  Failed to parse {info_file}: {e}")
            skipped += 1
            continue
        except Exception as e:
            # Keep scanning: one unreadable or malformed file must not abort
            # the whole extraction.
            print(f"  ⚠️  Error processing {info_file}: {e}")
            skipped += 1
            continue

    print(f"\nProcessed {total_info_files} .info files")
    print(f"Extracted {len(modules)} modules total:")
    print(f"  - {len(modules) - derived} with Origin metadata from proxy")
    print(f"  - {derived} with derived VCS info (Fix #29)")
    print(f"Skipped {skipped} modules (cannot derive VCS info)")

    return modules


def _print_summary(modules, output_path):
    """Print repository/subdir statistics and the follow-up command hint."""
    print("\n" + "="*60)
    print("EXTRACTION SUMMARY")
    print("="*60)

    # Count modules per repository (insertion-ordered, so ties keep the
    # order in which repositories were first seen)
    repo_counts = {}
    for m in modules:
        repo_counts[m['vcs_url']] = repo_counts.get(m['vcs_url'], 0) + 1

    print(f"Total modules:      {len(modules)}")
    print(f"Unique repositories: {len(repo_counts)}")

    # Count modules with subdirs (multi-module repos)
    with_subdirs = sum(1 for m in modules if m['subdir'])
    print(f"Multi-module repos:  {with_subdirs} modules have subdirs")

    # Show top repositories by module count
    top_repos = sorted(repo_counts.items(), key=lambda x: x[1], reverse=True)[:5]
    print("\nTop 5 repositories by module count:")
    for repo_url, count in top_repos:
        print(f"  {count:3d} modules: {repo_url}")

    print("\n" + "="*60)
    print("Use this JSON file with:")
    print(f"  oe-go-mod-fetcher.py --native-modules {output_path}")
    print("="*60)


def main():
    """
    Command-line entry point.

    Parses --gomodcache/--output, extracts module metadata with
    extract_modules(), writes the JSON file plus a sorted module@version
    list next to it, and prints a summary. Exits with status 1 when the
    cache directory is missing, extraction fails, no modules are found, or
    an output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description='Extract module metadata from Go module cache',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Extract from native Go build cache
    %(prog)s --gomodcache /tmp/k3s-discovery-cache --output /tmp/k3s-modules.json

    # Extract from BitBake discovery build
    %(prog)s --gomodcache /path/to/build/tmp/work/.../discovery-cache --output /tmp/k3s-modules.json

    # Extract from system GOMODCACHE
    %(prog)s --gomodcache ~/go/pkg/mod --output /tmp/modules.json

Output:
    - <output>.json: Complete module metadata (VCS URLs, commits, subdirs)
    - <output>.txt: Simple module@version list (sorted)
"""
    )
    parser.add_argument(
        '--gomodcache',
        required=True,
        help='Path to GOMODCACHE directory'
    )
    parser.add_argument(
        '--output',
        required=True,
        help='Output JSON file path (e.g., /tmp/k3s-modules.json)'
    )

    args = parser.parse_args()

    # Validate GOMODCACHE path
    gomodcache = Path(args.gomodcache)
    if not gomodcache.exists():
        print(f"Error: GOMODCACHE directory does not exist: {gomodcache}", file=sys.stderr)
        sys.exit(1)

    # Extract modules
    try:
        modules = extract_modules(gomodcache)
    except Exception as e:
        # Top-level boundary: report any extraction failure and exit non-zero
        print(f"Error during extraction: {e}", file=sys.stderr)
        sys.exit(1)

    if not modules:
        print("Warning: No modules with VCS metadata found!", file=sys.stderr)
        print("This may indicate:", file=sys.stderr)
        print("  - GOMODCACHE is from BitBake (synthetic .info files)", file=sys.stderr)
        print("  - GOMODCACHE is empty or incomplete", file=sys.stderr)
        print("  - Need to run 'go mod download' first", file=sys.stderr)
        sys.exit(1)

    # Save as JSON
    output_path = Path(args.output)
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(modules, indent=2, sort_keys=True),
                               encoding='utf-8')
        print(f"\n✓ Saved {len(modules)} modules to {output_path}")
    except Exception as e:
        print(f"Error writing JSON output: {e}", file=sys.stderr)
        sys.exit(1)

    # Also save simple list
    list_path = output_path.with_suffix('.txt')
    if list_path == output_path:
        # --output itself ends in .txt: writing the list to the same path
        # would silently overwrite the JSON just saved, so append a second
        # suffix instead.
        list_path = output_path.with_name(output_path.name + '.txt')
    try:
        simple_list = [f"{m['module_path']}@{m['version']}" for m in modules]
        list_path.write_text('\n'.join(sorted(simple_list)) + '\n', encoding='utf-8')
        print(f"✓ Saved module list to {list_path}")
    except Exception as e:
        print(f"Error writing module list: {e}", file=sys.stderr)
        sys.exit(1)

    # Print summary statistics
    _print_summary(modules, output_path)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()