scripts/generate-license-hashes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
#
# Generate license-hashes.csv from OE-core's common-licenses directory.
#
# Usage:
#   python3 generate-license-hashes.py /path/to/meta/files/common-licenses
#
# Output: CSV to stdout with columns: exact_md5,crunched_md5,spdx_id
# (a one-line entry count is written to stderr, not to the CSV).
# Redirect stdout to scripts/data/license-hashes.csv to update the bundled
# database.

import hashlib
import os
import re
import sys


def squashspaces(s):
    """Collapse every run of whitespace in *s* to one space and trim the ends."""
    collapsed = re.compile(r"\s+").sub(" ", s)
    return collapsed.strip()


def crunch_license(licfile):
    """
    Normalize license text by removing non-material content, then compute MD5.
    Same algorithm as OE-core's oe/license_finder.py _crunch_license().

    Each line of *licfile* is processed as follows:

    1. Lines that look like a copyright notice, "All rights reserved"
       disclaimer, e-mail address, comment-box header/blank line, license
       tag, bare URL, license title or "licensed under" statement are
       dropped.
    2. '*' and '#' are removed, "sub-license" becomes "sublicense", runs of
       whitespace collapse to one space, all quote styles become "'" and
       braces become square brackets.
    3. Lines that end up empty are dropped.

    The surviving lines are joined with single spaces, encoded as UTF-8 and
    hashed.

    The file is always decoded as UTF-8 (with surrogateescape) rather than
    with the locale's preferred encoding, so the resulting hash does not
    depend on the environment the script is run in.
    NOTE(review): assumes OE-core computes its hashes under a UTF-8 locale,
    so that pinning UTF-8 here yields the same values -- confirm.

    Returns the hex MD5 digest, or None when the normalized text cannot be
    encoded as UTF-8 (i.e. the file contained bytes that are not valid
    UTF-8 and were carried through as surrogate escapes).
    """
    # Note: the patterns are matched against the raw line, trailing newline
    # included; '$' matches just before that newline.
    license_title_re = re.compile(
        r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
    license_statement_re = re.compile(
        r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)'
        r'|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
    copyright_re = re.compile(
        r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
    disclaimer_re = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
    email_re = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
    header_re = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
    tag_re = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
    url_re = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')

    # A line matching any one of these carries no material license text.
    # Every match has the same effect (skip the line), so order is irrelevant.
    skip_patterns = (
        copyright_re,
        disclaimer_re,
        email_re,
        header_re,
        tag_re,
        url_re,
        license_title_re,
        license_statement_re,
    )

    # Smart quotes, double quotes and backticks all become a single quote;
    # curly braces become square brackets.
    unify_table = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": "'",
        "\u201d": "'",
        '"': "'",
        '`': "'",
        "{": "[",
        "}": "]",
    })

    lictext = []
    # surrogateescape lets undecodable bytes through the read; they are
    # detected later when the text is re-encoded for hashing.
    with open(licfile, 'r', encoding='utf-8', errors='surrogateescape') as f:
        for line in f:
            if any(pattern.match(line) for pattern in skip_patterns):
                continue
            # Strip comment symbols (must happen before the spelling fix so
            # that decorated text is unified as well).
            line = line.replace('*', '').replace('#', '')
            # Unify spelling.
            line = line.replace('sub-license', 'sublicense')
            line = squashspaces(line)
            line = line.translate(unify_table)
            if line:
                lictext.append(line)

    try:
        payload = ' '.join(lictext).encode('utf-8')
    except UnicodeEncodeError:
        # Surrogate-escaped bytes cannot be encoded: no crunched hash.
        return None
    return hashlib.md5(payload).hexdigest()


def md5_file(path):
    """Return the hex MD5 digest of the raw bytes of the file at *path*.

    The file is opened in binary mode and fed to the hash in 8192-byte
    reads, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        while True:
            block = stream.read(8192)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()


def generate_hashes(common_license_dir):
    """Build the hash table rows for every regular file in a directory.

    Directory entries are visited in sorted name order; anything that is not
    a regular file is ignored. Returns a list of
    (exact_md5, crunched_md5, file_name) tuples, where crunched_md5 is the
    empty string when crunch_license() yields no hash.
    """
    rows = []
    for name in sorted(os.listdir(common_license_dir)):
        candidate = os.path.join(common_license_dir, name)
        if os.path.isfile(candidate):
            raw_digest = md5_file(candidate)
            crunched_digest = crunch_license(candidate)
            if not crunched_digest:
                crunched_digest = ''
            rows.append((raw_digest, crunched_digest, name))
    return rows


def main():
    """Command-line entry point.

    Expects exactly one argument, the common-licenses directory. Writes the
    CSV (two '#' header lines followed by one row per file) to stdout and an
    entry count to stderr. Exits with status 1 on a usage error or when the
    argument is not a directory.
    """
    argv = sys.argv
    if len(argv) != 2:
        print(f"Usage: {argv[0]} <common-licenses-dir>", file=sys.stderr)
        print(f"  Generates license-hashes.csv to stdout", file=sys.stderr)
        sys.exit(1)

    common_license_dir = argv[1]
    if not os.path.isdir(common_license_dir):
        print(f"Error: not a directory: {common_license_dir}", file=sys.stderr)
        sys.exit(1)

    rows = generate_hashes(common_license_dir)

    header_lines = (
        "# Generated by generate-license-hashes.py from OE-core common-licenses",
        "# Format: exact_md5,crunched_md5,spdx_id",
    )
    for header in header_lines:
        print(header)
    for row in rows:
        # Each row is (exact_md5, crunched_md5, spdx_id), all strings.
        print(",".join(row))

    # The summary goes to stderr so it never ends up in the redirected CSV.
    print(f"# Generated {len(rows)} entries", file=sys.stderr)


# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()