1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
#
# Generate license-hashes.csv from OE-core's common-licenses directory.
#
# Usage:
# python3 generate-license-hashes.py /path/to/meta/files/common-licenses
#
# Output: CSV to stdout with columns: exact_md5,crunched_md5,spdx_id
# (preceded by two '#' comment lines; an entry count is written to stderr).
# Redirect to scripts/data/license-hashes.csv to update the bundled database.
import hashlib
import os
import re
import sys
def squashspaces(s):
    """Collapse each run of whitespace in *s* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()
def crunch_license(licfile):
    """
    Normalize license text by removing non-material content, then compute MD5.

    Same algorithm as OE-core's oe/license_finder.py _crunch_license().

    Lines that consist solely of a license title, a "licensed under" statement,
    a copyright notice, an "All rights reserved." disclaimer, an e-mail
    address, a comment ruler (this also covers blank lines), a license tag or
    a URL are dropped. Every remaining line has '*' and '#' removed,
    'sub-license' rewritten to 'sublicense', whitespace squashed, all quote
    styles unified to "'" and braces turned into square brackets. The
    non-empty results are joined with single spaces and hashed.

    The file is always decoded as UTF-8 (with surrogateescape) so that the
    resulting hash does not depend on the locale of the machine running the
    script.

    Returns the hex MD5 digest as a string, or None when the normalized text
    cannot be encoded as UTF-8 (the file contained bytes that are not valid
    UTF-8).
    """
    # Note: these patterns are applied with match(), i.e. anchored at the
    # start of the line, and every one of them ends in '$'.
    license_title_re = re.compile(
        r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
    license_statement_re = re.compile(
        r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)'
        r'|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
    copyright_re = re.compile(
        r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
    disclaimer_re = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
    email_re = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
    header_re = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
    tag_re = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
    url_re = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')

    # A line matching any of these carries no material license text.
    skip_patterns = (
        copyright_re,
        disclaimer_re,
        email_re,
        header_re,
        tag_re,
        url_re,
        license_title_re,
        license_statement_re,
    )

    # Comment symbols are deleted outright.
    strip_comment_chars = str.maketrans('', '', '*#')
    # Smart quotes, double quotes and backticks become single quotes;
    # braces become square brackets.
    unify_punctuation = str.maketrans({
        '\u2018': "'",
        '\u2019': "'",
        '\u201c': "'",
        '\u201d': "'",
        '"': "'",
        '`': "'",
        '{': '[',
        '}': ']',
    })

    lictext = []
    # Pin the encoding: without it open() falls back to the locale's preferred
    # encoding and the same file could hash differently on another host.
    with open(licfile, 'r', encoding='utf-8', errors='surrogateescape') as f:
        for line in f:
            if any(pattern.match(line) for pattern in skip_patterns):
                continue
            # Comment symbols must go before the spelling fix so that e.g.
            # 'sub-*license' is still unified.
            line = line.translate(strip_comment_chars)
            line = line.replace('sub-license', 'sublicense')
            line = squashspaces(line)
            line = line.translate(unify_punctuation)
            if line:
                lictext.append(line)

    try:
        # Surrogates produced by surrogateescape cannot be encoded as UTF-8.
        payload = ' '.join(lictext).encode('utf-8')
    except UnicodeEncodeError:
        return None
    return hashlib.md5(payload).hexdigest()
def md5_file(path):
    """Return the hex MD5 digest of the raw bytes of the file at *path*.

    The file is read in binary mode in 8192-byte pieces, so it is never held
    in memory as a whole.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def generate_hashes(common_license_dir):
    """Build the hash table rows for every regular file in a directory.

    Directory entries are visited in sorted name order; anything that is not
    a regular file is ignored. Returns a list of
    (exact_md5, crunched_md5, filename) tuples, where crunched_md5 is the
    empty string when crunch_license() yields no digest.
    """
    rows = []
    for name in sorted(os.listdir(common_license_dir)):
        candidate = os.path.join(common_license_dir, name)
        if os.path.isfile(candidate):
            raw_digest = md5_file(candidate)
            normalized_digest = crunch_license(candidate) or ''
            rows.append((raw_digest, normalized_digest, name))
    return rows
def main():
    """Command-line entry point: print the license hash table as CSV.

    Exactly one argument, the common-licenses directory, is expected. With a
    wrong argument count, or when the argument is not a directory, a message
    is written to stderr and the process exits with status 1. Otherwise two
    '#' header lines and one "exact_md5,crunched_md5,spdx_id" row per entry
    are written to stdout, and an entry count is written to stderr.
    """
    args = sys.argv
    if len(args) != 2:
        usage_lines = (
            f"Usage: {args[0]} <common-licenses-dir>",
            f" Generates license-hashes.csv to stdout",
        )
        for text in usage_lines:
            print(text, file=sys.stderr)
        sys.exit(1)

    common_license_dir = args[1]
    if not os.path.isdir(common_license_dir):
        print(f"Error: not a directory: {common_license_dir}", file=sys.stderr)
        sys.exit(1)

    entries = generate_hashes(common_license_dir)

    print("# Generated by generate-license-hashes.py from OE-core common-licenses")
    print("# Format: exact_md5,crunched_md5,spdx_id")
    for row in entries:
        # Every field is already a string, so a plain join yields the row.
        print(",".join(row))

    print(f"# Generated {len(entries)} entries", file=sys.stderr)
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|