#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
#
# Generate license-hashes.csv from OE-core's common-licenses directory.
#
# Usage:
#   python3 generate-license-hashes.py /path/to/meta/files/common-licenses
#
# Output: CSV to stdout with columns: md5,crunched_md5,spdx_id
# Redirect to scripts/data/license-hashes.csv to update the bundled database.

import hashlib
import os
import re
import sys

_WHITESPACE_RUN_RE = re.compile(r"\s+")

# A line matching any of these patterns is dropped before hashing.  The
# patterns are compiled once at import time instead of once per license file.
_LICENSE_TITLE_RE = re.compile(
    r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
_LICENSE_STATEMENT_RE = re.compile(
    r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)'
    r'|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
_COPYRIGHT_RE = re.compile(
    r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
_DISCLAIMER_RE = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
_EMAIL_RE = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
_HEADER_RE = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
_TAG_RE = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
_URL_RE = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')

_SKIPPED_LINE_PATTERNS = (
    _COPYRIGHT_RE,
    _DISCLAIMER_RE,
    _EMAIL_RE,
    _HEADER_RE,
    _TAG_RE,
    _URL_RE,
    _LICENSE_TITLE_RE,
    _LICENSE_STATEMENT_RE,
)

# Comment/markup decoration characters removed from every kept line.
_STRIP_DECORATION = str.maketrans('', '', '*#')

# Every quote style collapses to a plain apostrophe and braces become
# brackets, so purely typographic differences do not change the hash.
_NORMALIZE_PUNCTUATION = str.maketrans({
    u"\u2018": "'",
    u"\u2019": "'",
    u"\u201c": "'",
    u"\u201d": "'",
    '"': "'",
    '`': "'",
    '{': '[',
    '}': ']',
})


def squashspaces(s):
    """Collapse each whitespace run in *s* to one space and trim both ends."""
    return _WHITESPACE_RUN_RE.sub(" ", s).strip()


def crunch_license(licfile):
    """
    Normalize license text by removing non-material content, then compute MD5.

    Same algorithm as OE-core's oe/license_finder.py _crunch_license().

    Lines that are only a copyright notice, "All rights reserved" disclaimer,
    e-mail address, comment-rule header, license tag, URL, license title or
    license statement are skipped.  The remaining lines have '*' and '#'
    removed, 'sub-license' rewritten to 'sublicense', whitespace squashed and
    quotes/braces normalized; the non-empty results are joined with single
    spaces and hashed.

    The file is always decoded as UTF-8 so the result does not depend on the
    locale of the machine running the script.

    Returns the hex MD5 digest, or None when the text cannot be encoded as
    UTF-8 (the file contained bytes that are not valid UTF-8).
    """
    lictext = []
    # surrogateescape keeps undecodable bytes as lone surrogates instead of
    # raising while reading; they surface as UnicodeEncodeError below.
    with open(licfile, 'r', encoding='utf-8', errors='surrogateescape') as f:
        for line in f:
            if any(pattern.match(line) for pattern in _SKIPPED_LINE_PATTERNS):
                continue

            line = line.translate(_STRIP_DECORATION)
            line = line.replace('sub-license', 'sublicense')
            line = squashspaces(line)
            line = line.translate(_NORMALIZE_PUNCTUATION)
            if line:
                lictext.append(line)

    try:
        crunched = ' '.join(lictext).encode('utf-8')
    except UnicodeEncodeError:
        return None
    return hashlib.md5(crunched).hexdigest()


def md5_file(path):
    """Return the hex MD5 digest of the raw bytes of the file at *path*."""
    m = hashlib.md5()
    with open(path, 'rb') as f:
        # Read in fixed-size chunks so large files are never held in memory.
        for chunk in iter(lambda: f.read(8192), b''):
            m.update(chunk)
    return m.hexdigest()


def generate_hashes(common_license_dir):
    """
    Generate CSV rows from a common-licenses directory.

    Returns a list of (exact_md5, crunched_md5, file_name) tuples, one per
    regular file directly inside *common_license_dir*, ordered by file name.
    Subdirectories are ignored.  crunched_md5 is an empty string when
    crunch_license() could not produce a hash for the file.
    """
    entries = []
    for fn in sorted(os.listdir(common_license_dir)):
        path = os.path.join(common_license_dir, fn)
        if not os.path.isfile(path):
            continue
        exact_md5 = md5_file(path)
        crunched_md5 = crunch_license(path) or ''
        entries.append((exact_md5, crunched_md5, fn))
    return entries


def main():
    """
    Command-line entry point.

    Expects exactly one argument, the common-licenses directory.  Writes the
    CSV (preceded by two '#' comment lines) to stdout and an entry count to
    stderr.  Exits with status 1 on a usage error or when the argument is not
    a directory.
    """
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <common-licenses-dir>", file=sys.stderr)
        print(" Generates license-hashes.csv to stdout", file=sys.stderr)
        sys.exit(1)

    common_license_dir = sys.argv[1]
    if not os.path.isdir(common_license_dir):
        print(f"Error: not a directory: {common_license_dir}", file=sys.stderr)
        sys.exit(1)

    entries = generate_hashes(common_license_dir)

    print("# Generated by generate-license-hashes.py from OE-core common-licenses")
    print("# Format: exact_md5,crunched_md5,spdx_id")
    for exact_md5, crunched_md5, spdx_id in entries:
        print(f"{exact_md5},{crunched_md5},{spdx_id}")

    # The summary goes to stderr so that redirected stdout stays pure CSV.
    print(f"# Generated {len(entries)} entries", file=sys.stderr)


if __name__ == "__main__":
    main()