1 files changed, 179 insertions, 0 deletions
diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py
new file mode 100644
index 0000000000..16f5d7c94c
--- /dev/null
+++ b/meta/lib/oe/license_finder.py
@@ -0,0 +1,179 @@
+#
+# Copyright OpenEmbedded Contributors
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+import fnmatch
+import hashlib
+import logging
+import os
+import re
+import bb
+import bb.utils
+logger = logging.getLogger("BitBake.OE.LicenseFinder")
+def _load_hash_csv(d):
+    """
+    Build a {md5 checksum: license name} dictionary from every
+    files/license-hashes.csv that exists below an entry of BBPATH.
+
+    Each CSV row is read as "md5sum,license". The BBPATH entries are
+    processed in order, so a checksum that appears again under a later entry
+    replaces the license name recorded for it earlier.
+    """
+    import csv
+
+    columns = ['md5sum', 'license']
+    known = {}
+    for search_dir in d.getVar('BBPATH').split(':'):
+        candidate = os.path.join(search_dir, 'files', 'license-hashes.csv')
+        # Most BBPATH entries will not ship a hash list; just move on
+        if not os.path.isfile(candidate):
+            continue
+        with open(candidate, newline='') as handle:
+            for entry in csv.DictReader(handle, delimiter=',', fieldnames=columns):
+                known[entry['md5sum']] = entry['license']
+    return known
+def _crunch_known_licenses(d):
+    """
+    Calculate the MD5 checksums for the original and "crunched" versions of all
+    known licenses.
+
+    The license directories are COMMON_LICENSE_DIR plus every
+    whitespace-separated entry of LICENSE_PATH. Returns a dictionary mapping
+    each checksum to the file name of the license it was computed from.
+    """
+    md5sums = {}
+
+    lic_dirs = [d.getVar('COMMON_LICENSE_DIR')] + (d.getVar('LICENSE_PATH') or "").split()
+    for lic_dir in lic_dirs:
+        if not lic_dir:
+            # An unset variable must not reach os.listdir(): os.listdir(None)
+            # silently lists the current working directory.
+            continue
+        for fn in os.listdir(lic_dir):
+            path = os.path.join(lic_dir, fn)
+            if not os.path.isfile(path):
+                # Only regular files can be hashed; skip subdirectories etc.
+                continue
+            # Hash the exact contents
+            md5sums[bb.utils.md5_file(path)] = fn
+            # Also hash a "crunched" version. _crunch_license() returns None
+            # when the text cannot be encoded; recording that under a None key
+            # would make any other uncrunchable file "match" this license.
+            crunched_md5 = _crunch_license(path)
+            if crunched_md5 is not None:
+                md5sums[crunched_md5] = fn
+    return md5sums
+def _crunch_license(licfile):
+    '''
+    Return the md5sum of a license file after its non-material text has been
+    removed, or None if the remaining text cannot be encoded as UTF-8.
+
+    Lines that are only a copyright statement, disclaimer, e-mail address,
+    comment header, tag, URL, license title or license statement are dropped.
+    The remaining lines have comment symbols removed, "sub-license" spelling
+    unified, whitespace squashed and quotes/brackets unified, so that copies
+    of a license which were merely reformatted produce the same checksum.
+    '''
+    import oe.utils
+
+    # Note: these are carefully constructed!
+    # A line matching any one of them is considered non-material.
+    boilerplate_res = (
+        # Copyright statement
+        re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$'),
+        # Disclaimer
+        re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$'),
+        # E-mail address
+        re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$'),
+        # Comment header / separator
+        re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$'),
+        # Tag
+        re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$'),
+        # URL
+        re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$'),
+        # License title
+        re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$'),
+        # License statement
+        re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$'),
+    )
+    # Comment symbols to delete
+    comment_symbols = str.maketrans({'*': None, '#': None})
+    # Smart quotes, double quotes and backticks become single quotes; curly
+    # brackets become square brackets
+    unify_chars = str.maketrans({
+        "\u2018": "'",
+        "\u2019": "'",
+        "\u201c": "'",
+        "\u201d": "'",
+        '"': "'",
+        '`': "'",
+        "{": "[",
+        "}": "]",
+    })
+
+    kept = []
+    with open(licfile, 'r', errors='surrogateescape') as f:
+        for raw in f:
+            # Drop opening statements
+            if any(pattern.match(raw) for pattern in boilerplate_res):
+                continue
+            text = raw.translate(comment_symbols)
+            # Unify spelling
+            text = text.replace('sub-license', 'sublicense')
+            # Squash spaces
+            text = oe.utils.squashspaces(text.strip())
+            text = text.translate(unify_chars)
+            if text:
+                kept.append(text)
+
+    try:
+        payload = ' '.join(kept).encode('utf-8')
+    except UnicodeEncodeError:
+        # Undecodable bytes were read as surrogates and cannot be re-encoded
+        return None
+    return hashlib.md5(payload).hexdigest()
+def find_license_files(srctree, first_only=False):
+    """
+    Search srctree for files that look like they could be licenses.
+
+    Every directory below srctree is walked. A file is reported when its name
+    matches one of the license glob patterns and does not end in one of the
+    skipped extensions.
+
+    If first_only is True, only return the first file found.
+
+    Returns a list of paths, each one the walked directory (rooted at
+    srctree) joined with the file name. Directories and files are visited in
+    sorted order, so the result is deterministic.
+    """
+    licspecs = [
+        '*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*',
+        '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10',
+    ]
+    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go", ".sh")
+    licfiles = []
+    for root, dirs, files in os.walk(srctree):
+        # Sorting dirs in place makes os.walk descend in a stable order, so
+        # the result (and the file picked by first_only) does not depend on
+        # the order in which the filesystem lists directory entries.
+        dirs.sort()
+        # Sort files so that LICENSE is before LICENSE.subcomponent, which is
+        # meaningful if first_only is set.
+        for fn in sorted(files):
+            if fn.endswith(skip_extensions):
+                continue
+            # any() records a name once even if several patterns match it
+            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
+                licfiles.append(os.path.join(root, fn))
+                if first_only:
+                    return licfiles
+    return licfiles
+def match_licenses(licfiles, srctree, d, extra_hashes=None):
+    """
+    Identify the license of each file in licfiles by its checksum.
+
+    The lookup table is built from the license-hashes.csv files, then the
+    known license files, then extra_hashes (a {checksum: license} mapping);
+    later sources override earlier ones. Each file is first looked up by the
+    MD5 of its exact contents and, failing that, by the MD5 of its "crunched"
+    text. A file that matches neither is reported as 'Unknown' and a hint for
+    extending license-hashes.csv is logged.
+
+    Returns a list of (license, path relative to srctree, md5 of the exact
+    contents) tuples, in sorted order of licfiles.
+    """
+    md5sums = {}
+    md5sums.update(_load_hash_csv(d))
+    md5sums.update(_crunch_known_licenses(d))
+    if extra_hashes:
+        md5sums.update(extra_hashes)
+
+    licenses = []
+    for licfile in sorted(licfiles):
+        resolved_licfile = d.expand(licfile)
+        md5value = bb.utils.md5_file(resolved_licfile)
+        license_name = md5sums.get(md5value)
+        if not license_name:
+            crunched_md5 = _crunch_license(resolved_licfile)
+            # None means the text could not be hashed. Never look that up: a
+            # None key in the table would otherwise yield a bogus match.
+            if crunched_md5 is not None:
+                license_name = md5sums.get(crunched_md5)
+        if not license_name:
+            license_name = 'Unknown'
+            logger.info("Please add the following line for '%s' to a 'license-hashes.csv' "
+                "and replace `Unknown` with the license:\n"
+                "%s,Unknown", os.path.relpath(licfile, srctree + "/.."), md5value)
+        licenses.append((license_name, os.path.relpath(licfile, srctree), md5value))
+    return licenses
+def find_licenses(srctree, d, first_only=False, extra_hashes=None):
+    """
+    Find the files in srctree that look like licenses and identify them.
+
+    srctree and first_only are passed to find_license_files(); the files it
+    returns are passed to match_licenses() together with srctree, d and
+    extra_hashes. Returns whatever match_licenses() returns.
+    """
+    licfiles = find_license_files(srctree, first_only)
+    # match_licenses() merges this value into its checksum table, so hand it
+    # an empty mapping when the caller supplied none.
+    hashes = {} if extra_hashes is None else extra_hashes
+    # FIXME should we grab at least one source file with a license header and add that too?
+    return match_licenses(licfiles, srctree, d, hashes)

diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py new file mode 100644 index 0000000000..16f5d7c94c --- /dev/null +++ b/meta/lib/oe/license_finder.py
@@ -0,0 +1,179 @@
	1	#
	2	# Copyright OpenEmbedded Contributors
	3	#
	4	# SPDX-License-Identifier: GPL-2.0-only
	5	#
	6
	7	import fnmatch
	8	import hashlib
	9	import logging
	10	import os
	11	import re
	12
	13	import bb
	14	import bb.utils
	15
	16	logger = logging.getLogger("BitBake.OE.LicenseFinder")
	17
	18	def _load_hash_csv(d):
	19	"""
	20	Load a mapping of (checksum: license name) from all files/license-hashes.csv
	21	files that can be found in the available layers.
	22	"""
	23	import csv
	24	md5sums = {}
	25
	26	# Read license md5sums from csv file
	27	for path in d.getVar('BBPATH').split(':'):
	28	csv_path = os.path.join(path, 'files', 'license-hashes.csv')
	29	if os.path.isfile(csv_path):
	30	with open(csv_path, newline='') as csv_file:
	31	reader = csv.DictReader(csv_file, delimiter=',', fieldnames=['md5sum', 'license'])
	32	for row in reader:
	33	md5sums[row['md5sum']] = row['license']
	34
	35	return md5sums
	36
	37
	38	def _crunch_known_licenses(d):
	39	"""
	40	Calculate the MD5 checksums for the original and "crunched" versions of all
	41	known licenses.
	42	"""
	43	md5sums = {}
	44
	45	lic_dirs = [d.getVar('COMMON_LICENSE_DIR')] + (d.getVar('LICENSE_PATH') or "").split()
	46	for lic_dir in lic_dirs:
	47	for fn in os.listdir(lic_dir):
	48	path = os.path.join(lic_dir, fn)
	49	# Hash the exact contents
	50	md5value = bb.utils.md5_file(path)
	51	md5sums[md5value] = fn
	52	# Also hash a "crunched" version
	53	md5value = _crunch_license(path)
	54	md5sums[md5value] = fn
	55
	56	return md5sums
	57
	58
	59	def _crunch_license(licfile):
	60	'''
	61	Remove non-material text from a license file and then calculate its
	62	md5sum. This works well for licenses that contain a copyright statement,
	63	but is also a useful way to handle people's insistence upon reformatting
	64	the license text slightly (with no material difference to the text of the
	65	license).
	66	'''
	67
	68	import oe.utils
	69
	70	# Note: these are carefully constructed!
	71	license_title_re = re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
	72	license_statement_re = re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
	73	copyright_re = re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
	74	disclaimer_re = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
	75	email_re = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
	76	header_re = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
	77	tag_re = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
	78	url_re = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')
	79
	80	lictext = []
	81	with open(licfile, 'r', errors='surrogateescape') as f:
	82	for line in f:
	83	# Drop opening statements
	84	if copyright_re.match(line):
	85	continue
	86	elif disclaimer_re.match(line):
	87	continue
	88	elif email_re.match(line):
	89	continue
	90	elif header_re.match(line):
	91	continue
	92	elif tag_re.match(line):
	93	continue
	94	elif url_re.match(line):
	95	continue
	96	elif license_title_re.match(line):
	97	continue
	98	elif license_statement_re.match(line):
	99	continue
	100	# Strip comment symbols
	101	line = line.replace('*', '') \
	102	.replace('#', '')
	103	# Unify spelling
	104	line = line.replace('sub-license', 'sublicense')
	105	# Squash spaces
	106	line = oe.utils.squashspaces(line.strip())
	107	# Replace smart quotes, double quotes and backticks with single quotes
	108	line = line.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c","'").replace(u"\u201d", "'").replace('"', '\'').replace('`', '\'')
	109	# Unify brackets
	110	line = line.replace("{", "[").replace("}", "]")
	111	if line:
	112	lictext.append(line)
	113
	114	m = hashlib.md5()
	115	try:
	116	m.update(' '.join(lictext).encode('utf-8'))
	117	md5val = m.hexdigest()
	118	except UnicodeEncodeError:
	119	md5val = None
	120	return md5val
	121
	122
	123	def find_license_files(srctree, first_only=False):
	124	"""
	125	Search srctree for files that look like they could be licenses.
	126	If first_only is True, only return the first file found.
	127	"""
	128	licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
	129	skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go", ".sh")
	130	licfiles = []
	131	for root, dirs, files in os.walk(srctree):
	132	# Sort files so that LICENSE is before LICENSE.subcomponent, which is
	133	# meaningful if first_only is set.
	134	for fn in sorted(files):
	135	if fn.endswith(skip_extensions):
	136	continue
	137	for spec in licspecs:
	138	if fnmatch.fnmatch(fn, spec):
	139	fullpath = os.path.join(root, fn)
	140	if not fullpath in licfiles:
	141	licfiles.append(fullpath)
	142	if first_only:
	143	return licfiles
	144
	145	return licfiles
	146
	147
	148	def match_licenses(licfiles, srctree, d, extra_hashes={}):
	149	md5sums = {}
	150	md5sums.update(_load_hash_csv(d))
	151	md5sums.update(_crunch_known_licenses(d))
	152	md5sums.update(extra_hashes)
	153
	154	licenses = []
	155	for licfile in sorted(licfiles):
	156	resolved_licfile = d.expand(licfile)
	157	md5value = bb.utils.md5_file(resolved_licfile)
	158	license = md5sums.get(md5value, None)
	159	if not license:
	160	crunched_md5 = _crunch_license(resolved_licfile)
	161	license = md5sums.get(crunched_md5, None)
	162	if not license:
	163	license = 'Unknown'
	164	logger.info("Please add the following line for '%s' to a 'license-hashes.csv' " \
	165	"and replace `Unknown` with the license:\n" \
	166	"%s,Unknown" % (os.path.relpath(licfile, srctree + "/.."), md5value))
	167
	168	licenses.append((license, os.path.relpath(licfile, srctree), md5value))
	169
	170	return licenses
	171
	172
	173	def find_licenses(srctree, d, first_only=False, extra_hashes={}):
	174	licfiles = find_license_files(srctree, first_only)
	175	licenses = match_licenses(licfiles, srctree, d, extra_hashes)
	176
	177	# FIXME should we grab at least one source file with a license header and add that too?
	178
	179	return licenses