summaryrefslogtreecommitdiffstats
path: root/meta/lib/oe/license_finder.py
diff options
context:
space:
mode:
Diffstat (limited to 'meta/lib/oe/license_finder.py')
-rw-r--r--meta/lib/oe/license_finder.py179
1 files changed, 179 insertions, 0 deletions
diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py
new file mode 100644
index 0000000000..16f5d7c94c
--- /dev/null
+++ b/meta/lib/oe/license_finder.py
@@ -0,0 +1,179 @@
1#
2# Copyright OpenEmbedded Contributors
3#
4# SPDX-License-Identifier: GPL-2.0-only
5#
6
7import fnmatch
8import hashlib
9import logging
10import os
11import re
12
13import bb
14import bb.utils
15
16logger = logging.getLogger("BitBake.OE.LicenseFinder")
17
def _load_hash_csv(d):
    """
    Build a {checksum: license name} dictionary from every
    files/license-hashes.csv file found under the directories listed in
    BBPATH.

    Directories are processed in BBPATH order, so an entry read from a later
    directory replaces an entry with the same checksum read from an earlier
    one.
    """
    import csv

    hashes = {}
    for search_dir in d.getVar('BBPATH').split(':'):
        candidate = os.path.join(search_dir, 'files', 'license-hashes.csv')
        if not os.path.isfile(candidate):
            continue
        # The column names are supplied explicitly, so the first row of the
        # file is treated as data rather than as a header.
        with open(candidate, newline='') as handle:
            rows = csv.DictReader(handle, delimiter=',', fieldnames=['md5sum', 'license'])
            hashes.update({row['md5sum']: row['license'] for row in rows})

    return hashes
36
37
def _crunch_known_licenses(d):
    """
    Calculate the MD5 checksums for the original and "crunched" versions of all
    known licenses.

    The known licenses are the entries of COMMON_LICENSE_DIR and of every
    directory listed (whitespace separated) in LICENSE_PATH. Returns a
    dictionary mapping each checksum to the file name of the license it was
    calculated from.
    """
    md5sums = {}

    lic_dirs = [d.getVar('COMMON_LICENSE_DIR')] + (d.getVar('LICENSE_PATH') or "").split()
    for lic_dir in lic_dirs:
        for fn in os.listdir(lic_dir):
            path = os.path.join(lic_dir, fn)
            # Hash the exact contents
            md5sums[bb.utils.md5_file(path)] = fn
            # Also hash a "crunched" version. _crunch_license() returns None
            # when the crunched text cannot be hashed; recording that under a
            # None key would make any other unhashable file look like this
            # license, so such results are left out.
            crunched_md5 = _crunch_license(path)
            if crunched_md5 is not None:
                md5sums[crunched_md5] = fn

    return md5sums
57
58
def _crunch_license(licfile):
    '''
    Remove non-material text from a license file and then calculate its
    md5sum. This works well for licenses that contain a copyright statement,
    but is also a useful way to handle people's insistence upon reformatting
    the license text slightly (with no material difference to the text of the
    license).

    The file is always decoded as UTF-8 so that the checksum does not depend
    on the locale of the calling process. Returns the hex digest of the
    crunched text, or None if that text cannot be encoded as UTF-8 (bytes
    that could not be decoded are carried through by the surrogateescape
    handler and cannot be re-encoded).
    '''

    import oe.utils

    # Note: these are carefully constructed!
    license_title_re = re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
    license_statement_re = re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
    copyright_re = re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
    disclaimer_re = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
    email_re = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
    header_re = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
    tag_re = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
    url_re = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')

    # A line matching any one of these is dropped entirely
    skipped_line_res = (
        copyright_re,
        disclaimer_re,
        email_re,
        header_re,
        tag_re,
        url_re,
        license_title_re,
        license_statement_re,
    )

    # Smart quotes, double quotes and backticks all become single quotes, and
    # curly brackets become square brackets
    unify_chars = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": "'",
        "\u201d": "'",
        '"': "'",
        '`': "'",
        "{": "[",
        "}": "]",
    })

    lictext = []
    with open(licfile, 'r', encoding='utf-8', errors='surrogateescape') as f:
        for line in f:
            # Drop opening statements
            if any(regex.match(line) for regex in skipped_line_res):
                continue
            # Strip comment symbols
            line = line.replace('*', '').replace('#', '')
            # Unify spelling
            line = line.replace('sub-license', 'sublicense')
            # Squash spaces
            line = oe.utils.squashspaces(line.strip())
            # Unify quotes and brackets
            line = line.translate(unify_chars)
            if line:
                lictext.append(line)

    try:
        crunched = ' '.join(lictext).encode('utf-8')
    except UnicodeEncodeError:
        return None
    return hashlib.md5(crunched).hexdigest()
121
122
def find_license_files(srctree, first_only=False):
    """
    Search srctree for files that look like they could be licenses.
    If first_only is True, only return the first file found.

    Returns a list of paths, each one starting with srctree. The tree is
    walked top-down with both the subdirectories and the files of every
    directory visited in sorted order, so the result (and in particular the
    file picked when first_only is set) does not depend on the order in which
    the filesystem happens to list directory entries.
    """
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go", ".sh")
    licfiles = []
    for root, dirs, files in os.walk(srctree):
        # os.walk() descends into dirs in the order of this list when walking
        # top-down, so sorting it in place fixes the traversal order.
        dirs.sort()
        # Sort files so that LICENSE is before LICENSE.subcomponent, which is
        # meaningful if first_only is set.
        for fn in sorted(files):
            if fn.endswith(skip_extensions):
                continue
            # A name may match several patterns; it is still only listed once
            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
                licfiles.append(os.path.join(root, fn))
                if first_only:
                    return licfiles

    return licfiles
146
147
def match_licenses(licfiles, srctree, d, extra_hashes=None):
    """
    Work out which license each of the files in licfiles contains by
    comparing checksums.

    The table of known checksums is assembled from the license-hashes.csv
    files, then the known license files, then extra_hashes (an optional
    {md5sum: license name} mapping); later sources take precedence. Each file
    is looked up by the checksum of its exact contents first and by the
    checksum of its "crunched" text second.

    Returns a list of (license name, path relative to srctree, md5sum of the
    exact contents) tuples ordered by path. Files that cannot be identified
    are reported with the license name 'Unknown'.
    """
    md5sums = {}
    md5sums.update(_load_hash_csv(d))
    md5sums.update(_crunch_known_licenses(d))
    if extra_hashes:
        md5sums.update(extra_hashes)

    licenses = []
    for licfile in sorted(licfiles):
        resolved_licfile = d.expand(licfile)
        md5value = bb.utils.md5_file(resolved_licfile)
        lic_name = md5sums.get(md5value)
        if not lic_name:
            crunched_md5 = _crunch_license(resolved_licfile)
            # None means the crunched text could not be hashed; it must never
            # be used as a lookup key or it could match an unrelated entry.
            if crunched_md5 is not None:
                lic_name = md5sums.get(crunched_md5)
        if not lic_name:
            lic_name = 'Unknown'
            logger.info("Please add the following line for '%s' to a 'license-hashes.csv' "
                        "and replace `Unknown` with the license:\n"
                        "%s,Unknown",
                        os.path.relpath(licfile, os.path.join(srctree, "..")), md5value)

        licenses.append((lic_name, os.path.relpath(licfile, srctree), md5value))

    return licenses
171
172
def find_licenses(srctree, d, first_only=False, extra_hashes=None):
    """
    Find the files in srctree that look like licenses and identify them.

    first_only is handed to find_license_files() and extra_hashes (an
    optional {md5sum: license name} mapping) to match_licenses(), whose
    result is returned unchanged.
    """
    if extra_hashes is None:
        extra_hashes = {}

    licfiles = find_license_files(srctree, first_only)
    licenses = match_licenses(licfiles, srctree, d, extra_hashes)

    # FIXME should we grab at least one source file with a license header and add that too?

    return licenses