summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--meta/files/license-hashes.csv37
-rw-r--r--meta/lib/oe/license_finder.py242
2 files changed, 279 insertions, 0 deletions
diff --git a/meta/files/license-hashes.csv b/meta/files/license-hashes.csv
new file mode 100644
index 0000000000..80851111b3
--- /dev/null
+++ b/meta/files/license-hashes.csv
@@ -0,0 +1,37 @@
10636e73ff0215e8d672dc4c32c317bb3,GPL-2.0-only
212f884d2ae1ff87c09e5b7ccc2c4ca7e,GPL-2.0-only
318810669f13b87348459e611d31ab760,GPL-2.0-only
4252890d9eee26aab7b432e8b8a616475,LGPL-2.0-only
52d5025d4aa3495befef8f17206a5b0a1,LGPL-2.1-only
63214f080875748938ba060314b4f727d,LGPL-2.0-only
7385c55653886acac3821999a3ccd17b3,Artistic-1.0 | GPL-2.0-only
8393a5ca445f6965873eca0259a17f833,GPL-2.0-only
93b83ef96387f14655fc854ddc3c6bd57,Apache-2.0
103bf50002aefd002f49e7bb854063f7e7,LGPL-2.0-only
114325afd396febcb659c36b49533135d4,GPL-2.0-only
124fbd65380cdd255951079008b364516c,LGPL-2.1-only
1354c7042be62e169199200bc6477f04d1,BSD-3-Clause
1455ca817ccb7d5b5b66355690e9abc605,LGPL-2.0-only
1559530bdf33659b29e73d4adb9f9f6552,GPL-2.0-only
165f30f0716dfdd0d91eb439ebec522ec2,LGPL-2.0-only
176a6a8e020838b23406c81b19c1d46df6,LGPL-3.0-only
18751419260aa954499f7abaabaa882bbe,GPL-2.0-only
197fbc338309ac38fefcd64b04bb903e34,LGPL-2.1-only
208ca43cbc842c2336e835926c2166c28b,GPL-2.0-only
2194d55d512a9ba36caa9b7df079bae19f,GPL-2.0-only
229ac2e7cff1ddaf48b6eab6028f23ef88,GPL-2.0-only
239f604d8a4f8e74f4f5140845a21b6674,LGPL-2.0-only
24a6f89e2100d9b6cdffcea4f398e37343,LGPL-2.1-only
25b234ee4d69f5fce4486a80fdaf4a4263,GPL-2.0-only
26bbb461211a33b134d42ed5ee802b37ff,LGPL-2.1-only
27bfe1f75d606912a4111c90743d6c7325,MPL-1.1-only
28c93c0550bd3173f4504b2cbd8991e50b,GPL-2.0-only
29d32239bcb673463ab874e80d47fae504,GPL-3.0-only
30d7810fab7487fb0aad327b76f1be7cd7,GPL-2.0-only
31d8045f3b8f929c1cb29a1e3fd737b499,LGPL-2.1-only
32db979804f025cf55aabec7129cb671ed,LGPL-2.0-only
33eb723b61539feef013de476e68b5c50a,GPL-2.0-only
34ebb5c50ab7cab4baeffba14977030c07,GPL-2.0-only
35f27defe1e96c2e1ecd4e0c9be8967949,GPL-3.0-only
36fad9b3332be894bab9bc501572864b29,LGPL-2.1-only
37fbc093901857fcd118f065f900982c24,LGPL-2.1-only
diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py
new file mode 100644
index 0000000000..5b09059576
--- /dev/null
+++ b/meta/lib/oe/license_finder.py
@@ -0,0 +1,242 @@
1#
2# Copyright OpenEmbedded Contributors
3#
4# SPDX-License-Identifier: GPL-2.0-only
5#
6
7import fnmatch
8import hashlib
9import logging
10import os
11import re
12
13import bb
14
15logger = logging.getLogger("BitBake.OE.LicenseFinder")
16
def get_license_md5sums(d, static_only=False, linenumbers=False):
    '''
    Build a lookup table of known license file checksums.

    d is queried through getVar() for COMMON_LICENSE_DIR and BBPATH.

    Unless static_only or linenumbers is set, every entry found in
    COMMON_LICENSE_DIR is hashed and recorded as md5sum -> file name.
    Afterwards each 'files/license-hashes.csv' found under a BBPATH entry
    (BBPATH is split on ':') is read; entries read later overwrite earlier
    ones that share the same md5sum.

    Returns a dict keyed by md5sum. Values are the license string, or, when
    linenumbers is set, a (license, beginline, endline, md5) tuple taken
    from the csv columns.
    '''
    import bb.utils
    import csv

    known = {}

    if not (static_only or linenumbers):
        # Gather md5sums of license files in common license dir
        licdir = d.getVar('COMMON_LICENSE_DIR')
        for name in os.listdir(licdir):
            digest = bb.utils.md5_file(os.path.join(licdir, name))
            known[digest] = name

    # The following were extracted from common values in various recipes
    # (double checking the license against the license file itself, not just
    # the LICENSE value in the recipe)

    # Read license md5sums from csv file
    columns = ['md5sum', 'license', 'beginline', 'endline', 'md5']
    for layer in d.getVar('BBPATH').split(':'):
        table = os.path.join(layer, 'files', 'license-hashes.csv')
        if not os.path.isfile(table):
            continue
        with open(table, newline='') as handle:
            for entry in csv.DictReader(handle, delimiter=',', fieldnames=columns):
                if linenumbers:
                    value = (entry['license'], entry['beginline'],
                             entry['endline'], entry['md5'])
                else:
                    value = entry['license']
                known[entry['md5sum']] = value

    return known
47
48
def crunch_known_licenses(d):
    '''
    Calculate the MD5 checksums for the crunched versions of all common
    licenses. Also add additional known checksums.

    d is queried through getVar() for COMMON_LICENSE_DIR; every entry in that
    directory is passed through crunch_license().

    Returns a dict mapping crunched md5sum -> license name. The hard-coded
    entries below take precedence over files from COMMON_LICENSE_DIR that
    crunch to the same checksum, and among those files the first one in
    sorted order wins.
    '''

    crunched_md5sums = {}

    # common licenses
    crunched_md5sums['ad4e9d34a2e966dfe9837f18de03266d'] = 'GFDL-1.1-only'
    crunched_md5sums['d014fb11a34eb67dc717fdcfc97e60ed'] = 'GFDL-1.2-only'
    crunched_md5sums['e020ca655b06c112def28e597ab844f1'] = 'GFDL-1.3-only'

    # The following two were gleaned from the "forever" npm package
    crunched_md5sums['0a97f8e4cbaf889d6fa51f84b89a79f6'] = 'ISC'
    # https://github.com/waffle-gl/waffle/blob/master/LICENSE.txt
    crunched_md5sums['50fab24ce589d69af8964fdbfe414c60'] = 'BSD-2-Clause'
    # https://github.com/spigwitmer/fakeds1963s/blob/master/LICENSE
    crunched_md5sums['88a4355858a1433fea99fae34a44da88'] = 'GPL-2.0-only'
    # http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
    crunched_md5sums['063b5c3ebb5f3aa4c85a2ed18a31fbe7'] = 'GPL-2.0-only'
    # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv2.1
    crunched_md5sums['7f5202f4d44ed15dcd4915f5210417d8'] = 'LGPL-2.1-only'
    # unixODBC-2.3.4 COPYING
    crunched_md5sums['3debde09238a8c8e1f6a847e1ec9055b'] = 'LGPL-2.1-only'
    # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
    crunched_md5sums['f90c613c51aa35da4d79dd55fc724ceb'] = 'LGPL-3.0-only'
    # https://raw.githubusercontent.com/eclipse/mosquitto/v1.4.14/epl-v10
    crunched_md5sums['efe2cb9a35826992b9df68224e3c2628'] = 'EPL-1.0'

    # https://raw.githubusercontent.com/jquery/esprima/3.1.3/LICENSE.BSD
    crunched_md5sums['80fa7b56a28e8c902e6af194003220a5'] = 'BSD-2-Clause'
    # https://raw.githubusercontent.com/npm/npm-install-checks/master/LICENSE
    crunched_md5sums['e659f77bfd9002659e112d0d3d59b2c1'] = 'BSD-2-Clause'
    # https://raw.githubusercontent.com/silverwind/default-gateway/4.2.0/LICENSE
    crunched_md5sums['4c641f2d995c47f5cb08bdb4b5b6ea05'] = 'BSD-2-Clause'
    # https://raw.githubusercontent.com/tad-lispy/node-damerau-levenshtein/v1.0.5/LICENSE
    crunched_md5sums['2b8c039b2b9a25f0feb4410c4542d346'] = 'BSD-2-Clause'
    # https://raw.githubusercontent.com/terser/terser/v3.17.0/LICENSE
    crunched_md5sums['8bd23871802951c9ad63855151204c2c'] = 'BSD-2-Clause'
    # https://raw.githubusercontent.com/alexei/sprintf.js/1.0.3/LICENSE
    crunched_md5sums['008c22318c8ea65928bf730ddd0273e3'] = 'BSD-3-Clause'
    # https://raw.githubusercontent.com/Caligatio/jsSHA/v3.2.0/LICENSE
    crunched_md5sums['0e46634a01bfef056892949acaea85b1'] = 'BSD-3-Clause'
    # https://raw.githubusercontent.com/d3/d3-path/v1.0.9/LICENSE
    crunched_md5sums['b5f72aef53d3b2b432702c30b0215666'] = 'BSD-3-Clause'
    # https://raw.githubusercontent.com/feross/ieee754/v1.1.13/LICENSE
    crunched_md5sums['a39327c997c20da0937955192d86232d'] = 'BSD-3-Clause'
    # https://raw.githubusercontent.com/joyent/node-extsprintf/v1.3.0/LICENSE
    crunched_md5sums['721f23a96ff4161ca3a5f071bbe18108'] = 'MIT'
    # https://raw.githubusercontent.com/pvorb/clone/v0.2.0/LICENSE
    crunched_md5sums['b376d29a53c9573006b9970709231431'] = 'MIT'
    # https://raw.githubusercontent.com/andris9/encoding/v0.1.12/LICENSE
    crunched_md5sums['85d8a977ee9d7c5ab4ac03c9b95431c4'] = 'MIT-0'
    # https://raw.githubusercontent.com/faye/websocket-driver-node/0.7.3/LICENSE.md
    crunched_md5sums['b66384e7137e41a9b1904ef4d39703b6'] = 'Apache-2.0'
    # https://raw.githubusercontent.com/less/less.js/v4.1.1/LICENSE
    crunched_md5sums['b27575459e02221ccef97ec0bfd457ae'] = 'Apache-2.0'
    # https://raw.githubusercontent.com/microsoft/TypeScript/v3.5.3/LICENSE.txt
    crunched_md5sums['a54a1a6a39e7f9dbb4a23a42f5c7fd1c'] = 'Apache-2.0'
    # https://raw.githubusercontent.com/request/request/v2.87.0/LICENSE
    crunched_md5sums['1034431802e57486b393d00c5d262b8a'] = 'Apache-2.0'
    # https://raw.githubusercontent.com/dchest/tweetnacl-js/v0.14.5/LICENSE
    crunched_md5sums['75605e6bdd564791ab698fca65c94a4f'] = 'Unlicense'
    # https://raw.githubusercontent.com/stackgl/gl-mat3/v2.0.0/LICENSE.md
    crunched_md5sums['75512892d6f59dddb6d1c7e191957e9c'] = 'Zlib'

    commonlicdir = d.getVar('COMMON_LICENSE_DIR')
    for fn in sorted(os.listdir(commonlicdir)):
        md5value, _ = crunch_license(os.path.join(commonlicdir, fn))
        if md5value is None:
            # crunch_license() reports a text it could not encode by
            # returning None as the checksum. Registering that under the key
            # None would make every other unencodable license file "match"
            # this license, so leave it out of the table.
            bb.debug(2, "Unable to compute a crunched md5sum for '%s', skipping it" % fn)
            continue
        if md5value not in crunched_md5sums:
            crunched_md5sums[md5value] = fn
        elif fn != crunched_md5sums[md5value]:
            bb.debug(2, "crunched_md5sums['%s'] is already set to '%s' rather than '%s'" % (md5value, crunched_md5sums[md5value], fn))
        else:
            bb.debug(2, "crunched_md5sums['%s'] is already set to '%s'" % (md5value, crunched_md5sums[md5value]))

    return crunched_md5sums
127
128
def crunch_license(licfile):
    '''
    Remove non-material text from a license file and then calculate its
    md5sum. This works well for licenses that contain a copyright statement,
    but is also a useful way to handle people's insistence upon reformatting
    the license text slightly (with no material difference to the text of the
    license).

    Returns a (md5sum, lictext) pair: the hex digest of the remaining lines
    joined by single spaces, and the list of those lines. If the joined text
    cannot be encoded as UTF-8 the pair is (None, '').
    '''

    import oe.utils

    # Note: these are carefully constructed!
    # A line matching any one of these is dropped entirely.
    droppable = (
        # copyright statement
        re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$'),
        # disclaimer
        re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$'),
        # email address
        re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$'),
        # header / comment decoration
        re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$'),
        # tag
        re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$'),
        # url
        re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$'),
        # license title
        re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$'),
        # license statement
        re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$'),
    )

    # Comment symbols are deleted outright
    comment_symbols = str.maketrans('', '', '*#')
    # Smart quotes, double quotes and backticks become single quotes;
    # curly brackets become square brackets
    unify_punctuation = str.maketrans({
        u"\u2018": "'",
        u"\u2019": "'",
        u"\u201c": "'",
        u"\u201d": "'",
        '"': '\'',
        '`': '\'',
        "{": "[",
        "}": "]",
    })

    kept = []
    with open(licfile, 'r', errors='surrogateescape') as f:
        for raw in f:
            # Drop opening statements
            if any(pattern.match(raw) for pattern in droppable):
                continue
            # Strip comment symbols
            text = raw.translate(comment_symbols)
            # Unify spelling
            text = text.replace('sub-license', 'sublicense')
            # Squash spaces
            text = oe.utils.squashspaces(text.strip())
            # Unify quotes and brackets
            text = text.translate(unify_punctuation)
            if text:
                kept.append(text)

    digest = hashlib.md5()
    try:
        digest.update(' '.join(kept).encode('utf-8'))
    except UnicodeEncodeError:
        return None, ''
    return digest.hexdigest(), kept
192
193
def find_license_files(srctree):
    '''
    Walk srctree and return the paths of all files whose name looks like a
    license file.

    A file qualifies when its name matches at least one of the glob patterns
    in licspecs and does not end with one of skip_extensions. Each returned
    path is os.path.join() of the directory reported by os.walk() and the
    file name; paths are returned in os.walk() order and each file appears
    once. An empty list is returned when nothing matches.
    '''
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go")
    licfiles = []
    for root, _dirs, files in os.walk(srctree):
        for fn in files:
            if fn.endswith(skip_extensions):
                continue
            # os.walk() reports every file exactly once, so a path could only
            # be duplicated by matching several patterns. Stopping at the
            # first matching pattern avoids that without scanning the result
            # list for every hit.
            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
                licfiles.append(os.path.join(root, fn))

    return licfiles
209
210
def match_licenses(licfiles, srctree, d):
    '''
    Try to identify the license of each file in licfiles.

    Every path is expanded with d.expand() and hashed. The md5sum is looked
    up in the table from get_license_md5sums(); if that gives nothing, the
    file is crunched with crunch_license() and the crunched md5sum is looked
    up in the table from crunch_known_licenses(). A file that still has no
    match but produced crunched text is reported as 'Unknown', and a hint on
    how to register it in a 'license-hashes.csv' is logged.

    Returns a list of (license, path relative to srctree, md5sum) tuples in
    sorted order of licfiles. Files for which no license value was
    determined are left out.
    '''
    import bb

    exact_md5sums = get_license_md5sums(d)
    crunched_md5sums = crunch_known_licenses(d)

    matched = []
    for candidate in sorted(licfiles):
        expanded = d.expand(candidate)
        checksum = bb.utils.md5_file(expanded)
        identified = exact_md5sums.get(checksum, None)
        if not identified:
            # No exact match: retry with the normalised ("crunched") text
            crunched_checksum, crunched_text = crunch_license(expanded)
            identified = crunched_md5sums.get(crunched_checksum, None)
            if crunched_text and not identified:
                identified = 'Unknown'
                hint_path = os.path.relpath(candidate, srctree + "/..")
                logger.info("Please add the following line for '%s' to a 'license-hashes.csv' " \
                    "and replace `Unknown` with the license:\n" \
                    "%s,Unknown" % (hint_path, checksum))
        if not identified:
            continue
        matched.append((identified, os.path.relpath(candidate, srctree), checksum))

    return matched
234
235
def find_licenses(srctree, d):
    '''
    Locate the license files below srctree with find_license_files() and
    return the result of running match_licenses() on them.
    '''
    # FIXME should we grab at least one source file with a license header and add that too?
    return match_licenses(find_license_files(srctree), srctree, d)