summaryrefslogtreecommitdiffstats
path: root/meta/lib
diff options
context:
space:
mode:
authorRoss Burton <ross.burton@arm.com>2025-06-13 14:16:11 +0100
committerRichard Purdie <richard.purdie@linuxfoundation.org>2025-06-16 17:57:30 +0100
commit36adc8135d3eb8170f8944b6942704fa71589665 (patch)
treeb4f08bb435095411405c21350615e3a6c301821d /meta/lib
parent19953c90ce24653d2fbde3a79c2d5ca991f72564 (diff)
downloadpoky-36adc8135d3eb8170f8944b6942704fa71589665.tar.gz
lib/oe/license_finder: extract license finding code from recipetool
This code is 99% identical to the original code in recipetool/create.py, but with two minor changes:

- The implicit recipetool logger is changed to an explicit logger
- The CSV of license hashes is moved to meta/files/

(From OE-Core rev: b132652c6e520121c6b0e7e873b0d33ede0309b5)

Signed-off-by: Ross Burton <ross.burton@arm.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'meta/lib')
-rw-r--r--meta/lib/oe/license_finder.py242
1 files changed, 242 insertions, 0 deletions
diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py
new file mode 100644
index 0000000000..5b09059576
--- /dev/null
+++ b/meta/lib/oe/license_finder.py
@@ -0,0 +1,242 @@
1#
2# Copyright OpenEmbedded Contributors
3#
4# SPDX-License-Identifier: GPL-2.0-only
5#
6
7import fnmatch
8import hashlib
9import logging
10import os
11import re
12
13import bb
14
15logger = logging.getLogger("BitBake.OE.LicenseFinder")
16
def get_license_md5sums(d, static_only=False, linenumbers=False):
    '''
    Build a lookup table from MD5 checksum to license.

    Two sources are merged, later entries replacing earlier ones that use
    the same checksum:

    1. Every file in COMMON_LICENSE_DIR, keyed by the MD5 of the file and
       mapped to the file name. This step is skipped when static_only or
       linenumbers is set, or when the variable is unset.
    2. Every 'files/license-hashes.csv' found below an entry of BBPATH, read
       in BBPATH order. The columns are md5sum, license, beginline, endline
       and md5; the first row is read as data, not as a header.

    d only needs to provide getVar(). Returns a dict whose values are
    license names or, when linenumbers is true, (license, beginline,
    endline, md5) tuples. Columns missing from a short CSV row are None.
    '''
    import csv

    md5sums = {}
    if not static_only and not linenumbers:
        # Gather md5sums of license files in common license dir
        commonlicdir = d.getVar('COMMON_LICENSE_DIR')
        if commonlicdir:
            # Only this branch needs bitbake's helpers
            import bb.utils
            for fn in os.listdir(commonlicdir):
                md5value = bb.utils.md5_file(os.path.join(commonlicdir, fn))
                md5sums[md5value] = fn

    # The following were extracted from common values in various recipes
    # (double checking the license against the license file itself, not just
    # the LICENSE value in the recipe)

    # Read license md5sums from csv file
    fieldnames = ['md5sum', 'license', 'beginline', 'endline', 'md5']
    for path in (d.getVar('BBPATH') or '').split(':'):
        if not path:
            # An empty component would turn into a path relative to the
            # current working directory, which is never what was meant
            continue
        csv_path = os.path.join(path, 'files', 'license-hashes.csv')
        if not os.path.isfile(csv_path):
            continue
        # Explicit encoding so parsing does not depend on the locale
        with open(csv_path, newline='', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file, delimiter=',', fieldnames=fieldnames)
            for row in reader:
                if linenumbers:
                    md5sums[row['md5sum']] = (
                        row['license'], row['beginline'], row['endline'], row['md5'])
                else:
                    md5sums[row['md5sum']] = row['license']

    return md5sums
47
48
def crunch_known_licenses(d):
    '''
    Calculate the MD5 checksums for the crunched versions of all common
    licenses. Also add additional known checksums.

    Returns a dict mapping the checksum of a crunched license text (see
    crunch_license()) to a license name. The hard-coded entries below take
    precedence over the files in COMMON_LICENSE_DIR, which are added in
    sorted file name order and named after the file.
    '''

    crunched_md5sums = {
        # common licenses
        'ad4e9d34a2e966dfe9837f18de03266d': 'GFDL-1.1-only',
        'd014fb11a34eb67dc717fdcfc97e60ed': 'GFDL-1.2-only',
        'e020ca655b06c112def28e597ab844f1': 'GFDL-1.3-only',

        # The following two were gleaned from the "forever" npm package
        '0a97f8e4cbaf889d6fa51f84b89a79f6': 'ISC',
        # https://github.com/waffle-gl/waffle/blob/master/LICENSE.txt
        '50fab24ce589d69af8964fdbfe414c60': 'BSD-2-Clause',
        # https://github.com/spigwitmer/fakeds1963s/blob/master/LICENSE
        '88a4355858a1433fea99fae34a44da88': 'GPL-2.0-only',
        # http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
        '063b5c3ebb5f3aa4c85a2ed18a31fbe7': 'GPL-2.0-only',
        # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv2.1
        '7f5202f4d44ed15dcd4915f5210417d8': 'LGPL-2.1-only',
        # unixODBC-2.3.4 COPYING
        '3debde09238a8c8e1f6a847e1ec9055b': 'LGPL-2.1-only',
        # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
        'f90c613c51aa35da4d79dd55fc724ceb': 'LGPL-3.0-only',
        # https://raw.githubusercontent.com/eclipse/mosquitto/v1.4.14/epl-v10
        'efe2cb9a35826992b9df68224e3c2628': 'EPL-1.0',

        # https://raw.githubusercontent.com/jquery/esprima/3.1.3/LICENSE.BSD
        '80fa7b56a28e8c902e6af194003220a5': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/npm/npm-install-checks/master/LICENSE
        'e659f77bfd9002659e112d0d3d59b2c1': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/silverwind/default-gateway/4.2.0/LICENSE
        '4c641f2d995c47f5cb08bdb4b5b6ea05': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/tad-lispy/node-damerau-levenshtein/v1.0.5/LICENSE
        '2b8c039b2b9a25f0feb4410c4542d346': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/terser/terser/v3.17.0/LICENSE
        '8bd23871802951c9ad63855151204c2c': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/alexei/sprintf.js/1.0.3/LICENSE
        '008c22318c8ea65928bf730ddd0273e3': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/Caligatio/jsSHA/v3.2.0/LICENSE
        '0e46634a01bfef056892949acaea85b1': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/d3/d3-path/v1.0.9/LICENSE
        'b5f72aef53d3b2b432702c30b0215666': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/feross/ieee754/v1.1.13/LICENSE
        'a39327c997c20da0937955192d86232d': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/joyent/node-extsprintf/v1.3.0/LICENSE
        '721f23a96ff4161ca3a5f071bbe18108': 'MIT',
        # https://raw.githubusercontent.com/pvorb/clone/v0.2.0/LICENSE
        'b376d29a53c9573006b9970709231431': 'MIT',
        # https://raw.githubusercontent.com/andris9/encoding/v0.1.12/LICENSE
        '85d8a977ee9d7c5ab4ac03c9b95431c4': 'MIT-0',
        # https://raw.githubusercontent.com/faye/websocket-driver-node/0.7.3/LICENSE.md
        'b66384e7137e41a9b1904ef4d39703b6': 'Apache-2.0',
        # https://raw.githubusercontent.com/less/less.js/v4.1.1/LICENSE
        'b27575459e02221ccef97ec0bfd457ae': 'Apache-2.0',
        # https://raw.githubusercontent.com/microsoft/TypeScript/v3.5.3/LICENSE.txt
        'a54a1a6a39e7f9dbb4a23a42f5c7fd1c': 'Apache-2.0',
        # https://raw.githubusercontent.com/request/request/v2.87.0/LICENSE
        '1034431802e57486b393d00c5d262b8a': 'Apache-2.0',
        # https://raw.githubusercontent.com/dchest/tweetnacl-js/v0.14.5/LICENSE
        '75605e6bdd564791ab698fca65c94a4f': 'Unlicense',
        # https://raw.githubusercontent.com/stackgl/gl-mat3/v2.0.0/LICENSE.md
        '75512892d6f59dddb6d1c7e191957e9c': 'Zlib',
    }

    commonlicdir = d.getVar('COMMON_LICENSE_DIR')
    for fn in sorted(os.listdir(commonlicdir)):
        md5value, _lictext = crunch_license(os.path.join(commonlicdir, fn))
        if md5value is None:
            # crunch_license() could not hash this file. Storing it under
            # the key None would make every other unhashable file "match"
            # this license, so leave it out of the table.
            bb.debug(2, "Unable to calculate a crunched md5sum for '%s', skipping" % fn)
            continue
        if md5value not in crunched_md5sums:
            crunched_md5sums[md5value] = fn
            continue
        known = crunched_md5sums[md5value]
        if fn != known:
            bb.debug(2, "crunched_md5sums['%s'] is already set to '%s' rather than '%s'" % (md5value, known, fn))
        else:
            bb.debug(2, "crunched_md5sums['%s'] is already set to '%s'" % (md5value, known))

    return crunched_md5sums
127
128
# A line matching any of these patterns carries no material license text and
# is dropped before hashing.
# Note: these are carefully constructed!
_CRUNCH_SKIP_RES = tuple(re.compile(pattern) for pattern in (
    # copyright statement
    r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$',
    # "All rights reserved." disclaimer
    r'^ *\*? ?All [Rr]ights [Rr]eserved\.$',
    # line ending in an <email@address>
    r'^.*<[\w\.-]*@[\w\.\-]*>$',
    # comment delimiters and decorative rules (also matches blank lines)
    r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$',
    # bare license tag
    r'^ *@?\(?([Ll]icense|MIT)\)?$',
    # bare URL
    r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$',
    # license title
    r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$',
    # "released under the ... license" statement
    r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$',
))

# Comment symbols removed from every remaining line
_CRUNCH_STRIP_CHARS = str.maketrans('', '', '*#')

# Smart quotes, double quotes and backticks become single quotes; curly
# brackets become square brackets
_CRUNCH_UNIFY_CHARS = str.maketrans({
    '\u2018': "'",
    '\u2019': "'",
    '\u201c': "'",
    '\u201d': "'",
    '"': "'",
    '`': "'",
    '{': '[',
    '}': ']',
})


def crunch_license(licfile):
    '''
    Remove non-material text from a license file and then calculate its
    md5sum. This works well for licenses that contain a copyright statement,
    but is also a useful way to handle people's insistence upon reformatting
    the license text slightly (with no material difference to the text of the
    license).

    The file is decoded as UTF-8 regardless of the locale, so that the
    checksum of a given file is the same in every environment.

    Returns a tuple (md5val, lictext): the hex MD5 of the remaining lines
    joined with single spaces, and the list of those lines. A file that
    contains bytes which are not valid UTF-8 cannot be hashed; in that case
    (None, '') is returned.
    '''

    import oe.utils

    lictext = []
    with open(licfile, 'r', encoding='utf-8', errors='surrogateescape') as f:
        for line in f:
            # Drop opening statements
            if any(skip_re.match(line) for skip_re in _CRUNCH_SKIP_RES):
                continue
            # Strip comment symbols
            line = line.translate(_CRUNCH_STRIP_CHARS)
            # Unify spelling
            line = line.replace('sub-license', 'sublicense')
            # Squash spaces
            line = oe.utils.squashspaces(line.strip())
            # Unify quotes and brackets
            line = line.translate(_CRUNCH_UNIFY_CHARS)
            if line:
                lictext.append(line)

    try:
        # Undecodable bytes were read as lone surrogates, which the strict
        # UTF-8 encoder rejects
        md5val = hashlib.md5(' '.join(lictext).encode('utf-8')).hexdigest()
    except UnicodeEncodeError:
        md5val = None
        lictext = ''
    return md5val, lictext
192
193
def find_license_files(srctree):
    '''
    Walk srctree and return the paths of all files whose name looks like a
    license or copyright notice.

    A file qualifies when its name matches at least one of the shell-style
    patterns below and does not end in one of the skipped extensions. Each
    returned path is the walked directory joined with the file name, in the
    order os.walk() reports them. A srctree that does not exist yields an
    empty list.
    '''
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go")
    licfiles = []
    for root, _dirs, files in os.walk(srctree):
        for fn in files:
            if fn.endswith(skip_extensions):
                continue
            # any() stops at the first matching pattern, so a file matching
            # several patterns is still added once, without having to search
            # the result list for duplicates
            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
                licfiles.append(os.path.join(root, fn))

    return licfiles
209
210
def match_licenses(licfiles, srctree, d):
    '''
    Work out which license each file in licfiles contains.

    Every file is first looked up by the MD5 of its raw content in the table
    from get_license_md5sums(). On a miss the file is crunched with
    crunch_license() and looked up in the table from crunch_known_licenses().
    A file that still has crunched text but no match is reported as
    'Unknown', and a hint to extend 'license-hashes.csv' is logged. A file
    with no crunched text, or whose text could not be hashed, is left out.

    Files are processed in sorted order. Returns a list of tuples
    (license, path of the file relative to srctree, MD5 of the raw file).
    '''
    import bb.utils

    md5sums = get_license_md5sums(d)

    crunched_md5sums = crunch_known_licenses(d)

    licenses = []
    for licfile in sorted(licfiles):
        resolved_licfile = d.expand(licfile)
        md5value = bb.utils.md5_file(resolved_licfile)
        licname = md5sums.get(md5value)
        if not licname:
            crunched_md5, lictext = crunch_license(resolved_licfile)
            # A checksum of None means the file could not be hashed; never
            # use that as a lookup key
            licname = crunched_md5sums.get(crunched_md5) if crunched_md5 else None
            if lictext and not licname:
                licname = 'Unknown'
                logger.info("Please add the following line for '%s' to a 'license-hashes.csv' "
                            "and replace `Unknown` with the license:\n"
                            "%s,Unknown",
                            os.path.relpath(licfile, os.path.join(srctree, os.pardir)),
                            md5value)
        if licname:
            licenses.append((licname, os.path.relpath(licfile, srctree), md5value))

    return licenses
234
235
def find_licenses(srctree, d):
    '''
    Locate the license files below srctree and identify each of them.

    Returns the result of match_licenses() for the files reported by
    find_license_files().
    '''
    # FIXME should we grab at least one source file with a license header and add that too?
    return match_licenses(find_license_files(srctree), srctree, d)