diff options
| author | Ross Burton <ross.burton@arm.com> | 2025-06-13 14:16:11 +0100 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2025-06-16 17:57:30 +0100 |
| commit | 36adc8135d3eb8170f8944b6942704fa71589665 (patch) | |
| tree | b4f08bb435095411405c21350615e3a6c301821d /meta/lib | |
| parent | 19953c90ce24653d2fbde3a79c2d5ca991f72564 (diff) | |
| download | poky-36adc8135d3eb8170f8944b6942704fa71589665.tar.gz | |
lib/oe/license_finder: extract license finding code from recipetool
This code is 99% identical to the original code in recipetool/create.py,
but with two minor changes:
- The implicit recipetool logger is changed to an explicit logger
- The CSV of license hashes is moved to meta/files/
(From OE-Core rev: b132652c6e520121c6b0e7e873b0d33ede0309b5)
Signed-off-by: Ross Burton <ross.burton@arm.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'meta/lib')
| -rw-r--r-- | meta/lib/oe/license_finder.py | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py new file mode 100644 index 0000000000..5b09059576 --- /dev/null +++ b/meta/lib/oe/license_finder.py | |||
| @@ -0,0 +1,242 @@ | |||
| 1 | # | ||
| 2 | # Copyright OpenEmbedded Contributors | ||
| 3 | # | ||
| 4 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 5 | # | ||
| 6 | |||
| 7 | import fnmatch | ||
| 8 | import hashlib | ||
| 9 | import logging | ||
| 10 | import os | ||
| 11 | import re | ||
| 12 | |||
| 13 | import bb | ||
| 14 | |||
| 15 | logger = logging.getLogger("BitBake.OE.LicenseFinder") | ||
| 16 | |||
def get_license_md5sums(d, static_only=False, linenumbers=False):
    '''
    Build a lookup table from license file md5sum to license.

    Unless static_only or linenumbers is set, every file found in
    COMMON_LICENSE_DIR is checksummed and mapped to its own file name.
    After that, each directory listed in BBPATH is checked for a
    files/license-hashes.csv file and its rows are merged in (a later row
    replaces an earlier entry with the same md5sum).

    The values are license strings, or, when linenumbers is true,
    (license, beginline, endline, md5) tuples taken from the CSV columns.
    '''
    import bb.utils
    import csv

    columns = ['md5sum', 'license', 'beginline', 'endline', 'md5']
    known = {}

    if not (static_only or linenumbers):
        # Checksum the license texts shipped in the common license dir
        licdir = d.getVar('COMMON_LICENSE_DIR')
        for name in os.listdir(licdir):
            known[bb.utils.md5_file(os.path.join(licdir, name))] = name

    # The CSV entries were extracted from common values in various recipes
    # (double checking the license against the license file itself, not just
    # the LICENSE value in the recipe)
    for search_dir in d.getVar('BBPATH').split(':'):
        hashes_csv = os.path.join(search_dir, 'files', 'license-hashes.csv')
        if not os.path.isfile(hashes_csv):
            continue
        with open(hashes_csv, newline='') as handle:
            for record in csv.DictReader(handle, delimiter=',', fieldnames=columns):
                if linenumbers:
                    entry = (record['license'], record['beginline'],
                             record['endline'], record['md5'])
                else:
                    entry = record['license']
                known[record['md5sum']] = entry

    return known
| 47 | |||
| 48 | |||
def crunch_known_licenses(d):
    '''
    Calculate the MD5 checksums for the crunched versions of all common
    licenses. Also add additional known checksums.

    Returns a dictionary mapping the crunched md5sum to a license name. The
    hard-coded entries below are inserted first, so a file in
    COMMON_LICENSE_DIR never replaces one of them; for those files the name
    recorded is the file name.

    Files for which crunch_license() cannot produce a checksum (it returns
    None) are skipped rather than stored under a None key.
    '''

    crunched_md5sums = {
        # common licenses
        'ad4e9d34a2e966dfe9837f18de03266d': 'GFDL-1.1-only',
        'd014fb11a34eb67dc717fdcfc97e60ed': 'GFDL-1.2-only',
        'e020ca655b06c112def28e597ab844f1': 'GFDL-1.3-only',

        # The following two were gleaned from the "forever" npm package
        '0a97f8e4cbaf889d6fa51f84b89a79f6': 'ISC',
        # https://github.com/waffle-gl/waffle/blob/master/LICENSE.txt
        '50fab24ce589d69af8964fdbfe414c60': 'BSD-2-Clause',
        # https://github.com/spigwitmer/fakeds1963s/blob/master/LICENSE
        '88a4355858a1433fea99fae34a44da88': 'GPL-2.0-only',
        # http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
        '063b5c3ebb5f3aa4c85a2ed18a31fbe7': 'GPL-2.0-only',
        # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv2.1
        '7f5202f4d44ed15dcd4915f5210417d8': 'LGPL-2.1-only',
        # unixODBC-2.3.4 COPYING
        '3debde09238a8c8e1f6a847e1ec9055b': 'LGPL-2.1-only',
        # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
        'f90c613c51aa35da4d79dd55fc724ceb': 'LGPL-3.0-only',
        # https://raw.githubusercontent.com/eclipse/mosquitto/v1.4.14/epl-v10
        'efe2cb9a35826992b9df68224e3c2628': 'EPL-1.0',

        # https://raw.githubusercontent.com/jquery/esprima/3.1.3/LICENSE.BSD
        '80fa7b56a28e8c902e6af194003220a5': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/npm/npm-install-checks/master/LICENSE
        'e659f77bfd9002659e112d0d3d59b2c1': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/silverwind/default-gateway/4.2.0/LICENSE
        '4c641f2d995c47f5cb08bdb4b5b6ea05': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/tad-lispy/node-damerau-levenshtein/v1.0.5/LICENSE
        '2b8c039b2b9a25f0feb4410c4542d346': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/terser/terser/v3.17.0/LICENSE
        '8bd23871802951c9ad63855151204c2c': 'BSD-2-Clause',
        # https://raw.githubusercontent.com/alexei/sprintf.js/1.0.3/LICENSE
        '008c22318c8ea65928bf730ddd0273e3': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/Caligatio/jsSHA/v3.2.0/LICENSE
        '0e46634a01bfef056892949acaea85b1': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/d3/d3-path/v1.0.9/LICENSE
        'b5f72aef53d3b2b432702c30b0215666': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/feross/ieee754/v1.1.13/LICENSE
        'a39327c997c20da0937955192d86232d': 'BSD-3-Clause',
        # https://raw.githubusercontent.com/joyent/node-extsprintf/v1.3.0/LICENSE
        '721f23a96ff4161ca3a5f071bbe18108': 'MIT',
        # https://raw.githubusercontent.com/pvorb/clone/v0.2.0/LICENSE
        'b376d29a53c9573006b9970709231431': 'MIT',
        # https://raw.githubusercontent.com/andris9/encoding/v0.1.12/LICENSE
        '85d8a977ee9d7c5ab4ac03c9b95431c4': 'MIT-0',
        # https://raw.githubusercontent.com/faye/websocket-driver-node/0.7.3/LICENSE.md
        'b66384e7137e41a9b1904ef4d39703b6': 'Apache-2.0',
        # https://raw.githubusercontent.com/less/less.js/v4.1.1/LICENSE
        'b27575459e02221ccef97ec0bfd457ae': 'Apache-2.0',
        # https://raw.githubusercontent.com/microsoft/TypeScript/v3.5.3/LICENSE.txt
        'a54a1a6a39e7f9dbb4a23a42f5c7fd1c': 'Apache-2.0',
        # https://raw.githubusercontent.com/request/request/v2.87.0/LICENSE
        '1034431802e57486b393d00c5d262b8a': 'Apache-2.0',
        # https://raw.githubusercontent.com/dchest/tweetnacl-js/v0.14.5/LICENSE
        '75605e6bdd564791ab698fca65c94a4f': 'Unlicense',
        # https://raw.githubusercontent.com/stackgl/gl-mat3/v2.0.0/LICENSE.md
        '75512892d6f59dddb6d1c7e191957e9c': 'Zlib',
    }

    commonlicdir = d.getVar('COMMON_LICENSE_DIR')
    # Sorted so that, when two files crunch to the same checksum, the name
    # that wins does not depend on directory listing order.
    for fn in sorted(os.listdir(commonlicdir)):
        md5value, _ = crunch_license(os.path.join(commonlicdir, fn))
        if md5value is None:
            # crunch_license() could not encode the text. Recording the None
            # "checksum" would let any other file that fails the same way be
            # looked up under that key and reported as this license.
            bb.debug(2, "Unable to calculate a crunched md5sum for '%s', skipping" % fn)
            continue
        if md5value not in crunched_md5sums:
            crunched_md5sums[md5value] = fn
        elif fn != crunched_md5sums[md5value]:
            bb.debug(2, "crunched_md5sums['%s'] is already set to '%s' rather than '%s'" % (md5value, crunched_md5sums[md5value], fn))
        else:
            bb.debug(2, "crunched_md5sums['%s'] is already set to '%s'" % (md5value, crunched_md5sums[md5value]))

    return crunched_md5sums
| 127 | |||
| 128 | |||
def crunch_license(licfile):
    '''
    Strip the non-material parts of a license file and checksum what is
    left.

    Lines that are only a copyright notice, "All rights reserved", an email
    address, a comment/rule header, a license tag, a URL, a license title or
    a "licensed under" statement are dropped. The remaining lines have
    comment symbols removed, whitespace squashed, and quotes and brackets
    unified, so that trivially reformatted copies of the same license
    produce the same result.

    Returns (md5val, lictext): the hex md5sum of the surviving lines joined
    with single spaces, and the list of those lines. If the text cannot be
    encoded as UTF-8 the result is (None, '').
    '''

    import oe.utils

    # Note: these are carefully constructed!
    # A line matching any one of them is discarded as an opening statement.
    boilerplate_res = (
        # copyright notice
        re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$'),
        # rights disclaimer
        re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$'),
        # email address
        re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$'),
        # comment / rule header (also matches blank lines)
        re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$'),
        # license tag
        re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$'),
        # URL
        re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$'),
        # license title
        re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$'),
        # license statement
        re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$'),
    )

    # Comment symbols are deleted outright
    drop_comment_symbols = str.maketrans('', '', '*#')
    # Smart quotes, double quotes and backticks become single quotes;
    # curly brackets become square brackets
    unify_punctuation = str.maketrans({
        u"\u2018": "'",
        u"\u2019": "'",
        u"\u201c": "'",
        u"\u201d": "'",
        '"': "'",
        '`': "'",
        '{': '[',
        '}': ']',
    })

    lictext = []
    with open(licfile, 'r', errors='surrogateescape') as f:
        for raw in f:
            if any(pattern.match(raw) for pattern in boilerplate_res):
                continue
            text = raw.translate(drop_comment_symbols)
            # Unify spelling
            text = text.replace('sub-license', 'sublicense')
            text = oe.utils.squashspaces(text.strip())
            text = text.translate(unify_punctuation)
            if text:
                lictext.append(text)

    try:
        encoded = ' '.join(lictext).encode('utf-8')
    except UnicodeEncodeError:
        return None, ''
    return hashlib.md5(encoded).hexdigest(), lictext
| 192 | |||
| 193 | |||
def find_license_files(srctree):
    '''
    Walk srctree and collect the files whose names look like license files.

    A file is selected when its name matches at least one of the glob
    patterns in licspecs and does not end with one of skip_extensions.

    Returns a list of paths (each built by joining the directory reported by
    os.walk() with the file name), in the order os.walk() produced them.
    Each file appears once, however many patterns its name matches.
    '''
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go")
    licfiles = []
    for root, _dirs, files in os.walk(srctree):
        for fn in files:
            if fn.endswith(skip_extensions):
                continue
            # any() stops at the first matching pattern, so a name that
            # satisfies several patterns is still appended only once and no
            # scan of the result list is needed to avoid duplicates.
            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
                licfiles.append(os.path.join(root, fn))

    return licfiles
| 209 | |||
| 210 | |||
def match_licenses(licfiles, srctree, d):
    '''
    Work out which license each file in licfiles contains.

    Each file (after expansion through d.expand()) is first looked up by the
    md5sum of its exact contents; if that fails, by the md5sum of its
    crunched text. A file that yields crunched text but matches nothing is
    reported as 'Unknown', and a hint for extending license-hashes.csv is
    logged. Files with no result at all are left out.

    Returns a list of (license, path relative to srctree, md5sum) tuples,
    in sorted order of the input paths.
    '''
    import bb
    exact_sums = get_license_md5sums(d)
    crunched_sums = crunch_known_licenses(d)

    matches = []
    for licfile in sorted(licfiles):
        expanded = d.expand(licfile)
        md5value = bb.utils.md5_file(expanded)

        found = exact_sums.get(md5value)
        if not found:
            # Fall back to comparing the normalised text
            crunched_md5, lictext = crunch_license(expanded)
            found = crunched_sums.get(crunched_md5)
            if lictext and not found:
                found = 'Unknown'
                logger.info("Please add the following line for '%s' to a 'license-hashes.csv' " \
                    "and replace `Unknown` with the license:\n" \
                    "%s,Unknown" % (os.path.relpath(licfile, srctree + "/.."), md5value))

        if not found:
            continue
        matches.append((found, os.path.relpath(licfile, srctree), md5value))

    return matches
| 234 | |||
| 235 | |||
def find_licenses(srctree, d):
    '''
    Find the license files under srctree and identify their licenses.

    Returns whatever match_licenses() reports for the files located by
    find_license_files().
    '''
    # FIXME should we grab at least one source file with a license header and add that too?
    return match_licenses(find_license_files(srctree), srctree, d)
