diff options
Diffstat (limited to 'meta/lib/oe/license_finder.py')
-rw-r--r-- | meta/lib/oe/license_finder.py | 179 |
1 files changed, 179 insertions, 0 deletions
diff --git a/meta/lib/oe/license_finder.py b/meta/lib/oe/license_finder.py new file mode 100644 index 0000000000..16f5d7c94c --- /dev/null +++ b/meta/lib/oe/license_finder.py | |||
@@ -0,0 +1,179 @@ | |||
1 | # | ||
2 | # Copyright OpenEmbedded Contributors | ||
3 | # | ||
4 | # SPDX-License-Identifier: GPL-2.0-only | ||
5 | # | ||
6 | |||
7 | import fnmatch | ||
8 | import hashlib | ||
9 | import logging | ||
10 | import os | ||
11 | import re | ||
12 | |||
13 | import bb | ||
14 | import bb.utils | ||
15 | |||
# Module-level logger; match_licenses() uses it to report license files whose
# checksum is not in any known table.
logger = logging.getLogger("BitBake.OE.LicenseFinder")
17 | |||
def _load_hash_csv(d):
    """
    Load a mapping of (checksum: license name) from all files/license-hashes.csv
    files that can be found in the available layers.

    d is the datastore; only BBPATH is read from it. Every colon-separated
    BBPATH entry is probed for files/license-hashes.csv and each row of a file
    that exists is read as "md5sum,license". When the same checksum appears
    more than once, the row read last wins.

    Returns a dict mapping md5sum strings to license names. The dict is empty
    when BBPATH is unset or none of the entries contains the CSV file.
    """
    import csv
    md5sums = {}

    # Guard against an unset BBPATH the same way the LICENSE_PATH lookup in
    # this module does, instead of failing on .split() of a missing value.
    bbpath = d.getVar('BBPATH') or ""

    # Read license md5sums from csv file
    for path in bbpath.split(':'):
        csv_path = os.path.join(path, 'files', 'license-hashes.csv')
        if not os.path.isfile(csv_path):
            continue
        # Explicit encoding so parsing does not depend on the host locale
        with open(csv_path, newline='', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file, delimiter=',', fieldnames=['md5sum', 'license'])
            for row in reader:
                md5sums[row['md5sum']] = row['license']

    return md5sums
36 | |||
37 | |||
def _crunch_known_licenses(d):
    """
    Calculate the MD5 checksums for the original and "crunched" versions of all
    known licenses.

    The license directories are COMMON_LICENSE_DIR followed by every
    whitespace-separated entry of LICENSE_PATH, both read from the datastore d.
    Each regular file in those directories contributes up to two entries, both
    mapping a checksum to the file's name: one for the exact file contents and
    one for the text as normalised by _crunch_license().

    Returns a dict mapping md5sum strings to license file names.
    """
    md5sums = {}

    lic_dirs = [d.getVar('COMMON_LICENSE_DIR')] + (d.getVar('LICENSE_PATH') or "").split()
    for lic_dir in lic_dirs:
        if not lic_dir:
            # An unset COMMON_LICENSE_DIR must not reach os.listdir(), which
            # lists the current working directory when handed None.
            continue
        # Sorted so that when two files share a checksum the same name wins on
        # every run rather than depending on directory enumeration order.
        for fn in sorted(os.listdir(lic_dir)):
            path = os.path.join(lic_dir, fn)
            if not os.path.isfile(path):
                # Only regular files can be license texts; skip subdirectories
                # and the like instead of handing them to the hashers.
                continue
            # Hash the exact contents
            md5sums[bb.utils.md5_file(path)] = fn
            # Also hash a "crunched" version
            crunched = _crunch_license(path)
            if crunched is not None:
                # _crunch_license() returns None when the text cannot be
                # hashed; storing a None key would make every other
                # unhashable file look like this license.
                md5sums[crunched] = fn

    return md5sums
57 | |||
58 | |||
def _crunch_license(licfile):
    '''
    Compute the md5sum of a license file after discarding non-material text.

    Lines that consist solely of a copyright notice, an "All rights reserved"
    disclaimer, a trailing <email> address, a comment/rule header, a license
    tag, a bare URL, a license title or a "licensed under the ..." statement
    are dropped. The remaining lines have '*' and '#' removed, "sub-license"
    unified to "sublicense", whitespace squashed, all quote styles mapped to a
    single quote and curly brackets mapped to square ones. The surviving
    non-empty lines are joined with single spaces and hashed.

    This lets a license be recognised even when a copyright statement was
    added or the text was reformatted without any material change.

    Returns the hex digest, or None if the joined text cannot be encoded as
    UTF-8.
    '''

    import oe.utils

    # Note: these are carefully constructed!
    license_title_re = re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$')
    license_statement_re = re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$')
    copyright_re = re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$')
    disclaimer_re = re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$')
    email_re = re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$')
    header_re = re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$')
    tag_re = re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$')
    url_re = re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$')

    # A line matching any of these is dropped entirely
    droppable = (
        copyright_re,
        disclaimer_re,
        email_re,
        header_re,
        tag_re,
        url_re,
        license_title_re,
        license_statement_re,
    )

    # Comment symbols are deleted outright
    comment_symbols = str.maketrans('', '', '*#')
    # Smart quotes, double quotes and backticks become single quotes; curly
    # brackets become square brackets
    unify_chars = str.maketrans({
        u"\u2018": "'",
        u"\u2019": "'",
        u"\u201c": "'",
        u"\u201d": "'",
        '"': "'",
        '`': "'",
        "{": "[",
        "}": "]",
    })

    kept = []
    with open(licfile, 'r', errors='surrogateescape') as f:
        for raw in f:
            if any(pattern.match(raw) for pattern in droppable):
                continue
            text = raw.translate(comment_symbols)
            # Unify spelling
            text = text.replace('sub-license', 'sublicense')
            # Squash spaces
            text = oe.utils.squashspaces(text.strip())
            text = text.translate(unify_chars)
            if text:
                kept.append(text)

    digest = hashlib.md5()
    try:
        digest.update(' '.join(kept).encode('utf-8'))
    except UnicodeEncodeError:
        # Undecodable bytes read via surrogateescape cannot be re-encoded
        return None
    return digest.hexdigest()
121 | |||
122 | |||
def find_license_files(srctree, first_only=False):
    """
    Search srctree for files that look like they could be licenses.

    Every directory below srctree is walked and each file name is compared
    against a fixed list of shell-style patterns (LICENSE*, COPYING*, ...).
    Files ending in one of the skipped extensions are never reported.

    If first_only is True, only return the first file found.

    Returns a list of paths, each built by joining the walked directory with
    the file name. Directories and files are visited in sorted order.
    """
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go", ".sh")
    licfiles = []
    for root, dirs, files in os.walk(srctree):
        # os.walk() descends into dirs in list order, so sorting in place pins
        # the traversal order; otherwise the first_only result could depend on
        # how the filesystem happens to enumerate subdirectories.
        dirs.sort()
        # Sort files so that LICENSE is before LICENSE.subcomponent, which is
        # meaningful if first_only is set.
        for fn in sorted(files):
            if fn.endswith(skip_extensions):
                continue
            # One test per file: a name matching several patterns is still
            # only added once, without rescanning the result list.
            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
                licfiles.append(os.path.join(root, fn))
                if first_only:
                    return licfiles

    return licfiles
146 | |||
147 | |||
def match_licenses(licfiles, srctree, d, extra_hashes=None):
    """
    Identify the license of each file in licfiles by checksum.

    The lookup table is built from the license-hashes.csv files, then the
    known license texts, then extra_hashes (an optional dict of md5sum to
    license name); later sources override earlier ones. Each file is passed
    through d.expand() and its exact md5sum is looked up first; if that is
    not found, the checksum of the "crunched" text is tried. Files that match
    neither are reported as 'Unknown' and a hint is logged showing the line
    to add to a license-hashes.csv.

    Returns a list of (license, path relative to srctree, md5sum) tuples in
    sorted order of licfiles. The md5sum is always that of the exact contents.
    """
    md5sums = {}
    md5sums.update(_load_hash_csv(d))
    md5sums.update(_crunch_known_licenses(d))
    md5sums.update(extra_hashes or {})

    licenses = []
    for licfile in sorted(licfiles):
        resolved_licfile = d.expand(licfile)
        md5value = bb.utils.md5_file(resolved_licfile)
        lic_name = md5sums.get(md5value)
        if not lic_name:
            crunched_md5 = _crunch_license(resolved_licfile)
            # _crunch_license() returns None when the text cannot be hashed;
            # None must never be used as a lookup key, or a stray None entry
            # in the table would be reported as a match.
            if crunched_md5 is not None:
                lic_name = md5sums.get(crunched_md5)
            if not lic_name:
                lic_name = 'Unknown'
                # Path is shown relative to the parent of srctree so that the
                # source directory name is part of the hint.
                logger.info("Please add the following line for '%s' to a 'license-hashes.csv' "
                            "and replace `Unknown` with the license:\n"
                            "%s,Unknown",
                            os.path.relpath(licfile, os.path.join(srctree, "..")), md5value)

        licenses.append((lic_name, os.path.relpath(licfile, srctree), md5value))

    return licenses
171 | |||
172 | |||
def find_licenses(srctree, d, first_only=False, extra_hashes=None):
    """
    Find the license files below srctree and identify their licenses.

    Candidate files are collected with find_license_files() (honouring
    first_only) and then identified with match_licenses(), to which d and
    extra_hashes (an optional dict of md5sum to license name) are passed.

    Returns whatever match_licenses() returns for the files found.
    """
    # Normalise the "not given" case here so a dict is always handed on
    if extra_hashes is None:
        extra_hashes = {}

    licfiles = find_license_files(srctree, first_only)
    licenses = match_licenses(licfiles, srctree, d, extra_hashes)

    # FIXME should we grab at least one source file with a license header and add that too?

    return licenses