Diffstat (limited to 'scripts/lib/resulttool/regression.py')
-rw-r--r--  scripts/lib/resulttool/regression.py  281
1 file changed, 271 insertions, 10 deletions
diff --git a/scripts/lib/resulttool/regression.py b/scripts/lib/resulttool/regression.py
index 9f952951b3..10e7d13841 100644
--- a/scripts/lib/resulttool/regression.py
+++ b/scripts/lib/resulttool/regression.py
@@ -7,15 +7,209 @@
 #
 
 import resulttool.resultutils as resultutils
-import json
 
 from oeqa.utils.git import GitRepo
 import oeqa.utils.gitarchive as gitarchive
 
-def compare_result(logger, base_name, target_name, base_result, target_result):
+METADATA_MATCH_TABLE = {
+    "oeselftest": "OESELFTEST_METADATA"
+}
+
+OESELFTEST_METADATA_GUESS_TABLE = {
+    "trigger-build-posttrigger": {
+        "run_all_tests": False,
+        "run_tests": ["buildoptions.SourceMirroring.test_yocto_source_mirror"],
+        "skips": None,
+        "machine": None,
+        "select_tags": None,
+        "exclude_tags": None
+    },
+    "reproducible": {
+        "run_all_tests": False,
+        "run_tests": ["reproducible"],
+        "skips": None,
+        "machine": None,
+        "select_tags": None,
+        "exclude_tags": None
+    },
+    "arch-qemu-quick": {
+        "run_all_tests": True,
+        "run_tests": None,
+        "skips": None,
+        "machine": None,
+        "select_tags": ["machine"],
+        "exclude_tags": None
+    },
+    "arch-qemu-full-x86-or-x86_64": {
+        "run_all_tests": True,
+        "run_tests": None,
+        "skips": None,
+        "machine": None,
+        "select_tags": ["machine", "toolchain-system"],
+        "exclude_tags": None
+    },
+    "arch-qemu-full-others": {
+        "run_all_tests": True,
+        "run_tests": None,
+        "skips": None,
+        "machine": None,
+        "select_tags": ["machine", "toolchain-user"],
+        "exclude_tags": None
+    },
+    "selftest": {
+        "run_all_tests": True,
+        "run_tests": None,
+        "skips": ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror", "reproducible"],
+        "machine": None,
+        "select_tags": None,
+        "exclude_tags": ["machine", "toolchain-system", "toolchain-user"]
+    },
+    "bringup": {
+        "run_all_tests": True,
+        "run_tests": None,
+        "skips": ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror"],
+        "machine": None,
+        "select_tags": None,
+        "exclude_tags": ["machine", "toolchain-system", "toolchain-user"]
+    }
+}
+
+STATUS_STRINGS = {
+    "None": "No matching test result"
+}
+
+REGRESSIONS_DISPLAY_LIMIT = 50
+
+MISSING_TESTS_BANNER = "-------------------------- Missing tests --------------------------"
+ADDITIONAL_DATA_BANNER = "--------------------- Matches and improvements --------------------"
+
+def test_has_at_least_one_matching_tag(test, tag_list):
+    return "oetags" in test and any(oetag in tag_list for oetag in test["oetags"])
+
+def all_tests_have_at_least_one_matching_tag(results, tag_list):
+    return all(test_has_at_least_one_matching_tag(test_result, tag_list) or test_name.startswith("ptestresult") for (test_name, test_result) in results.items())
+
+def any_test_have_any_matching_tag(results, tag_list):
+    return any(test_has_at_least_one_matching_tag(test, tag_list) for test in results.values())
+
+def have_skipped_test(result, test_prefix):
+    return all(result[test]['status'] == "SKIPPED" for test in result if test.startswith(test_prefix))
+
+def have_all_tests_skipped(result, test_prefixes_list):
+    return all(have_skipped_test(result, test_prefix) for test_prefix in test_prefixes_list)
+
+def guess_oeselftest_metadata(results):
+    """
+    When an oeselftest result lacks OESELFTEST_METADATA, we can try to guess it from the results content.
+    Check the results for specific values (absence/presence of oetags, number and names of executed tests...),
+    and if they match one of the known autobuilder configurations, apply the guessed OESELFTEST_METADATA
+    to allow proper test filtering.
+    This guessing process is tightly coupled to config.json in the autobuilder. It should trigger less and less,
+    as new tests will have OESELFTEST_METADATA properly appended at test reporting time.
+    """
+
+    if len(results) == 1 and "buildoptions.SourceMirroring.test_yocto_source_mirror" in results:
+        return OESELFTEST_METADATA_GUESS_TABLE['trigger-build-posttrigger']
+    elif all(result.startswith("reproducible") for result in results):
+        return OESELFTEST_METADATA_GUESS_TABLE['reproducible']
+    elif all_tests_have_at_least_one_matching_tag(results, ["machine"]):
+        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-quick']
+    elif all_tests_have_at_least_one_matching_tag(results, ["machine", "toolchain-system"]):
+        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-full-x86-or-x86_64']
+    elif all_tests_have_at_least_one_matching_tag(results, ["machine", "toolchain-user"]):
+        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-full-others']
+    elif not any_test_have_any_matching_tag(results, ["machine", "toolchain-user", "toolchain-system"]):
+        if have_all_tests_skipped(results, ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror", "reproducible"]):
+            return OESELFTEST_METADATA_GUESS_TABLE['selftest']
+        elif have_all_tests_skipped(results, ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror"]):
+            return OESELFTEST_METADATA_GUESS_TABLE['bringup']
+
+    return None
+
+
+def metadata_matches(base_configuration, target_configuration):
+    """
+    For the passed base and target, check the test type. If the test type matches
+    one of the properties described in METADATA_MATCH_TABLE, compare the
+    corresponding metadata if it is present in base. Return True if the metadata
+    matches, or if base lacks some data (either TEST_TYPE or the corresponding metadata).
+    """
+    test_type = base_configuration.get('TEST_TYPE')
+    if test_type not in METADATA_MATCH_TABLE:
+        return True
+
+    metadata_key = METADATA_MATCH_TABLE.get(test_type)
+    if target_configuration.get(metadata_key) != base_configuration.get(metadata_key):
+        return False
+
+    return True
+
+
+def machine_matches(base_configuration, target_configuration):
+    return base_configuration.get('MACHINE') == target_configuration.get('MACHINE')
+
+
+def can_be_compared(logger, base, target):
+    """
+    Some test runs are not relevant to compare, for example oeselftest runs
+    with different test sets or parameters. Return True if the tests can be
+    compared.
+    """
+    ret = True
+    base_configuration = base['configuration']
+    target_configuration = target['configuration']
+
+    # Older test results lack proper OESELFTEST_METADATA: if not present, try to guess it based on the test results.
+    if base_configuration.get('TEST_TYPE') == 'oeselftest' and 'OESELFTEST_METADATA' not in base_configuration:
+        guess = guess_oeselftest_metadata(base['result'])
+        if guess is None:
+            logger.error(f"ERROR: did not manage to guess oeselftest metadata for {base_configuration['STARTTIME']}")
+        else:
+            logger.debug(f"Enriching {base_configuration['STARTTIME']} with {guess}")
+            base_configuration['OESELFTEST_METADATA'] = guess
+    if target_configuration.get('TEST_TYPE') == 'oeselftest' and 'OESELFTEST_METADATA' not in target_configuration:
+        guess = guess_oeselftest_metadata(target['result'])
+        if guess is None:
+            logger.error(f"ERROR: did not manage to guess oeselftest metadata for {target_configuration['STARTTIME']}")
+        else:
+            logger.debug(f"Enriching {target_configuration['STARTTIME']} with {guess}")
+            target_configuration['OESELFTEST_METADATA'] = guess
+
+    # Test runs with LTP results in them should only be compared with other runs that also contain LTP tests
+    if base_configuration.get('TEST_TYPE') == 'runtime' and any(result.startswith("ltpresult") for result in base['result']):
+        ret = target_configuration.get('TEST_TYPE') == 'runtime' and any(result.startswith("ltpresult") for result in target['result'])
+
+    return ret and metadata_matches(base_configuration, target_configuration) \
+        and machine_matches(base_configuration, target_configuration)
+
+def get_status_str(raw_status):
+    raw_status_lower = raw_status.lower() if raw_status else "None"
+    return STATUS_STRINGS.get(raw_status_lower, raw_status)
+
+def get_additional_info_line(new_pass_count, new_tests):
+    result = []
+    if new_tests:
+        result.append(f'+{new_tests} test(s) present')
+    if new_pass_count:
+        result.append(f'+{new_pass_count} test(s) now passing')
+
+    if not result:
+        return ""
+
+    return ' -> ' + ', '.join(result) + '\n'
+
+def compare_result(logger, base_name, target_name, base_result, target_result, display_limit=None):
     base_result = base_result.get('result')
     target_result = target_result.get('result')
     result = {}
+    new_tests = 0
+    regressions = {}
+    resultstring = ""
+    new_tests = 0
+    new_pass_count = 0
+
+    display_limit = int(display_limit) if display_limit else REGRESSIONS_DISPLAY_LIMIT
+
     if base_result and target_result:
         for k in base_result:
             base_testcase = base_result[k]
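The pairing helpers introduced above operate purely on the configuration dictionaries, so they are easy to exercise in isolation. A minimal sketch, assuming poky's scripts/lib and meta/lib directories are on PYTHONPATH; the configuration values below are hypothetical:

    from resulttool.regression import metadata_matches, machine_matches

    # Same TEST_TYPE and MACHINE, but diverging OESELFTEST_METADATA:
    # metadata_matches() should reject the pair, machine_matches() accept it.
    base = {
        "TEST_TYPE": "oeselftest",
        "MACHINE": "qemux86-64",
        "OESELFTEST_METADATA": {"run_all_tests": True, "select_tags": ["machine"]},
    }
    target = {
        "TEST_TYPE": "oeselftest",
        "MACHINE": "qemux86-64",
        "OESELFTEST_METADATA": {"run_all_tests": False, "run_tests": ["reproducible"]},
    }

    print(metadata_matches(base, target))   # False: OESELFTEST_METADATA differs
    print(machine_matches(base, target))    # True: same MACHINE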
@@ -27,12 +221,47 @@ def compare_result(logger, base_name, target_name, base_result, target_result):
                     result[k] = {'base': base_status, 'target': target_status}
             else:
                 logger.error('Failed to retrieved base test case status: %s' % k)
+
+        # Also count new tests that were not present in the base results: these
+        # could be newly added tests, but they could also highlight test renames
+        # or fixed faulty ptests
+        for k in target_result:
+            if k not in base_result:
+                new_tests += 1
     if result:
-        resultstring = "Regression: %s\n %s\n" % (base_name, target_name)
-        for k in sorted(result):
-            resultstring += ' %s: %s -> %s\n' % (k, result[k]['base'], result[k]['target'])
+        new_pass_count = sum(test['target'] is not None and test['target'].startswith("PASS") for test in result.values())
+        # Print a regression report only if at least one test has a regression status (FAIL, SKIPPED, absent...)
+        if new_pass_count < len(result):
+            resultstring = "Regression: %s\n %s\n" % (base_name, target_name)
+            for k in sorted(result):
+                if not result[k]['target'] or not result[k]['target'].startswith("PASS"):
+                    # Differentiate each ptest kind when listing regressions
+                    key_parts = k.split('.')
+                    key = '.'.join(key_parts[:2]) if k.startswith('ptest') else key_parts[0]
+                    # Append the new regression to the corresponding test family
+                    regressions[key] = regressions.setdefault(key, []) + [' %s: %s -> %s\n' % (k, get_status_str(result[k]['base']), get_status_str(result[k]['target']))]
+            resultstring += f" Total: {sum([len(regressions[r]) for r in regressions])} new regression(s):\n"
+            for k in regressions:
+                resultstring += f" {len(regressions[k])} regression(s) for {k}\n"
+                count_to_print = min([display_limit, len(regressions[k])]) if display_limit > 0 else len(regressions[k])
+                resultstring += ''.join(regressions[k][:count_to_print])
+                if count_to_print < len(regressions[k]):
+                    resultstring += ' [...]\n'
+            if new_pass_count > 0:
+                resultstring += f' Additionally, {new_pass_count} previously failing test(s) is/are now passing\n'
+            if new_tests > 0:
+                resultstring += f' Additionally, {new_tests} new test(s) is/are present\n'
+        else:
+            resultstring = "%s\n%s\n" % (base_name, target_name)
+            result = None
     else:
-        resultstring = "Match: %s\n %s" % (base_name, target_name)
+        resultstring = "%s\n%s\n" % (base_name, target_name)
+
+    if not result:
+        additional_info = get_additional_info_line(new_pass_count, new_tests)
+        if additional_info:
+            resultstring += additional_info
+
     return result, resultstring
 
 def get_results(logger, source):
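The grouping key in this hunk is derived purely from the test name: ptest results keep their suite component so each suite is reported separately, while everything else collapses to its first dotted component. A standalone restatement of that one line, with illustrative test names:

    def family_key(test_name):
        # Mirrors the key derivation in compare_result() above
        key_parts = test_name.split('.')
        return '.'.join(key_parts[:2]) if test_name.startswith('ptest') else key_parts[0]

    print(family_key("ptestresult.glibc-tests.tst-cpuclock1"))
    # -> "ptestresult.glibc-tests": ptest regressions are bucketed per suite
    print(family_key("oescripts.OEListPackageconfigTests.test_packageconfig_flags"))
    # -> "oescripts": non-ptest results group under their first component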
@@ -44,12 +273,38 @@ def regression(args, logger):
 
     regression_common(args, logger, base_results, target_results)
 
+# Some test case naming is poor and contains random strings, particularly lttng/babeltrace.
+# Truncating the test names works since they contain file and line number identifiers
+# which allow us to match them without the random components.
+def fixup_ptest_names(results, logger):
+    for r in results:
+        for i in results[r]:
+            tests = list(results[r][i]['result'].keys())
+            for test in tests:
+                new = None
+                if test.startswith(("ptestresult.lttng-tools.", "ptestresult.babeltrace.", "ptestresult.babeltrace2")) and "_-_" in test:
+                    new = test.split("_-_")[0]
+                elif test.startswith(("ptestresult.curl.")) and "__" in test:
+                    new = test.split("__")[0]
+                elif test.startswith(("ptestresult.dbus.")) and "__" in test:
+                    new = test.split("__")[0]
+                elif test.startswith("ptestresult.binutils") and "build-st-" in test:
+                    new = test.split(" ")[0]
+                elif test.startswith("ptestresult.gcc") and "/tmp/runtest." in test:
+                    new = ".".join(test.split(".")[:2])
+                if new:
+                    results[r][i]['result'][new] = results[r][i]['result'][test]
+                    del results[r][i]['result'][test]
+
 def regression_common(args, logger, base_results, target_results):
     if args.base_result_id:
         base_results = resultutils.filter_resultsdata(base_results, args.base_result_id)
     if args.target_result_id:
         target_results = resultutils.filter_resultsdata(target_results, args.target_result_id)
 
+    fixup_ptest_names(base_results, logger)
+    fixup_ptest_names(target_results, logger)
+
     matches = []
     regressions = []
     notfound = []
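Each rule in fixup_ptest_names() keys on the separator that precedes the random, per-run part of a name, so renamed instances of the same test can still be matched across result sets. With hypothetical raw names (the suffixes are stand-ins for the random components):

    # lttng-tools/babeltrace rule: truncate at "_-_"
    print("ptestresult.lttng-tools.ust.before-after_-_out_ab12cd".split("_-_")[0])
    # -> "ptestresult.lttng-tools.ust.before-after"

    # curl/dbus rule: truncate at "__"
    print("ptestresult.curl.test_0042__run_3".split("__")[0])
    # -> "ptestresult.curl.test_0042"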
@@ -62,7 +317,9 @@ def regression_common(args, logger, base_results, target_results):
             # removing any pairs which match
             for c in base.copy():
                 for b in target.copy():
-                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b])
+                    if not can_be_compared(logger, base_results[a][c], target_results[a][b]):
+                        continue
+                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b], args.limit)
                     if not res:
                         matches.append(resstr)
                         base.remove(c)
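can_be_compared() now gates every candidate pairing before any comparison work is done; for example, a runtime run containing LTP results is only paired with another LTP run. A sketch with hypothetical entries in the shape the loop passes in ({'configuration': ..., 'result': ...}):

    import logging
    from resulttool.regression import can_be_compared

    logger = logging.getLogger("resulttool")
    ltp_run = {
        "configuration": {"TEST_TYPE": "runtime", "MACHINE": "qemux86-64"},
        "result": {"ltpresult.math.abs01": {"status": "PASS"}},
    }
    plain_run = {
        "configuration": {"TEST_TYPE": "runtime", "MACHINE": "qemux86-64"},
        "result": {"ping.PingTest.test_ping": {"status": "PASS"}},
    }

    print(can_be_compared(logger, ltp_run, plain_run))  # False: target has no LTP results
    print(can_be_compared(logger, ltp_run, ltp_run))    # True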
@@ -71,15 +328,18 @@ def regression_common(args, logger, base_results, target_results):
             # Should only now see regressions, we may not be able to match multiple pairs directly
             for c in base:
                 for b in target:
-                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b])
+                    if not can_be_compared(logger, base_results[a][c], target_results[a][b]):
+                        continue
+                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b], args.limit)
                     if res:
                         regressions.append(resstr)
         else:
             notfound.append("%s not found in target" % a)
-    print("\n".join(sorted(matches)))
     print("\n".join(sorted(regressions)))
+    print("\n" + MISSING_TESTS_BANNER + "\n")
     print("\n".join(sorted(notfound)))
+    print("\n" + ADDITIONAL_DATA_BANNER + "\n")
+    print("\n".join(sorted(matches)))
     return 0
 
 def regression_git(args, logger):
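With the reordered prints, regressions lead the report and matches are demoted below the two banners. Roughly, a run's output is now laid out as follows (an illustrative skeleton, not verbatim tool output):

    <regression reports, one per compared pair, limited per test family>

    -------------------------- Missing tests --------------------------

    <base configurations with no counterpart in the target results>

    --------------------- Matches and improvements --------------------

    <matching pairs, annotated by get_additional_info_line(), e.g.
     " -> +2 test(s) present, +5 test(s) now passing">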
@@ -183,4 +443,5 @@ def register_commands(subparsers):
     parser_build.add_argument('--commit-number', help="Revision number to search for, redundant if --commit is specified")
     parser_build.add_argument('--commit2', help="Revision to compare with")
     parser_build.add_argument('--commit-number2', help="Revision number to compare with, redundant if --commit2 is specified")
+    parser_build.add_argument('-l', '--limit', default=REGRESSIONS_DISPLAY_LIMIT, help="Maximum number of changes to display per test. Can be set to 0 to print all changes")
 
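Judging by the surrounding --commit arguments, this parser is the regression-git subcommand; assuming so, a usage sketch with placeholder repository path and revisions:

    # Default: at most 50 (REGRESSIONS_DISPLAY_LIMIT) changes listed per test family
    resulttool regression-git /path/to/yocto-testresults --commit <base-rev> --commit2 <target-rev>

    # Set the limit to 0 to print every change with no truncation
    resulttool regression-git /path/to/yocto-testresults --commit <base-rev> --commit2 <target-rev> -l 0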