diff options
-rw-r--r-- | meta/recipes-devtools/gcc/gcc-14.2.inc | 1 | ||||
-rw-r--r-- | meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch | 447 |
2 files changed, 448 insertions, 0 deletions
diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc index 932a27995b..3d65bed92a 100644 --- a/meta/recipes-devtools/gcc/gcc-14.2.inc +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc | |||
@@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ | |||
69 | file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ | 69 | file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ |
70 | file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ | 70 | file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ |
71 | file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ | 71 | file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ |
72 | file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \ | ||
72 | file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \ | 73 | file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \ |
73 | " | 74 | " |
74 | 75 | ||
diff --git a/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch new file mode 100644 index 0000000000..5bede60816 --- /dev/null +++ b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch | |||
@@ -0,0 +1,447 @@ | |||
1 | From 4e7735a8d87559bbddfe3a985786996e22241f8d Mon Sep 17 00:00:00 2001 | ||
2 | From: liuhongt <hongtao.liu@intel.com> | ||
3 | Date: Mon, 12 Aug 2024 14:35:31 +0800 | ||
4 | Subject: [PATCH] Move ix86_align_loops into a separate pass and insert the | ||
5 | pass after pass_endbr_and_patchable_area. | ||
6 | |||
7 | gcc/ChangeLog: | ||
8 | |||
9 | PR target/116174 | ||
10 | * config/i386/i386.cc (ix86_align_loops): Move this to .. | ||
11 | * config/i386/i386-features.cc (ix86_align_loops): .. here. | ||
12 | (class pass_align_tight_loops): New class. | ||
13 | (make_pass_align_tight_loops): New function. | ||
14 | * config/i386/i386-passes.def: Insert pass_align_tight_loops | ||
15 | after pass_insert_endbr_and_patchable_area. | ||
16 | * config/i386/i386-protos.h (make_pass_align_tight_loops): New | ||
17 | declaration. | ||
18 | |||
19 | gcc/testsuite/ChangeLog: | ||
20 | |||
21 | * gcc.target/i386/pr116174.c: New test. | ||
22 | |||
23 | (cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8) | ||
24 | |||
25 | Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=4e7735a8d87559bbddfe3a985786996e22241f8d] | ||
26 | |||
27 | Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> | ||
28 | --- | ||
29 | gcc/config/i386/i386-features.cc | 191 +++++++++++++++++++++++ | ||
30 | gcc/config/i386/i386-passes.def | 3 + | ||
31 | gcc/config/i386/i386-protos.h | 1 + | ||
32 | gcc/config/i386/i386.cc | 146 ----------------- | ||
33 | gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ | ||
34 | 5 files changed, 207 insertions(+), 146 deletions(-) | ||
35 | create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c | ||
36 | |||
37 | diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc | ||
38 | index e3e004d55267..7de19d423637 100644 | ||
39 | --- a/gcc/config/i386/i386-features.cc | ||
40 | +++ b/gcc/config/i386/i386-features.cc | ||
41 | @@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) | ||
42 | return new pass_remove_partial_avx_dependency (ctxt); | ||
43 | } | ||
44 | |||
45 | +/* When a hot loop fits into one cache line, | ||
46 | + force-align the loop without considering the max skip. */ | ||
47 | +static void | ||
48 | +ix86_align_loops () | ||
49 | +{ | ||
50 | + basic_block bb; | ||
51 | + | ||
52 | + /* Don't do this when we don't know cache line size. */ | ||
53 | + if (ix86_cost->prefetch_block == 0) | ||
54 | + return; | ||
55 | + | ||
56 | + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); | ||
57 | + profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; | ||
58 | + FOR_EACH_BB_FN (bb, cfun) | ||
59 | + { | ||
60 | + rtx_insn *label = BB_HEAD (bb); | ||
61 | + bool has_fallthru = 0; | ||
62 | + edge e; | ||
63 | + edge_iterator ei; | ||
64 | + | ||
65 | + if (!LABEL_P (label)) | ||
66 | + continue; | ||
67 | + | ||
68 | + profile_count fallthru_count = profile_count::zero (); | ||
69 | + profile_count branch_count = profile_count::zero (); | ||
70 | + | ||
71 | + FOR_EACH_EDGE (e, ei, bb->preds) | ||
72 | + { | ||
73 | + if (e->flags & EDGE_FALLTHRU) | ||
74 | + has_fallthru = 1, fallthru_count += e->count (); | ||
75 | + else | ||
76 | + branch_count += e->count (); | ||
77 | + } | ||
78 | + | ||
79 | + if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) | ||
80 | + continue; | ||
81 | + | ||
82 | + if (bb->loop_father | ||
83 | + && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) | ||
84 | + && (has_fallthru | ||
85 | + ? (!(single_succ_p (bb) | ||
86 | + && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) | ||
87 | + && optimize_bb_for_speed_p (bb) | ||
88 | + && branch_count + fallthru_count > count_threshold | ||
89 | + && (branch_count > fallthru_count * param_align_loop_iterations)) | ||
90 | + /* In case there's no fallthru for the loop. | ||
91 | + Nops inserted won't be executed. */ | ||
92 | + : (branch_count > count_threshold | ||
93 | + || (bb->count > bb->prev_bb->count * 10 | ||
94 | + && (bb->prev_bb->count | ||
95 | + <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) | ||
96 | + { | ||
97 | + rtx_insn* insn, *end_insn; | ||
98 | + HOST_WIDE_INT size = 0; | ||
99 | + bool padding_p = true; | ||
100 | + basic_block tbb = bb; | ||
101 | + unsigned cond_branch_num = 0; | ||
102 | + bool detect_tight_loop_p = false; | ||
103 | + | ||
104 | + for (unsigned int i = 0; i != bb->loop_father->num_nodes; | ||
105 | + i++, tbb = tbb->next_bb) | ||
106 | + { | ||
107 | + /* Only handle continuous cfg layout. */ | ||
108 | + if (bb->loop_father != tbb->loop_father) | ||
109 | + { | ||
110 | + padding_p = false; | ||
111 | + break; | ||
112 | + } | ||
113 | + | ||
114 | + FOR_BB_INSNS (tbb, insn) | ||
115 | + { | ||
116 | + if (!NONDEBUG_INSN_P (insn)) | ||
117 | + continue; | ||
118 | + size += ix86_min_insn_size (insn); | ||
119 | + | ||
120 | + /* We don't know size of inline asm. | ||
121 | + Don't align loop for call. */ | ||
122 | + if (asm_noperands (PATTERN (insn)) >= 0 | ||
123 | + || CALL_P (insn)) | ||
124 | + { | ||
125 | + size = -1; | ||
126 | + break; | ||
127 | + } | ||
128 | + } | ||
129 | + | ||
130 | + if (size == -1 || size > ix86_cost->prefetch_block) | ||
131 | + { | ||
132 | + padding_p = false; | ||
133 | + break; | ||
134 | + } | ||
135 | + | ||
136 | + FOR_EACH_EDGE (e, ei, tbb->succs) | ||
137 | + { | ||
138 | + /* It could be part of the loop. */ | ||
139 | + if (e->dest == bb) | ||
140 | + { | ||
141 | + detect_tight_loop_p = true; | ||
142 | + break; | ||
143 | + } | ||
144 | + } | ||
145 | + | ||
146 | + if (detect_tight_loop_p) | ||
147 | + break; | ||
148 | + | ||
149 | + end_insn = BB_END (tbb); | ||
150 | + if (JUMP_P (end_insn)) | ||
151 | + { | ||
152 | + /* For decoded icache: | ||
153 | + 1. Up to two branches are allowed per Way. | ||
154 | + 2. A non-conditional branch is the last micro-op in a Way. | ||
155 | + */ | ||
156 | + if (onlyjump_p (end_insn) | ||
157 | + && (any_uncondjump_p (end_insn) | ||
158 | + || single_succ_p (tbb))) | ||
159 | + { | ||
160 | + padding_p = false; | ||
161 | + break; | ||
162 | + } | ||
163 | + else if (++cond_branch_num >= 2) | ||
164 | + { | ||
165 | + padding_p = false; | ||
166 | + break; | ||
167 | + } | ||
168 | + } | ||
169 | + | ||
170 | + } | ||
171 | + | ||
172 | + if (padding_p && detect_tight_loop_p) | ||
173 | + { | ||
174 | + emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), | ||
175 | + GEN_INT (0)), label); | ||
176 | + /* End of function. */ | ||
177 | + if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) | ||
178 | + break; | ||
179 | + /* Skip bb which already fits into one cacheline. */ | ||
180 | + bb = tbb; | ||
181 | + } | ||
182 | + } | ||
183 | + } | ||
184 | + | ||
185 | + loop_optimizer_finalize (); | ||
186 | + free_dominance_info (CDI_DOMINATORS); | ||
187 | +} | ||
188 | + | ||
189 | +namespace { | ||
190 | + | ||
191 | +const pass_data pass_data_align_tight_loops = | ||
192 | +{ | ||
193 | + RTL_PASS, /* type */ | ||
194 | + "align_tight_loops", /* name */ | ||
195 | + OPTGROUP_NONE, /* optinfo_flags */ | ||
196 | + TV_MACH_DEP, /* tv_id */ | ||
197 | + 0, /* properties_required */ | ||
198 | + 0, /* properties_provided */ | ||
199 | + 0, /* properties_destroyed */ | ||
200 | + 0, /* todo_flags_start */ | ||
201 | + 0, /* todo_flags_finish */ | ||
202 | +}; | ||
203 | + | ||
204 | +class pass_align_tight_loops : public rtl_opt_pass | ||
205 | +{ | ||
206 | +public: | ||
207 | + pass_align_tight_loops (gcc::context *ctxt) | ||
208 | + : rtl_opt_pass (pass_data_align_tight_loops, ctxt) | ||
209 | + {} | ||
210 | + | ||
211 | + /* opt_pass methods: */ | ||
212 | + bool gate (function *) final override | ||
213 | + { | ||
214 | + return optimize && optimize_function_for_speed_p (cfun); | ||
215 | + } | ||
216 | + | ||
217 | + unsigned int execute (function *) final override | ||
218 | + { | ||
219 | + timevar_push (TV_MACH_DEP); | ||
220 | +#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN | ||
221 | + ix86_align_loops (); | ||
222 | +#endif | ||
223 | + timevar_pop (TV_MACH_DEP); | ||
224 | + return 0; | ||
225 | + } | ||
226 | +}; // class pass_align_tight_loops | ||
227 | + | ||
228 | +} // anon namespace | ||
229 | + | ||
230 | +rtl_opt_pass * | ||
231 | +make_pass_align_tight_loops (gcc::context *ctxt) | ||
232 | +{ | ||
233 | + return new pass_align_tight_loops (ctxt); | ||
234 | +} | ||
235 | + | ||
236 | /* This compares the priority of target features in function DECL1 | ||
237 | and DECL2. It returns positive value if DECL1 is higher priority, | ||
238 | negative value if DECL2 is higher priority and 0 if they are the | ||
239 | diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def | ||
240 | index 7d96766f7b96..e500f15c9971 100644 | ||
241 | --- a/gcc/config/i386/i386-passes.def | ||
242 | +++ b/gcc/config/i386/i386-passes.def | ||
243 | @@ -31,5 +31,8 @@ along with GCC; see the file COPYING3. If not see | ||
244 | INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); | ||
245 | |||
246 | INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); | ||
247 | + /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area. | ||
248 | + PR116174. */ | ||
249 | + INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); | ||
250 | |||
251 | INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); | ||
252 | diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h | ||
253 | index 46214a63974d..36c7b1aed42b 100644 | ||
254 | --- a/gcc/config/i386/i386-protos.h | ||
255 | +++ b/gcc/config/i386/i386-protos.h | ||
256 | @@ -419,6 +419,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area | ||
257 | (gcc::context *); | ||
258 | extern rtl_opt_pass *make_pass_remove_partial_avx_dependency | ||
259 | (gcc::context *); | ||
260 | +extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); | ||
261 | |||
262 | extern bool ix86_has_no_direct_extern_access; | ||
263 | |||
264 | diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc | ||
265 | index 6f89891d3cb5..288c69467d62 100644 | ||
266 | --- a/gcc/config/i386/i386.cc | ||
267 | +++ b/gcc/config/i386/i386.cc | ||
268 | @@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load () | ||
269 | } | ||
270 | } | ||
271 | |||
272 | -/* When a hot loop can be fit into one cacheline, | ||
273 | - force align the loop without considering the max skip. */ | ||
274 | -static void | ||
275 | -ix86_align_loops () | ||
276 | -{ | ||
277 | - basic_block bb; | ||
278 | - | ||
279 | - /* Don't do this when we don't know cache line size. */ | ||
280 | - if (ix86_cost->prefetch_block == 0) | ||
281 | - return; | ||
282 | - | ||
283 | - loop_optimizer_init (AVOID_CFG_MODIFICATIONS); | ||
284 | - profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; | ||
285 | - FOR_EACH_BB_FN (bb, cfun) | ||
286 | - { | ||
287 | - rtx_insn *label = BB_HEAD (bb); | ||
288 | - bool has_fallthru = 0; | ||
289 | - edge e; | ||
290 | - edge_iterator ei; | ||
291 | - | ||
292 | - if (!LABEL_P (label)) | ||
293 | - continue; | ||
294 | - | ||
295 | - profile_count fallthru_count = profile_count::zero (); | ||
296 | - profile_count branch_count = profile_count::zero (); | ||
297 | - | ||
298 | - FOR_EACH_EDGE (e, ei, bb->preds) | ||
299 | - { | ||
300 | - if (e->flags & EDGE_FALLTHRU) | ||
301 | - has_fallthru = 1, fallthru_count += e->count (); | ||
302 | - else | ||
303 | - branch_count += e->count (); | ||
304 | - } | ||
305 | - | ||
306 | - if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) | ||
307 | - continue; | ||
308 | - | ||
309 | - if (bb->loop_father | ||
310 | - && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) | ||
311 | - && (has_fallthru | ||
312 | - ? (!(single_succ_p (bb) | ||
313 | - && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) | ||
314 | - && optimize_bb_for_speed_p (bb) | ||
315 | - && branch_count + fallthru_count > count_threshold | ||
316 | - && (branch_count > fallthru_count * param_align_loop_iterations)) | ||
317 | - /* In case there'no fallthru for the loop. | ||
318 | - Nops inserted won't be executed. */ | ||
319 | - : (branch_count > count_threshold | ||
320 | - || (bb->count > bb->prev_bb->count * 10 | ||
321 | - && (bb->prev_bb->count | ||
322 | - <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) | ||
323 | - { | ||
324 | - rtx_insn* insn, *end_insn; | ||
325 | - HOST_WIDE_INT size = 0; | ||
326 | - bool padding_p = true; | ||
327 | - basic_block tbb = bb; | ||
328 | - unsigned cond_branch_num = 0; | ||
329 | - bool detect_tight_loop_p = false; | ||
330 | - | ||
331 | - for (unsigned int i = 0; i != bb->loop_father->num_nodes; | ||
332 | - i++, tbb = tbb->next_bb) | ||
333 | - { | ||
334 | - /* Only handle continuous cfg layout. */ | ||
335 | - if (bb->loop_father != tbb->loop_father) | ||
336 | - { | ||
337 | - padding_p = false; | ||
338 | - break; | ||
339 | - } | ||
340 | - | ||
341 | - FOR_BB_INSNS (tbb, insn) | ||
342 | - { | ||
343 | - if (!NONDEBUG_INSN_P (insn)) | ||
344 | - continue; | ||
345 | - size += ix86_min_insn_size (insn); | ||
346 | - | ||
347 | - /* We don't know size of inline asm. | ||
348 | - Don't align loop for call. */ | ||
349 | - if (asm_noperands (PATTERN (insn)) >= 0 | ||
350 | - || CALL_P (insn)) | ||
351 | - { | ||
352 | - size = -1; | ||
353 | - break; | ||
354 | - } | ||
355 | - } | ||
356 | - | ||
357 | - if (size == -1 || size > ix86_cost->prefetch_block) | ||
358 | - { | ||
359 | - padding_p = false; | ||
360 | - break; | ||
361 | - } | ||
362 | - | ||
363 | - FOR_EACH_EDGE (e, ei, tbb->succs) | ||
364 | - { | ||
365 | - /* It could be part of the loop. */ | ||
366 | - if (e->dest == bb) | ||
367 | - { | ||
368 | - detect_tight_loop_p = true; | ||
369 | - break; | ||
370 | - } | ||
371 | - } | ||
372 | - | ||
373 | - if (detect_tight_loop_p) | ||
374 | - break; | ||
375 | - | ||
376 | - end_insn = BB_END (tbb); | ||
377 | - if (JUMP_P (end_insn)) | ||
378 | - { | ||
379 | - /* For decoded icache: | ||
380 | - 1. Up to two branches are allowed per Way. | ||
381 | - 2. A non-conditional branch is the last micro-op in a Way. | ||
382 | - */ | ||
383 | - if (onlyjump_p (end_insn) | ||
384 | - && (any_uncondjump_p (end_insn) | ||
385 | - || single_succ_p (tbb))) | ||
386 | - { | ||
387 | - padding_p = false; | ||
388 | - break; | ||
389 | - } | ||
390 | - else if (++cond_branch_num >= 2) | ||
391 | - { | ||
392 | - padding_p = false; | ||
393 | - break; | ||
394 | - } | ||
395 | - } | ||
396 | - | ||
397 | - } | ||
398 | - | ||
399 | - if (padding_p && detect_tight_loop_p) | ||
400 | - { | ||
401 | - emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), | ||
402 | - GEN_INT (0)), label); | ||
403 | - /* End of function. */ | ||
404 | - if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) | ||
405 | - break; | ||
406 | - /* Skip bb which already fits into one cacheline. */ | ||
407 | - bb = tbb; | ||
408 | - } | ||
409 | - } | ||
410 | - } | ||
411 | - | ||
412 | - loop_optimizer_finalize (); | ||
413 | - free_dominance_info (CDI_DOMINATORS); | ||
414 | -} | ||
415 | - | ||
416 | /* Implement machine specific optimizations. We implement padding of returns | ||
417 | for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ | ||
418 | static void | ||
419 | @@ -23611,8 +23467,6 @@ ix86_reorg (void) | ||
420 | #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN | ||
421 | if (TARGET_FOUR_JUMP_LIMIT) | ||
422 | ix86_avoid_jump_mispredicts (); | ||
423 | - | ||
424 | - ix86_align_loops (); | ||
425 | #endif | ||
426 | } | ||
427 | } | ||
428 | diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c | ||
429 | new file mode 100644 | ||
430 | index 000000000000..8877d0b51af1 | ||
431 | --- /dev/null | ||
432 | +++ b/gcc/testsuite/gcc.target/i386/pr116174.c | ||
433 | @@ -0,0 +1,12 @@ | ||
434 | +/* { dg-do compile { target *-*-linux* } } */ | ||
435 | +/* { dg-options "-O2 -fcf-protection=branch" } */ | ||
436 | + | ||
437 | +char * | ||
438 | +foo (char *dest, const char *src) | ||
439 | +{ | ||
440 | + while ((*dest++ = *src++) != '\0') | ||
441 | + /* nothing */; | ||
442 | + return --dest; | ||
443 | +} | ||
444 | + | ||
445 | +/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */ | ||
446 | -- | ||
447 | 2.43.5 | ||