-rw-r--r--  meta/recipes-devtools/gcc/gcc-11.4.inc            |    1
-rw-r--r--  meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch | 2893
2 files changed, 2894 insertions, 0 deletions
diff --git a/meta/recipes-devtools/gcc/gcc-11.4.inc b/meta/recipes-devtools/gcc/gcc-11.4.inc
index 3670f494a6..88310e6b79 100644
--- a/meta/recipes-devtools/gcc/gcc-11.4.inc
+++ b/meta/recipes-devtools/gcc/gcc-11.4.inc
@@ -68,6 +68,7 @@ SRC_URI = "\
            file://0002-aarch64-add-armv9-a-to-march.patch \
            file://0003-aarch64-Enable-FP16-feature-by-default-for-Armv9.patch \
            file://0004-arm-add-armv9-a-architecture-to-march.patch \
+           file://CVE-2023-4039.patch \
 "
 
 SRC_URI[sha256sum] = "3f2db222b007e8a4a23cd5ba56726ef08e8b1f1eb2055ee72c1402cea73a8dd9"
diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
new file mode 100644
index 0000000000..41684fe7dd
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
@@ -0,0 +1,2893 @@
1From: Richard Sandiford <richard.sandiford@arm.com>
2Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
3Date: Tue, 12 Sep 2023 16:25:10 +0100
4
5This series of patches fixes deficiencies in GCC's -fstack-protector
6implementation for AArch64 when using dynamically allocated stack space.
7This is CVE-2023-4039. See:
8
9https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
10https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
11
12for more details.
13
14The fix is to put the saved registers above the locals area when
15-fstack-protector is used.
16
17The series also fixes a stack-clash problem that I found while working
18on the CVE. In unpatched sources, the stack-clash problem would only
19trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
20equivalent). But it would be a more significant issue with the new
21-fstack-protector frame layout. It's therefore important that both
22problems are fixed together.
23
24Some reorganisation of the code seemed necessary to fix the problems in a
25cleanish way. The series is therefore quite long, but only a handful of
26patches should have any effect on code generation.
27
28See the individual patches for a detailed description.
29
30Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
31I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
32
33CVE: CVE-2023-4039
34Upstream-Status: Backport
35Signed-off-by: Ross Burton <ross.burton@arm.com>
36
37
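As a rough illustration of the class of function the advisory is about (invented here, not taken from the series): a function that makes a dynamically sized stack allocation under -fstack-protector. In the old layout the saved FP/LR sat between that allocation and the canary-protected locals, so an overflow of the buffer could reach the return address without disturbing the canary; with the saved registers moved above the locals, such a write has to cross the canary first.

  /* Invented example of the affected pattern; compile with something like
     "g++ -O2 -fstack-protector-strong" to inspect the AArch64 frame.  */
  #include <alloca.h>
  #include <cstddef>
  #include <cstdio>
  #include <cstring>

  void log_line (const char *msg, std::size_t guess)
  {
    /* Dynamically sized stack allocation: SP drops below the static frame.  */
    char *buf = static_cast<char *> (alloca (guess));

    /* If MSG is longer than GUESS, this writes past the end of BUF towards
       higher addresses, i.e. back up into the frame.  In the old layout the
       saved LR/FP were reachable this way without touching the canary,
       which only guarded the locals above them.  */
    std::strcpy (buf, msg);
    std::printf ("%s\n", buf);
  }
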
38From 52816ab48f97968f3fbfb5656250f3de7c00166d Mon Sep 17 00:00:00 2001
39From: Richard Sandiford <richard.sandiford@arm.com>
40Date: Tue, 12 Sep 2023 16:19:43 +0100
41Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code
42
43aarch64_layout_frame uses a shorthand for referring to
44cfun->machine->frame:
45
46 aarch64_frame &frame = cfun->machine->frame;
47
48This patch does the same for some other heavy users of the structure.
49No functional change intended.
50
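In isolation, the pattern looks like this (sketched with invented stand-in types rather than GCC's real cfun/machine_function structures):

  struct frame_info { long reg_offset[32]; long frame_size; };
  struct machine_function { frame_info frame; };
  struct function { machine_function *machine; };

  /* Before: the full chain is spelled out at every use.  */
  long slot_end_before (function *cfun, int regno)
  {
    return cfun->machine->frame.reg_offset[regno]
           + cfun->machine->frame.frame_size;
  }

  /* After: bind a reference once, then use the local shorthand.  */
  long slot_end_after (function *cfun, int regno)
  {
    frame_info &frame = cfun->machine->frame;
    return frame.reg_offset[regno] + frame.frame_size;
  }
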
51gcc/
52 * config/aarch64/aarch64.c (aarch64_save_callee_saves): Use
53 a local shorthand for cfun->machine->frame.
54 (aarch64_restore_callee_saves, aarch64_get_separate_components):
55 (aarch64_process_components): Likewise.
56 (aarch64_allocate_and_probe_stack_space): Likewise.
57 (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
58 (aarch64_layout_frame): Use existing shorthand for one more case.
59---
60 gcc/config/aarch64/aarch64.c | 115 ++++++++++++++++++-----------------
61 1 file changed, 60 insertions(+), 55 deletions(-)
62
63diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
64index 391a93f3018..77c1d1300a5 100644
65--- a/gcc/config/aarch64/aarch64.c
66+++ b/gcc/config/aarch64/aarch64.c
67@@ -7994,6 +7994,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
68 unsigned start, unsigned limit, bool skip_wb,
69 bool hard_fp_valid_p)
70 {
71+ aarch64_frame &frame = cfun->machine->frame;
72 rtx_insn *insn;
73 unsigned regno;
74 unsigned regno2;
75@@ -8008,8 +8009,8 @@ aarch64_save_callee_saves (poly_int64 start_offset,
76 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
77
78 if (skip_wb
79- && (regno == cfun->machine->frame.wb_candidate1
80- || regno == cfun->machine->frame.wb_candidate2))
81+ && (regno == frame.wb_candidate1
82+ || regno == frame.wb_candidate2))
83 continue;
84
85 if (cfun->machine->reg_is_wrapped_separately[regno])
86@@ -8017,7 +8018,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
87
88 machine_mode mode = aarch64_reg_save_mode (regno);
89 reg = gen_rtx_REG (mode, regno);
90- offset = start_offset + cfun->machine->frame.reg_offset[regno];
91+ offset = start_offset + frame.reg_offset[regno];
92 rtx base_rtx = stack_pointer_rtx;
93 poly_int64 sp_offset = offset;
94
95@@ -8030,7 +8031,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
96 {
97 gcc_assert (known_eq (start_offset, 0));
98 poly_int64 fp_offset
99- = cfun->machine->frame.below_hard_fp_saved_regs_size;
100+ = frame.below_hard_fp_saved_regs_size;
101 if (hard_fp_valid_p)
102 base_rtx = hard_frame_pointer_rtx;
103 else
104@@ -8052,8 +8053,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
105 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
106 && !cfun->machine->reg_is_wrapped_separately[regno2]
107 && known_eq (GET_MODE_SIZE (mode),
108- cfun->machine->frame.reg_offset[regno2]
109- - cfun->machine->frame.reg_offset[regno]))
110+ frame.reg_offset[regno2] - frame.reg_offset[regno]))
111 {
112 rtx reg2 = gen_rtx_REG (mode, regno2);
113 rtx mem2;
114@@ -8103,6 +8103,7 @@ static void
115 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
116 unsigned limit, bool skip_wb, rtx *cfi_ops)
117 {
118+ aarch64_frame &frame = cfun->machine->frame;
119 unsigned regno;
120 unsigned regno2;
121 poly_int64 offset;
122@@ -8119,13 +8120,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
123 rtx reg, mem;
124
125 if (skip_wb
126- && (regno == cfun->machine->frame.wb_candidate1
127- || regno == cfun->machine->frame.wb_candidate2))
128+ && (regno == frame.wb_candidate1
129+ || regno == frame.wb_candidate2))
130 continue;
131
132 machine_mode mode = aarch64_reg_save_mode (regno);
133 reg = gen_rtx_REG (mode, regno);
134- offset = start_offset + cfun->machine->frame.reg_offset[regno];
135+ offset = start_offset + frame.reg_offset[regno];
136 rtx base_rtx = stack_pointer_rtx;
137 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
138 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
139@@ -8136,8 +8137,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
140 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
141 && !cfun->machine->reg_is_wrapped_separately[regno2]
142 && known_eq (GET_MODE_SIZE (mode),
143- cfun->machine->frame.reg_offset[regno2]
144- - cfun->machine->frame.reg_offset[regno]))
145+ frame.reg_offset[regno2] - frame.reg_offset[regno]))
146 {
147 rtx reg2 = gen_rtx_REG (mode, regno2);
148 rtx mem2;
149@@ -8242,6 +8242,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
150 static sbitmap
151 aarch64_get_separate_components (void)
152 {
153+ aarch64_frame &frame = cfun->machine->frame;
154 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
155 bitmap_clear (components);
156
157@@ -8258,18 +8259,18 @@ aarch64_get_separate_components (void)
158 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
159 continue;
160
161- poly_int64 offset = cfun->machine->frame.reg_offset[regno];
162+ poly_int64 offset = frame.reg_offset[regno];
163
164 /* If the register is saved in the first SVE save slot, we use
165 it as a stack probe for -fstack-clash-protection. */
166 if (flag_stack_clash_protection
167- && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
168+ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
169 && known_eq (offset, 0))
170 continue;
171
172 /* Get the offset relative to the register we'll use. */
173 if (frame_pointer_needed)
174- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
175+ offset -= frame.below_hard_fp_saved_regs_size;
176 else
177 offset += crtl->outgoing_args_size;
178
179@@ -8288,11 +8289,11 @@ aarch64_get_separate_components (void)
180 /* If the spare predicate register used by big-endian SVE code
181 is call-preserved, it must be saved in the main prologue
182 before any saves that use it. */
183- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
184- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
185+ if (frame.spare_pred_reg != INVALID_REGNUM)
186+ bitmap_clear_bit (components, frame.spare_pred_reg);
187
188- unsigned reg1 = cfun->machine->frame.wb_candidate1;
189- unsigned reg2 = cfun->machine->frame.wb_candidate2;
190+ unsigned reg1 = frame.wb_candidate1;
191+ unsigned reg2 = frame.wb_candidate2;
192 /* If registers have been chosen to be stored/restored with
193 writeback don't interfere with them to avoid having to output explicit
194 stack adjustment instructions. */
195@@ -8401,6 +8402,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
196 static void
197 aarch64_process_components (sbitmap components, bool prologue_p)
198 {
199+ aarch64_frame &frame = cfun->machine->frame;
200 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
201 ? HARD_FRAME_POINTER_REGNUM
202 : STACK_POINTER_REGNUM);
203@@ -8415,9 +8417,9 @@ aarch64_process_components (sbitmap components, bool prologue_p)
204 machine_mode mode = aarch64_reg_save_mode (regno);
205
206 rtx reg = gen_rtx_REG (mode, regno);
207- poly_int64 offset = cfun->machine->frame.reg_offset[regno];
208+ poly_int64 offset = frame.reg_offset[regno];
209 if (frame_pointer_needed)
210- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
211+ offset -= frame.below_hard_fp_saved_regs_size;
212 else
213 offset += crtl->outgoing_args_size;
214
215@@ -8442,14 +8444,14 @@ aarch64_process_components (sbitmap components, bool prologue_p)
216 break;
217 }
218
219- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
220+ poly_int64 offset2 = frame.reg_offset[regno2];
221 /* The next register is not of the same class or its offset is not
222 mergeable with the current one into a pair. */
223 if (aarch64_sve_mode_p (mode)
224 || !satisfies_constraint_Ump (mem)
225 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
226 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
227- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
228+ || maybe_ne ((offset2 - frame.reg_offset[regno]),
229 GET_MODE_SIZE (mode)))
230 {
231 insn = emit_insn (set);
232@@ -8471,7 +8473,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
233 /* REGNO2 can be saved/restored in a pair with REGNO. */
234 rtx reg2 = gen_rtx_REG (mode, regno2);
235 if (frame_pointer_needed)
236- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
237+ offset2 -= frame.below_hard_fp_saved_regs_size;
238 else
239 offset2 += crtl->outgoing_args_size;
240 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
241@@ -8566,6 +8568,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
242 bool frame_related_p,
243 bool final_adjustment_p)
244 {
245+ aarch64_frame &frame = cfun->machine->frame;
246 HOST_WIDE_INT guard_size
247 = 1 << param_stack_clash_protection_guard_size;
248 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
249@@ -8586,25 +8589,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
250 register as a probe. We can't assume that LR was saved at position 0
251 though, so treat any space below it as unprobed. */
252 if (final_adjustment_p
253- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
254+ && known_eq (frame.below_hard_fp_saved_regs_size, 0))
255 {
256- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
257+ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
258 if (known_ge (lr_offset, 0))
259 min_probe_threshold -= lr_offset.to_constant ();
260 else
261 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
262 }
263
264- poly_int64 frame_size = cfun->machine->frame.frame_size;
265+ poly_int64 frame_size = frame.frame_size;
266
267 /* We should always have a positive probe threshold. */
268 gcc_assert (min_probe_threshold > 0);
269
270 if (flag_stack_clash_protection && !final_adjustment_p)
271 {
272- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
273- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
274- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
275+ poly_int64 initial_adjust = frame.initial_adjust;
276+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
277+ poly_int64 final_adjust = frame.final_adjust;
278
279 if (known_eq (frame_size, 0))
280 {
281@@ -8893,17 +8896,18 @@ aarch64_epilogue_uses (int regno)
282 void
283 aarch64_expand_prologue (void)
284 {
285- poly_int64 frame_size = cfun->machine->frame.frame_size;
286- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
287- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
288- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
289- poly_int64 callee_offset = cfun->machine->frame.callee_offset;
290- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
291+ aarch64_frame &frame = cfun->machine->frame;
292+ poly_int64 frame_size = frame.frame_size;
293+ poly_int64 initial_adjust = frame.initial_adjust;
294+ HOST_WIDE_INT callee_adjust = frame.callee_adjust;
295+ poly_int64 final_adjust = frame.final_adjust;
296+ poly_int64 callee_offset = frame.callee_offset;
297+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
298 poly_int64 below_hard_fp_saved_regs_size
299- = cfun->machine->frame.below_hard_fp_saved_regs_size;
300- unsigned reg1 = cfun->machine->frame.wb_candidate1;
301- unsigned reg2 = cfun->machine->frame.wb_candidate2;
302- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
303+ = frame.below_hard_fp_saved_regs_size;
304+ unsigned reg1 = frame.wb_candidate1;
305+ unsigned reg2 = frame.wb_candidate2;
306+ bool emit_frame_chain = frame.emit_frame_chain;
307 rtx_insn *insn;
308
309 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
310@@ -8969,7 +8973,7 @@ aarch64_expand_prologue (void)
311
312 /* The offset of the frame chain record (if any) from the current SP. */
313 poly_int64 chain_offset = (initial_adjust + callee_adjust
314- - cfun->machine->frame.hard_fp_offset);
315+ - frame.hard_fp_offset);
316 gcc_assert (known_ge (chain_offset, 0));
317
318 /* The offset of the bottom of the save area from the current SP. */
319@@ -9072,15 +9076,16 @@ aarch64_use_return_insn_p (void)
320 void
321 aarch64_expand_epilogue (bool for_sibcall)
322 {
323- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
324- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
325- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
326- poly_int64 callee_offset = cfun->machine->frame.callee_offset;
327- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
328+ aarch64_frame &frame = cfun->machine->frame;
329+ poly_int64 initial_adjust = frame.initial_adjust;
330+ HOST_WIDE_INT callee_adjust = frame.callee_adjust;
331+ poly_int64 final_adjust = frame.final_adjust;
332+ poly_int64 callee_offset = frame.callee_offset;
333+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
334 poly_int64 below_hard_fp_saved_regs_size
335- = cfun->machine->frame.below_hard_fp_saved_regs_size;
336- unsigned reg1 = cfun->machine->frame.wb_candidate1;
337- unsigned reg2 = cfun->machine->frame.wb_candidate2;
338+ = frame.below_hard_fp_saved_regs_size;
339+ unsigned reg1 = frame.wb_candidate1;
340+ unsigned reg2 = frame.wb_candidate2;
341 rtx cfi_ops = NULL;
342 rtx_insn *insn;
343 /* A stack clash protection prologue may not have left EP0_REGNUM or
344@@ -9113,7 +9118,7 @@ aarch64_expand_epilogue (bool for_sibcall)
345 /* We need to add memory barrier to prevent read from deallocated stack. */
346 bool need_barrier_p
347 = maybe_ne (get_frame_size ()
348- + cfun->machine->frame.saved_varargs_size, 0);
349+ + frame.saved_varargs_size, 0);
350
351 /* Emit a barrier to prevent loads from a deallocated stack. */
352 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
353@@ -11744,24 +11749,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
354 poly_int64
355 aarch64_initial_elimination_offset (unsigned from, unsigned to)
356 {
357+ aarch64_frame &frame = cfun->machine->frame;
358+
359 if (to == HARD_FRAME_POINTER_REGNUM)
360 {
361 if (from == ARG_POINTER_REGNUM)
362- return cfun->machine->frame.hard_fp_offset;
363+ return frame.hard_fp_offset;
364
365 if (from == FRAME_POINTER_REGNUM)
366- return cfun->machine->frame.hard_fp_offset
367- - cfun->machine->frame.locals_offset;
368+ return frame.hard_fp_offset - frame.locals_offset;
369 }
370
371 if (to == STACK_POINTER_REGNUM)
372 {
373 if (from == FRAME_POINTER_REGNUM)
374- return cfun->machine->frame.frame_size
375- - cfun->machine->frame.locals_offset;
376+ return frame.frame_size - frame.locals_offset;
377 }
378
379- return cfun->machine->frame.frame_size;
380+ return frame.frame_size;
381 }
382
383
384--
3852.34.1
386
387
388From a2a57f7ec7912e77eb26919545807d90065584ff Mon Sep 17 00:00:00 2001
389From: Richard Sandiford <richard.sandiford@arm.com>
390Date: Tue, 12 Sep 2023 16:19:44 +0100
391Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset
392
393When we emit the frame chain, i.e. when we reach Here in this statement
394of aarch64_expand_prologue:
395
396 if (emit_frame_chain)
397 {
398 // Here
399 ...
400 }
401
402the stack is in one of two states:
403
404- We've allocated up to the frame chain, but no more.
405
406- We've allocated the whole frame, and the frame chain is within easy
407 reach of the new SP.
408
409The offset of the frame chain from the current SP is available
410in aarch64_frame as callee_offset. It is also available as the
411chain_offset local variable, where the latter is calculated from other
412data. (However, chain_offset is not always equal to callee_offset when
413!emit_frame_chain, so chain_offset isn't redundant.)
414
415In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
416chain_offset for the initialisation of the hard frame pointer:
417
418 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
419- stack_pointer_rtx, callee_offset,
420+ stack_pointer_rtx, chain_offset,
421 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
422
423But the later REG_CFA_ADJUST_CFA handling still used callee_offset.
424
425I think the difference is harmless, but it's more logical for the
426CFA note to be in sync, and it's more convenient for later patches
427if it uses chain_offset.
428
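A worked example of the chain_offset arithmetic for the two states above; the formula is the one used in aarch64_expand_prologue, but the sizes are invented:

  #include <cassert>

  int main ()
  {
    long hard_fp_offset = 48;        /* bytes above the frame-chain record */
    long frame_size     = 208;

    /* State 1: allocated up to the frame chain, but no more
       (e.g. "stp x29, x30, [sp, -48]!").  */
    long initial_adjust = 0, callee_adjust = 48;
    long chain_offset = initial_adjust + callee_adjust - hard_fp_offset;
    assert (chain_offset == 0);      /* chain record sits at the new SP */

    /* State 2: the whole frame allocated first ("sub sp, sp, 208"),
       with the chain record within easy reach of the new SP.  */
    initial_adjust = frame_size; callee_adjust = 0;
    chain_offset = initial_adjust + callee_adjust - hard_fp_offset;
    assert (chain_offset == 160);    /* 160 bytes above the new SP */
    return 0;
  }
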
429gcc/
430 * config/aarch64/aarch64.c (aarch64_expand_prologue): Use
431 chain_offset rather than callee_offset.
432---
433 gcc/config/aarch64/aarch64.c | 4 +---
434 1 file changed, 1 insertion(+), 3 deletions(-)
435
436diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
437index 77c1d1300a5..6bc026bd08f 100644
438--- a/gcc/config/aarch64/aarch64.c
439+++ b/gcc/config/aarch64/aarch64.c
440@@ -8901,7 +8901,6 @@ aarch64_expand_prologue (void)
441 poly_int64 initial_adjust = frame.initial_adjust;
442 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
443 poly_int64 final_adjust = frame.final_adjust;
444- poly_int64 callee_offset = frame.callee_offset;
445 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
446 poly_int64 below_hard_fp_saved_regs_size
447 = frame.below_hard_fp_saved_regs_size;
448@@ -9010,8 +9009,7 @@ aarch64_expand_prologue (void)
449 implicit. */
450 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
451 {
452- rtx src = plus_constant (Pmode, stack_pointer_rtx,
453- callee_offset);
454+ rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
455 add_reg_note (insn, REG_CFA_ADJUST_CFA,
456 gen_rtx_SET (hard_frame_pointer_rtx, src));
457 }
458--
4592.34.1
460
461
462From 5efdcc8ed19d9d9e708a001f5dc695560411496d Mon Sep 17 00:00:00 2001
463From: Richard Sandiford <richard.sandiford@arm.com>
464Date: Tue, 12 Sep 2023 16:19:44 +0100
465Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
466 registers
467
468If a frame has no saved registers, it can be allocated in one go.
469There is no need to treat the areas below and above the saved
470registers as separate.
471
472And if we allocate the frame in one go, it should be allocated
473as the initial_adjust rather than the final_adjust. This allows the
474frame size to grow to guard_size - guard_used_by_caller before a stack
475probe is needed. (A frame with no register saves is necessarily a
476leaf frame.)
477
478This is a no-op as things stand, since a leaf function will have
479no outgoing arguments, and so all the frame will be above where
480the saved registers normally go.
481
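A worked example of why the choice matters for stack-clash probing, with representative values (the 64k guard and 1k caller guard are typical defaults assumed here, not taken from the patch):

  #include <cassert>

  int main ()
  {
    const long guard_size           = 64 * 1024;  /* stack-clash guard region */
    const long guard_used_by_caller = 1024;       /* STACK_CLASH_CALLER_GUARD */
    long frame_size = 32 * 1024;                  /* leaf frame, no reg saves */

    /* Allocated as final_adjust, anything beyond the 1k caller guard
       must be probed.  */
    bool probe_as_final = frame_size > guard_used_by_caller;

    /* Allocated as initial_adjust, probing only starts once the frame
       exceeds guard_size - guard_used_by_caller.  */
    bool probe_as_initial = frame_size > guard_size - guard_used_by_caller;

    assert (probe_as_final && !probe_as_initial);
    return 0;
  }
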
482gcc/
483 * config/aarch64/aarch64.c (aarch64_layout_frame): Explicitly
484 allocate the frame in one go if there are no saved registers.
485---
486 gcc/config/aarch64/aarch64.c | 8 +++++---
487 1 file changed, 5 insertions(+), 3 deletions(-)
488
489diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
490index 6bc026bd08f..05e6ae8c0c9 100644
491--- a/gcc/config/aarch64/aarch64.c
492+++ b/gcc/config/aarch64/aarch64.c
493@@ -7609,9 +7609,11 @@ aarch64_layout_frame (void)
494
495 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
496 HOST_WIDE_INT const_saved_regs_size;
497- if (frame.frame_size.is_constant (&const_size)
498- && const_size < max_push_offset
499- && known_eq (frame.hard_fp_offset, const_size))
500+ if (known_eq (frame.saved_regs_size, 0))
501+ frame.initial_adjust = frame.frame_size;
502+ else if (frame.frame_size.is_constant (&const_size)
503+ && const_size < max_push_offset
504+ && known_eq (frame.hard_fp_offset, const_size))
505 {
506 /* Simple, small frame with no outgoing arguments:
507
508--
5092.34.1
510
511
512From a8385d14318634f2e3a08a75bd2d6e2810f8cec9 Mon Sep 17 00:00:00 2001
513From: Richard Sandiford <richard.sandiford@arm.com>
514Date: Tue, 12 Sep 2023 16:19:45 +0100
515Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info
516
517The frame layout code currently hard-codes the assumption that
518the number of bytes below the saved registers is equal to the
519size of the outgoing arguments. This patch abstracts that
520value into a new field of aarch64_frame.
521
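In model form (invented names, collapsed down from the real aarch64_frame), the new field measures from the bottom of the frame to the bottom of the register save area:

  /*  incoming SP ->  +--------------------------+
                      |  varargs save / locals   |
                      +--------------------------+
                      |  callee-saved registers  |
                      +--------------------------+  ---
                      |  outgoing arguments      |   |  bytes_below_saved_regs
      final SP     -> +--------------------------+  ---                       */

  struct frame_model { long outgoing_args_size, bytes_below_saved_regs; };

  void layout (frame_model &f)
  {
    /* Today the area below the saves is exactly the outgoing arguments;
       the new field stops the rest of the code from assuming that.  */
    f.bytes_below_saved_regs = f.outgoing_args_size;
  }
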
522gcc/
523 * config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New
524 field.
525 * config/aarch64/aarch64.c (aarch64_layout_frame): Initialize it,
526 and use it instead of crtl->outgoing_args_size.
527 (aarch64_get_separate_components): Use bytes_below_saved_regs instead
528 of outgoing_args_size.
529 (aarch64_process_components): Likewise.
530---
531 gcc/config/aarch64/aarch64.c | 71 ++++++++++++++++++------------------
532 gcc/config/aarch64/aarch64.h | 5 +++
533 2 files changed, 41 insertions(+), 35 deletions(-)
534
535diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
536index 05e6ae8c0c9..8fa5a0b2545 100644
537--- a/gcc/config/aarch64/aarch64.c
538+++ b/gcc/config/aarch64/aarch64.c
539@@ -7476,6 +7476,8 @@ aarch64_layout_frame (void)
540 gcc_assert (crtl->is_leaf
541 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
542
543+ frame.bytes_below_saved_regs = crtl->outgoing_args_size;
544+
545 /* Now assign stack slots for the registers. Start with the predicate
546 registers, since predicate LDR and STR have a relatively small
547 offset range. These saves happen below the hard frame pointer. */
548@@ -7580,18 +7582,18 @@ aarch64_layout_frame (void)
549
550 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
551
552- poly_int64 above_outgoing_args
553+ poly_int64 saved_regs_and_above
554 = aligned_upper_bound (varargs_and_saved_regs_size
555 + get_frame_size (),
556 STACK_BOUNDARY / BITS_PER_UNIT);
557
558 frame.hard_fp_offset
559- = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
560+ = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
561
562 /* Both these values are already aligned. */
563- gcc_assert (multiple_p (crtl->outgoing_args_size,
564+ gcc_assert (multiple_p (frame.bytes_below_saved_regs,
565 STACK_BOUNDARY / BITS_PER_UNIT));
566- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
567+ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
568
569 frame.locals_offset = frame.saved_varargs_size;
570
571@@ -7607,7 +7609,7 @@ aarch64_layout_frame (void)
572 else if (frame.wb_candidate1 != INVALID_REGNUM)
573 max_push_offset = 256;
574
575- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
576+ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
577 HOST_WIDE_INT const_saved_regs_size;
578 if (known_eq (frame.saved_regs_size, 0))
579 frame.initial_adjust = frame.frame_size;
580@@ -7615,31 +7617,31 @@ aarch64_layout_frame (void)
581 && const_size < max_push_offset
582 && known_eq (frame.hard_fp_offset, const_size))
583 {
584- /* Simple, small frame with no outgoing arguments:
585+ /* Simple, small frame with no data below the saved registers.
586
587 stp reg1, reg2, [sp, -frame_size]!
588 stp reg3, reg4, [sp, 16] */
589 frame.callee_adjust = const_size;
590 }
591- else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
592+ else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
593 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
594- && const_outgoing_args_size + const_saved_regs_size < 512
595- /* We could handle this case even with outgoing args, provided
596- that the number of args left us with valid offsets for all
597- predicate and vector save slots. It's such a rare case that
598- it hardly seems worth the effort though. */
599- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
600+ && const_below_saved_regs + const_saved_regs_size < 512
601+ /* We could handle this case even with data below the saved
602+ registers, provided that that data left us with valid offsets
603+ for all predicate and vector save slots. It's such a rare
604+ case that it hardly seems worth the effort though. */
605+ && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
606 && !(cfun->calls_alloca
607 && frame.hard_fp_offset.is_constant (&const_fp_offset)
608 && const_fp_offset < max_push_offset))
609 {
610- /* Frame with small outgoing arguments:
611+ /* Frame with small area below the saved registers:
612
613 sub sp, sp, frame_size
614- stp reg1, reg2, [sp, outgoing_args_size]
615- stp reg3, reg4, [sp, outgoing_args_size + 16] */
616+ stp reg1, reg2, [sp, bytes_below_saved_regs]
617+ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
618 frame.initial_adjust = frame.frame_size;
619- frame.callee_offset = const_outgoing_args_size;
620+ frame.callee_offset = const_below_saved_regs;
621 }
622 else if (saves_below_hard_fp_p
623 && known_eq (frame.saved_regs_size,
624@@ -7649,30 +7651,29 @@ aarch64_layout_frame (void)
625
626 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
627 save SVE registers relative to SP
628- sub sp, sp, outgoing_args_size */
629+ sub sp, sp, bytes_below_saved_regs */
630 frame.initial_adjust = (frame.hard_fp_offset
631 + frame.below_hard_fp_saved_regs_size);
632- frame.final_adjust = crtl->outgoing_args_size;
633+ frame.final_adjust = frame.bytes_below_saved_regs;
634 }
635 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
636 && const_fp_offset < max_push_offset)
637 {
638- /* Frame with large outgoing arguments or SVE saves, but with
639- a small local area:
640+ /* Frame with large area below the saved registers, or with SVE saves,
641+ but with a small area above:
642
643 stp reg1, reg2, [sp, -hard_fp_offset]!
644 stp reg3, reg4, [sp, 16]
645 [sub sp, sp, below_hard_fp_saved_regs_size]
646 [save SVE registers relative to SP]
647- sub sp, sp, outgoing_args_size */
648+ sub sp, sp, bytes_below_saved_regs */
649 frame.callee_adjust = const_fp_offset;
650 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
651- frame.final_adjust = crtl->outgoing_args_size;
652+ frame.final_adjust = frame.bytes_below_saved_regs;
653 }
654 else
655 {
656- /* Frame with large local area and outgoing arguments or SVE saves,
657- using frame pointer:
658+ /* General case:
659
660 sub sp, sp, hard_fp_offset
661 stp x29, x30, [sp, 0]
662@@ -7680,10 +7681,10 @@ aarch64_layout_frame (void)
663 stp reg3, reg4, [sp, 16]
664 [sub sp, sp, below_hard_fp_saved_regs_size]
665 [save SVE registers relative to SP]
666- sub sp, sp, outgoing_args_size */
667+ sub sp, sp, bytes_below_saved_regs */
668 frame.initial_adjust = frame.hard_fp_offset;
669 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
670- frame.final_adjust = crtl->outgoing_args_size;
671+ frame.final_adjust = frame.bytes_below_saved_regs;
672 }
673
674 /* Make sure the individual adjustments add up to the full frame size. */
675@@ -8274,7 +8275,7 @@ aarch64_get_separate_components (void)
676 if (frame_pointer_needed)
677 offset -= frame.below_hard_fp_saved_regs_size;
678 else
679- offset += crtl->outgoing_args_size;
680+ offset += frame.bytes_below_saved_regs;
681
682 /* Check that we can access the stack slot of the register with one
683 direct load with no adjustments needed. */
684@@ -8423,7 +8424,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
685 if (frame_pointer_needed)
686 offset -= frame.below_hard_fp_saved_regs_size;
687 else
688- offset += crtl->outgoing_args_size;
689+ offset += frame.bytes_below_saved_regs;
690
691 rtx addr = plus_constant (Pmode, ptr_reg, offset);
692 rtx mem = gen_frame_mem (mode, addr);
693@@ -8477,7 +8478,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
694 if (frame_pointer_needed)
695 offset2 -= frame.below_hard_fp_saved_regs_size;
696 else
697- offset2 += crtl->outgoing_args_size;
698+ offset2 += frame.bytes_below_saved_regs;
699 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
700 rtx mem2 = gen_frame_mem (mode, addr2);
701 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
702@@ -8551,10 +8552,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
703 registers. If POLY_SIZE is not large enough to require a probe this function
704 will only adjust the stack. When allocating the stack space
705 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
706- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
707- arguments. If we are then we ensure that any allocation larger than the ABI
708- defined buffer needs a probe so that the invariant of having a 1KB buffer is
709- maintained.
710+ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
711+ the saved registers. If we are then we ensure that any allocation
712+ larger than the ABI defined buffer needs a probe so that the
713+ invariant of having a 1KB buffer is maintained.
714
715 We emit barriers after each stack adjustment to prevent optimizations from
716 breaking the invariant that we never drop the stack more than a page. This
717@@ -8763,7 +8764,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
718 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
719 be probed. This maintains the requirement that each page is probed at
720 least once. For initial probing we probe only if the allocation is
721- more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
722+ more than GUARD_SIZE - buffer, and below the saved registers we probe
723 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
724 GUARD_SIZE. This works that for any allocation that is large enough to
725 trigger a probe here, we'll have at least one, and if they're not large
726diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
727index bb383acfae8..6f0b8c7107e 100644
728--- a/gcc/config/aarch64/aarch64.h
729+++ b/gcc/config/aarch64/aarch64.h
730@@ -837,6 +837,11 @@ struct GTY (()) aarch64_frame
731 /* The size of the callee-save registers with a slot in REG_OFFSET. */
732 poly_int64 saved_regs_size;
733
734+ /* The number of bytes between the bottom of the static frame (the bottom
735+ of the outgoing arguments) and the bottom of the register save area.
736+ This value is always a multiple of STACK_BOUNDARY. */
737+ poly_int64 bytes_below_saved_regs;
738+
739 /* The size of the callee-save registers with a slot in REG_OFFSET that
740 are saved below the hard frame pointer. */
741 poly_int64 below_hard_fp_saved_regs_size;
742--
7432.34.1
744
745
746From d3f6ceecc8a7f128a9e6cb7d8aecf0de81ed9705 Mon Sep 17 00:00:00 2001
747From: Richard Sandiford <richard.sandiford@arm.com>
748Date: Tue, 12 Sep 2023 16:19:45 +0100
749Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info
750
751Following on from the previous bytes_below_saved_regs patch, this one
752records the number of bytes that are below the hard frame pointer.
753This eventually replaces below_hard_fp_saved_regs_size.
754
755If a frame pointer is not needed, the epilogue adds final_adjust
756to the stack pointer before restoring registers:
757
758 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
759
760Therefore, if the epilogue needs to restore the stack pointer from
761the hard frame pointer, the directly corresponding offset is:
762
763 -bytes_below_hard_fp + final_adjust
764
765i.e. go from the hard frame pointer to the bottom of the frame,
766then add the same amount as if we were using the stack pointer
767from the outset.
768
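A worked example with invented sizes, showing that recomputing SP from the hard frame pointer lands in the same place the no-frame-pointer path reaches by simply adding final_adjust:

  #include <cassert>

  int main ()
  {
    /* Invented frame; addresses grow upwards, values in bytes.  */
    long bottom_of_frame     = 0x1000;      /* SP at the end of the prologue */
    long final_adjust        = 64;          /* outgoing-argument area */
    long bytes_below_hard_fp = 64 + 32;     /* outgoing args + SVE saves */
    long hard_fp             = bottom_of_frame + bytes_below_hard_fp;

    /* No frame pointer: the epilogue simply does SP += final_adjust.  */
    long sp_without_fp = bottom_of_frame + final_adjust;

    /* Frame pointer: go from the hard FP to the bottom of the frame,
       then add the same amount as the other path.  */
    long sp_from_fp = hard_fp - bytes_below_hard_fp + final_adjust;

    assert (sp_without_fp == sp_from_fp);
    return 0;
  }
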
769gcc/
770 * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
771 field.
772 * config/aarch64/aarch64.c (aarch64_layout_frame): Initialize it.
773 (aarch64_expand_epilogue): Use it instead of
774 below_hard_fp_saved_regs_size.
775---
776 gcc/config/aarch64/aarch64.c | 6 +++---
777 gcc/config/aarch64/aarch64.h | 5 +++++
778 2 files changed, 8 insertions(+), 3 deletions(-)
779
780diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
781index 8fa5a0b2545..e03adf57226 100644
782--- a/gcc/config/aarch64/aarch64.c
783+++ b/gcc/config/aarch64/aarch64.c
784@@ -7528,6 +7528,7 @@ aarch64_layout_frame (void)
785 of the callee save area. */
786 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
787 frame.below_hard_fp_saved_regs_size = offset;
788+ frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
789 if (frame.emit_frame_chain)
790 {
791 /* FP and LR are placed in the linkage record. */
792@@ -9083,8 +9084,7 @@ aarch64_expand_epilogue (bool for_sibcall)
793 poly_int64 final_adjust = frame.final_adjust;
794 poly_int64 callee_offset = frame.callee_offset;
795 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
796- poly_int64 below_hard_fp_saved_regs_size
797- = frame.below_hard_fp_saved_regs_size;
798+ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
799 unsigned reg1 = frame.wb_candidate1;
800 unsigned reg2 = frame.wb_candidate2;
801 rtx cfi_ops = NULL;
802@@ -9140,7 +9140,7 @@ aarch64_expand_epilogue (bool for_sibcall)
803 is restored on the instruction doing the writeback. */
804 aarch64_add_offset (Pmode, stack_pointer_rtx,
805 hard_frame_pointer_rtx,
806- -callee_offset - below_hard_fp_saved_regs_size,
807+ -bytes_below_hard_fp + final_adjust,
808 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
809 else
810 /* The case where we need to re-use the register here is very rare, so
811diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
812index 6f0b8c7107e..21ac920a3fe 100644
813--- a/gcc/config/aarch64/aarch64.h
814+++ b/gcc/config/aarch64/aarch64.h
815@@ -846,6 +846,11 @@ struct GTY (()) aarch64_frame
816 are saved below the hard frame pointer. */
817 poly_int64 below_hard_fp_saved_regs_size;
818
819+ /* The number of bytes between the bottom of the static frame (the bottom
820+ of the outgoing arguments) and the hard frame pointer. This value is
821+ always a multiple of STACK_BOUNDARY. */
822+ poly_int64 bytes_below_hard_fp;
823+
824 /* Offset from the base of the frame (incomming SP) to the
825 top of the locals area. This value is always a multiple of
826 STACK_BOUNDARY. */
827--
8282.34.1
829
830
831From e8a7ec87fcdbaa5f7c7bd499aebe5cefacbf8687 Mon Sep 17 00:00:00 2001
832From: Richard Sandiford <richard.sandiford@arm.com>
833Date: Tue, 12 Sep 2023 16:19:46 +0100
834Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves
835
836aarch64_save_callee_saves and aarch64_restore_callee_saves took
837a parameter called start_offset that gives the offset of the
838bottom of the saved register area from the current stack pointer.
839However, it's more convenient for later patches if we use the
840bottom of the entire frame as the reference point, rather than
841the bottom of the saved registers.
842
843Doing that removes the need for the callee_offset field.
844Other than that, this is not a win on its own. It only really
845makes sense in combination with the follow-on patches.
846
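A worked example with invented numbers, showing that the new bytes_below_sp convention addresses the same save slot as the old start_offset one:

  #include <cassert>

  int main ()
  {
    long bytes_below_saved_regs = 64;  /* outgoing args under the save area */
    long reg_offset             = 16;  /* slot's offset within the save area */
    long bytes_below_sp         = 64;  /* SP currently sits at the save-area
                                          base, 64 bytes above frame bottom */

    /* Old interface: START_OFFSET is the save-area base relative to SP.  */
    long start_offset = bytes_below_saved_regs - bytes_below_sp;
    long old_offset   = start_offset + reg_offset;

    /* New interface: measure everything from the bottom of the frame.  */
    long new_offset = reg_offset + bytes_below_saved_regs - bytes_below_sp;

    assert (old_offset == new_offset && new_offset == 16);
    return 0;
  }
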
847gcc/
848 * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
849 * config/aarch64/aarch64.c (aarch64_layout_frame): Remove
850 callee_offset handling.
851 (aarch64_save_callee_saves): Replace the start_offset parameter
852 with a bytes_below_sp parameter.
853 (aarch64_restore_callee_saves): Likewise.
854 (aarch64_expand_prologue): Update accordingly.
855 (aarch64_expand_epilogue): Likewise.
856---
857 gcc/config/aarch64/aarch64.c | 56 ++++++++++++++++++------------------
858 gcc/config/aarch64/aarch64.h | 4 ---
859 2 files changed, 28 insertions(+), 32 deletions(-)
860
861diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
862index e03adf57226..96e99f6c17a 100644
863--- a/gcc/config/aarch64/aarch64.c
864+++ b/gcc/config/aarch64/aarch64.c
865@@ -7602,7 +7602,6 @@ aarch64_layout_frame (void)
866 frame.final_adjust = 0;
867 frame.callee_adjust = 0;
868 frame.sve_callee_adjust = 0;
869- frame.callee_offset = 0;
870
871 HOST_WIDE_INT max_push_offset = 0;
872 if (frame.wb_candidate2 != INVALID_REGNUM)
873@@ -7642,7 +7641,6 @@ aarch64_layout_frame (void)
874 stp reg1, reg2, [sp, bytes_below_saved_regs]
875 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
876 frame.initial_adjust = frame.frame_size;
877- frame.callee_offset = const_below_saved_regs;
878 }
879 else if (saves_below_hard_fp_p
880 && known_eq (frame.saved_regs_size,
881@@ -7989,12 +7987,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
882 }
883
884 /* Emit code to save the callee-saved registers from register number START
885- to LIMIT to the stack at the location starting at offset START_OFFSET,
886- skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
887- is true if the hard frame pointer has been set up. */
888+ to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP
889+ bytes above the bottom of the static frame. Skip any write-back
890+ candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard
891+ frame pointer has been set up. */
892
893 static void
894-aarch64_save_callee_saves (poly_int64 start_offset,
895+aarch64_save_callee_saves (poly_int64 bytes_below_sp,
896 unsigned start, unsigned limit, bool skip_wb,
897 bool hard_fp_valid_p)
898 {
899@@ -8022,7 +8021,9 @@ aarch64_save_callee_saves (poly_int64 start_offset,
900
901 machine_mode mode = aarch64_reg_save_mode (regno);
902 reg = gen_rtx_REG (mode, regno);
903- offset = start_offset + frame.reg_offset[regno];
904+ offset = (frame.reg_offset[regno]
905+ + frame.bytes_below_saved_regs
906+ - bytes_below_sp);
907 rtx base_rtx = stack_pointer_rtx;
908 poly_int64 sp_offset = offset;
909
910@@ -8033,9 +8034,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
911 else if (GP_REGNUM_P (regno)
912 && (!offset.is_constant (&const_offset) || const_offset >= 512))
913 {
914- gcc_assert (known_eq (start_offset, 0));
915- poly_int64 fp_offset
916- = frame.below_hard_fp_saved_regs_size;
917+ poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
918 if (hard_fp_valid_p)
919 base_rtx = hard_frame_pointer_rtx;
920 else
921@@ -8099,12 +8098,13 @@ aarch64_save_callee_saves (poly_int64 start_offset,
922 }
923
924 /* Emit code to restore the callee registers from register number START
925- up to and including LIMIT. Restore from the stack offset START_OFFSET,
926- skipping any write-back candidates if SKIP_WB is true. Write the
927- appropriate REG_CFA_RESTORE notes into CFI_OPS. */
928+ up to and including LIMIT. The stack pointer is currently BYTES_BELOW_SP
929+ bytes above the bottom of the static frame. Skip any write-back
930+ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE
931+ notes into CFI_OPS. */
932
933 static void
934-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
935+aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
936 unsigned limit, bool skip_wb, rtx *cfi_ops)
937 {
938 aarch64_frame &frame = cfun->machine->frame;
939@@ -8130,7 +8130,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
940
941 machine_mode mode = aarch64_reg_save_mode (regno);
942 reg = gen_rtx_REG (mode, regno);
943- offset = start_offset + frame.reg_offset[regno];
944+ offset = (frame.reg_offset[regno]
945+ + frame.bytes_below_saved_regs
946+ - bytes_below_sp);
947 rtx base_rtx = stack_pointer_rtx;
948 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
949 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
950@@ -8906,8 +8908,6 @@ aarch64_expand_prologue (void)
951 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
952 poly_int64 final_adjust = frame.final_adjust;
953 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
954- poly_int64 below_hard_fp_saved_regs_size
955- = frame.below_hard_fp_saved_regs_size;
956 unsigned reg1 = frame.wb_candidate1;
957 unsigned reg2 = frame.wb_candidate2;
958 bool emit_frame_chain = frame.emit_frame_chain;
959@@ -8979,8 +8979,8 @@ aarch64_expand_prologue (void)
960 - frame.hard_fp_offset);
961 gcc_assert (known_ge (chain_offset, 0));
962
963- /* The offset of the bottom of the save area from the current SP. */
964- poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
965+ /* The offset of the current SP from the bottom of the static frame. */
966+ poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
967
968 if (emit_frame_chain)
969 {
970@@ -8988,7 +8988,7 @@ aarch64_expand_prologue (void)
971 {
972 reg1 = R29_REGNUM;
973 reg2 = R30_REGNUM;
974- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
975+ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
976 false, false);
977 }
978 else
979@@ -9028,7 +9028,7 @@ aarch64_expand_prologue (void)
980 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
981 }
982
983- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
984+ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
985 callee_adjust != 0 || emit_frame_chain,
986 emit_frame_chain);
987 if (maybe_ne (sve_callee_adjust, 0))
988@@ -9038,16 +9038,17 @@ aarch64_expand_prologue (void)
989 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
990 sve_callee_adjust,
991 !frame_pointer_needed, false);
992- saved_regs_offset += sve_callee_adjust;
993+ bytes_below_sp -= sve_callee_adjust;
994 }
995- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
996+ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
997 false, emit_frame_chain);
998- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
999+ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
1000 callee_adjust != 0 || emit_frame_chain,
1001 emit_frame_chain);
1002
1003 /* We may need to probe the final adjustment if it is larger than the guard
1004 that is assumed by the called. */
1005+ gcc_assert (known_eq (bytes_below_sp, final_adjust));
1006 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
1007 !frame_pointer_needed, true);
1008 }
1009@@ -9082,7 +9083,6 @@ aarch64_expand_epilogue (bool for_sibcall)
1010 poly_int64 initial_adjust = frame.initial_adjust;
1011 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
1012 poly_int64 final_adjust = frame.final_adjust;
1013- poly_int64 callee_offset = frame.callee_offset;
1014 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
1015 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
1016 unsigned reg1 = frame.wb_candidate1;
1017@@ -9150,13 +9150,13 @@ aarch64_expand_epilogue (bool for_sibcall)
1018
1019 /* Restore the vector registers before the predicate registers,
1020 so that we can use P4 as a temporary for big-endian SVE frames. */
1021- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
1022+ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
1023 callee_adjust != 0, &cfi_ops);
1024- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
1025+ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
1026 false, &cfi_ops);
1027 if (maybe_ne (sve_callee_adjust, 0))
1028 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
1029- aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
1030+ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
1031 R0_REGNUM, R30_REGNUM,
1032 callee_adjust != 0, &cfi_ops);
1033
1034diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1035index 21ac920a3fe..57e67217745 100644
1036--- a/gcc/config/aarch64/aarch64.h
1037+++ b/gcc/config/aarch64/aarch64.h
1038@@ -873,10 +873,6 @@ struct GTY (()) aarch64_frame
1039 It is zero when no push is used. */
1040 HOST_WIDE_INT callee_adjust;
1041
1042- /* The offset from SP to the callee-save registers after initial_adjust.
1043- It may be non-zero if no push is used (ie. callee_adjust == 0). */
1044- poly_int64 callee_offset;
1045-
1046 /* The size of the stack adjustment before saving or after restoring
1047 SVE registers. */
1048 poly_int64 sve_callee_adjust;
1049--
10502.34.1
1051
1052
1053From 7356df0319aefe4c68ef57ec4c6bd18c72188a34 Mon Sep 17 00:00:00 2001
1054From: Richard Sandiford <richard.sandiford@arm.com>
1055Date: Tue, 12 Sep 2023 16:19:46 +0100
1056Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
1057 chain
1058
1059After previous patches, it is no longer necessary to calculate
1060a chain_offset in cases where there is no chain record.
1061
1062gcc/
1063 * config/aarch64/aarch64.c (aarch64_expand_prologue): Move the
1064 calculation of chain_offset into the emit_frame_chain block.
1065---
1066 gcc/config/aarch64/aarch64.c | 10 +++++-----
1067 1 file changed, 5 insertions(+), 5 deletions(-)
1068
1069diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1070index 96e99f6c17a..cf5244b7ec0 100644
1071--- a/gcc/config/aarch64/aarch64.c
1072+++ b/gcc/config/aarch64/aarch64.c
1073@@ -8974,16 +8974,16 @@ aarch64_expand_prologue (void)
1074 if (callee_adjust != 0)
1075 aarch64_push_regs (reg1, reg2, callee_adjust);
1076
1077- /* The offset of the frame chain record (if any) from the current SP. */
1078- poly_int64 chain_offset = (initial_adjust + callee_adjust
1079- - frame.hard_fp_offset);
1080- gcc_assert (known_ge (chain_offset, 0));
1081-
1082 /* The offset of the current SP from the bottom of the static frame. */
1083 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
1084
1085 if (emit_frame_chain)
1086 {
1087+ /* The offset of the frame chain record (if any) from the current SP. */
1088+ poly_int64 chain_offset = (initial_adjust + callee_adjust
1089+ - frame.hard_fp_offset);
1090+ gcc_assert (known_ge (chain_offset, 0));
1091+
1092 if (callee_adjust == 0)
1093 {
1094 reg1 = R29_REGNUM;
1095--
10962.34.1
1097
1098
1099From 82fb69e75c21010f7afc72bb842751164fe8fc72 Mon Sep 17 00:00:00 2001
1100From: Richard Sandiford <richard.sandiford@arm.com>
1101Date: Tue, 12 Sep 2023 16:19:46 +0100
1102Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
1103MIME-Version: 1.0
1104Content-Type: text/plain; charset=UTF-8
1105Content-Transfer-Encoding: 8bit
1106
1107locals_offset was described as:
1108
1109 /* Offset from the base of the frame (incomming SP) to the
1110 top of the locals area. This value is always a multiple of
1111 STACK_BOUNDARY. */
1112
1113This is implicitly an “upside down” view of the frame: the incoming
1114SP is at offset 0, and anything N bytes below the incoming SP is at
1115offset N (rather than -N).
1116
1117However, reg_offset instead uses a “right way up” view; that is,
1118it views offsets in address terms. Something above X is at a
1119positive offset from X and something below X is at a negative
1120offset from X.
1121
1122Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
1123target-independent code views offsets in address terms too:
1124locals are allocated at negative offsets to virtual_stack_vars.
1125
1126It seems confusing to have *_offset fields of the same structure
1127using different polarities like this. This patch tries to avoid
1128that by renaming locals_offset to bytes_above_locals.
1129
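A small worked example of the two polarities, with invented values:

  #include <cassert>

  int main ()
  {
    long incoming_sp        = 0x2000;  /* top of the frame (invented) */
    long saved_varargs_size = 32;

    /* "Upside down" view (the old locals_offset): a positive distance
       below the incoming SP.  */
    long bytes_above_locals = saved_varargs_size;
    long top_of_locals      = incoming_sp - bytes_above_locals;

    /* Address ("right way up") view, as used by reg_offset and by
       target-independent code: the same boundary sits at a negative
       offset from the incoming SP.  */
    long address_offset = top_of_locals - incoming_sp;

    assert (address_offset == -bytes_above_locals);
    return 0;
  }
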
1130gcc/
1131 * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
1132 (aarch64_frame::bytes_above_locals): ...this.
1133 * config/aarch64/aarch64.c (aarch64_layout_frame)
1134 (aarch64_initial_elimination_offset): Update accordingly.
1135---
1136 gcc/config/aarch64/aarch64.c | 6 +++---
1137 gcc/config/aarch64/aarch64.h | 6 +++---
1138 2 files changed, 6 insertions(+), 6 deletions(-)
1139
1140diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1141index cf5244b7ec0..d54f7a89306 100644
1142--- a/gcc/config/aarch64/aarch64.c
1143+++ b/gcc/config/aarch64/aarch64.c
1144@@ -7596,7 +7596,7 @@ aarch64_layout_frame (void)
1145 STACK_BOUNDARY / BITS_PER_UNIT));
1146 frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
1147
1148- frame.locals_offset = frame.saved_varargs_size;
1149+ frame.bytes_above_locals = frame.saved_varargs_size;
1150
1151 frame.initial_adjust = 0;
1152 frame.final_adjust = 0;
1153@@ -11758,13 +11758,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
1154 return frame.hard_fp_offset;
1155
1156 if (from == FRAME_POINTER_REGNUM)
1157- return frame.hard_fp_offset - frame.locals_offset;
1158+ return frame.hard_fp_offset - frame.bytes_above_locals;
1159 }
1160
1161 if (to == STACK_POINTER_REGNUM)
1162 {
1163 if (from == FRAME_POINTER_REGNUM)
1164- return frame.frame_size - frame.locals_offset;
1165+ return frame.frame_size - frame.bytes_above_locals;
1166 }
1167
1168 return frame.frame_size;
1169diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1170index 57e67217745..3c5e3dd429d 100644
1171--- a/gcc/config/aarch64/aarch64.h
1172+++ b/gcc/config/aarch64/aarch64.h
1173@@ -851,10 +851,10 @@ struct GTY (()) aarch64_frame
1174 always a multiple of STACK_BOUNDARY. */
1175 poly_int64 bytes_below_hard_fp;
1176
1177- /* Offset from the base of the frame (incomming SP) to the
1178- top of the locals area. This value is always a multiple of
1179+ /* The number of bytes between the top of the locals area and the top
1180+ of the frame (the incomming SP). This value is always a multiple of
1181 STACK_BOUNDARY. */
1182- poly_int64 locals_offset;
1183+ poly_int64 bytes_above_locals;
1184
1185 /* Offset from the base of the frame (incomming SP) to the
1186 hard_frame_pointer. This value is always a multiple of
1187--
11882.34.1
1189
1190
1191From fa6600b55b49ee14d8288f13719ceea2a75eea60 Mon Sep 17 00:00:00 2001
1192From: Richard Sandiford <richard.sandiford@arm.com>
1193Date: Tue, 12 Sep 2023 16:19:47 +0100
1194Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
1195MIME-Version: 1.0
1196Content-Type: text/plain; charset=UTF-8
1197Content-Transfer-Encoding: 8bit
1198
1199Similarly to the previous locals_offset patch, hard_fp_offset
1200was described as:
1201
1202 /* Offset from the base of the frame (incomming SP) to the
1203 hard_frame_pointer. This value is always a multiple of
1204 STACK_BOUNDARY. */
1205 poly_int64 hard_fp_offset;
1206
1207which again took an “upside-down” view: higher offsets meant lower
1208addresses. This patch renames the field to bytes_above_hard_fp instead.
1209
1210gcc/
1211 * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
1212 to...
1213 (aarch64_frame::bytes_above_hard_fp): ...this.
1214 * config/aarch64/aarch64.c (aarch64_layout_frame)
1215 (aarch64_expand_prologue): Update accordingly.
1216 (aarch64_initial_elimination_offset): Likewise.
1217---
1218 gcc/config/aarch64/aarch64.c | 26 +++++++++++++-------------
1219 gcc/config/aarch64/aarch64.h | 6 +++---
1220 2 files changed, 16 insertions(+), 16 deletions(-)
1221
1222diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1223index d54f7a89306..23cb084e5a7 100644
1224--- a/gcc/config/aarch64/aarch64.c
1225+++ b/gcc/config/aarch64/aarch64.c
1226@@ -7588,7 +7588,7 @@ aarch64_layout_frame (void)
1227 + get_frame_size (),
1228 STACK_BOUNDARY / BITS_PER_UNIT);
1229
1230- frame.hard_fp_offset
1231+ frame.bytes_above_hard_fp
1232 = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
1233
1234 /* Both these values are already aligned. */
1235@@ -7609,13 +7609,13 @@ aarch64_layout_frame (void)
1236 else if (frame.wb_candidate1 != INVALID_REGNUM)
1237 max_push_offset = 256;
1238
1239- HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
1240+ HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
1241 HOST_WIDE_INT const_saved_regs_size;
1242 if (known_eq (frame.saved_regs_size, 0))
1243 frame.initial_adjust = frame.frame_size;
1244 else if (frame.frame_size.is_constant (&const_size)
1245 && const_size < max_push_offset
1246- && known_eq (frame.hard_fp_offset, const_size))
1247+ && known_eq (frame.bytes_above_hard_fp, const_size))
1248 {
1249 /* Simple, small frame with no data below the saved registers.
1250
1251@@ -7632,8 +7632,8 @@ aarch64_layout_frame (void)
1252 case that it hardly seems worth the effort though. */
1253 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
1254 && !(cfun->calls_alloca
1255- && frame.hard_fp_offset.is_constant (&const_fp_offset)
1256- && const_fp_offset < max_push_offset))
1257+ && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1258+ && const_above_fp < max_push_offset))
1259 {
1260 /* Frame with small area below the saved registers:
1261
1262@@ -7651,12 +7651,12 @@ aarch64_layout_frame (void)
1263 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
1264 save SVE registers relative to SP
1265 sub sp, sp, bytes_below_saved_regs */
1266- frame.initial_adjust = (frame.hard_fp_offset
1267+ frame.initial_adjust = (frame.bytes_above_hard_fp
1268 + frame.below_hard_fp_saved_regs_size);
1269 frame.final_adjust = frame.bytes_below_saved_regs;
1270 }
1271- else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
1272- && const_fp_offset < max_push_offset)
1273+ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1274+ && const_above_fp < max_push_offset)
1275 {
1276 /* Frame with large area below the saved registers, or with SVE saves,
1277 but with a small area above:
1278@@ -7666,7 +7666,7 @@ aarch64_layout_frame (void)
1279 [sub sp, sp, below_hard_fp_saved_regs_size]
1280 [save SVE registers relative to SP]
1281 sub sp, sp, bytes_below_saved_regs */
1282- frame.callee_adjust = const_fp_offset;
1283+ frame.callee_adjust = const_above_fp;
1284 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
1285 frame.final_adjust = frame.bytes_below_saved_regs;
1286 }
1287@@ -7681,7 +7681,7 @@ aarch64_layout_frame (void)
1288 [sub sp, sp, below_hard_fp_saved_regs_size]
1289 [save SVE registers relative to SP]
1290 sub sp, sp, bytes_below_saved_regs */
1291- frame.initial_adjust = frame.hard_fp_offset;
1292+ frame.initial_adjust = frame.bytes_above_hard_fp;
1293 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
1294 frame.final_adjust = frame.bytes_below_saved_regs;
1295 }
1296@@ -8981,7 +8981,7 @@ aarch64_expand_prologue (void)
1297 {
1298 /* The offset of the frame chain record (if any) from the current SP. */
1299 poly_int64 chain_offset = (initial_adjust + callee_adjust
1300- - frame.hard_fp_offset);
1301+ - frame.bytes_above_hard_fp);
1302 gcc_assert (known_ge (chain_offset, 0));
1303
1304 if (callee_adjust == 0)
1305@@ -11755,10 +11755,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
1306 if (to == HARD_FRAME_POINTER_REGNUM)
1307 {
1308 if (from == ARG_POINTER_REGNUM)
1309- return frame.hard_fp_offset;
1310+ return frame.bytes_above_hard_fp;
1311
1312 if (from == FRAME_POINTER_REGNUM)
1313- return frame.hard_fp_offset - frame.bytes_above_locals;
1314+ return frame.bytes_above_hard_fp - frame.bytes_above_locals;
1315 }
1316
1317 if (to == STACK_POINTER_REGNUM)
1318diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1319index 3c5e3dd429d..9291cfd3ec8 100644
1320--- a/gcc/config/aarch64/aarch64.h
1321+++ b/gcc/config/aarch64/aarch64.h
1322@@ -856,10 +856,10 @@ struct GTY (()) aarch64_frame
1323 STACK_BOUNDARY. */
1324 poly_int64 bytes_above_locals;
1325
1326- /* Offset from the base of the frame (incomming SP) to the
1327- hard_frame_pointer. This value is always a multiple of
1328+ /* The number of bytes between the hard_frame_pointer and the top of
1329+ the frame (the incomming SP). This value is always a multiple of
1330 STACK_BOUNDARY. */
1331- poly_int64 hard_fp_offset;
1332+ poly_int64 bytes_above_hard_fp;
1333
1334 /* The size of the frame. This value is the offset from base of the
1335 frame (incomming SP) to the stack_pointer. This value is always
1336--
13372.34.1
1338
1339
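To make the new top-down measurement concrete, here is a minimal standalone C sketch (not GCC code and not part of the series; the byte counts are invented and the variable names only mirror the aarch64_frame fields). It checks that bytes_above_hard_fp is simply the distance from the hard frame pointer up to the incoming SP, i.e. frame_size minus the bytes below the hard frame pointer.

#include <assert.h>
#include <stdio.h>

int main (void)
{
  long bytes_below_saved_regs = 32;   /* outgoing arguments */
  long below_hard_fp_saves    = 0;    /* no SVE saves in this example */
  long gpr_and_fpr_saves      = 32;   /* FP, LR and two callee saves */
  long locals_and_padding     = 64;
  long saved_varargs_size     = 0;

  long bytes_below_hard_fp = bytes_below_saved_regs + below_hard_fp_saves;
  long frame_size = bytes_below_hard_fp + gpr_and_fpr_saves
                    + locals_and_padding + saved_varargs_size;

  /* The renamed field measures down from the incoming SP (the top of
     the frame) to the hard frame pointer.  */
  long bytes_above_hard_fp = frame_size - bytes_below_hard_fp;
  assert (bytes_above_hard_fp
          == gpr_and_fpr_saves + locals_and_padding + saved_varargs_size);
  printf ("bytes_above_hard_fp = %ld\n", bytes_above_hard_fp);
  return 0;
}
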
1340From b8cd5a0229da78c2d1289d54731fbef0126617d5 Mon Sep 17 00:00:00 2001
1341From: Richard Sandiford <richard.sandiford@arm.com>
1342Date: Tue, 12 Sep 2023 16:19:47 +0100
1343Subject: [PATCH 10/19] aarch64: Tweak frame_size comment
1344MIME-Version: 1.0
1345Content-Type: text/plain; charset=UTF-8
1346Content-Transfer-Encoding: 8bit
1347
1348This patch fixes another case in which a value was described with
1349an “upside-down” view.
1350
1351gcc/
1352 * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment.
1353---
1354 gcc/config/aarch64/aarch64.h | 4 ++--
1355 1 file changed, 2 insertions(+), 2 deletions(-)
1356
1357diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1358index 9291cfd3ec8..82883ad5a0d 100644
1359--- a/gcc/config/aarch64/aarch64.h
1360+++ b/gcc/config/aarch64/aarch64.h
1361@@ -861,8 +861,8 @@ struct GTY (()) aarch64_frame
1362 STACK_BOUNDARY. */
1363 poly_int64 bytes_above_hard_fp;
1364
1365- /* The size of the frame. This value is the offset from base of the
1366- frame (incomming SP) to the stack_pointer. This value is always
1367+ /* The size of the frame, i.e. the number of bytes between the bottom
1368+ of the outgoing arguments and the incoming SP. This value is always
1369 a multiple of STACK_BOUNDARY. */
1370 poly_int64 frame_size;
1371
1372--
13732.34.1
1374
1375
1376From 999c4a81cffddb850d6ab0f6d3a8de3e704d2f7a Mon Sep 17 00:00:00 2001
1377From: Richard Sandiford <richard.sandiford@arm.com>
1378Date: Tue, 12 Sep 2023 16:19:48 +0100
1379Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the
1380 frame
1381
1382reg_offset was measured from the bottom of the saved register area.
1383This made perfect sense with the original layout, since the bottom
1384of the saved register area was also the hard frame pointer address.
1385It became slightly less obvious with SVE, since we save SVE
1386registers below the hard frame pointer, but it still made sense.
1387
1388However, if we want to allow different frame layouts, it's more
1389convenient and obvious to measure reg_offset from the bottom of
1390the frame. After previous patches, it's also a slight simplification
1391in its own right.
1392
1393gcc/
1394 * config/aarch64/aarch64.h (aarch64_frame): Add comment above
1395 reg_offset.
1396 * config/aarch64/aarch64.c (aarch64_layout_frame): Walk offsets
1397 from the bottom of the frame, rather than the bottom of the saved
1398 register area. Measure reg_offset from the bottom of the frame
1399 rather than the bottom of the saved register area.
1400 (aarch64_save_callee_saves): Update accordingly.
1401 (aarch64_restore_callee_saves): Likewise.
1402 (aarch64_get_separate_components): Likewise.
1403 (aarch64_process_components): Likewise.
1404---
1405 gcc/config/aarch64/aarch64.c | 53 ++++++++++++++++--------------------
1406 gcc/config/aarch64/aarch64.h | 3 ++
1407 2 files changed, 27 insertions(+), 29 deletions(-)
1408
1409diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1410index 23cb084e5a7..45ff664cba6 100644
1411--- a/gcc/config/aarch64/aarch64.c
1412+++ b/gcc/config/aarch64/aarch64.c
1413@@ -7398,7 +7398,6 @@ aarch64_needs_frame_chain (void)
1414 static void
1415 aarch64_layout_frame (void)
1416 {
1417- poly_int64 offset = 0;
1418 int regno, last_fp_reg = INVALID_REGNUM;
1419 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
1420 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
1421@@ -7476,7 +7475,9 @@ aarch64_layout_frame (void)
1422 gcc_assert (crtl->is_leaf
1423 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
1424
1425- frame.bytes_below_saved_regs = crtl->outgoing_args_size;
1426+ poly_int64 offset = crtl->outgoing_args_size;
1427+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
1428+ frame.bytes_below_saved_regs = offset;
1429
1430 /* Now assign stack slots for the registers. Start with the predicate
1431 registers, since predicate LDR and STR have a relatively small
1432@@ -7488,7 +7489,8 @@ aarch64_layout_frame (void)
1433 offset += BYTES_PER_SVE_PRED;
1434 }
1435
1436- if (maybe_ne (offset, 0))
1437+ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
1438+ if (maybe_ne (saved_prs_size, 0))
1439 {
1440 /* If we have any vector registers to save above the predicate registers,
1441 the offset of the vector register save slots need to be a multiple
1442@@ -7506,10 +7508,10 @@ aarch64_layout_frame (void)
1443 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1444 else
1445 {
1446- if (known_le (offset, vector_save_size))
1447- offset = vector_save_size;
1448- else if (known_le (offset, vector_save_size * 2))
1449- offset = vector_save_size * 2;
1450+ if (known_le (saved_prs_size, vector_save_size))
1451+ offset = frame.bytes_below_saved_regs + vector_save_size;
1452+ else if (known_le (saved_prs_size, vector_save_size * 2))
1453+ offset = frame.bytes_below_saved_regs + vector_save_size * 2;
1454 else
1455 gcc_unreachable ();
1456 }
1457@@ -7526,9 +7528,10 @@ aarch64_layout_frame (void)
1458
1459 /* OFFSET is now the offset of the hard frame pointer from the bottom
1460 of the callee save area. */
1461- bool saves_below_hard_fp_p = maybe_ne (offset, 0);
1462- frame.below_hard_fp_saved_regs_size = offset;
1463- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
1464+ frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
1465+ bool saves_below_hard_fp_p
1466+ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
1467+ frame.bytes_below_hard_fp = offset;
1468 if (frame.emit_frame_chain)
1469 {
1470 /* FP and LR are placed in the linkage record. */
1471@@ -7579,9 +7582,10 @@ aarch64_layout_frame (void)
1472
1473 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1474
1475- frame.saved_regs_size = offset;
1476+ frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
1477
1478- poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
1479+ poly_int64 varargs_and_saved_regs_size
1480+ = frame.saved_regs_size + frame.saved_varargs_size;
1481
1482 poly_int64 saved_regs_and_above
1483 = aligned_upper_bound (varargs_and_saved_regs_size
1484@@ -8021,9 +8025,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
1485
1486 machine_mode mode = aarch64_reg_save_mode (regno);
1487 reg = gen_rtx_REG (mode, regno);
1488- offset = (frame.reg_offset[regno]
1489- + frame.bytes_below_saved_regs
1490- - bytes_below_sp);
1491+ offset = frame.reg_offset[regno] - bytes_below_sp;
1492 rtx base_rtx = stack_pointer_rtx;
1493 poly_int64 sp_offset = offset;
1494
1495@@ -8130,9 +8132,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
1496
1497 machine_mode mode = aarch64_reg_save_mode (regno);
1498 reg = gen_rtx_REG (mode, regno);
1499- offset = (frame.reg_offset[regno]
1500- + frame.bytes_below_saved_regs
1501- - bytes_below_sp);
1502+ offset = frame.reg_offset[regno] - bytes_below_sp;
1503 rtx base_rtx = stack_pointer_rtx;
1504 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
1505 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
1506@@ -8271,14 +8271,12 @@ aarch64_get_separate_components (void)
1507 it as a stack probe for -fstack-clash-protection. */
1508 if (flag_stack_clash_protection
1509 && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
1510- && known_eq (offset, 0))
1511+ && known_eq (offset, frame.bytes_below_saved_regs))
1512 continue;
1513
1514 /* Get the offset relative to the register we'll use. */
1515 if (frame_pointer_needed)
1516- offset -= frame.below_hard_fp_saved_regs_size;
1517- else
1518- offset += frame.bytes_below_saved_regs;
1519+ offset -= frame.bytes_below_hard_fp;
1520
1521 /* Check that we can access the stack slot of the register with one
1522 direct load with no adjustments needed. */
1523@@ -8425,9 +8423,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
1524 rtx reg = gen_rtx_REG (mode, regno);
1525 poly_int64 offset = frame.reg_offset[regno];
1526 if (frame_pointer_needed)
1527- offset -= frame.below_hard_fp_saved_regs_size;
1528- else
1529- offset += frame.bytes_below_saved_regs;
1530+ offset -= frame.bytes_below_hard_fp;
1531
1532 rtx addr = plus_constant (Pmode, ptr_reg, offset);
1533 rtx mem = gen_frame_mem (mode, addr);
1534@@ -8479,9 +8475,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
1535 /* REGNO2 can be saved/restored in a pair with REGNO. */
1536 rtx reg2 = gen_rtx_REG (mode, regno2);
1537 if (frame_pointer_needed)
1538- offset2 -= frame.below_hard_fp_saved_regs_size;
1539- else
1540- offset2 += frame.bytes_below_saved_regs;
1541+ offset2 -= frame.bytes_below_hard_fp;
1542 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
1543 rtx mem2 = gen_frame_mem (mode, addr2);
1544 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
1545@@ -8597,7 +8591,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1546 if (final_adjustment_p
1547 && known_eq (frame.below_hard_fp_saved_regs_size, 0))
1548 {
1549- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
1550+ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
1551+ - frame.bytes_below_saved_regs);
1552 if (known_ge (lr_offset, 0))
1553 min_probe_threshold -= lr_offset.to_constant ();
1554 else
1555diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1556index 82883ad5a0d..c8ec3d58495 100644
1557--- a/gcc/config/aarch64/aarch64.h
1558+++ b/gcc/config/aarch64/aarch64.h
1559@@ -826,6 +826,9 @@ extern enum aarch64_processor aarch64_tune;
1560 #ifdef HAVE_POLY_INT_H
1561 struct GTY (()) aarch64_frame
1562 {
1563+ /* The offset from the bottom of the static frame (the bottom of the
1564+ outgoing arguments) of each register save slot, or -2 if no save is
1565+ needed. */
1566 poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
1567
1568 /* The number of extra stack bytes taken up by register varargs.
1569--
15702.34.1
1571
1572
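Measuring reg_offset from the bottom of the frame makes the address computations in aarch64_save_callee_saves and friends a single subtraction. A standalone sketch (not GCC code; all offsets invented) showing that the old and new conventions give the same addresses:

#include <assert.h>

int main (void)
{
  long bytes_below_saved_regs = 48;   /* outgoing arguments */
  long bytes_below_hard_fp    = 48;   /* no saves below the hard FP here */

  /* Old convention: slot offset within the saved-register area.  */
  long old_reg_offset = 16;
  /* New convention: the same slot, measured from the bottom of the frame.  */
  long new_reg_offset = old_reg_offset + bytes_below_saved_regs;

  /* bytes_below_sp records how much of the frame is currently below the
     SP; here the final adjustment for outgoing arguments is still pending.  */
  long bytes_below_sp = bytes_below_saved_regs;

  long sp_relative_old = old_reg_offset + bytes_below_saved_regs - bytes_below_sp;
  long sp_relative_new = new_reg_offset - bytes_below_sp;
  assert (sp_relative_old == sp_relative_new);

  /* Hard-FP-relative addressing is likewise a single subtraction.  */
  long fp_relative = new_reg_offset - bytes_below_hard_fp;
  assert (fp_relative == old_reg_offset);   /* equal because nothing is saved
                                               below the hard FP here */
  return 0;
}
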
1573From 8b664cc8f05c8130e8ca73a59ae2751cdef8a0ea Mon Sep 17 00:00:00 2001
1574From: Richard Sandiford <richard.sandiford@arm.com>
1575Date: Tue, 12 Sep 2023 16:19:48 +0100
1576Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation
1577
1578After previous patches, it no longer really makes sense to allocate
1579the top of the frame in terms of varargs_and_saved_regs_size and
1580saved_regs_and_above.
1581
1582gcc/
1583 * config/aarch64/aarch64.c (aarch64_layout_frame): Simplify
1584 the allocation of the top of the frame.
1585---
1586 gcc/config/aarch64/aarch64.c | 23 ++++++++---------------
1587 1 file changed, 8 insertions(+), 15 deletions(-)
1588
1589diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1590index 45ff664cba6..779547d0344 100644
1591--- a/gcc/config/aarch64/aarch64.c
1592+++ b/gcc/config/aarch64/aarch64.c
1593@@ -7584,23 +7584,16 @@ aarch64_layout_frame (void)
1594
1595 frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
1596
1597- poly_int64 varargs_and_saved_regs_size
1598- = frame.saved_regs_size + frame.saved_varargs_size;
1599-
1600- poly_int64 saved_regs_and_above
1601- = aligned_upper_bound (varargs_and_saved_regs_size
1602- + get_frame_size (),
1603- STACK_BOUNDARY / BITS_PER_UNIT);
1604-
1605- frame.bytes_above_hard_fp
1606- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
1607+ offset += get_frame_size ();
1608+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1609+ auto top_of_locals = offset;
1610
1611- /* Both these values are already aligned. */
1612- gcc_assert (multiple_p (frame.bytes_below_saved_regs,
1613- STACK_BOUNDARY / BITS_PER_UNIT));
1614- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
1615+ offset += frame.saved_varargs_size;
1616+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
1617+ frame.frame_size = offset;
1618
1619- frame.bytes_above_locals = frame.saved_varargs_size;
1620+ frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
1621+ frame.bytes_above_locals = frame.frame_size - top_of_locals;
1622
1623 frame.initial_adjust = 0;
1624 frame.final_adjust = 0;
1625--
16262.34.1
1627
1628
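The simplified allocation of the top of the frame amounts to one running offset that is rounded up at the alignment points. A standalone sketch (not GCC code; align_up stands in for aligned_upper_bound on constant sizes and every byte count is invented):

#include <assert.h>
#include <stdio.h>

static long align_up (long x, long align)
{
  return (x + align - 1) & -align;
}

int main (void)
{
  const long STACK_ALIGN = 16;        /* STACK_BOUNDARY / BITS_PER_UNIT */

  long offset = 32;                   /* bytes_below_saved_regs */
  long bytes_below_hard_fp = offset;  /* no saves below the hard FP */
  offset += 16;                       /* FP + LR */
  offset += 8;                        /* one more callee save */
  offset = align_up (offset, STACK_ALIGN);

  offset += 100;                      /* get_frame_size (): the locals */
  offset = align_up (offset, STACK_ALIGN);
  long top_of_locals = offset;

  offset += 0;                        /* saved_varargs_size */
  long frame_size = offset;

  long bytes_above_hard_fp = frame_size - bytes_below_hard_fp;
  long bytes_above_locals  = frame_size - top_of_locals;

  assert (frame_size % STACK_ALIGN == 0);
  printf ("frame_size=%ld above_hard_fp=%ld above_locals=%ld\n",
          frame_size, bytes_above_hard_fp, bytes_above_locals);
  return 0;
}
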
1629From bb4600071acc3a02db4f37ffb95c8495ad76a140 Mon Sep 17 00:00:00 2001
1630From: Richard Sandiford <richard.sandiford@arm.com>
1631Date: Tue, 12 Sep 2023 16:19:49 +0100
1632Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak
1633
1634This patch just changes a calculation of initial_adjust
1635to one that makes it slightly more obvious that the total
1636adjustment is frame.frame_size.
1637
1638gcc/
1639 * config/aarch64/aarch64.c (aarch64_layout_frame): Tweak
1640 calculation of initial_adjust for frames in which all saves
1641 are SVE saves.
1642---
1643 gcc/config/aarch64/aarch64.c | 5 ++---
1644 1 file changed, 2 insertions(+), 3 deletions(-)
1645
1646diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1647index 779547d0344..0b8992ada74 100644
1648--- a/gcc/config/aarch64/aarch64.c
1649+++ b/gcc/config/aarch64/aarch64.c
1650@@ -7645,11 +7645,10 @@ aarch64_layout_frame (void)
1651 {
1652 /* Frame in which all saves are SVE saves:
1653
1654- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
1655+ sub sp, sp, frame_size - bytes_below_saved_regs
1656 save SVE registers relative to SP
1657 sub sp, sp, bytes_below_saved_regs */
1658- frame.initial_adjust = (frame.bytes_above_hard_fp
1659- + frame.below_hard_fp_saved_regs_size);
1660+ frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
1661 frame.final_adjust = frame.bytes_below_saved_regs;
1662 }
1663 else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1664--
16652.34.1
1666
1667
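The equivalence behind the tweak, namely that both expressions describe the whole allocation apart from the bytes below the saved registers, follows from the offset definitions in the earlier patches. A short standalone check (not GCC code; sizes invented):

#include <assert.h>

int main (void)
{
  long bytes_below_saved_regs   = 64;   /* outgoing arguments */
  long below_hard_fp_saved_regs = 32;   /* SVE predicate/vector saves */
  long bytes_above_hard_fp      = 48;   /* locals, padding, varargs */

  long bytes_below_hard_fp = bytes_below_saved_regs + below_hard_fp_saved_regs;
  long frame_size          = bytes_above_hard_fp + bytes_below_hard_fp;

  /* Old formula on the left, new formula on the right.  */
  assert (bytes_above_hard_fp + below_hard_fp_saved_regs
          == frame_size - bytes_below_saved_regs);
  return 0;
}
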
1668From f22329d5efbacf80edf4a2d45ebadd93f283252c Mon Sep 17 00:00:00 2001
1669From: Richard Sandiford <richard.sandiford@arm.com>
1670Date: Tue, 12 Sep 2023 16:19:49 +0100
1671Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition
1672
1673The AArch64 ABI says that, when stack clash protection is used,
1674there can be a maximum of 1KiB of unprobed space at sp on entry
1675to a function. Therefore, we need to probe when allocating
1676>= guard_size - 1KiB of data (>= rather than >). This is what
1677GCC does.
1678
1679If an allocation is exactly guard_size bytes, it is enough to allocate
1680those bytes and probe once at offset 1024. It isn't possible to use a
1681single probe at any other offset: higher would complicate later code,
1682by leaving more unprobed space than usual, while lower would risk
1683leaving an entire page unprobed. For simplicity, the code probes all
1684allocations at offset 1024.
1685
1686Some register saves also act as probes. If we need to allocate
1687more space below the last such register save probe, we need to
1688probe the allocation if it is > 1KiB. Again, this allocation is
1689then sometimes (but not always) probed at offset 1024. This sort of
1690allocation is currently only used for outgoing arguments, which are
1691rarely this big.
1692
1693However, the code also probed if this final outgoing-arguments
1694allocation was == 1KiB, rather than just > 1KiB. This isn't
1695necessary, since the register save then probes at offset 1024
1696as required. Continuing to probe allocations of exactly 1KiB
1697would complicate later patches.
1698
1699gcc/
1700 * config/aarch64/aarch64.c (aarch64_allocate_and_probe_stack_space):
1701 Don't probe final allocations that are exactly 1KiB in size (after
1702 unprobed space above the final allocation has been deducted).
1703
1704gcc/testsuite/
1705 * gcc.target/aarch64/stack-check-prologue-17.c: New test.
1706---
1707 gcc/config/aarch64/aarch64.c | 4 +-
1708 .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++
1709 2 files changed, 58 insertions(+), 1 deletion(-)
1710 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1711
1712diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1713index 0b8992ada74..bfd24876195 100644
1714--- a/gcc/config/aarch64/aarch64.c
1715+++ b/gcc/config/aarch64/aarch64.c
1716@@ -8564,9 +8564,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1717 HOST_WIDE_INT guard_size
1718 = 1 << param_stack_clash_protection_guard_size;
1719 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
1720+ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
1721+ gcc_assert (multiple_p (poly_size, byte_sp_alignment));
1722 HOST_WIDE_INT min_probe_threshold
1723 = (final_adjustment_p
1724- ? guard_used_by_caller
1725+ ? guard_used_by_caller + byte_sp_alignment
1726 : guard_size - guard_used_by_caller);
1727 /* When doing the final adjustment for the outgoing arguments, take into
1728 account any unprobed space there is above the current SP. There are
1729diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1730new file mode 100644
1731index 00000000000..0d8a25d73a2
1732--- /dev/null
1733+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1734@@ -0,0 +1,55 @@
1735+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
1736+/* { dg-final { check-function-bodies "**" "" } } */
1737+
1738+void f(int, ...);
1739+void g();
1740+
1741+/*
1742+** test1:
1743+** ...
1744+** str x30, \[sp\]
1745+** sub sp, sp, #1024
1746+** cbnz w0, .*
1747+** bl g
1748+** ...
1749+*/
1750+int test1(int z) {
1751+ __uint128_t x = 0;
1752+ int y[0x400];
1753+ if (z)
1754+ {
1755+ f(0, 0, 0, 0, 0, 0, 0, &y,
1756+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1757+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1758+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1759+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
1760+ }
1761+ g();
1762+ return 1;
1763+}
1764+
1765+/*
1766+** test2:
1767+** ...
1768+** str x30, \[sp\]
1769+** sub sp, sp, #1040
1770+** str xzr, \[sp\]
1771+** cbnz w0, .*
1772+** bl g
1773+** ...
1774+*/
1775+int test2(int z) {
1776+ __uint128_t x = 0;
1777+ int y[0x400];
1778+ if (z)
1779+ {
1780+ f(0, 0, 0, 0, 0, 0, 0, &y,
1781+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1782+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1783+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1784+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1785+ x);
1786+ }
1787+ g();
1788+ return 1;
1789+}
1790--
17912.34.1
1792
1793
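With the parameters used by the new test (a 2^12 = 4096-byte guard from --param stack-clash-protection-guard-size=12, 1KiB of guard usable by the caller, 16-byte stack alignment), the thresholds work out as below. This is a standalone sketch replaying the arithmetic behind test1 and test2, not GCC code:

#include <assert.h>
#include <stdio.h>

int main (void)
{
  long guard_size           = 1 << 12;   /* 4096 */
  long guard_used_by_caller = 1024;      /* STACK_CLASH_CALLER_GUARD */
  long byte_sp_alignment    = 16;        /* STACK_BOUNDARY / BITS_PER_UNIT */

  /* Non-final allocations need a probe once they could skip most of a
     guard page.  */
  long nonfinal_threshold = guard_size - guard_used_by_caller;      /* 3072 */

  /* Final (outgoing-argument) allocations sit below a register save that
     already acts as a probe, so only sizes strictly greater than 1KiB
     need their own probe; with 16-byte alignment that means >= 1040.  */
  long final_threshold = guard_used_by_caller + byte_sp_alignment;  /* 1040 */

  assert (!(1024 >= final_threshold));   /* test1: sub sp, sp, #1024, no probe */
  assert (1040 >= final_threshold);      /* test2: sub sp, sp, #1040, str xzr probe */
  printf ("non-final threshold %ld, final threshold %ld\n",
          nonfinal_threshold, final_threshold);
  return 0;
}
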
1794From 174a9747491e591ef2abb3e20a0332303f11003a Mon Sep 17 00:00:00 2001
1795From: Richard Sandiford <richard.sandiford@arm.com>
1796Date: Tue, 12 Sep 2023 16:19:49 +0100
1797Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes
1798
1799-fstack-clash-protection uses the save of LR as a probe for the next
1800allocation. The next allocation could be:
1801
1802* another part of the static frame, e.g. when allocating SVE save slots
1803 or outgoing arguments
1804
1805* an alloca in the same function
1806
1807* an allocation made by a callee function
1808
1809However, when -fomit-frame-pointer is used, the LR save slot is placed
1810above the other GPR save slots. It could therefore be up to 80 bytes
1811above the base of the GPR save area (which is also the hard fp address).
1812
1813aarch64_allocate_and_probe_stack_space took this into account when
1814deciding how much subsequent space could be allocated without needing
1815a probe. However, it interacted badly with:
1816
1817 /* If doing a small final adjustment, we always probe at offset 0.
1818 This is done to avoid issues when LR is not at position 0 or when
1819 the final adjustment is smaller than the probing offset. */
1820 else if (final_adjustment_p && rounded_size == 0)
1821 residual_probe_offset = 0;
1822
1823which forces any allocation that is smaller than the guard page size
1824to be probed at offset 0 rather than the usual offset 1024. It was
1825therefore possible to construct cases in which we had:
1826
1827* a probe using LR at SP + 80 bytes (or some other value >= 16)
1828* an allocation of the guard page size - 16 bytes
1829* a probe at SP + 0
1830
1831which leaves guard page size + 64 consecutive bytes unprobed.
1832
1833This patch requires the LR probe to be in the first 16 bytes of the
1834save area when stack clash protection is active. Doing it
1835unconditionally would cause code-quality regressions, but a later
1836patch deals with that.
1837
1838The new comment doesn't say that the probe register is required
1839to be LR, since a later patch removes that restriction.
1840
1841gcc/
1842 * config/aarch64/aarch64.c (aarch64_layout_frame): Ensure that
1843 the LR save slot is in the first 16 bytes of the register save area.
1844 (aarch64_allocate_and_probe_stack_space): Remove workaround for
1845 when LR was not in the first 16 bytes.
1846
1847gcc/testsuite/
1848 * gcc.target/aarch64/stack-check-prologue-18.c: New test.
1849---
1850 gcc/config/aarch64/aarch64.c | 61 ++++-------
1851 .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++
1852 2 files changed, 123 insertions(+), 38 deletions(-)
1853 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
1854
1855diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
1856index bfd24876195..3f2b10de987 100644
1857--- a/gcc/config/aarch64/aarch64.c
1858+++ b/gcc/config/aarch64/aarch64.c
1859@@ -7532,26 +7532,34 @@ aarch64_layout_frame (void)
1860 bool saves_below_hard_fp_p
1861 = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
1862 frame.bytes_below_hard_fp = offset;
1863+
1864+ auto allocate_gpr_slot = [&](unsigned int regno)
1865+ {
1866+ frame.reg_offset[regno] = offset;
1867+ if (frame.wb_candidate1 == INVALID_REGNUM)
1868+ frame.wb_candidate1 = regno;
1869+ else if (frame.wb_candidate2 == INVALID_REGNUM)
1870+ frame.wb_candidate2 = regno;
1871+ offset += UNITS_PER_WORD;
1872+ };
1873+
1874 if (frame.emit_frame_chain)
1875 {
1876 /* FP and LR are placed in the linkage record. */
1877- frame.reg_offset[R29_REGNUM] = offset;
1878- frame.wb_candidate1 = R29_REGNUM;
1879- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
1880- frame.wb_candidate2 = R30_REGNUM;
1881- offset += 2 * UNITS_PER_WORD;
1882+ allocate_gpr_slot (R29_REGNUM);
1883+ allocate_gpr_slot (R30_REGNUM);
1884 }
1885+ else if (flag_stack_clash_protection
1886+ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
1887+ /* Put the LR save slot first, since it makes a good choice of probe
1888+ for stack clash purposes. The idea is that the link register usually
1889+ has to be saved before a call anyway, and so we lose little by
1890+ stopping it from being individually shrink-wrapped. */
1891+ allocate_gpr_slot (R30_REGNUM);
1892
1893 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1894 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
1895- {
1896- frame.reg_offset[regno] = offset;
1897- if (frame.wb_candidate1 == INVALID_REGNUM)
1898- frame.wb_candidate1 = regno;
1899- else if (frame.wb_candidate2 == INVALID_REGNUM)
1900- frame.wb_candidate2 = regno;
1901- offset += UNITS_PER_WORD;
1902- }
1903+ allocate_gpr_slot (regno);
1904
1905 poly_int64 max_int_offset = offset;
1906 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1907@@ -8570,29 +8578,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1908 = (final_adjustment_p
1909 ? guard_used_by_caller + byte_sp_alignment
1910 : guard_size - guard_used_by_caller);
1911- /* When doing the final adjustment for the outgoing arguments, take into
1912- account any unprobed space there is above the current SP. There are
1913- two cases:
1914-
1915- - When saving SVE registers below the hard frame pointer, we force
1916- the lowest save to take place in the prologue before doing the final
1917- adjustment (i.e. we don't allow the save to be shrink-wrapped).
1918- This acts as a probe at SP, so there is no unprobed space.
1919-
1920- - When there are no SVE register saves, we use the store of the link
1921- register as a probe. We can't assume that LR was saved at position 0
1922- though, so treat any space below it as unprobed. */
1923- if (final_adjustment_p
1924- && known_eq (frame.below_hard_fp_saved_regs_size, 0))
1925- {
1926- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
1927- - frame.bytes_below_saved_regs);
1928- if (known_ge (lr_offset, 0))
1929- min_probe_threshold -= lr_offset.to_constant ();
1930- else
1931- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
1932- }
1933-
1934 poly_int64 frame_size = frame.frame_size;
1935
1936 /* We should always have a positive probe threshold. */
1937@@ -8772,8 +8757,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1938 if (final_adjustment_p && rounded_size != 0)
1939 min_probe_threshold = 0;
1940 /* If doing a small final adjustment, we always probe at offset 0.
1941- This is done to avoid issues when LR is not at position 0 or when
1942- the final adjustment is smaller than the probing offset. */
1943+ This is done to avoid issues when the final adjustment is smaller
1944+ than the probing offset. */
1945 else if (final_adjustment_p && rounded_size == 0)
1946 residual_probe_offset = 0;
1947
1948diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
1949new file mode 100644
1950index 00000000000..82447d20fff
1951--- /dev/null
1952+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
1953@@ -0,0 +1,100 @@
1954+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
1955+/* { dg-final { check-function-bodies "**" "" } } */
1956+
1957+void f(int, ...);
1958+void g();
1959+
1960+/*
1961+** test1:
1962+** ...
1963+** str x30, \[sp\]
1964+** sub sp, sp, #4064
1965+** str xzr, \[sp\]
1966+** cbnz w0, .*
1967+** bl g
1968+** ...
1969+** str x26, \[sp, #?4128\]
1970+** ...
1971+*/
1972+int test1(int z) {
1973+ __uint128_t x = 0;
1974+ int y[0x400];
1975+ if (z)
1976+ {
1977+ asm volatile ("" :::
1978+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
1979+ f(0, 0, 0, 0, 0, 0, 0, &y,
1980+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1981+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1982+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1983+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1984+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1985+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1986+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1987+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1988+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1989+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1990+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1991+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1992+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1993+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1994+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1995+ x, x, x, x, x, x, x, x, x, x, x, x, x, x);
1996+ }
1997+ g();
1998+ return 1;
1999+}
2000+
2001+/*
2002+** test2:
2003+** ...
2004+** str x30, \[sp\]
2005+** sub sp, sp, #1040
2006+** str xzr, \[sp\]
2007+** cbnz w0, .*
2008+** bl g
2009+** ...
2010+*/
2011+int test2(int z) {
2012+ __uint128_t x = 0;
2013+ int y[0x400];
2014+ if (z)
2015+ {
2016+ asm volatile ("" :::
2017+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2018+ f(0, 0, 0, 0, 0, 0, 0, &y,
2019+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2020+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2021+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2022+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2023+ x);
2024+ }
2025+ g();
2026+ return 1;
2027+}
2028+
2029+/*
2030+** test3:
2031+** ...
2032+** str x30, \[sp\]
2033+** sub sp, sp, #1024
2034+** cbnz w0, .*
2035+** bl g
2036+** ...
2037+*/
2038+int test3(int z) {
2039+ __uint128_t x = 0;
2040+ int y[0x400];
2041+ if (z)
2042+ {
2043+ asm volatile ("" :::
2044+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2045+ f(0, 0, 0, 0, 0, 0, 0, &y,
2046+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2047+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2048+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2049+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2050+ }
2051+ g();
2052+ return 1;
2053+}
2054--
20552.34.1
2056
2057
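The size of the unprobed gap in the bad case above can be replayed with plain integers. A standalone sketch (not GCC code; the 64KiB guard page size is only illustrative):

#include <assert.h>
#include <stdio.h>

int main (void)
{
  long guard_size = 64 * 1024;     /* illustrative guard page size */
  long lr_probe_above_sp = 80;     /* LR saved at old SP + 80 */
  long allocation = guard_size - 16;
  long next_probe_offset = 0;      /* old behaviour: probe at new SP + 0 */

  /* Consecutive unprobed bytes between the LR probe and the next probe.  */
  long unprobed = lr_probe_above_sp + allocation - next_probe_offset;
  printf ("unprobed gap = guard_size + %ld bytes\n", unprobed - guard_size);
  assert (unprobed > guard_size);        /* the invariant is violated */

  /* With the LR save in the first 16 bytes, the worst case is exactly
     one guard page, which is allowed.  */
  assert (16 + allocation <= guard_size);
  return 0;
}
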
2058From e932e11c353be52256dd30d30d924f4e834e3ca3 Mon Sep 17 00:00:00 2001
2059From: Richard Sandiford <richard.sandiford@arm.com>
2060Date: Tue, 12 Sep 2023 16:19:51 +0100
2061Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation
2062
2063Previous patches ensured that the final frame allocation only needs
2064a probe when the size is strictly greater than 1KiB. It's therefore
2065safe to use the normal 1024 probe offset in all cases.
2066
2067The main motivation for doing this is to simplify the code and
2068reduce the number of special cases.
2069
2070gcc/
2071 * config/aarch64/aarch64.c (aarch64_allocate_and_probe_stack_space):
2072 Always probe the residual allocation at offset 1024, asserting
2073 that that is in range.
2074
2075gcc/testsuite/
2076 * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
2077 to be at offset 1024 rather than offset 0.
2078 * gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
2079---
2080 gcc/config/aarch64/aarch64.c | 12 ++++--------
2081 .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +-
2082 .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++--
2083 3 files changed, 7 insertions(+), 11 deletions(-)
2084
2085diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
2086index 3f2b10de987..4b9cd687525 100644
2087--- a/gcc/config/aarch64/aarch64.c
2088+++ b/gcc/config/aarch64/aarch64.c
2089@@ -8751,16 +8751,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
2090 are still safe. */
2091 if (residual)
2092 {
2093- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
2094+ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
2095+
2096 /* If we're doing final adjustments, and we've done any full page
2097 allocations then any residual needs to be probed. */
2098 if (final_adjustment_p && rounded_size != 0)
2099 min_probe_threshold = 0;
2100- /* If doing a small final adjustment, we always probe at offset 0.
2101- This is done to avoid issues when the final adjustment is smaller
2102- than the probing offset. */
2103- else if (final_adjustment_p && rounded_size == 0)
2104- residual_probe_offset = 0;
2105
2106 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
2107 if (residual >= min_probe_threshold)
2108@@ -8771,8 +8767,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
2109 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
2110 "\n", residual);
2111
2112- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
2113- residual_probe_offset));
2114+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
2115+ guard_used_by_caller));
2116 emit_insn (gen_blockage ());
2117 }
2118 }
2119diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2120index 0d8a25d73a2..f0ec1389771 100644
2121--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2122+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2123@@ -33,7 +33,7 @@ int test1(int z) {
2124 ** ...
2125 ** str x30, \[sp\]
2126 ** sub sp, sp, #1040
2127-** str xzr, \[sp\]
2128+** str xzr, \[sp, #?1024\]
2129 ** cbnz w0, .*
2130 ** bl g
2131 ** ...
2132diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2133index 82447d20fff..6383bec5ebc 100644
2134--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2135+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2136@@ -9,7 +9,7 @@ void g();
2137 ** ...
2138 ** str x30, \[sp\]
2139 ** sub sp, sp, #4064
2140-** str xzr, \[sp\]
2141+** str xzr, \[sp, #?1024\]
2142 ** cbnz w0, .*
2143 ** bl g
2144 ** ...
2145@@ -50,7 +50,7 @@ int test1(int z) {
2146 ** ...
2147 ** str x30, \[sp\]
2148 ** sub sp, sp, #1040
2149-** str xzr, \[sp\]
2150+** str xzr, \[sp, #?1024\]
2151 ** cbnz w0, .*
2152 ** bl g
2153 ** ...
2154--
21552.34.1
2156
2157
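A small standalone check (not GCC code) of why the fixed 1024-byte probe offset is safe for every residual size that the previous patch still probes:

#include <assert.h>

int main (void)
{
  long guard_used_by_caller = 1024;   /* STACK_CLASH_CALLER_GUARD */
  long byte_sp_alignment = 16;

  for (long residual = guard_used_by_caller + byte_sp_alignment;
       residual <= 4096; residual += byte_sp_alignment)
    {
      long probe_offset = guard_used_by_caller;   /* always new SP + 1024 */

      /* The probe always lands inside the allocation just made ...  */
      assert (probe_offset < residual);
      /* ... and leaves no more than the caller-usable 1KiB unprobed
         between the new SP and the probe.  */
      assert (probe_offset <= guard_used_by_caller);
    }
  return 0;
}
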
2158From 9ed9fd54b2b471745c9489e83496c091a7b64904 Mon Sep 17 00:00:00 2001
2159From: Richard Sandiford <richard.sandiford@arm.com>
2160Date: Tue, 12 Sep 2023 16:19:52 +0100
2161Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame
2162 info
2163
2164The stack frame is currently divided into three areas:
2165
2166A: the area above the hard frame pointer
2167B: the SVE saves below the hard frame pointer
2168C: the outgoing arguments
2169
2170If the stack frame is allocated in one chunk, the allocation needs a
2171probe if the frame size is >= guard_size - 1KiB. In addition, if the
2172function is not a leaf function, it must probe an address no more than
21731KiB above the outgoing SP. We ensured the second condition by
2174
2175(1) using single-chunk allocations for non-leaf functions only if
2176 the link register save slot is within 512 bytes of the bottom
2177 of the frame; and
2178
2179(2) using the link register save as a probe (meaning, for instance,
2180 that it can't be individually shrink-wrapped)
2181
2182If instead the stack is allocated in multiple chunks, then:
2183
2184* an allocation involving only the outgoing arguments (C above) requires
2185 a probe if the allocation size is > 1KiB
2186
2187* any other allocation requires a probe if the allocation size
2188 is >= guard_size - 1KiB
2189
2190* second and subsequent allocations require the previous allocation
2191 to probe at the bottom of the allocated area, regardless of the size
2192 of that previous allocation
2193
2194The final point means that, unlike for single allocations,
2195it can be necessary to have both a non-SVE register probe and
2196an SVE register probe. For example:
2197
2198* allocate A, probe using a non-SVE register save
2199* allocate B, probe using an SVE register save
2200* allocate C
2201
2202The non-SVE register used in this case was again the link register.
2203It was previously used even if the link register save slot was some
2204bytes above the bottom of the non-SVE register saves, but an earlier
2205patch avoided that by putting the link register save slot first.
2206
2207As a belt-and-braces fix, this patch explicitly records which
2208probe registers we're using and allows the non-SVE probe to be
2209whichever register comes first (as for SVE).
2210
2211The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
2212
2213gcc/
2214 * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
2215 (aarch64_frame::hard_fp_save_and_probe): New fields.
2216 * config/aarch64/aarch64.c (aarch64_layout_frame): Initialize them.
2217 Rather than asserting that a leaf function saves LR, instead assert
2218 that a leaf function saves something.
2219 (aarch64_get_separate_components): Prevent the chosen probe
2220 registers from being individually shrink-wrapped.
2221 (aarch64_allocate_and_probe_stack_space): Remove workaround for
2222 probe registers that aren't at the bottom of the previous allocation.
2223
2224gcc/testsuite/
2225 * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
2226---
2227 gcc/config/aarch64/aarch64.c | 68 +++++++++++++++----
2228 gcc/config/aarch64/aarch64.h | 8 +++
2229 .../aarch64/sve/pcs/stack_clash_3.c | 6 +-
2230 3 files changed, 64 insertions(+), 18 deletions(-)
2231
2232diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
2233index 4b9cd687525..ef4b3b671ba 100644
2234--- a/gcc/config/aarch64/aarch64.c
2235+++ b/gcc/config/aarch64/aarch64.c
2236@@ -7469,15 +7469,11 @@ aarch64_layout_frame (void)
2237 && !crtl->abi->clobbers_full_reg_p (regno))
2238 frame.reg_offset[regno] = SLOT_REQUIRED;
2239
2240- /* With stack-clash, LR must be saved in non-leaf functions. The saving of
2241- LR counts as an implicit probe which allows us to maintain the invariant
2242- described in the comment at expand_prologue. */
2243- gcc_assert (crtl->is_leaf
2244- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
2245
2246 poly_int64 offset = crtl->outgoing_args_size;
2247 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2248 frame.bytes_below_saved_regs = offset;
2249+ frame.sve_save_and_probe = INVALID_REGNUM;
2250
2251 /* Now assign stack slots for the registers. Start with the predicate
2252 registers, since predicate LDR and STR have a relatively small
2253@@ -7485,6 +7481,8 @@ aarch64_layout_frame (void)
2254 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
2255 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2256 {
2257+ if (frame.sve_save_and_probe == INVALID_REGNUM)
2258+ frame.sve_save_and_probe = regno;
2259 frame.reg_offset[regno] = offset;
2260 offset += BYTES_PER_SVE_PRED;
2261 }
2262@@ -7522,6 +7520,8 @@ aarch64_layout_frame (void)
2263 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2264 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2265 {
2266+ if (frame.sve_save_and_probe == INVALID_REGNUM)
2267+ frame.sve_save_and_probe = regno;
2268 frame.reg_offset[regno] = offset;
2269 offset += vector_save_size;
2270 }
2271@@ -7531,10 +7531,18 @@ aarch64_layout_frame (void)
2272 frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2273 bool saves_below_hard_fp_p
2274 = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
2275+ gcc_assert (!saves_below_hard_fp_p
2276+ || (frame.sve_save_and_probe != INVALID_REGNUM
2277+ && known_eq (frame.reg_offset[frame.sve_save_and_probe],
2278+ frame.bytes_below_saved_regs)));
2279+
2280 frame.bytes_below_hard_fp = offset;
2281+ frame.hard_fp_save_and_probe = INVALID_REGNUM;
2282
2283 auto allocate_gpr_slot = [&](unsigned int regno)
2284 {
2285+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
2286+ frame.hard_fp_save_and_probe = regno;
2287 frame.reg_offset[regno] = offset;
2288 if (frame.wb_candidate1 == INVALID_REGNUM)
2289 frame.wb_candidate1 = regno;
2290@@ -7568,6 +7576,8 @@ aarch64_layout_frame (void)
2291 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2292 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2293 {
2294+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
2295+ frame.hard_fp_save_and_probe = regno;
2296 /* If there is an alignment gap between integer and fp callee-saves,
2297 allocate the last fp register to it if possible. */
2298 if (regno == last_fp_reg
2299@@ -7591,6 +7601,17 @@ aarch64_layout_frame (void)
2300 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2301
2302 frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
2303+ gcc_assert (known_eq (frame.saved_regs_size,
2304+ frame.below_hard_fp_saved_regs_size)
2305+ || (frame.hard_fp_save_and_probe != INVALID_REGNUM
2306+ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
2307+ frame.bytes_below_hard_fp)));
2308+
2309+ /* With stack-clash, a register must be saved in non-leaf functions.
2310+ The saving of the bottommost register counts as an implicit probe,
2311+ which allows us to maintain the invariant described in the comment
2312+ at expand_prologue. */
2313+ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
2314
2315 offset += get_frame_size ();
2316 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2317@@ -7690,6 +7711,25 @@ aarch64_layout_frame (void)
2318 frame.final_adjust = frame.bytes_below_saved_regs;
2319 }
2320
2321+ /* The frame is allocated in pieces, with each non-final piece
2322+ including a register save at offset 0 that acts as a probe for
2323+ the following piece. In addition, the save of the bottommost register
2324+ acts as a probe for callees and allocas. Roll back any probes that
2325+ aren't needed.
2326+
2327+ A probe isn't needed if it is associated with the final allocation
2328+ (including callees and allocas) that happens before the epilogue is
2329+ executed. */
2330+ if (crtl->is_leaf
2331+ && !cfun->calls_alloca
2332+ && known_eq (frame.final_adjust, 0))
2333+ {
2334+ if (maybe_ne (frame.sve_callee_adjust, 0))
2335+ frame.sve_save_and_probe = INVALID_REGNUM;
2336+ else
2337+ frame.hard_fp_save_and_probe = INVALID_REGNUM;
2338+ }
2339+
2340 /* Make sure the individual adjustments add up to the full frame size. */
2341 gcc_assert (known_eq (frame.initial_adjust
2342 + frame.callee_adjust
2343@@ -8267,13 +8307,6 @@ aarch64_get_separate_components (void)
2344
2345 poly_int64 offset = frame.reg_offset[regno];
2346
2347- /* If the register is saved in the first SVE save slot, we use
2348- it as a stack probe for -fstack-clash-protection. */
2349- if (flag_stack_clash_protection
2350- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
2351- && known_eq (offset, frame.bytes_below_saved_regs))
2352- continue;
2353-
2354 /* Get the offset relative to the register we'll use. */
2355 if (frame_pointer_needed)
2356 offset -= frame.bytes_below_hard_fp;
2357@@ -8308,6 +8341,13 @@ aarch64_get_separate_components (void)
2358
2359 bitmap_clear_bit (components, LR_REGNUM);
2360 bitmap_clear_bit (components, SP_REGNUM);
2361+ if (flag_stack_clash_protection)
2362+ {
2363+ if (frame.sve_save_and_probe != INVALID_REGNUM)
2364+ bitmap_clear_bit (components, frame.sve_save_and_probe);
2365+ if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
2366+ bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
2367+ }
2368
2369 return components;
2370 }
2371@@ -8844,8 +8884,8 @@ aarch64_epilogue_uses (int regno)
2372 When probing is needed, we emit a probe at the start of the prologue
2373 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
2374
2375- We have to track how much space has been allocated and the only stores
2376- to the stack we track as implicit probes are the FP/LR stores.
2377+ We can also use register saves as probes. These are stored in
2378+ sve_save_and_probe and hard_fp_save_and_probe.
2379
2380 For outgoing arguments we probe if the size is larger than 1KB, such that
2381 the ABI specified buffer is maintained for the next callee.
2382diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
2383index c8ec3d58495..97173e48598 100644
2384--- a/gcc/config/aarch64/aarch64.h
2385+++ b/gcc/config/aarch64/aarch64.h
2386@@ -911,6 +911,14 @@ struct GTY (()) aarch64_frame
2387 This is the register they should use. */
2388 unsigned spare_pred_reg;
2389
2390+ /* An SVE register that is saved below the hard frame pointer and that acts
2391+ as a probe for later allocations, or INVALID_REGNUM if none. */
2392+ unsigned sve_save_and_probe;
2393+
2394+ /* A register that is saved at the hard frame pointer and that acts
2395+ as a probe for later allocations, or INVALID_REGNUM if none. */
2396+ unsigned hard_fp_save_and_probe;
2397+
2398 bool laid_out;
2399 };
2400
2401diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2402index 3e01ec36c3a..3530a0d504b 100644
2403--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2404+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2405@@ -11,11 +11,10 @@
2406 ** mov x11, sp
2407 ** ...
2408 ** sub sp, sp, x13
2409-** str p4, \[sp\]
2410 ** cbz w0, [^\n]*
2411+** str p4, \[sp\]
2412 ** ...
2413 ** ptrue p0\.b, all
2414-** ldr p4, \[sp\]
2415 ** addvl sp, sp, #1
2416 ** ldr x24, \[sp\], 32
2417 ** ret
2418@@ -39,13 +38,12 @@ test_1 (int n)
2419 ** mov x11, sp
2420 ** ...
2421 ** sub sp, sp, x13
2422-** str p4, \[sp\]
2423 ** cbz w0, [^\n]*
2424+** str p4, \[sp\]
2425 ** str p5, \[sp, #1, mul vl\]
2426 ** str p6, \[sp, #2, mul vl\]
2427 ** ...
2428 ** ptrue p0\.b, all
2429-** ldr p4, \[sp\]
2430 ** addvl sp, sp, #1
2431 ** ldr x24, \[sp\], 32
2432 ** ret
2433--
24342.34.1
2435
2436
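How the probe registers are chosen and then withheld from shrink-wrapping can be sketched with plain arrays and a bitmask. This is standalone C, not GCC code; the register numbers, offsets and region assignments are invented:

#include <assert.h>
#include <stdio.h>

#define NREGS 8
#define INVALID_REGNUM (-1)

int main (void)
{
  /* reg_offset[i] < 0 means "no save slot"; offsets are measured from
     the bottom of the frame, as in the earlier patches.  Within each
     region the slots here are laid out in increasing offset order, so
     the first saved register found in a region is also the lowest one.  */
  long reg_offset[NREGS] = { -1, 64, 80, -1, 32, 48, -1, -1 };
  int sve_region[NREGS]  = {  0,  0,  0,  0,  1,  1,  0,  0 };

  int sve_save_and_probe = INVALID_REGNUM;
  int hard_fp_save_and_probe = INVALID_REGNUM;

  for (int regno = 0; regno < NREGS; regno++)
    {
      if (reg_offset[regno] < 0)
        continue;
      if (sve_region[regno] && sve_save_and_probe == INVALID_REGNUM)
        sve_save_and_probe = regno;
      if (!sve_region[regno] && hard_fp_save_and_probe == INVALID_REGNUM)
        hard_fp_save_and_probe = regno;
    }

  /* Bit i set means register i may be shrink-wrapped separately.  */
  unsigned components = 0;
  for (int regno = 0; regno < NREGS; regno++)
    if (reg_offset[regno] >= 0)
      components |= 1u << regno;

  /* The chosen probe registers must stay in the prologue proper.  */
  if (sve_save_and_probe != INVALID_REGNUM)
    components &= ~(1u << sve_save_and_probe);
  if (hard_fp_save_and_probe != INVALID_REGNUM)
    components &= ~(1u << hard_fp_save_and_probe);

  assert (sve_save_and_probe == 4 && hard_fp_save_and_probe == 1);
  printf ("shrink-wrappable registers after clearing probes: 0x%x\n",
          components);
  return 0;
}
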
2437From 4bbf7b6cdd02b0d547ddd6a630f2065680bf2f6b Mon Sep 17 00:00:00 2001
2438From: Richard Sandiford <richard.sandiford@arm.com>
2439Date: Tue, 12 Sep 2023 16:19:52 +0100
2440Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size
2441
2442After previous patches, it's no longer necessary to store
2443saved_regs_size and below_hard_fp_saved_regs_size in the frame info.
2444All measurements instead use the top or bottom of the frame as
2445reference points.
2446
2447gcc/
2448 * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size)
2449 (aarch64_frame::below_hard_fp_saved_regs_size): Delete.
2450 * config/aarch64/aarch64.c (aarch64_layout_frame): Update accordingly.
2451---
2452 gcc/config/aarch64/aarch64.c | 45 +++++++++++++++++-------------------
2453 gcc/config/aarch64/aarch64.h | 7 ------
2454 2 files changed, 21 insertions(+), 31 deletions(-)
2455
2456diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
2457index ef4b3b671ba..385718a475b 100644
2458--- a/gcc/config/aarch64/aarch64.c
2459+++ b/gcc/config/aarch64/aarch64.c
2460@@ -7528,9 +7528,8 @@ aarch64_layout_frame (void)
2461
2462 /* OFFSET is now the offset of the hard frame pointer from the bottom
2463 of the callee save area. */
2464- frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2465- bool saves_below_hard_fp_p
2466- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
2467+ auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2468+ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
2469 gcc_assert (!saves_below_hard_fp_p
2470 || (frame.sve_save_and_probe != INVALID_REGNUM
2471 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
2472@@ -7600,9 +7599,8 @@ aarch64_layout_frame (void)
2473
2474 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2475
2476- frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
2477- gcc_assert (known_eq (frame.saved_regs_size,
2478- frame.below_hard_fp_saved_regs_size)
2479+ auto saved_regs_size = offset - frame.bytes_below_saved_regs;
2480+ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
2481 || (frame.hard_fp_save_and_probe != INVALID_REGNUM
2482 && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
2483 frame.bytes_below_hard_fp)));
2484@@ -7611,7 +7609,7 @@ aarch64_layout_frame (void)
2485 The saving of the bottommost register counts as an implicit probe,
2486 which allows us to maintain the invariant described in the comment
2487 at expand_prologue. */
2488- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
2489+ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
2490
2491 offset += get_frame_size ();
2492 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2493@@ -7637,7 +7635,7 @@ aarch64_layout_frame (void)
2494
2495 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
2496 HOST_WIDE_INT const_saved_regs_size;
2497- if (known_eq (frame.saved_regs_size, 0))
2498+ if (known_eq (saved_regs_size, 0))
2499 frame.initial_adjust = frame.frame_size;
2500 else if (frame.frame_size.is_constant (&const_size)
2501 && const_size < max_push_offset
2502@@ -7650,7 +7648,7 @@ aarch64_layout_frame (void)
2503 frame.callee_adjust = const_size;
2504 }
2505 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
2506- && frame.saved_regs_size.is_constant (&const_saved_regs_size)
2507+ && saved_regs_size.is_constant (&const_saved_regs_size)
2508 && const_below_saved_regs + const_saved_regs_size < 512
2509 /* We could handle this case even with data below the saved
2510 registers, provided that that data left us with valid offsets
2511@@ -7669,8 +7667,7 @@ aarch64_layout_frame (void)
2512 frame.initial_adjust = frame.frame_size;
2513 }
2514 else if (saves_below_hard_fp_p
2515- && known_eq (frame.saved_regs_size,
2516- frame.below_hard_fp_saved_regs_size))
2517+ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
2518 {
2519 /* Frame in which all saves are SVE saves:
2520
2521@@ -7692,7 +7689,7 @@ aarch64_layout_frame (void)
2522 [save SVE registers relative to SP]
2523 sub sp, sp, bytes_below_saved_regs */
2524 frame.callee_adjust = const_above_fp;
2525- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
2526+ frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
2527 frame.final_adjust = frame.bytes_below_saved_regs;
2528 }
2529 else
2530@@ -7707,7 +7704,7 @@ aarch64_layout_frame (void)
2531 [save SVE registers relative to SP]
2532 sub sp, sp, bytes_below_saved_regs */
2533 frame.initial_adjust = frame.bytes_above_hard_fp;
2534- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
2535+ frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
2536 frame.final_adjust = frame.bytes_below_saved_regs;
2537 }
2538
2539@@ -8849,17 +8846,17 @@ aarch64_epilogue_uses (int regno)
2540 | local variables | <-- frame_pointer_rtx
2541 | |
2542 +-------------------------------+
2543- | padding | \
2544- +-------------------------------+ |
2545- | callee-saved registers | | frame.saved_regs_size
2546- +-------------------------------+ |
2547- | LR' | |
2548- +-------------------------------+ |
2549- | FP' | |
2550- +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
2551- | SVE vector registers | | \
2552- +-------------------------------+ | | below_hard_fp_saved_regs_size
2553- | SVE predicate registers | / /
2554+ | padding |
2555+ +-------------------------------+
2556+ | callee-saved registers |
2557+ +-------------------------------+
2558+ | LR' |
2559+ +-------------------------------+
2560+ | FP' |
2561+ +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
2562+ | SVE vector registers |
2563+ +-------------------------------+
2564+ | SVE predicate registers |
2565 +-------------------------------+
2566 | dynamic allocation |
2567 +-------------------------------+
2568diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
2569index 97173e48598..9084b1cfb9d 100644
2570--- a/gcc/config/aarch64/aarch64.h
2571+++ b/gcc/config/aarch64/aarch64.h
2572@@ -837,18 +837,11 @@ struct GTY (()) aarch64_frame
2573 STACK_BOUNDARY. */
2574 HOST_WIDE_INT saved_varargs_size;
2575
2576- /* The size of the callee-save registers with a slot in REG_OFFSET. */
2577- poly_int64 saved_regs_size;
2578-
2579 /* The number of bytes between the bottom of the static frame (the bottom
2580 of the outgoing arguments) and the bottom of the register save area.
2581 This value is always a multiple of STACK_BOUNDARY. */
2582 poly_int64 bytes_below_saved_regs;
2583
2584- /* The size of the callee-save registers with a slot in REG_OFFSET that
2585- are saved below the hard frame pointer. */
2586- poly_int64 below_hard_fp_saved_regs_size;
2587-
2588 /* The number of bytes between the bottom of the static frame (the bottom
2589 of the outgoing arguments) and the hard frame pointer. This value is
2590 always a multiple of STACK_BOUNDARY. */
2591--
25922.34.1
2593
2594
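After this patch the deleted size fields are just local differences of offsets that are measured from the bottom of the frame. A short standalone sketch (not GCC code; byte counts invented):

#include <assert.h>

int main (void)
{
  long bytes_below_saved_regs = 64;   /* outgoing arguments */
  long sve_saves              = 32;   /* saves below the hard FP */
  long gpr_fpr_saves          = 48;   /* saves at and above the hard FP */

  /* Offsets kept in the frame info, all measured from the frame bottom.  */
  long bytes_below_hard_fp = bytes_below_saved_regs + sve_saves;
  long top_of_saved_regs   = bytes_below_hard_fp + gpr_fpr_saves;

  /* The deleted fields, recomputed as locals where they are needed.  */
  long below_hard_fp_saved_regs_size
    = bytes_below_hard_fp - bytes_below_saved_regs;
  long saved_regs_size = top_of_saved_regs - bytes_below_saved_regs;

  assert (below_hard_fp_saved_regs_size == sve_saves);
  assert (saved_regs_size == sve_saves + gpr_fpr_saves);
  return 0;
}
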
2595From bea0985749c12fcc264710586addb7838cc61e6d Mon Sep 17 00:00:00 2001
2596From: Richard Sandiford <richard.sandiford@arm.com>
2597Date: Tue, 12 Sep 2023 16:19:52 +0100
2598Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved
2599 registers
2600
2601AArch64 normally puts the saved registers near the bottom of the frame,
2602immediately above any dynamic allocations. But this means that a
2603stack-smash attack on those dynamic allocations could overwrite the
2604saved registers without needing to reach as far as the stack smash
2605canary.
2606
2607The same thing could also happen for variable-sized arguments that are
2608passed by value, since those are allocated before a call and popped on
2609return.
2610
2611This patch avoids that by putting the locals (and thus the canary) below
2612the saved registers when stack smash protection is active.
2613
2614The patch fixes CVE-2023-4039.
2615
2616gcc/
2617 * config/aarch64/aarch64.c (aarch64_save_regs_above_locals_p):
2618 New function.
2619 (aarch64_layout_frame): Use it to decide whether locals should
2620 go above or below the saved registers.
2621 (aarch64_expand_prologue): Update stack layout comment.
2622 Emit a stack tie after the final adjustment.
2623
2624gcc/testsuite/
2625 * gcc.target/aarch64/stack-protector-8.c: New test.
2626 * gcc.target/aarch64/stack-protector-9.c: Likewise.
2627---
2628 gcc/config/aarch64/aarch64.c | 46 +++++++--
2629 .../gcc.target/aarch64/stack-protector-8.c | 95 +++++++++++++++++++
2630 .../gcc.target/aarch64/stack-protector-9.c | 33 +++++++
2631 3 files changed, 168 insertions(+), 6 deletions(-)
2632 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2633 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
2634
2635diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
2636index 385718a475b..3ccfd3c30fc 100644
2637--- a/gcc/config/aarch64/aarch64.c
2638+++ b/gcc/config/aarch64/aarch64.c
2639@@ -7392,6 +7392,20 @@ aarch64_needs_frame_chain (void)
2640 return aarch64_use_frame_pointer;
2641 }
2642
2643+/* Return true if the current function should save registers above
2644+ the locals area, rather than below it. */
2645+
2646+static bool
2647+aarch64_save_regs_above_locals_p ()
2648+{
2649+ /* When using stack smash protection, make sure that the canary slot
2650+ comes between the locals and the saved registers. Otherwise,
2651+ it would be possible for a carefully sized smash attack to change
2652+ the saved registers (particularly LR and FP) without reaching the
2653+ canary. */
2654+ return crtl->stack_protect_guard;
2655+}
2656+
2657 /* Mark the registers that need to be saved by the callee and calculate
2658 the size of the callee-saved registers area and frame record (both FP
2659 and LR may be omitted). */
2660@@ -7403,6 +7417,7 @@ aarch64_layout_frame (void)
2661 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
2662 bool frame_related_fp_reg_p = false;
2663 aarch64_frame &frame = cfun->machine->frame;
2664+ poly_int64 top_of_locals = -1;
2665
2666 frame.emit_frame_chain = aarch64_needs_frame_chain ();
2667
2668@@ -7469,9 +7484,16 @@ aarch64_layout_frame (void)
2669 && !crtl->abi->clobbers_full_reg_p (regno))
2670 frame.reg_offset[regno] = SLOT_REQUIRED;
2671
2672+ bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
2673
2674 poly_int64 offset = crtl->outgoing_args_size;
2675 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2676+ if (regs_at_top_p)
2677+ {
2678+ offset += get_frame_size ();
2679+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2680+ top_of_locals = offset;
2681+ }
2682 frame.bytes_below_saved_regs = offset;
2683 frame.sve_save_and_probe = INVALID_REGNUM;
2684
2685@@ -7611,15 +7633,18 @@ aarch64_layout_frame (void)
2686 at expand_prologue. */
2687 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
2688
2689- offset += get_frame_size ();
2690- offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2691- auto top_of_locals = offset;
2692-
2693+ if (!regs_at_top_p)
2694+ {
2695+ offset += get_frame_size ();
2696+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2697+ top_of_locals = offset;
2698+ }
2699 offset += frame.saved_varargs_size;
2700 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2701 frame.frame_size = offset;
2702
2703 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
2704+ gcc_assert (known_ge (top_of_locals, 0));
2705 frame.bytes_above_locals = frame.frame_size - top_of_locals;
2706
2707 frame.initial_adjust = 0;
2708@@ -8843,10 +8868,10 @@ aarch64_epilogue_uses (int regno)
2709 | for register varargs |
2710 | |
2711 +-------------------------------+
2712- | local variables | <-- frame_pointer_rtx
2713+ | local variables (1) | <-- frame_pointer_rtx
2714 | |
2715 +-------------------------------+
2716- | padding |
2717+ | padding (1) |
2718 +-------------------------------+
2719 | callee-saved registers |
2720 +-------------------------------+
2721@@ -8858,6 +8883,10 @@ aarch64_epilogue_uses (int regno)
2722 +-------------------------------+
2723 | SVE predicate registers |
2724 +-------------------------------+
2725+ | local variables (2) |
2726+ +-------------------------------+
2727+ | padding (2) |
2728+ +-------------------------------+
2729 | dynamic allocation |
2730 +-------------------------------+
2731 | padding |
2732@@ -8867,6 +8896,9 @@ aarch64_epilogue_uses (int regno)
2733 +-------------------------------+
2734 | | <-- stack_pointer_rtx (aligned)
2735
2736+ The regions marked (1) and (2) are mutually exclusive. (2) is used
2737+ when aarch64_save_regs_above_locals_p is true.
2738+
2739 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2740 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2741 unchanged.
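To make the updated diagram concrete, the following sketch (editorial, not part of the patch) maps typical source-level objects onto the regions drawn above, assuming -fstack-protector-strong is in effect so that aarch64_save_regs_above_locals_p returns true and the locals use region (2); the callee signatures are hypothetical.

/* Hypothetical mapping of source objects onto the frame regions in the
   comment above, assuming the stack protector is active so region (2)
   is used rather than region (1).  */
void callee (long, long, long, long, long, long, long, long, long);
void use (void *, void *);

void
frame_regions_example (int n)
{
  char fixed[32];                    /* "local variables (2)": below the saved registers */
  void *dyn = __builtin_alloca (n);  /* the alloca storage: "dynamic allocation", below the locals */

  use (fixed, dyn);

  /* Nine integer arguments: x0-x7 hold the first eight, and the ninth
     spills into the "outgoing stack arguments" region at the bottom.  */
  callee (1, 2, 3, 4, 5, 6, 7, 8, 9);
}

As the comment block above notes, an alloca call like the one here lowers stack_pointer_rtx at run time while frame_pointer_rtx and hard_frame_pointer_rtx stay put.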
2742@@ -9058,6 +9090,8 @@ aarch64_expand_prologue (void)
2743 gcc_assert (known_eq (bytes_below_sp, final_adjust));
2744 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
2745 !frame_pointer_needed, true);
2746+ if (emit_frame_chain && maybe_ne (final_adjust, 0))
2747+ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2748 }
2749
2750 /* Return TRUE if we can use a simple_return insn.
2751diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2752new file mode 100644
2753index 00000000000..e71d820e365
2754--- /dev/null
2755+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2756@@ -0,0 +1,95 @@
2757+/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */
2758+/* { dg-final { check-function-bodies "**" "" } } */
2759+
2760+void g(void *);
2761+__SVBool_t *h(void *);
2762+
2763+/*
2764+** test1:
2765+** sub sp, sp, #288
2766+** stp x29, x30, \[sp, #?272\]
2767+** add x29, sp, #?272
2768+** mrs (x[0-9]+), tpidr2_el0
2769+** ldr (x[0-9]+), \[\1, #?16\]
2770+** str \2, \[sp, #?264\]
2771+** mov \2, #?0
2772+** add x0, sp, #?8
2773+** bl g
2774+** ...
2775+** mrs .*
2776+** ...
2777+** bne .*
2778+** ...
2779+** ldp x29, x30, \[sp, #?272\]
2780+** add sp, sp, #?288
2781+** ret
2782+** bl __stack_chk_fail
2783+*/
2784+int test1() {
2785+ int y[0x40];
2786+ g(y);
2787+ return 1;
2788+}
2789+
2790+/*
2791+** test2:
2792+** stp x29, x30, \[sp, #?-16\]!
2793+** mov x29, sp
2794+** sub sp, sp, #1040
2795+** mrs (x[0-9]+), tpidr2_el0
2796+** ldr (x[0-9]+), \[\1, #?16\]
2797+** str \2, \[sp, #?1032\]
2798+** mov \2, #?0
2799+** add x0, sp, #?8
2800+** bl g
2801+** ...
2802+** mrs .*
2803+** ...
2804+** bne .*
2805+** ...
2806+** add sp, sp, #?1040
2807+** ldp x29, x30, \[sp\], #?16
2808+** ret
2809+** bl __stack_chk_fail
2810+*/
2811+int test2() {
2812+ int y[0x100];
2813+ g(y);
2814+ return 1;
2815+}
2816+
2817+#pragma GCC target "+sve"
2818+
2819+/*
2820+** test3:
2821+** stp x29, x30, \[sp, #?-16\]!
2822+** mov x29, sp
2823+** addvl sp, sp, #-18
2824+** ...
2825+** str p4, \[sp\]
2826+** ...
2827+** sub sp, sp, #272
2828+** mrs (x[0-9]+), tpidr2_el0
2829+** ldr (x[0-9]+), \[\1, #?16\]
2830+** str \2, \[sp, #?264\]
2831+** mov \2, #?0
2832+** add x0, sp, #?8
2833+** bl h
2834+** ...
2835+** mrs .*
2836+** ...
2837+** bne .*
2838+** ...
2839+** add sp, sp, #?272
2840+** ...
2841+** ldr p4, \[sp\]
2842+** ...
2843+** addvl sp, sp, #18
2844+** ldp x29, x30, \[sp\], #?16
2845+** ret
2846+** bl __stack_chk_fail
2847+*/
2848+__SVBool_t test3() {
2849+ int y[0x40];
2850+ return *h(y);
2851+}
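As a quick editorial cross-check of test2 above (assuming the usual LP64 AArch64 ABI, where int is 4 bytes and the guard slot is pointer-sized): the 0x100-element int array is 1024 bytes, and together with the 8-byte canary and 8 bytes of alignment padding it accounts for the expected sub sp, sp, #1040, with the canary stored at sp + 1032, immediately below the saved x29/x30 pair.

/* Editorial cross-check of the offsets test2 expects; the names here are
   illustrative, and the 8 bytes of padding below the array are presumed
   to exist for 16-byte stack alignment.  */
enum {
  y_bytes       = 0x100 * sizeof (int),               /* 1024: the array */
  canary_bytes  = 8,                                   /* the guard slot */
  pad_bytes     = 8,                                   /* alignment padding below y */
  canary_offset = pad_bytes + y_bytes,                 /* 1032: str ..., [sp, #1032] */
  frame_bytes   = pad_bytes + y_bytes + canary_bytes   /* 1040: sub sp, sp, #1040 */
};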
2852diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
2853new file mode 100644
2854index 00000000000..58f322aa480
2855--- /dev/null
2856+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
2857@@ -0,0 +1,33 @@
2858+/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
2859+/* { dg-final { check-function-bodies "**" "" } } */
2860+
2861+/*
2862+** main:
2863+** ...
2864+** stp x29, x30, \[sp, #?-[0-9]+\]!
2865+** ...
2866+** sub sp, sp, #[0-9]+
2867+** ...
2868+** str x[0-9]+, \[x29, #?-8\]
2869+** ...
2870+*/
2871+int f(const char *);
2872+void g(void *);
2873+int main(int argc, char* argv[])
2874+{
2875+ int a;
2876+ int b;
2877+ char c[2+f(argv[1])];
2878+ int d[0x100];
2879+ char y;
2880+
2881+ y=42; a=4; b=10;
2882+ c[0] = 'h'; c[1] = '\0';
2883+
2884+ c[f(argv[2])] = '\0';
2885+
2886+ __builtin_printf("%d %d\n%s\n", a, b, c);
2887+ g(d);
2888+
2889+ return 0;
2890+}
2891--
28922.34.1
2893