author     Ross Burton <ross.burton@arm.com>    2023-09-12 18:24:17 +0100
committer  Richard Purdie <richard.purdie@linuxfoundation.org>    2023-09-13 09:31:43 +0100
commit     8dfb19926e6d27cffdd24112f1d2ad156cb71968 (patch)
tree       1444a4af0f96085c902db381b84737626d625ecb /meta/recipes-devtools/gcc
parent     843c6c2e055525af4c0c4a8f1e925a23c4280f0a (diff)
download   poky-8dfb19926e6d27cffdd24112f1d2ad156cb71968.tar.gz
gcc: Fix -fstack-protector issue on aarch64
This series of patches fixes deficiencies in GCC's -fstack-protector
implementation for AArch64 when using dynamically allocated stack space.
This is CVE-2023-4039. See:

https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf

for more details.

(From OE-Core rev: 750396ca55e9f165a77dc94f841a953b9a6520d5)

Signed-off-by: Ross Burton <ross.burton@arm.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'meta/recipes-devtools/gcc')
-rw-r--r--   meta/recipes-devtools/gcc/gcc-13.2.inc                 1
-rw-r--r--   meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch   3093
2 files changed, 3094 insertions, 0 deletions
diff --git a/meta/recipes-devtools/gcc/gcc-13.2.inc b/meta/recipes-devtools/gcc/gcc-13.2.inc
index 7f97ecc332..0922251e18 100644
--- a/meta/recipes-devtools/gcc/gcc-13.2.inc
+++ b/meta/recipes-devtools/gcc/gcc-13.2.inc
@@ -65,6 +65,7 @@ SRC_URI = "${BASEURI} \
 file://0023-Fix-install-path-of-linux64.h.patch \
 file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \
 file://0025-gcc-testsuite-mips.patch \
+file://CVE-2023-4039.patch \
 "
 SRC_URI[sha256sum] = "e275e76442a6067341a27f04c5c6b83d8613144004c0413528863dc6b5c743da"
 
diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
new file mode 100644
index 0000000000..81b5067c33
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
@@ -0,0 +1,3093 @@
1From: Richard Sandiford <richard.sandiford@arm.com>
2Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
3Date: Tue, 12 Sep 2023 16:25:10 +0100
4
5This series of patches fixes deficiencies in GCC's -fstack-protector
6implementation for AArch64 when using dynamically allocated stack space.
7This is CVE-2023-4039. See:
8
9https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
10https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
11
12for more details.
13
14The fix is to put the saved registers above the locals area when
15-fstack-protector is used.
16
17The series also fixes a stack-clash problem that I found while working
18on the CVE. In unpatched sources, the stack-clash problem would only
19trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
20equivalent). But it would be a more significant issue with the new
21-fstack-protector frame layout. It's therefore important that both
22problems are fixed together.
23
24Some reorganisation of the code seemed necessary to fix the problems in a
25cleanish way. The series is therefore quite long, but only a handful of
26patches should have any effect on code generation.
27
28See the individual patches for a detailed description.
29
30Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
31I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
32
33CVE: CVE-2023-4039
34Upstream-Status: Backport
35Signed-off-by: Ross Burton <ross.burton@arm.com>
36
37
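As a rough, self-contained illustration of the class of problem the cover letter describes (this example is invented, not taken from the advisory or the GCC testsuite), consider a function that writes through dynamically allocated stack space. Before the fix, the saved registers were placed below the locals area, where an overflow through such a buffer could reach them without first corrupting the -fstack-protector canary; the fix moves the saved registers above the locals area.

  #include <alloca.h>
  #include <string.h>

  /* Sketch only; compile with e.g. g++ -O2 -fstack-protector-strong.
     Function and parameter names are invented for illustration.  */
  void
  copy_into_stack_buffer (const char *src, size_t n, size_t len)
  {
    /* Dynamically allocated stack space, as in the cover letter.  */
    char *buf = static_cast<char *> (alloca (len));
    /* If n is attacker-controlled and larger than len, this write runs
       past the allocation into the rest of the frame.  */
    memcpy (buf, src, n);
  }
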
38From 71a2aa2127283f450c623d3604dbcabe0e14a8d4 Mon Sep 17 00:00:00 2001
39From: Richard Sandiford <richard.sandiford@arm.com>
40Date: Tue, 12 Sep 2023 16:07:12 +0100
41Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code
42
43aarch64_layout_frame uses a shorthand for referring to
44cfun->machine->frame:
45
46 aarch64_frame &frame = cfun->machine->frame;
47
48This patch does the same for some other heavy users of the structure.
49No functional change intended.
50
51gcc/
52 * config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use
53 a local shorthand for cfun->machine->frame.
54 (aarch64_restore_callee_saves, aarch64_get_separate_components):
55 (aarch64_process_components): Likewise.
56 (aarch64_allocate_and_probe_stack_space): Likewise.
57 (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
58 (aarch64_layout_frame): Use existing shorthand for one more case.
59---
60 gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++----------------
61 1 file changed, 64 insertions(+), 59 deletions(-)
62
63diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
64index 822a2b49a46..5d473d161d9 100644
65--- a/gcc/config/aarch64/aarch64.cc
66+++ b/gcc/config/aarch64/aarch64.cc
67@@ -8612,7 +8612,7 @@ aarch64_layout_frame (void)
68 frame.is_scs_enabled
69 = (!crtl->calls_eh_return
70 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
71- && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
72+ && known_ge (frame.reg_offset[LR_REGNUM], 0));
73
74 /* When shadow call stack is enabled, the scs_pop in the epilogue will
75 restore x30, and we don't need to pop x30 again in the traditional
76@@ -9078,6 +9078,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
77 unsigned start, unsigned limit, bool skip_wb,
78 bool hard_fp_valid_p)
79 {
80+ aarch64_frame &frame = cfun->machine->frame;
81 rtx_insn *insn;
82 unsigned regno;
83 unsigned regno2;
84@@ -9092,8 +9093,8 @@ aarch64_save_callee_saves (poly_int64 start_offset,
85 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
86
87 if (skip_wb
88- && (regno == cfun->machine->frame.wb_push_candidate1
89- || regno == cfun->machine->frame.wb_push_candidate2))
90+ && (regno == frame.wb_push_candidate1
91+ || regno == frame.wb_push_candidate2))
92 continue;
93
94 if (cfun->machine->reg_is_wrapped_separately[regno])
95@@ -9101,7 +9102,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
96
97 machine_mode mode = aarch64_reg_save_mode (regno);
98 reg = gen_rtx_REG (mode, regno);
99- offset = start_offset + cfun->machine->frame.reg_offset[regno];
100+ offset = start_offset + frame.reg_offset[regno];
101 rtx base_rtx = stack_pointer_rtx;
102 poly_int64 sp_offset = offset;
103
104@@ -9114,7 +9115,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
105 {
106 gcc_assert (known_eq (start_offset, 0));
107 poly_int64 fp_offset
108- = cfun->machine->frame.below_hard_fp_saved_regs_size;
109+ = frame.below_hard_fp_saved_regs_size;
110 if (hard_fp_valid_p)
111 base_rtx = hard_frame_pointer_rtx;
112 else
113@@ -9136,8 +9137,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
114 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
115 && !cfun->machine->reg_is_wrapped_separately[regno2]
116 && known_eq (GET_MODE_SIZE (mode),
117- cfun->machine->frame.reg_offset[regno2]
118- - cfun->machine->frame.reg_offset[regno]))
119+ frame.reg_offset[regno2] - frame.reg_offset[regno]))
120 {
121 rtx reg2 = gen_rtx_REG (mode, regno2);
122 rtx mem2;
123@@ -9187,6 +9187,7 @@ static void
124 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
125 unsigned limit, bool skip_wb, rtx *cfi_ops)
126 {
127+ aarch64_frame &frame = cfun->machine->frame;
128 unsigned regno;
129 unsigned regno2;
130 poly_int64 offset;
131@@ -9203,13 +9204,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
132 rtx reg, mem;
133
134 if (skip_wb
135- && (regno == cfun->machine->frame.wb_pop_candidate1
136- || regno == cfun->machine->frame.wb_pop_candidate2))
137+ && (regno == frame.wb_pop_candidate1
138+ || regno == frame.wb_pop_candidate2))
139 continue;
140
141 machine_mode mode = aarch64_reg_save_mode (regno);
142 reg = gen_rtx_REG (mode, regno);
143- offset = start_offset + cfun->machine->frame.reg_offset[regno];
144+ offset = start_offset + frame.reg_offset[regno];
145 rtx base_rtx = stack_pointer_rtx;
146 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
147 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
148@@ -9220,8 +9221,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
149 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
150 && !cfun->machine->reg_is_wrapped_separately[regno2]
151 && known_eq (GET_MODE_SIZE (mode),
152- cfun->machine->frame.reg_offset[regno2]
153- - cfun->machine->frame.reg_offset[regno]))
154+ frame.reg_offset[regno2] - frame.reg_offset[regno]))
155 {
156 rtx reg2 = gen_rtx_REG (mode, regno2);
157 rtx mem2;
158@@ -9326,6 +9326,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
159 static sbitmap
160 aarch64_get_separate_components (void)
161 {
162+ aarch64_frame &frame = cfun->machine->frame;
163 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
164 bitmap_clear (components);
165
166@@ -9342,18 +9343,18 @@ aarch64_get_separate_components (void)
167 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
168 continue;
169
170- poly_int64 offset = cfun->machine->frame.reg_offset[regno];
171+ poly_int64 offset = frame.reg_offset[regno];
172
173 /* If the register is saved in the first SVE save slot, we use
174 it as a stack probe for -fstack-clash-protection. */
175 if (flag_stack_clash_protection
176- && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
177+ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
178 && known_eq (offset, 0))
179 continue;
180
181 /* Get the offset relative to the register we'll use. */
182 if (frame_pointer_needed)
183- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
184+ offset -= frame.below_hard_fp_saved_regs_size;
185 else
186 offset += crtl->outgoing_args_size;
187
188@@ -9372,11 +9373,11 @@ aarch64_get_separate_components (void)
189 /* If the spare predicate register used by big-endian SVE code
190 is call-preserved, it must be saved in the main prologue
191 before any saves that use it. */
192- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
193- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
194+ if (frame.spare_pred_reg != INVALID_REGNUM)
195+ bitmap_clear_bit (components, frame.spare_pred_reg);
196
197- unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
198- unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
199+ unsigned reg1 = frame.wb_push_candidate1;
200+ unsigned reg2 = frame.wb_push_candidate2;
201 /* If registers have been chosen to be stored/restored with
202 writeback don't interfere with them to avoid having to output explicit
203 stack adjustment instructions. */
204@@ -9485,6 +9486,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
205 static void
206 aarch64_process_components (sbitmap components, bool prologue_p)
207 {
208+ aarch64_frame &frame = cfun->machine->frame;
209 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
210 ? HARD_FRAME_POINTER_REGNUM
211 : STACK_POINTER_REGNUM);
212@@ -9499,9 +9501,9 @@ aarch64_process_components (sbitmap components, bool prologue_p)
213 machine_mode mode = aarch64_reg_save_mode (regno);
214
215 rtx reg = gen_rtx_REG (mode, regno);
216- poly_int64 offset = cfun->machine->frame.reg_offset[regno];
217+ poly_int64 offset = frame.reg_offset[regno];
218 if (frame_pointer_needed)
219- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
220+ offset -= frame.below_hard_fp_saved_regs_size;
221 else
222 offset += crtl->outgoing_args_size;
223
224@@ -9526,14 +9528,14 @@ aarch64_process_components (sbitmap components, bool prologue_p)
225 break;
226 }
227
228- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
229+ poly_int64 offset2 = frame.reg_offset[regno2];
230 /* The next register is not of the same class or its offset is not
231 mergeable with the current one into a pair. */
232 if (aarch64_sve_mode_p (mode)
233 || !satisfies_constraint_Ump (mem)
234 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
235 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
236- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
237+ || maybe_ne ((offset2 - frame.reg_offset[regno]),
238 GET_MODE_SIZE (mode)))
239 {
240 insn = emit_insn (set);
241@@ -9555,7 +9557,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
242 /* REGNO2 can be saved/restored in a pair with REGNO. */
243 rtx reg2 = gen_rtx_REG (mode, regno2);
244 if (frame_pointer_needed)
245- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
246+ offset2 -= frame.below_hard_fp_saved_regs_size;
247 else
248 offset2 += crtl->outgoing_args_size;
249 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
250@@ -9650,6 +9652,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
251 bool frame_related_p,
252 bool final_adjustment_p)
253 {
254+ aarch64_frame &frame = cfun->machine->frame;
255 HOST_WIDE_INT guard_size
256 = 1 << param_stack_clash_protection_guard_size;
257 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
258@@ -9670,25 +9673,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
259 register as a probe. We can't assume that LR was saved at position 0
260 though, so treat any space below it as unprobed. */
261 if (final_adjustment_p
262- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
263+ && known_eq (frame.below_hard_fp_saved_regs_size, 0))
264 {
265- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
266+ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
267 if (known_ge (lr_offset, 0))
268 min_probe_threshold -= lr_offset.to_constant ();
269 else
270 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
271 }
272
273- poly_int64 frame_size = cfun->machine->frame.frame_size;
274+ poly_int64 frame_size = frame.frame_size;
275
276 /* We should always have a positive probe threshold. */
277 gcc_assert (min_probe_threshold > 0);
278
279 if (flag_stack_clash_protection && !final_adjustment_p)
280 {
281- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
282- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
283- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
284+ poly_int64 initial_adjust = frame.initial_adjust;
285+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
286+ poly_int64 final_adjust = frame.final_adjust;
287
288 if (known_eq (frame_size, 0))
289 {
290@@ -9977,17 +9980,18 @@ aarch64_epilogue_uses (int regno)
291 void
292 aarch64_expand_prologue (void)
293 {
294- poly_int64 frame_size = cfun->machine->frame.frame_size;
295- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
296- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
297- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
298- poly_int64 callee_offset = cfun->machine->frame.callee_offset;
299- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
300+ aarch64_frame &frame = cfun->machine->frame;
301+ poly_int64 frame_size = frame.frame_size;
302+ poly_int64 initial_adjust = frame.initial_adjust;
303+ HOST_WIDE_INT callee_adjust = frame.callee_adjust;
304+ poly_int64 final_adjust = frame.final_adjust;
305+ poly_int64 callee_offset = frame.callee_offset;
306+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
307 poly_int64 below_hard_fp_saved_regs_size
308- = cfun->machine->frame.below_hard_fp_saved_regs_size;
309- unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
310- unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
311- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
312+ = frame.below_hard_fp_saved_regs_size;
313+ unsigned reg1 = frame.wb_push_candidate1;
314+ unsigned reg2 = frame.wb_push_candidate2;
315+ bool emit_frame_chain = frame.emit_frame_chain;
316 rtx_insn *insn;
317
318 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
319@@ -10018,7 +10022,7 @@ aarch64_expand_prologue (void)
320 }
321
322 /* Push return address to shadow call stack. */
323- if (cfun->machine->frame.is_scs_enabled)
324+ if (frame.is_scs_enabled)
325 emit_insn (gen_scs_push ());
326
327 if (flag_stack_usage_info)
328@@ -10057,7 +10061,7 @@ aarch64_expand_prologue (void)
329
330 /* The offset of the frame chain record (if any) from the current SP. */
331 poly_int64 chain_offset = (initial_adjust + callee_adjust
332- - cfun->machine->frame.hard_fp_offset);
333+ - frame.hard_fp_offset);
334 gcc_assert (known_ge (chain_offset, 0));
335
336 /* The offset of the bottom of the save area from the current SP. */
337@@ -10160,16 +10164,17 @@ aarch64_use_return_insn_p (void)
338 void
339 aarch64_expand_epilogue (bool for_sibcall)
340 {
341- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
342- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
343- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
344- poly_int64 callee_offset = cfun->machine->frame.callee_offset;
345- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
346+ aarch64_frame &frame = cfun->machine->frame;
347+ poly_int64 initial_adjust = frame.initial_adjust;
348+ HOST_WIDE_INT callee_adjust = frame.callee_adjust;
349+ poly_int64 final_adjust = frame.final_adjust;
350+ poly_int64 callee_offset = frame.callee_offset;
351+ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
352 poly_int64 below_hard_fp_saved_regs_size
353- = cfun->machine->frame.below_hard_fp_saved_regs_size;
354- unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
355- unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
356- unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
357+ = frame.below_hard_fp_saved_regs_size;
358+ unsigned reg1 = frame.wb_pop_candidate1;
359+ unsigned reg2 = frame.wb_pop_candidate2;
360+ unsigned int last_gpr = (frame.is_scs_enabled
361 ? R29_REGNUM : R30_REGNUM);
362 rtx cfi_ops = NULL;
363 rtx_insn *insn;
364@@ -10203,7 +10208,7 @@ aarch64_expand_epilogue (bool for_sibcall)
365 /* We need to add memory barrier to prevent read from deallocated stack. */
366 bool need_barrier_p
367 = maybe_ne (get_frame_size ()
368- + cfun->machine->frame.saved_varargs_size, 0);
369+ + frame.saved_varargs_size, 0);
370
371 /* Emit a barrier to prevent loads from a deallocated stack. */
372 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
373@@ -10284,7 +10289,7 @@ aarch64_expand_epilogue (bool for_sibcall)
374 }
375
376 /* Pop return address from shadow call stack. */
377- if (cfun->machine->frame.is_scs_enabled)
378+ if (frame.is_scs_enabled)
379 {
380 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
381 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
382@@ -12740,24 +12745,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
383 poly_int64
384 aarch64_initial_elimination_offset (unsigned from, unsigned to)
385 {
386+ aarch64_frame &frame = cfun->machine->frame;
387+
388 if (to == HARD_FRAME_POINTER_REGNUM)
389 {
390 if (from == ARG_POINTER_REGNUM)
391- return cfun->machine->frame.hard_fp_offset;
392+ return frame.hard_fp_offset;
393
394 if (from == FRAME_POINTER_REGNUM)
395- return cfun->machine->frame.hard_fp_offset
396- - cfun->machine->frame.locals_offset;
397+ return frame.hard_fp_offset - frame.locals_offset;
398 }
399
400 if (to == STACK_POINTER_REGNUM)
401 {
402 if (from == FRAME_POINTER_REGNUM)
403- return cfun->machine->frame.frame_size
404- - cfun->machine->frame.locals_offset;
405+ return frame.frame_size - frame.locals_offset;
406 }
407
408- return cfun->machine->frame.frame_size;
409+ return frame.frame_size;
410 }
411
412
413--
4142.34.1
415
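A minimal, self-contained sketch of the idiom patch 01 applies (the struct and function names below are invented; they only model the cfun->machine->frame access path, not real GCC types):

  #include <cstdint>

  struct frame_info { int64_t reg_offset[32]; };
  struct machine_info { frame_info frame; };
  struct function_info { machine_info *machine; };

  static int64_t
  reg_save_offset (function_info *cfun, unsigned regno)
  {
    /* Bind the long access path to a local reference once...  */
    frame_info &frame = cfun->machine->frame;
    /* ...so every later use stays short, as the patch does for
       aarch64_save_callee_saves and the other heavy users.  */
    return frame.reg_offset[regno];
  }
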
416
417From 89a9fa287706c5011f61926eaf65e7b996b963a3 Mon Sep 17 00:00:00 2001
418From: Richard Sandiford <richard.sandiford@arm.com>
419Date: Tue, 12 Sep 2023 16:07:12 +0100
420Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset
421
422When we emit the frame chain, i.e. when we reach Here in this statement
423of aarch64_expand_prologue:
424
425 if (emit_frame_chain)
426 {
427 // Here
428 ...
429 }
430
431the stack is in one of two states:
432
433- We've allocated up to the frame chain, but no more.
434
435- We've allocated the whole frame, and the frame chain is within easy
436 reach of the new SP.
437
438The offset of the frame chain from the current SP is available
439in aarch64_frame as callee_offset. It is also available as the
440chain_offset local variable, where the latter is calculated from other
441data. (However, chain_offset is not always equal to callee_offset when
442!emit_frame_chain, so chain_offset isn't redundant.)
443
444In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
445chain_offset for the initialisation of the hard frame pointer:
446
447 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
448- stack_pointer_rtx, callee_offset,
449+ stack_pointer_rtx, chain_offset,
450 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
451
452But the later REG_CFA_ADJUST_CFA handling still used callee_offset.
453
454I think the difference is harmless, but it's more logical for the
455CFA note to be in sync, and it's more convenient for later patches
456if it uses chain_offset.
457
458gcc/
459 * config/aarch64/aarch64.cc (aarch64_expand_prologue): Use
460 chain_offset rather than callee_offset.
461---
462 gcc/config/aarch64/aarch64.cc | 4 +---
463 1 file changed, 1 insertion(+), 3 deletions(-)
464
465diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
466index 5d473d161d9..4f233c95140 100644
467--- a/gcc/config/aarch64/aarch64.cc
468+++ b/gcc/config/aarch64/aarch64.cc
469@@ -9985,7 +9985,6 @@ aarch64_expand_prologue (void)
470 poly_int64 initial_adjust = frame.initial_adjust;
471 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
472 poly_int64 final_adjust = frame.final_adjust;
473- poly_int64 callee_offset = frame.callee_offset;
474 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
475 poly_int64 below_hard_fp_saved_regs_size
476 = frame.below_hard_fp_saved_regs_size;
477@@ -10098,8 +10097,7 @@ aarch64_expand_prologue (void)
478 implicit. */
479 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
480 {
481- rtx src = plus_constant (Pmode, stack_pointer_rtx,
482- callee_offset);
483+ rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
484 add_reg_note (insn, REG_CFA_ADJUST_CFA,
485 gen_rtx_SET (hard_frame_pointer_rtx, src));
486 }
487--
4882.34.1
489
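A simplified numerical check of the relationship patch 02 relies on, with plain int64_t standing in for GCC's poly_int64 and all values invented:

  #include <cassert>
  #include <cstdint>

  int
  main ()
  {
    /* Invented frame: a 96-byte push sets up the frame chain at the top
       of the allocation, so the chain record ends up at SP + 0.  */
    int64_t initial_adjust = 0, callee_adjust = 96, hard_fp_offset = 96;
    int64_t chain_offset = initial_adjust + callee_adjust - hard_fp_offset;
    assert (chain_offset >= 0);   /* mirrors the gcc_assert in the prologue */
    /* The REG_CFA_ADJUST_CFA note now describes the hard frame pointer as
       stack pointer + chain_offset, matching how the register was set up.  */
    return 0;
  }
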
490
491From b36a2a78040722dab6124366c5d6baf8eaf80aef Mon Sep 17 00:00:00 2001
492From: Richard Sandiford <richard.sandiford@arm.com>
493Date: Tue, 12 Sep 2023 16:07:13 +0100
494Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
495 registers
496
497If a frame has no saved registers, it can be allocated in one go.
498There is no need to treat the areas below and above the saved
499registers as separate.
500
501And if we allocate the frame in one go, it should be allocated
502as the initial_adjust rather than the final_adjust. This allows the
503frame size to grow to guard_size - guard_used_by_caller before a stack
504probe is needed. (A frame with no register saves is necessarily a
505leaf frame.)
506
507This is a no-op as thing stand, since a leaf function will have
508no outgoing arguments, and so all the frame will be above where
509the saved registers normally go.
510
511gcc/
512 * config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly
513 allocate the frame in one go if there are no saved registers.
514---
515 gcc/config/aarch64/aarch64.cc | 8 +++++---
516 1 file changed, 5 insertions(+), 3 deletions(-)
517
518diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
519index 4f233c95140..37643041ffb 100644
520--- a/gcc/config/aarch64/aarch64.cc
521+++ b/gcc/config/aarch64/aarch64.cc
522@@ -8639,9 +8639,11 @@ aarch64_layout_frame (void)
523
524 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
525 HOST_WIDE_INT const_saved_regs_size;
526- if (frame.frame_size.is_constant (&const_size)
527- && const_size < max_push_offset
528- && known_eq (frame.hard_fp_offset, const_size))
529+ if (known_eq (frame.saved_regs_size, 0))
530+ frame.initial_adjust = frame.frame_size;
531+ else if (frame.frame_size.is_constant (&const_size)
532+ && const_size < max_push_offset
533+ && known_eq (frame.hard_fp_offset, const_size))
534 {
535 /* Simple, small frame with no outgoing arguments:
536
537--
5382.34.1
539
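A minimal sketch of the new special case in patch 03, using simplified names and plain integers in place of poly_int64 (a model of the decision, not the real aarch64_layout_frame):

  #include <cstdint>

  struct layout
  {
    int64_t initial_adjust = 0;
    int64_t callee_adjust = 0;
    int64_t final_adjust = 0;
  };

  static layout
  choose_frame_allocation (int64_t saved_regs_size, int64_t frame_size)
  {
    layout l;
    if (saved_regs_size == 0)
      /* No saved registers: allocate the whole frame up front, so it can
         grow to guard_size - guard_used_by_caller before a probe is
         needed (such a frame is necessarily a leaf frame).  */
      l.initial_adjust = frame_size;
    else
      {
        /* ... the pre-existing cases, unchanged by this patch ... */
      }
    return l;
  }
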
540
541From ada2ab0093596be707f23a3466ac82cff59fcffe Mon Sep 17 00:00:00 2001
542From: Richard Sandiford <richard.sandiford@arm.com>
543Date: Tue, 12 Sep 2023 16:07:13 +0100
544Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info
545
546The frame layout code currently hard-codes the assumption that
547the number of bytes below the saved registers is equal to the
548size of the outgoing arguments. This patch abstracts that
549value into a new field of aarch64_frame.
550
551gcc/
552 * config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New
553 field.
554 * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it,
555 and use it instead of crtl->outgoing_args_size.
556 (aarch64_get_separate_components): Use bytes_below_saved_regs instead
557 of outgoing_args_size.
558 (aarch64_process_components): Likewise.
559---
560 gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++-----------------
561 gcc/config/aarch64/aarch64.h | 5 +++
562 2 files changed, 41 insertions(+), 35 deletions(-)
563
564diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
565index 37643041ffb..dacc2b0e4dd 100644
566--- a/gcc/config/aarch64/aarch64.cc
567+++ b/gcc/config/aarch64/aarch64.cc
568@@ -8478,6 +8478,8 @@ aarch64_layout_frame (void)
569 gcc_assert (crtl->is_leaf
570 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
571
572+ frame.bytes_below_saved_regs = crtl->outgoing_args_size;
573+
574 /* Now assign stack slots for the registers. Start with the predicate
575 registers, since predicate LDR and STR have a relatively small
576 offset range. These saves happen below the hard frame pointer. */
577@@ -8582,18 +8584,18 @@ aarch64_layout_frame (void)
578
579 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
580
581- poly_int64 above_outgoing_args
582+ poly_int64 saved_regs_and_above
583 = aligned_upper_bound (varargs_and_saved_regs_size
584 + get_frame_size (),
585 STACK_BOUNDARY / BITS_PER_UNIT);
586
587 frame.hard_fp_offset
588- = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
589+ = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
590
591 /* Both these values are already aligned. */
592- gcc_assert (multiple_p (crtl->outgoing_args_size,
593+ gcc_assert (multiple_p (frame.bytes_below_saved_regs,
594 STACK_BOUNDARY / BITS_PER_UNIT));
595- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
596+ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
597
598 frame.locals_offset = frame.saved_varargs_size;
599
600@@ -8637,7 +8639,7 @@ aarch64_layout_frame (void)
601 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
602 max_push_offset = 256;
603
604- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
605+ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
606 HOST_WIDE_INT const_saved_regs_size;
607 if (known_eq (frame.saved_regs_size, 0))
608 frame.initial_adjust = frame.frame_size;
609@@ -8645,31 +8647,31 @@ aarch64_layout_frame (void)
610 && const_size < max_push_offset
611 && known_eq (frame.hard_fp_offset, const_size))
612 {
613- /* Simple, small frame with no outgoing arguments:
614+ /* Simple, small frame with no data below the saved registers.
615
616 stp reg1, reg2, [sp, -frame_size]!
617 stp reg3, reg4, [sp, 16] */
618 frame.callee_adjust = const_size;
619 }
620- else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
621+ else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
622 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
623- && const_outgoing_args_size + const_saved_regs_size < 512
624- /* We could handle this case even with outgoing args, provided
625- that the number of args left us with valid offsets for all
626- predicate and vector save slots. It's such a rare case that
627- it hardly seems worth the effort though. */
628- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
629+ && const_below_saved_regs + const_saved_regs_size < 512
630+ /* We could handle this case even with data below the saved
631+ registers, provided that that data left us with valid offsets
632+ for all predicate and vector save slots. It's such a rare
633+ case that it hardly seems worth the effort though. */
634+ && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
635 && !(cfun->calls_alloca
636 && frame.hard_fp_offset.is_constant (&const_fp_offset)
637 && const_fp_offset < max_push_offset))
638 {
639- /* Frame with small outgoing arguments:
640+ /* Frame with small area below the saved registers:
641
642 sub sp, sp, frame_size
643- stp reg1, reg2, [sp, outgoing_args_size]
644- stp reg3, reg4, [sp, outgoing_args_size + 16] */
645+ stp reg1, reg2, [sp, bytes_below_saved_regs]
646+ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
647 frame.initial_adjust = frame.frame_size;
648- frame.callee_offset = const_outgoing_args_size;
649+ frame.callee_offset = const_below_saved_regs;
650 }
651 else if (saves_below_hard_fp_p
652 && known_eq (frame.saved_regs_size,
653@@ -8679,30 +8681,29 @@ aarch64_layout_frame (void)
654
655 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
656 save SVE registers relative to SP
657- sub sp, sp, outgoing_args_size */
658+ sub sp, sp, bytes_below_saved_regs */
659 frame.initial_adjust = (frame.hard_fp_offset
660 + frame.below_hard_fp_saved_regs_size);
661- frame.final_adjust = crtl->outgoing_args_size;
662+ frame.final_adjust = frame.bytes_below_saved_regs;
663 }
664 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
665 && const_fp_offset < max_push_offset)
666 {
667- /* Frame with large outgoing arguments or SVE saves, but with
668- a small local area:
669+ /* Frame with large area below the saved registers, or with SVE saves,
670+ but with a small area above:
671
672 stp reg1, reg2, [sp, -hard_fp_offset]!
673 stp reg3, reg4, [sp, 16]
674 [sub sp, sp, below_hard_fp_saved_regs_size]
675 [save SVE registers relative to SP]
676- sub sp, sp, outgoing_args_size */
677+ sub sp, sp, bytes_below_saved_regs */
678 frame.callee_adjust = const_fp_offset;
679 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
680- frame.final_adjust = crtl->outgoing_args_size;
681+ frame.final_adjust = frame.bytes_below_saved_regs;
682 }
683 else
684 {
685- /* Frame with large local area and outgoing arguments or SVE saves,
686- using frame pointer:
687+ /* General case:
688
689 sub sp, sp, hard_fp_offset
690 stp x29, x30, [sp, 0]
691@@ -8710,10 +8711,10 @@ aarch64_layout_frame (void)
692 stp reg3, reg4, [sp, 16]
693 [sub sp, sp, below_hard_fp_saved_regs_size]
694 [save SVE registers relative to SP]
695- sub sp, sp, outgoing_args_size */
696+ sub sp, sp, bytes_below_saved_regs */
697 frame.initial_adjust = frame.hard_fp_offset;
698 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
699- frame.final_adjust = crtl->outgoing_args_size;
700+ frame.final_adjust = frame.bytes_below_saved_regs;
701 }
702
703 /* Make sure the individual adjustments add up to the full frame size. */
704@@ -9358,7 +9359,7 @@ aarch64_get_separate_components (void)
705 if (frame_pointer_needed)
706 offset -= frame.below_hard_fp_saved_regs_size;
707 else
708- offset += crtl->outgoing_args_size;
709+ offset += frame.bytes_below_saved_regs;
710
711 /* Check that we can access the stack slot of the register with one
712 direct load with no adjustments needed. */
713@@ -9507,7 +9508,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
714 if (frame_pointer_needed)
715 offset -= frame.below_hard_fp_saved_regs_size;
716 else
717- offset += crtl->outgoing_args_size;
718+ offset += frame.bytes_below_saved_regs;
719
720 rtx addr = plus_constant (Pmode, ptr_reg, offset);
721 rtx mem = gen_frame_mem (mode, addr);
722@@ -9561,7 +9562,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
723 if (frame_pointer_needed)
724 offset2 -= frame.below_hard_fp_saved_regs_size;
725 else
726- offset2 += crtl->outgoing_args_size;
727+ offset2 += frame.bytes_below_saved_regs;
728 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
729 rtx mem2 = gen_frame_mem (mode, addr2);
730 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
731@@ -9635,10 +9636,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
732 registers. If POLY_SIZE is not large enough to require a probe this function
733 will only adjust the stack. When allocating the stack space
734 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
735- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
736- arguments. If we are then we ensure that any allocation larger than the ABI
737- defined buffer needs a probe so that the invariant of having a 1KB buffer is
738- maintained.
739+ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
740+ the saved registers. If we are then we ensure that any allocation
741+ larger than the ABI defined buffer needs a probe so that the
742+ invariant of having a 1KB buffer is maintained.
743
744 We emit barriers after each stack adjustment to prevent optimizations from
745 breaking the invariant that we never drop the stack more than a page. This
746@@ -9847,7 +9848,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
747 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
748 be probed. This maintains the requirement that each page is probed at
749 least once. For initial probing we probe only if the allocation is
750- more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
751+ more than GUARD_SIZE - buffer, and below the saved registers we probe
752 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
753 GUARD_SIZE. This works that for any allocation that is large enough to
754 trigger a probe here, we'll have at least one, and if they're not large
755diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
756index 73b09e20508..0b6faa3ddf1 100644
757--- a/gcc/config/aarch64/aarch64.h
758+++ b/gcc/config/aarch64/aarch64.h
759@@ -777,6 +777,11 @@ struct GTY (()) aarch64_frame
760 /* The size of the callee-save registers with a slot in REG_OFFSET. */
761 poly_int64 saved_regs_size;
762
763+ /* The number of bytes between the bottom of the static frame (the bottom
764+ of the outgoing arguments) and the bottom of the register save area.
765+ This value is always a multiple of STACK_BOUNDARY. */
766+ poly_int64 bytes_below_saved_regs;
767+
768 /* The size of the callee-save registers with a slot in REG_OFFSET that
769 are saved below the hard frame pointer. */
770 poly_int64 below_hard_fp_saved_regs_size;
771--
7722.34.1
773
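A small worked example of what the new bytes_below_saved_regs field measures, with invented numbers and plain int64_t in place of poly_int64:

  #include <cassert>
  #include <cstdint>

  int
  main ()
  {
    /* 32 bytes of outgoing arguments sit at the bottom of the static
       frame, so the register save area starts 32 bytes above it.  */
    int64_t outgoing_args_size = 32;
    int64_t bytes_below_saved_regs = outgoing_args_size;
    /* A register whose reg_offset within the save area is 16 therefore
       sits 16 + 32 bytes above the bottom of the frame, i.e. at SP + 48
       once the stack pointer has dropped to the bottom of the frame.  */
    int64_t reg_offset = 16;
    assert (reg_offset + bytes_below_saved_regs == 48);
    return 0;
  }
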
774
775From 82f6b3e1b596ef0f4e3ac3bb9c6e88fb4458f402 Mon Sep 17 00:00:00 2001
776From: Richard Sandiford <richard.sandiford@arm.com>
777Date: Tue, 12 Sep 2023 16:07:14 +0100
778Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info
779
780Following on from the previous bytes_below_saved_regs patch, this one
781records the number of bytes that are below the hard frame pointer.
782This eventually replaces below_hard_fp_saved_regs_size.
783
784If a frame pointer is not needed, the epilogue adds final_adjust
785to the stack pointer before restoring registers:
786
787 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
788
789Therefore, if the epilogue needs to restore the stack pointer from
790the hard frame pointer, the directly corresponding offset is:
791
792 -bytes_below_hard_fp + final_adjust
793
794i.e. go from the hard frame pointer to the bottom of the frame,
795then add the same amount as if we were using the stack pointer
796from the outset.
797
798gcc/
799 * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
800 field.
801 * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it.
802 (aarch64_expand_epilogue): Use it instead of
803 below_hard_fp_saved_regs_size.
804---
805 gcc/config/aarch64/aarch64.cc | 6 +++---
806 gcc/config/aarch64/aarch64.h | 5 +++++
807 2 files changed, 8 insertions(+), 3 deletions(-)
808
809diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
810index dacc2b0e4dd..a3f7aabcc59 100644
811--- a/gcc/config/aarch64/aarch64.cc
812+++ b/gcc/config/aarch64/aarch64.cc
813@@ -8530,6 +8530,7 @@ aarch64_layout_frame (void)
814 of the callee save area. */
815 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
816 frame.below_hard_fp_saved_regs_size = offset;
817+ frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
818 if (frame.emit_frame_chain)
819 {
820 /* FP and LR are placed in the linkage record. */
821@@ -10171,8 +10172,7 @@ aarch64_expand_epilogue (bool for_sibcall)
822 poly_int64 final_adjust = frame.final_adjust;
823 poly_int64 callee_offset = frame.callee_offset;
824 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
825- poly_int64 below_hard_fp_saved_regs_size
826- = frame.below_hard_fp_saved_regs_size;
827+ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
828 unsigned reg1 = frame.wb_pop_candidate1;
829 unsigned reg2 = frame.wb_pop_candidate2;
830 unsigned int last_gpr = (frame.is_scs_enabled
831@@ -10230,7 +10230,7 @@ aarch64_expand_epilogue (bool for_sibcall)
832 is restored on the instruction doing the writeback. */
833 aarch64_add_offset (Pmode, stack_pointer_rtx,
834 hard_frame_pointer_rtx,
835- -callee_offset - below_hard_fp_saved_regs_size,
836+ -bytes_below_hard_fp + final_adjust,
837 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
838 else
839 /* The case where we need to re-use the register here is very rare, so
840diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
841index 0b6faa3ddf1..4263d29d29d 100644
842--- a/gcc/config/aarch64/aarch64.h
843+++ b/gcc/config/aarch64/aarch64.h
844@@ -786,6 +786,11 @@ struct GTY (()) aarch64_frame
845 are saved below the hard frame pointer. */
846 poly_int64 below_hard_fp_saved_regs_size;
847
848+ /* The number of bytes between the bottom of the static frame (the bottom
849+ of the outgoing arguments) and the hard frame pointer. This value is
850+ always a multiple of STACK_BOUNDARY. */
851+ poly_int64 bytes_below_hard_fp;
852+
853 /* Offset from the base of the frame (incomming SP) to the
854 top of the locals area. This value is always a multiple of
855 STACK_BOUNDARY. */
856--
8572.34.1
858
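A simplified numerical restatement of the epilogue offset derived in the patch 05 commit message (addresses and sizes invented; plain int64_t in place of poly_int64):

  #include <cassert>
  #include <cstdint>

  int
  main ()
  {
    int64_t hard_fp = 0x10000;           /* address held in the frame pointer */
    int64_t bytes_below_hard_fp = 272;   /* bytes of frame below the hard FP */
    int64_t final_adjust = 64;           /* the prologue's final SP adjustment */

    /* Step 1: go from the hard frame pointer to the bottom of the frame.  */
    int64_t frame_bottom = hard_fp - bytes_below_hard_fp;
    /* Step 2: add the same amount as if SP had been used from the outset.  */
    int64_t target_sp = frame_bottom + final_adjust;

    /* This is the single offset the patch passes to aarch64_add_offset.  */
    assert (target_sp == hard_fp + (-bytes_below_hard_fp + final_adjust));
    return 0;
  }
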
859
860From 86fa43e9fe4a8bf954f2919f07cbe3646d1d1df3 Mon Sep 17 00:00:00 2001
861From: Richard Sandiford <richard.sandiford@arm.com>
862Date: Tue, 12 Sep 2023 16:07:14 +0100
863Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves
864
865aarch64_save_callee_saves and aarch64_restore_callee_saves took
866a parameter called start_offset that gives the offset of the
867bottom of the saved register area from the current stack pointer.
868However, it's more convenient for later patches if we use the
869bottom of the entire frame as the reference point, rather than
870the bottom of the saved registers.
871
872Doing that removes the need for the callee_offset field.
873Other than that, this is not a win on its own. It only really
874makes sense in combination with the follow-on patches.
875
876gcc/
877 * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
878 * config/aarch64/aarch64.cc (aarch64_layout_frame): Remove
879 callee_offset handling.
880 (aarch64_save_callee_saves): Replace the start_offset parameter
881 with a bytes_below_sp parameter.
882 (aarch64_restore_callee_saves): Likewise.
883 (aarch64_expand_prologue): Update accordingly.
884 (aarch64_expand_epilogue): Likewise.
885---
886 gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------
887 gcc/config/aarch64/aarch64.h | 4 ---
888 2 files changed, 28 insertions(+), 32 deletions(-)
889
890diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
891index a3f7aabcc59..46ae5cf7673 100644
892--- a/gcc/config/aarch64/aarch64.cc
893+++ b/gcc/config/aarch64/aarch64.cc
894@@ -8604,7 +8604,6 @@ aarch64_layout_frame (void)
895 frame.final_adjust = 0;
896 frame.callee_adjust = 0;
897 frame.sve_callee_adjust = 0;
898- frame.callee_offset = 0;
899
900 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
901 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
902@@ -8672,7 +8671,6 @@ aarch64_layout_frame (void)
903 stp reg1, reg2, [sp, bytes_below_saved_regs]
904 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
905 frame.initial_adjust = frame.frame_size;
906- frame.callee_offset = const_below_saved_regs;
907 }
908 else if (saves_below_hard_fp_p
909 && known_eq (frame.saved_regs_size,
910@@ -9073,12 +9071,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
911 }
912
913 /* Emit code to save the callee-saved registers from register number START
914- to LIMIT to the stack at the location starting at offset START_OFFSET,
915- skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
916- is true if the hard frame pointer has been set up. */
917+ to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP
918+ bytes above the bottom of the static frame. Skip any write-back
919+ candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard
920+ frame pointer has been set up. */
921
922 static void
923-aarch64_save_callee_saves (poly_int64 start_offset,
924+aarch64_save_callee_saves (poly_int64 bytes_below_sp,
925 unsigned start, unsigned limit, bool skip_wb,
926 bool hard_fp_valid_p)
927 {
928@@ -9106,7 +9105,9 @@ aarch64_save_callee_saves (poly_int64 start_offset,
929
930 machine_mode mode = aarch64_reg_save_mode (regno);
931 reg = gen_rtx_REG (mode, regno);
932- offset = start_offset + frame.reg_offset[regno];
933+ offset = (frame.reg_offset[regno]
934+ + frame.bytes_below_saved_regs
935+ - bytes_below_sp);
936 rtx base_rtx = stack_pointer_rtx;
937 poly_int64 sp_offset = offset;
938
939@@ -9117,9 +9118,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
940 else if (GP_REGNUM_P (regno)
941 && (!offset.is_constant (&const_offset) || const_offset >= 512))
942 {
943- gcc_assert (known_eq (start_offset, 0));
944- poly_int64 fp_offset
945- = frame.below_hard_fp_saved_regs_size;
946+ poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
947 if (hard_fp_valid_p)
948 base_rtx = hard_frame_pointer_rtx;
949 else
950@@ -9183,12 +9182,13 @@ aarch64_save_callee_saves (poly_int64 start_offset,
951 }
952
953 /* Emit code to restore the callee registers from register number START
954- up to and including LIMIT. Restore from the stack offset START_OFFSET,
955- skipping any write-back candidates if SKIP_WB is true. Write the
956- appropriate REG_CFA_RESTORE notes into CFI_OPS. */
957+ up to and including LIMIT. The stack pointer is currently BYTES_BELOW_SP
958+ bytes above the bottom of the static frame. Skip any write-back
959+ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE
960+ notes into CFI_OPS. */
961
962 static void
963-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
964+aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
965 unsigned limit, bool skip_wb, rtx *cfi_ops)
966 {
967 aarch64_frame &frame = cfun->machine->frame;
968@@ -9214,7 +9214,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
969
970 machine_mode mode = aarch64_reg_save_mode (regno);
971 reg = gen_rtx_REG (mode, regno);
972- offset = start_offset + frame.reg_offset[regno];
973+ offset = (frame.reg_offset[regno]
974+ + frame.bytes_below_saved_regs
975+ - bytes_below_sp);
976 rtx base_rtx = stack_pointer_rtx;
977 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
978 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
979@@ -9990,8 +9992,6 @@ aarch64_expand_prologue (void)
980 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
981 poly_int64 final_adjust = frame.final_adjust;
982 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
983- poly_int64 below_hard_fp_saved_regs_size
984- = frame.below_hard_fp_saved_regs_size;
985 unsigned reg1 = frame.wb_push_candidate1;
986 unsigned reg2 = frame.wb_push_candidate2;
987 bool emit_frame_chain = frame.emit_frame_chain;
988@@ -10067,8 +10067,8 @@ aarch64_expand_prologue (void)
989 - frame.hard_fp_offset);
990 gcc_assert (known_ge (chain_offset, 0));
991
992- /* The offset of the bottom of the save area from the current SP. */
993- poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
994+ /* The offset of the current SP from the bottom of the static frame. */
995+ poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
996
997 if (emit_frame_chain)
998 {
999@@ -10076,7 +10076,7 @@ aarch64_expand_prologue (void)
1000 {
1001 reg1 = R29_REGNUM;
1002 reg2 = R30_REGNUM;
1003- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
1004+ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
1005 false, false);
1006 }
1007 else
1008@@ -10116,7 +10116,7 @@ aarch64_expand_prologue (void)
1009 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
1010 }
1011
1012- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
1013+ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
1014 callee_adjust != 0 || emit_frame_chain,
1015 emit_frame_chain);
1016 if (maybe_ne (sve_callee_adjust, 0))
1017@@ -10126,16 +10126,17 @@ aarch64_expand_prologue (void)
1018 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
1019 sve_callee_adjust,
1020 !frame_pointer_needed, false);
1021- saved_regs_offset += sve_callee_adjust;
1022+ bytes_below_sp -= sve_callee_adjust;
1023 }
1024- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
1025+ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
1026 false, emit_frame_chain);
1027- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
1028+ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
1029 callee_adjust != 0 || emit_frame_chain,
1030 emit_frame_chain);
1031
1032 /* We may need to probe the final adjustment if it is larger than the guard
1033 that is assumed by the called. */
1034+ gcc_assert (known_eq (bytes_below_sp, final_adjust));
1035 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
1036 !frame_pointer_needed, true);
1037 }
1038@@ -10170,7 +10171,6 @@ aarch64_expand_epilogue (bool for_sibcall)
1039 poly_int64 initial_adjust = frame.initial_adjust;
1040 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
1041 poly_int64 final_adjust = frame.final_adjust;
1042- poly_int64 callee_offset = frame.callee_offset;
1043 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
1044 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
1045 unsigned reg1 = frame.wb_pop_candidate1;
1046@@ -10240,9 +10240,9 @@ aarch64_expand_epilogue (bool for_sibcall)
1047
1048 /* Restore the vector registers before the predicate registers,
1049 so that we can use P4 as a temporary for big-endian SVE frames. */
1050- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
1051+ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
1052 callee_adjust != 0, &cfi_ops);
1053- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
1054+ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
1055 false, &cfi_ops);
1056 if (maybe_ne (sve_callee_adjust, 0))
1057 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
1058@@ -10250,7 +10250,7 @@ aarch64_expand_epilogue (bool for_sibcall)
1059 /* When shadow call stack is enabled, the scs_pop in the epilogue will
1060 restore x30, we don't need to restore x30 again in the traditional
1061 way. */
1062- aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
1063+ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
1064 R0_REGNUM, last_gpr,
1065 callee_adjust != 0, &cfi_ops);
1066
1067diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1068index 4263d29d29d..fd820b1be4e 100644
1069--- a/gcc/config/aarch64/aarch64.h
1070+++ b/gcc/config/aarch64/aarch64.h
1071@@ -813,10 +813,6 @@ struct GTY (()) aarch64_frame
1072 It is zero when no push is used. */
1073 HOST_WIDE_INT callee_adjust;
1074
1075- /* The offset from SP to the callee-save registers after initial_adjust.
1076- It may be non-zero if no push is used (ie. callee_adjust == 0). */
1077- poly_int64 callee_offset;
1078-
1079 /* The size of the stack adjustment before saving or after restoring
1080 SVE registers. */
1081 poly_int64 sve_callee_adjust;
1082--
10832.34.1
1084
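A small worked example of the new offset calculation used by aarch64_save_callee_saves and aarch64_restore_callee_saves after patch 06, again with invented numbers and plain int64_t:

  #include <cassert>
  #include <cstdint>

  int
  main ()
  {
    int64_t bytes_below_saved_regs = 32;  /* e.g. the outgoing arguments */
    int64_t reg_offset = 16;              /* slot within the save area */
    int64_t bytes_below_sp = 48;          /* SP is 48 bytes above the frame bottom */

    /* Offset of the save slot from the current SP, as computed in the patch:
       reg_offset + bytes_below_saved_regs - bytes_below_sp.  */
    int64_t sp_offset = reg_offset + bytes_below_saved_regs - bytes_below_sp;
    assert (sp_offset == 0);   /* here the slot happens to sit exactly at SP */
    return 0;
  }
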
1085
1086From 8ae9181426f2700c2e5a2909487fa630e6fa406b Mon Sep 17 00:00:00 2001
1087From: Richard Sandiford <richard.sandiford@arm.com>
1088Date: Tue, 12 Sep 2023 16:07:15 +0100
1089Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
1090 chain
1091
1092After previous patches, it is no longer necessary to calculate
1093a chain_offset in cases where there is no chain record.
1094
1095gcc/
1096 * config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the
1097 calculation of chain_offset into the emit_frame_chain block.
1098---
1099 gcc/config/aarch64/aarch64.cc | 10 +++++-----
1100 1 file changed, 5 insertions(+), 5 deletions(-)
1101
1102diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1103index 46ae5cf7673..0e9b9717c08 100644
1104--- a/gcc/config/aarch64/aarch64.cc
1105+++ b/gcc/config/aarch64/aarch64.cc
1106@@ -10062,16 +10062,16 @@ aarch64_expand_prologue (void)
1107 if (callee_adjust != 0)
1108 aarch64_push_regs (reg1, reg2, callee_adjust);
1109
1110- /* The offset of the frame chain record (if any) from the current SP. */
1111- poly_int64 chain_offset = (initial_adjust + callee_adjust
1112- - frame.hard_fp_offset);
1113- gcc_assert (known_ge (chain_offset, 0));
1114-
1115 /* The offset of the current SP from the bottom of the static frame. */
1116 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
1117
1118 if (emit_frame_chain)
1119 {
1120+ /* The offset of the frame chain record (if any) from the current SP. */
1121+ poly_int64 chain_offset = (initial_adjust + callee_adjust
1122+ - frame.hard_fp_offset);
1123+ gcc_assert (known_ge (chain_offset, 0));
1124+
1125 if (callee_adjust == 0)
1126 {
1127 reg1 = R29_REGNUM;
1128--
11292.34.1
1130
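A minimal sketch of the restructuring in patch 07, with simplified names (the real code manipulates RTL; this only shows where the computation now lives):

  #include <cstdint>

  static void
  prologue_fragment (bool emit_frame_chain, int64_t initial_adjust,
                     int64_t callee_adjust, int64_t hard_fp_offset)
  {
    if (emit_frame_chain)
      {
        /* Computed only when a frame chain record is actually emitted,
           rather than unconditionally at the top of the prologue.  */
        int64_t chain_offset = initial_adjust + callee_adjust - hard_fp_offset;
        (void) chain_offset;   /* ... used to set up x29 and the CFA note ... */
      }
  }
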
1131
1132From 375794feb614cee1f41b710b9cc1b6f25da6c1cb Mon Sep 17 00:00:00 2001
1133From: Richard Sandiford <richard.sandiford@arm.com>
1134Date: Tue, 12 Sep 2023 16:07:15 +0100
1135Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
1136MIME-Version: 1.0
1137Content-Type: text/plain; charset=UTF-8
1138Content-Transfer-Encoding: 8bit
1139
1140locals_offset was described as:
1141
1142 /* Offset from the base of the frame (incomming SP) to the
1143 top of the locals area. This value is always a multiple of
1144 STACK_BOUNDARY. */
1145
1146This is implicitly an “upside down” view of the frame: the incoming
1147SP is at offset 0, and anything N bytes below the incoming SP is at
1148offset N (rather than -N).
1149
1150However, reg_offset instead uses a “right way up” view; that is,
1151it views offsets in address terms. Something above X is at a
1152positive offset from X and something below X is at a negative
1153offset from X.
1154
1155Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
1156target-independent code views offsets in address terms too:
1157locals are allocated at negative offsets to virtual_stack_vars.
1158
1159It seems confusing to have *_offset fields of the same structure
1160using different polarities like this. This patch tries to avoid
1161that by renaming locals_offset to bytes_above_locals.
1162
1163gcc/
1164 * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
1165 (aarch64_frame::bytes_above_locals): ...this.
1166 * config/aarch64/aarch64.cc (aarch64_layout_frame)
1167 (aarch64_initial_elimination_offset): Update accordingly.
1168---
1169 gcc/config/aarch64/aarch64.cc | 6 +++---
1170 gcc/config/aarch64/aarch64.h | 6 +++---
1171 2 files changed, 6 insertions(+), 6 deletions(-)
1172
1173diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1174index 0e9b9717c08..0a22f91520e 100644
1175--- a/gcc/config/aarch64/aarch64.cc
1176+++ b/gcc/config/aarch64/aarch64.cc
1177@@ -8598,7 +8598,7 @@ aarch64_layout_frame (void)
1178 STACK_BOUNDARY / BITS_PER_UNIT));
1179 frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
1180
1181- frame.locals_offset = frame.saved_varargs_size;
1182+ frame.bytes_above_locals = frame.saved_varargs_size;
1183
1184 frame.initial_adjust = 0;
1185 frame.final_adjust = 0;
1186@@ -12754,13 +12754,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
1187 return frame.hard_fp_offset;
1188
1189 if (from == FRAME_POINTER_REGNUM)
1190- return frame.hard_fp_offset - frame.locals_offset;
1191+ return frame.hard_fp_offset - frame.bytes_above_locals;
1192 }
1193
1194 if (to == STACK_POINTER_REGNUM)
1195 {
1196 if (from == FRAME_POINTER_REGNUM)
1197- return frame.frame_size - frame.locals_offset;
1198+ return frame.frame_size - frame.bytes_above_locals;
1199 }
1200
1201 return frame.frame_size;
1202diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1203index fd820b1be4e..7ae12d13e2b 100644
1204--- a/gcc/config/aarch64/aarch64.h
1205+++ b/gcc/config/aarch64/aarch64.h
1206@@ -791,10 +791,10 @@ struct GTY (()) aarch64_frame
1207 always a multiple of STACK_BOUNDARY. */
1208 poly_int64 bytes_below_hard_fp;
1209
1210- /* Offset from the base of the frame (incomming SP) to the
1211- top of the locals area. This value is always a multiple of
1212+ /* The number of bytes between the top of the locals area and the top
1213+ of the frame (the incomming SP). This value is always a multiple of
1214 STACK_BOUNDARY. */
1215- poly_int64 locals_offset;
1216+ poly_int64 bytes_above_locals;
1217
1218 /* Offset from the base of the frame (incomming SP) to the
1219 hard_frame_pointer. This value is always a multiple of
1220--
12212.34.1
1222
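A short numerical illustration of the two offset polarities the patch 08 commit message contrasts (addresses invented; the renamed field keeps the same value, only its name now says which way it is measured):

  #include <cassert>
  #include <cstdint>

  int
  main ()
  {
    int64_t incoming_sp = 0x10000;
    int64_t top_of_locals = incoming_sp - 16;

    /* "Upside down" view (the old locals_offset): how far below the
       incoming SP the top of the locals area is.  */
    int64_t bytes_above_locals = incoming_sp - top_of_locals;            /* 16 */

    /* Address-terms view (the reg_offset convention): something below a
       base is at a negative offset from it.  */
    int64_t locals_offset_in_address_terms = top_of_locals - incoming_sp;   /* -16 */

    assert (bytes_above_locals == 16 && locals_offset_in_address_terms == -16);
    return 0;
  }
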
1223
1224From 1a9ea1c45c75615ffbfabe652b3598a1d7be2168 Mon Sep 17 00:00:00 2001
1225From: Richard Sandiford <richard.sandiford@arm.com>
1226Date: Tue, 12 Sep 2023 16:07:16 +0100
1227Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
1228MIME-Version: 1.0
1229Content-Type: text/plain; charset=UTF-8
1230Content-Transfer-Encoding: 8bit
1231
1232Similarly to the previous locals_offset patch, hard_fp_offset
1233was described as:
1234
1235 /* Offset from the base of the frame (incomming SP) to the
1236 hard_frame_pointer. This value is always a multiple of
1237 STACK_BOUNDARY. */
1238 poly_int64 hard_fp_offset;
1239
1240which again took an “upside-down” view: higher offsets meant lower
1241addresses. This patch renames the field to bytes_above_hard_fp instead.
1242
1243gcc/
1244 * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
1245 to...
1246 (aarch64_frame::bytes_above_hard_fp): ...this.
1247 * config/aarch64/aarch64.cc (aarch64_layout_frame)
1248 (aarch64_expand_prologue): Update accordingly.
1249 (aarch64_initial_elimination_offset): Likewise.
1250---
1251 gcc/config/aarch64/aarch64.cc | 26 +++++++++++++-------------
1252 gcc/config/aarch64/aarch64.h | 6 +++---
1253 2 files changed, 16 insertions(+), 16 deletions(-)
1254
1255diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1256index 0a22f91520e..95499ae49ba 100644
1257--- a/gcc/config/aarch64/aarch64.cc
1258+++ b/gcc/config/aarch64/aarch64.cc
1259@@ -8590,7 +8590,7 @@ aarch64_layout_frame (void)
1260 + get_frame_size (),
1261 STACK_BOUNDARY / BITS_PER_UNIT);
1262
1263- frame.hard_fp_offset
1264+ frame.bytes_above_hard_fp
1265 = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
1266
1267 /* Both these values are already aligned. */
1268@@ -8639,13 +8639,13 @@ aarch64_layout_frame (void)
1269 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
1270 max_push_offset = 256;
1271
1272- HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
1273+ HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
1274 HOST_WIDE_INT const_saved_regs_size;
1275 if (known_eq (frame.saved_regs_size, 0))
1276 frame.initial_adjust = frame.frame_size;
1277 else if (frame.frame_size.is_constant (&const_size)
1278 && const_size < max_push_offset
1279- && known_eq (frame.hard_fp_offset, const_size))
1280+ && known_eq (frame.bytes_above_hard_fp, const_size))
1281 {
1282 /* Simple, small frame with no data below the saved registers.
1283
1284@@ -8662,8 +8662,8 @@ aarch64_layout_frame (void)
1285 case that it hardly seems worth the effort though. */
1286 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
1287 && !(cfun->calls_alloca
1288- && frame.hard_fp_offset.is_constant (&const_fp_offset)
1289- && const_fp_offset < max_push_offset))
1290+ && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1291+ && const_above_fp < max_push_offset))
1292 {
1293 /* Frame with small area below the saved registers:
1294
1295@@ -8681,12 +8681,12 @@ aarch64_layout_frame (void)
1296 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
1297 save SVE registers relative to SP
1298 sub sp, sp, bytes_below_saved_regs */
1299- frame.initial_adjust = (frame.hard_fp_offset
1300+ frame.initial_adjust = (frame.bytes_above_hard_fp
1301 + frame.below_hard_fp_saved_regs_size);
1302 frame.final_adjust = frame.bytes_below_saved_regs;
1303 }
1304- else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
1305- && const_fp_offset < max_push_offset)
1306+ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1307+ && const_above_fp < max_push_offset)
1308 {
1309 /* Frame with large area below the saved registers, or with SVE saves,
1310 but with a small area above:
1311@@ -8696,7 +8696,7 @@ aarch64_layout_frame (void)
1312 [sub sp, sp, below_hard_fp_saved_regs_size]
1313 [save SVE registers relative to SP]
1314 sub sp, sp, bytes_below_saved_regs */
1315- frame.callee_adjust = const_fp_offset;
1316+ frame.callee_adjust = const_above_fp;
1317 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
1318 frame.final_adjust = frame.bytes_below_saved_regs;
1319 }
1320@@ -8711,7 +8711,7 @@ aarch64_layout_frame (void)
1321 [sub sp, sp, below_hard_fp_saved_regs_size]
1322 [save SVE registers relative to SP]
1323 sub sp, sp, bytes_below_saved_regs */
1324- frame.initial_adjust = frame.hard_fp_offset;
1325+ frame.initial_adjust = frame.bytes_above_hard_fp;
1326 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
1327 frame.final_adjust = frame.bytes_below_saved_regs;
1328 }
1329@@ -10069,7 +10069,7 @@ aarch64_expand_prologue (void)
1330 {
1331 /* The offset of the frame chain record (if any) from the current SP. */
1332 poly_int64 chain_offset = (initial_adjust + callee_adjust
1333- - frame.hard_fp_offset);
1334+ - frame.bytes_above_hard_fp);
1335 gcc_assert (known_ge (chain_offset, 0));
1336
1337 if (callee_adjust == 0)
1338@@ -12751,10 +12751,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
1339 if (to == HARD_FRAME_POINTER_REGNUM)
1340 {
1341 if (from == ARG_POINTER_REGNUM)
1342- return frame.hard_fp_offset;
1343+ return frame.bytes_above_hard_fp;
1344
1345 if (from == FRAME_POINTER_REGNUM)
1346- return frame.hard_fp_offset - frame.bytes_above_locals;
1347+ return frame.bytes_above_hard_fp - frame.bytes_above_locals;
1348 }
1349
1350 if (to == STACK_POINTER_REGNUM)
1351diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1352index 7ae12d13e2b..3808f49e9ca 100644
1353--- a/gcc/config/aarch64/aarch64.h
1354+++ b/gcc/config/aarch64/aarch64.h
1355@@ -796,10 +796,10 @@ struct GTY (()) aarch64_frame
1356 STACK_BOUNDARY. */
1357 poly_int64 bytes_above_locals;
1358
1359- /* Offset from the base of the frame (incomming SP) to the
1360- hard_frame_pointer. This value is always a multiple of
1361+ /* The number of bytes between the hard_frame_pointer and the top of
1362+ the frame (the incomming SP). This value is always a multiple of
1363 STACK_BOUNDARY. */
1364- poly_int64 hard_fp_offset;
1365+ poly_int64 bytes_above_hard_fp;
1366
1367 /* The size of the frame. This value is the offset from base of the
1368 frame (incomming SP) to the stack_pointer. This value is always
1369--
13702.34.1
1371
1372
1373From d202ce1ecf60a36a3e1009917dd76109248ce9be Mon Sep 17 00:00:00 2001
1374From: Richard Sandiford <richard.sandiford@arm.com>
1375Date: Tue, 12 Sep 2023 16:07:16 +0100
1376Subject: [PATCH 10/19] aarch64: Tweak frame_size comment
1377MIME-Version: 1.0
1378Content-Type: text/plain; charset=UTF-8
1379Content-Transfer-Encoding: 8bit
1380
1381This patch fixes another case in which a value was described with
1382an “upside-down” view.
1383
1384gcc/
1385 * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment.
1386---
1387 gcc/config/aarch64/aarch64.h | 4 ++--
1388 1 file changed, 2 insertions(+), 2 deletions(-)
1389
1390diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1391index 3808f49e9ca..108a5731b0d 100644
1392--- a/gcc/config/aarch64/aarch64.h
1393+++ b/gcc/config/aarch64/aarch64.h
1394@@ -801,8 +801,8 @@ struct GTY (()) aarch64_frame
1395 STACK_BOUNDARY. */
1396 poly_int64 bytes_above_hard_fp;
1397
1398- /* The size of the frame. This value is the offset from base of the
1399- frame (incomming SP) to the stack_pointer. This value is always
1400+ /* The size of the frame, i.e. the number of bytes between the bottom
1401+ of the outgoing arguments and the incoming SP. This value is always
1402 a multiple of STACK_BOUNDARY. */
1403 poly_int64 frame_size;
1404
1405--
14062.34.1
1407
1408
1409From f2b585375205b0a1802d79c682ba33766ecd1f0f Mon Sep 17 00:00:00 2001
1410From: Richard Sandiford <richard.sandiford@arm.com>
1411Date: Tue, 12 Sep 2023 16:07:17 +0100
1412Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the
1413 frame
1414
1415reg_offset was measured from the bottom of the saved register area.
1416This made perfect sense with the original layout, since the bottom
1417of the saved register area was also the hard frame pointer address.
1418It became slightly less obvious with SVE, since we save SVE
1419registers below the hard frame pointer, but it still made sense.
1420
1421However, if we want to allow different frame layouts, it's more
1422convenient and obvious to measure reg_offset from the bottom of
1423the frame. After previous patches, it's also a slight simplification
1424in its own right.
1425
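As a minimal standalone illustration (not GCC code, with made-up values whose
names merely mirror the aarch64_frame fields), the change amounts to folding
bytes_below_saved_regs into each recorded offset, so that the save/restore
code only has to subtract the bytes below the current SP:

  /* Same SP-relative offset under the old and the new reg_offset convention.  */
  #include <assert.h>

  int main (void)
  {
    long bytes_below_saved_regs = 64;   /* outgoing arguments, say */
    long bytes_below_sp = 64;           /* SP sits at the bottom of the save area */
    long old_reg_offset = 16;           /* measured from the bottom of the save area */
    long new_reg_offset = old_reg_offset + bytes_below_saved_regs;

    /* Old: aarch64_save_callee_saves added bytes_below_saved_regs back in.  */
    long old_sp_offset = old_reg_offset + bytes_below_saved_regs - bytes_below_sp;
    /* New: reg_offset already includes it, so only bytes_below_sp is subtracted.  */
    long new_sp_offset = new_reg_offset - bytes_below_sp;

    assert (old_sp_offset == new_sp_offset);
    return 0;
  }
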
1426gcc/
1427 * config/aarch64/aarch64.h (aarch64_frame): Add comment above
1428 reg_offset.
1429 * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets
1430 from the bottom of the frame, rather than the bottom of the saved
1431 register area. Measure reg_offset from the bottom of the frame
1432 rather than the bottom of the saved register area.
1433 (aarch64_save_callee_saves): Update accordingly.
1434 (aarch64_restore_callee_saves): Likewise.
1435 (aarch64_get_separate_components): Likewise.
1436 (aarch64_process_components): Likewise.
1437---
1438 gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++-------------------
1439 gcc/config/aarch64/aarch64.h | 3 ++
1440 2 files changed, 27 insertions(+), 29 deletions(-)
1441
1442diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1443index 95499ae49ba..af99807ef8a 100644
1444--- a/gcc/config/aarch64/aarch64.cc
1445+++ b/gcc/config/aarch64/aarch64.cc
1446@@ -8400,7 +8400,6 @@ aarch64_needs_frame_chain (void)
1447 static void
1448 aarch64_layout_frame (void)
1449 {
1450- poly_int64 offset = 0;
1451 int regno, last_fp_reg = INVALID_REGNUM;
1452 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
1453 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
1454@@ -8478,7 +8477,9 @@ aarch64_layout_frame (void)
1455 gcc_assert (crtl->is_leaf
1456 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
1457
1458- frame.bytes_below_saved_regs = crtl->outgoing_args_size;
1459+ poly_int64 offset = crtl->outgoing_args_size;
1460+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
1461+ frame.bytes_below_saved_regs = offset;
1462
1463 /* Now assign stack slots for the registers. Start with the predicate
1464 registers, since predicate LDR and STR have a relatively small
1465@@ -8490,7 +8491,8 @@ aarch64_layout_frame (void)
1466 offset += BYTES_PER_SVE_PRED;
1467 }
1468
1469- if (maybe_ne (offset, 0))
1470+ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
1471+ if (maybe_ne (saved_prs_size, 0))
1472 {
1473 /* If we have any vector registers to save above the predicate registers,
1474 the offset of the vector register save slots need to be a multiple
1475@@ -8508,10 +8510,10 @@ aarch64_layout_frame (void)
1476 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1477 else
1478 {
1479- if (known_le (offset, vector_save_size))
1480- offset = vector_save_size;
1481- else if (known_le (offset, vector_save_size * 2))
1482- offset = vector_save_size * 2;
1483+ if (known_le (saved_prs_size, vector_save_size))
1484+ offset = frame.bytes_below_saved_regs + vector_save_size;
1485+ else if (known_le (saved_prs_size, vector_save_size * 2))
1486+ offset = frame.bytes_below_saved_regs + vector_save_size * 2;
1487 else
1488 gcc_unreachable ();
1489 }
1490@@ -8528,9 +8530,10 @@ aarch64_layout_frame (void)
1491
1492 /* OFFSET is now the offset of the hard frame pointer from the bottom
1493 of the callee save area. */
1494- bool saves_below_hard_fp_p = maybe_ne (offset, 0);
1495- frame.below_hard_fp_saved_regs_size = offset;
1496- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
1497+ frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
1498+ bool saves_below_hard_fp_p
1499+ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
1500+ frame.bytes_below_hard_fp = offset;
1501 if (frame.emit_frame_chain)
1502 {
1503 /* FP and LR are placed in the linkage record. */
1504@@ -8581,9 +8584,10 @@ aarch64_layout_frame (void)
1505
1506 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1507
1508- frame.saved_regs_size = offset;
1509+ frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
1510
1511- poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
1512+ poly_int64 varargs_and_saved_regs_size
1513+ = frame.saved_regs_size + frame.saved_varargs_size;
1514
1515 poly_int64 saved_regs_and_above
1516 = aligned_upper_bound (varargs_and_saved_regs_size
1517@@ -9105,9 +9109,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
1518
1519 machine_mode mode = aarch64_reg_save_mode (regno);
1520 reg = gen_rtx_REG (mode, regno);
1521- offset = (frame.reg_offset[regno]
1522- + frame.bytes_below_saved_regs
1523- - bytes_below_sp);
1524+ offset = frame.reg_offset[regno] - bytes_below_sp;
1525 rtx base_rtx = stack_pointer_rtx;
1526 poly_int64 sp_offset = offset;
1527
1528@@ -9214,9 +9216,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
1529
1530 machine_mode mode = aarch64_reg_save_mode (regno);
1531 reg = gen_rtx_REG (mode, regno);
1532- offset = (frame.reg_offset[regno]
1533- + frame.bytes_below_saved_regs
1534- - bytes_below_sp);
1535+ offset = frame.reg_offset[regno] - bytes_below_sp;
1536 rtx base_rtx = stack_pointer_rtx;
1537 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
1538 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
1539@@ -9355,14 +9355,12 @@ aarch64_get_separate_components (void)
1540 it as a stack probe for -fstack-clash-protection. */
1541 if (flag_stack_clash_protection
1542 && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
1543- && known_eq (offset, 0))
1544+ && known_eq (offset, frame.bytes_below_saved_regs))
1545 continue;
1546
1547 /* Get the offset relative to the register we'll use. */
1548 if (frame_pointer_needed)
1549- offset -= frame.below_hard_fp_saved_regs_size;
1550- else
1551- offset += frame.bytes_below_saved_regs;
1552+ offset -= frame.bytes_below_hard_fp;
1553
1554 /* Check that we can access the stack slot of the register with one
1555 direct load with no adjustments needed. */
1556@@ -9509,9 +9507,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
1557 rtx reg = gen_rtx_REG (mode, regno);
1558 poly_int64 offset = frame.reg_offset[regno];
1559 if (frame_pointer_needed)
1560- offset -= frame.below_hard_fp_saved_regs_size;
1561- else
1562- offset += frame.bytes_below_saved_regs;
1563+ offset -= frame.bytes_below_hard_fp;
1564
1565 rtx addr = plus_constant (Pmode, ptr_reg, offset);
1566 rtx mem = gen_frame_mem (mode, addr);
1567@@ -9563,9 +9559,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
1568 /* REGNO2 can be saved/restored in a pair with REGNO. */
1569 rtx reg2 = gen_rtx_REG (mode, regno2);
1570 if (frame_pointer_needed)
1571- offset2 -= frame.below_hard_fp_saved_regs_size;
1572- else
1573- offset2 += frame.bytes_below_saved_regs;
1574+ offset2 -= frame.bytes_below_hard_fp;
1575 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
1576 rtx mem2 = gen_frame_mem (mode, addr2);
1577 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
1578@@ -9681,7 +9675,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1579 if (final_adjustment_p
1580 && known_eq (frame.below_hard_fp_saved_regs_size, 0))
1581 {
1582- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
1583+ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
1584+ - frame.bytes_below_saved_regs);
1585 if (known_ge (lr_offset, 0))
1586 min_probe_threshold -= lr_offset.to_constant ();
1587 else
1588diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
1589index 108a5731b0d..c8becb098c8 100644
1590--- a/gcc/config/aarch64/aarch64.h
1591+++ b/gcc/config/aarch64/aarch64.h
1592@@ -766,6 +766,9 @@ extern enum aarch64_processor aarch64_tune;
1593 #ifdef HAVE_POLY_INT_H
1594 struct GTY (()) aarch64_frame
1595 {
1596+ /* The offset from the bottom of the static frame (the bottom of the
1597+ outgoing arguments) of each register save slot, or -2 if no save is
1598+ needed. */
1599 poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
1600
1601 /* The number of extra stack bytes taken up by register varargs.
1602--
16032.34.1
1604
1605
1606From 79faabda181d0d9fd29a3cf5726ba65bdee945b5 Mon Sep 17 00:00:00 2001
1607From: Richard Sandiford <richard.sandiford@arm.com>
1608Date: Tue, 12 Sep 2023 16:07:17 +0100
1609Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation
1610
1611After previous patches, it no longer really makes sense to allocate
1612the top of the frame in terms of varargs_and_saved_regs_size and
1613saved_regs_and_above.
1614
1615gcc/
1616 * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify
1617 the allocation of the top of the frame.
1618---
1619 gcc/config/aarch64/aarch64.cc | 23 ++++++++---------------
1620 1 file changed, 8 insertions(+), 15 deletions(-)
1621
1622diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1623index af99807ef8a..31b00094c2a 100644
1624--- a/gcc/config/aarch64/aarch64.cc
1625+++ b/gcc/config/aarch64/aarch64.cc
1626@@ -8586,23 +8586,16 @@ aarch64_layout_frame (void)
1627
1628 frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
1629
1630- poly_int64 varargs_and_saved_regs_size
1631- = frame.saved_regs_size + frame.saved_varargs_size;
1632-
1633- poly_int64 saved_regs_and_above
1634- = aligned_upper_bound (varargs_and_saved_regs_size
1635- + get_frame_size (),
1636- STACK_BOUNDARY / BITS_PER_UNIT);
1637-
1638- frame.bytes_above_hard_fp
1639- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
1640+ offset += get_frame_size ();
1641+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1642+ auto top_of_locals = offset;
1643
1644- /* Both these values are already aligned. */
1645- gcc_assert (multiple_p (frame.bytes_below_saved_regs,
1646- STACK_BOUNDARY / BITS_PER_UNIT));
1647- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
1648+ offset += frame.saved_varargs_size;
1649+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
1650+ frame.frame_size = offset;
1651
1652- frame.bytes_above_locals = frame.saved_varargs_size;
1653+ frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
1654+ frame.bytes_above_locals = frame.frame_size - top_of_locals;
1655
1656 frame.initial_adjust = 0;
1657 frame.final_adjust = 0;
1658--
16592.34.1
1660
1661
1662From 4e62049e403b141e6f916176160dac8cbd65fe47 Mon Sep 17 00:00:00 2001
1663From: Richard Sandiford <richard.sandiford@arm.com>
1664Date: Tue, 12 Sep 2023 16:07:18 +0100
1665Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak
1666
1667This patch just changes a calculation of initial_adjust
1668to one that makes it slightly more obvious that the total
1669adjustment is frame.frame_size.
1670
1671gcc/
1672 * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak
1673 calculation of initial_adjust for frames in which all saves
1674 are SVE saves.
1675---
1676 gcc/config/aarch64/aarch64.cc | 5 ++---
1677 1 file changed, 2 insertions(+), 3 deletions(-)
1678
1679diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1680index 31b00094c2a..1aa79da0673 100644
1681--- a/gcc/config/aarch64/aarch64.cc
1682+++ b/gcc/config/aarch64/aarch64.cc
1683@@ -8675,11 +8675,10 @@ aarch64_layout_frame (void)
1684 {
1685 /* Frame in which all saves are SVE saves:
1686
1687- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
1688+ sub sp, sp, frame_size - bytes_below_saved_regs
1689 save SVE registers relative to SP
1690 sub sp, sp, bytes_below_saved_regs */
1691- frame.initial_adjust = (frame.bytes_above_hard_fp
1692- + frame.below_hard_fp_saved_regs_size);
1693+ frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
1694 frame.final_adjust = frame.bytes_below_saved_regs;
1695 }
1696 else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
1697--
16982.34.1
1699
1700
1701From aaa1a0a5912d9e5d571e5f1c6f09ceac99544ab5 Mon Sep 17 00:00:00 2001
1702From: Richard Sandiford <richard.sandiford@arm.com>
1703Date: Tue, 12 Sep 2023 16:07:18 +0100
1704Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition
1705
1706The AArch64 ABI says that, when stack clash protection is used,
1707there can be a maximum of 1KiB of unprobed space at sp on entry
1708to a function. Therefore, we need to probe when allocating
1709>= guard_size - 1KiB of data (>= rather than >). This is what
1710GCC does.
1711
1712If an allocation is exactly guard_size bytes, it is enough to allocate
1713those bytes and probe once at offset 1024. It isn't possible to use a
1714single probe at any other offset: higher would complicate later code,
1715by leaving more unprobed space than usual, while lower would risk
1716leaving an entire page unprobed. For simplicity, the code probes all
1717allocations at offset 1024.
1718
1719Some register saves also act as probes. If we need to allocate
1720more space below the last such register save probe, we need to
1721probe the allocation if it is > 1KiB. Again, this allocation is
1722then sometimes (but not always) probed at offset 1024. This sort of
1723allocation is currently only used for outgoing arguments, which are
1724rarely this big.
1725
1726However, the code also probed if this final outgoing-arguments
1727allocation was == 1KiB, rather than just > 1KiB. This isn't
1728necessary, since the register save then probes at offset 1024
1729as required. Continuing to probe allocations of exactly 1KiB
1730would complicate later patches.
1731
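A small worked example of the new threshold, using the AArch64 constants
1024 (STACK_CLASH_CALLER_GUARD) and 16 (STACK_BOUNDARY / BITS_PER_UNIT).
This is a standalone sketch rather than GCC code, and needs_probe is just
an illustrative name:

  #include <stdio.h>

  static int needs_probe (long final_alloc)
  {
    long guard_used_by_caller = 1024;
    long byte_sp_alignment = 16;
    /* Old behaviour: probe when final_alloc >= 1024.
       New behaviour: probe only when final_alloc >= 1024 + 16,
       i.e. when it is strictly greater than 1KiB.  */
    return final_alloc >= guard_used_by_caller + byte_sp_alignment;
  }

  int main (void)
  {
    printf ("1024 bytes: %s\n", needs_probe (1024) ? "probe" : "no probe");
    printf ("1040 bytes: %s\n", needs_probe (1040) ? "probe" : "no probe");
    return 0;
  }

The new stack-check-prologue-17.c test below exercises exactly these two
sizes: the 1024-byte final allocation is left unprobed, while the 1040-byte
one keeps its probe.
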
1732gcc/
1733 * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
1734 Don't probe final allocations that are exactly 1KiB in size (after
1735 unprobed space above the final allocation has been deducted).
1736
1737gcc/testsuite/
1738 * gcc.target/aarch64/stack-check-prologue-17.c: New test.
1739---
1740 gcc/config/aarch64/aarch64.cc | 4 +-
1741 .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++
1742 2 files changed, 58 insertions(+), 1 deletion(-)
1743 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1744
1745diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1746index 1aa79da0673..5cad847977a 100644
1747--- a/gcc/config/aarch64/aarch64.cc
1748+++ b/gcc/config/aarch64/aarch64.cc
1749@@ -9648,9 +9648,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1750 HOST_WIDE_INT guard_size
1751 = 1 << param_stack_clash_protection_guard_size;
1752 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
1753+ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
1754+ gcc_assert (multiple_p (poly_size, byte_sp_alignment));
1755 HOST_WIDE_INT min_probe_threshold
1756 = (final_adjustment_p
1757- ? guard_used_by_caller
1758+ ? guard_used_by_caller + byte_sp_alignment
1759 : guard_size - guard_used_by_caller);
1760 /* When doing the final adjustment for the outgoing arguments, take into
1761 account any unprobed space there is above the current SP. There are
1762diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1763new file mode 100644
1764index 00000000000..0d8a25d73a2
1765--- /dev/null
1766+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
1767@@ -0,0 +1,55 @@
1768+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
1769+/* { dg-final { check-function-bodies "**" "" } } */
1770+
1771+void f(int, ...);
1772+void g();
1773+
1774+/*
1775+** test1:
1776+** ...
1777+** str x30, \[sp\]
1778+** sub sp, sp, #1024
1779+** cbnz w0, .*
1780+** bl g
1781+** ...
1782+*/
1783+int test1(int z) {
1784+ __uint128_t x = 0;
1785+ int y[0x400];
1786+ if (z)
1787+ {
1788+ f(0, 0, 0, 0, 0, 0, 0, &y,
1789+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1790+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1791+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1792+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
1793+ }
1794+ g();
1795+ return 1;
1796+}
1797+
1798+/*
1799+** test2:
1800+** ...
1801+** str x30, \[sp\]
1802+** sub sp, sp, #1040
1803+** str xzr, \[sp\]
1804+** cbnz w0, .*
1805+** bl g
1806+** ...
1807+*/
1808+int test2(int z) {
1809+ __uint128_t x = 0;
1810+ int y[0x400];
1811+ if (z)
1812+ {
1813+ f(0, 0, 0, 0, 0, 0, 0, &y,
1814+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1815+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1816+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1817+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
1818+ x);
1819+ }
1820+ g();
1821+ return 1;
1822+}
1823--
18242.34.1
1825
1826
1827From 8433953434a7b58c0923140d39eb3c5988c1d097 Mon Sep 17 00:00:00 2001
1828From: Richard Sandiford <richard.sandiford@arm.com>
1829Date: Tue, 12 Sep 2023 16:07:19 +0100
1830Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes
1831
1832-fstack-clash-protection uses the save of LR as a probe for the next
1833allocation. The next allocation could be:
1834
1835* another part of the static frame, e.g. when allocating SVE save slots
1836 or outgoing arguments
1837
1838* an alloca in the same function
1839
1840* an allocation made by a callee function
1841
1842However, when -fomit-frame-pointer is used, the LR save slot is placed
1843above the other GPR save slots. It could therefore be up to 80 bytes
1844above the base of the GPR save area (which is also the hard fp address).
1845
1846aarch64_allocate_and_probe_stack_space took this into account when
1847deciding how much subsequent space could be allocated without needing
1848a probe. However, it interacted badly with:
1849
1850 /* If doing a small final adjustment, we always probe at offset 0.
1851 This is done to avoid issues when LR is not at position 0 or when
1852 the final adjustment is smaller than the probing offset. */
1853 else if (final_adjustment_p && rounded_size == 0)
1854 residual_probe_offset = 0;
1855
1856which forces any allocation that is smaller than the guard page size
1857to be probed at offset 0 rather than the usual offset 1024. It was
1858therefore possible to construct cases in which we had:
1859
1860* a probe using LR at SP + 80 bytes (or some other value >= 16)
1861* an allocation of the guard page size - 16 bytes
1862* a probe at SP + 0
1863
1864which allocates guard page size + 64 consecutive unprobed bytes.
1865
1866This patch requires the LR probe to be in the first 16 bytes of the
1867save area when stack clash protection is active. Doing it
1868unconditionally would cause code-quality regressions.
1869
1870Putting LR before other registers prevents push/pop allocation
1871when shadow call stacks are enabled, since LR is restored
1872separately from the other callee-saved registers.
1873
1874The new comment doesn't say that the probe register is required
1875to be LR, since a later patch removes that restriction.
1876
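The arithmetic behind the unprobed gap can be shown with a standalone
example (not GCC code), assuming the 4KiB guard used by the tests below:

  #include <stdio.h>

  int main (void)
  {
    long guard_size = 4096;
    long lr_probe_offset = 80;            /* LR saved 80 bytes above the old SP */
    long allocation = guard_size - 16;    /* next allocation, just under the guard */
    long next_probe_offset = 0;           /* forced probe at the new SP + 0 */

    /* Bytes between the LR probe and the next probe, none of them probed.  */
    long unprobed = lr_probe_offset + allocation - next_probe_offset;
    printf ("unprobed span: %ld bytes (guard size + %ld)\n",
            unprobed, unprobed - guard_size);
    return 0;
  }
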
1877gcc/
1878 * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that
1879 the LR save slot is in the first 16 bytes of the register save area.
1880 Only form STP/LDP push/pop candidates if both registers are valid.
1881 (aarch64_allocate_and_probe_stack_space): Remove workaround for
1882 when LR was not in the first 16 bytes.
1883
1884gcc/testsuite/
1885 * gcc.target/aarch64/stack-check-prologue-18.c: New test.
1886 * gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
1887 * gcc.target/aarch64/stack-check-prologue-20.c: Likewise.
1888---
1889 gcc/config/aarch64/aarch64.cc | 72 ++++++-------
1890 .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++
1891 .../aarch64/stack-check-prologue-19.c | 100 ++++++++++++++++++
1892 .../aarch64/stack-check-prologue-20.c | 3 +
1893 4 files changed, 233 insertions(+), 42 deletions(-)
1894 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
1895 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
1896 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
1897
1898diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
1899index 5cad847977a..a765f92329d 100644
1900--- a/gcc/config/aarch64/aarch64.cc
1901+++ b/gcc/config/aarch64/aarch64.cc
1902@@ -8534,26 +8534,34 @@ aarch64_layout_frame (void)
1903 bool saves_below_hard_fp_p
1904 = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
1905 frame.bytes_below_hard_fp = offset;
1906+
1907+ auto allocate_gpr_slot = [&](unsigned int regno)
1908+ {
1909+ frame.reg_offset[regno] = offset;
1910+ if (frame.wb_push_candidate1 == INVALID_REGNUM)
1911+ frame.wb_push_candidate1 = regno;
1912+ else if (frame.wb_push_candidate2 == INVALID_REGNUM)
1913+ frame.wb_push_candidate2 = regno;
1914+ offset += UNITS_PER_WORD;
1915+ };
1916+
1917 if (frame.emit_frame_chain)
1918 {
1919 /* FP and LR are placed in the linkage record. */
1920- frame.reg_offset[R29_REGNUM] = offset;
1921- frame.wb_push_candidate1 = R29_REGNUM;
1922- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
1923- frame.wb_push_candidate2 = R30_REGNUM;
1924- offset += 2 * UNITS_PER_WORD;
1925+ allocate_gpr_slot (R29_REGNUM);
1926+ allocate_gpr_slot (R30_REGNUM);
1927 }
1928+ else if (flag_stack_clash_protection
1929+ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
1930+ /* Put the LR save slot first, since it makes a good choice of probe
1931+ for stack clash purposes. The idea is that the link register usually
1932+ has to be saved before a call anyway, and so we lose little by
1933+ stopping it from being individually shrink-wrapped. */
1934+ allocate_gpr_slot (R30_REGNUM);
1935
1936 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1937 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
1938- {
1939- frame.reg_offset[regno] = offset;
1940- if (frame.wb_push_candidate1 == INVALID_REGNUM)
1941- frame.wb_push_candidate1 = regno;
1942- else if (frame.wb_push_candidate2 == INVALID_REGNUM)
1943- frame.wb_push_candidate2 = regno;
1944- offset += UNITS_PER_WORD;
1945- }
1946+ allocate_gpr_slot (regno);
1947
1948 poly_int64 max_int_offset = offset;
1949 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1950@@ -8631,10 +8639,13 @@ aarch64_layout_frame (void)
1951 max_push_offset to 0, because no registers are popped at this time,
1952 so callee_adjust cannot be adjusted. */
1953 HOST_WIDE_INT max_push_offset = 0;
1954- if (frame.wb_pop_candidate2 != INVALID_REGNUM)
1955- max_push_offset = 512;
1956- else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
1957- max_push_offset = 256;
1958+ if (frame.wb_pop_candidate1 != INVALID_REGNUM)
1959+ {
1960+ if (frame.wb_pop_candidate2 != INVALID_REGNUM)
1961+ max_push_offset = 512;
1962+ else
1963+ max_push_offset = 256;
1964+ }
1965
1966 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
1967 HOST_WIDE_INT const_saved_regs_size;
1968@@ -9654,29 +9665,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1969 = (final_adjustment_p
1970 ? guard_used_by_caller + byte_sp_alignment
1971 : guard_size - guard_used_by_caller);
1972- /* When doing the final adjustment for the outgoing arguments, take into
1973- account any unprobed space there is above the current SP. There are
1974- two cases:
1975-
1976- - When saving SVE registers below the hard frame pointer, we force
1977- the lowest save to take place in the prologue before doing the final
1978- adjustment (i.e. we don't allow the save to be shrink-wrapped).
1979- This acts as a probe at SP, so there is no unprobed space.
1980-
1981- - When there are no SVE register saves, we use the store of the link
1982- register as a probe. We can't assume that LR was saved at position 0
1983- though, so treat any space below it as unprobed. */
1984- if (final_adjustment_p
1985- && known_eq (frame.below_hard_fp_saved_regs_size, 0))
1986- {
1987- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
1988- - frame.bytes_below_saved_regs);
1989- if (known_ge (lr_offset, 0))
1990- min_probe_threshold -= lr_offset.to_constant ();
1991- else
1992- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
1993- }
1994-
1995 poly_int64 frame_size = frame.frame_size;
1996
1997 /* We should always have a positive probe threshold. */
1998@@ -9856,8 +9844,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
1999 if (final_adjustment_p && rounded_size != 0)
2000 min_probe_threshold = 0;
2001 /* If doing a small final adjustment, we always probe at offset 0.
2002- This is done to avoid issues when LR is not at position 0 or when
2003- the final adjustment is smaller than the probing offset. */
2004+ This is done to avoid issues when the final adjustment is smaller
2005+ than the probing offset. */
2006 else if (final_adjustment_p && rounded_size == 0)
2007 residual_probe_offset = 0;
2008
2009diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2010new file mode 100644
2011index 00000000000..82447d20fff
2012--- /dev/null
2013+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2014@@ -0,0 +1,100 @@
2015+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */
2016+/* { dg-final { check-function-bodies "**" "" } } */
2017+
2018+void f(int, ...);
2019+void g();
2020+
2021+/*
2022+** test1:
2023+** ...
2024+** str x30, \[sp\]
2025+** sub sp, sp, #4064
2026+** str xzr, \[sp\]
2027+** cbnz w0, .*
2028+** bl g
2029+** ...
2030+** str x26, \[sp, #?4128\]
2031+** ...
2032+*/
2033+int test1(int z) {
2034+ __uint128_t x = 0;
2035+ int y[0x400];
2036+ if (z)
2037+ {
2038+ asm volatile ("" :::
2039+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2040+ f(0, 0, 0, 0, 0, 0, 0, &y,
2041+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2042+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2043+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2044+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2045+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2046+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2047+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2048+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2049+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2050+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2051+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2052+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2053+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2054+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2055+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2056+ x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2057+ }
2058+ g();
2059+ return 1;
2060+}
2061+
2062+/*
2063+** test2:
2064+** ...
2065+** str x30, \[sp\]
2066+** sub sp, sp, #1040
2067+** str xzr, \[sp\]
2068+** cbnz w0, .*
2069+** bl g
2070+** ...
2071+*/
2072+int test2(int z) {
2073+ __uint128_t x = 0;
2074+ int y[0x400];
2075+ if (z)
2076+ {
2077+ asm volatile ("" :::
2078+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2079+ f(0, 0, 0, 0, 0, 0, 0, &y,
2080+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2081+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2082+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2083+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2084+ x);
2085+ }
2086+ g();
2087+ return 1;
2088+}
2089+
2090+/*
2091+** test3:
2092+** ...
2093+** str x30, \[sp\]
2094+** sub sp, sp, #1024
2095+** cbnz w0, .*
2096+** bl g
2097+** ...
2098+*/
2099+int test3(int z) {
2100+ __uint128_t x = 0;
2101+ int y[0x400];
2102+ if (z)
2103+ {
2104+ asm volatile ("" :::
2105+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2106+ f(0, 0, 0, 0, 0, 0, 0, &y,
2107+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2108+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2109+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2110+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2111+ }
2112+ g();
2113+ return 1;
2114+}
2115diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2116new file mode 100644
2117index 00000000000..73ac3e4e4eb
2118--- /dev/null
2119+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2120@@ -0,0 +1,100 @@
2121+/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */
2122+/* { dg-final { check-function-bodies "**" "" } } */
2123+
2124+void f(int, ...);
2125+void g();
2126+
2127+/*
2128+** test1:
2129+** ...
2130+** str x30, \[sp\]
2131+** sub sp, sp, #4064
2132+** str xzr, \[sp\]
2133+** cbnz w0, .*
2134+** bl g
2135+** ...
2136+** str x26, \[sp, #?4128\]
2137+** ...
2138+*/
2139+int test1(int z) {
2140+ __uint128_t x = 0;
2141+ int y[0x400];
2142+ if (z)
2143+ {
2144+ asm volatile ("" :::
2145+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2146+ f(0, 0, 0, 0, 0, 0, 0, &y,
2147+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2148+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2149+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2150+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2151+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2152+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2153+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2154+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2155+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2156+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2157+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2158+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2159+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2160+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2161+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2162+ x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2163+ }
2164+ g();
2165+ return 1;
2166+}
2167+
2168+/*
2169+** test2:
2170+** ...
2171+** str x30, \[sp\]
2172+** sub sp, sp, #1040
2173+** str xzr, \[sp\]
2174+** cbnz w0, .*
2175+** bl g
2176+** ...
2177+*/
2178+int test2(int z) {
2179+ __uint128_t x = 0;
2180+ int y[0x400];
2181+ if (z)
2182+ {
2183+ asm volatile ("" :::
2184+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2185+ f(0, 0, 0, 0, 0, 0, 0, &y,
2186+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2187+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2188+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2189+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2190+ x);
2191+ }
2192+ g();
2193+ return 1;
2194+}
2195+
2196+/*
2197+** test3:
2198+** ...
2199+** str x30, \[sp\]
2200+** sub sp, sp, #1024
2201+** cbnz w0, .*
2202+** bl g
2203+** ...
2204+*/
2205+int test3(int z) {
2206+ __uint128_t x = 0;
2207+ int y[0x400];
2208+ if (z)
2209+ {
2210+ asm volatile ("" :::
2211+ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
2212+ f(0, 0, 0, 0, 0, 0, 0, &y,
2213+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2214+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2215+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
2216+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
2217+ }
2218+ g();
2219+ return 1;
2220+}
2221diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
2222new file mode 100644
2223index 00000000000..690aae8dfd5
2224--- /dev/null
2225+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
2226@@ -0,0 +1,3 @@
2227+/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */
2228+
2229+#include "stack-check-prologue-19.c"
2230--
22312.34.1
2232
2233
2234From eea1759073e09dd1aefbc9a881601ab1eebfdd18 Mon Sep 17 00:00:00 2001
2235From: Richard Sandiford <richard.sandiford@arm.com>
2236Date: Tue, 12 Sep 2023 16:07:19 +0100
2237Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation
2238
2239Previous patches ensured that the final frame allocation only needs
2240a probe when the size is strictly greater than 1KiB. It's therefore
2241safe to use the normal 1024 probe offset in all cases.
2242
2243The main motivation for doing this is to simplify the code and
2244reduce the number of special cases.
2245
2246gcc/
2247 * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
2248 Always probe the residual allocation at offset 1024, asserting
2249 that that is in range.
2250
2251gcc/testsuite/
2252 * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
2253 to be at offset 1024 rather than offset 0.
2254 * gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
2255 * gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
2256---
2257 gcc/config/aarch64/aarch64.cc | 12 ++++--------
2258 .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +-
2259 .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++--
2260 .../gcc.target/aarch64/stack-check-prologue-19.c | 4 ++--
2261 4 files changed, 9 insertions(+), 13 deletions(-)
2262
2263diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2264index a765f92329d..37809a306f7 100644
2265--- a/gcc/config/aarch64/aarch64.cc
2266+++ b/gcc/config/aarch64/aarch64.cc
2267@@ -9838,16 +9838,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
2268 are still safe. */
2269 if (residual)
2270 {
2271- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
2272+ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
2273+
2274 /* If we're doing final adjustments, and we've done any full page
2275 allocations then any residual needs to be probed. */
2276 if (final_adjustment_p && rounded_size != 0)
2277 min_probe_threshold = 0;
2278- /* If doing a small final adjustment, we always probe at offset 0.
2279- This is done to avoid issues when the final adjustment is smaller
2280- than the probing offset. */
2281- else if (final_adjustment_p && rounded_size == 0)
2282- residual_probe_offset = 0;
2283
2284 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
2285 if (residual >= min_probe_threshold)
2286@@ -9858,8 +9854,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
2287 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
2288 "\n", residual);
2289
2290- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
2291- residual_probe_offset));
2292+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
2293+ guard_used_by_caller));
2294 emit_insn (gen_blockage ());
2295 }
2296 }
2297diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2298index 0d8a25d73a2..f0ec1389771 100644
2299--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2300+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
2301@@ -33,7 +33,7 @@ int test1(int z) {
2302 ** ...
2303 ** str x30, \[sp\]
2304 ** sub sp, sp, #1040
2305-** str xzr, \[sp\]
2306+** str xzr, \[sp, #?1024\]
2307 ** cbnz w0, .*
2308 ** bl g
2309 ** ...
2310diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2311index 82447d20fff..6383bec5ebc 100644
2312--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2313+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
2314@@ -9,7 +9,7 @@ void g();
2315 ** ...
2316 ** str x30, \[sp\]
2317 ** sub sp, sp, #4064
2318-** str xzr, \[sp\]
2319+** str xzr, \[sp, #?1024\]
2320 ** cbnz w0, .*
2321 ** bl g
2322 ** ...
2323@@ -50,7 +50,7 @@ int test1(int z) {
2324 ** ...
2325 ** str x30, \[sp\]
2326 ** sub sp, sp, #1040
2327-** str xzr, \[sp\]
2328+** str xzr, \[sp, #?1024\]
2329 ** cbnz w0, .*
2330 ** bl g
2331 ** ...
2332diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2333index 73ac3e4e4eb..562039b5e9b 100644
2334--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2335+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
2336@@ -9,7 +9,7 @@ void g();
2337 ** ...
2338 ** str x30, \[sp\]
2339 ** sub sp, sp, #4064
2340-** str xzr, \[sp\]
2341+** str xzr, \[sp, #?1024\]
2342 ** cbnz w0, .*
2343 ** bl g
2344 ** ...
2345@@ -50,7 +50,7 @@ int test1(int z) {
2346 ** ...
2347 ** str x30, \[sp\]
2348 ** sub sp, sp, #1040
2349-** str xzr, \[sp\]
2350+** str xzr, \[sp, #?1024\]
2351 ** cbnz w0, .*
2352 ** bl g
2353 ** ...
2354--
23552.34.1
2356
2357
2358From 96d85187c3b9c9a7efc2fd698c3d452e80d8aa47 Mon Sep 17 00:00:00 2001
2359From: Richard Sandiford <richard.sandiford@arm.com>
2360Date: Tue, 12 Sep 2023 16:07:20 +0100
2361Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame
2362 info
2363
2364The stack frame is currently divided into three areas:
2365
2366A: the area above the hard frame pointer
2367B: the SVE saves below the hard frame pointer
2368C: the outgoing arguments
2369
2370If the stack frame is allocated in one chunk, the allocation needs a
2371probe if the frame size is >= guard_size - 1KiB. In addition, if the
2372function is not a leaf function, it must probe an address no more than
23731KiB above the outgoing SP. We ensured the second condition by
2374
2375(1) using single-chunk allocations for non-leaf functions only if
2376 the link register save slot is within 512 bytes of the bottom
2377 of the frame; and
2378
2379(2) using the link register save as a probe (meaning, for instance,
2380    that it can't be individually shrink-wrapped)
2381
2382If instead the stack is allocated in multiple chunks, then:
2383
2384* an allocation involving only the outgoing arguments (C above) requires
2385 a probe if the allocation size is > 1KiB
2386
2387* any other allocation requires a probe if the allocation size
2388 is >= guard_size - 1KiB
2389
2390* second and subsequent allocations require the previous allocation
2391 to probe at the bottom of the allocated area, regardless of the size
2392 of that previous allocation
2393
2394The final point means that, unlike for single allocations,
2395it can be necessary to have both a non-SVE register probe and
2396an SVE register probe. For example:
2397
2398* allocate A, probe using a non-SVE register save
2399* allocate B, probe using an SVE register save
2400* allocate C
2401
2402The non-SVE register used in this case was again the link register.
2403It was previously used even if the link register save slot was some
2404bytes above the bottom of the non-SVE register saves, but an earlier
2405patch avoided that by putting the link register save slot first.
2406
2407As a belt-and-braces fix, this patch explicitly records which
2408probe registers we're using and allows the non-SVE probe to be
2409whichever register comes first (as for SVE).
2410
2411The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
2412
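The first two rules can be sketched as a standalone predicate (not GCC code;
chunk_needs_probe is an illustrative name, and the third rule about
back-to-back allocations is not modelled):

  #include <assert.h>
  #include <stdbool.h>

  static bool chunk_needs_probe (long size, bool outgoing_args_only,
                                 long guard_size)
  {
    if (outgoing_args_only)
      /* Area C: probe only if the allocation is > 1KiB.  */
      return size > 1024;
    /* Areas A and B: probe if the allocation is >= guard_size - 1KiB.  */
    return size >= guard_size - 1024;
  }

  int main (void)
  {
    /* With the default 64KiB guard, a 2KiB outgoing-args chunk needs a
       probe, but a 2KiB chunk above the hard frame pointer does not.  */
    assert (chunk_needs_probe (2048, true, 65536));
    assert (!chunk_needs_probe (2048, false, 65536));
    return 0;
  }
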
2413gcc/
2414 * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
2415 (aarch64_frame::hard_fp_save_and_probe): New fields.
2416 * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them.
2417 Rather than asserting that a leaf function saves LR, instead assert
2418 that a leaf function saves something.
2419 (aarch64_get_separate_components): Prevent the chosen probe
2420 registers from being individually shrink-wrapped.
2421 (aarch64_allocate_and_probe_stack_space): Remove workaround for
2422 probe registers that aren't at the bottom of the previous allocation.
2423
2424gcc/testsuite/
2425 * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
2426---
2427 gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++----
2428 gcc/config/aarch64/aarch64.h | 8 +++
2429 .../aarch64/sve/pcs/stack_clash_3.c | 6 +-
2430 3 files changed, 64 insertions(+), 18 deletions(-)
2431
2432diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2433index 37809a306f7..6c59c39a639 100644
2434--- a/gcc/config/aarch64/aarch64.cc
2435+++ b/gcc/config/aarch64/aarch64.cc
2436@@ -8471,15 +8471,11 @@ aarch64_layout_frame (void)
2437 && !crtl->abi->clobbers_full_reg_p (regno))
2438 frame.reg_offset[regno] = SLOT_REQUIRED;
2439
2440- /* With stack-clash, LR must be saved in non-leaf functions. The saving of
2441- LR counts as an implicit probe which allows us to maintain the invariant
2442- described in the comment at expand_prologue. */
2443- gcc_assert (crtl->is_leaf
2444- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
2445
2446 poly_int64 offset = crtl->outgoing_args_size;
2447 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2448 frame.bytes_below_saved_regs = offset;
2449+ frame.sve_save_and_probe = INVALID_REGNUM;
2450
2451 /* Now assign stack slots for the registers. Start with the predicate
2452 registers, since predicate LDR and STR have a relatively small
2453@@ -8487,6 +8483,8 @@ aarch64_layout_frame (void)
2454 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
2455 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2456 {
2457+ if (frame.sve_save_and_probe == INVALID_REGNUM)
2458+ frame.sve_save_and_probe = regno;
2459 frame.reg_offset[regno] = offset;
2460 offset += BYTES_PER_SVE_PRED;
2461 }
2462@@ -8524,6 +8522,8 @@ aarch64_layout_frame (void)
2463 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2464 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2465 {
2466+ if (frame.sve_save_and_probe == INVALID_REGNUM)
2467+ frame.sve_save_and_probe = regno;
2468 frame.reg_offset[regno] = offset;
2469 offset += vector_save_size;
2470 }
2471@@ -8533,10 +8533,18 @@ aarch64_layout_frame (void)
2472 frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2473 bool saves_below_hard_fp_p
2474 = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
2475+ gcc_assert (!saves_below_hard_fp_p
2476+ || (frame.sve_save_and_probe != INVALID_REGNUM
2477+ && known_eq (frame.reg_offset[frame.sve_save_and_probe],
2478+ frame.bytes_below_saved_regs)));
2479+
2480 frame.bytes_below_hard_fp = offset;
2481+ frame.hard_fp_save_and_probe = INVALID_REGNUM;
2482
2483 auto allocate_gpr_slot = [&](unsigned int regno)
2484 {
2485+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
2486+ frame.hard_fp_save_and_probe = regno;
2487 frame.reg_offset[regno] = offset;
2488 if (frame.wb_push_candidate1 == INVALID_REGNUM)
2489 frame.wb_push_candidate1 = regno;
2490@@ -8570,6 +8578,8 @@ aarch64_layout_frame (void)
2491 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2492 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
2493 {
2494+ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
2495+ frame.hard_fp_save_and_probe = regno;
2496 /* If there is an alignment gap between integer and fp callee-saves,
2497 allocate the last fp register to it if possible. */
2498 if (regno == last_fp_reg
2499@@ -8593,6 +8603,17 @@ aarch64_layout_frame (void)
2500 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2501
2502 frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
2503+ gcc_assert (known_eq (frame.saved_regs_size,
2504+ frame.below_hard_fp_saved_regs_size)
2505+ || (frame.hard_fp_save_and_probe != INVALID_REGNUM
2506+ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
2507+ frame.bytes_below_hard_fp)));
2508+
2509+ /* With stack-clash, a register must be saved in non-leaf functions.
2510+ The saving of the bottommost register counts as an implicit probe,
2511+ which allows us to maintain the invariant described in the comment
2512+ at expand_prologue. */
2513+ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
2514
2515 offset += get_frame_size ();
2516 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2517@@ -8723,6 +8744,25 @@ aarch64_layout_frame (void)
2518 frame.final_adjust = frame.bytes_below_saved_regs;
2519 }
2520
2521+ /* The frame is allocated in pieces, with each non-final piece
2522+ including a register save at offset 0 that acts as a probe for
2523+ the following piece. In addition, the save of the bottommost register
2524+ acts as a probe for callees and allocas. Roll back any probes that
2525+ aren't needed.
2526+
2527+ A probe isn't needed if it is associated with the final allocation
2528+ (including callees and allocas) that happens before the epilogue is
2529+ executed. */
2530+ if (crtl->is_leaf
2531+ && !cfun->calls_alloca
2532+ && known_eq (frame.final_adjust, 0))
2533+ {
2534+ if (maybe_ne (frame.sve_callee_adjust, 0))
2535+ frame.sve_save_and_probe = INVALID_REGNUM;
2536+ else
2537+ frame.hard_fp_save_and_probe = INVALID_REGNUM;
2538+ }
2539+
2540 /* Make sure the individual adjustments add up to the full frame size. */
2541 gcc_assert (known_eq (frame.initial_adjust
2542 + frame.callee_adjust
2543@@ -9354,13 +9394,6 @@ aarch64_get_separate_components (void)
2544
2545 poly_int64 offset = frame.reg_offset[regno];
2546
2547- /* If the register is saved in the first SVE save slot, we use
2548- it as a stack probe for -fstack-clash-protection. */
2549- if (flag_stack_clash_protection
2550- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
2551- && known_eq (offset, frame.bytes_below_saved_regs))
2552- continue;
2553-
2554 /* Get the offset relative to the register we'll use. */
2555 if (frame_pointer_needed)
2556 offset -= frame.bytes_below_hard_fp;
2557@@ -9395,6 +9428,13 @@ aarch64_get_separate_components (void)
2558
2559 bitmap_clear_bit (components, LR_REGNUM);
2560 bitmap_clear_bit (components, SP_REGNUM);
2561+ if (flag_stack_clash_protection)
2562+ {
2563+ if (frame.sve_save_and_probe != INVALID_REGNUM)
2564+ bitmap_clear_bit (components, frame.sve_save_and_probe);
2565+ if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
2566+ bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
2567+ }
2568
2569 return components;
2570 }
2571@@ -9931,8 +9971,8 @@ aarch64_epilogue_uses (int regno)
2572 When probing is needed, we emit a probe at the start of the prologue
2573 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
2574
2575- We have to track how much space has been allocated and the only stores
2576- to the stack we track as implicit probes are the FP/LR stores.
2577+ We can also use register saves as probes. These are stored in
2578+ sve_save_and_probe and hard_fp_save_and_probe.
2579
2580 For outgoing arguments we probe if the size is larger than 1KB, such that
2581 the ABI specified buffer is maintained for the next callee.
2582diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
2583index c8becb098c8..fbfb73545ba 100644
2584--- a/gcc/config/aarch64/aarch64.h
2585+++ b/gcc/config/aarch64/aarch64.h
2586@@ -863,6 +863,14 @@ struct GTY (()) aarch64_frame
2587 This is the register they should use. */
2588 unsigned spare_pred_reg;
2589
2590+ /* An SVE register that is saved below the hard frame pointer and that acts
2591+ as a probe for later allocations, or INVALID_REGNUM if none. */
2592+ unsigned sve_save_and_probe;
2593+
2594+ /* A register that is saved at the hard frame pointer and that acts
2595+ as a probe for later allocations, or INVALID_REGNUM if none. */
2596+ unsigned hard_fp_save_and_probe;
2597+
2598 bool laid_out;
2599
2600 /* True if shadow call stack should be enabled for the current function. */
2601diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2602index 3e01ec36c3a..3530a0d504b 100644
2603--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2604+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
2605@@ -11,11 +11,10 @@
2606 ** mov x11, sp
2607 ** ...
2608 ** sub sp, sp, x13
2609-** str p4, \[sp\]
2610 ** cbz w0, [^\n]*
2611+** str p4, \[sp\]
2612 ** ...
2613 ** ptrue p0\.b, all
2614-** ldr p4, \[sp\]
2615 ** addvl sp, sp, #1
2616 ** ldr x24, \[sp\], 32
2617 ** ret
2618@@ -39,13 +38,12 @@ test_1 (int n)
2619 ** mov x11, sp
2620 ** ...
2621 ** sub sp, sp, x13
2622-** str p4, \[sp\]
2623 ** cbz w0, [^\n]*
2624+** str p4, \[sp\]
2625 ** str p5, \[sp, #1, mul vl\]
2626 ** str p6, \[sp, #2, mul vl\]
2627 ** ...
2628 ** ptrue p0\.b, all
2629-** ldr p4, \[sp\]
2630 ** addvl sp, sp, #1
2631 ** ldr x24, \[sp\], 32
2632 ** ret
2633--
26342.34.1
2635
2636
2637From 56df065080950bb30dda9c260f71be54269bdda5 Mon Sep 17 00:00:00 2001
2638From: Richard Sandiford <richard.sandiford@arm.com>
2639Date: Tue, 12 Sep 2023 16:07:20 +0100
2640Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size
2641
2642After previous patches, it's no longer necessary to store
2643saved_regs_size and below_hard_fp_saved_regs_size in the frame info.
2644All measurements instead use the top or bottom of the frame as
2645reference points.
2646
2647gcc/
2648 * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size)
2649 (aarch64_frame::below_hard_fp_saved_regs_size): Delete.
2650 * config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly.
2651---
2652 gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++-------------------
2653 gcc/config/aarch64/aarch64.h | 7 ------
2654 2 files changed, 21 insertions(+), 31 deletions(-)
2655
2656diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2657index 6c59c39a639..b95e805a8cc 100644
2658--- a/gcc/config/aarch64/aarch64.cc
2659+++ b/gcc/config/aarch64/aarch64.cc
2660@@ -8530,9 +8530,8 @@ aarch64_layout_frame (void)
2661
2662 /* OFFSET is now the offset of the hard frame pointer from the bottom
2663 of the callee save area. */
2664- frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2665- bool saves_below_hard_fp_p
2666- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
2667+ auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
2668+ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
2669 gcc_assert (!saves_below_hard_fp_p
2670 || (frame.sve_save_and_probe != INVALID_REGNUM
2671 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
2672@@ -8602,9 +8601,8 @@ aarch64_layout_frame (void)
2673
2674 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2675
2676- frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
2677- gcc_assert (known_eq (frame.saved_regs_size,
2678- frame.below_hard_fp_saved_regs_size)
2679+ auto saved_regs_size = offset - frame.bytes_below_saved_regs;
2680+ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
2681 || (frame.hard_fp_save_and_probe != INVALID_REGNUM
2682 && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
2683 frame.bytes_below_hard_fp)));
2684@@ -8613,7 +8611,7 @@ aarch64_layout_frame (void)
2685 The saving of the bottommost register counts as an implicit probe,
2686 which allows us to maintain the invariant described in the comment
2687 at expand_prologue. */
2688- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
2689+ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
2690
2691 offset += get_frame_size ();
2692 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2693@@ -8670,7 +8668,7 @@ aarch64_layout_frame (void)
2694
2695 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
2696 HOST_WIDE_INT const_saved_regs_size;
2697- if (known_eq (frame.saved_regs_size, 0))
2698+ if (known_eq (saved_regs_size, 0))
2699 frame.initial_adjust = frame.frame_size;
2700 else if (frame.frame_size.is_constant (&const_size)
2701 && const_size < max_push_offset
2702@@ -8683,7 +8681,7 @@ aarch64_layout_frame (void)
2703 frame.callee_adjust = const_size;
2704 }
2705 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
2706- && frame.saved_regs_size.is_constant (&const_saved_regs_size)
2707+ && saved_regs_size.is_constant (&const_saved_regs_size)
2708 && const_below_saved_regs + const_saved_regs_size < 512
2709 /* We could handle this case even with data below the saved
2710 registers, provided that that data left us with valid offsets
2711@@ -8702,8 +8700,7 @@ aarch64_layout_frame (void)
2712 frame.initial_adjust = frame.frame_size;
2713 }
2714 else if (saves_below_hard_fp_p
2715- && known_eq (frame.saved_regs_size,
2716- frame.below_hard_fp_saved_regs_size))
2717+ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
2718 {
2719 /* Frame in which all saves are SVE saves:
2720
2721@@ -8725,7 +8722,7 @@ aarch64_layout_frame (void)
2722 [save SVE registers relative to SP]
2723 sub sp, sp, bytes_below_saved_regs */
2724 frame.callee_adjust = const_above_fp;
2725- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
2726+ frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
2727 frame.final_adjust = frame.bytes_below_saved_regs;
2728 }
2729 else
2730@@ -8740,7 +8737,7 @@ aarch64_layout_frame (void)
2731 [save SVE registers relative to SP]
2732 sub sp, sp, bytes_below_saved_regs */
2733 frame.initial_adjust = frame.bytes_above_hard_fp;
2734- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
2735+ frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
2736 frame.final_adjust = frame.bytes_below_saved_regs;
2737 }
2738
2739@@ -9936,17 +9933,17 @@ aarch64_epilogue_uses (int regno)
2740 | local variables | <-- frame_pointer_rtx
2741 | |
2742 +-------------------------------+
2743- | padding | \
2744- +-------------------------------+ |
2745- | callee-saved registers | | frame.saved_regs_size
2746- +-------------------------------+ |
2747- | LR' | |
2748- +-------------------------------+ |
2749- | FP' | |
2750- +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
2751- | SVE vector registers | | \
2752- +-------------------------------+ | | below_hard_fp_saved_regs_size
2753- | SVE predicate registers | / /
2754+ | padding |
2755+ +-------------------------------+
2756+ | callee-saved registers |
2757+ +-------------------------------+
2758+ | LR' |
2759+ +-------------------------------+
2760+ | FP' |
2761+ +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
2762+ | SVE vector registers |
2763+ +-------------------------------+
2764+ | SVE predicate registers |
2765 +-------------------------------+
2766 | dynamic allocation |
2767 +-------------------------------+
2768diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
2769index fbfb73545ba..cfeaf4657ab 100644
2770--- a/gcc/config/aarch64/aarch64.h
2771+++ b/gcc/config/aarch64/aarch64.h
2772@@ -777,18 +777,11 @@ struct GTY (()) aarch64_frame
2773 STACK_BOUNDARY. */
2774 HOST_WIDE_INT saved_varargs_size;
2775
2776- /* The size of the callee-save registers with a slot in REG_OFFSET. */
2777- poly_int64 saved_regs_size;
2778-
2779 /* The number of bytes between the bottom of the static frame (the bottom
2780 of the outgoing arguments) and the bottom of the register save area.
2781 This value is always a multiple of STACK_BOUNDARY. */
2782 poly_int64 bytes_below_saved_regs;
2783
2784- /* The size of the callee-save registers with a slot in REG_OFFSET that
2785- are saved below the hard frame pointer. */
2786- poly_int64 below_hard_fp_saved_regs_size;
2787-
2788 /* The number of bytes between the bottom of the static frame (the bottom
2789 of the outgoing arguments) and the hard frame pointer. This value is
2790 always a multiple of STACK_BOUNDARY. */
2791--
27922.34.1
2793
2794
2795From b96e66fd4ef3e36983969fb8cdd1956f551a074b Mon Sep 17 00:00:00 2001
2796From: Richard Sandiford <richard.sandiford@arm.com>
2797Date: Tue, 12 Sep 2023 16:07:21 +0100
2798Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved
2799 registers
2800
2801AArch64 normally puts the saved registers near the bottom of the frame,
2802immediately above any dynamic allocations. But this means that a
2803stack-smash attack on those dynamic allocations could overwrite the
2804saved registers without needing to reach as far as the stack smash
2805canary.
2806
2807The same thing could also happen for variable-sized arguments that are
2808passed by value, since those are allocated before a call and popped on
2809return.
2810
2811This patch avoids that by putting the locals (and thus the canary) below
2812the saved registers when stack smash protection is active.
2813
2814The patch fixes CVE-2023-4039.
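
As a rough, hypothetical illustration of the pattern involved (the names
below are invented; the real tests are the new stack-protector-8.c and
stack-protector-9.c added further down), consider a function whose only
large local is dynamically sized:

/* Hypothetical example, not taken from the patch or its testsuite.  */
void consume (char *buf, int n);

void
example (int n)
{
  /* Dynamically allocated stack space (a VLA, much like alloca).  */
  char buf[n];

  /* If consume writes past the end of buf, the overflow climbs towards
     higher addresses.  With the old layout the register save area
     (including FP' and LR') sits directly above the dynamic area and
     below the canary, so it can be corrupted without the canary
     changing.  With the new layout the locals and the canary sit below
     the saved registers, so a contiguous overflow has to pass through
     the canary slot first and the epilogue's check (bl __stack_chk_fail
     on mismatch) catches it before the corrupted FP/LR can be used.  */
  consume (buf, n);
}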
2815
2816gcc/
2817 * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p):
2818 New function.
2819 (aarch64_layout_frame): Use it to decide whether locals should
2820 go above or below the saved registers.
2821 (aarch64_expand_prologue): Update stack layout comment.
2822 Emit a stack tie after the final adjustment.
2823
2824gcc/testsuite/
2825 * gcc.target/aarch64/stack-protector-8.c: New test.
2826 * gcc.target/aarch64/stack-protector-9.c: Likewise.
2827---
2828 gcc/config/aarch64/aarch64.cc | 46 +++++++--
2829 .../gcc.target/aarch64/stack-protector-8.c | 95 +++++++++++++++++++
2830 .../gcc.target/aarch64/stack-protector-9.c | 33 +++++++
2831 3 files changed, 168 insertions(+), 6 deletions(-)
2832 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2833 create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
2834
2835diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
2836index b95e805a8cc..389c0e29353 100644
2837--- a/gcc/config/aarch64/aarch64.cc
2838+++ b/gcc/config/aarch64/aarch64.cc
2839@@ -8394,6 +8394,20 @@ aarch64_needs_frame_chain (void)
2840 return aarch64_use_frame_pointer;
2841 }
2842
2843+/* Return true if the current function should save registers above
2844+ the locals area, rather than below it. */
2845+
2846+static bool
2847+aarch64_save_regs_above_locals_p ()
2848+{
2849+ /* When using stack smash protection, make sure that the canary slot
2850+ comes between the locals and the saved registers. Otherwise,
2851+ it would be possible for a carefully sized smash attack to change
2852+ the saved registers (particularly LR and FP) without reaching the
2853+ canary. */
2854+ return crtl->stack_protect_guard;
2855+}
2856+
2857 /* Mark the registers that need to be saved by the callee and calculate
2858 the size of the callee-saved registers area and frame record (both FP
2859 and LR may be omitted). */
2860@@ -8405,6 +8419,7 @@ aarch64_layout_frame (void)
2861 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
2862 bool frame_related_fp_reg_p = false;
2863 aarch64_frame &frame = cfun->machine->frame;
2864+ poly_int64 top_of_locals = -1;
2865
2866 frame.emit_frame_chain = aarch64_needs_frame_chain ();
2867
2868@@ -8471,9 +8486,16 @@ aarch64_layout_frame (void)
2869 && !crtl->abi->clobbers_full_reg_p (regno))
2870 frame.reg_offset[regno] = SLOT_REQUIRED;
2871
2872+ bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
2873
2874 poly_int64 offset = crtl->outgoing_args_size;
2875 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2876+ if (regs_at_top_p)
2877+ {
2878+ offset += get_frame_size ();
2879+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2880+ top_of_locals = offset;
2881+ }
2882 frame.bytes_below_saved_regs = offset;
2883 frame.sve_save_and_probe = INVALID_REGNUM;
2884
2885@@ -8613,15 +8635,18 @@ aarch64_layout_frame (void)
2886 at expand_prologue. */
2887 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
2888
2889- offset += get_frame_size ();
2890- offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2891- auto top_of_locals = offset;
2892-
2893+ if (!regs_at_top_p)
2894+ {
2895+ offset += get_frame_size ();
2896+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2897+ top_of_locals = offset;
2898+ }
2899 offset += frame.saved_varargs_size;
2900 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
2901 frame.frame_size = offset;
2902
2903 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
2904+ gcc_assert (known_ge (top_of_locals, 0));
2905 frame.bytes_above_locals = frame.frame_size - top_of_locals;
2906
2907 frame.initial_adjust = 0;
2908@@ -9930,10 +9955,10 @@ aarch64_epilogue_uses (int regno)
2909 | for register varargs |
2910 | |
2911 +-------------------------------+
2912- | local variables | <-- frame_pointer_rtx
2913+ | local variables (1) | <-- frame_pointer_rtx
2914 | |
2915 +-------------------------------+
2916- | padding |
2917+ | padding (1) |
2918 +-------------------------------+
2919 | callee-saved registers |
2920 +-------------------------------+
2921@@ -9945,6 +9970,10 @@ aarch64_epilogue_uses (int regno)
2922 +-------------------------------+
2923 | SVE predicate registers |
2924 +-------------------------------+
2925+ | local variables (2) |
2926+ +-------------------------------+
2927+ | padding (2) |
2928+ +-------------------------------+
2929 | dynamic allocation |
2930 +-------------------------------+
2931 | padding |
2932@@ -9954,6 +9983,9 @@ aarch64_epilogue_uses (int regno)
2933 +-------------------------------+
2934 | | <-- stack_pointer_rtx (aligned)
2935
2936+ The regions marked (1) and (2) are mutually exclusive. (2) is used
2937+ when aarch64_save_regs_above_locals_p is true.
2938+
2939 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2940 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2941 unchanged.
2942@@ -10149,6 +10181,8 @@ aarch64_expand_prologue (void)
2943 gcc_assert (known_eq (bytes_below_sp, final_adjust));
2944 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
2945 !frame_pointer_needed, true);
2946+ if (emit_frame_chain && maybe_ne (final_adjust, 0))
2947+ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2948 }
2949
2950 /* Return TRUE if we can use a simple_return insn.
2951diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2952new file mode 100644
2953index 00000000000..e71d820e365
2954--- /dev/null
2955+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
2956@@ -0,0 +1,95 @@
2957+/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */
2958+/* { dg-final { check-function-bodies "**" "" } } */
2959+
2960+void g(void *);
2961+__SVBool_t *h(void *);
2962+
2963+/*
2964+** test1:
2965+** sub sp, sp, #288
2966+** stp x29, x30, \[sp, #?272\]
2967+** add x29, sp, #?272
2968+** mrs (x[0-9]+), tpidr2_el0
2969+** ldr (x[0-9]+), \[\1, #?16\]
2970+** str \2, \[sp, #?264\]
2971+** mov \2, #?0
2972+** add x0, sp, #?8
2973+** bl g
2974+** ...
2975+** mrs .*
2976+** ...
2977+** bne .*
2978+** ...
2979+** ldp x29, x30, \[sp, #?272\]
2980+** add sp, sp, #?288
2981+** ret
2982+** bl __stack_chk_fail
2983+*/
2984+int test1() {
2985+ int y[0x40];
2986+ g(y);
2987+ return 1;
2988+}
2989+
2990+/*
2991+** test2:
2992+** stp x29, x30, \[sp, #?-16\]!
2993+** mov x29, sp
2994+** sub sp, sp, #1040
2995+** mrs (x[0-9]+), tpidr2_el0
2996+** ldr (x[0-9]+), \[\1, #?16\]
2997+** str \2, \[sp, #?1032\]
2998+** mov \2, #?0
2999+** add x0, sp, #?8
3000+** bl g
3001+** ...
3002+** mrs .*
3003+** ...
3004+** bne .*
3005+** ...
3006+** add sp, sp, #?1040
3007+** ldp x29, x30, \[sp\], #?16
3008+** ret
3009+** bl __stack_chk_fail
3010+*/
3011+int test2() {
3012+ int y[0x100];
3013+ g(y);
3014+ return 1;
3015+}
3016+
3017+#pragma GCC target "+sve"
3018+
3019+/*
3020+** test3:
3021+** stp x29, x30, \[sp, #?-16\]!
3022+** mov x29, sp
3023+** addvl sp, sp, #-18
3024+** ...
3025+** str p4, \[sp\]
3026+** ...
3027+** sub sp, sp, #272
3028+** mrs (x[0-9]+), tpidr2_el0
3029+** ldr (x[0-9]+), \[\1, #?16\]
3030+** str \2, \[sp, #?264\]
3031+** mov \2, #?0
3032+** add x0, sp, #?8
3033+** bl h
3034+** ...
3035+** mrs .*
3036+** ...
3037+** bne .*
3038+** ...
3039+** add sp, sp, #?272
3040+** ...
3041+** ldr p4, \[sp\]
3042+** ...
3043+** addvl sp, sp, #18
3044+** ldp x29, x30, \[sp\], #?16
3045+** ret
3046+** bl __stack_chk_fail
3047+*/
3048+__SVBool_t test3() {
3049+ int y[0x40];
3050+ return *h(y);
3051+}
3052diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
3053new file mode 100644
3054index 00000000000..58f322aa480
3055--- /dev/null
3056+++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
3057@@ -0,0 +1,33 @@
3058+/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
3059+/* { dg-final { check-function-bodies "**" "" } } */
3060+
3061+/*
3062+** main:
3063+** ...
3064+** stp x29, x30, \[sp, #?-[0-9]+\]!
3065+** ...
3066+** sub sp, sp, #[0-9]+
3067+** ...
3068+** str x[0-9]+, \[x29, #?-8\]
3069+** ...
3070+*/
3071+int f(const char *);
3072+void g(void *);
3073+int main(int argc, char* argv[])
3074+{
3075+ int a;
3076+ int b;
3077+ char c[2+f(argv[1])];
3078+ int d[0x100];
3079+ char y;
3080+
3081+ y=42; a=4; b=10;
3082+ c[0] = 'h'; c[1] = '\0';
3083+
3084+ c[f(argv[2])] = '\0';
3085+
3086+ __builtin_printf("%d %d\n%s\n", a, b, c);
3087+ g(d);
3088+
3089+ return 0;
3090+}
3091--
30922.34.1
3093