diff options
Diffstat (limited to 'meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch')
-rw-r--r-- | meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch | 663 |
1 files changed, 663 insertions, 0 deletions
diff --git a/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch b/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch new file mode 100644 index 000000000..be102160c --- /dev/null +++ b/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch | |||
@@ -0,0 +1,663 @@ | |||
1 | 2010-08-24 Andrew Stubbs <ams@codesourcery.com> | ||
2 | |||
3 | Backport from FSF: | ||
4 | |||
5 | 2010-08-07 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> | ||
6 | |||
7 | * config/arm/cortex-a9.md: Rewrite VFP Pipeline description. | ||
8 | * config/arm/arm.c (arm_xscale_tune): Initialize sched_adjust_cost. | ||
9 | (arm_fastmul_tune, arm_slowmul_tune, arm_9e_tune): Likewise. | ||
10 | (arm_adjust_cost): Split into xscale_sched_adjust_cost and a | ||
11 | generic part. | ||
12 | (cortex_a9_sched_adjust_cost): New function. | ||
13 | (xscale_sched_adjust_cost): New function. | ||
14 | * config/arm/arm-protos.h (struct tune_params): New field | ||
15 | sched_adjust_cost. | ||
16 | * config/arm/arm-cores.def: Adjust costs for cortex-a9. | ||
17 | |||
18 | 2010-04-17 Richard Earnshaw <rearnsha@arm.com> | ||
19 | |||
20 | * arm-protos.h (tune_params): New structure. | ||
21 | * arm.c (current_tune): New variable. | ||
22 | (arm_constant_limit): Delete. | ||
23 | (struct processors): Add pointer to the tune parameters. | ||
24 | (arm_slowmul_tune): New tuning option. | ||
25 | (arm_fastmul_tune, arm_xscale_tune, arm_9e_tune): Likewise. | ||
26 | (all_cores): Adjust to pick up the tuning model. | ||
27 | (arm_constant_limit): New function. | ||
28 | (arm_override_options): Select the appropriate tuning model. Delete | ||
29 | initialization of arm_const_limit. | ||
30 | (arm_split_constant): Use the new constant-limit model. | ||
31 | (arm_rtx_costs): Pick up the current tuning model. | ||
32 | * arm.md (is_strongarm, is_xscale): Delete. | ||
33 | * arm-generic.md (load_ldsched_x, load_ldsched): Test explicitly | ||
34 | for Xscale variant architectures. | ||
35 | (mult_ldsched_strongarm, mult_ldsched): Similarly for StrongARM. | ||
36 | |||
37 | 2010-08-23 Andrew Stubbs <ams@codesourcery.com> | ||
38 | |||
39 | Backport from FSF: | ||
40 | |||
41 | === modified file 'gcc/config/arm/arm-cores.def' | ||
42 | --- old/gcc/config/arm/arm-cores.def 2010-07-29 15:53:39 +0000 | ||
43 | +++ new/gcc/config/arm/arm-cores.def 2010-08-24 13:15:54 +0000 | ||
44 | @@ -120,7 +120,7 @@ | ||
45 | ARM_CORE("arm1156t2f-s", arm1156t2fs, 6T2, FL_LDSCHED | FL_VFPV2, 9e) | ||
46 | ARM_CORE("cortex-a5", cortexa5, 7A, FL_LDSCHED, 9e) | ||
47 | ARM_CORE("cortex-a8", cortexa8, 7A, FL_LDSCHED, 9e) | ||
48 | -ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, 9e) | ||
49 | +ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, cortex_a9) | ||
50 | ARM_CORE("cortex-r4", cortexr4, 7R, FL_LDSCHED, 9e) | ||
51 | ARM_CORE("cortex-r4f", cortexr4f, 7R, FL_LDSCHED, 9e) | ||
52 | ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, 9e) | ||
53 | |||
54 | === modified file 'gcc/config/arm/arm-generic.md' | ||
55 | --- old/gcc/config/arm/arm-generic.md 2007-08-02 09:49:31 +0000 | ||
56 | +++ new/gcc/config/arm/arm-generic.md 2010-08-24 13:15:54 +0000 | ||
57 | @@ -104,14 +104,14 @@ | ||
58 | (and (eq_attr "generic_sched" "yes") | ||
59 | (and (eq_attr "ldsched" "yes") | ||
60 | (and (eq_attr "type" "load_byte,load1") | ||
61 | - (eq_attr "is_xscale" "yes")))) | ||
62 | + (eq_attr "tune" "xscale,iwmmxt,iwmmxt2")))) | ||
63 | "core") | ||
64 | |||
65 | (define_insn_reservation "load_ldsched" 2 | ||
66 | (and (eq_attr "generic_sched" "yes") | ||
67 | (and (eq_attr "ldsched" "yes") | ||
68 | (and (eq_attr "type" "load_byte,load1") | ||
69 | - (eq_attr "is_xscale" "no")))) | ||
70 | + (eq_attr "tune" "!xscale,iwmmxt,iwmmxt2")))) | ||
71 | "core") | ||
72 | |||
73 | (define_insn_reservation "load_or_store" 2 | ||
74 | @@ -128,14 +128,16 @@ | ||
75 | (define_insn_reservation "mult_ldsched_strongarm" 3 | ||
76 | (and (eq_attr "generic_sched" "yes") | ||
77 | (and (eq_attr "ldsched" "yes") | ||
78 | - (and (eq_attr "is_strongarm" "yes") | ||
79 | + (and (eq_attr "tune" | ||
80 | + "strongarm,strongarm110,strongarm1100,strongarm1110") | ||
81 | (eq_attr "type" "mult")))) | ||
82 | "core*2") | ||
83 | |||
84 | (define_insn_reservation "mult_ldsched" 4 | ||
85 | (and (eq_attr "generic_sched" "yes") | ||
86 | (and (eq_attr "ldsched" "yes") | ||
87 | - (and (eq_attr "is_strongarm" "no") | ||
88 | + (and (eq_attr "tune" | ||
89 | + "!strongarm,strongarm110,strongarm1100,strongarm1110") | ||
90 | (eq_attr "type" "mult")))) | ||
91 | "core*4") | ||
92 | |||
93 | |||
94 | === modified file 'gcc/config/arm/arm-protos.h' | ||
95 | --- old/gcc/config/arm/arm-protos.h 2010-08-10 13:31:21 +0000 | ||
96 | +++ new/gcc/config/arm/arm-protos.h 2010-08-24 13:15:54 +0000 | ||
97 | @@ -214,4 +214,17 @@ | ||
98 | |||
99 | extern void arm_order_regs_for_local_alloc (void); | ||
100 | |||
101 | +#ifdef RTX_CODE | ||
102 | +/* This needs to be here because we need RTX_CODE and similar. */ | ||
103 | + | ||
104 | +struct tune_params | ||
105 | +{ | ||
106 | + bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool); | ||
107 | + bool (*sched_adjust_cost) (rtx, rtx, rtx, int *); | ||
108 | + int constant_limit; | ||
109 | +}; | ||
110 | + | ||
111 | +extern const struct tune_params *current_tune; | ||
112 | +#endif /* RTX_CODE */ | ||
113 | + | ||
114 | #endif /* ! GCC_ARM_PROTOS_H */ | ||
115 | |||
116 | === modified file 'gcc/config/arm/arm.c' | ||
117 | --- old/gcc/config/arm/arm.c 2010-08-20 16:21:01 +0000 | ||
118 | +++ new/gcc/config/arm/arm.c 2010-08-24 13:15:54 +0000 | ||
119 | @@ -228,6 +228,8 @@ | ||
120 | static void arm_trampoline_init (rtx, tree, rtx); | ||
121 | static rtx arm_trampoline_adjust_address (rtx); | ||
122 | static rtx arm_pic_static_addr (rtx orig, rtx reg); | ||
123 | +static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); | ||
124 | +static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); | ||
125 | static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); | ||
126 | static bool arm_builtin_support_vector_misalignment (enum machine_mode mode, | ||
127 | const_tree type, | ||
128 | @@ -545,6 +547,9 @@ | ||
129 | /* The processor for which instructions should be scheduled. */ | ||
130 | enum processor_type arm_tune = arm_none; | ||
131 | |||
132 | +/* The current tuning set. */ | ||
133 | +const struct tune_params *current_tune; | ||
134 | + | ||
135 | /* The default processor used if not overridden by commandline. */ | ||
136 | static enum processor_type arm_default_cpu = arm_none; | ||
137 | |||
138 | @@ -720,9 +725,6 @@ | ||
139 | the next function. */ | ||
140 | static int after_arm_reorg = 0; | ||
141 | |||
142 | -/* The maximum number of insns to be used when loading a constant. */ | ||
143 | -static int arm_constant_limit = 3; | ||
144 | - | ||
145 | enum arm_pcs arm_pcs_default; | ||
146 | |||
147 | /* For an explanation of these variables, see final_prescan_insn below. */ | ||
148 | @@ -761,8 +763,44 @@ | ||
149 | enum processor_type core; | ||
150 | const char *arch; | ||
151 | const unsigned long flags; | ||
152 | - bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool); | ||
153 | -}; | ||
154 | + const struct tune_params *const tune; | ||
155 | +}; | ||
156 | + | ||
157 | +const struct tune_params arm_slowmul_tune = | ||
158 | +{ | ||
159 | + arm_slowmul_rtx_costs, | ||
160 | + NULL, | ||
161 | + 3 | ||
162 | +}; | ||
163 | + | ||
164 | +const struct tune_params arm_fastmul_tune = | ||
165 | +{ | ||
166 | + arm_fastmul_rtx_costs, | ||
167 | + NULL, | ||
168 | + 1 | ||
169 | +}; | ||
170 | + | ||
171 | +const struct tune_params arm_xscale_tune = | ||
172 | +{ | ||
173 | + arm_xscale_rtx_costs, | ||
174 | + xscale_sched_adjust_cost, | ||
175 | + 2 | ||
176 | +}; | ||
177 | + | ||
178 | +const struct tune_params arm_9e_tune = | ||
179 | +{ | ||
180 | + arm_9e_rtx_costs, | ||
181 | + NULL, | ||
182 | + 1 | ||
183 | +}; | ||
184 | + | ||
185 | +const struct tune_params arm_cortex_a9_tune = | ||
186 | +{ | ||
187 | + arm_9e_rtx_costs, | ||
188 | + cortex_a9_sched_adjust_cost, | ||
189 | + 1 | ||
190 | +}; | ||
191 | + | ||
192 | |||
193 | /* Not all of these give usefully different compilation alternatives, | ||
194 | but there is no simple way of generalizing them. */ | ||
195 | @@ -770,7 +808,7 @@ | ||
196 | { | ||
197 | /* ARM Cores */ | ||
198 | #define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ | ||
199 | - {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs}, | ||
200 | + {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune}, | ||
201 | #include "arm-cores.def" | ||
202 | #undef ARM_CORE | ||
203 | {NULL, arm_none, NULL, 0, NULL} | ||
204 | @@ -779,7 +817,7 @@ | ||
205 | static const struct processors all_architectures[] = | ||
206 | { | ||
207 | /* ARM Architectures */ | ||
208 | - /* We don't specify rtx_costs here as it will be figured out | ||
209 | + /* We don't specify tuning costs here as it will be figured out | ||
210 | from the core. */ | ||
211 | |||
212 | {"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL}, | ||
213 | @@ -928,6 +966,13 @@ | ||
214 | TLS_LE32 | ||
215 | }; | ||
216 | |||
217 | +/* The maximum number of insns to be used when loading a constant. */ | ||
218 | +inline static int | ||
219 | +arm_constant_limit (bool size_p) | ||
220 | +{ | ||
221 | + return size_p ? 1 : current_tune->constant_limit; | ||
222 | +} | ||
223 | + | ||
224 | /* Emit an insn that's a simple single-set. Both the operands must be known | ||
225 | to be valid. */ | ||
226 | inline static rtx | ||
227 | @@ -1478,6 +1523,7 @@ | ||
228 | } | ||
229 | |||
230 | tune_flags = all_cores[(int)arm_tune].flags; | ||
231 | + current_tune = all_cores[(int)arm_tune].tune; | ||
232 | |||
233 | if (target_fp16_format_name) | ||
234 | { | ||
235 | @@ -1875,26 +1921,12 @@ | ||
236 | |||
237 | if (optimize_size) | ||
238 | { | ||
239 | - arm_constant_limit = 1; | ||
240 | - | ||
241 | /* If optimizing for size, bump the number of instructions that we | ||
242 | are prepared to conditionally execute (even on a StrongARM). */ | ||
243 | max_insns_skipped = 6; | ||
244 | } | ||
245 | else | ||
246 | { | ||
247 | - /* For processors with load scheduling, it never costs more than | ||
248 | - 2 cycles to load a constant, and the load scheduler may well | ||
249 | - reduce that to 1. */ | ||
250 | - if (arm_ld_sched) | ||
251 | - arm_constant_limit = 1; | ||
252 | - | ||
253 | - /* On XScale the longer latency of a load makes it more difficult | ||
254 | - to achieve a good schedule, so it's faster to synthesize | ||
255 | - constants that can be done in two insns. */ | ||
256 | - if (arm_tune_xscale) | ||
257 | - arm_constant_limit = 2; | ||
258 | - | ||
259 | /* StrongARM has early execution of branches, so a sequence | ||
260 | that is worth skipping is shorter. */ | ||
261 | if (arm_tune_strongarm) | ||
262 | @@ -2423,7 +2455,8 @@ | ||
263 | && !cond | ||
264 | && (arm_gen_constant (code, mode, NULL_RTX, val, target, source, | ||
265 | 1, 0) | ||
266 | - > arm_constant_limit + (code != SET))) | ||
267 | + > (arm_constant_limit (optimize_function_for_size_p (cfun)) | ||
268 | + + (code != SET)))) | ||
269 | { | ||
270 | if (code == SET) | ||
271 | { | ||
272 | @@ -7771,9 +7804,9 @@ | ||
273 | (enum rtx_code) outer_code, total); | ||
274 | } | ||
275 | else | ||
276 | - return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code, | ||
277 | - (enum rtx_code) outer_code, | ||
278 | - total, speed); | ||
279 | + return current_tune->rtx_costs (x, (enum rtx_code) code, | ||
280 | + (enum rtx_code) outer_code, | ||
281 | + total, speed); | ||
282 | } | ||
283 | |||
284 | /* RTX costs for cores with a slow MUL implementation. Thumb-2 is not | ||
285 | @@ -7918,7 +7951,8 @@ | ||
286 | so it can be ignored. */ | ||
287 | |||
288 | static bool | ||
289 | -arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed) | ||
290 | +arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, | ||
291 | + int *total, bool speed) | ||
292 | { | ||
293 | enum machine_mode mode = GET_MODE (x); | ||
294 | |||
295 | @@ -8119,15 +8153,15 @@ | ||
296 | return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x); | ||
297 | } | ||
298 | |||
299 | -static int | ||
300 | -arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) | ||
301 | +/* Adjust cost hook for XScale. */ | ||
302 | +static bool | ||
303 | +xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) | ||
304 | { | ||
305 | rtx i_pat, d_pat; | ||
306 | |||
307 | /* Some true dependencies can have a higher cost depending | ||
308 | on precisely how certain input operands are used. */ | ||
309 | - if (arm_tune_xscale | ||
310 | - && REG_NOTE_KIND (link) == 0 | ||
311 | + if (REG_NOTE_KIND (link) == 0 | ||
312 | && recog_memoized (insn) >= 0 | ||
313 | && recog_memoized (dep) >= 0) | ||
314 | { | ||
315 | @@ -8161,10 +8195,106 @@ | ||
316 | |||
317 | if (reg_overlap_mentioned_p (recog_data.operand[opno], | ||
318 | shifted_operand)) | ||
319 | - return 2; | ||
320 | + { | ||
321 | + *cost = 2; | ||
322 | + return false; | ||
323 | + } | ||
324 | } | ||
325 | } | ||
326 | } | ||
327 | + return true; | ||
328 | +} | ||
329 | + | ||
330 | +/* Adjust cost hook for Cortex A9. */ | ||
331 | +static bool | ||
332 | +cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) | ||
333 | +{ | ||
334 | + switch (REG_NOTE_KIND (link)) | ||
335 | + { | ||
336 | + case REG_DEP_ANTI: | ||
337 | + *cost = 0; | ||
338 | + return false; | ||
339 | + | ||
340 | + case REG_DEP_TRUE: | ||
341 | + case REG_DEP_OUTPUT: | ||
342 | + if (recog_memoized (insn) >= 0 | ||
343 | + && recog_memoized (dep) >= 0) | ||
344 | + { | ||
345 | + if (GET_CODE (PATTERN (insn)) == SET) | ||
346 | + { | ||
347 | + if (GET_MODE_CLASS | ||
348 | + (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT | ||
349 | + || GET_MODE_CLASS | ||
350 | + (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT) | ||
351 | + { | ||
352 | + enum attr_type attr_type_insn = get_attr_type (insn); | ||
353 | + enum attr_type attr_type_dep = get_attr_type (dep); | ||
354 | + | ||
355 | + /* By default all dependencies of the form | ||
356 | + s0 = s0 <op> s1 | ||
357 | + s0 = s0 <op> s2 | ||
358 | + have an extra latency of 1 cycle because | ||
359 | + of the input and output dependency in this | ||
360 | + case. However this gets modeled as a true | ||
361 | + dependency and hence all these checks. */ | ||
362 | + if (REG_P (SET_DEST (PATTERN (insn))) | ||
363 | + && REG_P (SET_DEST (PATTERN (dep))) | ||
364 | + && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)), | ||
365 | + SET_DEST (PATTERN (dep)))) | ||
366 | + { | ||
367 | + /* FMACS is a special case where the dependent | ||
368 | + instruction can be issued 3 cycles before | ||
369 | + the normal latency in case of an output | ||
370 | + dependency. */ | ||
371 | + if ((attr_type_insn == TYPE_FMACS | ||
372 | + || attr_type_insn == TYPE_FMACD) | ||
373 | + && (attr_type_dep == TYPE_FMACS | ||
374 | + || attr_type_dep == TYPE_FMACD)) | ||
375 | + { | ||
376 | + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) | ||
377 | + *cost = insn_default_latency (dep) - 3; | ||
378 | + else | ||
379 | + *cost = insn_default_latency (dep); | ||
380 | + return false; | ||
381 | + } | ||
382 | + else | ||
383 | + { | ||
384 | + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) | ||
385 | + *cost = insn_default_latency (dep) + 1; | ||
386 | + else | ||
387 | + *cost = insn_default_latency (dep); | ||
388 | + } | ||
389 | + return false; | ||
390 | + } | ||
391 | + } | ||
392 | + } | ||
393 | + } | ||
394 | + break; | ||
395 | + | ||
396 | + default: | ||
397 | + gcc_unreachable (); | ||
398 | + } | ||
399 | + | ||
400 | + return true; | ||
401 | +} | ||
402 | + | ||
403 | +/* This function implements the target macro TARGET_SCHED_ADJUST_COST. | ||
404 | + It corrects the value of COST based on the relationship between | ||
405 | + INSN and DEP through the dependence LINK. It returns the new | ||
406 | + value. There is a per-core adjust_cost hook to adjust scheduler costs | ||
407 | + and the per-core hook can choose to completely override the generic | ||
408 | + adjust_cost function. Only put bits of code into arm_adjust_cost that | ||
409 | + are common across all cores. */ | ||
410 | +static int | ||
411 | +arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) | ||
412 | +{ | ||
413 | + rtx i_pat, d_pat; | ||
414 | + | ||
415 | + if (current_tune->sched_adjust_cost != NULL) | ||
416 | + { | ||
417 | + if (!current_tune->sched_adjust_cost (insn, link, dep, &cost)) | ||
418 | + return cost; | ||
419 | + } | ||
420 | |||
421 | /* XXX This is not strictly true for the FPA. */ | ||
422 | if (REG_NOTE_KIND (link) == REG_DEP_ANTI | ||
423 | @@ -8187,7 +8317,8 @@ | ||
424 | constant pool are cached, and that others will miss. This is a | ||
425 | hack. */ | ||
426 | |||
427 | - if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem)) | ||
428 | + if ((GET_CODE (src_mem) == SYMBOL_REF | ||
429 | + && CONSTANT_POOL_ADDRESS_P (src_mem)) | ||
430 | || reg_mentioned_p (stack_pointer_rtx, src_mem) | ||
431 | || reg_mentioned_p (frame_pointer_rtx, src_mem) | ||
432 | || reg_mentioned_p (hard_frame_pointer_rtx, src_mem)) | ||
433 | |||
434 | === modified file 'gcc/config/arm/arm.md' | ||
435 | --- old/gcc/config/arm/arm.md 2010-08-23 14:39:12 +0000 | ||
436 | +++ new/gcc/config/arm/arm.md 2010-08-24 13:15:54 +0000 | ||
437 | @@ -150,13 +150,6 @@ | ||
438 | ; patterns that share the same RTL in both ARM and Thumb code. | ||
439 | (define_attr "is_thumb" "no,yes" (const (symbol_ref "thumb_code"))) | ||
440 | |||
441 | -; IS_STRONGARM is set to 'yes' when compiling for StrongARM, it affects | ||
442 | -; scheduling decisions for the load unit and the multiplier. | ||
443 | -(define_attr "is_strongarm" "no,yes" (const (symbol_ref "arm_tune_strongarm"))) | ||
444 | - | ||
445 | -; IS_XSCALE is set to 'yes' when compiling for XScale. | ||
446 | -(define_attr "is_xscale" "no,yes" (const (symbol_ref "arm_tune_xscale"))) | ||
447 | - | ||
448 | ;; Operand number of an input operand that is shifted. Zero if the | ||
449 | ;; given instruction does not shift one of its input operands. | ||
450 | (define_attr "shift" "" (const_int 0)) | ||
451 | |||
452 | === modified file 'gcc/config/arm/cortex-a9.md' | ||
453 | --- old/gcc/config/arm/cortex-a9.md 2009-10-31 16:40:03 +0000 | ||
454 | +++ new/gcc/config/arm/cortex-a9.md 2010-08-24 13:15:54 +0000 | ||
455 | @@ -2,8 +2,10 @@ | ||
456 | ;; Copyright (C) 2008, 2009 Free Software Foundation, Inc. | ||
457 | ;; Originally written by CodeSourcery for VFP. | ||
458 | ;; | ||
459 | -;; Integer core pipeline description contributed by ARM Ltd. | ||
460 | -;; | ||
461 | +;; Rewritten by Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> | ||
462 | +;; Integer Pipeline description contributed by ARM Ltd. | ||
463 | +;; VFP Pipeline description rewritten and contributed by ARM Ltd. | ||
464 | + | ||
465 | ;; This file is part of GCC. | ||
466 | ;; | ||
467 | ;; GCC is free software; you can redistribute it and/or modify it | ||
468 | @@ -22,28 +24,27 @@ | ||
469 | |||
470 | (define_automaton "cortex_a9") | ||
471 | |||
472 | -;; The Cortex-A9 integer core is modelled as a dual issue pipeline that has | ||
473 | +;; The Cortex-A9 core is modelled as a dual issue pipeline that has | ||
474 | ;; the following components. | ||
475 | ;; 1. 1 Load Store Pipeline. | ||
476 | ;; 2. P0 / main pipeline for data processing instructions. | ||
477 | ;; 3. P1 / Dual pipeline for Data processing instructions. | ||
478 | ;; 4. MAC pipeline for multiply as well as multiply | ||
479 | ;; and accumulate instructions. | ||
480 | -;; 5. 1 VFP / Neon pipeline. | ||
481 | -;; The Load/Store and VFP/Neon pipeline are multiplexed. | ||
482 | +;; 5. 1 VFP and an optional Neon unit. | ||
483 | +;; The Load/Store, VFP and Neon issue pipelines are multiplexed. | ||
484 | ;; The P0 / main pipeline and M1 stage of the MAC pipeline are | ||
485 | ;; multiplexed. | ||
486 | ;; The P1 / dual pipeline and M2 stage of the MAC pipeline are | ||
487 | ;; multiplexed. | ||
488 | -;; There are only 4 register read ports and hence at any point of | ||
489 | +;; There are only 4 integer register read ports and hence at any point of | ||
490 | ;; time we can't have issue down the E1 and the E2 ports unless | ||
491 | ;; of course there are bypass paths that get exercised. | ||
492 | ;; Both P0 and P1 have 2 stages E1 and E2. | ||
493 | ;; Data processing instructions issue to E1 or E2 depending on | ||
494 | ;; whether they have an early shift or not. | ||
495 | |||
496 | - | ||
497 | -(define_cpu_unit "cortex_a9_vfp, cortex_a9_ls" "cortex_a9") | ||
498 | +(define_cpu_unit "ca9_issue_vfp_neon, cortex_a9_ls" "cortex_a9") | ||
499 | (define_cpu_unit "cortex_a9_p0_e1, cortex_a9_p0_e2" "cortex_a9") | ||
500 | (define_cpu_unit "cortex_a9_p1_e1, cortex_a9_p1_e2" "cortex_a9") | ||
501 | (define_cpu_unit "cortex_a9_p0_wb, cortex_a9_p1_wb" "cortex_a9") | ||
502 | @@ -71,11 +72,7 @@ | ||
503 | |||
504 | ;; Issue at the same time along the load store pipeline and | ||
505 | ;; the VFP / Neon pipeline is not possible. | ||
506 | -;; FIXME:: At some point we need to model the issue | ||
507 | -;; of the load store and the vfp being shared rather than anything else. | ||
508 | - | ||
509 | -(exclusion_set "cortex_a9_ls" "cortex_a9_vfp") | ||
510 | - | ||
511 | +(exclusion_set "cortex_a9_ls" "ca9_issue_vfp_neon") | ||
512 | |||
513 | ;; Default data processing instruction without any shift | ||
514 | ;; The only exception to this is the mov instruction | ||
515 | @@ -101,18 +98,13 @@ | ||
516 | |||
517 | (define_insn_reservation "cortex_a9_load1_2" 4 | ||
518 | (and (eq_attr "tune" "cortexa9") | ||
519 | - (eq_attr "type" "load1, load2, load_byte")) | ||
520 | + (eq_attr "type" "load1, load2, load_byte, f_loads, f_loadd")) | ||
521 | "cortex_a9_ls") | ||
522 | |||
523 | ;; Loads multiples and store multiples can't be issued for 2 cycles in a | ||
524 | ;; row. The description below assumes that addresses are 64 bit aligned. | ||
525 | ;; If not, there is an extra cycle latency which is not modelled. | ||
526 | |||
527 | -;; FIXME:: This bit might need to be reworked when we get to | ||
528 | -;; tuning for the VFP because strictly speaking the ldm | ||
529 | -;; is sent to the LSU unit as is and there is only an | ||
530 | -;; issue restriction between the LSU and the VFP/ Neon unit. | ||
531 | - | ||
532 | (define_insn_reservation "cortex_a9_load3_4" 5 | ||
533 | (and (eq_attr "tune" "cortexa9") | ||
534 | (eq_attr "type" "load3, load4")) | ||
535 | @@ -120,12 +112,13 @@ | ||
536 | |||
537 | (define_insn_reservation "cortex_a9_store1_2" 0 | ||
538 | (and (eq_attr "tune" "cortexa9") | ||
539 | - (eq_attr "type" "store1, store2")) | ||
540 | + (eq_attr "type" "store1, store2, f_stores, f_stored")) | ||
541 | "cortex_a9_ls") | ||
542 | |||
543 | ;; Almost all our store multiples use an auto-increment | ||
544 | ;; form. Don't issue back to back load and store multiples | ||
545 | ;; because the load store unit will stall. | ||
546 | + | ||
547 | (define_insn_reservation "cortex_a9_store3_4" 0 | ||
548 | (and (eq_attr "tune" "cortexa9") | ||
549 | (eq_attr "type" "store3, store4")) | ||
550 | @@ -193,47 +186,79 @@ | ||
551 | (define_insn_reservation "cortex_a9_call" 0 | ||
552 | (and (eq_attr "tune" "cortexa9") | ||
553 | (eq_attr "type" "call")) | ||
554 | - "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + cortex_a9_vfp") | ||
555 | + "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + ca9_issue_vfp_neon") | ||
556 | |||
557 | |||
558 | ;; Pipelining for VFP instructions. | ||
559 | - | ||
560 | -(define_insn_reservation "cortex_a9_ffarith" 1 | ||
561 | +;; Issue happens either along load store unit or the VFP / Neon unit. | ||
562 | +;; Pipeline Instruction Classification. | ||
563 | +;; FPS - fcpys, ffariths, ffarithd,r_2_f,f_2_r | ||
564 | +;; FP_ADD - fadds, faddd, fcmps (1) | ||
565 | +;; FPMUL - fmul{s,d}, fmac{s,d} | ||
566 | +;; FPDIV - fdiv{s,d} | ||
567 | +(define_cpu_unit "ca9fps" "cortex_a9") | ||
568 | +(define_cpu_unit "ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4" "cortex_a9") | ||
569 | +(define_cpu_unit "ca9fp_mul1, ca9fp_mul2 , ca9fp_mul3, ca9fp_mul4" "cortex_a9") | ||
570 | +(define_cpu_unit "ca9fp_ds1" "cortex_a9") | ||
571 | + | ||
572 | + | ||
573 | +;; fmrs, fmrrd, fmstat and fmrx - The data is available after 1 cycle. | ||
574 | +(define_insn_reservation "cortex_a9_fps" 2 | ||
575 | (and (eq_attr "tune" "cortexa9") | ||
576 | - (eq_attr "type" "fcpys,ffariths,ffarithd,fcmps,fcmpd,fconsts,fconstd")) | ||
577 | - "cortex_a9_vfp") | ||
578 | + (eq_attr "type" "fcpys, fconsts, fconstd, ffariths, ffarithd, r_2_f, f_2_r, f_flag")) | ||
579 | + "ca9_issue_vfp_neon + ca9fps") | ||
580 | + | ||
581 | +(define_bypass 1 | ||
582 | + "cortex_a9_fps" | ||
583 | + "cortex_a9_fadd, cortex_a9_fps, cortex_a9_fcmp, cortex_a9_dp, cortex_a9_dp_shift, cortex_a9_multiply") | ||
584 | + | ||
585 | +;; Scheduling on the FP_ADD pipeline. | ||
586 | +(define_reservation "ca9fp_add" "ca9_issue_vfp_neon + ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4") | ||
587 | |||
588 | (define_insn_reservation "cortex_a9_fadd" 4 | ||
589 | - (and (eq_attr "tune" "cortexa9") | ||
590 | - (eq_attr "type" "fadds,faddd,f_cvt")) | ||
591 | - "cortex_a9_vfp") | ||
592 | - | ||
593 | -(define_insn_reservation "cortex_a9_fmuls" 5 | ||
594 | - (and (eq_attr "tune" "cortexa9") | ||
595 | - (eq_attr "type" "fmuls")) | ||
596 | - "cortex_a9_vfp") | ||
597 | - | ||
598 | -(define_insn_reservation "cortex_a9_fmuld" 6 | ||
599 | - (and (eq_attr "tune" "cortexa9") | ||
600 | - (eq_attr "type" "fmuld")) | ||
601 | - "cortex_a9_vfp*2") | ||
602 | + (and (eq_attr "tune" "cortexa9") | ||
603 | + (eq_attr "type" "fadds, faddd, f_cvt")) | ||
604 | + "ca9fp_add") | ||
605 | + | ||
606 | +(define_insn_reservation "cortex_a9_fcmp" 1 | ||
607 | + (and (eq_attr "tune" "cortexa9") | ||
608 | + (eq_attr "type" "fcmps, fcmpd")) | ||
609 | + "ca9_issue_vfp_neon + ca9fp_add1") | ||
610 | + | ||
611 | +;; Scheduling for the Multiply and MAC instructions. | ||
612 | +(define_reservation "ca9fmuls" | ||
613 | + "ca9fp_mul1 + ca9_issue_vfp_neon, ca9fp_mul2, ca9fp_mul3, ca9fp_mul4") | ||
614 | + | ||
615 | +(define_reservation "ca9fmuld" | ||
616 | + "ca9fp_mul1 + ca9_issue_vfp_neon, (ca9fp_mul1 + ca9fp_mul2), ca9fp_mul2, ca9fp_mul3, ca9fp_mul4") | ||
617 | + | ||
618 | +(define_insn_reservation "cortex_a9_fmuls" 4 | ||
619 | + (and (eq_attr "tune" "cortexa9") | ||
620 | + (eq_attr "type" "fmuls")) | ||
621 | + "ca9fmuls") | ||
622 | + | ||
623 | +(define_insn_reservation "cortex_a9_fmuld" 5 | ||
624 | + (and (eq_attr "tune" "cortexa9") | ||
625 | + (eq_attr "type" "fmuld")) | ||
626 | + "ca9fmuld") | ||
627 | |||
628 | (define_insn_reservation "cortex_a9_fmacs" 8 | ||
629 | - (and (eq_attr "tune" "cortexa9") | ||
630 | - (eq_attr "type" "fmacs")) | ||
631 | - "cortex_a9_vfp") | ||
632 | - | ||
633 | -(define_insn_reservation "cortex_a9_fmacd" 8 | ||
634 | - (and (eq_attr "tune" "cortexa9") | ||
635 | - (eq_attr "type" "fmacd")) | ||
636 | - "cortex_a9_vfp*2") | ||
637 | - | ||
638 | + (and (eq_attr "tune" "cortexa9") | ||
639 | + (eq_attr "type" "fmacs")) | ||
640 | + "ca9fmuls, ca9fp_add") | ||
641 | + | ||
642 | +(define_insn_reservation "cortex_a9_fmacd" 9 | ||
643 | + (and (eq_attr "tune" "cortexa9") | ||
644 | + (eq_attr "type" "fmacd")) | ||
645 | + "ca9fmuld, ca9fp_add") | ||
646 | + | ||
647 | +;; Division pipeline description. | ||
648 | (define_insn_reservation "cortex_a9_fdivs" 15 | ||
649 | - (and (eq_attr "tune" "cortexa9") | ||
650 | - (eq_attr "type" "fdivs")) | ||
651 | - "cortex_a9_vfp*10") | ||
652 | + (and (eq_attr "tune" "cortexa9") | ||
653 | + (eq_attr "type" "fdivs")) | ||
654 | + "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*14") | ||
655 | |||
656 | (define_insn_reservation "cortex_a9_fdivd" 25 | ||
657 | - (and (eq_attr "tune" "cortexa9") | ||
658 | - (eq_attr "type" "fdivd")) | ||
659 | - "cortex_a9_vfp*20") | ||
660 | + (and (eq_attr "tune" "cortexa9") | ||
661 | + (eq_attr "type" "fdivd")) | ||
662 | + "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*24") | ||
663 | |||