summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch')
-rw-r--r--meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch663
1 files changed, 663 insertions, 0 deletions
diff --git a/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch b/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch
new file mode 100644
index 000000000..be102160c
--- /dev/null
+++ b/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch
@@ -0,0 +1,663 @@
12010-08-24 Andrew Stubbs <ams@codesourcery.com>
2
3 Backport from FSF:
4
5 2010-08-07 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
6
7 * config/arm/cortex-a9.md: Rewrite VFP Pipeline description.
8 * config/arm/arm.c (arm_xscale_tune): Initialize sched_adjust_cost.
9 (arm_fastmul_tune,arm_slowmul_tune, arm_9e_tune): Likewise.
10 (arm_adjust_cost): Split into xscale_sched_adjust_cost and a
11 generic part.
12 (cortex_a9_sched_adjust_cost): New function.
13 (xscale_sched_adjust_cost): New function.
14 * config/arm/arm-protos.h (struct tune_params): New field
15 sched_adjust_cost.
16 * config/arm/arm-cores.def: Adjust costs for cortex-a9.
17
18 2010-04-17 Richard Earnshaw <rearnsha@arm.com>
19
20 * arm-protos.h (tune_params): New structure.
21 * arm.c (current_tune): New variable.
22 (arm_constant_limit): Delete.
23 (struct processors): Add pointer to the tune parameters.
24 (arm_slowmul_tune): New tuning option.
25 (arm_fastmul_tune, arm_xscale_tune, arm_9e_tune): Likewise.
26 (all_cores): Adjust to pick up the tuning model.
27 (arm_constant_limit): New function.
28 (arm_override_options): Select the appropriate tuning model. Delete
29 initialization of arm_const_limit.
30 (arm_split_constant): Use the new constant-limit model.
31 (arm_rtx_costs): Pick up the current tuning model.
32 * arm.md (is_strongarm, is_xscale): Delete.
33 * arm-generic.md (load_ldsched_x, load_ldsched): Test explicitly
34 for Xscale variant architectures.
35 (mult_ldsched_strongarm, mult_ldsched): Similarly for StrongARM.
36
37 2010-08-23 Andrew Stubbs <ams@codesourcery.com>
38
39 Backport from FSF:
40
41=== modified file 'gcc/config/arm/arm-cores.def'
42--- old/gcc/config/arm/arm-cores.def 2010-07-29 15:53:39 +0000
43+++ new/gcc/config/arm/arm-cores.def 2010-08-24 13:15:54 +0000
44@@ -120,7 +120,7 @@
45 ARM_CORE("arm1156t2f-s", arm1156t2fs, 6T2, FL_LDSCHED | FL_VFPV2, 9e)
46 ARM_CORE("cortex-a5", cortexa5, 7A, FL_LDSCHED, 9e)
47 ARM_CORE("cortex-a8", cortexa8, 7A, FL_LDSCHED, 9e)
48-ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, 9e)
49+ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, cortex_a9)
50 ARM_CORE("cortex-r4", cortexr4, 7R, FL_LDSCHED, 9e)
51 ARM_CORE("cortex-r4f", cortexr4f, 7R, FL_LDSCHED, 9e)
52 ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, 9e)
53
54=== modified file 'gcc/config/arm/arm-generic.md'
55--- old/gcc/config/arm/arm-generic.md 2007-08-02 09:49:31 +0000
56+++ new/gcc/config/arm/arm-generic.md 2010-08-24 13:15:54 +0000
57@@ -104,14 +104,14 @@
58 (and (eq_attr "generic_sched" "yes")
59 (and (eq_attr "ldsched" "yes")
60 (and (eq_attr "type" "load_byte,load1")
61- (eq_attr "is_xscale" "yes"))))
62+ (eq_attr "tune" "xscale,iwmmxt,iwmmxt2"))))
63 "core")
64
65 (define_insn_reservation "load_ldsched" 2
66 (and (eq_attr "generic_sched" "yes")
67 (and (eq_attr "ldsched" "yes")
68 (and (eq_attr "type" "load_byte,load1")
69- (eq_attr "is_xscale" "no"))))
70+ (eq_attr "tune" "!xscale,iwmmxt,iwmmxt2"))))
71 "core")
72
73 (define_insn_reservation "load_or_store" 2
74@@ -128,14 +128,16 @@
75 (define_insn_reservation "mult_ldsched_strongarm" 3
76 (and (eq_attr "generic_sched" "yes")
77 (and (eq_attr "ldsched" "yes")
78- (and (eq_attr "is_strongarm" "yes")
79+ (and (eq_attr "tune"
80+ "strongarm,strongarm110,strongarm1100,strongarm1110")
81 (eq_attr "type" "mult"))))
82 "core*2")
83
84 (define_insn_reservation "mult_ldsched" 4
85 (and (eq_attr "generic_sched" "yes")
86 (and (eq_attr "ldsched" "yes")
87- (and (eq_attr "is_strongarm" "no")
88+ (and (eq_attr "tune"
89+ "!strongarm,strongarm110,strongarm1100,strongarm1110")
90 (eq_attr "type" "mult"))))
91 "core*4")
92
93
94=== modified file 'gcc/config/arm/arm-protos.h'
95--- old/gcc/config/arm/arm-protos.h 2010-08-10 13:31:21 +0000
96+++ new/gcc/config/arm/arm-protos.h 2010-08-24 13:15:54 +0000
97@@ -214,4 +214,17 @@
98
99 extern void arm_order_regs_for_local_alloc (void);
100
101+#ifdef RTX_CODE
102+/* This needs to be here because we need RTX_CODE and similar. */
103+
104+struct tune_params
105+{
106+ bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool);
107+ bool (*sched_adjust_cost) (rtx, rtx, rtx, int *);
108+ int constant_limit;
109+};
110+
111+extern const struct tune_params *current_tune;
112+#endif /* RTX_CODE */
113+
114 #endif /* ! GCC_ARM_PROTOS_H */
115
116=== modified file 'gcc/config/arm/arm.c'
117--- old/gcc/config/arm/arm.c 2010-08-20 16:21:01 +0000
118+++ new/gcc/config/arm/arm.c 2010-08-24 13:15:54 +0000
119@@ -228,6 +228,8 @@
120 static void arm_trampoline_init (rtx, tree, rtx);
121 static rtx arm_trampoline_adjust_address (rtx);
122 static rtx arm_pic_static_addr (rtx orig, rtx reg);
123+static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
124+static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
125 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
126 static bool arm_builtin_support_vector_misalignment (enum machine_mode mode,
127 const_tree type,
128@@ -545,6 +547,9 @@
129 /* The processor for which instructions should be scheduled. */
130 enum processor_type arm_tune = arm_none;
131
132+/* The current tuning set. */
133+const struct tune_params *current_tune;
134+
135 /* The default processor used if not overridden by commandline. */
136 static enum processor_type arm_default_cpu = arm_none;
137
138@@ -720,9 +725,6 @@
139 the next function. */
140 static int after_arm_reorg = 0;
141
142-/* The maximum number of insns to be used when loading a constant. */
143-static int arm_constant_limit = 3;
144-
145 enum arm_pcs arm_pcs_default;
146
147 /* For an explanation of these variables, see final_prescan_insn below. */
148@@ -761,8 +763,44 @@
149 enum processor_type core;
150 const char *arch;
151 const unsigned long flags;
152- bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool);
153-};
154+ const struct tune_params *const tune;
155+};
156+
157+const struct tune_params arm_slowmul_tune =
158+{
159+ arm_slowmul_rtx_costs,
160+ NULL,
161+ 3
162+};
163+
164+const struct tune_params arm_fastmul_tune =
165+{
166+ arm_fastmul_rtx_costs,
167+ NULL,
168+ 1
169+};
170+
171+const struct tune_params arm_xscale_tune =
172+{
173+ arm_xscale_rtx_costs,
174+ xscale_sched_adjust_cost,
175+ 2
176+};
177+
178+const struct tune_params arm_9e_tune =
179+{
180+ arm_9e_rtx_costs,
181+ NULL,
182+ 1
183+};
184+
185+const struct tune_params arm_cortex_a9_tune =
186+{
187+ arm_9e_rtx_costs,
188+ cortex_a9_sched_adjust_cost,
189+ 1
190+};
191+
192
193 /* Not all of these give usefully different compilation alternatives,
194 but there is no simple way of generalizing them. */
195@@ -770,7 +808,7 @@
196 {
197 /* ARM Cores */
198 #define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \
199- {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs},
200+ {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune},
201 #include "arm-cores.def"
202 #undef ARM_CORE
203 {NULL, arm_none, NULL, 0, NULL}
204@@ -779,7 +817,7 @@
205 static const struct processors all_architectures[] =
206 {
207 /* ARM Architectures */
208- /* We don't specify rtx_costs here as it will be figured out
209+ /* We don't specify tuning costs here as they will be figured out
210 from the core. */
211
212 {"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL},
213@@ -928,6 +966,13 @@
214 TLS_LE32
215 };
216
217+/* The maximum number of insns to be used when loading a constant. */
218+inline static int
219+arm_constant_limit (bool size_p)
220+{
221+ return size_p ? 1 : current_tune->constant_limit;
222+}
223+
224 /* Emit an insn that's a simple single-set. Both the operands must be known
225 to be valid. */
226 inline static rtx
227@@ -1478,6 +1523,7 @@
228 }
229
230 tune_flags = all_cores[(int)arm_tune].flags;
231+ current_tune = all_cores[(int)arm_tune].tune;
232
233 if (target_fp16_format_name)
234 {
235@@ -1875,26 +1921,12 @@
236
237 if (optimize_size)
238 {
239- arm_constant_limit = 1;
240-
241 /* If optimizing for size, bump the number of instructions that we
242 are prepared to conditionally execute (even on a StrongARM). */
243 max_insns_skipped = 6;
244 }
245 else
246 {
247- /* For processors with load scheduling, it never costs more than
248- 2 cycles to load a constant, and the load scheduler may well
249- reduce that to 1. */
250- if (arm_ld_sched)
251- arm_constant_limit = 1;
252-
253- /* On XScale the longer latency of a load makes it more difficult
254- to achieve a good schedule, so it's faster to synthesize
255- constants that can be done in two insns. */
256- if (arm_tune_xscale)
257- arm_constant_limit = 2;
258-
259 /* StrongARM has early execution of branches, so a sequence
260 that is worth skipping is shorter. */
261 if (arm_tune_strongarm)
262@@ -2423,7 +2455,8 @@
263 && !cond
264 && (arm_gen_constant (code, mode, NULL_RTX, val, target, source,
265 1, 0)
266- > arm_constant_limit + (code != SET)))
267+ > (arm_constant_limit (optimize_function_for_size_p (cfun))
268+ + (code != SET))))
269 {
270 if (code == SET)
271 {
272@@ -7771,9 +7804,9 @@
273 (enum rtx_code) outer_code, total);
274 }
275 else
276- return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code,
277- (enum rtx_code) outer_code,
278- total, speed);
279+ return current_tune->rtx_costs (x, (enum rtx_code) code,
280+ (enum rtx_code) outer_code,
281+ total, speed);
282 }
283
284 /* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
285@@ -7918,7 +7951,8 @@
286 so it can be ignored. */
287
288 static bool
289-arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed)
290+arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
291+ int *total, bool speed)
292 {
293 enum machine_mode mode = GET_MODE (x);
294
295@@ -8119,15 +8153,15 @@
296 return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x);
297 }
298
299-static int
300-arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
301+/* Adjust cost hook for XScale. */
302+static bool
303+xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
304 {
305 rtx i_pat, d_pat;
306
307 /* Some true dependencies can have a higher cost depending
308 on precisely how certain input operands are used. */
309- if (arm_tune_xscale
310- && REG_NOTE_KIND (link) == 0
311+ if (REG_NOTE_KIND (link) == 0
312 && recog_memoized (insn) >= 0
313 && recog_memoized (dep) >= 0)
314 {
315@@ -8161,10 +8195,106 @@
316
317 if (reg_overlap_mentioned_p (recog_data.operand[opno],
318 shifted_operand))
319- return 2;
320+ {
321+ *cost = 2;
322+ return false;
323+ }
324 }
325 }
326 }
327+ return true;
328+}
329+
330+/* Adjust cost hook for Cortex A9. */
331+static bool
332+cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
333+{
334+ switch (REG_NOTE_KIND (link))
335+ {
336+ case REG_DEP_ANTI:
337+ *cost = 0;
338+ return false;
339+
340+ case REG_DEP_TRUE:
341+ case REG_DEP_OUTPUT:
342+ if (recog_memoized (insn) >= 0
343+ && recog_memoized (dep) >= 0)
344+ {
345+ if (GET_CODE (PATTERN (insn)) == SET)
346+ {
347+ if (GET_MODE_CLASS
348+ (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT
349+ || GET_MODE_CLASS
350+ (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT)
351+ {
352+ enum attr_type attr_type_insn = get_attr_type (insn);
353+ enum attr_type attr_type_dep = get_attr_type (dep);
354+
355+ /* By default all dependencies of the form
356+ s0 = s0 <op> s1
357+ s0 = s0 <op> s2
358+ have an extra latency of 1 cycle because
359+ of the input and output dependency in this
360+ case. However this gets modeled as a true
361+ dependency and hence all these checks. */
362+ if (REG_P (SET_DEST (PATTERN (insn)))
363+ && REG_P (SET_DEST (PATTERN (dep)))
364+ && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)),
365+ SET_DEST (PATTERN (dep))))
366+ {
367+ /* FMACS is a special case where the dependent
368+ instruction can be issued 3 cycles before
369+ the normal latency in case of an output
370+ dependency. */
371+ if ((attr_type_insn == TYPE_FMACS
372+ || attr_type_insn == TYPE_FMACD)
373+ && (attr_type_dep == TYPE_FMACS
374+ || attr_type_dep == TYPE_FMACD))
375+ {
376+ if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
377+ *cost = insn_default_latency (dep) - 3;
378+ else
379+ *cost = insn_default_latency (dep);
380+ return false;
381+ }
382+ else
383+ {
384+ if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
385+ *cost = insn_default_latency (dep) + 1;
386+ else
387+ *cost = insn_default_latency (dep);
388+ }
389+ return false;
390+ }
391+ }
392+ }
393+ }
394+ break;
395+
396+ default:
397+ gcc_unreachable ();
398+ }
399+
400+ return true;
401+}
402+
403+/* This function implements the target macro TARGET_SCHED_ADJUST_COST.
404+ It corrects the value of COST based on the relationship between
405+ INSN and DEP through the dependence LINK. It returns the new
406+ value. There is a per-core adjust_cost hook to adjust scheduler costs
407+ and the per-core hook can choose to completely override the generic
408+ adjust_cost function. Only put bits of code into arm_adjust_cost that
409+ are common across all cores. */
410+static int
411+arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
412+{
413+ rtx i_pat, d_pat;
414+
415+ if (current_tune->sched_adjust_cost != NULL)
416+ {
417+ if (!current_tune->sched_adjust_cost (insn, link, dep, &cost))
418+ return cost;
419+ }
420
421 /* XXX This is not strictly true for the FPA. */
422 if (REG_NOTE_KIND (link) == REG_DEP_ANTI
423@@ -8187,7 +8317,8 @@
424 constant pool are cached, and that others will miss. This is a
425 hack. */
426
427- if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem))
428+ if ((GET_CODE (src_mem) == SYMBOL_REF
429+ && CONSTANT_POOL_ADDRESS_P (src_mem))
430 || reg_mentioned_p (stack_pointer_rtx, src_mem)
431 || reg_mentioned_p (frame_pointer_rtx, src_mem)
432 || reg_mentioned_p (hard_frame_pointer_rtx, src_mem))
433
434=== modified file 'gcc/config/arm/arm.md'
435--- old/gcc/config/arm/arm.md 2010-08-23 14:39:12 +0000
436+++ new/gcc/config/arm/arm.md 2010-08-24 13:15:54 +0000
437@@ -150,13 +150,6 @@
438 ; patterns that share the same RTL in both ARM and Thumb code.
439 (define_attr "is_thumb" "no,yes" (const (symbol_ref "thumb_code")))
440
441-; IS_STRONGARM is set to 'yes' when compiling for StrongARM, it affects
442-; scheduling decisions for the load unit and the multiplier.
443-(define_attr "is_strongarm" "no,yes" (const (symbol_ref "arm_tune_strongarm")))
444-
445-; IS_XSCALE is set to 'yes' when compiling for XScale.
446-(define_attr "is_xscale" "no,yes" (const (symbol_ref "arm_tune_xscale")))
447-
448 ;; Operand number of an input operand that is shifted. Zero if the
449 ;; given instruction does not shift one of its input operands.
450 (define_attr "shift" "" (const_int 0))
451
452=== modified file 'gcc/config/arm/cortex-a9.md'
453--- old/gcc/config/arm/cortex-a9.md 2009-10-31 16:40:03 +0000
454+++ new/gcc/config/arm/cortex-a9.md 2010-08-24 13:15:54 +0000
455@@ -2,8 +2,10 @@
456 ;; Copyright (C) 2008, 2009 Free Software Foundation, Inc.
457 ;; Originally written by CodeSourcery for VFP.
458 ;;
459-;; Integer core pipeline description contributed by ARM Ltd.
460-;;
461+;; Rewritten by Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
462+;; Integer Pipeline description contributed by ARM Ltd.
463+;; VFP Pipeline description rewritten and contributed by ARM Ltd.
464+
465 ;; This file is part of GCC.
466 ;;
467 ;; GCC is free software; you can redistribute it and/or modify it
468@@ -22,28 +24,27 @@
469
470 (define_automaton "cortex_a9")
471
472-;; The Cortex-A9 integer core is modelled as a dual issue pipeline that has
473+;; The Cortex-A9 core is modelled as a dual issue pipeline that has
474 ;; the following components.
475 ;; 1. 1 Load Store Pipeline.
476 ;; 2. P0 / main pipeline for data processing instructions.
477 ;; 3. P1 / Dual pipeline for Data processing instructions.
478 ;; 4. MAC pipeline for multiply as well as multiply
479 ;; and accumulate instructions.
480-;; 5. 1 VFP / Neon pipeline.
481-;; The Load/Store and VFP/Neon pipeline are multiplexed.
482+;; 5. 1 VFP and an optional Neon unit.
483+;; The Load/Store, VFP and Neon issue pipelines are multiplexed.
484 ;; The P0 / main pipeline and M1 stage of the MAC pipeline are
485 ;; multiplexed.
486 ;; The P1 / dual pipeline and M2 stage of the MAC pipeline are
487 ;; multiplexed.
488-;; There are only 4 register read ports and hence at any point of
489+;; There are only 4 integer register read ports and hence at any point of
490 ;; time we can't have issue down the E1 and the E2 ports unless
491 ;; of course there are bypass paths that get exercised.
492 ;; Both P0 and P1 have 2 stages E1 and E2.
493 ;; Data processing instructions issue to E1 or E2 depending on
494 ;; whether they have an early shift or not.
495
496-
497-(define_cpu_unit "cortex_a9_vfp, cortex_a9_ls" "cortex_a9")
498+(define_cpu_unit "ca9_issue_vfp_neon, cortex_a9_ls" "cortex_a9")
499 (define_cpu_unit "cortex_a9_p0_e1, cortex_a9_p0_e2" "cortex_a9")
500 (define_cpu_unit "cortex_a9_p1_e1, cortex_a9_p1_e2" "cortex_a9")
501 (define_cpu_unit "cortex_a9_p0_wb, cortex_a9_p1_wb" "cortex_a9")
502@@ -71,11 +72,7 @@
503
504 ;; Issue at the same time along the load store pipeline and
505 ;; the VFP / Neon pipeline is not possible.
506-;; FIXME:: At some point we need to model the issue
507-;; of the load store and the vfp being shared rather than anything else.
508-
509-(exclusion_set "cortex_a9_ls" "cortex_a9_vfp")
510-
511+(exclusion_set "cortex_a9_ls" "ca9_issue_vfp_neon")
512
513 ;; Default data processing instruction without any shift
514 ;; The only exception to this is the mov instruction
515@@ -101,18 +98,13 @@
516
517 (define_insn_reservation "cortex_a9_load1_2" 4
518 (and (eq_attr "tune" "cortexa9")
519- (eq_attr "type" "load1, load2, load_byte"))
520+ (eq_attr "type" "load1, load2, load_byte, f_loads, f_loadd"))
521 "cortex_a9_ls")
522
523 ;; Loads multiples and store multiples can't be issued for 2 cycles in a
524 ;; row. The description below assumes that addresses are 64 bit aligned.
525 ;; If not, there is an extra cycle latency which is not modelled.
526
527-;; FIXME:: This bit might need to be reworked when we get to
528-;; tuning for the VFP because strictly speaking the ldm
529-;; is sent to the LSU unit as is and there is only an
530-;; issue restriction between the LSU and the VFP/ Neon unit.
531-
532 (define_insn_reservation "cortex_a9_load3_4" 5
533 (and (eq_attr "tune" "cortexa9")
534 (eq_attr "type" "load3, load4"))
535@@ -120,12 +112,13 @@
536
537 (define_insn_reservation "cortex_a9_store1_2" 0
538 (and (eq_attr "tune" "cortexa9")
539- (eq_attr "type" "store1, store2"))
540+ (eq_attr "type" "store1, store2, f_stores, f_stored"))
541 "cortex_a9_ls")
542
543 ;; Almost all our store multiples use an auto-increment
544 ;; form. Don't issue back to back load and store multiples
545 ;; because the load store unit will stall.
546+
547 (define_insn_reservation "cortex_a9_store3_4" 0
548 (and (eq_attr "tune" "cortexa9")
549 (eq_attr "type" "store3, store4"))
550@@ -193,47 +186,79 @@
551 (define_insn_reservation "cortex_a9_call" 0
552 (and (eq_attr "tune" "cortexa9")
553 (eq_attr "type" "call"))
554- "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + cortex_a9_vfp")
555+ "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + ca9_issue_vfp_neon")
556
557
558 ;; Pipelining for VFP instructions.
559-
560-(define_insn_reservation "cortex_a9_ffarith" 1
561+;; Issue happens either along load store unit or the VFP / Neon unit.
562+;; Pipeline Instruction Classification.
563+;; FPS - fcpys, ffariths, ffarithd,r_2_f,f_2_r
564+;; FP_ADD - fadds, faddd, fcmps (1)
565+;; FPMUL - fmul{s,d}, fmac{s,d}
566+;; FPDIV - fdiv{s,d}
567+(define_cpu_unit "ca9fps" "cortex_a9")
568+(define_cpu_unit "ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4" "cortex_a9")
569+(define_cpu_unit "ca9fp_mul1, ca9fp_mul2 , ca9fp_mul3, ca9fp_mul4" "cortex_a9")
570+(define_cpu_unit "ca9fp_ds1" "cortex_a9")
571+
572+
573+;; fmrs, fmrrd, fmstat and fmrx - The data is available after 1 cycle.
574+(define_insn_reservation "cortex_a9_fps" 2
575 (and (eq_attr "tune" "cortexa9")
576- (eq_attr "type" "fcpys,ffariths,ffarithd,fcmps,fcmpd,fconsts,fconstd"))
577- "cortex_a9_vfp")
578+ (eq_attr "type" "fcpys, fconsts, fconstd, ffariths, ffarithd, r_2_f, f_2_r, f_flag"))
579+ "ca9_issue_vfp_neon + ca9fps")
580+
581+(define_bypass 1
582+ "cortex_a9_fps"
583+ "cortex_a9_fadd, cortex_a9_fps, cortex_a9_fcmp, cortex_a9_dp, cortex_a9_dp_shift, cortex_a9_multiply")
584+
585+;; Scheduling on the FP_ADD pipeline.
586+(define_reservation "ca9fp_add" "ca9_issue_vfp_neon + ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4")
587
588 (define_insn_reservation "cortex_a9_fadd" 4
589- (and (eq_attr "tune" "cortexa9")
590- (eq_attr "type" "fadds,faddd,f_cvt"))
591- "cortex_a9_vfp")
592-
593-(define_insn_reservation "cortex_a9_fmuls" 5
594- (and (eq_attr "tune" "cortexa9")
595- (eq_attr "type" "fmuls"))
596- "cortex_a9_vfp")
597-
598-(define_insn_reservation "cortex_a9_fmuld" 6
599- (and (eq_attr "tune" "cortexa9")
600- (eq_attr "type" "fmuld"))
601- "cortex_a9_vfp*2")
602+ (and (eq_attr "tune" "cortexa9")
603+ (eq_attr "type" "fadds, faddd, f_cvt"))
604+ "ca9fp_add")
605+
606+(define_insn_reservation "cortex_a9_fcmp" 1
607+ (and (eq_attr "tune" "cortexa9")
608+ (eq_attr "type" "fcmps, fcmpd"))
609+ "ca9_issue_vfp_neon + ca9fp_add1")
610+
611+;; Scheduling for the Multiply and MAC instructions.
612+(define_reservation "ca9fmuls"
613+ "ca9fp_mul1 + ca9_issue_vfp_neon, ca9fp_mul2, ca9fp_mul3, ca9fp_mul4")
614+
615+(define_reservation "ca9fmuld"
616+ "ca9fp_mul1 + ca9_issue_vfp_neon, (ca9fp_mul1 + ca9fp_mul2), ca9fp_mul2, ca9fp_mul3, ca9fp_mul4")
617+
618+(define_insn_reservation "cortex_a9_fmuls" 4
619+ (and (eq_attr "tune" "cortexa9")
620+ (eq_attr "type" "fmuls"))
621+ "ca9fmuls")
622+
623+(define_insn_reservation "cortex_a9_fmuld" 5
624+ (and (eq_attr "tune" "cortexa9")
625+ (eq_attr "type" "fmuld"))
626+ "ca9fmuld")
627
628 (define_insn_reservation "cortex_a9_fmacs" 8
629- (and (eq_attr "tune" "cortexa9")
630- (eq_attr "type" "fmacs"))
631- "cortex_a9_vfp")
632-
633-(define_insn_reservation "cortex_a9_fmacd" 8
634- (and (eq_attr "tune" "cortexa9")
635- (eq_attr "type" "fmacd"))
636- "cortex_a9_vfp*2")
637-
638+ (and (eq_attr "tune" "cortexa9")
639+ (eq_attr "type" "fmacs"))
640+ "ca9fmuls, ca9fp_add")
641+
642+(define_insn_reservation "cortex_a9_fmacd" 9
643+ (and (eq_attr "tune" "cortexa9")
644+ (eq_attr "type" "fmacd"))
645+ "ca9fmuld, ca9fp_add")
646+
647+;; Division pipeline description.
648 (define_insn_reservation "cortex_a9_fdivs" 15
649- (and (eq_attr "tune" "cortexa9")
650- (eq_attr "type" "fdivs"))
651- "cortex_a9_vfp*10")
652+ (and (eq_attr "tune" "cortexa9")
653+ (eq_attr "type" "fdivs"))
654+ "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*14")
655
656 (define_insn_reservation "cortex_a9_fdivd" 25
657- (and (eq_attr "tune" "cortexa9")
658- (eq_attr "type" "fdivd"))
659- "cortex_a9_vfp*20")
660+ (and (eq_attr "tune" "cortexa9")
661+ (eq_attr "type" "fdivd"))
662+ "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*24")
663