diff options
Diffstat (limited to 'recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch')
| -rw-r--r-- | recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch | 663 |
1 files changed, 663 insertions, 0 deletions
diff --git a/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch b/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch new file mode 100644 index 0000000000..be102160c5 --- /dev/null +++ b/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99371.patch | |||
| @@ -0,0 +1,663 @@ | |||
| 1 | 2010-08-24 Andrew Stubbs <ams@codesourcery.com> | ||
| 2 | |||
| 3 | Backport from FSF: | ||
| 4 | |||
| 5 | 2010-08-07 Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> | ||
| 6 | |||
| 7 | * config/arm/cortex-a9.md: Rewrite VFP Pipeline description. | ||
| 8 | * config/arm/arm.c (arm_xscale_tune): Initialize sched_adjust_cost. | ||
| 9 | (arm_fastmul_tune,arm_slowmul_tune, arm_9e_tune): Likewise. | ||
| 10 | (arm_adjust_cost): Split into xscale_sched_adjust_cost and a | ||
| 11 | generic part. | ||
| 12 | (cortex_a9_sched_adjust_cost): New function. | ||
| 13 | (xscale_sched_adjust_cost): New function. | ||
| 14 | * config/arm/arm-protos.h (struct tune_params): New field | ||
| 15 | sched_adjust_cost. | ||
| 16 | * config/arm/arm-cores.def: Adjust costs for cortex-a9. | ||
| 17 | |||
| 18 | 2010-04-17 Richard Earnshaw <rearnsha@arm.com> | ||
| 19 | |||
| 20 | * arm-protos.h (tune_params): New structure. | ||
| 21 | * arm.c (current_tune): New variable. | ||
| 22 | (arm_constant_limit): Delete. | ||
| 23 | (struct processors): Add pointer to the tune parameters. | ||
| 24 | (arm_slowmul_tune): New tuning option. | ||
| 25 | (arm_fastmul_tune, arm_xscale_tune, arm_9e_tune): Likewise. | ||
| 26 | (all_cores): Adjust to pick up the tuning model. | ||
| 27 | (arm_constant_limit): New function. | ||
| 28 | (arm_override_options): Select the appropriate tuning model. Delete | ||
| 29 | initialization of arm_const_limit. | ||
| 30 | (arm_split_constant): Use the new constant-limit model. | ||
| 31 | (arm_rtx_costs): Pick up the current tuning model. | ||
| 32 | * arm.md (is_strongarm, is_xscale): Delete. | ||
| 33 | * arm-generic.md (load_ldsched_x, load_ldsched): Test explicitly | ||
| 34 | for Xscale variant architectures. | ||
| 35 | (mult_ldsched_strongarm, mult_ldsched): Similarly for StrongARM. | ||
| 36 | |||
| 37 | 2010-08-23 Andrew Stubbs <ams@codesourcery.com> | ||
| 38 | |||
| 39 | Backport from FSF: | ||
| 40 | |||
| 41 | === modified file 'gcc/config/arm/arm-cores.def' | ||
| 42 | --- old/gcc/config/arm/arm-cores.def 2010-07-29 15:53:39 +0000 | ||
| 43 | +++ new/gcc/config/arm/arm-cores.def 2010-08-24 13:15:54 +0000 | ||
| 44 | @@ -120,7 +120,7 @@ | ||
| 45 | ARM_CORE("arm1156t2f-s", arm1156t2fs, 6T2, FL_LDSCHED | FL_VFPV2, 9e) | ||
| 46 | ARM_CORE("cortex-a5", cortexa5, 7A, FL_LDSCHED, 9e) | ||
| 47 | ARM_CORE("cortex-a8", cortexa8, 7A, FL_LDSCHED, 9e) | ||
| 48 | -ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, 9e) | ||
| 49 | +ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, cortex_a9) | ||
| 50 | ARM_CORE("cortex-r4", cortexr4, 7R, FL_LDSCHED, 9e) | ||
| 51 | ARM_CORE("cortex-r4f", cortexr4f, 7R, FL_LDSCHED, 9e) | ||
| 52 | ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, 9e) | ||
| 53 | |||
| 54 | === modified file 'gcc/config/arm/arm-generic.md' | ||
| 55 | --- old/gcc/config/arm/arm-generic.md 2007-08-02 09:49:31 +0000 | ||
| 56 | +++ new/gcc/config/arm/arm-generic.md 2010-08-24 13:15:54 +0000 | ||
| 57 | @@ -104,14 +104,14 @@ | ||
| 58 | (and (eq_attr "generic_sched" "yes") | ||
| 59 | (and (eq_attr "ldsched" "yes") | ||
| 60 | (and (eq_attr "type" "load_byte,load1") | ||
| 61 | - (eq_attr "is_xscale" "yes")))) | ||
| 62 | + (eq_attr "tune" "xscale,iwmmxt,iwmmxt2")))) | ||
| 63 | "core") | ||
| 64 | |||
| 65 | (define_insn_reservation "load_ldsched" 2 | ||
| 66 | (and (eq_attr "generic_sched" "yes") | ||
| 67 | (and (eq_attr "ldsched" "yes") | ||
| 68 | (and (eq_attr "type" "load_byte,load1") | ||
| 69 | - (eq_attr "is_xscale" "no")))) | ||
| 70 | + (eq_attr "tune" "!xscale,iwmmxt,iwmmxt2")))) | ||
| 71 | "core") | ||
| 72 | |||
| 73 | (define_insn_reservation "load_or_store" 2 | ||
| 74 | @@ -128,14 +128,16 @@ | ||
| 75 | (define_insn_reservation "mult_ldsched_strongarm" 3 | ||
| 76 | (and (eq_attr "generic_sched" "yes") | ||
| 77 | (and (eq_attr "ldsched" "yes") | ||
| 78 | - (and (eq_attr "is_strongarm" "yes") | ||
| 79 | + (and (eq_attr "tune" | ||
| 80 | + "strongarm,strongarm110,strongarm1100,strongarm1110") | ||
| 81 | (eq_attr "type" "mult")))) | ||
| 82 | "core*2") | ||
| 83 | |||
| 84 | (define_insn_reservation "mult_ldsched" 4 | ||
| 85 | (and (eq_attr "generic_sched" "yes") | ||
| 86 | (and (eq_attr "ldsched" "yes") | ||
| 87 | - (and (eq_attr "is_strongarm" "no") | ||
| 88 | + (and (eq_attr "tune" | ||
| 89 | + "!strongarm,strongarm110,strongarm1100,strongarm1110") | ||
| 90 | (eq_attr "type" "mult")))) | ||
| 91 | "core*4") | ||
| 92 | |||
| 93 | |||
| 94 | === modified file 'gcc/config/arm/arm-protos.h' | ||
| 95 | --- old/gcc/config/arm/arm-protos.h 2010-08-10 13:31:21 +0000 | ||
| 96 | +++ new/gcc/config/arm/arm-protos.h 2010-08-24 13:15:54 +0000 | ||
| 97 | @@ -214,4 +214,17 @@ | ||
| 98 | |||
| 99 | extern void arm_order_regs_for_local_alloc (void); | ||
| 100 | |||
| 101 | +#ifdef RTX_CODE | ||
| 102 | +/* This needs to be here because we need RTX_CODE and similar. */ | ||
| 103 | + | ||
| 104 | +struct tune_params | ||
| 105 | +{ | ||
| 106 | + bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool); | ||
| 107 | + bool (*sched_adjust_cost) (rtx, rtx, rtx, int *); | ||
| 108 | + int constant_limit; | ||
| 109 | +}; | ||
| 110 | + | ||
| 111 | +extern const struct tune_params *current_tune; | ||
| 112 | +#endif /* RTX_CODE */ | ||
| 113 | + | ||
| 114 | #endif /* ! GCC_ARM_PROTOS_H */ | ||
| 115 | |||
| 116 | === modified file 'gcc/config/arm/arm.c' | ||
| 117 | --- old/gcc/config/arm/arm.c 2010-08-20 16:21:01 +0000 | ||
| 118 | +++ new/gcc/config/arm/arm.c 2010-08-24 13:15:54 +0000 | ||
| 119 | @@ -228,6 +228,8 @@ | ||
| 120 | static void arm_trampoline_init (rtx, tree, rtx); | ||
| 121 | static rtx arm_trampoline_adjust_address (rtx); | ||
| 122 | static rtx arm_pic_static_addr (rtx orig, rtx reg); | ||
| 123 | +static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); | ||
| 124 | +static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); | ||
| 125 | static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); | ||
| 126 | static bool arm_builtin_support_vector_misalignment (enum machine_mode mode, | ||
| 127 | const_tree type, | ||
| 128 | @@ -545,6 +547,9 @@ | ||
| 129 | /* The processor for which instructions should be scheduled. */ | ||
| 130 | enum processor_type arm_tune = arm_none; | ||
| 131 | |||
| 132 | +/* The current tuning set. */ | ||
| 133 | +const struct tune_params *current_tune; | ||
| 134 | + | ||
| 135 | /* The default processor used if not overridden by commandline. */ | ||
| 136 | static enum processor_type arm_default_cpu = arm_none; | ||
| 137 | |||
| 138 | @@ -720,9 +725,6 @@ | ||
| 139 | the next function. */ | ||
| 140 | static int after_arm_reorg = 0; | ||
| 141 | |||
| 142 | -/* The maximum number of insns to be used when loading a constant. */ | ||
| 143 | -static int arm_constant_limit = 3; | ||
| 144 | - | ||
| 145 | enum arm_pcs arm_pcs_default; | ||
| 146 | |||
| 147 | /* For an explanation of these variables, see final_prescan_insn below. */ | ||
| 148 | @@ -761,8 +763,44 @@ | ||
| 149 | enum processor_type core; | ||
| 150 | const char *arch; | ||
| 151 | const unsigned long flags; | ||
| 152 | - bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool); | ||
| 153 | -}; | ||
| 154 | + const struct tune_params *const tune; | ||
| 155 | +}; | ||
| 156 | + | ||
| 157 | +const struct tune_params arm_slowmul_tune = | ||
| 158 | +{ | ||
| 159 | + arm_slowmul_rtx_costs, | ||
| 160 | + NULL, | ||
| 161 | + 3 | ||
| 162 | +}; | ||
| 163 | + | ||
| 164 | +const struct tune_params arm_fastmul_tune = | ||
| 165 | +{ | ||
| 166 | + arm_fastmul_rtx_costs, | ||
| 167 | + NULL, | ||
| 168 | + 1 | ||
| 169 | +}; | ||
| 170 | + | ||
| 171 | +const struct tune_params arm_xscale_tune = | ||
| 172 | +{ | ||
| 173 | + arm_xscale_rtx_costs, | ||
| 174 | + xscale_sched_adjust_cost, | ||
| 175 | + 2 | ||
| 176 | +}; | ||
| 177 | + | ||
| 178 | +const struct tune_params arm_9e_tune = | ||
| 179 | +{ | ||
| 180 | + arm_9e_rtx_costs, | ||
| 181 | + NULL, | ||
| 182 | + 1 | ||
| 183 | +}; | ||
| 184 | + | ||
| 185 | +const struct tune_params arm_cortex_a9_tune = | ||
| 186 | +{ | ||
| 187 | + arm_9e_rtx_costs, | ||
| 188 | + cortex_a9_sched_adjust_cost, | ||
| 189 | + 1 | ||
| 190 | +}; | ||
| 191 | + | ||
| 192 | |||
| 193 | /* Not all of these give usefully different compilation alternatives, | ||
| 194 | but there is no simple way of generalizing them. */ | ||
| 195 | @@ -770,7 +808,7 @@ | ||
| 196 | { | ||
| 197 | /* ARM Cores */ | ||
| 198 | #define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ | ||
| 199 | - {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs}, | ||
| 200 | + {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune}, | ||
| 201 | #include "arm-cores.def" | ||
| 202 | #undef ARM_CORE | ||
| 203 | {NULL, arm_none, NULL, 0, NULL} | ||
| 204 | @@ -779,7 +817,7 @@ | ||
| 205 | static const struct processors all_architectures[] = | ||
| 206 | { | ||
| 207 | /* ARM Architectures */ | ||
| 208 | - /* We don't specify rtx_costs here as it will be figured out | ||
| 209 | + /* We don't specify tuning costs here as it will be figured out | ||
| 210 | from the core. */ | ||
| 211 | |||
| 212 | {"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL}, | ||
| 213 | @@ -928,6 +966,13 @@ | ||
| 214 | TLS_LE32 | ||
| 215 | }; | ||
| 216 | |||
| 217 | +/* The maximum number of insns to be used when loading a constant. */ | ||
| 218 | +inline static int | ||
| 219 | +arm_constant_limit (bool size_p) | ||
| 220 | +{ | ||
| 221 | + return size_p ? 1 : current_tune->constant_limit; | ||
| 222 | +} | ||
| 223 | + | ||
| 224 | /* Emit an insn that's a simple single-set. Both the operands must be known | ||
| 225 | to be valid. */ | ||
| 226 | inline static rtx | ||
| 227 | @@ -1478,6 +1523,7 @@ | ||
| 228 | } | ||
| 229 | |||
| 230 | tune_flags = all_cores[(int)arm_tune].flags; | ||
| 231 | + current_tune = all_cores[(int)arm_tune].tune; | ||
| 232 | |||
| 233 | if (target_fp16_format_name) | ||
| 234 | { | ||
| 235 | @@ -1875,26 +1921,12 @@ | ||
| 236 | |||
| 237 | if (optimize_size) | ||
| 238 | { | ||
| 239 | - arm_constant_limit = 1; | ||
| 240 | - | ||
| 241 | /* If optimizing for size, bump the number of instructions that we | ||
| 242 | are prepared to conditionally execute (even on a StrongARM). */ | ||
| 243 | max_insns_skipped = 6; | ||
| 244 | } | ||
| 245 | else | ||
| 246 | { | ||
| 247 | - /* For processors with load scheduling, it never costs more than | ||
| 248 | - 2 cycles to load a constant, and the load scheduler may well | ||
| 249 | - reduce that to 1. */ | ||
| 250 | - if (arm_ld_sched) | ||
| 251 | - arm_constant_limit = 1; | ||
| 252 | - | ||
| 253 | - /* On XScale the longer latency of a load makes it more difficult | ||
| 254 | - to achieve a good schedule, so it's faster to synthesize | ||
| 255 | - constants that can be done in two insns. */ | ||
| 256 | - if (arm_tune_xscale) | ||
| 257 | - arm_constant_limit = 2; | ||
| 258 | - | ||
| 259 | /* StrongARM has early execution of branches, so a sequence | ||
| 260 | that is worth skipping is shorter. */ | ||
| 261 | if (arm_tune_strongarm) | ||
| 262 | @@ -2423,7 +2455,8 @@ | ||
| 263 | && !cond | ||
| 264 | && (arm_gen_constant (code, mode, NULL_RTX, val, target, source, | ||
| 265 | 1, 0) | ||
| 266 | - > arm_constant_limit + (code != SET))) | ||
| 267 | + > (arm_constant_limit (optimize_function_for_size_p (cfun)) | ||
| 268 | + + (code != SET)))) | ||
| 269 | { | ||
| 270 | if (code == SET) | ||
| 271 | { | ||
| 272 | @@ -7771,9 +7804,9 @@ | ||
| 273 | (enum rtx_code) outer_code, total); | ||
| 274 | } | ||
| 275 | else | ||
| 276 | - return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code, | ||
| 277 | - (enum rtx_code) outer_code, | ||
| 278 | - total, speed); | ||
| 279 | + return current_tune->rtx_costs (x, (enum rtx_code) code, | ||
| 280 | + (enum rtx_code) outer_code, | ||
| 281 | + total, speed); | ||
| 282 | } | ||
| 283 | |||
| 284 | /* RTX costs for cores with a slow MUL implementation. Thumb-2 is not | ||
| 285 | @@ -7918,7 +7951,8 @@ | ||
| 286 | so it can be ignored. */ | ||
| 287 | |||
| 288 | static bool | ||
| 289 | -arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed) | ||
| 290 | +arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, | ||
| 291 | + int *total, bool speed) | ||
| 292 | { | ||
| 293 | enum machine_mode mode = GET_MODE (x); | ||
| 294 | |||
| 295 | @@ -8119,15 +8153,15 @@ | ||
| 296 | return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x); | ||
| 297 | } | ||
| 298 | |||
| 299 | -static int | ||
| 300 | -arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) | ||
| 301 | +/* Adjust cost hook for XScale. */ | ||
| 302 | +static bool | ||
| 303 | +xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) | ||
| 304 | { | ||
| 305 | rtx i_pat, d_pat; | ||
| 306 | |||
| 307 | /* Some true dependencies can have a higher cost depending | ||
| 308 | on precisely how certain input operands are used. */ | ||
| 309 | - if (arm_tune_xscale | ||
| 310 | - && REG_NOTE_KIND (link) == 0 | ||
| 311 | + if (REG_NOTE_KIND (link) == 0 | ||
| 312 | && recog_memoized (insn) >= 0 | ||
| 313 | && recog_memoized (dep) >= 0) | ||
| 314 | { | ||
| 315 | @@ -8161,10 +8195,106 @@ | ||
| 316 | |||
| 317 | if (reg_overlap_mentioned_p (recog_data.operand[opno], | ||
| 318 | shifted_operand)) | ||
| 319 | - return 2; | ||
| 320 | + { | ||
| 321 | + *cost = 2; | ||
| 322 | + return false; | ||
| 323 | + } | ||
| 324 | } | ||
| 325 | } | ||
| 326 | } | ||
| 327 | + return true; | ||
| 328 | +} | ||
| 329 | + | ||
| 330 | +/* Adjust cost hook for Cortex A9. */ | ||
| 331 | +static bool | ||
| 332 | +cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) | ||
| 333 | +{ | ||
| 334 | + switch (REG_NOTE_KIND (link)) | ||
| 335 | + { | ||
| 336 | + case REG_DEP_ANTI: | ||
| 337 | + *cost = 0; | ||
| 338 | + return false; | ||
| 339 | + | ||
| 340 | + case REG_DEP_TRUE: | ||
| 341 | + case REG_DEP_OUTPUT: | ||
| 342 | + if (recog_memoized (insn) >= 0 | ||
| 343 | + && recog_memoized (dep) >= 0) | ||
| 344 | + { | ||
| 345 | + if (GET_CODE (PATTERN (insn)) == SET) | ||
| 346 | + { | ||
| 347 | + if (GET_MODE_CLASS | ||
| 348 | + (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT | ||
| 349 | + || GET_MODE_CLASS | ||
| 350 | + (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT) | ||
| 351 | + { | ||
| 352 | + enum attr_type attr_type_insn = get_attr_type (insn); | ||
| 353 | + enum attr_type attr_type_dep = get_attr_type (dep); | ||
| 354 | + | ||
| 355 | + /* By default all dependencies of the form | ||
| 356 | + s0 = s0 <op> s1 | ||
| 357 | + s0 = s0 <op> s2 | ||
| 358 | + have an extra latency of 1 cycle because | ||
| 359 | + of the input and output dependency in this | ||
| 360 | + case. However this gets modeled as a true | ||
| 361 | + dependency and hence all these checks. */ | ||
| 362 | + if (REG_P (SET_DEST (PATTERN (insn))) | ||
| 363 | + && REG_P (SET_DEST (PATTERN (dep))) | ||
| 364 | + && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)), | ||
| 365 | + SET_DEST (PATTERN (dep)))) | ||
| 366 | + { | ||
| 367 | + /* FMACS is a special case where the dependent | ||
| 368 | + instruction can be issued 3 cycles before | ||
| 369 | + the normal latency in case of an output | ||
| 370 | + dependency. */ | ||
| 371 | + if ((attr_type_insn == TYPE_FMACS | ||
| 372 | + || attr_type_insn == TYPE_FMACD) | ||
| 373 | + && (attr_type_dep == TYPE_FMACS | ||
| 374 | + || attr_type_dep == TYPE_FMACD)) | ||
| 375 | + { | ||
| 376 | + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) | ||
| 377 | + *cost = insn_default_latency (dep) - 3; | ||
| 378 | + else | ||
| 379 | + *cost = insn_default_latency (dep); | ||
| 380 | + return false; | ||
| 381 | + } | ||
| 382 | + else | ||
| 383 | + { | ||
| 384 | + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) | ||
| 385 | + *cost = insn_default_latency (dep) + 1; | ||
| 386 | + else | ||
| 387 | + *cost = insn_default_latency (dep); | ||
| 388 | + } | ||
| 389 | + return false; | ||
| 390 | + } | ||
| 391 | + } | ||
| 392 | + } | ||
| 393 | + } | ||
| 394 | + break; | ||
| 395 | + | ||
| 396 | + default: | ||
| 397 | + gcc_unreachable (); | ||
| 398 | + } | ||
| 399 | + | ||
| 400 | + return true; | ||
| 401 | +} | ||
| 402 | + | ||
| 403 | +/* This function implements the target macro TARGET_SCHED_ADJUST_COST. | ||
| 404 | + It corrects the value of COST based on the relationship between | ||
| 405 | + INSN and DEP through the dependence LINK. It returns the new | ||
| 406 | + value. There is a per-core adjust_cost hook to adjust scheduler costs | ||
| 407 | + and the per-core hook can choose to completely override the generic | ||
| 408 | + adjust_cost function. Only put bits of code into arm_adjust_cost that | ||
| 409 | + are common across all cores. */ | ||
| 410 | +static int | ||
| 411 | +arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) | ||
| 412 | +{ | ||
| 413 | + rtx i_pat, d_pat; | ||
| 414 | + | ||
| 415 | + if (current_tune->sched_adjust_cost != NULL) | ||
| 416 | + { | ||
| 417 | + if (!current_tune->sched_adjust_cost (insn, link, dep, &cost)) | ||
| 418 | + return cost; | ||
| 419 | + } | ||
| 420 | |||
| 421 | /* XXX This is not strictly true for the FPA. */ | ||
| 422 | if (REG_NOTE_KIND (link) == REG_DEP_ANTI | ||
| 423 | @@ -8187,7 +8317,8 @@ | ||
| 424 | constant pool are cached, and that others will miss. This is a | ||
| 425 | hack. */ | ||
| 426 | |||
| 427 | - if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem)) | ||
| 428 | + if ((GET_CODE (src_mem) == SYMBOL_REF | ||
| 429 | + && CONSTANT_POOL_ADDRESS_P (src_mem)) | ||
| 430 | || reg_mentioned_p (stack_pointer_rtx, src_mem) | ||
| 431 | || reg_mentioned_p (frame_pointer_rtx, src_mem) | ||
| 432 | || reg_mentioned_p (hard_frame_pointer_rtx, src_mem)) | ||
| 433 | |||
| 434 | === modified file 'gcc/config/arm/arm.md' | ||
| 435 | --- old/gcc/config/arm/arm.md 2010-08-23 14:39:12 +0000 | ||
| 436 | +++ new/gcc/config/arm/arm.md 2010-08-24 13:15:54 +0000 | ||
| 437 | @@ -150,13 +150,6 @@ | ||
| 438 | ; patterns that share the same RTL in both ARM and Thumb code. | ||
| 439 | (define_attr "is_thumb" "no,yes" (const (symbol_ref "thumb_code"))) | ||
| 440 | |||
| 441 | -; IS_STRONGARM is set to 'yes' when compiling for StrongARM, it affects | ||
| 442 | -; scheduling decisions for the load unit and the multiplier. | ||
| 443 | -(define_attr "is_strongarm" "no,yes" (const (symbol_ref "arm_tune_strongarm"))) | ||
| 444 | - | ||
| 445 | -; IS_XSCALE is set to 'yes' when compiling for XScale. | ||
| 446 | -(define_attr "is_xscale" "no,yes" (const (symbol_ref "arm_tune_xscale"))) | ||
| 447 | - | ||
| 448 | ;; Operand number of an input operand that is shifted. Zero if the | ||
| 449 | ;; given instruction does not shift one of its input operands. | ||
| 450 | (define_attr "shift" "" (const_int 0)) | ||
| 451 | |||
| 452 | === modified file 'gcc/config/arm/cortex-a9.md' | ||
| 453 | --- old/gcc/config/arm/cortex-a9.md 2009-10-31 16:40:03 +0000 | ||
| 454 | +++ new/gcc/config/arm/cortex-a9.md 2010-08-24 13:15:54 +0000 | ||
| 455 | @@ -2,8 +2,10 @@ | ||
| 456 | ;; Copyright (C) 2008, 2009 Free Software Foundation, Inc. | ||
| 457 | ;; Originally written by CodeSourcery for VFP. | ||
| 458 | ;; | ||
| 459 | -;; Integer core pipeline description contributed by ARM Ltd. | ||
| 460 | -;; | ||
| 461 | +;; Rewritten by Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> | ||
| 462 | +;; Integer Pipeline description contributed by ARM Ltd. | ||
| 463 | +;; VFP Pipeline description rewritten and contributed by ARM Ltd. | ||
| 464 | + | ||
| 465 | ;; This file is part of GCC. | ||
| 466 | ;; | ||
| 467 | ;; GCC is free software; you can redistribute it and/or modify it | ||
| 468 | @@ -22,28 +24,27 @@ | ||
| 469 | |||
| 470 | (define_automaton "cortex_a9") | ||
| 471 | |||
| 472 | -;; The Cortex-A9 integer core is modelled as a dual issue pipeline that has | ||
| 473 | +;; The Cortex-A9 core is modelled as a dual issue pipeline that has | ||
| 474 | ;; the following components. | ||
| 475 | ;; 1. 1 Load Store Pipeline. | ||
| 476 | ;; 2. P0 / main pipeline for data processing instructions. | ||
| 477 | ;; 3. P1 / Dual pipeline for Data processing instructions. | ||
| 478 | ;; 4. MAC pipeline for multiply as well as multiply | ||
| 479 | ;; and accumulate instructions. | ||
| 480 | -;; 5. 1 VFP / Neon pipeline. | ||
| 481 | -;; The Load/Store and VFP/Neon pipeline are multiplexed. | ||
| 482 | +;; 5. 1 VFP and an optional Neon unit. | ||
| 483 | +;; The Load/Store, VFP and Neon issue pipeline are multiplexed. | ||
| 484 | ;; The P0 / main pipeline and M1 stage of the MAC pipeline are | ||
| 485 | ;; multiplexed. | ||
| 486 | ;; The P1 / dual pipeline and M2 stage of the MAC pipeline are | ||
| 487 | ;; multiplexed. | ||
| 488 | -;; There are only 4 register read ports and hence at any point of | ||
| 489 | +;; There are only 4 integer register read ports and hence at any point of | ||
| 490 | ;; time we can't have issue down the E1 and the E2 ports unless | ||
| 491 | ;; of course there are bypass paths that get exercised. | ||
| 492 | ;; Both P0 and P1 have 2 stages E1 and E2. | ||
| 493 | ;; Data processing instructions issue to E1 or E2 depending on | ||
| 494 | ;; whether they have an early shift or not. | ||
| 495 | |||
| 496 | - | ||
| 497 | -(define_cpu_unit "cortex_a9_vfp, cortex_a9_ls" "cortex_a9") | ||
| 498 | +(define_cpu_unit "ca9_issue_vfp_neon, cortex_a9_ls" "cortex_a9") | ||
| 499 | (define_cpu_unit "cortex_a9_p0_e1, cortex_a9_p0_e2" "cortex_a9") | ||
| 500 | (define_cpu_unit "cortex_a9_p1_e1, cortex_a9_p1_e2" "cortex_a9") | ||
| 501 | (define_cpu_unit "cortex_a9_p0_wb, cortex_a9_p1_wb" "cortex_a9") | ||
| 502 | @@ -71,11 +72,7 @@ | ||
| 503 | |||
| 504 | ;; Issue at the same time along the load store pipeline and | ||
| 505 | ;; the VFP / Neon pipeline is not possible. | ||
| 506 | -;; FIXME:: At some point we need to model the issue | ||
| 507 | -;; of the load store and the vfp being shared rather than anything else. | ||
| 508 | - | ||
| 509 | -(exclusion_set "cortex_a9_ls" "cortex_a9_vfp") | ||
| 510 | - | ||
| 511 | +(exclusion_set "cortex_a9_ls" "ca9_issue_vfp_neon") | ||
| 512 | |||
| 513 | ;; Default data processing instruction without any shift | ||
| 514 | ;; The only exception to this is the mov instruction | ||
| 515 | @@ -101,18 +98,13 @@ | ||
| 516 | |||
| 517 | (define_insn_reservation "cortex_a9_load1_2" 4 | ||
| 518 | (and (eq_attr "tune" "cortexa9") | ||
| 519 | - (eq_attr "type" "load1, load2, load_byte")) | ||
| 520 | + (eq_attr "type" "load1, load2, load_byte, f_loads, f_loadd")) | ||
| 521 | "cortex_a9_ls") | ||
| 522 | |||
| 523 | ;; Loads multiples and store multiples can't be issued for 2 cycles in a | ||
| 524 | ;; row. The description below assumes that addresses are 64 bit aligned. | ||
| 525 | ;; If not, there is an extra cycle latency which is not modelled. | ||
| 526 | |||
| 527 | -;; FIXME:: This bit might need to be reworked when we get to | ||
| 528 | -;; tuning for the VFP because strictly speaking the ldm | ||
| 529 | -;; is sent to the LSU unit as is and there is only an | ||
| 530 | -;; issue restriction between the LSU and the VFP/ Neon unit. | ||
| 531 | - | ||
| 532 | (define_insn_reservation "cortex_a9_load3_4" 5 | ||
| 533 | (and (eq_attr "tune" "cortexa9") | ||
| 534 | (eq_attr "type" "load3, load4")) | ||
| 535 | @@ -120,12 +112,13 @@ | ||
| 536 | |||
| 537 | (define_insn_reservation "cortex_a9_store1_2" 0 | ||
| 538 | (and (eq_attr "tune" "cortexa9") | ||
| 539 | - (eq_attr "type" "store1, store2")) | ||
| 540 | + (eq_attr "type" "store1, store2, f_stores, f_stored")) | ||
| 541 | "cortex_a9_ls") | ||
| 542 | |||
| 543 | ;; Almost all our store multiples use an auto-increment | ||
| 544 | ;; form. Don't issue back to back load and store multiples | ||
| 545 | ;; because the load store unit will stall. | ||
| 546 | + | ||
| 547 | (define_insn_reservation "cortex_a9_store3_4" 0 | ||
| 548 | (and (eq_attr "tune" "cortexa9") | ||
| 549 | (eq_attr "type" "store3, store4")) | ||
| 550 | @@ -193,47 +186,79 @@ | ||
| 551 | (define_insn_reservation "cortex_a9_call" 0 | ||
| 552 | (and (eq_attr "tune" "cortexa9") | ||
| 553 | (eq_attr "type" "call")) | ||
| 554 | - "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + cortex_a9_vfp") | ||
| 555 | + "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + ca9_issue_vfp_neon") | ||
| 556 | |||
| 557 | |||
| 558 | ;; Pipelining for VFP instructions. | ||
| 559 | - | ||
| 560 | -(define_insn_reservation "cortex_a9_ffarith" 1 | ||
| 561 | +;; Issue happens either along load store unit or the VFP / Neon unit. | ||
| 562 | +;; Pipeline Instruction Classification. | ||
| 563 | +;; FPS - fcpys, ffariths, ffarithd,r_2_f,f_2_r | ||
| 564 | +;; FP_ADD - fadds, faddd, fcmps (1) | ||
| 565 | +;; FPMUL - fmul{s,d}, fmac{s,d} | ||
| 566 | +;; FPDIV - fdiv{s,d} | ||
| 567 | +(define_cpu_unit "ca9fps" "cortex_a9") | ||
| 568 | +(define_cpu_unit "ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4" "cortex_a9") | ||
| 569 | +(define_cpu_unit "ca9fp_mul1, ca9fp_mul2 , ca9fp_mul3, ca9fp_mul4" "cortex_a9") | ||
| 570 | +(define_cpu_unit "ca9fp_ds1" "cortex_a9") | ||
| 571 | + | ||
| 572 | + | ||
| 573 | +;; fmrs, fmrrd, fmstat and fmrx - The data is available after 1 cycle. | ||
| 574 | +(define_insn_reservation "cortex_a9_fps" 2 | ||
| 575 | (and (eq_attr "tune" "cortexa9") | ||
| 576 | - (eq_attr "type" "fcpys,ffariths,ffarithd,fcmps,fcmpd,fconsts,fconstd")) | ||
| 577 | - "cortex_a9_vfp") | ||
| 578 | + (eq_attr "type" "fcpys, fconsts, fconstd, ffariths, ffarithd, r_2_f, f_2_r, f_flag")) | ||
| 579 | + "ca9_issue_vfp_neon + ca9fps") | ||
| 580 | + | ||
| 581 | +(define_bypass 1 | ||
| 582 | + "cortex_a9_fps" | ||
| 583 | + "cortex_a9_fadd, cortex_a9_fps, cortex_a9_fcmp, cortex_a9_dp, cortex_a9_dp_shift, cortex_a9_multiply") | ||
| 584 | + | ||
| 585 | +;; Scheduling on the FP_ADD pipeline. | ||
| 586 | +(define_reservation "ca9fp_add" "ca9_issue_vfp_neon + ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4") | ||
| 587 | |||
| 588 | (define_insn_reservation "cortex_a9_fadd" 4 | ||
| 589 | - (and (eq_attr "tune" "cortexa9") | ||
| 590 | - (eq_attr "type" "fadds,faddd,f_cvt")) | ||
| 591 | - "cortex_a9_vfp") | ||
| 592 | - | ||
| 593 | -(define_insn_reservation "cortex_a9_fmuls" 5 | ||
| 594 | - (and (eq_attr "tune" "cortexa9") | ||
| 595 | - (eq_attr "type" "fmuls")) | ||
| 596 | - "cortex_a9_vfp") | ||
| 597 | - | ||
| 598 | -(define_insn_reservation "cortex_a9_fmuld" 6 | ||
| 599 | - (and (eq_attr "tune" "cortexa9") | ||
| 600 | - (eq_attr "type" "fmuld")) | ||
| 601 | - "cortex_a9_vfp*2") | ||
| 602 | + (and (eq_attr "tune" "cortexa9") | ||
| 603 | + (eq_attr "type" "fadds, faddd, f_cvt")) | ||
| 604 | + "ca9fp_add") | ||
| 605 | + | ||
| 606 | +(define_insn_reservation "cortex_a9_fcmp" 1 | ||
| 607 | + (and (eq_attr "tune" "cortexa9") | ||
| 608 | + (eq_attr "type" "fcmps, fcmpd")) | ||
| 609 | + "ca9_issue_vfp_neon + ca9fp_add1") | ||
| 610 | + | ||
| 611 | +;; Scheduling for the Multiply and MAC instructions. | ||
| 612 | +(define_reservation "ca9fmuls" | ||
| 613 | + "ca9fp_mul1 + ca9_issue_vfp_neon, ca9fp_mul2, ca9fp_mul3, ca9fp_mul4") | ||
| 614 | + | ||
| 615 | +(define_reservation "ca9fmuld" | ||
| 616 | + "ca9fp_mul1 + ca9_issue_vfp_neon, (ca9fp_mul1 + ca9fp_mul2), ca9fp_mul2, ca9fp_mul3, ca9fp_mul4") | ||
| 617 | + | ||
| 618 | +(define_insn_reservation "cortex_a9_fmuls" 4 | ||
| 619 | + (and (eq_attr "tune" "cortexa9") | ||
| 620 | + (eq_attr "type" "fmuls")) | ||
| 621 | + "ca9fmuls") | ||
| 622 | + | ||
| 623 | +(define_insn_reservation "cortex_a9_fmuld" 5 | ||
| 624 | + (and (eq_attr "tune" "cortexa9") | ||
| 625 | + (eq_attr "type" "fmuld")) | ||
| 626 | + "ca9fmuld") | ||
| 627 | |||
| 628 | (define_insn_reservation "cortex_a9_fmacs" 8 | ||
| 629 | - (and (eq_attr "tune" "cortexa9") | ||
| 630 | - (eq_attr "type" "fmacs")) | ||
| 631 | - "cortex_a9_vfp") | ||
| 632 | - | ||
| 633 | -(define_insn_reservation "cortex_a9_fmacd" 8 | ||
| 634 | - (and (eq_attr "tune" "cortexa9") | ||
| 635 | - (eq_attr "type" "fmacd")) | ||
| 636 | - "cortex_a9_vfp*2") | ||
| 637 | - | ||
| 638 | + (and (eq_attr "tune" "cortexa9") | ||
| 639 | + (eq_attr "type" "fmacs")) | ||
| 640 | + "ca9fmuls, ca9fp_add") | ||
| 641 | + | ||
| 642 | +(define_insn_reservation "cortex_a9_fmacd" 9 | ||
| 643 | + (and (eq_attr "tune" "cortexa9") | ||
| 644 | + (eq_attr "type" "fmacd")) | ||
| 645 | + "ca9fmuld, ca9fp_add") | ||
| 646 | + | ||
| 647 | +;; Division pipeline description. | ||
| 648 | (define_insn_reservation "cortex_a9_fdivs" 15 | ||
| 649 | - (and (eq_attr "tune" "cortexa9") | ||
| 650 | - (eq_attr "type" "fdivs")) | ||
| 651 | - "cortex_a9_vfp*10") | ||
| 652 | + (and (eq_attr "tune" "cortexa9") | ||
| 653 | + (eq_attr "type" "fdivs")) | ||
| 654 | + "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*14") | ||
| 655 | |||
| 656 | (define_insn_reservation "cortex_a9_fdivd" 25 | ||
| 657 | - (and (eq_attr "tune" "cortexa9") | ||
| 658 | - (eq_attr "type" "fdivd")) | ||
| 659 | - "cortex_a9_vfp*20") | ||
| 660 | + (and (eq_attr "tune" "cortexa9") | ||
| 661 | + (eq_attr "type" "fdivd")) | ||
| 662 | + "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*24") | ||
| 663 | |||
