summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch')
-rw-r--r--meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch784
1 files changed, 784 insertions, 0 deletions
diff --git a/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch b/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch
new file mode 100644
index 000000000..bb866ce8d
--- /dev/null
+++ b/meta-oe/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch
@@ -0,0 +1,784 @@
12011-03-24 Revital Eres <revital.eres@linaro.org>
2
3 gcc/
4 * loop-doloop.c (doloop_condition_get): Support new form of
5 doloop pattern and use prev_nondebug_insn instead of PREV_INSN.
6 * config/arm/thumb2.md (*thumb2_addsi3_compare0): Remove "*".
7 (doloop_end): New.
8 * config/arm/arm.md (*addsi3_compare0): Remove "*".
9 * ddg.c (check_closing_branch_deps, get_node_of_insn_uid):
10 New functions.
11 (create_ddg): Pass sbitmap containing do-loop related
12 instructions instead of closing_branch_deps parameter and call
13 check_closing_branch_deps function.
14 * ddg.h (create_ddg): Adjust the function declaration.
15 * modulo-sched.c (PS_STAGE_COUNT): Rename to CALC_STAGE_COUNT
16 and redefine.
17 (doloop_register_get): Handle NONDEBUG_INSN_P.
18 (stage_count): New field in struct partial_schedule.
19 (mark_doloop_insns, calculate_stage_count): New functions.
20 (normalize_sched_times): Rename to reset_sched_times and handle
21 incrementing the sched time of the nodes by a constant value
22 passed as parameter.
23 (duplicate_insns_of_cycles): Skip closing branch.
24 (sms_schedule_by_order): Schedule closing branch when
25 closing_branch_deps is true.
26 (ps_insn_find_column): Handle closing branch.
27 (sms_schedule): Call reset_sched_times and handle case where
28 do-loop pattern is not decoupled from the other loop instructions.
29 Support new form of doloop pattern.
30 (ps_insert_empty_row): Update calls to normalize_sched_times
31 and rotate_partial_schedule functions.
32
33=== modified file 'gcc/config/arm/arm.md'
34--- old/gcc/config/arm/arm.md 2011-03-11 14:26:34 +0000
35+++ new/gcc/config/arm/arm.md 2011-03-24 07:45:38 +0000
36@@ -734,7 +734,7 @@
37 ""
38 )
39
40-(define_insn "*addsi3_compare0"
41+(define_insn "addsi3_compare0"
42 [(set (reg:CC_NOOV CC_REGNUM)
43 (compare:CC_NOOV
44 (plus:SI (match_operand:SI 1 "s_register_operand" "r, r")
45
46=== modified file 'gcc/config/arm/thumb2.md'
47--- old/gcc/config/arm/thumb2.md 2011-02-08 10:51:58 +0000
48+++ new/gcc/config/arm/thumb2.md 2011-03-24 07:45:38 +0000
49@@ -1194,7 +1194,7 @@
50 (set_attr "length" "2")]
51 )
52
53-(define_insn "*thumb2_addsi3_compare0"
54+(define_insn "thumb2_addsi3_compare0"
55 [(set (reg:CC_NOOV CC_REGNUM)
56 (compare:CC_NOOV
57 (plus:SI (match_operand:SI 1 "s_register_operand" "l, 0, r")
58@@ -1445,3 +1445,56 @@
59 [(set_attr "length" "4,4,16")
60 (set_attr "predicable" "yes")]
61 )
62+
63+
64+;; Define the subtract-one-and-jump insns so loop.c
65+;; knows what to generate.
66+(define_expand "doloop_end"
67+ [(use (match_operand 0 "" "")) ; loop pseudo
68+ (use (match_operand 1 "" "")) ; iterations; zero if unknown
69+ (use (match_operand 2 "" "")) ; max iterations
70+ (use (match_operand 3 "" "")) ; loop level
71+ (use (match_operand 4 "" ""))] ; label
72+ "TARGET_32BIT"
73+ "
74+ {
75+ /* Currently SMS relies on the do-loop pattern to recognize loops
76+ where (1) the control part consists of all insns defining and/or
77+ using a certain 'count' register and (2) the loop count can be
78+ adjusted by modifying this register prior to the loop.
79+ ??? The possible introduction of a new block to initialize the
80+ new IV can potentially affect branch optimizations. */
81+ if (optimize > 0 && flag_modulo_sched)
82+ {
83+ rtx s0;
84+ rtx bcomp;
85+ rtx loc_ref;
86+ rtx cc_reg;
87+ rtx insn;
88+ rtx cmp;
89+
90+ /* Only use this on innermost loops. */
91+ if (INTVAL (operands[3]) > 1)
92+ FAIL;
93+
94+ if (GET_MODE (operands[0]) != SImode)
95+ FAIL;
96+
97+ s0 = operands [0];
98+ if (TARGET_THUMB2)
99+ insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
100+ else
101+ insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
102+
103+ cmp = XVECEXP (PATTERN (insn), 0, 0);
104+ cc_reg = SET_DEST (cmp);
105+ bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
106+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [4]);
107+ emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
108+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
109+ loc_ref, pc_rtx)));
110+ DONE;
111+ }else
112+ FAIL;
113+ }")
114+
115
116=== modified file 'gcc/ddg.c'
117--- old/gcc/ddg.c 2010-07-19 08:58:53 +0000
118+++ new/gcc/ddg.c 2011-03-24 07:45:38 +0000
119@@ -60,6 +60,8 @@
120 static ddg_edge_ptr create_ddg_edge (ddg_node_ptr, ddg_node_ptr, dep_type,
121 dep_data_type, int, int);
122 static void add_edge_to_ddg (ddg_ptr g, ddg_edge_ptr);
123+static ddg_node_ptr get_node_of_insn_uid (ddg_ptr, int);
124+
125
126 /* Auxiliary variable for mem_read_insn_p/mem_write_insn_p. */
127 static bool mem_ref_p;
128@@ -450,12 +452,65 @@
129 sched_free_deps (head, tail, false);
130 }
131
132+/* Given DOLOOP_INSNS which holds the instructions that
133+ belong to the do-loop part; mark closing_branch_deps field in ddg G
134+ as TRUE if the do-loop part's instructions are dependent on the other
135+ loop instructions. Otherwise mark it as FALSE. */
136+static void
137+check_closing_branch_deps (ddg_ptr g, sbitmap doloop_insns)
138+{
139+ sbitmap_iterator sbi;
140+ unsigned int u = 0;
141+
142+ EXECUTE_IF_SET_IN_SBITMAP (doloop_insns, 0, u, sbi)
143+ {
144+ ddg_edge_ptr e;
145+ ddg_node_ptr u_node = get_node_of_insn_uid (g, u);
146+
147+ gcc_assert (u_node);
148+
149+ for (e = u_node->in; e != 0; e = e->next_in)
150+ {
151+ ddg_node_ptr v_node = e->src;
152+
153+ if (((unsigned int) INSN_UID (v_node->insn) == u)
154+ || DEBUG_INSN_P (v_node->insn))
155+ continue;
156+
157+ /* Ignore dependencies between memory writes and the
158+ jump. */
159+ if (JUMP_P (u_node->insn)
160+ && e->type == OUTPUT_DEP
161+ && mem_write_insn_p (v_node->insn))
162+ continue;
163+ if (!TEST_BIT (doloop_insns, INSN_UID (v_node->insn)))
164+ {
165+ g->closing_branch_deps = 1;
166+ return;
167+ }
168+ }
169+ for (e = u_node->out; e != 0; e = e->next_out)
170+ {
171+ ddg_node_ptr v_node = e->dest;
172+
173+ if (((unsigned int) INSN_UID (v_node->insn) == u)
174+ || DEBUG_INSN_P (v_node->insn))
175+ continue;
176+ if (!TEST_BIT (doloop_insns, INSN_UID (v_node->insn)))
177+ {
178+ g->closing_branch_deps = 1;
179+ return;
180+ }
181+ }
182+ }
183+ g->closing_branch_deps = 0;
184+}
185
186 /* Given a basic block, create its DDG and return a pointer to a variable
187 of ddg type that represents it.
188 Initialize the ddg structure fields to the appropriate values. */
189 ddg_ptr
190-create_ddg (basic_block bb, int closing_branch_deps)
191+create_ddg (basic_block bb, sbitmap doloop_insns)
192 {
193 ddg_ptr g;
194 rtx insn, first_note;
195@@ -465,7 +520,6 @@
196 g = (ddg_ptr) xcalloc (1, sizeof (struct ddg));
197
198 g->bb = bb;
199- g->closing_branch_deps = closing_branch_deps;
200
201 /* Count the number of insns in the BB. */
202 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
203@@ -538,6 +592,11 @@
204 /* Build the data dependency graph. */
205 build_intra_loop_deps (g);
206 build_inter_loop_deps (g);
207+
208+ /* Check whether the do-loop part is decoupled from the other loop
209+ instructions. */
210+ check_closing_branch_deps (g, doloop_insns);
211+
212 return g;
213 }
214
215@@ -831,6 +890,18 @@
216 return NULL;
217 }
218
219+/* Given the uid of an instruction UID return the node that represents it. */
220+static ddg_node_ptr
221+get_node_of_insn_uid (ddg_ptr g, int uid)
222+{
223+ int i;
224+
225+ for (i = 0; i < g->num_nodes; i++)
226+ if (uid == INSN_UID (g->nodes[i].insn))
227+ return &g->nodes[i];
228+ return NULL;
229+}
230+
231 /* Given a set OPS of nodes in the DDG, find the set of their successors
232 which are not in OPS, and set their bits in SUCC. Bits corresponding to
233 OPS are cleared from SUCC. Leaves the other bits in SUCC unchanged. */
234
235=== modified file 'gcc/ddg.h'
236--- old/gcc/ddg.h 2009-11-25 10:55:54 +0000
237+++ new/gcc/ddg.h 2011-03-24 07:45:38 +0000
238@@ -167,7 +167,7 @@
239 };
240
241
242-ddg_ptr create_ddg (basic_block, int closing_branch_deps);
243+ddg_ptr create_ddg (basic_block, sbitmap);
244 void free_ddg (ddg_ptr);
245
246 void print_ddg (FILE *, ddg_ptr);
247
248=== modified file 'gcc/loop-doloop.c'
249--- old/gcc/loop-doloop.c 2010-07-19 08:58:53 +0000
250+++ new/gcc/loop-doloop.c 2011-03-24 07:45:38 +0000
251@@ -78,6 +78,8 @@
252 rtx inc_src;
253 rtx condition;
254 rtx pattern;
255+ rtx cc_reg = NULL_RTX;
256+ rtx reg_orig = NULL_RTX;
257
258 /* The canonical doloop pattern we expect has one of the following
259 forms:
260@@ -96,7 +98,16 @@
261 2) (set (reg) (plus (reg) (const_int -1))
262 (set (pc) (if_then_else (reg != 0)
263 (label_ref (label))
264- (pc))). */
265+ (pc))).
266+
267+ Some targets (ARM) do the comparison before the branch, as in the
268+ following form:
269+
270+ 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
271+ (set (reg) (plus (reg) (const_int -1)))])
272+ (set (pc) (if_then_else (cc == NE)
273+ (label_ref (label))
274+ (pc))) */
275
276 pattern = PATTERN (doloop_pat);
277
278@@ -104,19 +115,47 @@
279 {
280 rtx cond;
281 rtx prev_insn = prev_nondebug_insn (doloop_pat);
282+ rtx cmp_arg1, cmp_arg2;
283+ rtx cmp_orig;
284
285- /* We expect the decrement to immediately precede the branch. */
286+ /* In case the pattern is not PARALLEL we expect two forms
287+ of doloop which are cases 2) and 3) above: in case 2) the
288+ decrement immediately precedes the branch, while in case 3)
289+ the compare and decrement instructions immediately precede
290+ the branch. */
291
292 if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
293 return 0;
294
295 cmp = pattern;
296- inc = PATTERN (PREV_INSN (doloop_pat));
297+ if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)
298+ {
299+ /* The third case: the compare and decrement instructions
300+ immediately precede the branch. */
301+ cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);
302+ if (GET_CODE (cmp_orig) != SET)
303+ return 0;
304+ if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)
305+ return 0;
306+ cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);
307+ cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);
308+ if (cmp_arg2 != const0_rtx
309+ || GET_CODE (cmp_arg1) != PLUS)
310+ return 0;
311+ reg_orig = XEXP (cmp_arg1, 0);
312+ if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
313+ || !REG_P (reg_orig))
314+ return 0;
315+ cc_reg = SET_DEST (cmp_orig);
316+
317+ inc = XVECEXP (PATTERN (prev_insn), 0, 1);
318+ }
319+ else
320+ inc = PATTERN (prev_insn);
321 /* We expect the condition to be of the form (reg != 0) */
322 cond = XEXP (SET_SRC (cmp), 0);
323 if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
324 return 0;
325-
326 }
327 else
328 {
329@@ -162,11 +201,15 @@
330 return 0;
331
332 if ((XEXP (condition, 0) == reg)
333+ /* For the third case: */
334+ || ((cc_reg != NULL_RTX)
335+ && (XEXP (condition, 0) == cc_reg)
336+ && (reg_orig == reg))
337 || (GET_CODE (XEXP (condition, 0)) == PLUS
338- && XEXP (XEXP (condition, 0), 0) == reg))
339+ && XEXP (XEXP (condition, 0), 0) == reg))
340 {
341 if (GET_CODE (pattern) != PARALLEL)
342- /* The second form we expect:
343+ /* For the second form we expect:
344
345 (set (reg) (plus (reg) (const_int -1))
346 (set (pc) (if_then_else (reg != 0)
347@@ -181,7 +224,24 @@
348 (set (reg) (plus (reg) (const_int -1)))
349 (additional clobbers and uses)])
350
351- So we return that form instead.
352+ For the third form we expect:
353+
354+ (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
355+ (set (reg) (plus (reg) (const_int -1)))])
356+ (set (pc) (if_then_else (cc == NE)
357+ (label_ref (label))
358+ (pc)))
359+
360+ which is equivalent to the following:
361+
362+ (parallel [(set (cc) (compare (reg, 1))
363+ (set (reg) (plus (reg) (const_int -1)))
364+ (set (pc) (if_then_else (NE == cc)
365+ (label_ref (label))
366+ (pc))))])
367+
368+ So we return the second form instead for the two cases.
369+
370 */
371 condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
372
373
374=== modified file 'gcc/modulo-sched.c'
375--- old/gcc/modulo-sched.c 2009-11-25 10:55:54 +0000
376+++ new/gcc/modulo-sched.c 2011-03-24 07:45:38 +0000
377@@ -116,8 +116,10 @@
378
379 /* The number of different iterations the nodes in ps span, assuming
380 the stage boundaries are placed efficiently. */
381-#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
382- + 1 + (ps)->ii - 1) / (ps)->ii)
383+#define CALC_STAGE_COUNT(min_cycle,max_cycle,ii) ((max_cycle - min_cycle \
384+ + 1 + ii - 1) / ii)
385+/* The stage count of ps. */
386+#define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
387
388 /* A single instruction in the partial schedule. */
389 struct ps_insn
390@@ -155,6 +157,8 @@
391 int max_cycle;
392
393 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
394+
395+ int stage_count; /* The stage count of the partial schedule. */
396 };
397
398 /* We use this to record all the register replacements we do in
399@@ -195,6 +199,7 @@
400 rtx, rtx);
401 static void duplicate_insns_of_cycles (partial_schedule_ptr,
402 int, int, int, rtx);
403+static int calculate_stage_count (partial_schedule_ptr ps);
404
405 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
406 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
407@@ -310,10 +315,10 @@
408 either a single (parallel) branch-on-count or a (non-parallel)
409 branch immediately preceded by a single (decrement) insn. */
410 first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
411- : PREV_INSN (tail));
412+ : prev_nondebug_insn (tail));
413
414 for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
415- if (reg_mentioned_p (reg, insn))
416+ if (reg_mentioned_p (reg, insn) && NONDEBUG_INSN_P (insn))
417 {
418 if (dump_file)
419 {
420@@ -332,6 +337,24 @@
421 #endif
422 }
423
424+/* Mark in DOLOOP_INSNS the instructions that belong to the do-loop part.
425+ Use TAIL to recognize that part. */
426+static void
427+mark_doloop_insns (sbitmap doloop_insns, rtx tail)
428+{
429+ rtx first_insn_not_to_check, insn;
430+
431+ /* This is the first instruction which belongs to the doloop part. */
432+ first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
433+ : prev_nondebug_insn (tail));
434+
435+ sbitmap_zero (doloop_insns);
436+ for (insn = first_insn_not_to_check; insn != NEXT_INSN (tail);
437+ insn = NEXT_INSN (insn))
438+ if (NONDEBUG_INSN_P (insn))
439+ SET_BIT (doloop_insns, INSN_UID (insn));
440+}
441+
442 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
443 that the number of iterations is a compile-time constant. If so,
444 return the rtx that sets COUNT_REG to a constant, and set COUNT to
445@@ -569,13 +592,12 @@
446 }
447 }
448
449-/* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
450- of SCHED_ROW and SCHED_STAGE. */
451+/* Bump the SCHED_TIMEs of all nodes by AMOUNT. Set the values of
452+ SCHED_ROW and SCHED_STAGE. */
453 static void
454-normalize_sched_times (partial_schedule_ptr ps)
455+reset_sched_times (partial_schedule_ptr ps, int amount)
456 {
457 int row;
458- int amount = PS_MIN_CYCLE (ps);
459 int ii = ps->ii;
460 ps_insn_ptr crr_insn;
461
462@@ -584,6 +606,10 @@
463 {
464 ddg_node_ptr u = crr_insn->node;
465 int normalized_time = SCHED_TIME (u) - amount;
466+ int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
467+ /* The first cycle in row zero after the rotation. */
468+ int new_first_cycle_in_row_zero =
469+ new_min_cycle + ii - SMODULO (new_min_cycle, ii);
470
471 if (dump_file)
472 fprintf (dump_file, "crr_insn->node=%d, crr_insn->cycle=%d,\
473@@ -592,8 +618,30 @@
474 gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
475 gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
476 SCHED_TIME (u) = normalized_time;
477- SCHED_ROW (u) = normalized_time % ii;
478- SCHED_STAGE (u) = normalized_time / ii;
479+ crr_insn->cycle = normalized_time;
480+ SCHED_ROW (u) = SMODULO (normalized_time, ii);
481+
482+ /* If min_cycle is in row zero after the rotation then
483+ the stage count can be calculated by dividing the cycle
484+ with ii. Otherwise, the calculation is done by dividing the
485+ SMSed kernel into two intervals:
486+
487+ 1) min_cycle <= interval 0 < first_cycle_in_row_zero
488+ 2) first_cycle_in_row_zero <= interval 1 < max_cycle
489+
490+ Cycles in interval 0 are in stage 0. The stage of cycles
491+ in interval 1 should be added by 1 to take interval 0 into
492+ account. */
493+ if (SMODULO (new_min_cycle, ii) == 0)
494+ SCHED_STAGE (u) = normalized_time / ii;
495+ else
496+ {
497+ if (crr_insn->cycle < new_first_cycle_in_row_zero)
498+ SCHED_STAGE (u) = 0;
499+ else
500+ SCHED_STAGE (u) =
501+ ((SCHED_TIME (u) - new_first_cycle_in_row_zero) / ii) + 1;
502+ }
503 }
504 }
505
506@@ -646,9 +694,12 @@
507
508 /* Do not duplicate any insn which refers to count_reg as it
509 belongs to the control part.
510+ If closing_branch_deps is true the closing branch is scheduled
511+ as well and thus should be ignored.
512 TODO: This should be done by analyzing the control part of
513 the loop. */
514- if (reg_mentioned_p (count_reg, u_node->insn))
515+ if (reg_mentioned_p (count_reg, u_node->insn)
516+ || JUMP_P (ps_ij->node->insn))
517 continue;
518
519 if (for_prolog)
520@@ -894,7 +945,8 @@
521 basic_block condition_bb = NULL;
522 edge latch_edge;
523 gcov_type trip_count = 0;
524-
525+ sbitmap doloop_insns;
526+
527 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
528 | LOOPS_HAVE_RECORDED_EXITS);
529 if (number_of_loops () <= 1)
530@@ -919,6 +971,7 @@
531 setup_sched_infos ();
532 haifa_sched_init ();
533
534+ doloop_insns = sbitmap_alloc (get_max_uid () + 1);
535 /* Allocate memory to hold the DDG array one entry for each loop.
536 We use loop->num as index into this array. */
537 g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
538@@ -1009,9 +1062,11 @@
539 continue;
540 }
541
542- /* Don't handle BBs with calls or barriers, or !single_set insns,
543- or auto-increment insns (to avoid creating invalid reg-moves
544- for the auto-increment insns).
545+ /* Don't handle BBs with calls or barriers or auto-increment insns
546+ (to avoid creating invalid reg-moves for the auto-increment insns),
547+ or !single_set with the exception of instructions that include
548+ count_reg---these instructions are part of the control part
549+ that do-loop recognizes.
550 ??? Should handle auto-increment insns.
551 ??? Should handle insns defining subregs. */
552 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
553@@ -1021,7 +1076,8 @@
554 if (CALL_P (insn)
555 || BARRIER_P (insn)
556 || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
557- && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
558+ && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
559+ && !reg_mentioned_p (count_reg, insn))
560 || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
561 || (INSN_P (insn) && (set = single_set (insn))
562 && GET_CODE (SET_DEST (set)) == SUBREG))
563@@ -1048,14 +1104,16 @@
564
565 continue;
566 }
567-
568- if (! (g = create_ddg (bb, 0)))
569+ mark_doloop_insns (doloop_insns, tail);
570+ if (! (g = create_ddg (bb, doloop_insns)))
571 {
572 if (dump_file)
573 fprintf (dump_file, "SMS create_ddg failed\n");
574 continue;
575 }
576-
577+ if (dump_file)
578+ fprintf (dump_file, "SMS closing_branch_deps: %d\n",
579+ g->closing_branch_deps);
580 g_arr[loop->num] = g;
581 if (dump_file)
582 fprintf (dump_file, "...OK\n");
583@@ -1157,11 +1215,13 @@
584
585 ps = sms_schedule_by_order (g, mii, maxii, node_order);
586
587- if (ps){
588- stage_count = PS_STAGE_COUNT (ps);
589- gcc_assert(stage_count >= 1);
590- }
591-
592+ if (ps)
593+ {
594+ stage_count = calculate_stage_count (ps);
595+ gcc_assert(stage_count >= 1);
596+ PS_STAGE_COUNT(ps) = stage_count;
597+ }
598+
599 /* Stage count of 1 means that there is no interleaving between
600 iterations, let the scheduling passes do the job. */
601 if (stage_count <= 1
602@@ -1182,17 +1242,7 @@
603 else
604 {
605 struct undo_replace_buff_elem *reg_move_replaces;
606-
607- if (dump_file)
608- {
609- fprintf (dump_file,
610- "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
611- stage_count);
612- print_partial_schedule (ps, dump_file);
613- fprintf (dump_file,
614- "SMS Branch (%d) will later be scheduled at cycle %d.\n",
615- g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
616- }
617+ int amount;
618
619 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
620 the closing_branch was scheduled and should appear in the last (ii-1)
621@@ -1202,12 +1252,28 @@
622 TODO: Revisit the issue of scheduling the insns of the
623 control part relative to the branch when the control part
624 has more than one insn. */
625- normalize_sched_times (ps);
626- rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
627+ amount = (g->closing_branch_deps)? SCHED_TIME (g->closing_branch) + 1:
628+ PS_MIN_CYCLE (ps);
629+ reset_sched_times (ps, amount);
630+ rotate_partial_schedule (ps, amount);
631+
632 set_columns_for_ps (ps);
633
634 canon_loop (loop);
635
636+ if (dump_file)
637+ {
638+ fprintf (dump_file,
639+ "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
640+ stage_count);
641+ print_partial_schedule (ps, dump_file);
642+ if (!g->closing_branch_deps)
643+ fprintf (dump_file,
644+ "SMS Branch (%d) will later be scheduled at \
645+ cycle %d.\n",
646+ g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
647+ }
648+
649 /* case the BCT count is not known , Do loop-versioning */
650 if (count_reg && ! count_init)
651 {
652@@ -1252,6 +1318,7 @@
653 }
654
655 free (g_arr);
656+ sbitmap_free (doloop_insns);
657
658 /* Release scheduler data, needed until now because of DFA. */
659 haifa_sched_finish ();
660@@ -1759,8 +1826,9 @@
661 RESET_BIT (tobe_scheduled, u);
662 continue;
663 }
664-
665- if (JUMP_P (insn)) /* Closing branch handled later. */
666+ /* Closing branch handled later unless closing_branch_deps
667+ is true. */
668+ if (JUMP_P (insn) && !g->closing_branch_deps)
669 {
670 RESET_BIT (tobe_scheduled, u);
671 continue;
672@@ -1893,8 +1961,8 @@
673 if (dump_file)
674 fprintf (dump_file, "split_row=%d\n", split_row);
675
676- normalize_sched_times (ps);
677- rotate_partial_schedule (ps, ps->min_cycle);
678+ reset_sched_times (ps, PS_MIN_CYCLE (ps));
679+ rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
680
681 rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
682 for (row = 0; row < split_row; row++)
683@@ -2571,6 +2639,7 @@
684 ps_insn_ptr next_ps_i;
685 ps_insn_ptr first_must_follow = NULL;
686 ps_insn_ptr last_must_precede = NULL;
687+ ps_insn_ptr last_in_row = NULL;
688 int row;
689
690 if (! ps_i)
691@@ -2597,8 +2666,37 @@
692 else
693 last_must_precede = next_ps_i;
694 }
695+ /* The closing branch must be the last in the row. */
696+ if (must_precede
697+ && TEST_BIT (must_precede, next_ps_i->node->cuid)
698+ && JUMP_P (next_ps_i->node->insn))
699+ return false;
700+
701+ last_in_row = next_ps_i;
702 }
703
704+ /* If closing_branch_deps is true we are scheduling the closing
705+ branch as well. Make sure there is no dependent instruction after
706+ it as the branch should be the last instruction. */
707+ if (JUMP_P (ps_i->node->insn))
708+ {
709+ if (first_must_follow)
710+ return false;
711+ if (last_in_row)
712+ {
713+ /* Make the branch the last in the row. New instructions
714+ will be inserted at the beginning of the row or after the
715+ last must_precede instruction thus the branch is guaranteed
716+ to remain the last instruction in the row. */
717+ last_in_row->next_in_row = ps_i;
718+ ps_i->prev_in_row = last_in_row;
719+ ps_i->next_in_row = NULL;
720+ }
721+ else
722+ ps->rows[row] = ps_i;
723+ return true;
724+ }
725+
726 /* Now insert the node after INSERT_AFTER_PSI. */
727
728 if (! last_must_precede)
729@@ -2820,6 +2918,54 @@
730 return ps_i;
731 }
732
733+/* Calculate the stage count of the partial schedule PS. */
734+int
735+calculate_stage_count (partial_schedule_ptr ps)
736+{
737+ int stage_count;
738+
739+ /* If closing_branch_deps is false then the stage
740+ boundaries are placed efficiently, meaning that min_cycle will be
741+ placed at row 0. Otherwise, the closing branch will be placed in
742+ row ii-1. For the latter case we assume the final SMSed kernel can
743+ be divided into two intervals. This assumption is used for the
744+ stage count calculation:
745+
746+ 1) min_cycle <= interval 0 < first_cycle_in_row_zero
747+ 2) first_cycle_in_row_zero <= interval 1 < max_cycle
748+ */
749+ stage_count =
750+ CALC_STAGE_COUNT (PS_MIN_CYCLE (ps), PS_MAX_CYCLE (ps), ps->ii);
751+ if (ps->g->closing_branch_deps)
752+ {
753+ int new_min_cycle;
754+ int new_min_cycle_row;
755+ int rotation_amount = SCHED_TIME (ps->g->closing_branch) + 1;
756+
757+ /* This is the new value of min_cycle after the final rotation to
758+ bring closing branch into row ii-1. */
759+ new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
760+ /* This is the row which the new min_cycle will be placed in. */
761+ new_min_cycle_row = SMODULO (new_min_cycle, ps->ii);
762+ /* If the row of min_cycle is zero then interval 0 is empty.
763+ Otherwise, we need to calculate the stage count of interval 1
764+ and add one to it to take interval 0 into account. */
765+ if (new_min_cycle_row != 0)
766+ {
767+ int new_max_cycle, first_cycle_in_row_zero;
768+
769+ new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
770+ first_cycle_in_row_zero =
771+ new_min_cycle + ps->ii - new_min_cycle_row;
772+
773+ stage_count =
774+ CALC_STAGE_COUNT (first_cycle_in_row_zero, new_max_cycle,
775+ ps->ii) + 1;
776+ }
777+ }
778+ return stage_count;
779+}
780+
781 /* Rotate the rows of PS such that insns scheduled at time
782 START_CYCLE will appear in row 0. Updates max/min_cycles. */
783 void
784