Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0028-sched-nohz-Rewrite-and-fix-load-avg-computation-agai.patch')
-rw-r--r--  recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0028-sched-nohz-Rewrite-and-fix-load-avg-computation-agai.patch | 462
1 files changed, 462 insertions, 0 deletions

diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0028-sched-nohz-Rewrite-and-fix-load-avg-computation-agai.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0028-sched-nohz-Rewrite-and-fix-load-avg-computation-agai.patch
new file mode 100644
index 00000000..5659ce7b
--- /dev/null
+++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0028-sched-nohz-Rewrite-and-fix-load-avg-computation-agai.patch
@@ -0,0 +1,462 @@
From a7d3f237430003ca8d32d1703770f04d32a02b27 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 22 Jun 2012 15:52:09 +0200
Subject: [PATCH 028/109] sched/nohz: Rewrite and fix load-avg computation --
 again

commit 5167e8d5417bf5c322a703d2927daec727ea40dd upstream.

Thanks to Charles Wang for spotting the defects in the current code:

 - If we go idle during the sample window -- after sampling, we get a
   negative bias because we can negate our own sample.

 - If we wake up during the sample window we get a positive bias
   because we push the sample to a known active period.

So rewrite the entire nohz load-avg muck once again, now adding
copious documentation to the code.

Reported-and-tested-by: Doug Smythies <dsmythies@telus.net>
Reported-and-tested-by: Charles Wang <muming.wq@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins
[ minor edits ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 3.2: adjust filenames, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 include/linux/sched.h    |    8 ++
 kernel/sched.c           |  276 ++++++++++++++++++++++++++++++++++------------
 kernel/sched_idletask.c  |    1 -
 kernel/time/tick-sched.c |    2 +
 4 files changed, 213 insertions(+), 74 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9..5afa2a3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1892,6 +1892,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 }
 #endif

+#ifdef CONFIG_NO_HZ
+void calc_load_enter_idle(void);
+void calc_load_exit_idle(void);
+#else
+static inline void calc_load_enter_idle(void) { }
+static inline void calc_load_exit_idle(void) { }
+#endif /* CONFIG_NO_HZ */
+
 #ifndef CONFIG_CPUMASK_OFFSTACK
 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 576a27f..52ac69b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1885,7 +1885,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)

 #endif

-static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 static void update_cpu_load(struct rq *this_rq);
@@ -3401,11 +3400,73 @@ unsigned long this_cpu_load(void)
 }


+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ *   nr_active = 0;
+ *   for_each_possible_cpu(cpu)
+ *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns into the mess below:
+ *
+ *  - for_each_possible_cpu() is prohibitively expensive on machines with
+ *    serious number of cpus, therefore we need to take a distributed approach
+ *    to calculating nr_active.
+ *
+ *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ *    So assuming nr_active := 0 when we start out -- true per definition, we
+ *    can simply take per-cpu deltas and fold those into a global accumulate
+ *    to obtain the same result. See calc_load_fold_active().
+ *
+ *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ *    across the machine, we assume 10 ticks is sufficient time for every
+ *    cpu to have completed this task.
+ *
+ *    This places an upper-bound on the IRQ-off latency of the machine. Then
+ *    again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ *    this would add another cross-cpu cacheline miss and atomic operation
+ *    to the wakeup path. Instead we increment on whatever cpu the task ran
+ *    when it went into uninterruptible state and decrement on whatever cpu
+ *    did the wakeup. This means that only the sum of nr_uninterruptible over
+ *    all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
 static unsigned long calc_load_update;
 unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}

 static long calc_load_fold_active(struct rq *this_rq)
 {
@@ -3422,6 +3483,9 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }

+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
@@ -3433,30 +3497,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)

 #ifdef CONFIG_NO_HZ
 /*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ *  - When we go NO_HZ idle during the window, we can negate our sample
+ *    contribution, causing under-accounting.
+ *
+ *    We avoid this by keeping two idle-delta counters and flipping them
+ *    when the window starts, thus separating old and new NO_HZ load.
+ *
+ *    The only trick is the slight shift in index flip for read vs write.
+ *
+ *        0s            5s            10s           15s
+ *          +10           +10           +10           +10
+ *        |-|-----------|-|-----------|-|-----------|-|
+ *    r:0 0 1           1 0           0 1           1 0
+ *    w:0 1 1           0 0           1 1           0 0
+ *
+ *    This ensures we'll fold the old idle contribution in this window while
+ *    accumulating the new one.
+ *
+ *  - When we wake up from NO_HZ idle during the window, we push up our
+ *    contribution, since we effectively move our sample point to a known
+ *    busy state.
+ *
+ *    This is solved by pushing the window forward, and thus skipping the
+ *    sample, for this cpu (effectively using the idle-delta for this cpu which
+ *    was in effect at the time the window opened). This also solves the issue
+ *    of having to deal with a cpu having been in NOHZ idle for multiple
+ *    LOAD_FREQ intervals.
  *
  * When making the ILB scale, we should try to pull this in as well.
  */
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;

-static void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_write_idx(void)
 {
+	int idx = calc_load_idx;
+
+	/*
+	 * See calc_global_nohz(), if we observe the new index, we also
+	 * need to observe the new update time.
+	 */
+	smp_rmb();
+
+	/*
+	 * If the folding window started, make sure we start writing in the
+	 * next idle-delta.
+	 */
+	if (!time_before(jiffies, calc_load_update))
+		idx++;
+
+	return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+	return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+	struct rq *this_rq = this_rq();
 	long delta;

+	/*
+	 * We're going into NOHZ mode, if there's any pending delta, fold it
+	 * into the pending idle delta.
+	 */
 	delta = calc_load_fold_active(this_rq);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks_idle);
+	if (delta) {
+		int idx = calc_load_write_idx();
+		atomic_long_add(delta, &calc_load_idle[idx]);
+	}
 }

-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
 {
-	long delta = 0;
+	struct rq *this_rq = this_rq();
+
+	/*
+	 * If we're still before the sample window, we're done.
+	 */
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;

 	/*
-	 * Its got a race, we don't care...
+	 * We woke inside or after the sample window, this means we're already
+	 * accounted through the nohz accounting, so skip the entire deal and
+	 * sync up for the next window.
 	 */
-	if (atomic_long_read(&calc_load_tasks_idle))
-		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+	this_rq->calc_load_update = calc_load_update;
+	if (time_before(jiffies, this_rq->calc_load_update + 10))
+		this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+	int idx = calc_load_read_idx();
+	long delta = 0;
+
+	if (atomic_long_read(&calc_load_idle[idx]))
+		delta = atomic_long_xchg(&calc_load_idle[idx], 0);

 	return delta;
 }
@@ -3542,66 +3694,39 @@ static void calc_global_nohz(void)
 {
 	long delta, active, n;

-	/*
-	 * If we crossed a calc_load_update boundary, make sure to fold
-	 * any pending idle changes, the respective CPUs might have
-	 * missed the tick driven calc_load_account_active() update
-	 * due to NO_HZ.
-	 */
-	delta = calc_load_fold_idle();
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-
-	/*
-	 * It could be the one fold was all it took, we done!
-	 */
-	if (time_before(jiffies, calc_load_update + 10))
-		return;
-
-	/*
-	 * Catch-up, fold however many we are behind still
-	 */
-	delta = jiffies - calc_load_update - 10;
-	n = 1 + (delta / LOAD_FREQ);
+	if (!time_before(jiffies, calc_load_update + 10)) {
+		/*
+		 * Catch-up, fold however many we are behind still
+		 */
+		delta = jiffies - calc_load_update - 10;
+		n = 1 + (delta / LOAD_FREQ);

-	active = atomic_long_read(&calc_load_tasks);
-	active = active > 0 ? active * FIXED_1 : 0;
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;

-	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

-	calc_load_update += n * LOAD_FREQ;
-}
-#else
-static void calc_load_account_idle(struct rq *this_rq)
-{
-}
+		calc_load_update += n * LOAD_FREQ;
+	}

-static inline long calc_load_fold_idle(void)
-{
-	return 0;
+	/*
+	 * Flip the idle index...
+	 *
+	 * Make sure we first write the new time then flip the index, so that
+	 * calc_load_write_idx() will see the new time when it reads the new
+	 * index, this avoids a double flip messing things up.
+	 */
+	smp_wmb();
+	calc_load_idx++;
 }
+#else /* !CONFIG_NO_HZ */

-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }

-/**
- * get_avenrun - get the load average array
- * @loads:	pointer to dest load array
- * @offset:	offset to add
- * @shift:	shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-	loads[0] = (avenrun[0] + offset) << shift;
-	loads[1] = (avenrun[1] + offset) << shift;
-	loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */

 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
@@ -3609,11 +3734,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
  */
 void calc_global_load(unsigned long ticks)
 {
-	long active;
+	long active, delta;

 	if (time_before(jiffies, calc_load_update + 10))
 		return;

+	/*
+	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
 	active = atomic_long_read(&calc_load_tasks);
 	active = active > 0 ? active * FIXED_1 : 0;

@@ -3624,12 +3756,7 @@ void calc_global_load(unsigned long ticks)
 	calc_load_update += LOAD_FREQ;

 	/*
-	 * Account one period with whatever state we found before
-	 * folding in the nohz state and ageing the entire idle period.
-	 *
-	 * This avoids loosing a sample when we go idle between
-	 * calc_load_account_active() (10 ticks ago) and now and thus
-	 * under-accounting.
+	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
 	 */
 	calc_global_nohz();
 }
@@ -3646,7 +3773,6 @@ static void calc_load_account_active(struct rq *this_rq)
 		return;

 	delta = calc_load_fold_active(this_rq);
-	delta += calc_load_fold_idle();
 	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);

@@ -3654,6 +3780,10 @@ static void calc_load_account_active(struct rq *this_rq)
 }

 /*
+ * End of global load-average stuff
+ */
+
+/*
  * The exact cpuload at various idx values, calculated at every tick would be
  *   load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
  *
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 0a51882..be92bfe 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -23,7 +23,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-	calc_load_account_idle(rq);
 	return rq->idle;
 }

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c923640..9955ebd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -430,6 +430,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	 */
 	if (!ts->tick_stopped) {
 		select_nohz_load_balancer(1);
+		calc_load_enter_idle();

 		ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
@@ -563,6 +564,7 @@ void tick_nohz_restart_sched_tick(void)
 	account_idle_ticks(ticks);
 #endif

+	calc_load_exit_idle();
 	touch_softlockup_watchdog();
 	/*
 	 * Cancel the scheduled timer and restore the tick
-- 
1.7.7.6
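
A few illustrative sketches follow. First, the distributed nr_active accounting described in the patch's "Global load-average calculations" comment: each cpu publishes only the change in its own count since the last fold, and the telescoping sum \Sum_i { x_i(t_j) - x_i(t_j-1) } reproduces the true total without a for_each_possible_cpu() walk. A minimal user-space model of calc_load_fold_active() (the NR_CPUS harness and the sample numbers are made up for illustration, not kernel code):

	#include <stdio.h>

	#define NR_CPUS 4

	/* Per-cpu state, standing in for rq->calc_load_active. */
	static long calc_load_active[NR_CPUS];
	static long calc_load_tasks;		/* the global accumulator */

	/* Mirrors calc_load_fold_active(): report only the change since last fold. */
	static long calc_load_fold_active(int cpu, long nr_active)
	{
		long delta = nr_active - calc_load_active[cpu];

		calc_load_active[cpu] = nr_active;
		return delta;
	}

	int main(void)
	{
		/* Two rounds of per-cpu activity; deltas telescope to the true sum. */
		long round1[NR_CPUS] = { 2, 0, 1, 3 };
		long round2[NR_CPUS] = { 1, 4, 1, 0 };
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			calc_load_tasks += calc_load_fold_active(cpu, round1[cpu]);
		printf("after round 1: %ld\n", calc_load_tasks);	/* 6 */

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			calc_load_tasks += calc_load_fold_active(cpu, round2[cpu]);
		printf("after round 2: %ld\n", calc_load_tasks);	/* still 6 */
		return 0;
	}

Note that the round-2 deltas sum to zero, so the global accumulator correctly stays at 6 even though every individual cpu's count changed.
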
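Second, the fixed-point decay behind calc_load() and its "a1 = a0 * e + a * (1 - e)" comment. The FSHIFT/FIXED_1/EXP_* constants below are the 3.2 values from include/linux/sched.h; the main() harness and the fractional formatting (the same arithmetic /proc/loadavg uses) are illustrative only:

	#include <stdio.h>

	/* Fixed-point constants as in include/linux/sched.h (Linux 3.2). */
	#define FSHIFT  11                  /* nr of bits of precision */
	#define FIXED_1 (1 << FSHIFT)       /* 1.0 in fixed point */
	#define EXP_1   1884                /* FIXED_1 * exp(-5s/1min)  */
	#define EXP_5   2014                /* FIXED_1 * exp(-5s/5min)  */
	#define EXP_15  2037                /* FIXED_1 * exp(-5s/15min) */

	/* Mirrors calc_load(): a1 = a0 * e + a * (1 - e), all in fixed point. */
	static unsigned long
	calc_load(unsigned long load, unsigned long exp, unsigned long active)
	{
		load *= exp;
		load += active * (FIXED_1 - exp);
		return load >> FSHIFT;
	}

	int main(void)
	{
		unsigned long avenrun = 0;
		unsigned long active = 2 * FIXED_1;	/* pretend 2 runnable tasks */
		int i;

		/* Feed twelve 5-second samples (one simulated minute). */
		for (i = 0; i < 12; i++) {
			avenrun = calc_load(avenrun, EXP_1, active);
			printf("sample %2d: load %lu.%02lu\n", i + 1,
			       avenrun >> FSHIFT,
			       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
		}
		return 0;
	}

The printed series climbs from 0.16 toward 2.00, which is the exponential convergence the avenrun[] comment describes.
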
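Third, the read/write index shift that the "0s 5s 10s 15s" diagram in the patch describes. Below is a single-threaded toy model of calc_load_write_idx()/calc_load_read_idx(); the smp_rmb()/smp_wmb() pairing and real jiffies handling are deliberately omitted, and the tick values are invented:

	#include <stdio.h>

	#define LOAD_FREQ 50			/* sample window length, in fake ticks */

	static long calc_load_idle[2];		/* old/new idle-delta buckets */
	static int calc_load_idx;		/* flipped once per window */
	static unsigned long ticks;		/* stand-in for jiffies */
	static unsigned long calc_load_update = LOAD_FREQ;

	/* Writers go to the *next* bucket once the fold window has opened. */
	static int calc_load_write_idx(void)
	{
		int idx = calc_load_idx;

		if (ticks >= calc_load_update)	/* !time_before(jiffies, ...) */
			idx++;
		return idx & 1;
	}

	/* Readers always drain the current bucket. */
	static int calc_load_read_idx(void)
	{
		return calc_load_idx & 1;
	}

	int main(void)
	{
		/* A cpu goes idle with 3 pending tasks just before the window... */
		ticks = calc_load_update - 1;
		calc_load_idle[calc_load_write_idx()] += 3;

		/* ...and another goes idle just after it opened. */
		ticks = calc_load_update + 1;
		calc_load_idle[calc_load_write_idx()] += 2;

		/* The global fold drains only the old bucket, then flips. */
		long delta = calc_load_idle[calc_load_read_idx()];
		calc_load_idle[calc_load_read_idx()] = 0;
		calc_load_idx++;

		printf("folded delta %ld, deferred %ld\n",
		       delta, calc_load_idle[calc_load_read_idx()]);	/* 3 and 2 */
		return 0;
	}

The late writer's +2 lands in the other bucket, so this window's fold sees only the +3, and the +2 is held for the next window — exactly the old/new separation the two-counter scheme is for.
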
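Finally, the catch-up path in calc_global_nohz() calls calc_load_n(), which this patch does not touch; it came into the 3.2 tree earlier ("sched: Cure more NO_HZ load average woes"). The idea is to decay n missed windows in one step by raising the fixed-point decay factor to the n-th power via exponentiation by squaring. A sketch along those lines, reusing calc_load() and the constants from the second example above (shape reconstructed from the kernel's fixed_power_int(), so treat the details as approximate):

	/*
	 * Fixed-point x^n with frac_bits of precision, by repeated squaring,
	 * rounding at each step to keep the error bounded.
	 */
	static unsigned long
	fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
	{
		unsigned long result = 1UL << frac_bits;	/* 1.0 */

		if (n) {
			for (;;) {
				if (n & 1) {			/* odd bit: multiply in */
					result *= x;
					result += 1UL << (frac_bits - 1);
					result >>= frac_bits;
				}
				n >>= 1;
				if (!n)
					break;
				x *= x;				/* square for next bit */
				x += 1UL << (frac_bits - 1);
				x >>= frac_bits;
			}
		}
		return result;
	}

	/*
	 * a_n = a_0 * e^n + a * (1 - e^n): the n-window generalisation of
	 * calc_load(), used when we slept through multiple LOAD_FREQ windows.
	 */
	static unsigned long
	calc_load_n(unsigned long load, unsigned long exp,
		    unsigned long active, unsigned int n)
	{
		return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
	}

This is why calc_global_nohz() can compute n = 1 + (delta / LOAD_FREQ) and age all three avenrun[] entries in O(log n) multiplies instead of iterating the per-window update n times.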