Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch')
-rw-r--r--  recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch  353
1 files changed, 353 insertions, 0 deletions
diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch
new file mode 100644
index 00000000..95d4a73e
--- /dev/null
+++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch
@@ -0,0 +1,353 @@
From 9016ec427136d5b5d025948319cf1114dc7734e4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 18 Feb 2012 12:56:35 -0800
Subject: [PATCH 10/11] i387: re-introduce FPU state preloading at context
 switch time

commit 34ddc81a230b15c0e345b6b253049db731499f7e upstream.

After all the FPU state cleanups and finally finding the problem that
caused all our FPU save/restore problems, this re-introduces the
preloading of FPU state that was removed in commit b3b0870ef3ff ("i387:
do not preload FPU state at task switch time").

However, instead of simply reverting the removal, this reimplements
preloading with several fixes, most notably

 - properly abstracted as a true FPU state switch, rather than as
 open-coded save and restore with various hacks.

 In particular, implementing it as a proper FPU state switch allows us
 to optimize the CR0.TS flag accesses: there is no reason to set the
 TS bit only to then almost immediately clear it again. CR0 accesses
 are quite slow and expensive, don't flip the bit back and forth for
 no good reason.

 - Make sure that the same model works for both x86-32 and x86-64, so
 that there are no gratuitous differences between the two due to the
 way they save and restore segment state differently due to
 architectural differences that really don't matter to the FPU state.

 - Avoid exposing the "preload" state to the context switch routines,
 and in particular allow the concept of lazy state restore: if nothing
 else has used the FPU in the meantime, and the process is still on
 the same CPU, we can avoid restoring state from memory entirely, just
 re-expose the state that is still in the FPU unit.

 That optimized lazy restore isn't actually implemented here, but the
 infrastructure is set up for it. Of course, older CPU's that use
 'fnsave' to save the state cannot take advantage of this, since the
 state saving also trashes the state.

In other words, there is now an actual _design_ to the FPU state saving,
rather than just random historical baggage. Hopefully it's easier to
follow as a result.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/i387.h | 110 +++++++++++++++++++++++++++++++++++-------
 arch/x86/kernel/process_32.c | 5 ++-
 arch/x86/kernel/process_64.c | 5 ++-
 arch/x86/kernel/traps.c | 55 ++++++++++++---------
 4 files changed, 133 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index f537667..a850b4d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -29,6 +29,7 @@ extern unsigned int sig_xstate_size;
 extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
+extern void __math_state_restore(struct task_struct *);
 extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);

@@ -212,9 +213,10 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #endif /* CONFIG_X86_64 */

 /*
- * These must be called with preempt disabled
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact.
 */
-static inline void fpu_save_init(struct fpu *fpu)
+static inline int fpu_save_init(struct fpu *fpu)
 {
 if (use_xsave()) {
 fpu_xsave(fpu);
@@ -223,22 +225,33 @@ static inline void fpu_save_init(struct fpu *fpu)
 * xsave header may indicate the init state of the FP.
 */
 if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
- return;
+ return 1;
 } else if (use_fxsr()) {
 fpu_fxsave(fpu);
 } else {
 asm volatile("fnsave %[fx]; fwait"
 : [fx] "=m" (fpu->state->fsave));
- return;
+ return 0;
 }

- if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+ /*
+ * If exceptions are pending, we need to clear them so
+ * that we don't randomly get exceptions later.
+ *
+ * FIXME! Is this perhaps only true for the old-style
+ * irq13 case? Maybe we could leave the x87 state
+ * intact otherwise?
+ */
+ if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
 asm volatile("fnclex");
+ return 0;
+ }
+ return 1;
 }

-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline int __save_init_fpu(struct task_struct *tsk)
 {
- fpu_save_init(&tsk->thread.fpu);
+ return fpu_save_init(&tsk->thread.fpu);
 }

 static inline int fpu_fxrstor_checking(struct fpu *fpu)
@@ -301,20 +314,79 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
 }

 /*
- * Signal frame handlers...
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ * - switch_fpu_prepare() saves the old state and
+ * sets the new state of the CR0.TS bit. This is
+ * done within the context of the old process.
+ *
+ * - switch_fpu_finish() restores the new state as
+ * necessary.
 */
-extern int save_i387_xstate(void __user *buf);
-extern int restore_i387_xstate(void __user *buf);
+typedef struct { int preload; } fpu_switch_t;
+
+/*
+ * FIXME! We could do a totally lazy restore, but we need to
+ * add a per-cpu "this was the task that last touched the FPU
+ * on this CPU" variable, and the task needs to have a "I last
+ * touched the FPU on this CPU" and check them.
+ *
+ * We don't do that yet, so "fpu_lazy_restore()" always returns
+ * false, but some day..
+ */
+#define fpu_lazy_restore(tsk) (0)
+#define fpu_lazy_state_intact(tsk) do { } while (0)
+
+static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new)
+{
+ fpu_switch_t fpu;
+
+ fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
+ if (__thread_has_fpu(old)) {
+ if (__save_init_fpu(old))
+ fpu_lazy_state_intact(old);
+ __thread_clear_has_fpu(old);
+ old->fpu_counter++;
+
+ /* Don't change CR0.TS if we just switch! */
+ if (fpu.preload) {
+ __thread_set_has_fpu(new);
+ prefetch(new->thread.fpu.state);
+ } else
+ stts();
+ } else {
+ old->fpu_counter = 0;
+ if (fpu.preload) {
+ if (fpu_lazy_restore(new))
+ fpu.preload = 0;
+ else
+ prefetch(new->thread.fpu.state);
+ __thread_fpu_begin(new);
+ }
+ }
+ return fpu;
+}

-static inline void __unlazy_fpu(struct task_struct *tsk)
+/*
+ * By the time this gets called, we've already cleared CR0.TS and
+ * given the process the FPU if we are going to preload the FPU
+ * state - all we need to do is to conditionally restore the register
+ * state itself.
+ */
+static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
 {
- if (__thread_has_fpu(tsk)) {
- __save_init_fpu(tsk);
- __thread_fpu_end(tsk);
- } else
- tsk->fpu_counter = 0;
+ if (fpu.preload)
+ __math_state_restore(new);
 }

+/*
+ * Signal frame handlers...
+ */
+extern int save_i387_xstate(void __user *buf);
+extern int restore_i387_xstate(void __user *buf);
+
 static inline void __clear_fpu(struct task_struct *tsk)
 {
 if (__thread_has_fpu(tsk)) {
@@ -474,7 +546,11 @@ static inline void save_init_fpu(struct task_struct *tsk)
 static inline void unlazy_fpu(struct task_struct *tsk)
 {
 preempt_disable();
- __unlazy_fpu(tsk);
+ if (__thread_has_fpu(tsk)) {
+ __save_init_fpu(tsk);
+ __thread_fpu_end(tsk);
+ } else
+ tsk->fpu_counter = 0;
 preempt_enable();
 }

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0cdb4fa..8598296 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -297,10 +297,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 *next = &next_p->thread;
 int cpu = smp_processor_id();
 struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ fpu_switch_t fpu;

 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

- __unlazy_fpu(prev_p);
+ fpu = switch_fpu_prepare(prev_p, next_p);

 /*
 * Reload esp0.
@@ -355,6 +356,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 if (prev->gs | next->gs)
 lazy_load_gs(next->gs);

+ switch_fpu_finish(next_p, fpu);
+
 percpu_write(current_task, next_p);

 return prev_p;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 042b18f..6a364a6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -381,8 +381,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 int cpu = smp_processor_id();
 struct tss_struct *tss = &per_cpu(init_tss, cpu);
 unsigned fsindex, gsindex;
+ fpu_switch_t fpu;

- __unlazy_fpu(prev_p);
+ fpu = switch_fpu_prepare(prev_p, next_p);

 /*
 * Reload esp0, LDT and the page table pointer:
@@ -452,6 +453,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
 prev->gsindex = gsindex;

+ switch_fpu_finish(next_p, fpu);
+
 /*
 * Switch the PDA and FPU contexts.
 */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a99badf..31d9d0f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -562,6 +562,37 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 }

 /*
+ * This gets called with the process already owning the
+ * FPU state, and with CR0.TS cleared. It just needs to
+ * restore the FPU register state.
+ */
+void __math_state_restore(struct task_struct *tsk)
+{
+ /* We need a safe address that is cheap to find and that is already
+ in L1. We've just brought in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
+
+ /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+ is pending. Clear the x87 state here by setting it to fixed
+ values. safe_address is a random variable that should be in L1 */
+ alternative_input(
+ ASM_NOP8 ASM_NOP2,
+ "emms\n\t" /* clear stack tags */
+ "fildl %P[addr]", /* set F?P to defined value */
+ X86_FEATURE_FXSAVE_LEAK,
+ [addr] "m" (safe_address));
+
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(tsk))) {
+ __thread_fpu_end(tsk);
+ force_sig(SIGSEGV, tsk);
+ return;
+ }
+}
+
+/*
 * 'math_state_restore()' saves the current math information in the
 * old math state array, and gets the new ones from the current task
 *
@@ -575,10 +606,6 @@ void math_state_restore(void)
 {
 struct task_struct *tsk = current;

- /* We need a safe address that is cheap to find and that is already
- in L1. We're just bringing in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
 if (!tsk_used_math(tsk)) {
 local_irq_enable();
 /*
@@ -595,25 +622,7 @@ void math_state_restore(void)
 }

 __thread_fpu_begin(tsk);
-
- /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. safe_address is a random variable that should be in L1 */
- alternative_input(
- ASM_NOP8 ASM_NOP2,
- "emms\n\t" /* clear stack tags */
- "fildl %P[addr]", /* set F?P to defined value */
- X86_FEATURE_FXSAVE_LEAK,
- [addr] "m" (safe_address));
-
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
- if (unlikely(restore_fpu_checking(tsk))) {
- __thread_fpu_end(tsk);
- force_sig(SIGSEGV, tsk);
- return;
- }
+ __math_state_restore(tsk);

 tsk->fpu_counter++;
 }
--
1.7.7.4
