diff options
Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch')
-rw-r--r-- | recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch | 353 |
1 files changed, 353 insertions, 0 deletions
diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch
new file mode 100644
index 00000000..95d4a73e
--- /dev/null
+++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.8/0010-i387-re-introduce-FPU-state-preloading-at-context-sw.patch
@@ -0,0 +1,353 @@ | |||
1 | From 9016ec427136d5b5d025948319cf1114dc7734e4 Mon Sep 17 00:00:00 2001 | ||
2 | From: Linus Torvalds <torvalds@linux-foundation.org> | ||
3 | Date: Sat, 18 Feb 2012 12:56:35 -0800 | ||
4 | Subject: [PATCH 10/11] i387: re-introduce FPU state preloading at context | ||
5 | switch time | ||
6 | |||
7 | commit 34ddc81a230b15c0e345b6b253049db731499f7e upstream. | ||
8 | |||
9 | After all the FPU state cleanups and finally finding the problem that | ||
10 | caused all our FPU save/restore problems, this re-introduces the | ||
11 | preloading of FPU state that was removed in commit b3b0870ef3ff ("i387: | ||
12 | do not preload FPU state at task switch time"). | ||
13 | |||
14 | However, instead of simply reverting the removal, this reimplements | ||
15 | preloading with several fixes, most notably | ||
16 | |||
17 | - properly abstracted as a true FPU state switch, rather than as | ||
18 | open-coded save and restore with various hacks. | ||
19 | |||
20 | In particular, implementing it as a proper FPU state switch allows us | ||
21 | to optimize the CR0.TS flag accesses: there is no reason to set the | ||
22 | TS bit only to then almost immediately clear it again. CR0 accesses | ||
23 | are quite slow and expensive, so don't flip the bit back and forth for | ||
24 | no good reason. | ||
25 | |||
26 | - Make sure that the same model works for both x86-32 and x86-64, so | ||
27 | that there are no gratuitous differences between the two due to the | ||
28 | way they save and restore segment state differently due to | ||
29 | architectural differences that really don't matter to the FPU state. | ||
30 | |||
31 | - Avoid exposing the "preload" state to the context switch routines, | ||
32 | and in particular allow the concept of lazy state restore: if nothing | ||
33 | else has used the FPU in the meantime, and the process is still on | ||
34 | the same CPU, we can avoid restoring state from memory entirely, just | ||
35 | re-expose the state that is still in the FPU unit. | ||
36 | |||
37 | That optimized lazy restore isn't actually implemented here, but the | ||
38 | infrastructure is set up for it. Of course, older CPUs that use | ||
39 | 'fnsave' to save the state cannot take advantage of this, since the | ||
40 | state saving also trashes the state. | ||
41 | |||
42 | In other words, there is now an actual _design_ to the FPU state saving, | ||
43 | rather than just random historical baggage. Hopefully it's easier to | ||
44 | follow as a result. | ||
45 | |||
46 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | ||
47 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | ||
48 | --- | ||
49 | arch/x86/include/asm/i387.h | 110 +++++++++++++++++++++++++++++++++++------- | ||
50 | arch/x86/kernel/process_32.c | 5 ++- | ||
51 | arch/x86/kernel/process_64.c | 5 ++- | ||
52 | arch/x86/kernel/traps.c | 55 ++++++++++++--------- | ||
53 | 4 files changed, 133 insertions(+), 42 deletions(-) | ||
54 | |||
55 | diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h | ||
56 | index f537667..a850b4d 100644 | ||
57 | --- a/arch/x86/include/asm/i387.h | ||
58 | +++ b/arch/x86/include/asm/i387.h | ||
59 | @@ -29,6 +29,7 @@ extern unsigned int sig_xstate_size; | ||
60 | extern void fpu_init(void); | ||
61 | extern void mxcsr_feature_mask_init(void); | ||
62 | extern int init_fpu(struct task_struct *child); | ||
63 | +extern void __math_state_restore(struct task_struct *); | ||
64 | extern void math_state_restore(void); | ||
65 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); | ||
66 | |||
67 | @@ -212,9 +213,10 @@ static inline void fpu_fxsave(struct fpu *fpu) | ||
68 | #endif /* CONFIG_X86_64 */ | ||
69 | |||
70 | /* | ||
71 | - * These must be called with preempt disabled | ||
72 | + * These must be called with preempt disabled. Returns | ||
73 | + * 'true' if the FPU state is still intact. | ||
74 | */ | ||
75 | -static inline void fpu_save_init(struct fpu *fpu) | ||
76 | +static inline int fpu_save_init(struct fpu *fpu) | ||
77 | { | ||
78 | if (use_xsave()) { | ||
79 | fpu_xsave(fpu); | ||
80 | @@ -223,22 +225,33 @@ static inline void fpu_save_init(struct fpu *fpu) | ||
81 | * xsave header may indicate the init state of the FP. | ||
82 | */ | ||
83 | if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) | ||
84 | - return; | ||
85 | + return 1; | ||
86 | } else if (use_fxsr()) { | ||
87 | fpu_fxsave(fpu); | ||
88 | } else { | ||
89 | asm volatile("fnsave %[fx]; fwait" | ||
90 | : [fx] "=m" (fpu->state->fsave)); | ||
91 | - return; | ||
92 | + return 0; | ||
93 | } | ||
94 | |||
95 | - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) | ||
96 | + /* | ||
97 | + * If exceptions are pending, we need to clear them so | ||
98 | + * that we don't randomly get exceptions later. | ||
99 | + * | ||
100 | + * FIXME! Is this perhaps only true for the old-style | ||
101 | + * irq13 case? Maybe we could leave the x87 state | ||
102 | + * intact otherwise? | ||
103 | + */ | ||
104 | + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { | ||
105 | asm volatile("fnclex"); | ||
106 | + return 0; | ||
107 | + } | ||
108 | + return 1; | ||
109 | } | ||
110 | |||
111 | -static inline void __save_init_fpu(struct task_struct *tsk) | ||
112 | +static inline int __save_init_fpu(struct task_struct *tsk) | ||
113 | { | ||
114 | - fpu_save_init(&tsk->thread.fpu); | ||
115 | + return fpu_save_init(&tsk->thread.fpu); | ||
116 | } | ||
117 | |||
118 | static inline int fpu_fxrstor_checking(struct fpu *fpu) | ||
119 | @@ -301,20 +314,79 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | - * Signal frame handlers... | ||
124 | + * FPU state switching for scheduling. | ||
125 | + * | ||
126 | + * This is a two-stage process: | ||
127 | + * | ||
128 | + * - switch_fpu_prepare() saves the old state and | ||
129 | + * sets the new state of the CR0.TS bit. This is | ||
130 | + * done within the context of the old process. | ||
131 | + * | ||
132 | + * - switch_fpu_finish() restores the new state as | ||
133 | + * necessary. | ||
134 | */ | ||
135 | -extern int save_i387_xstate(void __user *buf); | ||
136 | -extern int restore_i387_xstate(void __user *buf); | ||
137 | +typedef struct { int preload; } fpu_switch_t; | ||
138 | + | ||
139 | +/* | ||
140 | + * FIXME! We could do a totally lazy restore, but we need to | ||
141 | + * add a per-cpu "this was the task that last touched the FPU | ||
142 | + * on this CPU" variable, and the task needs to have a "I last | ||
143 | + * touched the FPU on this CPU" and check them. | ||
144 | + * | ||
145 | + * We don't do that yet, so "fpu_lazy_restore()" always returns | ||
146 | + * false, but some day.. | ||
147 | + */ | ||
148 | +#define fpu_lazy_restore(tsk) (0) | ||
149 | +#define fpu_lazy_state_intact(tsk) do { } while (0) | ||
150 | + | ||
151 | +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) | ||
152 | +{ | ||
153 | + fpu_switch_t fpu; | ||
154 | + | ||
155 | + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; | ||
156 | + if (__thread_has_fpu(old)) { | ||
157 | + if (__save_init_fpu(old)) | ||
158 | + fpu_lazy_state_intact(old); | ||
159 | + __thread_clear_has_fpu(old); | ||
160 | + old->fpu_counter++; | ||
161 | + | ||
162 | + /* Don't change CR0.TS if we just switch! */ | ||
163 | + if (fpu.preload) { | ||
164 | + __thread_set_has_fpu(new); | ||
165 | + prefetch(new->thread.fpu.state); | ||
166 | + } else | ||
167 | + stts(); | ||
168 | + } else { | ||
169 | + old->fpu_counter = 0; | ||
170 | + if (fpu.preload) { | ||
171 | + if (fpu_lazy_restore(new)) | ||
172 | + fpu.preload = 0; | ||
173 | + else | ||
174 | + prefetch(new->thread.fpu.state); | ||
175 | + __thread_fpu_begin(new); | ||
176 | + } | ||
177 | + } | ||
178 | + return fpu; | ||
179 | +} | ||
180 | |||
181 | -static inline void __unlazy_fpu(struct task_struct *tsk) | ||
182 | +/* | ||
183 | + * By the time this gets called, we've already cleared CR0.TS and | ||
184 | + * given the process the FPU if we are going to preload the FPU | ||
185 | + * state - all we need to do is to conditionally restore the register | ||
186 | + * state itself. | ||
187 | + */ | ||
188 | +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) | ||
189 | { | ||
190 | - if (__thread_has_fpu(tsk)) { | ||
191 | - __save_init_fpu(tsk); | ||
192 | - __thread_fpu_end(tsk); | ||
193 | - } else | ||
194 | - tsk->fpu_counter = 0; | ||
195 | + if (fpu.preload) | ||
196 | + __math_state_restore(new); | ||
197 | } | ||
198 | |||
199 | +/* | ||
200 | + * Signal frame handlers... | ||
201 | + */ | ||
202 | +extern int save_i387_xstate(void __user *buf); | ||
203 | +extern int restore_i387_xstate(void __user *buf); | ||
204 | + | ||
205 | static inline void __clear_fpu(struct task_struct *tsk) | ||
206 | { | ||
207 | if (__thread_has_fpu(tsk)) { | ||
208 | @@ -474,7 +546,11 @@ static inline void save_init_fpu(struct task_struct *tsk) | ||
209 | static inline void unlazy_fpu(struct task_struct *tsk) | ||
210 | { | ||
211 | preempt_disable(); | ||
212 | - __unlazy_fpu(tsk); | ||
213 | + if (__thread_has_fpu(tsk)) { | ||
214 | + __save_init_fpu(tsk); | ||
215 | + __thread_fpu_end(tsk); | ||
216 | + } else | ||
217 | + tsk->fpu_counter = 0; | ||
218 | preempt_enable(); | ||
219 | } | ||
220 | |||
221 | diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c | ||
222 | index 0cdb4fa..8598296 100644 | ||
223 | --- a/arch/x86/kernel/process_32.c | ||
224 | +++ b/arch/x86/kernel/process_32.c | ||
225 | @@ -297,10 +297,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
226 | *next = &next_p->thread; | ||
227 | int cpu = smp_processor_id(); | ||
228 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
229 | + fpu_switch_t fpu; | ||
230 | |||
231 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | ||
232 | |||
233 | - __unlazy_fpu(prev_p); | ||
234 | + fpu = switch_fpu_prepare(prev_p, next_p); | ||
235 | |||
236 | /* | ||
237 | * Reload esp0. | ||
238 | @@ -355,6 +356,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
239 | if (prev->gs | next->gs) | ||
240 | lazy_load_gs(next->gs); | ||
241 | |||
242 | + switch_fpu_finish(next_p, fpu); | ||
243 | + | ||
244 | percpu_write(current_task, next_p); | ||
245 | |||
246 | return prev_p; | ||
247 | diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c | ||
248 | index 042b18f..6a364a6 100644 | ||
249 | --- a/arch/x86/kernel/process_64.c | ||
250 | +++ b/arch/x86/kernel/process_64.c | ||
251 | @@ -381,8 +381,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
252 | int cpu = smp_processor_id(); | ||
253 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
254 | unsigned fsindex, gsindex; | ||
255 | + fpu_switch_t fpu; | ||
256 | |||
257 | - __unlazy_fpu(prev_p); | ||
258 | + fpu = switch_fpu_prepare(prev_p, next_p); | ||
259 | |||
260 | /* | ||
261 | * Reload esp0, LDT and the page table pointer: | ||
262 | @@ -452,6 +453,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
263 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
264 | prev->gsindex = gsindex; | ||
265 | |||
266 | + switch_fpu_finish(next_p, fpu); | ||
267 | + | ||
268 | /* | ||
269 | * Switch the PDA and FPU contexts. | ||
270 | */ | ||
271 | diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c | ||
272 | index a99badf..31d9d0f 100644 | ||
273 | --- a/arch/x86/kernel/traps.c | ||
274 | +++ b/arch/x86/kernel/traps.c | ||
275 | @@ -562,6 +562,37 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | + * This gets called with the process already owning the | ||
280 | + * FPU state, and with CR0.TS cleared. It just needs to | ||
281 | + * restore the FPU register state. | ||
282 | + */ | ||
283 | +void __math_state_restore(struct task_struct *tsk) | ||
284 | +{ | ||
285 | + /* We need a safe address that is cheap to find and that is already | ||
286 | + in L1. We've just brought in "tsk->thread.has_fpu", so use that */ | ||
287 | +#define safe_address (tsk->thread.has_fpu) | ||
288 | + | ||
289 | + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception | ||
290 | + is pending. Clear the x87 state here by setting it to fixed | ||
291 | + values. safe_address is a random variable that should be in L1 */ | ||
292 | + alternative_input( | ||
293 | + ASM_NOP8 ASM_NOP2, | ||
294 | + "emms\n\t" /* clear stack tags */ | ||
295 | + "fildl %P[addr]", /* set F?P to defined value */ | ||
296 | + X86_FEATURE_FXSAVE_LEAK, | ||
297 | + [addr] "m" (safe_address)); | ||
298 | + | ||
299 | + /* | ||
300 | + * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
301 | + */ | ||
302 | + if (unlikely(restore_fpu_checking(tsk))) { | ||
303 | + __thread_fpu_end(tsk); | ||
304 | + force_sig(SIGSEGV, tsk); | ||
305 | + return; | ||
306 | + } | ||
307 | +} | ||
308 | + | ||
309 | +/* | ||
310 | * 'math_state_restore()' saves the current math information in the | ||
311 | * old math state array, and gets the new ones from the current task | ||
312 | * | ||
313 | @@ -575,10 +606,6 @@ void math_state_restore(void) | ||
314 | { | ||
315 | struct task_struct *tsk = current; | ||
316 | |||
317 | - /* We need a safe address that is cheap to find and that is already | ||
318 | - in L1. We're just bringing in "tsk->thread.has_fpu", so use that */ | ||
319 | -#define safe_address (tsk->thread.has_fpu) | ||
320 | - | ||
321 | if (!tsk_used_math(tsk)) { | ||
322 | local_irq_enable(); | ||
323 | /* | ||
324 | @@ -595,25 +622,7 @@ void math_state_restore(void) | ||
325 | } | ||
326 | |||
327 | __thread_fpu_begin(tsk); | ||
328 | - | ||
329 | - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception | ||
330 | - is pending. Clear the x87 state here by setting it to fixed | ||
331 | - values. safe_address is a random variable that should be in L1 */ | ||
332 | - alternative_input( | ||
333 | - ASM_NOP8 ASM_NOP2, | ||
334 | - "emms\n\t" /* clear stack tags */ | ||
335 | - "fildl %P[addr]", /* set F?P to defined value */ | ||
336 | - X86_FEATURE_FXSAVE_LEAK, | ||
337 | - [addr] "m" (safe_address)); | ||
338 | - | ||
339 | - /* | ||
340 | - * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
341 | - */ | ||
342 | - if (unlikely(restore_fpu_checking(tsk))) { | ||
343 | - __thread_fpu_end(tsk); | ||
344 | - force_sig(SIGSEGV, tsk); | ||
345 | - return; | ||
346 | - } | ||
347 | + __math_state_restore(tsk); | ||
348 | |||
349 | tsk->fpu_counter++; | ||
350 | } | ||
351 | -- | ||
352 | 1.7.7.4 | ||
353 | |||