diff options
Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.22/0007-mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_pop.patch')
-rw-r--r-- | recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.22/0007-mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_pop.patch | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.22/0007-mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_pop.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.22/0007-mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_pop.patch new file mode 100644 index 00000000..9cac18dd --- /dev/null +++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.22/0007-mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_pop.patch | |||
@@ -0,0 +1,218 @@ | |||
1 | From 8bf1709b6925d4e05120bdfed73992d50e7f11bf Mon Sep 17 00:00:00 2001 | ||
2 | From: Andrea Arcangeli <aarcange@redhat.com> | ||
3 | Date: Tue, 29 May 2012 15:06:49 -0700 | ||
4 | Subject: [PATCH 07/46] mm: pmd_read_atomic: fix 32bit PAE pmd walk vs | ||
5 | pmd_populate SMP race condition | ||
6 | |||
7 | commit 26c191788f18129af0eb32a358cdaea0c7479626 upstream. | ||
8 | |||
9 | When holding the mmap_sem for reading, pmd_offset_map_lock should only | ||
10 | run on a pmd_t that has been read atomically from the pmdp pointer, | ||
11 | otherwise we may read only half of it leading to this crash. | ||
12 | |||
13 | PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic" | ||
14 | #0 [f06a9dd8] crash_kexec at c049b5ec | ||
15 | #1 [f06a9e2c] oops_end at c083d1c2 | ||
16 | #2 [f06a9e40] no_context at c0433ded | ||
17 | #3 [f06a9e64] bad_area_nosemaphore at c043401a | ||
18 | #4 [f06a9e6c] __do_page_fault at c0434493 | ||
19 | #5 [f06a9eec] do_page_fault at c083eb45 | ||
20 | #6 [f06a9f04] error_code (via page_fault) at c083c5d5 | ||
21 | EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP: | ||
22 | 00000000 | ||
23 | DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0 | ||
24 | CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246 | ||
25 | #7 [f06a9f38] _spin_lock at c083bc14 | ||
26 | #8 [f06a9f44] sys_mincore at c0507b7d | ||
27 | #9 [f06a9fb0] system_call at c083becd | ||
28 | start len | ||
29 | EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f | ||
30 | DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00 | ||
31 | SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033 | ||
32 | CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286 | ||
33 | |||
34 | This should be a longstanding bug affecting x86 32bit PAE without THP. | ||
35 | Only archs with 64bit large pmd_t and 32bit unsigned long should be | ||
36 | affected. | ||
37 | |||
38 | With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad() | ||
39 | would partly hide the bug when the pmd transition from none to stable, | ||
40 | by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is | ||
41 | enabled a new set of problems arises because the pmd could then transition | ||
42 | freely between any of the none, pmd_trans_huge or pmd_trans_stable states. | ||
43 | So making the barrier in pmd_none_or_trans_huge_or_clear_bad() | ||
44 | unconditional isn't a good idea and it would be a flaky solution. | ||
45 | |||
46 | This should be fully fixed by introducing a pmd_read_atomic that reads | ||
47 | the pmd in order with THP disabled, or by reading the pmd atomically | ||
48 | with cmpxchg8b with THP enabled. | ||
49 | |||
50 | Luckily this new race condition only triggers in the places that must | ||
51 | already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix | ||
52 | is localized there but this bug is not related to THP. | ||
53 | |||
54 | NOTE: this can trigger on x86 32bit systems with PAE enabled with more | ||
55 | than 4G of ram, otherwise the high part of the pmd will never risk being | ||
56 | truncated because it would be zero at all times, thereby hiding the | ||
57 | SMP race. | ||
58 | |||
59 | This bug was discovered and fully debugged by Ulrich, quote: | ||
60 | |||
61 | ---- | ||
62 | [..] | ||
63 | pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and | ||
64 | eax. | ||
65 | |||
66 | 496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t | ||
67 | *pmd) | ||
68 | 497 { | ||
69 | 498 /* depend on compiler for an atomic pmd read */ | ||
70 | 499 pmd_t pmdval = *pmd; | ||
71 | |||
72 | // edi = pmd pointer | ||
73 | 0xc0507a74 <sys_mincore+548>: mov 0x8(%esp),%edi | ||
74 | ... | ||
75 | // edx = PTE page table high address | ||
76 | 0xc0507a84 <sys_mincore+564>: mov 0x4(%edi),%edx | ||
77 | ... | ||
78 | // eax = PTE page table low address | ||
79 | 0xc0507a8e <sys_mincore+574>: mov (%edi),%eax | ||
80 | |||
81 | [..] | ||
82 | |||
83 | Please note that the PMD is not read atomically. These are two "mov" | ||
84 | instructions where the high order bits of the PMD entry are fetched | ||
85 | first. Hence, the above machine code is prone to the following race. | ||
86 | |||
87 | - The PMD entry {high|low} is 0x0000000000000000. | ||
88 | The "mov" at 0xc0507a84 loads 0x00000000 into edx. | ||
89 | |||
90 | - A page fault (on another CPU) sneaks in between the two "mov" | ||
91 | instructions and instantiates the PMD. | ||
92 | |||
93 | - The PMD entry {high|low} is now 0x00000003fda38067. | ||
94 | The "mov" at 0xc0507a8e loads 0xfda38067 into eax. | ||
95 | ---- | ||
96 | |||
97 | Reported-by: Ulrich Obergfell <uobergfe@redhat.com> | ||
98 | Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> | ||
99 | Cc: Mel Gorman <mgorman@suse.de> | ||
100 | Cc: Hugh Dickins <hughd@google.com> | ||
101 | Cc: Larry Woodman <lwoodman@redhat.com> | ||
102 | Cc: Petr Matousek <pmatouse@redhat.com> | ||
103 | Cc: Rik van Riel <riel@redhat.com> | ||
104 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | ||
105 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | ||
106 | Signed-off-by: Ben Hutchings <ben@decadent.org.uk> | ||
107 | --- | ||
108 | arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++ | ||
109 | include/asm-generic/pgtable.h | 22 +++++++++++++-- | ||
110 | 2 files changed, 70 insertions(+), 2 deletions(-) | ||
111 | |||
112 | diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h | ||
113 | index effff47..43876f1 100644 | ||
114 | --- a/arch/x86/include/asm/pgtable-3level.h | ||
115 | +++ b/arch/x86/include/asm/pgtable-3level.h | ||
116 | @@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) | ||
117 | ptep->pte_low = pte.pte_low; | ||
118 | } | ||
119 | |||
120 | +#define pmd_read_atomic pmd_read_atomic | ||
121 | +/* | ||
122 | + * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with | ||
123 | + * a "*pmdp" dereference done by gcc. Problem is, in certain places | ||
124 | + * where pte_offset_map_lock is called, concurrent page faults are | ||
125 | + * allowed, if the mmap_sem is hold for reading. An example is mincore | ||
126 | + * vs page faults vs MADV_DONTNEED. On the page fault side | ||
127 | + * pmd_populate rightfully does a set_64bit, but if we're reading the | ||
128 | + * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen | ||
129 | + * because gcc will not read the 64bit of the pmd atomically. To fix | ||
130 | + * this all places running pmd_offset_map_lock() while holding the | ||
131 | + * mmap_sem in read mode, shall read the pmdp pointer using this | ||
132 | + * function to know if the pmd is null nor not, and in turn to know if | ||
133 | + * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd | ||
134 | + * operations. | ||
135 | + * | ||
136 | + * Without THP if the mmap_sem is hold for reading, the | ||
137 | + * pmd can only transition from null to not null while pmd_read_atomic runs. | ||
138 | + * So there's no need of literally reading it atomically. | ||
139 | + * | ||
140 | + * With THP if the mmap_sem is hold for reading, the pmd can become | ||
141 | + * THP or null or point to a pte (and in turn become "stable") at any | ||
142 | + * time under pmd_read_atomic, so it's mandatory to read it atomically | ||
143 | + * with cmpxchg8b. | ||
144 | + */ | ||
145 | +#ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
146 | +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) | ||
147 | +{ | ||
148 | + pmdval_t ret; | ||
149 | + u32 *tmp = (u32 *)pmdp; | ||
150 | + | ||
151 | + ret = (pmdval_t) (*tmp); | ||
152 | + if (ret) { | ||
153 | + /* | ||
154 | + * If the low part is null, we must not read the high part | ||
155 | + * or we can end up with a partial pmd. | ||
156 | + */ | ||
157 | + smp_rmb(); | ||
158 | + ret |= ((pmdval_t)*(tmp + 1)) << 32; | ||
159 | + } | ||
160 | + | ||
161 | + return (pmd_t) { ret }; | ||
162 | +} | ||
163 | +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
164 | +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) | ||
165 | +{ | ||
166 | + return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; | ||
167 | +} | ||
168 | +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
169 | + | ||
170 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
171 | { | ||
172 | set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); | ||
173 | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h | ||
174 | index a03c098..831924a 100644 | ||
175 | --- a/include/asm-generic/pgtable.h | ||
176 | +++ b/include/asm-generic/pgtable.h | ||
177 | @@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd) | ||
178 | #endif /* __HAVE_ARCH_PMD_WRITE */ | ||
179 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
180 | |||
181 | +#ifndef pmd_read_atomic | ||
182 | +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) | ||
183 | +{ | ||
184 | + /* | ||
185 | + * Depend on compiler for an atomic pmd read. NOTE: this is | ||
186 | + * only going to work, if the pmdval_t isn't larger than | ||
187 | + * an unsigned long. | ||
188 | + */ | ||
189 | + return *pmdp; | ||
190 | +} | ||
191 | +#endif | ||
192 | + | ||
193 | /* | ||
194 | * This function is meant to be used by sites walking pagetables with | ||
195 | * the mmap_sem hold in read mode to protect against MADV_DONTNEED and | ||
196 | @@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd) | ||
197 | * undefined so behaving like if the pmd was none is safe (because it | ||
198 | * can return none anyway). The compiler level barrier() is critically | ||
199 | * important to compute the two checks atomically on the same pmdval. | ||
200 | + * | ||
201 | + * For 32bit kernels with a 64bit large pmd_t this automatically takes | ||
202 | + * care of reading the pmd atomically to avoid SMP race conditions | ||
203 | + * against pmd_populate() when the mmap_sem is hold for reading by the | ||
204 | + * caller (a special atomic read not done by "gcc" as in the generic | ||
205 | + * version above, is also needed when THP is disabled because the page | ||
206 | + * fault can populate the pmd from under us). | ||
207 | */ | ||
208 | static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) | ||
209 | { | ||
210 | - /* depend on compiler for an atomic pmd read */ | ||
211 | - pmd_t pmdval = *pmd; | ||
212 | + pmd_t pmdval = pmd_read_atomic(pmd); | ||
213 | /* | ||
214 | * The barrier will stabilize the pmdval in a register or on | ||
215 | * the stack so that it will stop changing under the code. | ||
216 | -- | ||
217 | 1.7.10 | ||
218 | |||