diff options
Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0068-ore-Fix-NFS-crash-by-supporting-any-unaligned-RAID-I.patch')
-rw-r--r-- | recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0068-ore-Fix-NFS-crash-by-supporting-any-unaligned-RAID-I.patch | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0068-ore-Fix-NFS-crash-by-supporting-any-unaligned-RAID-I.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0068-ore-Fix-NFS-crash-by-supporting-any-unaligned-RAID-I.patch new file mode 100644 index 00000000..0dabe3b9 --- /dev/null +++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.24/0068-ore-Fix-NFS-crash-by-supporting-any-unaligned-RAID-I.patch | |||
@@ -0,0 +1,211 @@ | |||
1 | From b4c39a3690fd0d723f50eba441fe567e8fee68f1 Mon Sep 17 00:00:00 2001 | ||
2 | From: Boaz Harrosh <bharrosh@panasas.com> | ||
3 | Date: Fri, 8 Jun 2012 01:19:07 +0300 | ||
4 | Subject: [PATCH 068/109] ore: Fix NFS crash by supporting any unaligned RAID | ||
5 | IO | ||
6 | |||
7 | commit 9ff19309a9623f2963ac5a136782ea4d8b5d67fb upstream. | ||
8 | |||
9 | In RAID_5/6 we used to not permit an IO whose end | ||
10 | byte is not stripe_size aligned and spans more than one stripe, | ||
11 | i.e. the caller must check if after submission the actual | ||
12 | transferred bytes is shorter, and would need to resubmit | ||
13 | a new IO with the remainder. | ||
14 | |||
15 | Exofs supports this, and NFS was supposed to support this | ||
16 | as well with its short write mechanism. But late testing has | ||
17 | exposed a CRASH when this is used with non-RPC layout-drivers. | ||
18 | |||
19 | The change at NFS is deep and risky; in its place the fix | ||
20 | at ORE to lift the limitation is actually clean and simple. | ||
21 | So here it is below. | ||
22 | |||
23 | The principle here is that in the case of unaligned IO on | ||
24 | both ends, beginning and end, we will send two read requests: | ||
25 | one like old code, before the calculation of the first stripe, | ||
26 | and also a new site, before the calculation of the last stripe. | ||
27 | If any "boundary" is aligned or the complete IO is within a single | ||
28 | stripe, we do a single read like before. | ||
29 | |||
30 | The code is clean and simple by splitting the old _read_4_write | ||
31 | into 3 even parts: | ||
32 | 1. _read_4_write_first_stripe | ||
33 | 2. _read_4_write_last_stripe | ||
34 | 3. _read_4_write_execute | ||
35 | |||
36 | And calling 1+3 at the same place as before. 2+3 before last | ||
37 | stripe, and in the case of all in a single stripe then 1+2+3 | ||
38 | is performed additively. | ||
39 | |||
40 | Why did I not think of it before? Well, I had a stroke of | ||
41 | genius because I have stared at this code for 2 years, and did | ||
42 | not find this simple solution, until today. Not that I did not try. | ||
43 | |||
44 | This solution is much better for NFS than the previous supposed | ||
45 | solution because the short write was dealt with out-of-band after | ||
46 | IO_done, which would cause a seeky IO pattern, whereas here | ||
47 | we execute in order. In both solutions we do 2 separate reads, only | ||
48 | here we do it within a single IO request. (And actually combine two | ||
49 | writes into a single submission) | ||
50 | |||
51 | NFS/exofs code need not change since the ORE API communicates the new | ||
52 | shorter length on return, what will happen is that this case would not | ||
53 | occur anymore. | ||
54 | |||
55 | hurray!! | ||
56 | |||
57 | [Stable: this is an NFS bug since the 3.2 Kernel; should apply cleanly] | ||
58 | Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> | ||
59 | Signed-off-by: Ben Hutchings <ben@decadent.org.uk> | ||
60 | --- | ||
61 | fs/exofs/ore_raid.c | 67 +++++++++++++++++++++++++++----------------------- | ||
62 | 1 files changed, 36 insertions(+), 31 deletions(-) | ||
63 | |||
64 | diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c | ||
65 | index d222c77..fff2070 100644 | ||
66 | --- a/fs/exofs/ore_raid.c | ||
67 | +++ b/fs/exofs/ore_raid.c | ||
68 | @@ -461,16 +461,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | ||
69 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | ||
70 | * allocated/freed and don't go through cache | ||
71 | */ | ||
72 | -static int _read_4_write(struct ore_io_state *ios) | ||
73 | +static int _read_4_write_first_stripe(struct ore_io_state *ios) | ||
74 | { | ||
75 | - struct ore_io_state *ios_read; | ||
76 | struct ore_striping_info read_si; | ||
77 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
78 | u64 offset = ios->si.first_stripe_start; | ||
79 | - u64 last_stripe_end; | ||
80 | - unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | ||
81 | - unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
82 | - int ret; | ||
83 | + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
84 | |||
85 | if (offset == ios->offset) /* Go to start collect $200 */ | ||
86 | goto read_last_stripe; | ||
87 | @@ -478,6 +474,9 @@ static int _read_4_write(struct ore_io_state *ios) | ||
88 | min_p = _sp2d_min_pg(sp2d); | ||
89 | max_p = _sp2d_max_pg(sp2d); | ||
90 | |||
91 | + ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", | ||
92 | + offset, ios->offset, min_p, max_p); | ||
93 | + | ||
94 | for (c = 0; ; c++) { | ||
95 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
96 | read_si.obj_offset += min_p * PAGE_SIZE; | ||
97 | @@ -512,6 +511,18 @@ static int _read_4_write(struct ore_io_state *ios) | ||
98 | } | ||
99 | |||
100 | read_last_stripe: | ||
101 | + return 0; | ||
102 | +} | ||
103 | + | ||
104 | +static int _read_4_write_last_stripe(struct ore_io_state *ios) | ||
105 | +{ | ||
106 | + struct ore_striping_info read_si; | ||
107 | + struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
108 | + u64 offset; | ||
109 | + u64 last_stripe_end; | ||
110 | + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | ||
111 | + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
112 | + | ||
113 | offset = ios->offset + ios->length; | ||
114 | if (offset % PAGE_SIZE) | ||
115 | _add_to_r4w_last_page(ios, &offset); | ||
116 | @@ -527,15 +538,15 @@ read_last_stripe: | ||
117 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
118 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | ||
119 | |||
120 | - BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); | ||
121 | - /* unaligned IO must be within a single stripe */ | ||
122 | - | ||
123 | if (min_p == sp2d->pages_in_unit) { | ||
124 | /* Didn't do it yet */ | ||
125 | min_p = _sp2d_min_pg(sp2d); | ||
126 | max_p = _sp2d_max_pg(sp2d); | ||
127 | } | ||
128 | |||
129 | + ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", | ||
130 | + offset, last_stripe_end, min_p, max_p); | ||
131 | + | ||
132 | while (offset < last_stripe_end) { | ||
133 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
134 | |||
135 | @@ -568,6 +579,15 @@ read_last_stripe: | ||
136 | } | ||
137 | |||
138 | read_it: | ||
139 | + return 0; | ||
140 | +} | ||
141 | + | ||
142 | +static int _read_4_write_execute(struct ore_io_state *ios) | ||
143 | +{ | ||
144 | + struct ore_io_state *ios_read; | ||
145 | + unsigned i; | ||
146 | + int ret; | ||
147 | + | ||
148 | ios_read = ios->ios_read_4_write; | ||
149 | if (!ios_read) | ||
150 | return 0; | ||
151 | @@ -591,6 +611,8 @@ read_it: | ||
152 | } | ||
153 | |||
154 | _mark_read4write_pages_uptodate(ios_read, ret); | ||
155 | + ore_put_io_state(ios_read); | ||
156 | + ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | @@ -626,8 +648,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | ||
161 | /* If first stripe, Read in all read4write pages | ||
162 | * (if needed) before we calculate the first parity. | ||
163 | */ | ||
164 | - _read_4_write(ios); | ||
165 | + _read_4_write_first_stripe(ios); | ||
166 | } | ||
167 | + if (!cur_len) /* If last stripe r4w pages of last stripe */ | ||
168 | + _read_4_write_last_stripe(ios); | ||
169 | + _read_4_write_execute(ios); | ||
170 | |||
171 | for (i = 0; i < num_pages; i++) { | ||
172 | pages[i] = _raid_page_alloc(); | ||
173 | @@ -654,34 +679,14 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | ||
174 | |||
175 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | ||
176 | { | ||
177 | - struct ore_layout *layout = ios->layout; | ||
178 | - | ||
179 | if (ios->parity_pages) { | ||
180 | + struct ore_layout *layout = ios->layout; | ||
181 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | ||
182 | - unsigned stripe_size = ios->si.bytes_in_stripe; | ||
183 | - u64 last_stripe, first_stripe; | ||
184 | |||
185 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | ||
186 | layout->parity, &ios->sp2d)) { | ||
187 | return -ENOMEM; | ||
188 | } | ||
189 | - | ||
190 | - /* Round io down to last full strip */ | ||
191 | - first_stripe = div_u64(ios->offset, stripe_size); | ||
192 | - last_stripe = div_u64(ios->offset + ios->length, stripe_size); | ||
193 | - | ||
194 | - /* If an IO spans more then a single stripe it must end at | ||
195 | - * a stripe boundary. The reminder at the end is pushed into the | ||
196 | - * next IO. | ||
197 | - */ | ||
198 | - if (last_stripe != first_stripe) { | ||
199 | - ios->length = last_stripe * stripe_size - ios->offset; | ||
200 | - | ||
201 | - BUG_ON(!ios->length); | ||
202 | - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / | ||
203 | - PAGE_SIZE; | ||
204 | - ios->si.length = ios->length; /*make it consistent */ | ||
205 | - } | ||
206 | } | ||
207 | return 0; | ||
208 | } | ||
209 | -- | ||
210 | 1.7.7.6 | ||
211 | |||