Line data Source code
1 : /*
2 : * Copyright (C) 2013 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : */
14 :
15 : #include "internal.h"
16 : #include "gf.h"
17 :
18 : /*
19 : * This is a RAID implementation working in the Galois Field GF(2^8) with
20 : * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), and
21 : * supporting up to six parity levels.
22 : *
23 : * For RAID5 and RAID6 it works as as described in the H. Peter Anvin's
24 : * paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
25 : * complete explanation.
26 : *
27 : * To support triple parity, it was first evaluated and then dropped, an
28 : * extension of the same approach, with additional parity coefficients set
29 : * as powers of 2^-1, with equations:
30 : *
31 : * P = sum(Di)
32 : * Q = sum(2^i * Di)
33 : * R = sum(2^-i * Di) with 0<=i<N
34 : *
35 : * This approach works well for triple parity and it's very efficient,
36 : * because we can implement very fast parallel multiplications and
37 : * divisions by 2 in GF(2^8).
38 : *
39 : * It's also similar at the approach used by ZFS RAIDZ3, with the
40 : * difference that ZFS uses powers of 4 instead of 2^-1.
41 : *
42 : * Unfortunately it doesn't work beyond triple parity, because whatever
43 : * value we choose to generate the power coefficients to compute other
44 : * parities, the resulting equations are not solvable for some
45 : * combinations of missing disks.
46 : *
47 : * This is expected, because the Vandermonde matrix used to compute the
48 : * parity has no guarantee to have all submatrices not singular
49 : * [2, Chap 11, Problem 7] and this is a requirement to have
50 : * a MDS (Maximum Distance Separable) code [2, Chap 11, Theorem 8].
51 : *
52 : * To overcome this limitation, we use a Cauchy matrix [3][4] to compute
53 : * the parity. A Cauchy matrix has the property to have all the square
54 : * submatrices not singular, resulting in always solvable equations,
55 : * for any combination of missing disks.
56 : *
57 : * The problem of this approach is that it requires the use of
58 : * generic multiplications, and not only by 2 or 2^-1, potentially
59 : * affecting badly the performance.
60 : *
61 : * Hopefully there is a method to implement parallel multiplications
62 : * using SSSE3 or AVX2 instructions [1][5]. Method competitive with the
63 : * computation of triple parity using power coefficients.
64 : *
65 : * Another important property of the Cauchy matrix is that we can setup
66 : * the first two rows with coeffients equal at the RAID5 and RAID6 approach
67 : * decribed, resulting in a compatible extension, and requiring SSSE3
68 : * or AVX2 instructions only if triple parity or beyond is used.
69 : *
70 : * The matrix is also adjusted, multipling each row by a constant factor
71 : * to make the first column of all 1, to optimize the computation for
72 : * the first disk.
73 : *
74 : * This results in the matrix A[row,col] defined as:
75 : *
76 : * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
77 : * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
78 : * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
79 : * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
80 : * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
81 : * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
82 : *
83 : * This matrix supports 6 level of parity, one for each row, for up to 251
84 : * data disks, one for each column, with all the 377,342,351,231 square
85 : * submatrices not singular, verified also with brute-force.
86 : *
87 : * This matrix can be extended to support any number of parities, just
88 : * adding additional rows, and removing one column for each new row.
89 : * (see mktables.c for more details in how the matrix is generated)
90 : *
91 : * In details, parity is computed as:
92 : *
93 : * P = sum(Di)
94 : * Q = sum(2^i * Di)
95 : * R = sum(A[2,i] * Di)
96 : * S = sum(A[3,i] * Di)
97 : * T = sum(A[4,i] * Di)
98 : * U = sum(A[5,i] * Di) with 0<=i<N
99 : *
100 : * To recover from a failure of six disks at indexes x,y,z,h,v,w,
101 : * with 0<=x<y<z<h<v<w<N, we compute the parity of the available N-6
102 : * disks as:
103 : *
104 : * Pa = sum(Di)
105 : * Qa = sum(2^i * Di)
106 : * Ra = sum(A[2,i] * Di)
107 : * Sa = sum(A[3,i] * Di)
108 : * Ta = sum(A[4,i] * Di)
109 : * Ua = sum(A[5,i] * Di) with 0<=i<N,i!=x,i!=y,i!=z,i!=h,i!=v,i!=w.
110 : *
111 : * And if we define:
112 : *
113 : * Pd = Pa + P
114 : * Qd = Qa + Q
115 : * Rd = Ra + R
116 : * Sd = Sa + S
117 : * Td = Ta + T
118 : * Ud = Ua + U
119 : *
120 : * we can sum these two sets of equations, obtaining:
121 : *
122 : * Pd = Dx + Dy + Dz + Dh + Dv + Dw
123 : * Qd = 2^x * Dx + 2^y * Dy + 2^z * Dz + 2^h * Dh + 2^v * Dv + 2^w * Dw
124 : * Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
125 : * Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
126 : * Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
127 : * Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
128 : *
129 : * A linear system always solvable because the coefficients matrix is
130 : * always not singular due the properties of the matrix A[].
131 : *
132 : * Resulting speed in x64, with 8 data disks, using a stripe of 256 KiB,
133 : * for a Core i5-4670K Haswell Quad-Core 3.4GHz is:
134 : *
135 : * int8 int32 int64 sse2 ssse3 avx2
136 : * gen1 13339 25438 45438 50588
137 : * gen2 4115 6514 21840 32201
138 : * gen3 814 10154 18613
139 : * gen4 620 7569 14229
140 : * gen5 496 5149 10051
141 : * gen6 413 4239 8190
142 : *
143 : * Values are in MiB/s of data processed by a single thread, not counting
144 : * generated parity.
145 : *
146 : * You can replicate these results in your machine using the
147 : * "raid/test/speedtest.c" program.
148 : *
149 : * For comparison, the triple parity computation using the power
150 : * coeffients "1,2,2^-1" is only a little faster than the one based on
151 : * the Cauchy matrix if SSSE3 or AVX2 is present.
152 : *
153 : * int8 int32 int64 sse2 ssse3 avx2
154 : * genz 2337 2874 10920 18944
155 : *
156 : * In conclusion, the use of power coefficients, and specifically powers
157 : * of 1,2,2^-1, is the best option to implement triple parity in CPUs
158 : * without SSSE3 and AVX2.
159 : * But if a modern CPU with SSSE3 or AVX2 is available, the Cauchy
160 : * matrix is the best option because it provides a fast and general
161 : * approach working for any number of parities.
162 : *
163 : * References:
164 : * [1] Anvin, "The mathematics of RAID-6", 2004
165 : * [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
166 : * [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
167 : * [4] Roth, "Introduction to Coding Theory", 2006
168 : * [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
169 : */
170 :
171 : /**
172 : * Generator matrix currently used.
173 : */
174 : const uint8_t (*raid_gfgen)[256];
175 :
176 541 : void raid_mode(int mode)
177 : {
178 541 : if (mode == RAID_MODE_VANDERMONDE) {
179 2 : raid_gen_ptr[2] = raid_genz_ptr;
180 2 : raid_gfgen = gfvandermonde;
181 : } else {
182 539 : raid_gen_ptr[2] = raid_gen3_ptr;
183 539 : raid_gfgen = gfcauchy;
184 : }
185 541 : }
186 :
187 : /**
188 : * Buffer filled with 0 used in recovering.
189 : */
190 : static void *raid_zero_block;
191 :
192 208 : void raid_zero(void *zero)
193 : {
194 208 : raid_zero_block = zero;
195 208 : }
196 :
197 : /*
198 : * Forwarders for parity computation.
199 : *
200 : * These functions compute the parity blocks from the provided data.
201 : *
202 : * The number of parities to compute is implicit in the position in the
203 : * forwarder vector. Position at index #i, computes (#i+1) parities.
204 : *
205 : * All these functions give the guarantee that parities are written
206 : * in order. First parity P, then parity Q, and so on.
207 : * This allows to specify the same memory buffer for multiple parities
208 : * knowning that you'll get the latest written one.
209 : * This characteristic is used by the raid_delta_gen() function to
210 : * avoid to damage unused parities in recovering.
211 : *
212 : * @nd Number of data blocks
213 : * @size Size of the blocks pointed by @v. It must be a multipler of 64.
214 : * @v Vector of pointers to the blocks of data and parity.
215 : * It has (@nd + #parities) elements. The starting elements are the blocks
216 : * for data, following with the parity blocks.
217 : * Each block has @size bytes.
218 : */
219 : void (*raid_gen_ptr[RAID_PARITY_MAX])(int nd, size_t size, void **vv);
220 : void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
221 : void (*raid_genz_ptr)(int nd, size_t size, void **vv);
222 :
223 1235761 : void raid_gen(int nd, int np, size_t size, void **v)
224 : {
225 : /* enforce limit on size */
226 1235761 : BUG_ON(size % 64 != 0);
227 :
228 : /* enforce limit on number of failures */
229 1235761 : BUG_ON(np < 1);
230 1235761 : BUG_ON(np > RAID_PARITY_MAX);
231 :
232 1235761 : raid_gen_ptr[np - 1](nd, size, v);
233 1235761 : }
234 :
235 : /**
236 : * Inverts the square matrix M of size nxn into V.
237 : *
238 : * This is not a general matrix inversion because we assume the matrix M
239 : * to have all the square submatrix not singular.
240 : * We use Gauss elimination to invert.
241 : *
242 : * @M Matrix to invert with @n rows and @n columns.
243 : * @V Destination matrix where the result is put.
244 : * @n Number of rows and columns of the matrix.
245 : */
246 172769 : void raid_invert(uint8_t *M, uint8_t *V, int n)
247 : {
248 : int i, j, k;
249 :
250 : /* set the identity matrix in V */
251 798364 : for (i = 0; i < n; ++i)
252 3221012 : for (j = 0; j < n; ++j)
253 2595417 : V[i * n + j] = i == j;
254 :
255 : /* for each element in the diagonal */
256 798364 : for (k = 0; k < n; ++k) {
257 : uint8_t f;
258 :
259 : /* the diagonal element cannot be 0 because */
260 : /* we are inverting matrices with all the square */
261 : /* submatrices not singular */
262 625595 : BUG_ON(M[k * n + k] == 0);
263 :
264 : /* make the diagonal element to be 1 */
265 1251190 : f = inv(M[k * n + k]);
266 3221012 : for (j = 0; j < n; ++j) {
267 5190834 : M[k * n + j] = mul(f, M[k * n + j]);
268 5190834 : V[k * n + j] = mul(f, V[k * n + j]);
269 : }
270 :
271 : /* make all the elements over and under the diagonal */
272 : /* to be zero */
273 3221012 : for (i = 0; i < n; ++i) {
274 2595417 : if (i == k)
275 625595 : continue;
276 1969822 : f = M[i * n + k];
277 11272530 : for (j = 0; j < n; ++j) {
278 18605416 : M[i * n + j] ^= mul(f, M[k * n + j]);
279 18605416 : V[i * n + j] ^= mul(f, V[k * n + j]);
280 : }
281 : }
282 : }
283 172769 : }
284 :
285 : /**
286 : * Computes the parity without the missing data blocks
287 : * and store it in the buffers of such data blocks.
288 : *
289 : * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
290 : */
291 171901 : void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
292 : {
293 : void *p[RAID_PARITY_MAX];
294 : void *pa[RAID_PARITY_MAX];
295 : int i, j;
296 : int np;
297 : void *latest;
298 :
299 : /* total number of parities we are going to process */
300 : /* they are both the used and the unused ones */
301 171901 : np = ip[nr - 1] + 1;
302 :
303 : /* latest missing data block */
304 171901 : latest = v[id[nr - 1]];
305 :
306 : /* setup pointers for delta computation */
307 874239 : for (i = 0, j = 0; i < np; ++i) {
308 : /* keep a copy of the original parity vector */
309 702338 : p[i] = v[nd + i];
310 :
311 702338 : if (ip[j] == i) {
312 : /*
313 : * Set used parities to point to the missing
314 : * data blocks.
315 : *
316 : * The related data blocks are instead set
317 : * to point to the "zero" buffer.
318 : */
319 :
320 : /* the latest parity to use ends the for loop and */
321 : /* then it cannot happen to process more of them */
322 606053 : BUG_ON(j >= nr);
323 :
324 : /* buffer for missing data blocks */
325 606053 : pa[j] = v[id[j]];
326 :
327 : /* set at zero the missing data blocks */
328 606053 : v[id[j]] = raid_zero_block;
329 :
330 : /* compute the parity over the missing data blocks */
331 606053 : v[nd + i] = pa[j];
332 :
333 : /* check for the next used entry */
334 606053 : ++j;
335 : } else {
336 : /*
337 : * Unused parities are going to be rewritten with
338 : * not significative data, becase we don't have
339 : * functions able to compute only a subset of
340 : * parities.
341 : *
342 : * To avoid this, we reuse parity buffers,
343 : * assuming that all the parity functions write
344 : * parities in order.
345 : *
346 : * We assign the unused parity block to the same
347 : * block of the latest used parity that we know it
348 : * will be written.
349 : *
350 : * This means that this block will be written
351 : * multiple times and only the latest write will
352 : * contain the correct data.
353 : */
354 96285 : v[nd + i] = latest;
355 : }
356 : }
357 :
358 : /* all the parities have to be processed */
359 171901 : BUG_ON(j != nr);
360 :
361 : /* recompute the parity, note that np may be smaller than the */
362 : /* total number of parities available */
363 171901 : raid_gen(nd, np, size, v);
364 :
365 : /* restore data buffers as before */
366 777954 : for (j = 0; j < nr; ++j)
367 606053 : v[id[j]] = pa[j];
368 :
369 : /* restore parity buffers as before */
370 874239 : for (i = 0; i < np; ++i)
371 702338 : v[nd + i] = p[i];
372 171901 : }
373 :
374 : /**
375 : * Recover failure of one data block for PAR1.
376 : *
377 : * Starting from the equation:
378 : *
379 : * Pd = Dx
380 : *
381 : * and solving we get:
382 : *
383 : * Dx = Pd
384 : */
385 101555 : void raid_rec1of1(int *id, int nd, size_t size, void **v)
386 : {
387 : void *p;
388 : void *pa;
389 :
390 : /* for PAR1 we can directly compute the missing block */
391 : /* and we don't need to use the zero buffer */
392 101555 : p = v[nd];
393 101555 : pa = v[id[0]];
394 :
395 : /* use the parity as missing data block */
396 101555 : v[id[0]] = p;
397 :
398 : /* compute the parity over the missing data block */
399 101555 : v[nd] = pa;
400 :
401 : /* compute */
402 101555 : raid_gen(nd, 1, size, v);
403 :
404 : /* restore as before */
405 101555 : v[id[0]] = pa;
406 101555 : v[nd] = p;
407 101555 : }
408 :
409 : /**
410 : * Recover failure of two data blocks for PAR2.
411 : *
412 : * Starting from the equations:
413 : *
414 : * Pd = Dx + Dy
415 : * Qd = 2^id[0] * Dx + 2^id[1] * Dy
416 : *
417 : * and solving we get:
418 : *
419 : * 1 2^(-id[0])
420 : * Dy = ------------------- * Pd + ------------------- * Qd
421 : * 2^(id[1]-id[0]) + 1 2^(id[1]-id[0]) + 1
422 : *
423 : * Dx = Dy + Pd
424 : *
425 : * with conditions:
426 : *
427 : * 2^id[0] != 0
428 : * 2^(id[1]-id[0]) + 1 != 0
429 : *
430 : * That are always satisfied for any 0<=id[0]<id[1]<255.
431 : */
432 132 : void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
433 : {
434 132 : uint8_t **v = (uint8_t **)vv;
435 : size_t i;
436 : uint8_t *p;
437 : uint8_t *pa;
438 : uint8_t *q;
439 : uint8_t *qa;
440 : const uint8_t *T[2];
441 :
442 : /* get multiplication tables */
443 528 : T[0] = table(inv(pow2(id[1] - id[0]) ^ 1));
444 660 : T[1] = table(inv(pow2(id[0]) ^ pow2(id[1])));
445 :
446 : /* compute delta parity */
447 132 : raid_delta_gen(2, id, ip, nd, size, vv);
448 :
449 132 : p = v[nd];
450 132 : q = v[nd + 1];
451 132 : pa = v[id[0]];
452 132 : qa = v[id[1]];
453 :
454 33924 : for (i = 0; i < size; ++i) {
455 : /* delta */
456 33792 : uint8_t Pd = p[i] ^ pa[i];
457 33792 : uint8_t Qd = q[i] ^ qa[i];
458 :
459 : /* reconstruct */
460 33792 : uint8_t Dy = T[0][Pd] ^ T[1][Qd];
461 33792 : uint8_t Dx = Pd ^ Dy;
462 :
463 : /* set */
464 33792 : pa[i] = Dx;
465 33792 : qa[i] = Dy;
466 : }
467 132 : }
468 :
469 : /*
470 : * Forwarders for data recovery.
471 : *
472 : * These functions recover data blocks using the specified parity
473 : * to recompute the missing data.
474 : *
475 : * Note that the format of vectors @id/@ip is different than raid_rec().
476 : * For example, in the vector @ip the first parity is represented with the
477 : * value 0 and not @nd.
478 : *
479 : * @nr Number of failed data blocks to recover.
480 : * @id[] Vector of @nr indexes of the data blocks to recover.
481 : * The indexes start from 0. They must be in order.
482 : * @ip[] Vector of @nr indexes of the parity blocks to use in the recovering.
483 : * The indexes start from 0. They must be in order.
484 : * @nd Number of data blocks.
485 : * @np Number of parity blocks.
486 : * @size Size of the blocks pointed by @v. It must be a multipler of 64.
487 : * @v Vector of pointers to the blocks of data and parity.
488 : * It has (@nd + @np) elements. The starting elements are the blocks
489 : * for data, following with the parity blocks.
490 : * Each block has @size bytes.
491 : */
492 : void (*raid_rec_ptr[RAID_PARITY_MAX])(
493 : int nr, int *id, int *ip, int nd, size_t size, void **vv);
494 :
495 871 : void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
496 : {
497 : int nrd; /* number of data blocks to recover */
498 : int nrp; /* number of parity blocks to recover */
499 :
500 : /* enforce limit on size */
501 871 : BUG_ON(size % 64 != 0);
502 :
503 : /* enforce limit on number of failures */
504 871 : BUG_ON(nr > np);
505 871 : BUG_ON(np > RAID_PARITY_MAX);
506 :
507 : /* enforce order in index vector */
508 871 : BUG_ON(nr >= 2 && ir[0] >= ir[1]);
509 871 : BUG_ON(nr >= 3 && ir[1] >= ir[2]);
510 871 : BUG_ON(nr >= 4 && ir[2] >= ir[3]);
511 871 : BUG_ON(nr >= 5 && ir[3] >= ir[4]);
512 871 : BUG_ON(nr >= 6 && ir[4] >= ir[5]);
513 :
514 : /* enforce limit on index vector */
515 871 : BUG_ON(nr > 0 && ir[nr-1] >= nd + np);
516 :
517 : /* count the number of data blocks to recover */
518 871 : nrd = 0;
519 3487 : while (nrd < nr && ir[nrd] < nd)
520 1745 : ++nrd;
521 :
522 : /* all the remaining are parity */
523 871 : nrp = nr - nrd;
524 :
525 : /* enforce limit on number of failures */
526 871 : BUG_ON(nrd > nd);
527 871 : BUG_ON(nrp > np);
528 :
529 : /* if failed data is present */
530 871 : if (nrd != 0) {
531 : int ip[RAID_PARITY_MAX];
532 : int i, j, k;
533 :
534 : /* setup the vector of parities to use */
535 6048 : for (i = 0, j = 0, k = 0; i < np; ++i) {
536 5179 : if (j < nrp && ir[nrd + j] == nd + i) {
537 : /* this parity has to be recovered */
538 22 : ++j;
539 : } else {
540 : /* this parity is used for recovering */
541 5157 : ip[k] = i;
542 5157 : ++k;
543 : }
544 : }
545 :
546 : /* recover the nrd data blocks specified in ir[], */
547 : /* using the first nrd parity in ip[] for recovering */
548 869 : raid_rec_ptr[nrd - 1](nrd, ir, ip, nd, size, v);
549 : }
550 :
551 : /* recompute all the parities up to the last bad one */
552 871 : if (nrp != 0)
553 12 : raid_gen(nd, ir[nr - 1] - nd + 1, size, v);
554 871 : }
555 :
556 215082 : void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v)
557 : {
558 : /* enforce limit on size */
559 215082 : BUG_ON(size % 64 != 0);
560 :
561 : /* enforce limit on number of failures */
562 215082 : BUG_ON(nr > nd);
563 215082 : BUG_ON(nr > RAID_PARITY_MAX);
564 :
565 : /* enforce order in index vector for data */
566 215082 : BUG_ON(nr >= 2 && id[0] >= id[1]);
567 215082 : BUG_ON(nr >= 3 && id[1] >= id[2]);
568 215082 : BUG_ON(nr >= 4 && id[2] >= id[3]);
569 215082 : BUG_ON(nr >= 5 && id[3] >= id[4]);
570 215082 : BUG_ON(nr >= 6 && id[4] >= id[5]);
571 :
572 : /* enforce limit on index vector for data */
573 215082 : BUG_ON(nr > 0 && id[nr-1] >= nd);
574 :
575 : /* enforce order in index vector for parity */
576 215082 : BUG_ON(nr >= 2 && ip[0] >= ip[1]);
577 215082 : BUG_ON(nr >= 3 && ip[1] >= ip[2]);
578 215082 : BUG_ON(nr >= 4 && ip[2] >= ip[3]);
579 215082 : BUG_ON(nr >= 5 && ip[3] >= ip[4]);
580 215082 : BUG_ON(nr >= 6 && ip[4] >= ip[5]);
581 :
582 : /* if failed data is present */
583 215082 : if (nr != 0)
584 215080 : raid_rec_ptr[nr - 1](nr, id, ip, nd, size, v);
585 215082 : }
586 :
|