Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : // Copyright (C) 2013 Andrea Mazzoleni
3 :
4 : #include "internal.h"
5 : #include "gf.h"
6 :
7 : /*
8 : * Cauchy Matrix Construction for MDS RAID with Arbitrary Parity Levels in GF(2^8)
9 : *
10 : * This is a RAID implementation operating in the Galois Field GF(2^8) with
11 : * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), supporting
12 : * up to six parity levels.
13 : *
14 : * For RAID5 and RAID6, it follows the method described in H. Peter Anvin's
15 : * paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
16 : * complete explanation.
17 : *
18 : * Triple parity was first evaluated using an extension of the same approach,
19 : * with additional parity coefficients set as powers of 2^-1, defined as:
20 : *
21 : * P = sum(Di)
22 : * Q = sum(2^i * Di)
23 : * R = sum(2^-i * Di) for 0 <= i < N
24 : *
25 : * This approach works efficiently for triple parity because multiplications
26 : * and divisions by 2 in GF(2^8) are very fast. It is similar to ZFS RAIDZ3,
27 : * which uses powers of 4 instead of 2^-1.
28 : *
29 : * Unfortunately, this method does not extend beyond triple parity because
30 : * no choice of power coefficients guarantees solvable equations for all
31 : * combinations of missing disks. This is expected: a Vandermonde matrix,
32 : * used in this method, does not ensure that all submatrices are nonsingular
33 : * [2, Chap. 11, Problem 7], which is required for an MDS (Maximum Distance
34 : * Separable) code [2, Chap. 11, Theorem 8].
35 : *
36 : * To overcome this limitation, a Cauchy matrix [3][4] is used for parity
37 : * computation. A Cauchy matrix ensures that all square submatrices are
38 : * nonsingular, so the linear system is always solvable for any combination
39 : * of missing disks. This guarantees an MDS code.
40 : *
41 : * How the matrix is obtained:
42 : *
43 : * The matrix is constructed to match Linux RAID coefficients for the
44 : * first two rows, while ensuring that all square submatrices are nonsingular.
45 : *
46 : * 1. Start by forming a Cauchy matrix with elements defined as 1/(x_i + y_j),
47 : * where all x_i and y_j are distinct elements (the textbook definition
48 : * of a Cauchy matrix).
49 : *
50 : * 2. For the first row (j=0), set x_i = 2^-i and y_0 = 0, resulting in:
51 : *
52 : * row j=0 -> 1/(x_i + y_0) = 1/(2^-i + 0) = 2^i
53 : *
54 : * which reproduces the RAID-6 coefficients.
55 : *
56 : * 3. For subsequent rows (j>0), set y_j = 2^j, yielding:
57 : *
58 : * rows j>0 -> 1/(x_i + y_j) = 1/(2^-i + 2^j)
59 : *
60 : * ensuring x_i != y_j for any i >= 0, j >= 1, and i + j < 255.
61 : *
62 : * 4. Place a row filled with 1 at the top of the matrix, transforming it
63 : * into an Extended Cauchy Matrix. This preserves the nonsingularity
64 : * of all square submatrices.
65 : *
66 : * This first row reproduces the RAID-5 coefficients.
67 : *
68 : * 5. Adjust each row with a factor so that the first column contains all 1s.
69 : * This transformation also preserves the nonsingularity property,
70 : * maintaining the MDS code property.
71 : *
72 : * Concrete example in GF(256) for k=6, m=4:
73 : *
74 : * First, create a 3x6 Cauchy matrix using x_i = 2^-i and y_0 = 0, y_j = 2^j for j>0:
75 : *
76 : * x = { 1, 142, 71, 173, 216, 108 }
77 : * y = { 0, 2, 4 }
78 : *
79 : * The Cauchy matrix is:
80 : *
81 : * 1 2 4 8 16 32
82 : * 244 83 78 183 118 47
83 : * 167 39 213 59 153 82
84 : *
85 : * Divide row 2 by 244 and row 3 by 167. Then extend it with a row of ones
86 : * on top, and it remains MDS. This forms the code for m=4, with RAID-6 as
87 : * a subset:
88 : *
89 : * 1 1 1 1 1 1
90 : * 1 2 4 8 16 32
91 : * 1 245 210 196 154 113
92 : * 1 187 166 215 199 7
93 : *
94 : * Extending the same computation to k=251 and m=6 gives:
95 : *
96 : * This results in the matrix A[row, col] defined as:
97 : *
98 : * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
99 : * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
100 : * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
101 : * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
102 : * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
103 : * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
104 : *
105 : * This matrix supports six levels of parity, one for each row, for up to 251
106 : * data disks, one for each column. All the 377,342,351,231 square submatrices
107 : * are nonsingular, verified also by brute-force testing.
108 : *
109 : * This matrix can be extended to support any number of parities by simply
110 : * adding additional rows and removing one column for each new row to maintain
111 : * the condition i + j < 255.
112 : * (See mktables.c for more details on how the matrix is generated.)
113 : *
114 : * The downside is that generic multiplications are required to compute the
115 : * parity, not only fast multiplications by 2^n or 2^-n, which negatively
116 : * affect performance. However, optimized parallel multiplications using SSSE3
117 : * or AVX2 instructions [1][5] make this approach competitive with the triple
118 : * parity computation using power coefficients.
119 : *
120 : * Another advantage of the Cauchy matrix is that the first two rows can
121 : * replicate the RAID5 and RAID6 approach, resulting in a compatible extension.
122 : * SSSE3 or AVX2 instructions are only needed for triple parity or beyond.
123 : *
124 : * Parity computation is as follows:
125 : *
126 : * P = sum(Di)
127 : * Q = sum(2^i * Di)
128 : * R = sum(A[2,i] * Di)
129 : * S = sum(A[3,i] * Di)
130 : * T = sum(A[4,i] * Di)
131 : * U = sum(A[5,i] * Di) for 0 <= i < N
132 : *
133 : * Recovery from six disk failures at indices x, y, z, h, v, w (0 <= x < y < z < h < v < w < N)
134 : * involves computing parity of the remaining N-6 disks:
135 : *
136 : * Pa = sum(Di)
137 : * Qa = sum(2^i * Di)
138 : * Ra = sum(A[2,i] * Di)
139 : * Sa = sum(A[3,i] * Di)
140 : * Ta = sum(A[4,i] * Di)
141 : * Ua = sum(A[5,i] * Di) for i != x,y,z,h,v,w
142 : *
143 : * Defining:
144 : *
145 : * Pd = Pa + P
146 : * Qd = Qa + Q
147 : * Rd = Ra + R
148 : * Sd = Sa + S
149 : * Td = Ta + T
150 : * Ud = Ua + U
151 : *
152 : * yields:
153 : *
154 : * Pd = Dx + Dy + Dz + Dh + Dv + Dw
155 : * Qd = 2^x * Dx + 2^y * Dy + 2^z * Dz + 2^h * Dh + 2^v * Dv + 2^w * Dw
156 : * Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
157 : * Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
158 : * Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
159 : * Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
160 : *
161 : * This linear system is always solvable since the coefficient matrix is
162 : * nonsingular due to the properties of A[].
163 : *
164 : * Example performance on a Intel Core i7-10700 CPU @ 2.90GHz, stripe 256 KiB, 8 data disks:
165 : *
166 : * int8 int32 int64 sse2 sse2e ssse3 ssse3e avx2 avx2e
167 : * gen1 24995 46562 62683 77069
168 : * gen2 8243 15879 25788 27292 44907
169 : * genz 5509 10588 14485 13920 25175
170 : * gen3 1190 13201 14307 27268
171 : * gen4 914 10061 11006 21328
172 : * gen5 746 7970 8828 17040
173 : * gen6 630 6627 7516 14584
174 : *
175 : * Values are in MiB/s of data processed by a single thread.
176 : * Results can be reproduced using "raid/test/speedtest.c".
177 : *
178 : * Triple parity with power coefficients "1, 2, 2^-1" is slightly faster than
179 : * the Cauchy matrix computation if SSSE3 or AVX2 are available:
180 : *
181 : * int8 int32 int64 sse2 ssse3 avx2
182 : * genz 2337 2874 10920 18944
183 : *
184 : * In conclusion, the use of power coefficients, specifically powers
185 : * of 1, 2, and 2^-1, is the best option to implement triple parity on CPUs
186 : * without SSSE3 and AVX2.
187 : * On modern CPUs with SSSE3 or AVX2, the Cauchy matrix is the best
188 : * option because it provides a fast and general approach that works for any
189 : * number of parities.
190 : *
191 : * References:
192 : * [1] Anvin, "The mathematics of RAID-6", 2004
193 : * [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
194 : * [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
195 : * [4] Roth, "Introduction to Coding Theory", 2006
196 : * [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
197 : */
198 :
199 : /**
200 : * Buffer filled with 0 used in recovering.
201 : */
202 : static void *raid_zero_block;
203 :
204 240 : void raid_zero(void *zero)
205 : {
206 240 : raid_zero_block = zero;
207 240 : }
208 :
209 : /**
210 : * Inverts the square matrix M of size nxn into V.
211 : *
212 : * This is not a general matrix inversion because we assume the matrix M
213 : * to have all the square submatrix not singular.
214 : * We use Gauss elimination to invert.
215 : *
216 : * @M Matrix to invert with @n rows and @n columns.
217 : * @V Destination matrix where the result is put.
218 : * @n Number of rows and columns of the matrix.
219 : */
220 204206 : void raid_invert(uint8_t *M, uint8_t *V, int n)
221 : {
222 : int i, j, k;
223 :
224 : /* set the identity matrix in V */
225 942501 : for (i = 0; i < n; ++i)
226 3760858 : for (j = 0; j < n; ++j)
227 3022563 : V[i * n + j] = i == j;
228 :
229 : /* for each element in the diagonal */
230 942501 : for (k = 0; k < n; ++k) {
231 : uint8_t f;
232 :
233 : /*
234 : * The diagonal element cannot be 0 because
235 : * we are inverting matrices with all the square
236 : * submatrices not singular
237 : */
238 738295 : BUG_ON(M[k * n + k] == 0);
239 :
240 : /* make the diagonal element to be 1 */
241 738295 : f = inv(M[k * n + k]);
242 3760858 : for (j = 0; j < n; ++j) {
243 3022563 : M[k * n + j] = mul(f, M[k * n + j]);
244 6045126 : V[k * n + j] = mul(f, V[k * n + j]);
245 : }
246 :
247 : /*
248 : * Make all the elements over and under the diagonal
249 : * to be zero
250 : */
251 3760858 : for (i = 0; i < n; ++i) {
252 3022563 : if (i == k)
253 738295 : continue;
254 2284268 : f = M[i * n + k];
255 12876408 : for (j = 0; j < n; ++j) {
256 10592140 : M[i * n + j] ^= mul(f, M[k * n + j]);
257 21184280 : V[i * n + j] ^= mul(f, V[k * n + j]);
258 : }
259 : }
260 : }
261 204206 : }
262 :
263 : /**
264 : * Computes the parity without the missing data blocks
265 : * and store it in the buffers of such data blocks.
266 : *
267 : * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
268 : */
269 203530 : void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
270 : {
271 : void *p[RAID_PARITY_MAX];
272 : void *pa[RAID_PARITY_MAX];
273 : int i, j;
274 : int np;
275 : void *latest;
276 :
277 : /*
278 : * Total number of parities we are going to process
279 : * they are both the used and the unused ones
280 : */
281 203530 : np = ip[nr - 1] + 1;
282 :
283 : /* latest missing data block */
284 203530 : latest = v[id[nr - 1]];
285 :
286 : /* setup pointers for delta computation */
287 1047537 : for (i = 0, j = 0; i < np; ++i) {
288 : /* keep a copy of the original parity vector */
289 844007 : p[i] = v[nd + i];
290 :
291 844007 : if (ip[j] == i) {
292 : /*
293 : * Set used parities to point to the missing
294 : * data blocks.
295 : *
296 : * The related data blocks are instead set
297 : * to point to the "zero" buffer.
298 : */
299 :
300 : /*
301 : * The latest parity to use ends the for loop and
302 : * then it cannot happen to process more of them
303 : */
304 718945 : BUG_ON(j >= nr);
305 :
306 : /* buffer for missing data blocks */
307 718945 : pa[j] = v[id[j]];
308 :
309 : /* set at zero the missing data blocks */
310 718945 : v[id[j]] = raid_zero_block;
311 :
312 : /* compute the parity over the missing data blocks */
313 718945 : v[nd + i] = pa[j];
314 :
315 : /* check for the next used entry */
316 718945 : ++j;
317 : } else {
318 : /*
319 : * Unused parities are going to be rewritten with
320 : * not significant data, because we don't have
321 : * functions able to compute only a subset of
322 : * parities.
323 : *
324 : * To avoid this, we reuse parity buffers,
325 : * assuming that all the parity functions write
326 : * parities in order.
327 : *
328 : * We assign the unused parity block to the same
329 : * block of the latest used parity that we know it
330 : * will be written.
331 : *
332 : * This means that this block will be written
333 : * multiple times and only the latest write will
334 : * contain the correct data.
335 : */
336 125062 : v[nd + i] = latest;
337 : }
338 : }
339 :
340 : /* all the parities have to be processed */
341 203530 : BUG_ON(j != nr);
342 :
343 : /*
344 : * Recompute the parity, note that np may be smaller than the
345 : * total number of parities available
346 : */
347 203530 : raid_gen(nd, np, size, v);
348 :
349 : /* restore data buffers as before */
350 922475 : for (j = 0; j < nr; ++j)
351 718945 : v[id[j]] = pa[j];
352 :
353 : /* restore parity buffers as before */
354 1047537 : for (i = 0; i < np; ++i)
355 844007 : v[nd + i] = p[i];
356 203530 : }
357 :
358 : /**
359 : * Recover failure of one data block for PAR1.
360 : *
361 : * Starting from the equation:
362 : *
363 : * Pd = Dx
364 : *
365 : * and solving we get:
366 : *
367 : * Dx = Pd
368 : */
369 101008 : void raid_rec1of1(int *id, int nd, size_t size, void **v)
370 : {
371 : void *p;
372 : void *pa;
373 :
374 : /*
375 : * For PAR1 we can directly compute the missing block
376 : * and we don't need to use the zero buffer
377 : */
378 101008 : p = v[nd];
379 101008 : pa = v[id[0]];
380 :
381 : /* use the parity as missing data block */
382 101008 : v[id[0]] = p;
383 :
384 : /* compute the parity over the missing data block */
385 101008 : v[nd] = pa;
386 :
387 : /* compute */
388 101008 : raid_gen(nd, 1, size, v);
389 :
390 : /* restore as before */
391 101008 : v[id[0]] = pa;
392 101008 : v[nd] = p;
393 101008 : }
394 :
395 : /**
396 : * Recover failure of two data blocks for PAR2.
397 : *
398 : * Starting from the equations:
399 : *
400 : * Pd = Dx + Dy
401 : * Qd = 2^id[0] * Dx + 2^id[1] * Dy
402 : *
403 : * and solving we get:
404 : *
405 : * 1 2^(-id[0])
406 : * Dy = ------------------- * Pd + ------------------- * Qd
407 : * 2^(id[1]-id[0]) + 1 2^(id[1]-id[0]) + 1
408 : *
409 : * Dx = Dy + Pd
410 : *
411 : * with conditions:
412 : *
413 : * 2^id[0] != 0
414 : * 2^(id[1]-id[0]) + 1 != 0
415 : *
416 : * That are always satisfied for any 0<=id[0]<id[1]<255.
417 : */
418 132 : void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
419 : {
420 132 : uint8_t **v = (uint8_t **)vv;
421 : size_t i;
422 : uint8_t *p;
423 : uint8_t *pa;
424 : uint8_t *q;
425 : uint8_t *qa;
426 : const uint8_t *T[2];
427 :
428 : /* get multiplication tables */
429 396 : T[0] = table(inv(pow2(id[1] - id[0]) ^ 1));
430 528 : T[1] = table(inv(pow2(id[0]) ^ pow2(id[1])));
431 :
432 : /* compute delta parity */
433 132 : raid_delta_gen(2, id, ip, nd, size, vv);
434 :
435 132 : p = v[nd];
436 132 : q = v[nd + 1];
437 132 : pa = v[id[0]];
438 132 : qa = v[id[1]];
439 :
440 33924 : for (i = 0; i < size; ++i) {
441 : /* delta */
442 33792 : uint8_t Pd = p[i] ^ pa[i];
443 33792 : uint8_t Qd = q[i] ^ qa[i];
444 :
445 : /* reconstruct */
446 33792 : uint8_t Dy = T[0][Pd] ^ T[1][Qd];
447 33792 : uint8_t Dx = Pd ^ Dy;
448 :
449 : /* set */
450 33792 : pa[i] = Dx;
451 33792 : qa[i] = Dy;
452 : }
453 132 : }
|