Line data Source code
1 : // SPDX-License-Identifier: GPL-3.0-or-later
2 : // Copyright (C) 2013 Andrea Mazzoleni
3 :
4 : #include "portable.h"
5 :
6 : #include "support.h"
7 : #include "elem.h"
8 : #include "state.h"
9 : #include "parity.h"
10 : #include "handle.h"
11 : #include "io.h"
12 : #include "raid/raid.h"
13 :
14 : /****************************************************************************/
15 : /* scrub */
16 :
/**
 * Map an error code to the tag prefix used in the log.
 *
 * Hardware (I/O) failures are tagged "error_io", anything else "error".
 */
static const char* es(int err)
{
	return is_hw(err) ? "error_io" : "error";
}
24 :
/**
 * Buffer for storing the new hashes.
 *
 * During a rehash each disk gets one slot; the new hash is computed
 * while scrubbing and copied into ::block->hash only if the whole
 * block stripe verifies without errors.
 */
struct snapraid_rehash {
	unsigned char hash[HASH_MAX]; /**< Newly computed hash, pending commit. */
	struct snapraid_block* block; /**< Block to update, or 0 if no rehash for this disk. */
};
32 :
/**
 * Scrub plan to use.
 *
 * Selects which blocks block_is_enabled() picks for scrubbing.
 */
struct snapraid_plan {
	struct snapraid_state* state; /**< Global state, used to read block info. */
	int plan; /**< One of the SCRUB_*. */
	time_t timelimit; /**< Time limit. Valid only with SCRUB_AUTO. */
	block_off_t lastlimit; /**< Number of blocks allowed with time exactly at ::timelimit. Valid only with SCRUB_AUTO. */
};
42 :
/**
 * Check if we have to process the specified block index ::i.
 *
 * ::countlast accumulates, across calls, how many blocks with time
 * exactly equal to the plan ::timelimit have already been selected;
 * it enforces the ::lastlimit cap of the SCRUB_AUTO plan.
 *
 * Return 1 to scrub the block, 0 to skip it.
 */
static int block_is_enabled(struct snapraid_plan* plan, block_off_t* countlast, block_off_t i)
{
	time_t blocktime;
	snapraid_info info;

	/* don't scrub unused blocks in all plans */
	info = info_get(&plan->state->infoarr, i);
	if (info == 0)
		return 0;

	/* bad blocks are always scrubbed in all plans */
	if (info_get_bad(info))
		return 1;

	switch (plan->plan) {
	case SCRUB_FULL :
		/* in 'full' plan everything is scrubbed */
		return 1;
	case SCRUB_EVEN :
		/* in 'even' plan, scrub only even blocks */
		return i % 2 == 0;
	case SCRUB_NEW :
		/* in 'new' plan, only blocks never scrubbed */
		return info_get_justsynced(info);
	case SCRUB_BAD :
		/* in 'bad' plan, only bad blocks (already reported) */
		return 0;
	}

	/* here only for SCRUB_AUTO, which falls through the switch */

	/* if it's too new */
	blocktime = info_get_time(info);
	if (blocktime > plan->timelimit) {
		/* skip it */
		return 0;
	}

	/*
	 * If the time is less than the limit, always include
	 * otherwise, check if we reached the last limit count
	 */
	if (blocktime == plan->timelimit) {
		/* if we reached the count limit */
		if (*countlast >= plan->lastlimit) {
			/* skip it */
			return 0;
		}

		++*countlast;
	}

	return 1;
}
98 :
/**
 * Reader worker callback for data disks.
 *
 * Reads the data block at ::task->position from the disk bound to
 * ::worker into ::task->buffer, and reports the outcome in
 * ::task->state: TASK_STATE_DONE on success, or one of the
 * ERROR/IOERROR (possibly _CONTINUE) states on failure.
 */
static void scrub_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
{
	struct snapraid_io* io = worker->io;
	struct snapraid_state* state = io->state;
	struct snapraid_handle* handle = worker->handle;
	struct snapraid_disk* disk = handle->disk;
	block_off_t blockcur = task->position;
	unsigned char* buffer = task->buffer;
	int ret;
	char esc_buffer[ESC_MAX];

	/* if the disk position is not used */
	if (!disk) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the block */
	task->block = fs_par2block_find(disk, blockcur);

	/* if the block is not used */
	if (!block_has_file(task->block)) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the file of this block */
	task->file = fs_par2file_get(disk, blockcur, &task->file_pos);

	/* if the file is different than the current one, close it */
	if (handle->file != 0 && handle->file != task->file) {
		/* keep a pointer at the file we are going to close for error reporting */
		struct snapraid_file* report = handle->file;
		ret = handle_close(handle);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			/*
			 * This one is really an unexpected error, because we are only reading
			 * and closing a descriptor should never fail
			 */
			log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
			log_fatal_errno(errno, disk->name);
			log_fatal(errno, "Stopping at block %u\n", blockcur);

			/* hardware errors stop with IOERROR, other ones with ERROR */
			if (is_hw(errno)) {
				task->state = TASK_STATE_IOERROR;
			} else {
				task->state = TASK_STATE_ERROR;
			}
			return;
			/* LCOV_EXCL_STOP */
		}
	}

	ret = handle_open(handle, task->file, state->file_mode, log_error, 0); /* for missing file don't output a message */
	if (ret == -1) {
		log_tag("%s:%u:%s:%s: Open error. %s.\n", es(errno), blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
		if (is_hw(errno)) {
			/* LCOV_EXCL_START */
			log_fatal_errno(errno, disk->name);
			log_fatal(errno, "Stopping at block %u\n", blockcur);
			task->state = TASK_STATE_IOERROR;
			return;
			/* LCOV_EXCL_STOP */
		}

		/* a plain open failure is recoverable: continue with the next block */
		task->state = TASK_STATE_ERROR_CONTINUE;
		return;
	}

	/* check if the file is changed */
	if (handle->st.st_size != task->file->size
		|| handle->st.st_mtime != task->file->mtime_sec
		|| STAT_NSEC(&handle->st) != task->file->mtime_nsec
		/* don't check the inode to support filesystem without persistent inodes */
	) {
		/* report that the block and the file are not synced */
		task->is_timestamp_different = 1;
		/* follow */
	}

	/*
	 * Note that we intentionally don't abort if the file has different attributes
	 * from the last sync, as we are expected to return errors if running
	 * in an unsynced array. This is just like the check command.
	 */

	task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
	if (task->read_size == -1) {
		log_tag("%s:%u:%s:%s: Read error at position %u. %s.\n", es(errno), blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
		if (is_hw(errno)) {
			/* LCOV_EXCL_START */
			log_error_errno(errno, disk->name);
			task->state = TASK_STATE_IOERROR_CONTINUE;
			return;
			/* LCOV_EXCL_STOP */
		}

		task->state = TASK_STATE_ERROR_CONTINUE;
		return;
	}

	/* store the path of the opened file */
	pathcpy(task->path, sizeof(task->path), handle->path);

	task->state = TASK_STATE_DONE;
}
210 :
211 155732 : static void scrub_parity_reader(struct snapraid_worker* worker, struct snapraid_task* task)
212 : {
213 155732 : struct snapraid_io* io = worker->io;
214 155732 : struct snapraid_state* state = io->state;
215 155732 : struct snapraid_parity_handle* parity_handle = worker->parity_handle;
216 155732 : unsigned level = parity_handle->level;
217 155732 : block_off_t blockcur = task->position;
218 155732 : unsigned char* buffer = task->buffer;
219 : int ret;
220 :
221 : /* read the parity */
222 155732 : ret = parity_read(parity_handle, blockcur, buffer, state->block_size, log_error);
223 155732 : if (ret == -1) {
224 0 : log_tag("parity_%s:%u:%s: Read error. %s.\n", es(errno), blockcur, lev_config_name(level), strerror(errno));
225 0 : if (is_hw(errno)) {
226 : /* LCOV_EXCL_START */
227 : log_error_errno(errno, lev_config_name(level));
228 : task->state = TASK_STATE_IOERROR_CONTINUE;
229 : return;
230 : /* LCOV_EXCL_STOP */
231 : }
232 :
233 0 : task->state = TASK_STATE_ERROR_CONTINUE;
234 0 : return;
235 : }
236 :
237 155732 : task->state = TASK_STATE_DONE;
238 : }
239 :
/**
 * Scrub the blocks selected by ::plan in the range [blockstart, blockmax).
 *
 * Reads data and parity through the io worker threads, verifies the
 * block hashes and the parity, marks failing blocks as bad, and
 * refreshes the time info of the blocks that verify correctly.
 *
 * Return 0 on success, -1 on error. Note that with
 * opt.expect_recoverable set the meaning is inverted: finding no
 * error is reported as failure.
 */
static int state_scrub_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax, struct snapraid_plan* plan, time_t now)
{
	struct snapraid_io io;
	struct snapraid_handle* handle;
	void* rehandle_alloc;
	struct snapraid_rehash* rehandle;
	unsigned diskmax;
	block_off_t blockcur;
	unsigned j;
	unsigned buffermax;
	data_off_t countsize;
	block_off_t countpos;
	block_off_t countmax;
	block_off_t countlast;
	block_off_t autosavedone;
	block_off_t autosavelimit;
	block_off_t autosavemissing;
	int ret;
	unsigned soft_error;
	unsigned silent_error;
	unsigned io_error;
	unsigned l;
	unsigned* waiting_map;
	unsigned waiting_mac;
	char esc_buffer[ESC_MAX];
	bit_vect_t* block_enabled;

	/* maps the disks to handles */
	handle = handle_mapping(state, &diskmax);

	/* rehash buffers */
	rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);

	/* we need 1 * data + 2 * parity */
	buffermax = diskmax + 2 * state->level;

	/* initialize the io threads */
	io_init(&io, state, state->opt.io_cache, buffermax, scrub_data_reader, handle, diskmax, scrub_parity_reader, 0, parity_handle, state->level);

	/* possibly waiting disks */
	waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
	waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));

	soft_error = 0;
	silent_error = 0;
	io_error = 0;

	msg_progress("Selecting...\n");

	/* first count the number of blocks to process */
	countmax = 0;
	countlast = 0;
	block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
	for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
		if (!block_is_enabled(plan, &countlast, blockcur))
			continue;
		bit_vect_set(block_enabled, blockcur);
		++countmax;
	}

	/*
	 * Compute the autosave size for all disk, even if not read
	 * this makes sense because the speed should be almost the same
	 * if the disks are read in parallel
	 */
	autosavelimit = state->autosave / (diskmax * state->block_size);
	autosavemissing = countmax; /* blocks to do */
	autosavedone = 0; /* blocks done */

	/* drop until now */
	state_usage_waste(state);

	countsize = 0;
	countpos = 0;

	msg_progress("Scrubbing...\n");

	/* start all the worker threads */
	io_start(&io, blockstart, blockmax, block_enabled);

	/* alert: >0 means nothing to do, <0 means abort requested */
	int alert = state_progress_begin(state, blockstart, blockmax, countmax);
	if (alert > 0)
		goto end;
	if (alert < 0)
		goto bail;

	while (1) {
		unsigned char* buffer_recov[LEV_MAX];
		snapraid_info info;
		int error_on_this_block;
		int silent_error_on_this_block;
		int io_error_on_this_block;
		int block_is_unsynced;
		int rehash;
		void** buffer;

		/* go to the next block */
		blockcur = io_read_next(&io, &buffer);
		if (blockcur >= blockmax)
			break;

		/* until now is scheduling */
		state_usage_sched(state);

		/* one more block processed for autosave */
		++autosavedone;
		--autosavemissing;

		/* by default process the block, and skip it if something goes wrong */
		error_on_this_block = 0;
		silent_error_on_this_block = 0;
		io_error_on_this_block = 0;

		/*
		 * If all the blocks at this address are synced
		 * if not, parity is not even checked
		 */
		block_is_unsynced = 0;

		/* get block specific info */
		info = info_get(&state->infoarr, blockcur);

		/* if we have to use the old hash */
		rehash = info_get_rehash(info);

		/* for each disk, process the block */
		for (j = 0; j < diskmax; ++j) {
			struct snapraid_task* task;
			int read_size;
			unsigned char hash[HASH_MAX];
			struct snapraid_block* block;
			int file_is_unsynced;
			struct snapraid_disk* disk;
			struct snapraid_file* file;
			block_off_t file_pos;
			unsigned diskcur;

			/*
			 * If the file on this disk is synced
			 * if not, silent errors are assumed as expected error
			 */
			file_is_unsynced = 0;

			/* until now is misc */
			state_usage_misc(state);

			/* get the next task */
			task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);

			/* until now is disk */
			state_usage_disk(state, handle, waiting_map, waiting_mac);

			/* get the task results */
			disk = task->disk;
			block = task->block;
			file = task->file;
			file_pos = task->file_pos;
			read_size = task->read_size;

			/* by default no rehash in case of "continue" */
			rehandle[diskcur].block = 0;

			/* if the disk position is not used */
			if (!disk)
				continue;

			state_usage_file(state, disk, file);

			/* if the block is unsynced, errors are expected */
			if (block_has_invalid_parity(block)) {
				/* report that the block and the file are not synced */
				block_is_unsynced = 1;
				file_is_unsynced = 1;
				/* follow */
			}

			/* if the block is not used */
			if (!block_has_file(block))
				continue;

			/* if the block is unsynced, errors are expected */
			if (task->is_timestamp_different) {
				/* report that the block and the file are not synced */
				block_is_unsynced = 1;
				file_is_unsynced = 1;
				/* follow */
			}

			/* handle error conditions */
			if (task->state == TASK_STATE_IOERROR) {
				/* LCOV_EXCL_START */
				++io_error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR) {
				/* LCOV_EXCL_START */
				++soft_error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR_CONTINUE) {
				++soft_error;
				error_on_this_block = 1;
				continue;
			}
			if (task->state == TASK_STATE_IOERROR_CONTINUE) {
				++io_error;
				if (io_error >= state->opt.io_error_limit) {
					/* LCOV_EXCL_START */
					log_fatal(EIO, "DANGER! Too many input/output errors in the %s disk. It isn't possible to continue.\n", disk->dir);
					log_fatal(EIO, "Stopping at block %u\n", blockcur);
					goto bail;
					/* LCOV_EXCL_STOP */
				}

				/* otherwise continue */
				io_error_on_this_block = 1;
				continue;
			}
			if (task->state != TASK_STATE_DONE) {
				/* LCOV_EXCL_START */
				log_fatal(EINTERNAL, "Internal inconsistency in task state\n");
				os_abort();
				/* LCOV_EXCL_STOP */
			}

			countsize += read_size;

			/* now compute the hash */
			if (rehash) {
				/* verify against the old hash */
				memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);

				/* compute the new hash, and store it */
				rehandle[diskcur].block = block;
				memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
			} else {
				memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
			}

			/* until now is hash */
			state_usage_hash(state);

			if (block_has_updated_hash(block)) {
				/* compare the hash */
				if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
					unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);

					/* it's a silent error only if we are dealing with synced files */
					if (file_is_unsynced) {
						log_tag("error:%u:%s:%s: Data error at position %u, diff hash bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
						++soft_error;
						error_on_this_block = 1;
					} else {
						log_tag("error_data:%u:%s:%s: Data error at position %u, diff hash bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
						log_error(EDATA, "Data error in file '%s' at position '%u', diff hash bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
						++silent_error;
						silent_error_on_this_block = 1;
					}
					continue;
				}
			}
		}

		/* buffers for parity read and not computed */
		for (l = 0; l < state->level; ++l)
			buffer_recov[l] = buffer[diskmax + state->level + l];
		for (; l < LEV_MAX; ++l)
			buffer_recov[l] = 0;

		/* until now is misc */
		state_usage_misc(state);

		/* read the parity */
		for (l = 0; l < state->level; ++l) {
			struct snapraid_task* task;
			unsigned levcur;

			task = io_parity_read(&io, &levcur, waiting_map, &waiting_mac);

			/* until now is parity */
			state_usage_parity(state, waiting_map, waiting_mac);

			/* handle error conditions */
			if (task->state == TASK_STATE_IOERROR) {
				/* LCOV_EXCL_START */
				++io_error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR) {
				/* LCOV_EXCL_START */
				++soft_error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR_CONTINUE) {
				++soft_error;
				error_on_this_block = 1;

				/* if continuing on error, clear the missing buffer */
				buffer_recov[levcur] = 0;
				continue;
			}
			if (task->state == TASK_STATE_IOERROR_CONTINUE) {
				++io_error;
				if (io_error >= state->opt.io_error_limit) {
					/* LCOV_EXCL_START */
					log_fatal(EIO, "DANGER! Too many input/output errors in the %s disk. It isn't possible to continue.\n", lev_name(levcur));
					log_fatal(EIO, "Stopping at block %u\n", blockcur);
					goto bail;
					/* LCOV_EXCL_STOP */
				}

				/* otherwise continue */
				io_error_on_this_block = 1;

				/* if continuing on error, clear the missing buffer */
				buffer_recov[levcur] = 0;
				continue;
			}
			if (task->state != TASK_STATE_DONE) {
				/* LCOV_EXCL_START */
				log_fatal(EINTERNAL, "Internal inconsistency in task state\n");
				os_abort();
				/* LCOV_EXCL_STOP */
			}
		}

		/* if we have read all the data required and it's correct, proceed with the parity check */
		if (!error_on_this_block && !silent_error_on_this_block && !io_error_on_this_block) {

			/* compute the parity */
			raid_gen(diskmax, state->level, state->block_size, buffer);

			/* compare the parity */
			for (l = 0; l < state->level; ++l) {
				if (buffer_recov[l] && memcmp(buffer[diskmax + l], buffer_recov[l], state->block_size) != 0) {
					unsigned diff = memdiff(buffer[diskmax + l], buffer_recov[l], state->block_size);

					/* it's a silent error only if we are dealing with synced blocks */
					if (block_is_unsynced) {
						log_tag("parity_error:%u:%s: Data error, diff parity bits %u/%u\n", blockcur, lev_config_name(l), diff, state->block_size * 8);
						++soft_error;
						error_on_this_block = 1;
					} else {
						log_tag("parity_error_data:%u:%s: Data error, diff parity bits %u/%u\n", blockcur, lev_config_name(l), diff, state->block_size * 8);
						log_error(EDATA, "Data error in parity '%s' at position '%u', diff parity bits %u/%u\n", lev_config_name(l), blockcur, diff, state->block_size * 8);
						++silent_error;
						silent_error_on_this_block = 1;
					}
				}
			}

			/* until now is raid */
			state_usage_raid(state);
		}

		if (silent_error_on_this_block || io_error_on_this_block) {
			/* set the error status keeping other info */
			info_set(&state->infoarr, blockcur, info_set_bad(info));
		} else if (error_on_this_block) {
			/*
			 * Do nothing, as this is a generic error
			 * likely caused by a not synced array
			 */
		} else {
			/* if rehash is needed */
			if (rehash) {
				/* store all the new hash already computed */
				for (j = 0; j < diskmax; ++j) {
					if (rehandle[j].block)
						memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
				}
			}

			/*
			 * Update the time info of the block
			 * and clear any other flag
			 */
			info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 0));
		}

		/* mark the state as needing write */
		state->need_write = 1;

		/* count the number of processed block */
		++countpos;

		/* progress */
		if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
			/* LCOV_EXCL_START */
			break;
			/* LCOV_EXCL_STOP */
		}

		/* thermal control */
		if (state_thermal_alarm(state)) {
			/* until now is misc */
			state_usage_misc(state);

			state_progress_stop(state);

			state_thermal_cooldown(state);

			state_progress_restart(state);

			/* drop until now */
			state_usage_waste(state);
		}

		/* autosave */
		if (state->autosave != 0
			&& autosavedone >= autosavelimit /* if we have reached the limit */
			&& autosavemissing >= autosavelimit /* if we have at least a full step to do */
		) {
			autosavedone = 0; /* restart the counter */

			/* until now is misc */
			state_usage_misc(state);

			state_progress_stop(state);

			msg_progress("Autosaving...\n");
			state_write(state);

			state_progress_restart(state);

			/* drop until now */
			state_usage_waste(state);
		}
	}

end:
	state_progress_end(state, countpos, countmax, countsize, "Nothing to scrub. Use the -p PLAN option to select a different plan, like -p full.\n");

	/* save the new state if required */
	if (state->need_write || state->opt.force_content_write)
		state_write(state);

	state_usage_print(state);

	if (soft_error || silent_error || io_error) {
		msg_status("\n");
		msg_status("%8u soft errors\n", soft_error);
		msg_status("%8u io errors\n", io_error);
		msg_status("%8u data errors\n", silent_error);
	} else {
		msg_status("Everything OK\n");
	}

	if (soft_error)
		log_fatal(ESOFT, "WARNING! Unexpected soft errors!\n");
	if (io_error)
		log_fatal(EIO, "DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
	if (silent_error)
		log_fatal(EDATA, "DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
	if (io_error || silent_error) {
		log_fatal(ESOFT, "Use 'snapraid status' to list the bad blocks.\n");
		log_fatal(ESOFT, "Use 'snapraid -e fix' to recover them.\n");
		log_fatal(ESOFT, "Use 'snapraid -p bad scrub' to recheck after fixing to clear the bad state.\n");
	}

	log_tag("summary:error_soft:%u\n", soft_error);
	log_tag("summary:error_io:%u\n", io_error);
	log_tag("summary:error_data:%u\n", silent_error);
	if (soft_error + silent_error + io_error == 0)
		log_tag("summary:exit:ok\n");
	else if (silent_error + io_error == 0)
		log_tag("summary:exit:warning\n");
	else
		log_tag("summary:exit:error\n");
	log_flush();

bail:
	/* note that an abort jumps here directly, skipping the summary above */
	/* stop all the worker threads */
	io_stop(&io);

	for (j = 0; j < diskmax; ++j) {
		struct snapraid_file* file = handle[j].file;
		struct snapraid_disk* disk = handle[j].disk;
		ret = handle_close(&handle[j]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
			log_fatal_errno(errno, disk->name);

			if (is_hw(errno)) {
				++io_error;
			} else {
				++soft_error;
			}
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	free(handle);
	free(rehandle_alloc);
	free(waiting_map);
	io_done(&io);
	free(block_enabled);

	if (state->opt.expect_recoverable) {
		if (soft_error + silent_error + io_error == 0)
			return -1;
	} else {
		if (soft_error + silent_error + io_error != 0)
			return -1;
	}

	if (alert < 0)
		return -1;

	return 0;
}
756 :
/**
 * Return a * b / c approximated to the upper value.
 *
 * The product is computed in 64 bit to avoid the intermediate
 * overflow; the result is truncated back to 32 bit.
 */
static uint32_t md(uint32_t a, uint32_t b, uint32_t c)
{
	uint64_t scaled = (uint64_t)a * b;

	/* round up: add c-1 before the integer division */
	return (scaled + c - 1) / c;
}
770 :
/**
 * Scrub command entry point.
 *
 * ::plan100 is either one of the SCRUB_* special plans, or a
 * percentage in 1/100 units (e.g. 833 -> 8.33%, see the
 * md(blockmax, plan100, 10000) call below), or negative for the
 * default 8.33%. ::olderthan is the minimum age in days of the
 * blocks to scrub, or negative for the default of 10 days.
 *
 * Return 0 on success, -1 on error.
 */
int state_scrub(struct snapraid_state* state, int plan100, int olderthan)
{
	block_off_t blockmax;
	block_off_t countlimit;
	block_off_t count;
	time_t recentlimit;
	int ret;
	struct snapraid_parity_handle parity_handle[LEV_MAX];
	struct snapraid_plan ps;
	unsigned process_error;
	time_t now;
	unsigned l;

	/* get the present time */
	now = time(0);

	msg_progress("Initializing...\n");

	if ((plan100 == SCRUB_BAD || plan100 == SCRUB_NEW || plan100 == SCRUB_FULL)
		&& olderthan >= 0) {
		/* LCOV_EXCL_START */
		log_fatal(EUSER, "You can specify -o, --older-than only with a numeric percentage.\n");
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	blockmax = parity_allocated_size(state);

	/* preinitialize to avoid warnings */
	countlimit = 0;
	recentlimit = 0;

	ps.state = state;
	if (state->opt.force_scrub_even) {
		ps.plan = SCRUB_EVEN;
	} else if (plan100 == SCRUB_FULL) {
		ps.plan = SCRUB_FULL;
		msg_progress("Scrub plan: full. All data blocks will be checked.\n");
	} else if (plan100 == SCRUB_NEW) {
		ps.plan = SCRUB_NEW;
		msg_progress("Scrub plan: new. Only blocks that have never been scrubbed will be checked.\n");
	} else if (plan100 == SCRUB_BAD) {
		ps.plan = SCRUB_BAD;
		msg_progress("Scrub plan: bad. Only blocks previously marked as bad will be checked.\n");
	} else if (state->opt.force_scrub_at) {
		/* scrub the specified amount of blocks */
		ps.plan = SCRUB_AUTO;
		countlimit = state->opt.force_scrub_at;
		recentlimit = now;
	} else {
		ps.plan = SCRUB_AUTO;
		if (plan100 >= 0) {
			countlimit = md(blockmax, plan100, 10000);
		} else {
			/* by default scrub 8.33% of the array (100/12=8.(3)) */
			countlimit = md(blockmax, 1, 12);
		}

		if (olderthan >= 0) {
			recentlimit = now - olderthan * 24 * 3600;
		} else {
			/* by default use a 10 day time limit */
			recentlimit = now - 10 * 24 * 3600;
		}

		if (plan100 >= 0) {
			if (olderthan >= 0)
				msg_progress("Scrub plan: auto. %.1f%% of the array, older than %d days, will be checked.\n", plan100 / 100.0, olderthan);
			else
				msg_progress("Scrub plan: auto. %.1f%% of the array, older than 10 days, will be checked.\n", plan100 / 100.0);
		} else {
			if (olderthan >= 0)
				msg_progress("Scrub plan: auto. 8.3%% of the array, older than %d days, will be checked.\n", olderthan);
			else
				msg_progress("Scrub plan: auto. 8.3%% of the array, older than 10 days, will be checked.\n");
		}
	}

	/* total number of scrubbed or just synced blocks in all the buckets */
	count = 0;
	for (tommy_node* j = tommy_list_head(&state->bucketlist); j != 0; j = j->next) {
		struct snapraid_bucket* bucket = j->data;
		count += bucket->count_scrubbed + bucket->count_justsynced;
	}

	if (!count) {
		/* LCOV_EXCL_START */
		log_fatal(EUSER, "The array is empty.\n");
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	/* compute the limits from count/recentlimit */
	if (ps.plan == SCRUB_AUTO) {
		/* no more than the full count */
		if (countlimit > count)
			countlimit = count;

		/* by default process everything */
		ps.timelimit = now;
		ps.lastlimit = 0;

		/* walk the buckets (assumed ordered by time) accumulating until one of the limits is hit */
		tommy_node* j = tommy_list_head(&state->bucketlist);
		block_off_t processed_count = 0;
		while (j) {
			struct snapraid_bucket* bucket = j->data;
			block_off_t bucket_count = bucket->count_justsynced + bucket->count_scrubbed;

			if (bucket->time_at > recentlimit) {
				ps.timelimit = recentlimit;
				ps.lastlimit = 0;
				break;
			}

			if (processed_count + bucket_count > countlimit) {
				ps.timelimit = bucket->time_at;
				ps.lastlimit = countlimit - processed_count;
				processed_count = countlimit;
				break;
			}

			processed_count += bucket_count;
			j = j->next;
		}

		/* if nothing to scrub, disable also other limits */
		if (processed_count == 0) {
			ps.timelimit = 0;
			ps.lastlimit = 0;
		}

		log_tag("count_limit:%u\n", countlimit);
		log_tag("time_limit:%" PRIu64 "\n", (uint64_t)ps.timelimit);
		log_tag("last_limit:%u\n", ps.lastlimit);
	} else {
		/* avoid compiler warnings */
		ps.timelimit = 0;
		ps.lastlimit = 0;
	}

	/* open the file for reading */
	for (l = 0; l < state->level; ++l) {
		ret = parity_open(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_tag("parity_%s:%u:%s: Open error. %s.\n", es(errno), blockmax, lev_config_name(l), strerror(errno));
			log_fatal_errno(errno, lev_config_name(l));
			exit(EXIT_FAILURE);
			/* LCOV_EXCL_STOP */
		}
	}

	process_error = 0;

	ret = state_scrub_process(state, parity_handle, 0, blockmax, &ps, now);
	if (ret == -1) {
		++process_error;
		/* continue, as we are already exiting */
	}

	for (l = 0; l < state->level; ++l) {
		ret = parity_close(&parity_handle[l]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_tag("parity_%s:%u:%s: Close error. %s.\n", es(errno), blockmax, lev_config_name(l), strerror(errno));
			log_fatal_errno(errno, lev_config_name(l));

			++process_error;
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	if (process_error != 0)
		return -1;
	return 0;
}
947 :
|