Line data Source code
1 : /*
2 : * Copyright (C) 2013 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 : */
17 :
18 : #include "portable.h"
19 :
20 : #include "support.h"
21 : #include "elem.h"
22 : #include "state.h"
23 : #include "parity.h"
24 : #include "handle.h"
25 : #include "io.h"
26 : #include "raid/raid.h"
27 :
28 : /****************************************************************************/
29 : /* scrub */
30 :
31 : /**
32 : * Buffer for storing the new hashes.
33 : */
34 : struct snapraid_rehash {
35 : unsigned char hash[HASH_MAX];
36 : struct snapraid_block* block;
37 : };
38 :
39 : /**
40 : * Scrub plan to use.
41 : */
42 : struct snapraid_plan {
43 : struct snapraid_state* state;
44 : int plan; /**< One of the SCRUB_*. */
45 : time_t timelimit; /**< Time limit. Valid only with SCRUB_AUTO. */
46 : block_off_t lastlimit; /**< Number of blocks allowed with time exactly at ::timelimit. */
47 : block_off_t countlast; /**< Counter of blocks with time exactly at ::timelimit. */
48 : };
49 :
50 : /**
51 : * Check if we have to process the specified block index ::i.
52 : */
53 56085 : static int block_is_enabled(struct snapraid_plan* plan, block_off_t i)
54 : {
55 : time_t blocktime;
56 : snapraid_info info;
57 :
58 : /* don't scrub unused blocks in all plans */
59 56085 : info = info_get(&plan->state->infoarr, i);
60 56085 : if (info == 0)
61 0 : return 0;
62 :
63 : /* bad blocks are always scrubbed in all plans */
64 56085 : if (info_get_bad(info))
65 1772 : return 1;
66 :
67 54313 : switch (plan->plan) {
68 8522 : case SCRUB_FULL :
69 : /* in 'full' plan everything is scrubbed */
70 8522 : return 1;
71 9215 : case SCRUB_EVEN :
72 : /* in 'even' plan, scrub only even blocks */
73 9215 : return i % 2 == 0;
74 4687 : case SCRUB_NEW :
75 : /* in 'sync' plan, only blocks never scrubbed */
76 4687 : return info_get_justsynced(info);
77 3767 : case SCRUB_BAD :
78 : /* in 'bad' plan, only bad blocks (already reported) */
79 3767 : return 0;
80 : }
81 :
82 : /* if it's too new */
83 28122 : blocktime = info_get_time(info);
84 28122 : if (blocktime > plan->timelimit) {
85 : /* skip it */
86 4787 : return 0;
87 : }
88 :
89 : /* if the time is less than the limit, always include */
90 : /* otherwise, check if we reached the last limit count */
91 23335 : if (blocktime == plan->timelimit) {
92 : /* if we reached the count limit */
93 15066 : if (plan->countlast >= plan->lastlimit) {
94 : /* skip it */
95 12470 : return 0;
96 : }
97 :
98 2596 : ++plan->countlast;
99 : }
100 :
101 10865 : return 1;
102 : }
103 :
104 154602 : static void scrub_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
105 : {
106 154602 : struct snapraid_io* io = worker->io;
107 154602 : struct snapraid_state* state = io->state;
108 154602 : struct snapraid_handle* handle = worker->handle;
109 154602 : struct snapraid_disk* disk = handle->disk;
110 154602 : block_off_t blockcur = task->position;
111 154602 : unsigned char* buffer = task->buffer;
112 : int ret;
113 : char esc_buffer[ESC_MAX];
114 :
115 : /* if the disk position is not used */
116 154602 : if (!disk) {
117 : /* use an empty block */
118 0 : memset(buffer, 0, state->block_size);
119 0 : task->state = TASK_STATE_DONE;
120 1585 : return;
121 : }
122 :
123 : /* get the block */
124 154602 : task->block = fs_par2block_find(disk, blockcur);
125 :
126 : /* if the block is not used */
127 154602 : if (!block_has_file(task->block)) {
128 : /* use an empty block */
129 1585 : memset(buffer, 0, state->block_size);
130 1585 : task->state = TASK_STATE_DONE;
131 1585 : return;
132 : }
133 :
134 : /* get the file of this block */
135 153017 : task->file = fs_par2file_get(disk, blockcur, &task->file_pos);
136 :
137 : /* if the file is different than the current one, close it */
138 153017 : if (handle->file != 0 && handle->file != task->file) {
139 : /* keep a pointer at the file we are going to close for error reporting */
140 72572 : struct snapraid_file* report = handle->file;
141 72572 : ret = handle_close(handle);
142 72572 : if (ret == -1) {
143 : /* LCOV_EXCL_START */
144 : /* This one is really an unexpected error, because we are only reading */
145 : /* and closing a descriptor should never fail */
146 : if (errno == EIO) {
147 : log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
148 : log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to scrub.\n");
149 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
150 : log_fatal("Stopping at block %u\n", blockcur);
151 : task->state = TASK_STATE_IOERROR;
152 : return;
153 : }
154 :
155 : log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
156 : log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to scrub.\n");
157 : log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
158 : log_fatal("Stopping at block %u\n", blockcur);
159 : task->state = TASK_STATE_ERROR;
160 : return;
161 : /* LCOV_EXCL_STOP */
162 : }
163 : }
164 :
165 153017 : ret = handle_open(handle, task->file, state->file_mode, log_error, 0);
166 153017 : if (ret == -1) {
167 0 : if (errno == EIO) {
168 : /* LCOV_EXCL_START */
169 : log_tag("error:%u:%s:%s: Open EIO error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
170 : log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to scrub.\n");
171 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
172 : log_fatal("Stopping at block %u\n", blockcur);
173 : task->state = TASK_STATE_IOERROR;
174 : return;
175 : /* LCOV_EXCL_STOP */
176 : }
177 :
178 0 : log_tag("error:%u:%s:%s: Open error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
179 0 : task->state = TASK_STATE_ERROR_CONTINUE;
180 0 : return;
181 : }
182 :
183 : /* check if the file is changed */
184 153017 : if (handle->st.st_size != task->file->size
185 153017 : || handle->st.st_mtime != task->file->mtime_sec
186 153017 : || STAT_NSEC(&handle->st) != task->file->mtime_nsec
187 : /* don't check the inode to support filesystem without persistent inodes */
188 : ) {
189 : /* report that the block and the file are not synced */
190 0 : task->is_timestamp_different = 1;
191 : /* follow */
192 : }
193 :
194 : /* note that we intentionally don't abort if the file has different attributes */
195 : /* from the last sync, as we are expected to return errors if running */
196 : /* in an unsynced array. This is just like the check command. */
197 :
198 153017 : task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
199 153017 : if (task->read_size == -1) {
200 0 : if (errno == EIO) {
201 0 : log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
202 0 : log_error("Input/Output error in file '%s' at position '%u'\n", handle->path, task->file_pos);
203 0 : task->state = TASK_STATE_IOERROR_CONTINUE;
204 0 : return;
205 : }
206 :
207 0 : log_tag("error:%u:%s:%s: Read error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
208 0 : task->state = TASK_STATE_ERROR_CONTINUE;
209 0 : return;
210 : }
211 :
212 : /* store the path of the opened file */
213 153017 : pathcpy(task->path, sizeof(task->path), handle->path);
214 :
215 153017 : task->state = TASK_STATE_DONE;
216 : }
217 :
218 154602 : static void scrub_parity_reader(struct snapraid_worker* worker, struct snapraid_task* task)
219 : {
220 154602 : struct snapraid_io* io = worker->io;
221 154602 : struct snapraid_state* state = io->state;
222 154602 : struct snapraid_parity_handle* parity_handle = worker->parity_handle;
223 154602 : unsigned level = parity_handle->level;
224 154602 : block_off_t blockcur = task->position;
225 154602 : unsigned char* buffer = task->buffer;
226 : int ret;
227 :
228 : /* read the parity */
229 154602 : ret = parity_read(parity_handle, blockcur, buffer, state->block_size, log_error);
230 154602 : if (ret == -1) {
231 0 : if (errno == EIO) {
232 0 : log_tag("parity_error:%u:%s: Read EIO error. %s\n", blockcur, lev_config_name(level), strerror(errno));
233 0 : log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(level), blockcur);
234 0 : task->state = TASK_STATE_IOERROR_CONTINUE;
235 0 : return;
236 : }
237 :
238 0 : log_tag("parity_error:%u:%s: Read error. %s\n", blockcur, lev_config_name(level), strerror(errno));
239 0 : task->state = TASK_STATE_ERROR_CONTINUE;
240 0 : return;
241 : }
242 :
243 154602 : task->state = TASK_STATE_DONE;
244 : }
245 :
246 11 : static int state_scrub_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax, struct snapraid_plan* plan, time_t now)
247 : {
248 : struct snapraid_io io;
249 : struct snapraid_handle* handle;
250 : void* rehandle_alloc;
251 : struct snapraid_rehash* rehandle;
252 : unsigned diskmax;
253 : block_off_t blockcur;
254 : unsigned j;
255 : unsigned buffermax;
256 : data_off_t countsize;
257 : block_off_t countpos;
258 : block_off_t countmax;
259 : block_off_t autosavedone;
260 : block_off_t autosavelimit;
261 : block_off_t autosavemissing;
262 : int ret;
263 : unsigned error;
264 : unsigned silent_error;
265 : unsigned io_error;
266 : unsigned l;
267 : unsigned* waiting_map;
268 : unsigned waiting_mac;
269 : char esc_buffer[ESC_MAX];
270 : bit_vect_t* block_enabled;
271 :
272 : /* maps the disks to handles */
273 11 : handle = handle_mapping(state, &diskmax);
274 :
275 : /* rehash buffers */
276 11 : rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
277 :
278 : /* we need 1 * data + 2 * parity */
279 11 : buffermax = diskmax + 2 * state->level;
280 :
281 : /* initialize the io threads */
282 11 : io_init(&io, state, state->opt.io_cache, buffermax, scrub_data_reader, handle, diskmax, scrub_parity_reader, 0, parity_handle, state->level);
283 :
284 : /* possibly waiting disks */
285 11 : waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
286 11 : waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
287 :
288 11 : error = 0;
289 11 : silent_error = 0;
290 11 : io_error = 0;
291 :
292 11 : msg_progress("Selecting...\n");
293 :
294 : /* first count the number of blocks to process */
295 11 : countmax = 0;
296 11 : plan->countlast = 0;
297 11 : block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
298 56096 : for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
299 56085 : if (!block_is_enabled(plan, blockcur))
300 30318 : continue;
301 25767 : bit_vect_set(block_enabled, blockcur);
302 25767 : ++countmax;
303 : }
304 :
305 : /* compute the autosave size for all disk, even if not read */
306 : /* this makes sense because the speed should be almost the same */
307 : /* if the disks are read in parallel */
308 11 : autosavelimit = state->autosave / (diskmax * state->block_size);
309 11 : autosavemissing = countmax; /* blocks to do */
310 11 : autosavedone = 0; /* blocks done */
311 :
312 : /* drop until now */
313 11 : state_usage_waste(state);
314 :
315 11 : countsize = 0;
316 11 : countpos = 0;
317 :
318 11 : msg_progress("Scrubbing...\n");
319 :
320 : /* start all the worker threads */
321 11 : io_start(&io, blockstart, blockmax, block_enabled);
322 :
323 11 : if (!state_progress_begin(state, blockstart, blockmax, countmax))
324 0 : goto end;
325 :
326 25767 : while (1) {
327 : unsigned char* buffer_recov[LEV_MAX];
328 : snapraid_info info;
329 : int error_on_this_block;
330 : int silent_error_on_this_block;
331 : int io_error_on_this_block;
332 : int block_is_unsynced;
333 : int rehash;
334 : void** buffer;
335 :
336 : /* go to the next block */
337 25778 : blockcur = io_read_next(&io, &buffer);
338 25778 : if (blockcur >= blockmax)
339 11 : break;
340 :
341 : /* until now is scheduling */
342 25767 : state_usage_sched(state);
343 :
344 : /* one more block processed for autosave */
345 25767 : ++autosavedone;
346 25767 : --autosavemissing;
347 :
348 : /* by default process the block, and skip it if something goes wrong */
349 25767 : error_on_this_block = 0;
350 25767 : silent_error_on_this_block = 0;
351 25767 : io_error_on_this_block = 0;
352 :
353 : /* if all the blocks at this address are synced */
354 : /* if not, parity is not even checked */
355 25767 : block_is_unsynced = 0;
356 :
357 : /* get block specific info */
358 25767 : info = info_get(&state->infoarr, blockcur);
359 :
360 : /* if we have to use the old hash */
361 25767 : rehash = info_get_rehash(info);
362 :
363 : /* for each disk, process the block */
364 180369 : for (j = 0; j < diskmax; ++j) {
365 : struct snapraid_task* task;
366 : int read_size;
367 : unsigned char hash[HASH_MAX];
368 : struct snapraid_block* block;
369 : int file_is_unsynced;
370 : struct snapraid_disk* disk;
371 : struct snapraid_file* file;
372 : block_off_t file_pos;
373 : unsigned diskcur;
374 :
375 : /* if the file on this disk is synced */
376 : /* if not, silent errors are assumed as expected error */
377 154602 : file_is_unsynced = 0;
378 :
379 : /* until now is misc */
380 154602 : state_usage_misc(state);
381 :
382 : /* get the next task */
383 154602 : task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
384 :
385 : /* until now is disk */
386 154602 : state_usage_disk(state, handle, waiting_map, waiting_mac);
387 :
388 : /* get the task results */
389 154602 : disk = task->disk;
390 154602 : block = task->block;
391 154602 : file = task->file;
392 154602 : file_pos = task->file_pos;
393 154602 : read_size = task->read_size;
394 :
395 : /* by default no rehash in case of "continue" */
396 154602 : rehandle[diskcur].block = 0;
397 :
398 : /* if the disk position is not used */
399 154602 : if (!disk)
400 2505 : continue;
401 :
402 154602 : state_usage_file(state, disk, file);
403 :
404 : /* if the block is unsynced, errors are expected */
405 154602 : if (block_has_invalid_parity(block)) {
406 : /* report that the block and the file are not synced */
407 0 : block_is_unsynced = 1;
408 0 : file_is_unsynced = 1;
409 : /* follow */
410 : }
411 :
412 : /* if the block is not used */
413 154602 : if (!block_has_file(block))
414 1585 : continue;
415 :
416 : /* if the block is unsynced, errors are expected */
417 153017 : if (task->is_timestamp_different) {
418 : /* report that the block and the file are not synced */
419 0 : block_is_unsynced = 1;
420 0 : file_is_unsynced = 1;
421 : /* follow */
422 : }
423 :
424 : /* handle error conditions */
425 153017 : if (task->state == TASK_STATE_IOERROR) {
426 : /* LCOV_EXCL_START */
427 : ++io_error;
428 : goto bail;
429 : /* LCOV_EXCL_STOP */
430 : }
431 153017 : if (task->state == TASK_STATE_ERROR) {
432 : /* LCOV_EXCL_START */
433 : ++error;
434 : goto bail;
435 : /* LCOV_EXCL_STOP */
436 : }
437 153017 : if (task->state == TASK_STATE_ERROR_CONTINUE) {
438 0 : ++error;
439 0 : error_on_this_block = 1;
440 0 : continue;
441 : }
442 153017 : if (task->state == TASK_STATE_IOERROR_CONTINUE) {
443 0 : ++io_error;
444 0 : if (io_error >= state->opt.io_error_limit) {
445 : /* LCOV_EXCL_START */
446 : log_fatal("DANGER! Too many input/output read error in a data disk, it isn't possible to scrub.\n");
447 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, task->path);
448 : log_fatal("Stopping at block %u\n", blockcur);
449 : goto bail;
450 : /* LCOV_EXCL_STOP */
451 : }
452 :
453 : /* otherwise continue */
454 0 : io_error_on_this_block = 1;
455 0 : continue;
456 : }
457 153017 : if (task->state != TASK_STATE_DONE) {
458 : /* LCOV_EXCL_START */
459 : log_fatal("Internal inconsistency in task state\n");
460 : os_abort();
461 : /* LCOV_EXCL_STOP */
462 : }
463 :
464 153017 : countsize += read_size;
465 :
466 : /* now compute the hash */
467 153017 : if (rehash) {
468 27247 : memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
469 :
470 : /* compute the new hash, and store it */
471 27247 : rehandle[diskcur].block = block;
472 27247 : memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
473 : } else {
474 125770 : memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
475 : }
476 :
477 : /* until now is hash */
478 153017 : state_usage_hash(state);
479 :
480 153017 : if (block_has_updated_hash(block)) {
481 : /* compare the hash */
482 153017 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
483 920 : unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
484 :
485 920 : log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
486 :
487 : /* it's a silent error only if we are dealing with synced files */
488 920 : if (file_is_unsynced) {
489 0 : ++error;
490 0 : error_on_this_block = 1;
491 : } else {
492 920 : log_error("Data error in file '%s' at position '%u', diff bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
493 920 : ++silent_error;
494 920 : silent_error_on_this_block = 1;
495 : }
496 920 : continue;
497 : }
498 : }
499 : }
500 :
501 : /* buffers for parity read and not computed */
502 180369 : for (l = 0; l < state->level; ++l)
503 154602 : buffer_recov[l] = buffer[diskmax + state->level + l];
504 25767 : for (; l < LEV_MAX; ++l)
505 0 : buffer_recov[l] = 0;
506 :
507 : /* until now is misc */
508 25767 : state_usage_misc(state);
509 :
510 : /* read the parity */
511 180369 : for (l = 0; l < state->level; ++l) {
512 : struct snapraid_task* task;
513 : unsigned levcur;
514 :
515 154602 : task = io_parity_read(&io, &levcur, waiting_map, &waiting_mac);
516 :
517 : /* until now is parity */
518 154602 : state_usage_parity(state, waiting_map, waiting_mac);
519 :
520 : /* handle error conditions */
521 154602 : if (task->state == TASK_STATE_IOERROR) {
522 : /* LCOV_EXCL_START */
523 : ++io_error;
524 : goto bail;
525 : /* LCOV_EXCL_STOP */
526 : }
527 154602 : if (task->state == TASK_STATE_ERROR) {
528 : /* LCOV_EXCL_START */
529 : ++error;
530 : goto bail;
531 : /* LCOV_EXCL_STOP */
532 : }
533 154602 : if (task->state == TASK_STATE_ERROR_CONTINUE) {
534 0 : ++error;
535 0 : error_on_this_block = 1;
536 :
537 : /* if continuing on error, clear the missing buffer */
538 0 : buffer_recov[levcur] = 0;
539 0 : continue;
540 : }
541 154602 : if (task->state == TASK_STATE_IOERROR_CONTINUE) {
542 0 : ++io_error;
543 0 : if (io_error >= state->opt.io_error_limit) {
544 : /* LCOV_EXCL_START */
545 : log_fatal("DANGER! Too many input/output read error in the %s disk, it isn't possible to scrub.\n", lev_name(levcur));
546 : log_fatal("Ensure that disk '%s' is sane and can be read.\n", lev_config_name(levcur));
547 : log_fatal("Stopping at block %u\n", blockcur);
548 : goto bail;
549 : /* LCOV_EXCL_STOP */
550 : }
551 :
552 : /* otherwise continue */
553 0 : io_error_on_this_block = 1;
554 :
555 : /* if continuing on error, clear the missing buffer */
556 0 : buffer_recov[levcur] = 0;
557 0 : continue;
558 : }
559 154602 : if (task->state != TASK_STATE_DONE) {
560 : /* LCOV_EXCL_START */
561 : log_fatal("Internal inconsistency in task state\n");
562 : os_abort();
563 : /* LCOV_EXCL_STOP */
564 : }
565 : }
566 :
567 : /* if we have read all the data required and it's correct, proceed with the parity check */
568 25767 : if (!error_on_this_block && !silent_error_on_this_block && !io_error_on_this_block) {
569 :
570 : /* compute the parity */
571 24847 : raid_gen(diskmax, state->level, state->block_size, buffer);
572 :
573 : /* compare the parity */
574 173929 : for (l = 0; l < state->level; ++l) {
575 149082 : if (buffer_recov[l] && memcmp(buffer[diskmax + l], buffer_recov[l], state->block_size) != 0) {
576 0 : unsigned diff = memdiff(buffer[diskmax + l], buffer_recov[l], state->block_size);
577 :
578 0 : log_tag("parity_error:%u:%s: Data error, diff bits %u/%u\n", blockcur, lev_config_name(l), diff, state->block_size * 8);
579 :
580 : /* it's a silent error only if we are dealing with synced blocks */
581 0 : if (block_is_unsynced) {
582 0 : ++error;
583 0 : error_on_this_block = 1;
584 : } else {
585 0 : log_fatal("Data error in parity '%s' at position '%u', diff bits %u/%u\n", lev_config_name(l), blockcur, diff, state->block_size * 8);
586 0 : ++silent_error;
587 0 : silent_error_on_this_block = 1;
588 : }
589 : }
590 : }
591 :
592 : /* until now is raid */
593 24847 : state_usage_raid(state);
594 : }
595 :
596 25767 : if (silent_error_on_this_block || io_error_on_this_block) {
597 : /* set the error status keeping other info */
598 920 : info_set(&state->infoarr, blockcur, info_set_bad(info));
599 24847 : } else if (error_on_this_block) {
600 : /* do nothing, as this is a generic error */
601 : /* likely caused by a not synced array */
602 : } else {
603 : /* if rehash is needed */
604 24847 : if (rehash) {
605 : /* store all the new hash already computed */
606 32256 : for (j = 0; j < diskmax; ++j) {
607 27648 : if (rehandle[j].block)
608 27247 : memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
609 : }
610 : }
611 :
612 : /* update the time info of the block */
613 : /* and clear any other flag */
614 24847 : info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 0));
615 : }
616 :
617 : /* mark the state as needing write */
618 25767 : state->need_write = 1;
619 :
620 : /* count the number of processed block */
621 25767 : ++countpos;
622 :
623 : /* progress */
624 25767 : if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
625 : /* LCOV_EXCL_START */
626 : break;
627 : /* LCOV_EXCL_STOP */
628 : }
629 :
630 : /* thermal control */
631 25767 : if (state_thermal_alarm(state)) {
632 : /* until now is misc */
633 0 : state_usage_misc(state);
634 :
635 0 : state_progress_stop(state);
636 :
637 0 : state_thermal_cooldown(state);
638 :
639 0 : state_progress_restart(state);
640 :
641 : /* drop until now */
642 0 : state_usage_waste(state);
643 : }
644 :
645 : /* autosave */
646 25767 : if (state->autosave != 0
647 0 : && autosavedone >= autosavelimit /* if we have reached the limit */
648 0 : && autosavemissing >= autosavelimit /* if we have at least a full step to do */
649 : ) {
650 0 : autosavedone = 0; /* restart the counter */
651 :
652 : /* until now is misc */
653 0 : state_usage_misc(state);
654 :
655 0 : state_progress_stop(state);
656 :
657 0 : msg_progress("Autosaving...\n");
658 0 : state_write(state);
659 :
660 0 : state_progress_restart(state);
661 :
662 : /* drop until now */
663 0 : state_usage_waste(state);
664 : }
665 : }
666 :
667 11 : end:
668 11 : state_progress_end(state, countpos, countmax, countsize, "Nothing to scrub. Use the -p PLAN option to select a different plan, like -p full.\n");
669 :
670 : /* save the new state if required */
671 11 : if (state->need_write || state->opt.force_content_write)
672 9 : state_write(state);
673 :
674 11 : state_usage_print(state);
675 :
676 11 : if (error || silent_error || io_error) {
677 1 : msg_status("\n");
678 1 : msg_status("%8u file errors\n", error);
679 1 : msg_status("%8u io errors\n", io_error);
680 1 : msg_status("%8u data errors\n", silent_error);
681 : } else {
682 : /* print the result only if processed something */
683 10 : if (countpos != 0)
684 8 : msg_status("Everything OK\n");
685 : }
686 :
687 11 : if (error)
688 0 : log_fatal("WARNING! Unexpected file errors!\n");
689 11 : if (io_error)
690 0 : log_fatal("DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
691 11 : if (silent_error)
692 1 : log_fatal("DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
693 11 : if (io_error || silent_error) {
694 1 : log_fatal("Use 'snapraid status' to list the bad blocks.\n");
695 1 : log_fatal("Use 'snapraid -e fix' to recover them.\n");
696 1 : log_fatal("Use 'snapraid -p bad scrub' to recheck after fixing to clear the bad state.\n");
697 : }
698 :
699 11 : log_tag("summary:error_file:%u\n", error);
700 11 : log_tag("summary:error_io:%u\n", io_error);
701 11 : log_tag("summary:error_data:%u\n", silent_error);
702 11 : if (error + silent_error + io_error == 0)
703 10 : log_tag("summary:exit:ok\n");
704 : else
705 1 : log_tag("summary:exit:error\n");
706 11 : log_flush();
707 :
708 11 : bail:
709 : /* stop all the worker threads */
710 11 : io_stop(&io);
711 :
712 77 : for (j = 0; j < diskmax; ++j) {
713 66 : struct snapraid_file* file = handle[j].file;
714 66 : struct snapraid_disk* disk = handle[j].disk;
715 66 : ret = handle_close(&handle[j]);
716 66 : if (ret == -1) {
717 : /* LCOV_EXCL_START */
718 : log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
719 : log_fatal("DANGER! Unexpected close error in a data disk.\n");
720 : ++error;
721 : /* continue, as we are already exiting */
722 : /* LCOV_EXCL_STOP */
723 : }
724 : }
725 :
726 11 : free(handle);
727 11 : free(rehandle_alloc);
728 11 : free(waiting_map);
729 11 : io_done(&io);
730 11 : free(block_enabled);
731 :
732 11 : if (state->opt.expect_recoverable) {
733 1 : if (error + silent_error + io_error == 0)
734 0 : return -1;
735 : } else {
736 10 : if (error + silent_error + io_error != 0)
737 0 : return -1;
738 : }
739 11 : return 0;
740 : }
741 :
/**
 * Return a * b / c rounded up to the next integer.
 *
 * The product is computed in 64 bits, so it cannot overflow.
 * The divisor ::c must be different than 0.
 */
static uint32_t md(uint32_t a, uint32_t b, uint32_t c)
{
	uint64_t product = (uint64_t)a * b;

	/* adding c - 1 before dividing rounds the quotient up */
	return (product + (c - 1)) / c;
}
755 :
756 11 : int state_scrub(struct snapraid_state* state, int plan, int olderthan)
757 : {
758 : block_off_t blockmax;
759 : block_off_t countlimit;
760 : block_off_t i;
761 : block_off_t count;
762 : time_t recentlimit;
763 : int ret;
764 : struct snapraid_parity_handle parity_handle[LEV_MAX];
765 : struct snapraid_plan ps;
766 : time_t* timemap;
767 : unsigned error;
768 : time_t now;
769 : unsigned l;
770 :
771 : /* get the present time */
772 11 : now = time(0);
773 :
774 11 : msg_progress("Initializing...\n");
775 :
776 11 : if ((plan == SCRUB_BAD || plan == SCRUB_NEW || plan == SCRUB_FULL)
777 4 : && olderthan >= 0) {
778 : /* LCOV_EXCL_START */
779 : log_fatal("You can specify -o, --older-than only with a numeric percentage.\n");
780 : exit(EXIT_FAILURE);
781 : /* LCOV_EXCL_STOP */
782 : }
783 :
784 11 : blockmax = parity_allocated_size(state);
785 :
786 : /* preinitialize to avoid warnings */
787 11 : countlimit = 0;
788 11 : recentlimit = 0;
789 :
790 11 : ps.state = state;
791 11 : if (state->opt.force_scrub_even) {
792 1 : ps.plan = SCRUB_EVEN;
793 10 : } else if (plan == SCRUB_FULL) {
794 2 : ps.plan = SCRUB_FULL;
795 2 : msg_info("Scrub plan: full. All data blocks will be checked.\n");
796 8 : } else if (plan == SCRUB_NEW) {
797 1 : ps.plan = SCRUB_NEW;
798 1 : msg_info("Scrub plan: new. Only blocks that have never been scrubbed will be checked.\n");
799 7 : } else if (plan == SCRUB_BAD) {
800 1 : ps.plan = SCRUB_BAD;
801 1 : msg_info("Scrub plan: bad. Only blocks previously marked as bad will be checked.\n");
802 6 : } else if (state->opt.force_scrub_at) {
803 : /* scrub the specified amount of blocks */
804 4 : ps.plan = SCRUB_AUTO;
805 4 : countlimit = state->opt.force_scrub_at;
806 4 : recentlimit = now;
807 : } else {
808 2 : ps.plan = SCRUB_AUTO;
809 2 : if (plan >= 0) {
810 1 : countlimit = md(blockmax, plan, 100);
811 : } else {
812 : /* by default scrub 8.33% of the array (100/12=8.(3)) */
813 1 : countlimit = md(blockmax, 1, 12);
814 : }
815 :
816 2 : if (olderthan >= 0) {
817 1 : recentlimit = now - olderthan * 24 * 3600;
818 : } else {
819 : /* by default use a 10 day time limit */
820 1 : recentlimit = now - 10 * 24 * 3600;
821 : }
822 :
823 2 : if (plan >= 0) {
824 1 : if (olderthan >= 0)
825 0 : msg_info("Scrub plan: auto. %d%% of the array, older than %d days, will be checked.\n", plan, olderthan);
826 : else
827 1 : msg_info("Scrub plan: auto. %d%% of the array, older than 10 days, will be checked.\n", plan);
828 : } else {
829 1 : if (olderthan >= 0)
830 1 : msg_info("Scrub plan: auto. 8.3%% of the array, older than %d days, will be checked.\n", olderthan);
831 : else
832 0 : msg_info("Scrub plan: auto. 8.3%% of the array, older than 10 days, will be checked.\n");
833 : }
834 : }
835 :
836 : /* identify the time limit */
837 : /* we sort all the block times, and we identify the time limit for which we reach the quota */
838 : /* this allow to process first the oldest blocks */
839 11 : timemap = malloc_nofail(blockmax * sizeof(time_t));
840 :
841 : /* copy the info in the temp vector */
842 11 : count = 0;
843 11 : log_tag("block_count:%u\n", blockmax);
844 56096 : for (i = 0; i < blockmax; ++i) {
845 56085 : snapraid_info info = info_get(&state->infoarr, i);
846 :
847 : /* skip unused blocks */
848 56085 : if (info == 0)
849 0 : continue;
850 :
851 56085 : timemap[count++] = info_get_time(info);
852 : }
853 :
854 11 : if (!count) {
855 : /* LCOV_EXCL_START */
856 : log_fatal("The array appears to be empty.\n");
857 : exit(EXIT_FAILURE);
858 : /* LCOV_EXCL_STOP */
859 : }
860 :
861 : /* sort it */
862 11 : qsort(timemap, count, sizeof(time_t), time_compare);
863 :
864 : /* output the info map */
865 11 : i = 0;
866 11 : log_tag("info_count:%u\n", count);
867 29 : while (i < count) {
868 18 : unsigned j = i + 1;
869 56085 : while (j < count && timemap[i] == timemap[j])
870 56067 : ++j;
871 18 : log_tag("info_time:%" PRIu64 ":%u\n", (uint64_t)timemap[i], j - i);
872 18 : i = j;
873 : }
874 :
875 : /* compute the limits from count/recentlimit */
876 11 : if (ps.plan == SCRUB_AUTO) {
877 : /* no more than the full count */
878 6 : if (countlimit > count)
879 2 : countlimit = count;
880 :
881 : /* decrease until we reach the specific recentlimit */
882 53 : while (countlimit > 0 && timemap[countlimit - 1] > recentlimit)
883 47 : --countlimit;
884 :
885 : /* if there is something to scrub */
886 6 : if (countlimit > 0) {
887 : /* get the most recent time we want to scrub */
888 5 : ps.timelimit = timemap[countlimit - 1];
889 :
890 : /* count how many entries for this exact time we have to scrub */
891 : /* if the blocks have all the same time, we end with countlimit == lastlimit */
892 5 : ps.lastlimit = 1;
893 2596 : while (countlimit > ps.lastlimit && timemap[countlimit - ps.lastlimit - 1] == ps.timelimit)
894 2591 : ++ps.lastlimit;
895 : } else {
896 : /* if nothing to scrub, disable also other limits */
897 1 : ps.timelimit = 0;
898 1 : ps.lastlimit = 0;
899 : }
900 :
901 6 : log_tag("count_limit:%u\n", countlimit);
902 6 : log_tag("time_limit:%" PRIu64 "\n", (uint64_t)ps.timelimit);
903 6 : log_tag("last_limit:%u\n", ps.lastlimit);
904 : }
905 :
906 : /* free the temp vector */
907 11 : free(timemap);
908 :
909 : /* open the file for reading */
910 77 : for (l = 0; l < state->level; ++l) {
911 66 : ret = parity_open(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
912 66 : if (ret == -1) {
913 : /* LCOV_EXCL_START */
914 : log_fatal("WARNING! Without an accessible %s file, it isn't possible to scrub.\n", lev_name(l));
915 : exit(EXIT_FAILURE);
916 : /* LCOV_EXCL_STOP */
917 : }
918 : }
919 :
920 11 : error = 0;
921 :
922 11 : ret = state_scrub_process(state, parity_handle, 0, blockmax, &ps, now);
923 11 : if (ret == -1) {
924 0 : ++error;
925 : /* continue, as we are already exiting */
926 : }
927 :
928 77 : for (l = 0; l < state->level; ++l) {
929 66 : ret = parity_close(&parity_handle[l]);
930 66 : if (ret == -1) {
931 : /* LCOV_EXCL_START */
932 : log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
933 : ++error;
934 : /* continue, as we are already exiting */
935 : /* LCOV_EXCL_STOP */
936 : }
937 : }
938 :
939 : /* abort if required */
940 11 : if (error != 0)
941 0 : return -1;
942 11 : return 0;
943 : }
944 :
|