Line data Source code
1 : /*
2 : * Copyright (C) 2013 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 : */
17 :
18 : #include "portable.h"
19 :
20 : #include "support.h"
21 : #include "elem.h"
22 : #include "state.h"
23 : #include "parity.h"
24 : #include "handle.h"
25 : #include "io.h"
26 : #include "raid/raid.h"
27 :
28 : /****************************************************************************/
29 : /* scrub */
30 :
/**
 * Buffer for storing the new hashes.
 *
 * One entry per disk: while scrubbing with an old hash kind active
 * (rehash state), the new hash is computed alongside the verification
 * and committed to ::block only if the whole block stripe verifies.
 */
struct snapraid_rehash {
	unsigned char hash[HASH_MAX]; /**< Newly computed hash, valid only if ::block is set. */
	struct snapraid_block* block; /**< Block to receive the new hash, or 0 if none pending. */
};
38 :
/**
 * Scrub plan to use.
 *
 * Describes which blocks get scrubbed: the plan kind, and for the
 * automatic plan, a time quota selecting the oldest blocks first.
 */
struct snapraid_plan {
	struct snapraid_state* state; /**< Global state, used to access the info array. */
	int plan; /**< One of the SCRUB_*. */
	time_t timelimit; /**< Time limit. Valid only with SCRUB_AUTO. */
	block_off_t lastlimit; /**< Number of blocks allowed with time exactly at ::timelimit. */
	block_off_t countlast; /**< Counter of blocks with time exactly at ::timelimit. */
};
49 :
50 : /**
51 : * Check if we have to process the specified block index ::i.
52 : */
53 112170 : static int block_is_enabled(void* void_plan, block_off_t i)
54 : {
55 112170 : struct snapraid_plan* plan = void_plan;
56 : time_t blocktime;
57 : snapraid_info info;
58 :
59 : /* don't scrub unused blocks in all plans */
60 112170 : info = info_get(&plan->state->infoarr, i);
61 112170 : if (info == 0)
62 0 : return 0;
63 :
64 : /* bad blocks are always scrubbed in all plans */
65 112170 : if (info_get_bad(info))
66 3546 : return 1;
67 :
68 108624 : switch (plan->plan) {
69 : case SCRUB_FULL :
70 : /* in 'full' plan everything is scrubbed */
71 17042 : return 1;
72 : case SCRUB_EVEN :
73 : /* in 'even' plan, scrub only even blocks */
74 18430 : return i % 2 == 0;
75 : case SCRUB_NEW :
76 : /* in 'sync' plan, only blocks never scrubbed */
77 9374 : return info_get_justsynced(info);
78 : case SCRUB_BAD :
79 : /* in 'bad' plan, only bad blocks (already reported) */
80 7534 : return 0;
81 : }
82 :
83 : /* if it's too new */
84 56244 : blocktime = info_get_time(info);
85 56244 : if (blocktime > plan->timelimit) {
86 : /* skip it */
87 9574 : return 0;
88 : }
89 :
90 : /* if the time is less than the limit, always include */
91 : /* otherwise, check if we reached the last limit count */
92 46670 : if (blocktime == plan->timelimit) {
93 : /* if we reached the count limit */
94 39496 : if (plan->countlast >= plan->lastlimit) {
95 : /* skip it */
96 24940 : return 0;
97 : }
98 :
99 14556 : ++plan->countlast;
100 : }
101 :
102 21730 : return 1;
103 : }
104 :
/**
 * Worker callback reading one data block for the scrub.
 *
 * Fills ::task->buffer with the block content (or zeros for unused
 * positions/blocks), and sets ::task->state to report the outcome:
 * TASK_STATE_DONE on success, TASK_STATE_IOERROR/ERROR to abort,
 * or the *_CONTINUE variants to skip just this block.
 */
static void scrub_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
{
	struct snapraid_io* io = worker->io;
	struct snapraid_state* state = io->state;
	struct snapraid_handle* handle = worker->handle;
	struct snapraid_disk* disk = handle->disk;
	block_off_t blockcur = task->position;
	unsigned char* buffer = task->buffer;
	int ret;
	char esc_buffer[ESC_MAX]; /* scratch for escaping file names in log messages */

	/* if the disk position is not used */
	if (!disk) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the block at this parity position */
	task->block = fs_par2block_find(disk, blockcur);

	/* if the block is not used */
	if (!block_has_file(task->block)) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the file of this block, and the block position inside the file */
	task->file = fs_par2file_get(disk, blockcur, &task->file_pos);

	/* if the file is different than the currently open one, close it */
	if (handle->file != 0 && handle->file != task->file) {
		/* keep a pointer at the file we are going to close for error reporting */
		struct snapraid_file* report = handle->file;
		ret = handle_close(handle);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			/* This one is really an unexpected error, because we are only reading */
			/* and closing a descriptor should never fail */
			if (errno == EIO) {
				log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
				log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to scrub.\n");
				log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
				log_fatal("Stopping at block %u\n", blockcur);
				/* EIO on close is treated as fatal: abort the whole scrub */
				task->state = TASK_STATE_IOERROR;
				return;
			}

			log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
			log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to scrub.\n");
			log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
			log_fatal("Stopping at block %u\n", blockcur);
			task->state = TASK_STATE_ERROR;
			return;
			/* LCOV_EXCL_STOP */
		}
	}

	/* open (or keep open) the file of this block */
	ret = handle_open(handle, task->file, state->file_mode, log_error, 0);
	if (ret == -1) {
		if (errno == EIO) {
			/* LCOV_EXCL_START */
			log_tag("error:%u:%s:%s: Open EIO error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
			log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to scrub.\n");
			log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
			log_fatal("Stopping at block %u\n", blockcur);
			task->state = TASK_STATE_IOERROR;
			return;
			/* LCOV_EXCL_STOP */
		}

		/* non-EIO open failures only skip this block */
		log_tag("error:%u:%s:%s: Open error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
		task->state = TASK_STATE_ERROR_CONTINUE;
		return;
	}

	/* check if the file is changed since the last sync */
	if (handle->st.st_size != task->file->size
		|| handle->st.st_mtime != task->file->mtime_sec
		|| STAT_NSEC(&handle->st) != task->file->mtime_nsec
		/* don't check the inode to support filesystem without persistent inodes */
	) {
		/* report that the block and the file are not synced */
		task->is_timestamp_different = 1;
		/* follow */
	}

	/* note that we intentionally don't abort if the file has different attributes */
	/* from the last sync, as we are expected to return errors if running */
	/* in an unsynced array. This is just like the check command. */

	task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
	if (task->read_size == -1) {
		if (errno == EIO) {
			/* read EIO is continuable: the consumer marks the block bad */
			log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
			log_error("Input/Output error in file '%s' at position '%u'\n", handle->path, task->file_pos);
			task->state = TASK_STATE_IOERROR_CONTINUE;
			return;
		}

		log_tag("error:%u:%s:%s: Read error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
		task->state = TASK_STATE_ERROR_CONTINUE;
		return;
	}

	/* store the path of the opened file, for later error reporting */
	pathcpy(task->path, sizeof(task->path), handle->path);

	task->state = TASK_STATE_DONE;
}
218 :
219 168123 : static void scrub_parity_reader(struct snapraid_worker* worker, struct snapraid_task* task)
220 : {
221 168123 : struct snapraid_io* io = worker->io;
222 168123 : struct snapraid_state* state = io->state;
223 168123 : struct snapraid_parity_handle* parity_handle = worker->parity_handle;
224 168123 : unsigned level = parity_handle->level;
225 168123 : block_off_t blockcur = task->position;
226 168123 : unsigned char* buffer = task->buffer;
227 : int ret;
228 :
229 : /* read the parity */
230 168123 : ret = parity_read(parity_handle, blockcur, buffer, state->block_size, log_error);
231 168038 : if (ret == -1) {
232 0 : if (errno == EIO) {
233 0 : log_tag("parity_error:%u:%s: Read EIO error. %s\n", blockcur, lev_config_name(level), strerror(errno));
234 0 : log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(level), blockcur);
235 0 : task->state = TASK_STATE_IOERROR_CONTINUE;
236 0 : return;
237 : }
238 :
239 0 : log_tag("parity_error:%u:%s: Read error. %s\n", blockcur, lev_config_name(level), strerror(errno));
240 0 : task->state = TASK_STATE_ERROR_CONTINUE;
241 0 : return;
242 : }
243 :
244 168038 : task->state = TASK_STATE_DONE;
245 : }
246 :
/**
 * Run the scrub over blocks [blockstart, blockmax) according to ::plan.
 *
 * Spawns the io reader threads, verifies every selected block stripe
 * (data hashes first, then parity), marks failing blocks as bad and
 * refreshes the time info of the good ones using ::now.
 *
 * Returns 0 on success, -1 on failure. Note that with the
 * opt.expect_recoverable test option the meaning is inverted:
 * finding no error is the failure.
 */
static int state_scrub_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax, struct snapraid_plan* plan, time_t now)
{
	struct snapraid_io io;
	struct snapraid_handle* handle;
	void* rehandle_alloc;
	struct snapraid_rehash* rehandle;
	unsigned diskmax;
	block_off_t blockcur;
	unsigned j;
	unsigned buffermax;
	data_off_t countsize;
	block_off_t countpos;
	block_off_t countmax;
	block_off_t autosavedone;
	block_off_t autosavelimit;
	block_off_t autosavemissing;
	int ret;
	unsigned error;
	unsigned silent_error;
	unsigned io_error;
	unsigned l;
	unsigned* waiting_map;
	unsigned waiting_mac;
	char esc_buffer[ESC_MAX];

	/* maps the disks to handles */
	handle = handle_mapping(state, &diskmax);

	/* rehash buffers, one entry per disk */
	rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);

	/* we need 1 * data + 2 * parity */
	/* (one parity set computed from data, one read from disk to compare) */
	buffermax = diskmax + 2 * state->level;

	/* initialize the io threads */
	io_init(&io, state, state->opt.io_cache, buffermax, scrub_data_reader, handle, diskmax, scrub_parity_reader, 0, parity_handle, state->level);

	/* possibly waiting disks */
	waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
	waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));

	error = 0;
	silent_error = 0;
	io_error = 0;

	/* first count the number of blocks to process */
	countmax = 0;
	plan->countlast = 0;
	for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
		if (!block_is_enabled(plan, blockcur))
			continue;
		++countmax;
	}

	/* compute the autosave size for all disk, even if not read */
	/* this makes sense because the speed should be almost the same */
	/* if the disks are read in parallel */
	autosavelimit = state->autosave / (diskmax * state->block_size);
	autosavemissing = countmax; /* blocks to do */
	autosavedone = 0; /* blocks done */

	/* drop until now */
	state_usage_waste(state);

	countsize = 0;
	countpos = 0;
	/* reset the counter, because block_is_enabled() is stateful for SCRUB_AUTO */
	/* and is going to be called again by the io threads */
	plan->countlast = 0;

	/* start all the worker threads */
	io_start(&io, blockstart, blockmax, &block_is_enabled, plan);

	state_progress_begin(state, blockstart, blockmax, countmax);
	while (1) {
		unsigned char* buffer_recov[LEV_MAX];
		snapraid_info info;
		int error_on_this_block;
		int silent_error_on_this_block;
		int io_error_on_this_block;
		int block_is_unsynced;
		int rehash;
		void** buffer;

		/* go to the next block */
		blockcur = io_read_next(&io, &buffer);
		if (blockcur >= blockmax)
			break;

		/* until now is scheduling */
		state_usage_sched(state);

		/* one more block processed for autosave */
		++autosavedone;
		--autosavemissing;

		/* by default process the block, and skip it if something goes wrong */
		error_on_this_block = 0;
		silent_error_on_this_block = 0;
		io_error_on_this_block = 0;

		/* if all the blocks at this address are synced */
		/* if not, parity is not even checked */
		block_is_unsynced = 0;

		/* get block specific info */
		info = info_get(&state->infoarr, blockcur);

		/* if we have to use the old hash */
		rehash = info_get_rehash(info);

		/* for each disk, process the block */
		for (j = 0; j < diskmax; ++j) {
			struct snapraid_task* task;
			int read_size;
			unsigned char hash[HASH_MAX];
			struct snapraid_block* block;
			int file_is_unsynced;
			struct snapraid_disk* disk;
			struct snapraid_file* file;
			block_off_t file_pos;
			unsigned diskcur;

			/* if the file on this disk is synced */
			/* if not, silent errors are assumed as expected error */
			file_is_unsynced = 0;

			/* until now is misc */
			state_usage_misc(state);

			/* get the next task, in whatever disk order it completes */
			task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);

			/* until now is disk */
			state_usage_disk(state, handle, waiting_map, waiting_mac);

			/* get the task results */
			disk = task->disk;
			block = task->block;
			file = task->file;
			file_pos = task->file_pos;
			read_size = task->read_size;

			/* by default no rehash in case of "continue" */
			/* cleared before any skip so the commit loop below never sees stale entries */
			rehandle[diskcur].block = 0;

			/* if the disk position is not used */
			if (!disk)
				continue;

			/* if the block is unsynced, errors are expected */
			if (block_has_invalid_parity(block)) {
				/* report that the block and the file are not synced */
				block_is_unsynced = 1;
				file_is_unsynced = 1;
				/* follow */
			}

			/* if the block is not used */
			if (!block_has_file(block))
				continue;

			/* if the block is unsynced, errors are expected */
			if (task->is_timestamp_different) {
				/* report that the block and the file are not synced */
				block_is_unsynced = 1;
				file_is_unsynced = 1;
				/* follow */
			}

			/* handle error conditions */
			if (task->state == TASK_STATE_IOERROR) {
				/* LCOV_EXCL_START */
				++io_error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR) {
				/* LCOV_EXCL_START */
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR_CONTINUE) {
				++error;
				error_on_this_block = 1;
				continue;
			}
			if (task->state == TASK_STATE_IOERROR_CONTINUE) {
				++io_error;
				if (io_error >= state->opt.io_error_limit) {
					/* LCOV_EXCL_START */
					log_fatal("DANGER! Too many input/output read error in a data disk, it isn't possible to scrub.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, task->path);
					log_fatal("Stopping at block %u\n", blockcur);
					goto bail;
					/* LCOV_EXCL_STOP */
				}

				/* otherwise continue */
				io_error_on_this_block = 1;
				continue;
			}
			if (task->state != TASK_STATE_DONE) {
				/* LCOV_EXCL_START */
				log_fatal("Internal inconsistency in task state\n");
				os_abort();
				/* LCOV_EXCL_STOP */
			}

			countsize += read_size;

			/* now compute the hash */
			if (rehash) {
				/* verify against the previous hash kind */
				memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);

				/* compute the new hash, and store it for the later commit */
				rehandle[diskcur].block = block;
				memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
			} else {
				memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
			}

			/* until now is hash */
			state_usage_hash(state);

			if (block_has_updated_hash(block)) {
				/* compare the hash */
				if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
					unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);

					log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);

					/* it's a silent error only if we are dealing with synced files */
					if (file_is_unsynced) {
						++error;
						error_on_this_block = 1;
					} else {
						log_error("Data error in file '%s' at position '%u', diff bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
						++silent_error;
						silent_error_on_this_block = 1;
					}
					continue;
				}
			}
		}

		/* buffers for parity read and not computed */
		/* layout: [0,diskmax) data, [diskmax,diskmax+level) computed parity, */
		/* [diskmax+level,diskmax+2*level) parity read from disk */
		for (l = 0; l < state->level; ++l)
			buffer_recov[l] = buffer[diskmax + state->level + l];
		for (; l < LEV_MAX; ++l)
			buffer_recov[l] = 0;

		/* until now is misc */
		state_usage_misc(state);

		/* read the parity */
		for (l = 0; l < state->level; ++l) {
			struct snapraid_task* task;
			unsigned levcur;

			task = io_parity_read(&io, &levcur, waiting_map, &waiting_mac);

			/* until now is parity */
			state_usage_parity(state, waiting_map, waiting_mac);

			/* handle error conditions */
			if (task->state == TASK_STATE_IOERROR) {
				/* LCOV_EXCL_START */
				++io_error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR) {
				/* LCOV_EXCL_START */
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
			if (task->state == TASK_STATE_ERROR_CONTINUE) {
				++error;
				error_on_this_block = 1;

				/* if continuing on error, clear the missing buffer */
				buffer_recov[levcur] = 0;
				continue;
			}
			if (task->state == TASK_STATE_IOERROR_CONTINUE) {
				++io_error;
				if (io_error >= state->opt.io_error_limit) {
					/* LCOV_EXCL_START */
					log_fatal("DANGER! Too many input/output read error in the %s disk, it isn't possible to scrub.\n", lev_name(levcur));
					log_fatal("Ensure that disk '%s' is sane and can be read.\n", lev_config_name(levcur));
					log_fatal("Stopping at block %u\n", blockcur);
					goto bail;
					/* LCOV_EXCL_STOP */
				}

				/* otherwise continue */
				io_error_on_this_block = 1;

				/* if continuing on error, clear the missing buffer */
				buffer_recov[levcur] = 0;
				continue;
			}
			if (task->state != TASK_STATE_DONE) {
				/* LCOV_EXCL_START */
				log_fatal("Internal inconsistency in task state\n");
				os_abort();
				/* LCOV_EXCL_STOP */
			}
		}

		/* if we have read all the data required and it's correct, proceed with the parity check */
		if (!error_on_this_block && !silent_error_on_this_block && !io_error_on_this_block) {

			/* compute the parity from the data just read */
			raid_gen(diskmax, state->level, state->block_size, buffer);

			/* compare the computed parity with the one read from disk */
			for (l = 0; l < state->level; ++l) {
				if (buffer_recov[l] && memcmp(buffer[diskmax + l], buffer_recov[l], state->block_size) != 0) {
					unsigned diff = memdiff(buffer[diskmax + l], buffer_recov[l], state->block_size);

					log_tag("parity_error:%u:%s: Data error, diff bits %u/%u\n", blockcur, lev_config_name(l), diff, state->block_size * 8);

					/* it's a silent error only if we are dealing with synced blocks */
					if (block_is_unsynced) {
						++error;
						error_on_this_block = 1;
					} else {
						log_fatal("Data error in parity '%s' at position '%u', diff bits %u/%u\n", lev_config_name(l), blockcur, diff, state->block_size * 8);
						++silent_error;
						silent_error_on_this_block = 1;
					}
				}
			}

			/* until now is raid */
			state_usage_raid(state);
		}

		if (silent_error_on_this_block || io_error_on_this_block) {
			/* set the error status keeping other info */
			info_set(&state->infoarr, blockcur, info_set_bad(info));
		} else if (error_on_this_block) {
			/* do nothing, as this is a generic error */
			/* likely caused by a not synced array */
		} else {
			/* if rehash is needed */
			if (rehash) {
				/* store all the new hash already computed */
				/* done only now that the whole stripe verified correctly */
				for (j = 0; j < diskmax; ++j) {
					if (rehandle[j].block)
						memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
				}
			}

			/* update the time info of the block */
			/* and clear any other flag */
			info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 0));
		}

		/* mark the state as needing write */
		state->need_write = 1;

		/* count the number of processed block */
		++countpos;

		/* progress */
		if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
			/* LCOV_EXCL_START */
			break;
			/* LCOV_EXCL_STOP */
		}

		/* autosave */
		if (state->autosave != 0
			&& autosavedone >= autosavelimit /* if we have reached the limit */
			&& autosavemissing >= autosavelimit /* if we have at least a full step to do */
		) {
			autosavedone = 0; /* restart the counter */

			/* until now is misc */
			state_usage_misc(state);

			state_progress_stop(state);

			msg_progress("Autosaving...\n");
			state_write(state);

			state_progress_restart(state);

			/* drop until now */
			state_usage_waste(state);
		}
	}

	state_progress_end(state, countpos, countmax, countsize);

	state_usage_print(state);

	if (error || silent_error || io_error) {
		msg_status("\n");
		msg_status("%8u file errors\n", error);
		msg_status("%8u io errors\n", io_error);
		msg_status("%8u data errors\n", silent_error);
	} else {
		/* print the result only if processed something */
		if (countpos != 0)
			msg_status("Everything OK\n");
	}

	if (error)
		log_fatal("WARNING! Unexpected file errors!\n");
	if (io_error)
		log_fatal("DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
	if (silent_error)
		log_fatal("DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
	if (io_error || silent_error) {
		log_fatal("Use 'snapraid status' to list the bad blocks.\n");
		log_fatal("Use 'snapraid -e fix' to recover.\n");
	}

	log_tag("summary:error_file:%u\n", error);
	log_tag("summary:error_io:%u\n", io_error);
	log_tag("summary:error_data:%u\n", silent_error);
	if (error + silent_error + io_error == 0)
		log_tag("summary:exit:ok\n");
	else
		log_tag("summary:exit:error\n");
	log_flush();

bail:
	/* stop all the worker threads */
	/* also reached on normal completion, as the common cleanup path */
	io_stop(&io);

	for (j = 0; j < diskmax; ++j) {
		struct snapraid_file* file = handle[j].file;
		struct snapraid_disk* disk = handle[j].disk;
		ret = handle_close(&handle[j]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			/* NOTE(review): file/disk look like they could be 0 here if no file was */
			/* ever opened on this handle — presumably handle_close cannot fail in */
			/* that case; verify against handle_close() */
			log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
			log_fatal("DANGER! Unexpected close error in a data disk.\n");
			++error;
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	free(handle);
	free(rehandle_alloc);
	free(waiting_map);
	io_done(&io);

	/* with expect_recoverable (test option) the success/failure meaning is inverted */
	if (state->opt.expect_recoverable) {
		if (error + silent_error + io_error == 0)
			return -1;
	} else {
		if (error + silent_error + io_error != 0)
			return -1;
	}
	return 0;
}
710 :
/**
 * Return a * b / c approximated to the upper value.
 *
 * The multiplication is done with 64 bit precision to avoid
 * overflowing the 32 bit operands.
 */
static uint32_t md(uint32_t a, uint32_t b, uint32_t c)
{
	uint64_t product = (uint64_t)a * b;

	/* add c - 1 before dividing to round the result up */
	return (product + c - 1) / c;
}
724 :
/**
 * Scrub command entry point.
 *
 * Builds the scrub plan from ::plan (one of the SCRUB_* values, or a
 * percentage >= 0 for the automatic plan) and ::olderthan (minimum age
 * in days, only meaningful with a percentage), opens the parity files,
 * and runs state_scrub_process() over the whole array.
 *
 * Returns 0 on success, -1 on failure; exits directly on usage or
 * setup errors.
 */
int state_scrub(struct snapraid_state* state, int plan, int olderthan)
{
	block_off_t blockmax;
	block_off_t countlimit;
	block_off_t i;
	block_off_t count;
	time_t recentlimit;
	int ret;
	struct snapraid_parity_handle parity_handle[LEV_MAX];
	struct snapraid_plan ps;
	time_t* timemap;
	unsigned error;
	time_t now;
	unsigned l;

	/* get the present time */
	now = time(0);

	msg_progress("Initializing...\n");

	/* -o/--older-than makes sense only with the percentage (automatic) plan */
	if ((plan == SCRUB_BAD || plan == SCRUB_NEW || plan == SCRUB_FULL)
		&& olderthan >= 0) {
		/* LCOV_EXCL_START */
		log_fatal("You can specify -o, --older-than only with a numeric percentage.\n");
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	blockmax = parity_allocated_size(state);

	/* preinitialize to avoid warnings */
	countlimit = 0;
	recentlimit = 0;

	ps.state = state;
	if (state->opt.force_scrub_even) {
		/* test option: force the 'even' plan */
		ps.plan = SCRUB_EVEN;
	} else if (plan == SCRUB_FULL) {
		ps.plan = SCRUB_FULL;
	} else if (plan == SCRUB_NEW) {
		ps.plan = SCRUB_NEW;
	} else if (plan == SCRUB_BAD) {
		ps.plan = SCRUB_BAD;
	} else if (state->opt.force_scrub_at) {
		/* scrub the specified amount of blocks */
		ps.plan = SCRUB_AUTO;
		countlimit = state->opt.force_scrub_at;
		recentlimit = now;
	} else {
		ps.plan = SCRUB_AUTO;
		if (plan >= 0) {
			/* scrub the requested percentage of the array */
			countlimit = md(blockmax, plan, 100);
		} else {
			/* by default scrub 8.33% of the array (100/12=8.(3)) */
			countlimit = md(blockmax, 1, 12);
		}

		if (olderthan >= 0) {
			recentlimit = now - olderthan * 24 * 3600;
		} else {
			/* by default use a 10 day time limit */
			recentlimit = now - 10 * 24 * 3600;
		}
	}

	/* identify the time limit */
	/* we sort all the block times, and we identify the time limit for which we reach the quota */
	/* this allow to process first the oldest blocks */
	timemap = malloc_nofail(blockmax * sizeof(time_t));

	/* copy the info in the temp vector */
	count = 0;
	log_tag("block_count:%u\n", blockmax);
	for (i = 0; i < blockmax; ++i) {
		snapraid_info info = info_get(&state->infoarr, i);

		/* skip unused blocks */
		if (info == 0)
			continue;

		timemap[count++] = info_get_time(info);
	}

	if (!count) {
		/* LCOV_EXCL_START */
		log_fatal("The array appears to be empty.\n");
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	/* sort it */
	qsort(timemap, count, sizeof(time_t), time_compare);

	/* output the info map, one line per distinct timestamp with its count */
	i = 0;
	log_tag("info_count:%u\n", count);
	while (i < count) {
		unsigned j = i + 1;
		while (j < count && timemap[i] == timemap[j])
			++j;
		log_tag("info_time:%" PRIu64 ":%u\n", (uint64_t)timemap[i], j - i);
		i = j;
	}

	/* compute the limits from count/recentlimit */
	if (ps.plan == SCRUB_AUTO) {
		/* no more than the full count */
		if (countlimit > count)
			countlimit = count;

		/* decrease until we reach the specific recentlimit */
		while (countlimit > 0 && timemap[countlimit - 1] > recentlimit)
			--countlimit;

		/* if there is something to scrub */
		if (countlimit > 0) {
			/* get the most recent time we want to scrub */
			ps.timelimit = timemap[countlimit - 1];

			/* count how many entries for this exact time we have to scrub */
			/* if the blocks have all the same time, we end with countlimit == lastlimit */
			ps.lastlimit = 1;
			while (countlimit > ps.lastlimit && timemap[countlimit - ps.lastlimit - 1] == ps.timelimit)
				++ps.lastlimit;
		} else {
			/* if nothing to scrub, disable also other limits */
			ps.timelimit = 0;
			ps.lastlimit = 0;
		}

		log_tag("count_limit:%u\n", countlimit);
		log_tag("time_limit:%" PRIu64 "\n", (uint64_t)ps.timelimit);
		log_tag("last_limit:%u\n", ps.lastlimit);
	}

	/* free the temp vector */
	free(timemap);

	/* open the parity files for reading */
	for (l = 0; l < state->level; ++l) {
		ret = parity_open(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_fatal("WARNING! Without an accessible %s file, it isn't possible to scrub.\n", lev_name(l));
			exit(EXIT_FAILURE);
			/* LCOV_EXCL_STOP */
		}
	}

	msg_progress("Scrubbing...\n");

	error = 0;

	ret = state_scrub_process(state, parity_handle, 0, blockmax, &ps, now);
	if (ret == -1) {
		++error;
		/* continue, as we are already exiting */
	}

	for (l = 0; l < state->level; ++l) {
		ret = parity_close(&parity_handle[l]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
			++error;
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	/* abort if required */
	if (error != 0)
		return -1;
	return 0;
}
900 :
|