Line data Source code
1 : // SPDX-License-Identifier: GPL-3.0-or-later
2 : // Copyright (C) 2011 Andrea Mazzoleni
3 :
4 : #include "portable.h"
5 :
6 : #include "support.h"
7 : #include "elem.h"
8 : #include "state.h"
9 : #include "parity.h"
10 : #include "handle.h"
11 : #include "io.h"
12 : #include "raid/raid.h"
13 :
14 : /****************************************************************************/
15 : /* hash */
16 :
/**
 * Select the log tag prefix for an errno value:
 * "error_io" for hardware errors, "error" for everything else.
 */
static const char* es(int err)
{
	return is_hw(err) ? "error_io" : "error";
}
24 :
/**
 * Hash the data blocks that don't yet have an up-to-date hash.
 *
 * Only blocks in state REP or CHG are processed:
 * - REP blocks already carry a hash, which is verified against the file data;
 *   a mismatch is counted as a silent (data) error.
 * - CHG blocks get their hash computed, stored, and are promoted to REP.
 *
 * On a data mismatch, a user interruption, or a fatal error, *skip_sync is
 * set to tell the caller not to run the following sync.
 *
 * Return 0 on success, -1 if any error was encountered.
 */
static int state_hash_process(struct snapraid_state* state, block_off_t blockstart, block_off_t blockmax, int* skip_sync)
{
	struct snapraid_handle* handle;
	unsigned diskmax;
	block_off_t blockcur;
	unsigned j;
	void* buffer;
	void* buffer_alloc;
	data_off_t countsize;
	block_off_t countpos;
	block_off_t countmax;
	int ret;
	unsigned soft_error; /* recoverable errors, e.g. files changed/removed during the run */
	unsigned silent_error; /* hash mismatches, i.e. unexpected data changes */
	unsigned io_error; /* hardware input/output errors */
	char esc_buffer[ESC_MAX];

	/* maps the disks to handles */
	handle = handle_mapping(state, &diskmax);

	/* buffer for reading */
	buffer = malloc_nofail_direct(state->block_size, &buffer_alloc);
	if (!state->opt.skip_self)
		mtest_vector(1, state->block_size, &buffer);

	soft_error = 0;
	silent_error = 0;
	io_error = 0;

	/* first count the number of blocks to process */
	countmax = 0;
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_disk* disk = handle[j].disk;

		/* if no disk, nothing to check */
		if (!disk)
			continue;

		for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
			struct snapraid_block* block;
			unsigned block_state;

			block = fs_par2block_find(disk, blockcur);

			/* get the state of the block */
			block_state = block_state_get(block);

			/* process REP and CHG blocks */
			if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
				continue;

			++countmax;
		}
	}

	/* drop until now */
	state_usage_waste(state);

	countsize = 0;
	countpos = 0;
	blockcur = blockstart;

	int alert = state_progress_begin(state, blockstart, blockmax, countmax);
	if (alert > 0)
		goto end;
	if (alert < 0)
		goto bail;

	/* process the blocks, one disk at a time */
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_disk* disk = handle[j].disk;

		/* if no disk, nothing to check */
		if (!disk)
			continue;

		for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
			snapraid_info info;
			int rehash;
			struct snapraid_block* block;
			int read_size;
			unsigned char hash[HASH_MAX];
			unsigned block_state;
			struct snapraid_file* file;
			block_off_t file_pos;

			block = fs_par2block_find(disk, blockcur);

			/* get the state of the block */
			block_state = block_state_get(block);

			/* process REP and CHG blocks */
			if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
				continue;

			/* get the file of this block */
			file = fs_par2file_get(disk, blockcur, &file_pos);

			/* get block specific info */
			info = info_get(&state->infoarr, blockcur);

			/* if we have to use the old hash */
			rehash = info_get_rehash(info);

			/* until now is misc */
			state_usage_misc(state);

			/* if the file is different than the current one, close it */
			if (handle[j].file != 0 && handle[j].file != file) {
				/* keep a pointer at the file we are going to close for error reporting */
				struct snapraid_file* report = handle[j].file;
				ret = handle_close(&handle[j]);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					/*
					 * This one is really an unexpected error, because we are only reading
					 * and closing a descriptor should never fail
					 */
					log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
					log_fatal_errno(errno, disk->name);
					log_fatal(errno, "Stopping at block %u\n", blockcur);

					if (is_hw(errno)) {
						++io_error;
					} else {
						++soft_error;
					}
					goto bail;
					/* LCOV_EXCL_STOP */
				}
			}

			ret = handle_open(&handle[j], file, state->file_mode, log_error, log_error); /* output a message for missing files */
			if (ret == -1) {
				log_tag("%s:%u:%s:%s: Open error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
				if (errno == ENOENT) {
					log_error_errno(errno, disk->name);

					++soft_error;
					/*
					 * If the file is missing, it means that it was removed during sync
					 * this isn't a serious error, so we skip this block, and continue with others
					 */
					continue;
				}

				if (errno == EACCES) {
					log_error_errno(errno, disk->name);

					++soft_error;
					/* this isn't a serious error, so we skip this block, and continue with others */
					continue;
				}

				/* any other open failure is fatal */
				/* LCOV_EXCL_START */
				log_fatal_errno(errno, disk->name);

				if (is_hw(errno)) {
					log_fatal(errno, "Stopping at block %u\n", blockcur);
					++io_error;
				} else {
					log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
					++soft_error;
				}
				goto bail;
				/* LCOV_EXCL_STOP */
			}

			/* check if the file is changed */
			if (handle[j].st.st_size != file->size
				|| handle[j].st.st_mtime != file->mtime_sec
				|| STAT_NSEC(&handle[j].st) != file->mtime_nsec
				|| handle[j].st.st_ino != file->inode
			) {
				/* report which attribute changed, checking in the same order as above */
				if (handle[j].st.st_size != file->size) {
					log_tag("error:%u:%s:%s: Unexpected size change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
					log_error(ESOFT, "Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle[j].path, file->size, (uint64_t)handle[j].st.st_size);
				} else if (handle[j].st.st_mtime != file->mtime_sec
					|| STAT_NSEC(&handle[j].st) != file->mtime_nsec) {
					log_tag("error:%u:%s:%s: Unexpected time change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
					log_error(ESOFT, "Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle[j].path, file->mtime_sec, file->mtime_nsec, (uint64_t)handle[j].st.st_mtime, STAT_NSEC(&handle[j].st));
				} else {
					log_tag("error:%u:%s:%s: Unexpected inode change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
					log_error(ESOFT, "Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", file->inode, (uint64_t)handle[j].st.st_ino, handle[j].path);
				}
				log_error_errno(ENOENT, disk->name); /* same message for ENOENT */

				++soft_error;

				/*
				 * If the file is changed, it means that it was modified during sync
				 * this isn't a serious error, so we skip this block, and continue with others
				 */
				continue;
			}

			read_size = handle_read(&handle[j], file_pos, buffer, state->block_size, log_fatal, 0);
			if (read_size == -1) {
				/* LCOV_EXCL_START */
				log_tag("%s:%u:%s:%s: Read error at position %u. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
				log_fatal_errno(errno, disk->name);

				if (is_hw(errno)) {
					log_fatal(errno, "Stopping at block %u\n", blockcur);
					++io_error;
				} else {
					log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
					++soft_error;
				}
				goto bail;
				/* LCOV_EXCL_STOP */
			}

			/* until now is disk */
			state_usage_disk(state, handle, &j, 1);

			state_usage_file(state, disk, file);

			countsize += read_size;

			/* now compute the hash, with the old hash kind if a rehash is in progress */
			if (rehash) {
				memhash(state->prevhash, state->prevhashseed, hash, buffer, read_size);
			} else {
				memhash(state->hash, state->hashseed, hash, buffer, read_size);
			}

			/* until now is hash */
			state_usage_hash(state);

			if (block_state == BLOCK_STATE_REP) {
				/* compare the hash */
				if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
					log_tag("error_data:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
					log_error(EDATA, "Data change at file '%s' at position '%u'\n", handle[j].path, file_pos);
					log_error(EDATA, "WARNING! Unexpected data modification of a file without parity!\n");

					if (file_flag_has(file, FILE_IS_COPY)) {
						log_error(EDATA, "This file was detected as a copy of another file with the same name, size,\n");
						log_error(EDATA, "and timestamp, but the file data isn't matching the assumed copy.\n");
						log_error(EDATA, "If this is a false positive, and the files are expected to be different,\n");
						log_error(EDATA, "you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
					} else {
						log_error(EDATA, "Try removing the file from the array and rerun the 'sync' command!\n");
					}

					/*
					 * Block sync to allow a recovery before overwriting
					 * the parity needed to make such recovery
					 */
					*skip_sync = 1; /* avoid to run the next sync */

					++silent_error;
					continue;
				}
			} else {
				/* the only other case is BLOCK_STATE_CHG */
				assert(block_state == BLOCK_STATE_CHG);

				/* copy the hash in the block */
				memcpy(block->hash, hash, BLOCK_HASH_SIZE);

				/* and mark the block as hashed */
				block_state_set(block, BLOCK_STATE_REP);

				/* mark the state as needing write */
				state->need_write = 1;
			}

			/* count the number of processed block */
			++countpos;

			/* progress; a non-zero return means the user requested a stop */
			if (state_progress(state, 0, blockcur, countpos, countmax, countsize)) {
				/* LCOV_EXCL_START */
				*skip_sync = 1; /* avoid to run the next sync */
				break;
				/* LCOV_EXCL_STOP */
			}
		}

		/* close the last file in the disk */
		if (handle[j].file != 0) {
			/* keep a pointer at the file we are going to close for error reporting */
			struct snapraid_file* report = handle[j].file;
			ret = handle_close(&handle[j]);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				/*
				 * This one is really an unexpected error, because we are only reading
				 * and closing a descriptor should never fail
				 */
				log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
				log_fatal_errno(errno, disk->name);
				log_fatal(errno, "Stopping at block %u\n", blockmax);

				if (is_hw(errno)) {
					++io_error;
				} else {
					++soft_error;
				}
				goto bail;
				/* LCOV_EXCL_STOP */
			}
		}
	}

end:
	state_progress_end(state, countpos, countmax, countsize, "Nothing to hash.\n");

	/*
	 * Note that at this point no io_error is possible
	 * because at the first one we bail out
	 */
	assert(io_error == 0);

	/* print the error summary */
	if (soft_error || io_error || silent_error) {
		msg_status("\n");
		msg_status("%8u soft errors\n", soft_error);
		msg_status("%8u io errors\n", io_error);
		msg_status("%8u data errors\n", silent_error);
	} else {
		msg_status("Everything OK\n");
	}

	if (soft_error)
		log_fatal(ESOFT, "WARNING! Unexpected soft errors!\n");

	log_tag("hash_summary:error_soft:%u\n", soft_error);

	/* proceed without bailing out */
	goto finish;

bail:
	/* on bail, don't run the next sync */
	*skip_sync = 1;

	/* close files left open */
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_file* file = handle[j].file;
		struct snapraid_disk* disk = handle[j].disk;
		ret = handle_close(&handle[j]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
			log_fatal_errno(errno, disk->name);

			if (is_hw(errno)) {
				++io_error;
			} else {
				++soft_error;
			}
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

finish:
	free(handle);
	free(buffer_alloc);

	if (soft_error + io_error + silent_error != 0)
		return -1;

	if (alert < 0)
		return -1;

	return 0;
}
393 :
394 : /****************************************************************************/
395 : /* sync */
396 :
/**
 * Sync plan to use.
 */
struct snapraid_plan {
	unsigned handle_max; /**< Number of entries in ::handle_map, one per disk position. */
	struct snapraid_handle* handle_map; /**< Array mapping disk positions to handles; entries may have a null disk. */
	int force_full; /**< If non-zero, treat every block as needing a parity update. */
};
405 :
/**
 * A block that failed the hash check, or that was deleted.
 */
struct failed_struct {
	unsigned index; /**< Index of the failed block. */
	unsigned size; /**< Size of the block. */

	struct snapraid_block* block; /**< The failed block, or BLOCK_DELETED for a deleted block */
};

/**
 * Comparison function for sorting by index.
 *
 * Suitable for qsort(); returns -1, 0 or 1 for less-than, equal, greater-than.
 */
int failed_compare_by_index(const void* void_a, const void* void_b)
{
	unsigned index_a = ((const struct failed_struct*)void_a)->index;
	unsigned index_b = ((const struct failed_struct*)void_b)->index;

	if (index_a != index_b)
		return index_a < index_b ? -1 : 1;

	return 0;
}
430 :
/**
 * Buffer for storing the new hashes.
 *
 * While a rehash is in progress, the new hash computed for a block is
 * stored here together with the block it belongs to, so it can be applied
 * later.
 */
struct snapraid_rehash {
	unsigned char hash[HASH_MAX]; /**< Newly computed hash of the block. */
	struct snapraid_block* block; /**< Block the hash refers to, or 0 if no rehash is pending. */
};
438 :
439 : /**
440 : * Check if we have to process the specified block index ::i.
441 : */
442 470714 : static int block_is_enabled(struct snapraid_plan* plan, block_off_t i)
443 : {
444 : unsigned j;
445 : int one_invalid;
446 : int one_valid;
447 :
448 : /* for each disk */
449 470714 : one_invalid = 0;
450 470714 : one_valid = 0;
451 3272198 : for (j = 0; j < plan->handle_max; ++j) {
452 : struct snapraid_block* block;
453 2801484 : struct snapraid_disk* disk = plan->handle_map[j].disk;
454 :
455 : /* if no disk, nothing to check */
456 2801484 : if (!disk)
457 4687 : continue;
458 :
459 2796797 : block = fs_par2block_find(disk, i);
460 :
461 2796797 : if (block_has_file(block))
462 2643779 : one_valid = 1;
463 :
464 2796797 : if (block_has_invalid_parity(block) || plan->force_full)
465 447833 : one_invalid = 1;
466 : }
467 :
468 : /* if none valid or none invalid, we don't need to update */
469 470714 : if (!one_invalid || !one_valid)
470 338919 : return 0;
471 :
472 131795 : return 1;
473 : }
474 :
/**
 * Worker function called by the io threads to read one data block for sync.
 *
 * Reads the block at ::task->position from the disk assigned to ::worker
 * into ::task->buffer, and reports the outcome in ::task->state:
 * TASK_STATE_DONE on success, TASK_STATE_ERROR_CONTINUE for recoverable
 * per-block errors, TASK_STATE_IOERROR_CONTINUE for hardware errors that
 * allow continuing, and TASK_STATE_ERROR / TASK_STATE_IOERROR for fatal
 * conditions.
 */
static void sync_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
{
	struct snapraid_io* io = worker->io;
	struct snapraid_state* state = io->state;
	struct snapraid_handle* handle = worker->handle;
	struct snapraid_disk* disk = handle->disk;
	block_off_t blockcur = task->position;
	unsigned char* buffer = task->buffer;
	int ret;
	char esc_buffer[ESC_MAX];

	/* if the disk position is not used */
	if (!disk) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the block */
	task->block = fs_par2block_find(disk, blockcur);

	/*
	 * If the block has no file, meaning that it's EMPTY or DELETED,
	 * it doesn't participate in the new parity computation
	 */
	if (!block_has_file(task->block)) {
		/* use an empty block */
		memset(buffer, 0, state->block_size);
		task->state = TASK_STATE_DONE;
		return;
	}

	/* get the file of this block */
	task->file = fs_par2file_get(disk, blockcur, &task->file_pos);

	/* if the file is different than the current one, close it */
	if (handle->file != 0 && handle->file != task->file) {
		/* keep a pointer at the file we are going to close for error reporting */
		struct snapraid_file* report = handle->file;
		ret = handle_close(handle);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			/*
			 * This one is really an unexpected error, because we are only reading
			 * and closing a descriptor should never fail
			 */
			log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
			log_fatal_errno(errno, disk->name);
			log_fatal(errno, "Stopping at block %u\n", blockcur);

			if (is_hw(errno)) {
				task->state = TASK_STATE_IOERROR;
			} else {
				task->state = TASK_STATE_ERROR;
			}
			return;
			/* LCOV_EXCL_STOP */
		}
	}

	ret = handle_open(handle, task->file, state->file_mode, log_error, log_error); /* output a message for missing files */
	if (ret == -1) {
		log_tag("%s:%u:%s:%s: Open error. %s.\n", es(errno), blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
		if (errno == ENOENT) {
			log_error_errno(errno, disk->name);

			/*
			 * If the file is missing, it means that it was removed during sync
			 * this isn't a serious error, so we skip this block, and continue with others
			 */
			task->state = TASK_STATE_ERROR_CONTINUE;
			return;
		}

		if (errno == EACCES) {
			log_error_errno(errno, disk->name);

			/* this isn't a serious error, so we skip this block, and continue with others */
			task->state = TASK_STATE_ERROR_CONTINUE;
			return;
		}

		/* any other open failure is fatal */
		/* LCOV_EXCL_START */
		log_fatal_errno(errno, disk->name);

		if (is_hw(errno)) {
			log_fatal(errno, "Stopping at block %u\n", blockcur);
			task->state = TASK_STATE_IOERROR;
		} else {
			log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
			task->state = TASK_STATE_ERROR;
		}
		return;
		/* LCOV_EXCL_STOP */
	}

	/* check if the file is changed */
	if (handle->st.st_size != task->file->size
		|| handle->st.st_mtime != task->file->mtime_sec
		|| STAT_NSEC(&handle->st) != task->file->mtime_nsec
		|| handle->st.st_ino != task->file->inode
	) {
		log_tag("error:%u:%s:%s: Unexpected attribute change\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer));
		/* report which attribute changed, checking in the same order as above */
		if (handle->st.st_size != task->file->size) {
			log_error(ESOFT, "Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle->path, task->file->size, (uint64_t)handle->st.st_size);
		} else if (handle->st.st_mtime != task->file->mtime_sec
			|| STAT_NSEC(&handle->st) != task->file->mtime_nsec) {
			log_error(ESOFT, "Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle->path, task->file->mtime_sec, task->file->mtime_nsec, (uint64_t)handle->st.st_mtime, STAT_NSEC(&handle->st));
		} else {
			log_error(ESOFT, "Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", task->file->inode, (uint64_t)handle->st.st_ino, handle->path);
		}
		log_error_errno(ENOENT, disk->name); /* same message for ENOENT */

		/*
		 * If the file is changed, it means that it was modified during sync
		 * this isn't a serious error, so we skip this block, and continue with others
		 */
		task->state = TASK_STATE_ERROR_CONTINUE;
		return;
	}

	task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
	if (task->read_size == -1) {
		/* LCOV_EXCL_START */
		log_tag("%s:%u:%s:%s: Read error at position %u. %s.\n", es(errno), blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));

		if (is_hw(errno)) {
			log_error_errno(errno, disk->name);
			/* continue until the error limit is reached */
			task->state = TASK_STATE_IOERROR_CONTINUE;
		} else {
			log_fatal_errno(errno, disk->name);
			log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
			task->state = TASK_STATE_ERROR;
		}
		return;
		/* LCOV_EXCL_STOP */
	}

	/* store the path of the opened file */
	pathcpy(task->path, sizeof(task->path), handle->path);

	task->state = TASK_STATE_DONE;
}
620 :
621 584721 : static void sync_parity_writer(struct snapraid_worker* worker, struct snapraid_task* task)
622 : {
623 584721 : struct snapraid_io* io = worker->io;
624 584721 : struct snapraid_state* state = io->state;
625 584721 : struct snapraid_parity_handle* parity_handle = worker->parity_handle;
626 584721 : unsigned level = parity_handle->level;
627 584721 : block_off_t blockcur = task->position;
628 584721 : unsigned char* buffer = task->buffer;
629 : int ret;
630 :
631 : /* write parity */
632 584721 : ret = parity_write(parity_handle, blockcur, buffer, state->block_size);
633 584721 : if (ret == -1) {
634 : /* LCOV_EXCL_START */
635 : log_tag("parity_%s:%u:%s: Write error. %s.\n", es(errno), blockcur, lev_config_name(level), strerror(errno));
636 :
637 : if (is_hw(errno)) {
638 : log_error_errno(errno, lev_config_name(level));
639 : /* continue until the error limit is reached */
640 : task->state = TASK_STATE_IOERROR_CONTINUE;
641 : } else {
642 : log_fatal_errno(errno, lev_config_name(level));
643 : log_fatal(errno, "Stopping at block %u\n", blockcur);
644 : task->state = TASK_STATE_ERROR;
645 : }
646 : return;
647 : /* LCOV_EXCL_STOP */
648 : }
649 :
650 584721 : task->state = TASK_STATE_DONE;
651 : }
652 :
653 100 : static int state_sync_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax)
654 : {
655 : struct snapraid_io io;
656 : struct snapraid_plan plan;
657 : struct snapraid_handle* handle;
658 : void* rehandle_alloc;
659 : struct snapraid_rehash* rehandle;
660 : unsigned diskmax;
661 : block_off_t blockcur;
662 : unsigned j;
663 : void* zero_alloc;
664 : void** zero;
665 : void* copy_alloc;
666 : void** copy;
667 : unsigned buffermax;
668 : data_off_t countsize;
669 : block_off_t countpos;
670 : block_off_t countmax;
671 : block_off_t autosavedone;
672 : block_off_t autosavelimit;
673 : block_off_t autosavemissing;
674 : int ret;
675 : unsigned soft_error;
676 : unsigned silent_error;
677 : unsigned io_error;
678 : time_t now;
679 : struct failed_struct* failed;
680 : int* failed_map;
681 : unsigned l;
682 : unsigned* waiting_map;
683 : unsigned waiting_mac;
684 : char esc_buffer[ESC_MAX];
685 : bit_vect_t* block_enabled;
686 :
687 : /* get the present time */
688 100 : now = time(0);
689 :
690 : /* maps the disks to handles */
691 100 : handle = handle_mapping(state, &diskmax);
692 :
693 : /* rehash buffers */
694 100 : rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
695 :
696 : /* we need 1 * data + 1 * parity */
697 100 : buffermax = diskmax + state->level;
698 :
699 : /* initialize the io threads */
700 100 : io_init(&io, state, state->opt.io_cache, buffermax, sync_data_reader, handle, diskmax, 0, sync_parity_writer, parity_handle, state->level);
701 :
702 : /* allocate the copy buffer */
703 100 : copy = malloc_nofail_vector_align(diskmax, diskmax, state->block_size, ©_alloc);
704 :
705 : /* allocate and fill the zero buffer */
706 100 : zero = malloc_nofail_align(state->block_size, &zero_alloc);
707 100 : memset(zero, 0, state->block_size);
708 100 : raid_zero(zero);
709 :
710 100 : failed = malloc_nofail(diskmax * sizeof(struct failed_struct));
711 100 : failed_map = malloc_nofail(diskmax * sizeof(unsigned));
712 :
713 : /* possibly waiting disks */
714 100 : waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
715 100 : waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
716 :
717 100 : soft_error = 0;
718 100 : silent_error = 0;
719 100 : io_error = 0;
720 :
721 100 : msg_progress("Selecting...\n");
722 :
723 : /* first count the number of blocks to process */
724 100 : countmax = 0;
725 100 : plan.handle_max = diskmax;
726 100 : plan.handle_map = handle;
727 100 : plan.force_full = state->opt.force_full;
728 100 : block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
729 470814 : for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
730 470714 : if (!block_is_enabled(&plan, blockcur))
731 338919 : continue;
732 131795 : bit_vect_set(block_enabled, blockcur);
733 131795 : ++countmax;
734 : }
735 :
736 : /*
737 : * Compute the autosave size for all disk, even if not read
738 : * this makes sense because the speed should be almost the same
739 : * if the disks are read in parallel
740 : */
741 100 : autosavelimit = state->autosave / (diskmax * state->block_size);
742 100 : autosavemissing = countmax; /* blocks to do */
743 100 : autosavedone = 0; /* blocks done */
744 :
745 : /* drop until now */
746 100 : state_usage_waste(state);
747 :
748 100 : countsize = 0;
749 100 : countpos = 0;
750 :
751 100 : msg_progress("Syncing...\n");
752 :
753 : /* start all the worker threads */
754 100 : io_start(&io, blockstart, blockmax, block_enabled);
755 :
756 100 : int alert = state_progress_begin(state, blockstart, blockmax, countmax);
757 100 : if (alert > 0)
758 0 : goto end;
759 100 : if (alert < 0)
760 0 : goto bail;
761 :
762 131795 : while (1) {
763 : unsigned failed_count;
764 : int error_on_this_block;
765 : int silent_error_on_this_block;
766 : int io_error_on_this_block;
767 : int fixed_error_on_this_block;
768 : int parity_needs_to_be_updated;
769 : int parity_going_to_be_updated;
770 : snapraid_info info;
771 : int rehash;
772 : void** buffer;
773 : int writer_error[IO_WRITER_ERROR_MAX];
774 :
775 : /* go to the next block */
776 131895 : blockcur = io_read_next(&io, &buffer);
777 131895 : if (blockcur >= blockmax)
778 100 : break;
779 :
780 : /* until now is scheduling */
781 131795 : state_usage_sched(state);
782 :
783 : /* one more block processed for autosave */
784 131795 : ++autosavedone;
785 131795 : --autosavemissing;
786 :
787 : /* by default process the block, and skip it if something goes wrong */
788 131795 : error_on_this_block = 0;
789 131795 : silent_error_on_this_block = 0;
790 131795 : io_error_on_this_block = 0;
791 131795 : fixed_error_on_this_block = 0;
792 :
793 : /* keep track of the number of failed blocks */
794 131795 : failed_count = 0;
795 :
796 : /* get block specific info */
797 131795 : info = info_get(&state->infoarr, blockcur);
798 :
799 : /* if we have to use the old hash */
800 131795 : rehash = info_get_rehash(info);
801 :
802 : /*
803 : * If the parity requires to be updated
804 : *
805 : * It could happens that all the blocks are EMPTY/BLK and CHG but with the hash
806 : * still matching because the specific CHG block was not modified.
807 : * In such case, we can avoid to update parity, because it would be the same as before
808 : *
809 : * Note that if there is any CHG/DELETED blocks already present in the content
810 : * file loaded, meaning that there are unsynced_blocks, this optimization is disabled
811 : */
812 131795 : parity_needs_to_be_updated = state->opt.force_full || state->opt.force_parity_update;
813 :
814 : /* if the parity is going to be updated */
815 131795 : parity_going_to_be_updated = 0;
816 :
817 : /*
818 : * If the block is marked as bad, we force the parity update
819 : * because the bad block may be the result of a wrong parity
820 : */
821 131795 : if (info_get_bad(info))
822 0 : parity_needs_to_be_updated = 1;
823 :
824 : /* for each disk, process the block */
825 905312 : for (j = 0; j < diskmax; ++j) {
826 : struct snapraid_task* task;
827 : int read_size;
828 : unsigned char hash[HASH_MAX];
829 : struct snapraid_block* block;
830 : unsigned block_state;
831 : struct snapraid_disk* disk;
832 : struct snapraid_file* file;
833 : block_off_t file_pos;
834 : unsigned diskcur;
835 :
836 : /* until now is misc */
837 773517 : state_usage_misc(state);
838 :
839 773517 : task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
840 :
841 : /* until now is disk */
842 773517 : state_usage_disk(state, handle, waiting_map, waiting_mac);
843 :
844 : /* get the results */
845 773517 : disk = task->disk;
846 773517 : block = task->block;
847 773517 : file = task->file;
848 773517 : file_pos = task->file_pos;
849 773517 : read_size = task->read_size;
850 :
851 : /* by default no rehash in case of "continue" */
852 773517 : rehandle[diskcur].block = 0;
853 :
854 : /* if the disk position is not used */
855 773517 : if (!disk)
856 112575 : continue;
857 :
858 773517 : state_usage_file(state, disk, file);
859 :
860 : /* get the state of the block */
861 773517 : block_state = block_state_get(block);
862 :
863 : /*
864 : * If the block has invalid parity,
865 : * we have to take care of it in case of recover
866 : */
867 773517 : if (block_has_invalid_parity(block)) {
868 : /*
869 : * Store it in the failed set, because
870 : * the parity may be still computed with the previous content
871 : */
872 188407 : failed[failed_count].index = diskcur;
873 188407 : failed[failed_count].size = state->block_size;
874 188407 : failed[failed_count].block = block;
875 188407 : ++failed_count;
876 :
877 : /*
878 : * If the block has invalid parity, we have to update the parity
879 : * to include this block change
880 : * This also apply to CHG blocks, but we are going to handle
881 : * later this case to do the updates only if really needed
882 : */
883 188407 : if (block_state != BLOCK_STATE_CHG)
884 90630 : parity_needs_to_be_updated = 1;
885 :
886 : /*
887 : * Note that DELETE blocks are skipped in the next check
888 : * and we have to store them in the failed blocks
889 : * before skipping
890 : */
891 :
892 : /* follow */
893 : }
894 :
895 : /* if the block is not used */
896 773517 : if (!block_has_file(block))
897 111704 : continue;
898 :
899 : /* handle error conditions */
900 661813 : if (task->state == TASK_STATE_IOERROR) {
901 : /* LCOV_EXCL_START */
902 : ++io_error;
903 : goto bail;
904 : /* LCOV_EXCL_STOP */
905 : }
906 661813 : if (task->state == TASK_STATE_ERROR) {
907 : /* LCOV_EXCL_START */
908 : ++soft_error;
909 : goto bail;
910 : /* LCOV_EXCL_STOP */
911 : }
912 661813 : if (task->state == TASK_STATE_ERROR_CONTINUE) {
913 18 : ++soft_error;
914 18 : error_on_this_block = 1;
915 18 : continue;
916 : }
917 661795 : if (task->state == TASK_STATE_IOERROR_CONTINUE) {
918 0 : ++io_error;
919 0 : if (io_error >= state->opt.io_error_limit) {
920 : /* LCOV_EXCL_START */
921 : log_fatal(EIO, "DANGER! Too many input/output errors in the %s disk. It isn't possible to continue.\n", disk->dir);
922 : log_fatal(EIO, "Stopping at block %u\n", blockcur);
923 : goto bail;
924 : /* LCOV_EXCL_STOP */
925 : }
926 :
927 : /* otherwise continue */
928 0 : io_error_on_this_block = 1;
929 0 : continue;
930 : }
931 661795 : if (task->state != TASK_STATE_DONE) {
932 : /* LCOV_EXCL_START */
933 : log_fatal(EINTERNAL, "Internal inconsistency in task state\n");
934 : os_abort();
935 : /* LCOV_EXCL_STOP */
936 : }
937 :
938 661795 : countsize += read_size;
939 :
940 : /* now compute the hash */
941 661795 : if (rehash) {
942 27200 : memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
943 :
944 : /* compute the new hash, and store it */
945 27200 : rehandle[diskcur].block = block;
946 27200 : memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
947 : } else {
948 634595 : memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
949 : }
950 :
951 : /* until now is hash */
952 661795 : state_usage_hash(state);
953 :
954 661795 : if (block_has_updated_hash(block)) {
955 : /* compare the hash */
956 564036 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
957 : /* if the file has invalid parity, it's a REP changed during the sync */
958 853 : if (block_has_invalid_parity(block)) {
959 1 : log_tag("error:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
960 1 : log_error(ESOFT, "Data change at file '%s' at position '%u'\n", task->path, file_pos);
961 1 : log_error(ESOFT, "WARNING! Unexpected data modification of a file without parity!\n");
962 :
963 1 : if (file_flag_has(file, FILE_IS_COPY)) {
964 1 : log_error(ESOFT, "This file was detected as a copy of another file with the same name, size,\n");
965 1 : log_error(ESOFT, "and timestamp, but the file data isn't matching the assumed copy.\n");
966 1 : log_error(ESOFT, "If this is a false positive, and the files are expected to be different,\n");
967 1 : log_error(ESOFT, "you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
968 : } else {
969 0 : log_error(ESOFT, "Try removing the file from the array and rerun the 'sync' command!\n");
970 : }
971 :
972 1 : ++soft_error;
973 :
974 : /*
975 : * If the file is changed, it means that it was modified during sync
976 : * this isn't a serious error, so we skip this block, and continue with others
977 : */
978 1 : error_on_this_block = 1;
979 1 : continue;
980 852 : } else { /* otherwise it's a BLK with silent error */
981 852 : unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
982 852 : log_tag("error_data:%u:%s:%s: Data error at position %u, diff hash bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
983 852 : log_error(EDATA, "Data error in file '%s' at position '%u', diff hash bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
984 :
985 : /* save the failed block for the fix */
986 852 : failed[failed_count].index = diskcur;
987 852 : failed[failed_count].size = read_size;
988 852 : failed[failed_count].block = block;
989 852 : ++failed_count;
990 :
991 : /*
992 : * Silent errors are very rare, and are not a signal that a disk
993 : * is going to fail. So, we just continue marking the block as bad
994 : * just like in scrub
995 : */
996 852 : ++silent_error;
997 852 : silent_error_on_this_block = 1;
998 852 : continue;
999 : }
1000 : }
1001 : } else {
1002 : /* if until now the parity doesn't need to be updated */
1003 97759 : if (!parity_needs_to_be_updated) {
1004 : /*
1005 : * For sure it's a CHG block, because EMPTY are processed before with "continue"
1006 : * and BLK and REP have "block_has_updated_hash()" as 1, and all the others
1007 : * have "parity_needs_to_be_updated" already at 1
1008 : */
1009 37374 : assert(block_state_get(block) == BLOCK_STATE_CHG);
1010 :
1011 : /*
1012 : * When a sync is interrupted, the state of the parity is unknown because
1013 : * we don't know exactly where the process stopped.
1014 : *
1015 : * This means that the hash information of the OLD blocks stored in the
1016 : * content file for CHG/DELETED blocks may be correct or not.
1017 : *
1018 : * The sync process uses the hash of CHG blocks to decide if the parity has to be
1019 : * recomputed, avoiding the recomputation if the input data is the same as before.
1020 : * But in case of an interrupted sync we cannot trust this data, so we
1021 : * disable this optimization if there are unsynced blocks.
1022 : *
1023 : * Note that CHG blocks may be from reading the content file, or from
1024 : * scanning the disk for really changed file, like with a different timestamp.
1025 : *
1026 : * An example for CHG blocks is:
1027 : * - One file is added creating a CHG block with ZERO state
1028 : * - Sync aborted after updating the parity to the new state,
1029 : * but without saving the content file representing this new BLK state.
1030 : * - File is now deleted after the aborted sync
1031 : * - Sync again, deleting the blocks over the CHG ones
1032 : * with the hash of CHG blocks not representing the real parity state
1033 : *
1034 : * An example for DELETED blocks is:
1035 : * - One file is deleted creating DELETED blocks
1036 : * - Sync aborted after updating the parity to the new state,
1037 : * but without saving the content file representing this new EMPTY state.
1038 : * - Another file is added again over the DELETE ones
1039 : * with the hash of DELETED blocks not representing the real parity state
1040 : */
1041 :
1042 : /* if the previous sync was completed and the hash represents the data unequivocally */
1043 37374 : if (state->unsynced_blocks == 0 && hash_is_unique(block->hash)) {
1044 : /* check if the hash is changed */
1045 0 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
1046 : /* the block is different, and we must update parity */
1047 0 : parity_needs_to_be_updated = 1;
1048 : }
1049 : } else {
1050 : /* if we don't know the hash, always update parity */
1051 37374 : parity_needs_to_be_updated = 1;
1052 : }
1053 : }
1054 :
1055 : /*
1056 : * Copy the hash into the block, but don't mark the block as hashed;
1057 : * this allows, in case of a skipped block, avoiding saving the failed computation
1058 : */
1059 97759 : memcpy(block->hash, hash, BLOCK_HASH_SIZE);
1060 :
1061 : /*
1062 : * Note that in case of rehash, this is the wrong hash,
1063 : * but it will be overwritten later
1064 : */
1065 : }
1066 : }
1067 :
1068 : /*
1069 : * If we have only silent errors we can try to fix them on-the-fly
1070 : * note the fix is not written to disk, but used only to
1071 : * compute the new parity
1072 : */
1073 131795 : if (!error_on_this_block && !io_error_on_this_block && silent_error_on_this_block) {
1074 : unsigned failed_mac;
1075 852 : int something_to_recover = 0;
1076 :
1077 : /*
1078 : * Sort the failed vector
1079 : * because with threads it may be in any order
1080 : * but RAID requires the indexes to be sorted
1081 : */
1082 852 : qsort(failed, failed_count, sizeof(failed[0]), failed_compare_by_index);
1083 :
1084 : /* setup the blocks to recover */
1085 852 : failed_mac = 0;
1086 2556 : for (j = 0; j < failed_count; ++j) {
1087 1704 : unsigned char* block_buffer = buffer[failed[j].index];
1088 1704 : unsigned char* block_copy = copy[failed[j].index];
1089 1704 : unsigned block_state = block_state_get(failed[j].block);
1090 :
1091 : /* we try to recover only if at least one BLK is present */
1092 1704 : if (block_state == BLOCK_STATE_BLK)
1093 852 : something_to_recover = 1;
1094 :
1095 : /*
1096 : * Save a copy of the content just read
1097 : * that it's going to be overwritten by the recovering function
1098 : */
1099 1704 : memcpy(block_copy, block_buffer, state->block_size);
1100 :
1101 1704 : if (block_state == BLOCK_STATE_CHG
1102 852 : && hash_is_zero(failed[j].block->hash)
1103 : ) {
1104 : /*
1105 : * If the block was filled with 0, restore this state
1106 : * and avoid to recover it
1107 : */
1108 0 : memset(block_buffer, 0, state->block_size);
1109 : } else {
1110 : /* if we have too many failures, we cannot recover */
1111 1704 : if (failed_mac >= state->level)
1112 0 : break;
1113 :
1114 : /* otherwise it has to be recovered */
1115 1704 : failed_map[failed_mac++] = failed[j].index;
1116 : }
1117 : }
1118 :
1119 : /* if we have something to recover and enough parity */
1120 852 : if (something_to_recover && j == failed_count) {
1121 : /* until now is misc */
1122 852 : state_usage_misc(state);
1123 :
1124 : /*
1125 : * Read the parity
1126 : * we are sure that parity exists because
1127 : * we have at least one BLK block
1128 : */
1129 5964 : for (l = 0; l < state->level; ++l) {
1130 5112 : ret = parity_read(&parity_handle[l], blockcur, buffer[diskmax + l], state->block_size, log_error);
1131 5112 : if (ret == -1) {
1132 : /* LCOV_EXCL_START */
1133 : log_tag("parity_%s:%u:%s: Read error. %s.\n", es(errno), blockcur, lev_config_name(l), strerror(errno));
1134 : if (is_hw(errno)) {
1135 : log_error_errno(errno, lev_config_name(l));
1136 : if (io_error >= state->opt.io_error_limit) {
1137 : log_fatal(errno, "DANGER! Too many input/output errors in the %s disk. It isn't possible to continue.\n", lev_config_name(l));
1138 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1139 : ++io_error;
1140 : goto bail;
1141 : }
1142 :
1143 : ++io_error;
1144 : io_error_on_this_block = 1;
1145 : continue;
1146 : }
1147 :
1148 : log_fatal_errno(errno, lev_config_name(l));
1149 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1150 : ++soft_error;
1151 : goto bail;
1152 : /* LCOV_EXCL_STOP */
1153 : }
1154 :
1155 : /* until now is parity */
1156 5112 : state_usage_parity(state, &l, 1);
1157 : }
1158 :
1159 : /* if no error in parity read */
1160 852 : if (!io_error_on_this_block) {
1161 : /*
1162 : * Try to fix the data
1163 : * note that this is a simple fix algorithm, that doesn't take into
1164 : * account the case of a wrong parity
1165 : * only 'fix' supports the most advanced fixing
1166 : */
1167 852 : raid_rec(failed_mac, failed_map, diskmax, state->level, state->block_size, buffer);
1168 :
1169 : /* until now is raid */
1170 852 : state_usage_raid(state);
1171 :
1172 : /* check the result and prepare the data */
1173 2556 : for (j = 0; j < failed_count; ++j) {
1174 : unsigned char hash[HASH_MAX];
1175 1704 : unsigned char* block_buffer = buffer[failed[j].index];
1176 1704 : unsigned char* block_copy = copy[failed[j].index];
1177 1704 : unsigned block_state = block_state_get(failed[j].block);
1178 :
1179 1704 : if (block_state == BLOCK_STATE_BLK) {
1180 852 : unsigned size = failed[j].size;
1181 :
1182 : /* compute the hash of the recovered block */
1183 852 : if (rehash) {
1184 0 : memhash(state->prevhash, state->prevhashseed, hash, block_buffer, size);
1185 : } else {
1186 852 : memhash(state->hash, state->hashseed, hash, block_buffer, size);
1187 : }
1188 :
1189 : /* until now is hash */
1190 852 : state_usage_hash(state);
1191 :
1192 : /* if the hash doesn't match */
1193 852 : if (memcmp(hash, failed[j].block->hash, BLOCK_HASH_SIZE) != 0) {
1194 : /* we have not recovered */
1195 0 : break;
1196 : }
1197 :
1198 : /* pad with 0 if needed */
1199 852 : if (size < state->block_size)
1200 341 : memset(block_buffer + size, 0, state->block_size - size);
1201 : } else {
1202 : /*
1203 : * Otherwise restore the content
1204 : * because we are not interested in the old state
1205 : * that it's recovered for CHG, REP and DELETED blocks
1206 : */
1207 852 : memcpy(block_buffer, block_copy, state->block_size);
1208 : }
1209 : }
1210 :
1211 : /* if all is processed, we have fixed it */
1212 852 : if (j == failed_count)
1213 852 : fixed_error_on_this_block = 1;
1214 : }
1215 : }
1216 : }
1217 :
1218 : /* if we have read all the data required and it's correct, proceed with the parity */
1219 131795 : if (!error_on_this_block && !io_error_on_this_block
1220 131776 : && (!silent_error_on_this_block || fixed_error_on_this_block)
1221 : ) {
1222 : /* update the parity only if really needed */
1223 131776 : if (parity_needs_to_be_updated) {
1224 : /* compute the parity */
1225 131776 : raid_gen(diskmax, state->level, state->block_size, buffer);
1226 :
1227 : /* until now is raid */
1228 131776 : state_usage_raid(state);
1229 :
1230 : /* mark that the parity is going to be written */
1231 131776 : parity_going_to_be_updated = 1;
1232 : }
1233 :
1234 : /* for each disk, mark the blocks as processed */
1235 905179 : for (j = 0; j < diskmax; ++j) {
1236 : struct snapraid_block* block;
1237 :
1238 773403 : if (!handle[j].disk)
1239 0 : continue;
1240 :
1241 773403 : block = fs_par2block_find(handle[j].disk, blockcur);
1242 :
1243 773403 : if (block == BLOCK_NULL) {
1244 : /* nothing to do */
1245 98047 : continue;
1246 : }
1247 :
1248 : /* if it's a deleted block */
1249 675356 : if (block_state_get(block) == BLOCK_STATE_DELETED) {
1250 : /* the parity is now updated without this block, so it's now empty */
1251 13655 : fs_deallocate(handle[j].disk, blockcur);
1252 13655 : continue;
1253 : }
1254 :
1255 : /* now all the blocks have the hash and the parity computed */
1256 661701 : block_state_set(block, BLOCK_STATE_BLK);
1257 : }
1258 :
1259 : /*
1260 : * We update the info block only if we really have updated the parity
1261 : * because otherwise the time/justsynced info would be misleading as we didn't
1262 : * write the parity at this time
1263 : * we also update the info block only if no silent error was found
1264 : * because it makes no sense to refresh the time for data that we know is bad
1265 : */
1266 131776 : if (parity_needs_to_be_updated
1267 131776 : && !silent_error_on_this_block
1268 : ) {
1269 : /* if rehash is needed */
1270 130924 : if (rehash) {
1271 : /* store all the new hash already computed */
1272 32459 : for (j = 0; j < diskmax; ++j) {
1273 27822 : if (rehandle[j].block)
1274 27200 : memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
1275 : }
1276 : }
1277 :
1278 : /*
1279 : * Update the time info of the block
1280 : * we are also clearing any previous bad and rehash flag
1281 : */
1282 130924 : info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 1));
1283 : }
1284 : }
1285 :
1286 : /*
1287 : * If a silent (even if corrected) or input/output error was found
1288 : * mark the block as bad to have check/fix to handle it
1289 : * because our correction is in memory only and not yet written
1290 : */
1291 131795 : if (silent_error_on_this_block || io_error_on_this_block) {
1292 : /* set the error status keeping the other info */
1293 852 : info_set(&state->infoarr, blockcur, info_set_bad(info));
1294 : }
1295 :
1296 : /*
1297 : * Finally schedule parity write
1298 : * Note that the calls to io_parity_write() are mandatory
1299 : * even if the parity doesn't need to be updated
1300 : * This because we want to keep track of the time usage
1301 : */
1302 131795 : state_usage_misc(state);
1303 :
1304 : /* write start */
1305 131795 : io_write_preset(&io, blockcur, !parity_going_to_be_updated);
1306 :
1307 : /* write the parity */
1308 716630 : for (l = 0; l < state->level; ++l) {
1309 : unsigned levcur;
1310 :
1311 584835 : io_parity_write(&io, &levcur, waiting_map, &waiting_mac);
1312 :
1313 : /* until now is parity */
1314 584835 : state_usage_parity(state, waiting_map, waiting_mac);
1315 : }
1316 :
1317 : /* write finished */
1318 131795 : io_write_next(&io, blockcur, !parity_going_to_be_updated, writer_error);
1319 :
1320 : /* handle errors reported */
1321 658975 : for (j = 0; j < IO_WRITER_ERROR_MAX; ++j) {
1322 527180 : if (writer_error[j]) {
1323 0 : switch (j + IO_WRITER_ERROR_BASE) {
1324 0 : case TASK_STATE_IOERROR_CONTINUE :
1325 0 : ++io_error;
1326 0 : if (io_error >= state->opt.io_error_limit) {
1327 : /* LCOV_EXCL_START */
1328 : log_fatal(EIO, "DANGER! Too many input/output errors in a parity disk. It isn't possible to continue.\n");
1329 : log_fatal(EIO, "Stopping at block %u\n", blockcur);
1330 : goto bail;
1331 : /* LCOV_EXCL_STOP */
1332 : }
1333 0 : break;
1334 0 : case TASK_STATE_ERROR_CONTINUE :
1335 0 : ++soft_error;
1336 0 : break;
1337 0 : case TASK_STATE_IOERROR :
1338 : /* LCOV_EXCL_START */
1339 : ++io_error;
1340 : goto bail;
1341 : /* LCOV_EXCL_STOP */
1342 0 : case TASK_STATE_ERROR :
1343 : /* LCOV_EXCL_START */
1344 : ++soft_error;
1345 : goto bail;
1346 : /* LCOV_EXCL_STOP */
1347 : }
1348 : }
1349 : }
1350 :
1351 : /* mark the state as needing write */
1352 131795 : state->need_write = 1;
1353 :
1354 : /* count the number of processed block */
1355 131795 : ++countpos;
1356 :
1357 : /* progress */
1358 131795 : if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
1359 : /* LCOV_EXCL_START */
1360 : break;
1361 : /* LCOV_EXCL_STOP */
1362 : }
1363 :
1364 : /* thermal control */
1365 131795 : if (state_thermal_alarm(state)) {
1366 : /* until now is misc */
1367 0 : state_usage_misc(state);
1368 :
1369 0 : state_progress_stop(state);
1370 :
1371 : /* before spinning down flush all the caches */
1372 0 : ret = state_flush(state, &io, parity_handle, blockcur);
1373 0 : if (ret == -1) {
1374 : /* LCOV_EXCL_START */
1375 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1376 : ++io_error;
1377 : goto bail;
1378 : /* LCOV_EXCL_STOP */
1379 : }
1380 :
1381 0 : state_thermal_cooldown(state);
1382 :
1383 0 : state_progress_restart(state);
1384 :
1385 : /* drop until now */
1386 0 : state_usage_waste(state);
1387 : }
1388 :
1389 : /* autosave */
1390 131795 : if ((state->autosave != 0
1391 35436 : && autosavedone >= autosavelimit /* if we have reached the limit */
1392 0 : && autosavemissing >= autosavelimit) /* if we have at least a full step to do */
1393 : /* or if we have a forced autosave at the specified block */
1394 131795 : || (state->opt.force_autosave_at != 0 && state->opt.force_autosave_at == blockcur)
1395 : ) {
1396 1 : autosavedone = 0; /* restart the counter */
1397 :
1398 : /* until now is misc */
1399 1 : state_usage_misc(state);
1400 :
1401 1 : state_progress_stop(state);
1402 :
1403 1 : msg_progress("Autosaving...\n");
1404 :
1405 : /*
1406 : * Before writing the new content file we ensure that
1407 : * the parity is really written flushing the disk cache
1408 : */
1409 1 : ret = state_flush(state, &io, parity_handle, blockcur);
1410 1 : if (ret == -1) {
1411 : /* LCOV_EXCL_START */
1412 : log_fatal(EIO, "Stopping at block %u\n", blockcur);
1413 : ++io_error;
1414 : goto bail;
1415 : /* LCOV_EXCL_STOP */
1416 : }
1417 :
1418 : /* now we can safely write the content file */
1419 1 : state_write(state);
1420 :
1421 1 : state_progress_restart(state);
1422 :
1423 : /* drop until now */
1424 1 : state_usage_waste(state);
1425 : }
1426 : }
1427 :
1428 100 : end:
1429 100 : state_progress_end(state, countpos, countmax, countsize, "Nothing to sync.\n");
1430 :
1431 : /*
1432 : * Before returning we ensure that
1433 : * the parity is really written flushing the disk cache
1434 : */
1435 100 : ret = state_flush(state, &io, parity_handle, blockcur);
1436 100 : if (ret == -1) {
1437 : /* LCOV_EXCL_START */
1438 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1439 : ++io_error;
1440 : goto bail;
1441 : /* LCOV_EXCL_STOP */
1442 : }
1443 :
1444 : /* now the parity is fully written, no need to keep deallocated files */
1445 100 : state_commit(state);
1446 :
1447 100 : if (state->opt.kill_after_sync) {
1448 12 : log_fatal(EUSER, "WARNING! Killing due --test-kill-after-sync option.\n");
1449 12 : exit(EXIT_SUCCESS);
1450 : }
1451 :
1452 : /* save the new state if required */
1453 88 : if ((state->need_write || state->opt.force_content_write))
1454 71 : state_write(state);
1455 :
1456 88 : state_usage_print(state);
1457 :
1458 88 : if (soft_error || silent_error || io_error) {
1459 12 : msg_status("\n");
1460 12 : msg_status("%8u soft errors\n", soft_error);
1461 12 : msg_status("%8u io errors\n", io_error);
1462 12 : msg_status("%8u data errors\n", silent_error);
1463 : } else {
1464 : /* print the result only if processed something */
1465 76 : if (countpos != 0)
1466 59 : msg_status("Everything OK\n");
1467 : }
1468 :
1469 88 : if (soft_error)
1470 11 : log_fatal(ESOFT, "WARNING! Unexpected soft errors!\n");
1471 88 : if (io_error)
1472 0 : log_fatal(EIO, "DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
1473 88 : if (silent_error)
1474 1 : log_fatal(EDATA, "DANGER! Unexpected silent data errors! The failing blocks are now marked as bad!\n");
1475 88 : if (io_error || silent_error) {
1476 1 : log_fatal(ESOFT, "Use 'snapraid status' to list the bad blocks.\n");
1477 1 : log_fatal(ESOFT, "Use 'snapraid -e fix' to recover.\n");
1478 : }
1479 :
1480 88 : log_tag("summary:error_soft:%u\n", soft_error);
1481 88 : log_tag("summary:error_io:%u\n", io_error);
1482 88 : log_tag("summary:error_data:%u\n", silent_error);
1483 88 : if (soft_error + silent_error + io_error == 0)
1484 76 : log_tag("summary:exit:ok\n");
1485 12 : else if (silent_error + io_error == 0)
1486 11 : log_tag("summary:exit:warning\n");
1487 : else
1488 1 : log_tag("summary:exit:error\n");
1489 88 : log_flush();
1490 :
1491 88 : bail:
1492 : /* stop all the worker threads */
1493 88 : io_stop(&io);
1494 :
1495 598 : for (j = 0; j < diskmax; ++j) {
1496 510 : struct snapraid_file* file = handle[j].file;
1497 510 : struct snapraid_disk* disk = handle[j].disk;
1498 510 : ret = handle_close(&handle[j]);
1499 510 : if (ret == -1) {
1500 : /* LCOV_EXCL_START */
1501 : log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
1502 : log_fatal_errno(errno, disk->name);
1503 :
1504 : if (is_hw(errno)) {
1505 : ++io_error;
1506 : } else {
1507 : ++soft_error;
1508 : }
1509 : /* continue, as we are already exiting */
1510 : /* LCOV_EXCL_STOP */
1511 : }
1512 : }
1513 :
1514 88 : free(handle);
1515 88 : free(zero_alloc);
1516 88 : free(copy_alloc);
1517 88 : free(copy);
1518 88 : free(rehandle_alloc);
1519 88 : free(failed);
1520 88 : free(failed_map);
1521 88 : free(waiting_map);
1522 88 : io_done(&io);
1523 88 : free(block_enabled);
1524 :
1525 88 : if (state->opt.expect_recoverable) {
1526 1 : if (soft_error + silent_error + io_error == 0)
1527 0 : return -1;
1528 : } else {
1529 87 : if (soft_error + silent_error + io_error != 0)
1530 11 : return -1;
1531 : }
1532 :
1533 77 : if (alert < 0)
1534 0 : return -1;
1535 :
1536 77 : return 0;
1537 : }
1538 :
1539 106 : int state_sync(struct snapraid_state* state, block_off_t blockstart, block_off_t blockcount)
1540 : {
1541 : block_off_t blockmax;
1542 : block_off_t used_paritymax;
1543 : block_off_t file_paritymax;
1544 : data_off_t size;
1545 : int ret;
1546 : struct snapraid_parity_handle parity_handle[LEV_MAX];
1547 : unsigned process_error;
1548 : unsigned l;
1549 106 : int skip_sync = 0;
1550 :
1551 106 : msg_progress("Initializing...\n");
1552 :
1553 106 : blockmax = parity_allocated_size(state);
1554 106 : size = blockmax * (data_off_t)state->block_size;
1555 :
1556 : /* minimum size of the parity files we expect */
1557 106 : used_paritymax = parity_used_size(state);
1558 :
1559 : /* effective size of the parity files */
1560 106 : file_paritymax = 0;
1561 :
1562 106 : if (blockstart > blockmax) {
1563 : /* LCOV_EXCL_START */
1564 : log_fatal(EUSER, "Error in the starting block %u. It is larger than the parity size %u.\n", blockstart, blockmax);
1565 : exit(EXIT_FAILURE);
1566 : /* LCOV_EXCL_STOP */
1567 : }
1568 :
1569 : /* adjust the number of blocks to process */
1570 106 : if (blockcount != 0 && blockstart + blockcount < blockmax) {
1571 6 : blockmax = blockstart + blockcount;
1572 : }
1573 :
1574 647 : for (l = 0; l < state->level; ++l) {
1575 : data_off_t out_size;
1576 : block_off_t parityblocks;
1577 :
1578 : /* create the file and open for writing */
1579 541 : ret = parity_create(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
1580 541 : if (ret == -1) {
1581 : /* LCOV_EXCL_START */
1582 : log_tag("parity_%s:%u:%s: Create error. %s.\n", es(errno), 0, lev_config_name(l), strerror(errno));
1583 : log_fatal_errno(errno, lev_config_name(l));
1584 : exit(EXIT_FAILURE);
1585 : /* LCOV_EXCL_STOP */
1586 : }
1587 :
1588 : /* number of blocks in the parity file */
1589 541 : parity_size(&parity_handle[l], &out_size);
1590 541 : parityblocks = out_size / state->block_size;
1591 :
1592 : /* if the file is too small */
1593 541 : if (parityblocks < used_paritymax) {
1594 0 : log_fatal(ESOFT, "WARNING! The %s parity has only %u blocks instead of %u.\n", lev_name(l), parityblocks, used_paritymax);
1595 : }
1596 :
1597 : /* keep the smallest number of blocks among all parity files */
1598 541 : if (l == 0 || file_paritymax > parityblocks)
1599 107 : file_paritymax = parityblocks;
1600 : }
1601 :
1602 : /* if we do a full parity realloc or computation, having a wrong parity size is expected */
1603 106 : if (!state->opt.force_realloc && !state->opt.force_full) {
1604 : /* if the parities are too small */
1605 93 : if (file_paritymax < used_paritymax) {
1606 : /* LCOV_EXCL_START */
1607 : log_fatal(ESOFT, "DANGER! One or more the parity files are smaller than expected!\n");
1608 : if (file_paritymax != 0) {
1609 : log_fatal(ESOFT, "If this happens because you are using an old content file,\n");
1610 : log_fatal(ESOFT, "you can 'sync' anyway using 'snapraid --force-full sync'\n");
1611 : log_fatal(ESOFT, "to force a full rebuild of the parity.\n");
1612 : } else {
1613 : log_fatal(ESOFT, "It's possible that the parity disks are not mounted.\n");
1614 : log_fatal(ESOFT, "If instead you are adding a new parity level, you can 'sync' using\n");
1615 : log_fatal(ESOFT, "'snapraid --force-full sync' to force a full rebuild of the parity.\n");
1616 : }
1617 : exit(EXIT_FAILURE);
1618 : /* LCOV_EXCL_STOP */
1619 : }
1620 : }
1621 :
1622 106 : process_error = 0;
1623 :
1624 106 : if (state->opt.prehash) {
1625 7 : msg_progress("Hashing...\n");
1626 :
1627 7 : ret = state_hash_process(state, blockstart, blockmax, &skip_sync);
1628 7 : if (ret == -1) {
1629 : /* LCOV_EXCL_START */
1630 : ++process_error;
1631 : /* continue; the sync phase below still runs unless ::skip_sync is set */
1632 : /* LCOV_EXCL_STOP */
1633 : }
1634 : }
1635 :
1636 106 : if (!skip_sync) {
1637 105 : msg_progress("Resizing...\n");
1638 :
1639 : /* now change the size of all parities */
1640 640 : for (l = 0; l < state->level; ++l) {
1641 : int is_modified;
1642 :
1643 : /*
1644 : * Change the size of the parity file, truncating or extending it;
1645 : * from this point all the DELETED blocks after the end of the parity are invalid
1646 : * and they are automatically removed when we save the new content file
1647 : */
1648 535 : ret = parity_chsize(&parity_handle[l], &state->parity[l], &is_modified, size, state->block_size, state->opt.skip_fallocate, state->opt.skip_space_holder);
1649 535 : if (ret == -1) {
1650 : /* LCOV_EXCL_START */
1651 : data_off_t out_size;
1652 : parity_size(&parity_handle[l], &out_size);
1653 : parity_overflow(state, out_size);
1654 : log_fatal(errno, "WARNING! Without a usable %s file, it isn't possible to sync.\n", lev_name(l));
1655 : exit(EXIT_FAILURE);
1656 : /* LCOV_EXCL_STOP */
1657 : }
1658 :
1659 535 : if (is_modified)
1660 200 : state->need_write = 1;
1661 : }
1662 :
1663 : /* after resizing parity files, refresh again the free info */
1664 105 : state_refresh(state);
1665 :
1666 : /**
1667 : * Save the new state before the sync but after the hashing phase
1668 : *
1669 : * This allows recovering after an aborted sync, and at the same time
1670 : * it allows recovering broken copied/moved files identified in the
1671 : * hashing phase.
1672 : *
1673 : * For example, consider this case:
1674 : * - Add some files to the array
1675 : * - Run a sync command, it will recompute the parity adding the new files
1676 : * - Abort the sync command before it stores the new content file
1677 : * - Delete the not yet synced files from the array
1678 : * - Run a new sync command
1679 : *
1680 : * The sync command has no way to know that the parity file was modified
1681 : * because the files triggering these changes are now deleted and they aren't
1682 : * listed in the content file.
1683 : * Instead, saving the new content file in advance keeps track of all the parity
1684 : * that may be modified.
1685 : */
1686 105 : if (!state->opt.skip_content_write) {
1687 105 : if (state->need_write)
1688 94 : state_write(state);
1689 : } else {
1690 0 : log_fatal(EUSER, "WARNING! Skipped state write for --test-skip-content-write option.\n");
1691 : }
1692 :
1693 105 : if (state->opt.kill_before_sync) {
1694 2 : log_fatal(EUSER, "WARNING! Killing due --test-kill-before-sync option.\n");
1695 2 : exit(EXIT_SUCCESS);
1696 : }
1697 :
1698 : /* skip degenerate cases of an empty parity, or of skipping everything */
1699 103 : if (blockstart < blockmax) {
1700 100 : ret = state_sync_process(state, parity_handle, blockstart, blockmax);
1701 88 : if (ret == -1) {
1702 : /* LCOV_EXCL_START */
1703 : ++process_error;
1704 : /* continue, as we are already exiting */
1705 : /* LCOV_EXCL_STOP */
1706 : }
1707 : } else {
1708 3 : msg_status("Nothing to sync.\n");
1709 : }
1710 : }
1711 :
1712 574 : for (l = 0; l < state->level; ++l) {
1713 482 : ret = parity_close(&parity_handle[l]);
1714 482 : if (ret == -1) {
1715 : /* LCOV_EXCL_START */
1716 : log_tag("parity_%s:%u:%s: Close error. %s.\n", es(errno), blockmax, lev_config_name(l), strerror(errno));
1717 : log_fatal_errno(errno, lev_config_name(l));
1718 :
1719 : ++process_error;
1720 : /* continue, as we are already exiting */
1721 : /* LCOV_EXCL_STOP */
1722 : }
1723 : }
1724 :
1725 92 : if (process_error != 0)
1726 12 : return -1;
1727 80 : return 0;
1728 : }
1729 :
|