Line data Source code
1 : /*
2 : * Copyright (C) 2011 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 : */
17 :
18 : #include "portable.h"
19 :
20 : #include "support.h"
21 : #include "elem.h"
22 : #include "state.h"
23 : #include "parity.h"
24 : #include "handle.h"
25 : #include "io.h"
26 : #include "raid/raid.h"
27 :
28 : /****************************************************************************/
29 : /* hash */
30 :
/**
 * Select the tag prefix used in log lines for the error code ::err.
 *
 * Returns "error_io" when is_hw() reports true for the code,
 * and "error" for anything else.
 */
static const char* es(int err)
{
	return is_hw(err) ? "error_io" : "error";
}
38 :
39 7 : static int state_hash_process(struct snapraid_state* state, block_off_t blockstart, block_off_t blockmax, int* skip_sync)
40 : {
41 : struct snapraid_handle* handle;
42 : unsigned diskmax;
43 : block_off_t blockcur;
44 : unsigned j;
45 : void* buffer;
46 : void* buffer_alloc;
47 : data_off_t countsize;
48 : block_off_t countpos;
49 : block_off_t countmax;
50 : int ret;
51 : unsigned soft_error;
52 : unsigned silent_error;
53 : unsigned io_error;
54 : char esc_buffer[ESC_MAX];
55 :
56 : /* maps the disks to handles */
57 7 : handle = handle_mapping(state, &diskmax);
58 :
59 : /* buffer for reading */
60 7 : buffer = malloc_nofail_direct(state->block_size, &buffer_alloc);
61 7 : if (!state->opt.skip_self)
62 0 : mtest_vector(1, state->block_size, &buffer);
63 :
64 7 : soft_error = 0;
65 7 : silent_error = 0;
66 7 : io_error = 0;
67 :
68 : /* first count the number of blocks to process */
69 7 : countmax = 0;
70 49 : for (j = 0; j < diskmax; ++j) {
71 42 : struct snapraid_disk* disk = handle[j].disk;
72 :
73 : /* if no disk, nothing to check */
74 42 : if (!disk)
75 0 : continue;
76 :
77 236706 : for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
78 : struct snapraid_block* block;
79 : unsigned block_state;
80 :
81 236664 : block = fs_par2block_find(disk, blockcur);
82 :
83 : /* get the state of the block */
84 236664 : block_state = block_state_get(block);
85 :
86 : /* process REP and CHG blocks */
87 236664 : if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
88 223625 : continue;
89 :
90 13039 : ++countmax;
91 : }
92 : }
93 :
94 : /* drop until now */
95 7 : state_usage_waste(state);
96 :
97 7 : countsize = 0;
98 7 : countpos = 0;
99 7 : blockcur = blockstart;
100 :
101 7 : int alert = state_progress_begin(state, blockstart, blockmax, countmax);
102 7 : if (alert > 0)
103 0 : goto end;
104 7 : if (alert < 0)
105 0 : goto bail;
106 :
107 49 : for (j = 0; j < diskmax; ++j) {
108 42 : struct snapraid_disk* disk = handle[j].disk;
109 :
110 : /* if no disk, nothing to check */
111 42 : if (!disk)
112 0 : continue;
113 :
114 236706 : for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
115 : snapraid_info info;
116 : int rehash;
117 : struct snapraid_block* block;
118 : int read_size;
119 : unsigned char hash[HASH_MAX];
120 : unsigned block_state;
121 : struct snapraid_file* file;
122 : block_off_t file_pos;
123 :
124 236664 : block = fs_par2block_find(disk, blockcur);
125 :
126 : /* get the state of the block */
127 236664 : block_state = block_state_get(block);
128 :
129 : /* process REP and CHG blocks */
130 236664 : if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
131 223635 : continue;
132 :
133 : /* get the file of this block */
134 13039 : file = fs_par2file_get(disk, blockcur, &file_pos);
135 :
136 : /* get block specific info */
137 13039 : info = info_get(&state->infoarr, blockcur);
138 :
139 : /* if we have to use the old hash */
140 13039 : rehash = info_get_rehash(info);
141 :
142 : /* until now is misc */
143 13039 : state_usage_misc(state);
144 :
145 : /* if the file is different than the current one, close it */
146 13039 : if (handle[j].file != 0 && handle[j].file != file) {
147 : /* keep a pointer at the file we are going to close for error reporting */
148 5252 : struct snapraid_file* report = handle[j].file;
149 5252 : ret = handle_close(&handle[j]);
150 5252 : if (ret == -1) {
151 : /* LCOV_EXCL_START */
152 : /* This one is really an unexpected error, because we are only reading */
153 : /* and closing a descriptor should never fail */
154 : log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
155 : log_fatal_errno(errno, disk->name);
156 : log_fatal(errno, "Stopping at block %u\n", blockcur);
157 :
158 : if (is_hw(errno)) {
159 : ++io_error;
160 : } else {
161 : ++soft_error;
162 : }
163 : goto bail;
164 : /* LCOV_EXCL_STOP */
165 : }
166 : }
167 :
168 13039 : ret = handle_open(&handle[j], file, state->file_mode, log_error, log_error); /* output a message for missing files */
169 13039 : if (ret == -1) {
170 6 : log_tag("%s:%u:%s:%s: Open error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
171 6 : if (errno == ENOENT) {
172 2 : log_error_errno(errno, disk->name);
173 :
174 2 : ++soft_error;
175 : /* if the file is missing, it means that it was removed during sync */
176 : /* this isn't a serious error, so we skip this block, and continue with others */
177 2 : continue;
178 : }
179 :
180 4 : if (errno == EACCES) {
181 4 : log_error_errno(errno, disk->name);
182 :
183 4 : ++soft_error;
184 : /* this isn't a serious error, so we skip this block, and continue with others */
185 4 : continue;
186 : }
187 :
188 : /* LCOV_EXCL_START */
189 : log_fatal_errno(errno, disk->name);
190 :
191 : if (is_hw(errno)) {
192 : log_fatal(errno, "Stopping at block %u\n", blockcur);
193 : ++io_error;
194 : } else {
195 : log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
196 : ++soft_error;
197 : }
198 : goto bail;
199 : /* LCOV_EXCL_STOP */
200 : }
201 :
202 : /* check if the file is changed */
203 13033 : if (handle[j].st.st_size != file->size
204 13032 : || handle[j].st.st_mtime != file->mtime_sec
205 13031 : || STAT_NSEC(&handle[j].st) != file->mtime_nsec
206 13031 : || handle[j].st.st_ino != file->inode
207 : ) {
208 3 : if (handle[j].st.st_size != file->size) {
209 1 : log_tag("error:%u:%s:%s: Unexpected size change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
210 1 : log_error(ESOFT, "Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle[j].path, file->size, (uint64_t)handle[j].st.st_size);
211 2 : } else if (handle[j].st.st_mtime != file->mtime_sec
212 1 : || STAT_NSEC(&handle[j].st) != file->mtime_nsec) {
213 1 : log_tag("error:%u:%s:%s: Unexpected time change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
214 1 : log_error(ESOFT, "Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle[j].path, file->mtime_sec, file->mtime_nsec, (uint64_t)handle[j].st.st_mtime, STAT_NSEC(&handle[j].st));
215 : } else {
216 1 : log_tag("error:%u:%s:%s: Unexpected inode change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
217 1 : log_error(ESOFT, "Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", file->inode, (uint64_t)handle[j].st.st_ino, handle[j].path);
218 : }
219 3 : log_error_errno(ENOENT, disk->name); /* same message for ENOENT */
220 :
221 3 : ++soft_error;
222 :
223 : /* if the file is changed, it means that it was modified during sync */
224 : /* this isn't a serious error, so we skip this block, and continue with others */
225 3 : continue;
226 : }
227 :
228 13030 : read_size = handle_read(&handle[j], file_pos, buffer, state->block_size, log_fatal, 0);
229 13030 : if (read_size == -1) {
230 : /* LCOV_EXCL_START */
231 : log_tag("%s:%u:%s:%s: Read error at position %u. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
232 : log_fatal_errno(errno, disk->name);
233 :
234 : if (is_hw(errno)) {
235 : log_fatal(errno, "Stopping at block %u\n", blockcur);
236 : ++io_error;
237 : } else {
238 : log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
239 : ++soft_error;
240 : }
241 : goto bail;
242 : /* LCOV_EXCL_STOP */
243 : }
244 :
245 : /* until now is disk */
246 13030 : state_usage_disk(state, handle, &j, 1);
247 :
248 13030 : state_usage_file(state, disk, file);
249 :
250 13030 : countsize += read_size;
251 :
252 : /* now compute the hash */
253 13030 : if (rehash) {
254 0 : memhash(state->prevhash, state->prevhashseed, hash, buffer, read_size);
255 : } else {
256 13030 : memhash(state->hash, state->hashseed, hash, buffer, read_size);
257 : }
258 :
259 : /* until now is hash */
260 13030 : state_usage_hash(state);
261 :
262 13030 : if (block_state == BLOCK_STATE_REP) {
263 : /* compare the hash */
264 13028 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
265 1 : log_tag("error_data:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
266 1 : log_error(EDATA, "Data change at file '%s' at position '%u'\n", handle[j].path, file_pos);
267 1 : log_error(EDATA, "WARNING! Unexpected data modification of a file without parity!\n");
268 :
269 1 : if (file_flag_has(file, FILE_IS_COPY)) {
270 1 : log_error(EDATA, "This file was detected as a copy of another file with the same name, size,\n");
271 1 : log_error(EDATA, "and timestamp, but the file data isn't matching the assumed copy.\n");
272 1 : log_error(EDATA, "If this is a false positive, and the files are expected to be different,\n");
273 1 : log_error(EDATA, "you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
274 : } else {
275 0 : log_error(EDATA, "Try removing the file from the array and rerun the 'sync' command!\n");
276 : }
277 :
278 : /* block sync to allow a recovery before overwriting */
279 : /* the parity needed to make such recovery */
280 1 : *skip_sync = 1; /* avoid to run the next sync */
281 :
282 1 : ++silent_error;
283 1 : continue;
284 : }
285 : } else {
286 : /* the only other case is BLOCK_STATE_CHG */
287 2 : assert(block_state == BLOCK_STATE_CHG);
288 :
289 : /* copy the hash in the block */
290 2 : memcpy(block->hash, hash, BLOCK_HASH_SIZE);
291 :
292 : /* and mark the block as hashed */
293 2 : block_state_set(block, BLOCK_STATE_REP);
294 :
295 : /* mark the state as needing write */
296 2 : state->need_write = 1;
297 : }
298 :
299 : /* count the number of processed block */
300 13029 : ++countpos;
301 :
302 : /* progress */
303 13029 : if (state_progress(state, 0, blockcur, countpos, countmax, countsize)) {
304 : /* LCOV_EXCL_START */
305 : *skip_sync = 1; /* avoid to run the next sync */
306 : break;
307 : /* LCOV_EXCL_STOP */
308 : }
309 : }
310 :
311 : /* close the last file in the disk */
312 42 : if (handle[j].file != 0) {
313 : /* keep a pointer at the file we are going to close for error reporting */
314 7 : struct snapraid_file* report = handle[j].file;
315 7 : ret = handle_close(&handle[j]);
316 7 : if (ret == -1) {
317 : /* LCOV_EXCL_START */
318 : /* This one is really an unexpected error, because we are only reading */
319 : /* and closing a descriptor should never fail */
320 : log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
321 : log_fatal_errno(errno, disk->name);
322 : log_fatal(errno, "Stopping at block %u\n", blockmax);
323 :
324 : if (is_hw(errno)) {
325 : ++io_error;
326 : } else {
327 : ++soft_error;
328 : }
329 : goto bail;
330 : /* LCOV_EXCL_STOP */
331 : }
332 : }
333 : }
334 :
335 7 : end:
336 7 : state_progress_end(state, countpos, countmax, countsize, "Nothing to hash.\n");
337 :
338 : /* note that at this point no io_error is possible */
339 : /* because at the first one we bail out */
340 7 : assert(io_error == 0);
341 :
342 7 : if (soft_error || io_error || silent_error) {
343 6 : msg_status("\n");
344 6 : msg_status("%8u soft errors\n", soft_error);
345 6 : msg_status("%8u io errors\n", io_error);
346 6 : msg_status("%8u data errors\n", silent_error);
347 : } else {
348 1 : msg_status("Everything OK\n");
349 : }
350 :
351 7 : if (soft_error)
352 5 : log_fatal(ESOFT, "WARNING! Unexpected soft errors!\n");
353 :
354 7 : log_tag("hash_summary:error_soft:%u\n", soft_error);
355 :
356 : /* proceed without bailing out */
357 7 : goto finish;
358 :
359 0 : bail:
360 : /* on bail, don't run the next sync */
361 0 : *skip_sync = 1;
362 :
363 : /* close files left open */
364 0 : for (j = 0; j < diskmax; ++j) {
365 0 : struct snapraid_file* file = handle[j].file;
366 0 : struct snapraid_disk* disk = handle[j].disk;
367 0 : ret = handle_close(&handle[j]);
368 0 : if (ret == -1) {
369 : /* LCOV_EXCL_START */
370 : log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
371 : log_fatal_errno(errno, disk->name);
372 :
373 : if (is_hw(errno)) {
374 : ++io_error;
375 : } else {
376 : ++soft_error;
377 : }
378 : /* continue, as we are already exiting */
379 : /* LCOV_EXCL_STOP */
380 : }
381 : }
382 :
383 0 : finish:
384 7 : free(handle);
385 7 : free(buffer_alloc);
386 :
387 7 : if (soft_error + io_error + silent_error != 0)
388 6 : return -1;
389 :
390 1 : if (alert < 0)
391 0 : return -1;
392 :
393 1 : return 0;
394 : }
395 :
396 : /****************************************************************************/
397 : /* sync */
398 :
/**
 * Sync plan to use.
 *
 * Groups the inputs that block_is_enabled() needs to decide
 * if a block index has to be processed.
 */
struct snapraid_plan {
	unsigned handle_max; /**< Number of entries in ::handle_map. */
	struct snapraid_handle* handle_map; /**< Vector of handles, one for each disk position. */
	int force_full; /**< If different than 0, every block is considered as having invalid parity. */
};
407 :
/**
 * Descriptor of a block that failed the hash check, or that was deleted.
 */
struct failed_struct {
	unsigned index; /**< Index of the failed block. */
	unsigned size; /**< Size of the block. */

	struct snapraid_block* block; /**< The failed block, or BLOCK_DELETED for a deleted block */
};

/**
 * Three-way comparison of two failed_struct by their ::index field.
 *
 * Both arguments have to point to a struct failed_struct.
 * Returns -1, 0 or 1 if the first index is respectively
 * lower, equal or greater than the second one.
 */
int failed_compare_by_index(const void* void_a, const void* void_b)
{
	const struct failed_struct* lhs = void_a;
	const struct failed_struct* rhs = void_b;

	/* each comparison gives 0 or 1, the difference is the ordering */
	return (lhs->index > rhs->index) - (lhs->index < rhs->index);
}
432 :
433 : /**
434 : * Buffer for storing the new hashes.
435 : */
436 : struct snapraid_rehash {
437 : unsigned char hash[HASH_MAX];
438 : struct snapraid_block* block;
439 : };
440 :
441 : /**
442 : * Check if we have to process the specified block index ::i.
443 : */
444 463114 : static int block_is_enabled(struct snapraid_plan* plan, block_off_t i)
445 : {
446 : unsigned j;
447 : int one_invalid;
448 : int one_valid;
449 :
450 : /* for each disk */
451 463114 : one_invalid = 0;
452 463114 : one_valid = 0;
453 3241798 : for (j = 0; j < plan->handle_max; ++j) {
454 : struct snapraid_block* block;
455 2778684 : struct snapraid_disk* disk = plan->handle_map[j].disk;
456 :
457 : /* if no disk, nothing to check */
458 2778684 : if (!disk)
459 4687 : continue;
460 :
461 2773997 : block = fs_par2block_find(disk, i);
462 :
463 2773997 : if (block_has_file(block))
464 2623692 : one_valid = 1;
465 :
466 2773997 : if (block_has_invalid_parity(block) || plan->force_full)
467 434554 : one_invalid = 1;
468 : }
469 :
470 : /* if none valid or none invalid, we don't need to update */
471 463114 : if (!one_invalid || !one_valid)
472 337042 : return 0;
473 :
474 126072 : return 1;
475 : }
476 :
477 756432 : static void sync_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
478 : {
479 756432 : struct snapraid_io* io = worker->io;
480 756432 : struct snapraid_state* state = io->state;
481 756432 : struct snapraid_handle* handle = worker->handle;
482 756432 : struct snapraid_disk* disk = handle->disk;
483 756432 : block_off_t blockcur = task->position;
484 756432 : unsigned char* buffer = task->buffer;
485 : int ret;
486 : char esc_buffer[ESC_MAX];
487 :
488 : /* if the disk position is not used */
489 756432 : if (!disk) {
490 : /* use an empty block */
491 0 : memset(buffer, 0, state->block_size);
492 0 : task->state = TASK_STATE_DONE;
493 110636 : return;
494 : }
495 :
496 : /* get the block */
497 756432 : task->block = fs_par2block_find(disk, blockcur);
498 :
499 : /* if the block has no file, meaning that it's EMPTY or DELETED, */
500 : /* it doesn't participate in the new parity computation */
501 756432 : if (!block_has_file(task->block)) {
502 : /* use an empty block */
503 110618 : memset(buffer, 0, state->block_size);
504 110618 : task->state = TASK_STATE_DONE;
505 110618 : return;
506 : }
507 :
508 : /* get the file of this block */
509 645814 : task->file = fs_par2file_get(disk, blockcur, &task->file_pos);
510 :
511 : /* if the file is different than the current one, close it */
512 645814 : if (handle->file != 0 && handle->file != task->file) {
513 : /* keep a pointer at the file we are going to close for error reporting */
514 264963 : struct snapraid_file* report = handle->file;
515 264963 : ret = handle_close(handle);
516 264963 : if (ret == -1) {
517 : /* LCOV_EXCL_START */
518 : /* This one is really an unexpected error, because we are only reading */
519 : /* and closing a descriptor should never fail */
520 : log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
521 : log_fatal_errno(errno, disk->name);
522 : log_fatal(errno, "Stopping at block %u\n", blockcur);
523 :
524 : if (is_hw(errno)) {
525 : task->state = TASK_STATE_IOERROR;
526 : } else {
527 : task->state = TASK_STATE_ERROR;
528 : }
529 : return;
530 : /* LCOV_EXCL_STOP */
531 : }
532 : }
533 :
534 645814 : ret = handle_open(handle, task->file, state->file_mode, log_error, log_error); /* output a message for missing files */
535 645814 : if (ret == -1) {
536 12 : log_tag("%s:%u:%s:%s: Open error. %s.\n", es(errno), blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
537 12 : if (errno == ENOENT) {
538 4 : log_error_errno(errno, disk->name);
539 :
540 : /* if the file is missing, it means that it was removed during sync */
541 : /* this isn't a serious error, so we skip this block, and continue with others */
542 4 : task->state = TASK_STATE_ERROR_CONTINUE;
543 4 : return;
544 : }
545 :
546 8 : if (errno == EACCES) {
547 8 : log_error_errno(errno, disk->name);
548 :
549 : /* this isn't a serious error, so we skip this block, and continue with others */
550 8 : task->state = TASK_STATE_ERROR_CONTINUE;
551 8 : return;
552 : }
553 :
554 : /* LCOV_EXCL_START */
555 : log_fatal_errno(errno, disk->name);
556 :
557 : if (is_hw(errno)) {
558 : log_fatal(errno, "Stopping at block %u\n", blockcur);
559 : task->state = TASK_STATE_IOERROR;
560 : } else {
561 : log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
562 : task->state = TASK_STATE_ERROR;
563 : }
564 : return;
565 : /* LCOV_EXCL_STOP */
566 : }
567 :
568 : /* check if the file is changed */
569 645802 : if (handle->st.st_size != task->file->size
570 645800 : || handle->st.st_mtime != task->file->mtime_sec
571 645798 : || STAT_NSEC(&handle->st) != task->file->mtime_nsec
572 645798 : || handle->st.st_ino != task->file->inode
573 : ) {
574 6 : log_tag("error:%u:%s:%s: Unexpected attribute change\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer));
575 6 : if (handle->st.st_size != task->file->size) {
576 2 : log_error(ESOFT, "Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle->path, task->file->size, (uint64_t)handle->st.st_size);
577 4 : } else if (handle->st.st_mtime != task->file->mtime_sec
578 2 : || STAT_NSEC(&handle->st) != task->file->mtime_nsec) {
579 2 : log_error(ESOFT, "Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle->path, task->file->mtime_sec, task->file->mtime_nsec, (uint64_t)handle->st.st_mtime, STAT_NSEC(&handle->st));
580 : } else {
581 2 : log_error(ESOFT, "Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", task->file->inode, (uint64_t)handle->st.st_ino, handle->path);
582 : }
583 6 : log_error_errno(ENOENT, disk->name); /* same message for ENOENT */
584 :
585 : /* if the file is changed, it means that it was modified during sync */
586 : /* this isn't a serious error, so we skip this block, and continue with others */
587 6 : task->state = TASK_STATE_ERROR_CONTINUE;
588 6 : return;
589 : }
590 :
591 645796 : task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
592 645796 : if (task->read_size == -1) {
593 : /* LCOV_EXCL_START */
594 : log_tag("%s:%u:%s:%s: Read error at position %u. %s.\n", es(errno), blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
595 :
596 : if (is_hw(errno)) {
597 : log_error_errno(errno, disk->name);
598 : /* continue until the error limit is reached */
599 : task->state = TASK_STATE_IOERROR_CONTINUE;
600 : } else {
601 : log_fatal_errno(errno, disk->name);
602 : log_fatal(errno, "Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
603 : task->state = TASK_STATE_ERROR;
604 : }
605 : return;
606 : /* LCOV_EXCL_STOP */
607 : }
608 :
609 : /* store the path of the opened file */
610 645796 : pathcpy(task->path, sizeof(task->path), handle->path);
611 :
612 645796 : task->state = TASK_STATE_DONE;
613 : }
614 :
615 573582 : static void sync_parity_writer(struct snapraid_worker* worker, struct snapraid_task* task)
616 : {
617 573582 : struct snapraid_io* io = worker->io;
618 573582 : struct snapraid_state* state = io->state;
619 573582 : struct snapraid_parity_handle* parity_handle = worker->parity_handle;
620 573582 : unsigned level = parity_handle->level;
621 573582 : block_off_t blockcur = task->position;
622 573582 : unsigned char* buffer = task->buffer;
623 : int ret;
624 :
625 : /* write parity */
626 573582 : ret = parity_write(parity_handle, blockcur, buffer, state->block_size);
627 573582 : if (ret == -1) {
628 : /* LCOV_EXCL_START */
629 : log_tag("parity_%s:%u:%s: Write error. %s.\n", es(errno), blockcur, lev_config_name(level), strerror(errno));
630 :
631 : if (is_hw(errno)) {
632 : log_error_errno(errno, lev_config_name(level));
633 : /* continue until the error limit is reached */
634 : task->state = TASK_STATE_IOERROR_CONTINUE;
635 : } else {
636 : log_fatal_errno(errno, lev_config_name(level));
637 : log_fatal(errno, "Stopping at block %u\n", blockcur);
638 : task->state = TASK_STATE_ERROR;
639 : }
640 : return;
641 : /* LCOV_EXCL_STOP */
642 : }
643 :
644 573582 : task->state = TASK_STATE_DONE;
645 : }
646 :
647 92 : static int state_sync_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax)
648 : {
649 : struct snapraid_io io;
650 : struct snapraid_plan plan;
651 : struct snapraid_handle* handle;
652 : void* rehandle_alloc;
653 : struct snapraid_rehash* rehandle;
654 : unsigned diskmax;
655 : block_off_t blockcur;
656 : unsigned j;
657 : void* zero_alloc;
658 : void** zero;
659 : void* copy_alloc;
660 : void** copy;
661 : unsigned buffermax;
662 : data_off_t countsize;
663 : block_off_t countpos;
664 : block_off_t countmax;
665 : block_off_t autosavedone;
666 : block_off_t autosavelimit;
667 : block_off_t autosavemissing;
668 : int ret;
669 : unsigned soft_error;
670 : unsigned silent_error;
671 : unsigned io_error;
672 : time_t now;
673 : struct failed_struct* failed;
674 : int* failed_map;
675 : unsigned l;
676 : unsigned* waiting_map;
677 : unsigned waiting_mac;
678 : char esc_buffer[ESC_MAX];
679 : bit_vect_t* block_enabled;
680 :
681 : /* the sync process assumes that all the hashes are correct */
682 : /* including the ones from CHG and DELETED blocks */
683 92 : assert(state->clear_past_hash != 0);
684 :
685 : /* get the present time */
686 92 : now = time(0);
687 :
688 : /* maps the disks to handles */
689 92 : handle = handle_mapping(state, &diskmax);
690 :
691 : /* rehash buffers */
692 92 : rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
693 :
694 : /* we need 1 * data + 1 * parity */
695 92 : buffermax = diskmax + state->level;
696 :
697 : /* initialize the io threads */
698 92 : io_init(&io, state, state->opt.io_cache, buffermax, sync_data_reader, handle, diskmax, 0, sync_parity_writer, parity_handle, state->level);
699 :
700 : /* allocate the copy buffer */
701 92 : copy = malloc_nofail_vector_align(diskmax, diskmax, state->block_size, ©_alloc);
702 :
703 : /* allocate and fill the zero buffer */
704 92 : zero = malloc_nofail_align(state->block_size, &zero_alloc);
705 92 : memset(zero, 0, state->block_size);
706 92 : raid_zero(zero);
707 :
708 92 : failed = malloc_nofail(diskmax * sizeof(struct failed_struct));
709 92 : failed_map = malloc_nofail(diskmax * sizeof(unsigned));
710 :
711 : /* possibly waiting disks */
712 92 : waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
713 92 : waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
714 :
715 92 : soft_error = 0;
716 92 : silent_error = 0;
717 92 : io_error = 0;
718 :
719 92 : msg_progress("Selecting...\n");
720 :
721 : /* first count the number of blocks to process */
722 92 : countmax = 0;
723 92 : plan.handle_max = diskmax;
724 92 : plan.handle_map = handle;
725 92 : plan.force_full = state->opt.force_full;
726 92 : block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
727 463206 : for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
728 463114 : if (!block_is_enabled(&plan, blockcur))
729 337042 : continue;
730 126072 : bit_vect_set(block_enabled, blockcur);
731 126072 : ++countmax;
732 : }
733 :
734 : /* compute the autosave size for all disk, even if not read */
735 : /* this makes sense because the speed should be almost the same */
736 : /* if the disks are read in parallel */
737 92 : autosavelimit = state->autosave / (diskmax * state->block_size);
738 92 : autosavemissing = countmax; /* blocks to do */
739 92 : autosavedone = 0; /* blocks done */
740 :
741 : /* drop until now */
742 92 : state_usage_waste(state);
743 :
744 92 : countsize = 0;
745 92 : countpos = 0;
746 :
747 92 : msg_progress("Syncing...\n");
748 :
749 : /* start all the worker threads */
750 92 : io_start(&io, blockstart, blockmax, block_enabled);
751 :
752 92 : int alert = state_progress_begin(state, blockstart, blockmax, countmax);
753 92 : if (alert > 0)
754 0 : goto end;
755 92 : if (alert < 0)
756 0 : goto bail;
757 :
758 126072 : while (1) {
759 : unsigned failed_count;
760 : int error_on_this_block;
761 : int silent_error_on_this_block;
762 : int io_error_on_this_block;
763 : int fixed_error_on_this_block;
764 : int parity_needs_to_be_updated;
765 : int parity_going_to_be_updated;
766 : snapraid_info info;
767 : int rehash;
768 : void** buffer;
769 : int writer_error[IO_WRITER_ERROR_MAX];
770 :
771 : /* go to the next block */
772 126164 : blockcur = io_read_next(&io, &buffer);
773 126164 : if (blockcur >= blockmax)
774 92 : break;
775 :
776 : /* until now is scheduling */
777 126072 : state_usage_sched(state);
778 :
779 : /* one more block processed for autosave */
780 126072 : ++autosavedone;
781 126072 : --autosavemissing;
782 :
783 : /* by default process the block, and skip it if something goes wrong */
784 126072 : error_on_this_block = 0;
785 126072 : silent_error_on_this_block = 0;
786 126072 : io_error_on_this_block = 0;
787 126072 : fixed_error_on_this_block = 0;
788 :
789 : /* keep track of the number of failed blocks */
790 126072 : failed_count = 0;
791 :
792 : /* get block specific info */
793 126072 : info = info_get(&state->infoarr, blockcur);
794 :
795 : /* if we have to use the old hash */
796 126072 : rehash = info_get_rehash(info);
797 :
798 : /* if the parity requires to be updated */
799 : /* It could happens that all the blocks are EMPTY/BLK and CHG but with the hash */
800 : /* still matching because the specific CHG block was not modified. */
801 : /* In such case, we can avoid to update parity, because it would be the same as before */
802 : /* Note that CHG/DELETED blocks already present in the content file loaded */
803 : /* have the hash cleared (::clear_past_hash flag), and then they won't never match the hash. */
804 : /* We are treating only CHG blocks created at runtime. */
805 126072 : parity_needs_to_be_updated = state->opt.force_full || state->opt.force_parity_update;
806 :
807 : /* if the parity is going to be updated */
808 126072 : parity_going_to_be_updated = 0;
809 :
810 : /* if the block is marked as bad, we force the parity update */
811 : /* because the bad block may be the result of a wrong parity */
812 126072 : if (info_get_bad(info))
813 0 : parity_needs_to_be_updated = 1;
814 :
815 : /* for each disk, process the block */
816 882504 : for (j = 0; j < diskmax; ++j) {
817 : struct snapraid_task* task;
818 : int read_size;
819 : unsigned char hash[HASH_MAX];
820 : struct snapraid_block* block;
821 : unsigned block_state;
822 : struct snapraid_disk* disk;
823 : struct snapraid_file* file;
824 : block_off_t file_pos;
825 : unsigned diskcur;
826 :
827 : /* until now is misc */
828 756432 : state_usage_misc(state);
829 :
830 756432 : task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
831 :
832 : /* until now is disk */
833 756432 : state_usage_disk(state, handle, waiting_map, waiting_mac);
834 :
835 : /* get the results */
836 756432 : disk = task->disk;
837 756432 : block = task->block;
838 756432 : file = task->file;
839 756432 : file_pos = task->file_pos;
840 756432 : read_size = task->read_size;
841 :
842 : /* by default no rehash in case of "continue" */
843 756432 : rehandle[diskcur].block = 0;
844 :
845 : /* if the disk position is not used */
846 756432 : if (!disk)
847 111489 : continue;
848 :
849 756432 : state_usage_file(state, disk, file);
850 :
851 : /* get the state of the block */
852 756432 : block_state = block_state_get(block);
853 :
854 : /* if the block has invalid parity, */
855 : /* we have to take care of it in case of recover */
856 756432 : if (block_has_invalid_parity(block)) {
857 : /* store it in the failed set, because */
858 : /* the parity may be still computed with the previous content */
859 182050 : failed[failed_count].index = diskcur;
860 182050 : failed[failed_count].size = state->block_size;
861 182050 : failed[failed_count].block = block;
862 182050 : ++failed_count;
863 :
864 : /* if the block has invalid parity, we have to update the parity */
865 : /* to include this block change */
866 : /* This also apply to CHG blocks, but we are going to handle */
867 : /* later this case to do the updates only if really needed */
868 182050 : if (block_state != BLOCK_STATE_CHG)
869 89762 : parity_needs_to_be_updated = 1;
870 :
871 : /* note that DELETE blocks are skipped in the next check */
872 : /* and we have to store them in the failed blocks */
873 : /* before skipping */
874 :
875 : /* follow */
876 : }
877 :
878 : /* if the block is not used */
879 756432 : if (!block_has_file(block))
880 110618 : continue;
881 :
882 : /* handle error conditions */
883 645814 : if (task->state == TASK_STATE_IOERROR) {
884 : /* LCOV_EXCL_START */
885 : ++io_error;
886 : goto bail;
887 : /* LCOV_EXCL_STOP */
888 : }
889 645814 : if (task->state == TASK_STATE_ERROR) {
890 : /* LCOV_EXCL_START */
891 : ++soft_error;
892 : goto bail;
893 : /* LCOV_EXCL_STOP */
894 : }
895 645814 : if (task->state == TASK_STATE_ERROR_CONTINUE) {
896 18 : ++soft_error;
897 18 : error_on_this_block = 1;
898 18 : continue;
899 : }
900 645796 : if (task->state == TASK_STATE_IOERROR_CONTINUE) {
901 0 : ++io_error;
902 0 : if (io_error >= state->opt.io_error_limit) {
903 : /* LCOV_EXCL_START */
904 : log_fatal(EIO, "DANGER! Too many input/output errors in the %s disk. It isn't possible to continue.\n", disk->dir);
905 : log_fatal(EIO, "Stopping at block %u\n", blockcur);
906 : goto bail;
907 : /* LCOV_EXCL_STOP */
908 : }
909 :
910 : /* otherwise continue */
911 0 : io_error_on_this_block = 1;
912 0 : continue;
913 : }
914 645796 : if (task->state != TASK_STATE_DONE) {
915 : /* LCOV_EXCL_START */
916 : log_fatal(EINTERNAL, "Internal inconsistency in task state\n");
917 : os_abort();
918 : /* LCOV_EXCL_STOP */
919 : }
920 :
921 645796 : countsize += read_size;
922 :
923 : /* now compute the hash */
924 645796 : if (rehash) {
925 27200 : memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
926 :
927 : /* compute the new hash, and store it */
928 27200 : rehandle[diskcur].block = block;
929 27200 : memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
930 : } else {
931 618596 : memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
932 : }
933 :
934 : /* until now is hash */
935 645796 : state_usage_hash(state);
936 :
937 645796 : if (block_has_updated_hash(block)) {
938 : /* compare the hash */
939 553526 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
940 : /* if the file has invalid parity, it's a REP changed during the sync */
941 853 : if (block_has_invalid_parity(block)) {
942 1 : log_tag("error:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
943 1 : log_error(ESOFT, "Data change at file '%s' at position '%u'\n", task->path, file_pos);
944 1 : log_error(ESOFT, "WARNING! Unexpected data modification of a file without parity!\n");
945 :
946 1 : if (file_flag_has(file, FILE_IS_COPY)) {
947 1 : log_error(ESOFT, "This file was detected as a copy of another file with the same name, size,\n");
948 1 : log_error(ESOFT, "and timestamp, but the file data isn't matching the assumed copy.\n");
949 1 : log_error(ESOFT, "If this is a false positive, and the files are expected to be different,\n");
950 1 : log_error(ESOFT, "you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
951 : } else {
952 0 : log_error(ESOFT, "Try removing the file from the array and rerun the 'sync' command!\n");
953 : }
954 :
955 1 : ++soft_error;
956 :
957 : /* if the file is changed, it means that it was modified during sync */
958 : /* this isn't a serious error, so we skip this block, and continue with others */
959 1 : error_on_this_block = 1;
960 1 : continue;
961 852 : } else { /* otherwise it's a BLK with silent error */
962 852 : unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
963 852 : log_tag("error_data:%u:%s:%s: Data error at position %u, diff hash bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
964 852 : log_error(EDATA, "Data error in file '%s' at position '%u', diff hash bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
965 :
966 : /* save the failed block for the fix */
967 852 : failed[failed_count].index = diskcur;
968 852 : failed[failed_count].size = read_size;
969 852 : failed[failed_count].block = block;
970 852 : ++failed_count;
971 :
972 : /* silent errors are very rare, and are not a signal that a disk */
973 : /* is going to fail. So, we just continue marking the block as bad */
974 : /* just like in scrub */
975 852 : ++silent_error;
976 852 : silent_error_on_this_block = 1;
977 852 : continue;
978 : }
979 : }
980 : } else {
981 : /* if until now the parity doesn't need to be updated */
982 92270 : if (!parity_needs_to_be_updated) {
983 : /* for sure it's a CHG block, because EMPTY are processed before with "continue" */
984 : /* and BLK and REP have "block_has_updated_hash()" as 1, and all the others */
985 : /* have "parity_needs_to_be_updated" already at 1 */
986 35190 : assert(block_state_get(block) == BLOCK_STATE_CHG);
987 :
988 : /* if the hash represents the data unequivocally */
989 35190 : if (hash_is_unique(block->hash)) {
990 : /* check if the hash is changed */
991 6250 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
992 : /* the block is different, and we must update parity */
993 5087 : parity_needs_to_be_updated = 1;
994 : }
995 : } else {
996 : /* if the hash is already invalid, we update parity */
997 28940 : parity_needs_to_be_updated = 1;
998 : }
999 : }
1000 :
1001 : /* copy the hash in the block, but doesn't mark the block as hashed */
1002 : /* this allow in case of skipped block to do not save the failed computation */
1003 92270 : memcpy(block->hash, hash, BLOCK_HASH_SIZE);
1004 :
1005 : /* note that in case of rehash, this is the wrong hash, */
1006 : /* but it will be overwritten later */
1007 : }
1008 : }
1009 :
1010 : /* if we have only silent errors we can try to fix them on-the-fly */
1011 : /* note the fix is not written to disk, but used only to */
1012 : /* compute the new parity */
1013 126072 : if (!error_on_this_block && !io_error_on_this_block && silent_error_on_this_block) {
1014 : unsigned failed_mac;
1015 852 : int something_to_recover = 0;
1016 :
1017 : /* sort the failed vector */
1018 : /* because with threads it may be in any order */
1019 : /* but RAID requires the indexes to be sorted */
1020 852 : qsort(failed, failed_count, sizeof(failed[0]), failed_compare_by_index);
1021 :
1022 : /* setup the blocks to recover */
1023 852 : failed_mac = 0;
1024 2556 : for (j = 0; j < failed_count; ++j) {
1025 1704 : unsigned char* block_buffer = buffer[failed[j].index];
1026 1704 : unsigned char* block_copy = copy[failed[j].index];
1027 1704 : unsigned block_state = block_state_get(failed[j].block);
1028 :
1029 : /* we try to recover only if at least one BLK is present */
1030 1704 : if (block_state == BLOCK_STATE_BLK)
1031 852 : something_to_recover = 1;
1032 :
1033 : /* save a copy of the content just read */
1034 : /* that it's going to be overwritten by the recovering function */
1035 1704 : memcpy(block_copy, block_buffer, state->block_size);
1036 :
1037 1704 : if (block_state == BLOCK_STATE_CHG
1038 852 : && hash_is_zero(failed[j].block->hash)
1039 : ) {
1040 : /* if the block was filled with 0, restore this state */
1041 : /* and avoid to recover it */
1042 0 : memset(block_buffer, 0, state->block_size);
1043 : } else {
1044 : /* if we have too many failures, we cannot recover */
1045 1704 : if (failed_mac >= state->level)
1046 0 : break;
1047 :
1048 : /* otherwise it has to be recovered */
1049 1704 : failed_map[failed_mac++] = failed[j].index;
1050 : }
1051 : }
1052 :
1053 : /* if we have something to recover and enough parity */
1054 852 : if (something_to_recover && j == failed_count) {
1055 : /* until now is misc */
1056 852 : state_usage_misc(state);
1057 :
1058 : /* read the parity */
1059 : /* we are sure that parity exists because */
1060 : /* we have at least one BLK block */
1061 5964 : for (l = 0; l < state->level; ++l) {
1062 5112 : ret = parity_read(&parity_handle[l], blockcur, buffer[diskmax + l], state->block_size, log_error);
1063 5112 : if (ret == -1) {
1064 : /* LCOV_EXCL_START */
1065 : log_tag("parity_%s:%u:%s: Read error. %s.\n", es(errno), blockcur, lev_config_name(l), strerror(errno));
1066 : if (is_hw(errno)) {
1067 : log_error_errno(errno, lev_config_name(l));
1068 : if (io_error >= state->opt.io_error_limit) {
1069 : log_fatal(errno, "DANGER! Too many input/output errors in the %s disk. It isn't possible to continue.\n", lev_config_name(l));
1070 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1071 : ++io_error;
1072 : goto bail;
1073 : }
1074 :
1075 : ++io_error;
1076 : io_error_on_this_block = 1;
1077 : continue;
1078 : }
1079 :
1080 : log_fatal_errno(errno, lev_config_name(l));
1081 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1082 : ++soft_error;
1083 : goto bail;
1084 : /* LCOV_EXCL_STOP */
1085 : }
1086 :
1087 : /* until now is parity */
1088 5112 : state_usage_parity(state, &l, 1);
1089 : }
1090 :
1091 : /* if no error in parity read */
1092 852 : if (!io_error_on_this_block) {
1093 : /* try to fix the data */
1094 : /* note that this is a simple fix algorithm, that doesn't take into */
1095 : /* account the case of a wrong parity */
1096 : /* only 'fix' supports the most advanced fixing */
1097 852 : raid_rec(failed_mac, failed_map, diskmax, state->level, state->block_size, buffer);
1098 :
1099 : /* until now is raid */
1100 852 : state_usage_raid(state);
1101 :
1102 : /* check the result and prepare the data */
1103 2556 : for (j = 0; j < failed_count; ++j) {
1104 : unsigned char hash[HASH_MAX];
1105 1704 : unsigned char* block_buffer = buffer[failed[j].index];
1106 1704 : unsigned char* block_copy = copy[failed[j].index];
1107 1704 : unsigned block_state = block_state_get(failed[j].block);
1108 :
1109 1704 : if (block_state == BLOCK_STATE_BLK) {
1110 852 : unsigned size = failed[j].size;
1111 :
1112 : /* compute the hash of the recovered block */
1113 852 : if (rehash) {
1114 0 : memhash(state->prevhash, state->prevhashseed, hash, block_buffer, size);
1115 : } else {
1116 852 : memhash(state->hash, state->hashseed, hash, block_buffer, size);
1117 : }
1118 :
1119 : /* until now is hash */
1120 852 : state_usage_hash(state);
1121 :
1122 : /* if the hash doesn't match */
1123 852 : if (memcmp(hash, failed[j].block->hash, BLOCK_HASH_SIZE) != 0) {
1124 : /* we have not recovered */
1125 0 : break;
1126 : }
1127 :
1128 : /* pad with 0 if needed */
1129 852 : if (size < state->block_size)
1130 333 : memset(block_buffer + size, 0, state->block_size - size);
1131 : } else {
1132 : /* otherwise restore the content */
1133 : /* because we are not interested in the old state */
1134 : /* that it's recovered for CHG, REP and DELETED blocks */
1135 852 : memcpy(block_buffer, block_copy, state->block_size);
1136 : }
1137 : }
1138 :
1139 : /* if all is processed, we have fixed it */
1140 852 : if (j == failed_count)
1141 852 : fixed_error_on_this_block = 1;
1142 : }
1143 : }
1144 : }
1145 :
1146 : /* if we have read all the data required and it's correct, proceed with the parity */
1147 126072 : if (!error_on_this_block && !io_error_on_this_block
1148 126053 : && (!silent_error_on_this_block || fixed_error_on_this_block)
1149 : ) {
1150 : /* update the parity only if really needed */
1151 126053 : if (parity_needs_to_be_updated) {
1152 : /* compute the parity */
1153 125127 : raid_gen(diskmax, state->level, state->block_size, buffer);
1154 :
1155 : /* until now is raid */
1156 125127 : state_usage_raid(state);
1157 :
1158 : /* mark that the parity is going to be written */
1159 125127 : parity_going_to_be_updated = 1;
1160 : }
1161 :
1162 : /* for each disk, mark the blocks as processed */
1163 882371 : for (j = 0; j < diskmax; ++j) {
1164 : struct snapraid_block* block;
1165 :
1166 756318 : if (!handle[j].disk)
1167 0 : continue;
1168 :
1169 756318 : block = fs_par2block_find(handle[j].disk, blockcur);
1170 :
1171 756318 : if (block == BLOCK_NULL) {
1172 : /* nothing to do */
1173 97829 : continue;
1174 : }
1175 :
1176 : /* if it's a deleted block */
1177 658489 : if (block_state_get(block) == BLOCK_STATE_DELETED) {
1178 : /* the parity is now updated without this block, so it's now empty */
1179 12787 : fs_deallocate(handle[j].disk, blockcur);
1180 12787 : continue;
1181 : }
1182 :
1183 : /* now all the blocks have the hash and the parity computed */
1184 645702 : block_state_set(block, BLOCK_STATE_BLK);
1185 : }
1186 :
1187 : /* we update the info block only if we really have updated the parity */
1188 : /* because otherwise the time/justsynced info would be misleading as we didn't */
1189 : /* wrote the parity at this time */
1190 : /* we also update the info block only if no silent error was found */
1191 : /* because has no sense to refresh the time for data that we know bad */
1192 126053 : if (parity_needs_to_be_updated
1193 125127 : && !silent_error_on_this_block
1194 : ) {
1195 : /* if rehash is needed */
1196 124276 : if (rehash) {
1197 : /* store all the new hash already computed */
1198 32459 : for (j = 0; j < diskmax; ++j) {
1199 27822 : if (rehandle[j].block)
1200 27200 : memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
1201 : }
1202 : }
1203 :
1204 : /* update the time info of the block */
1205 : /* we are also clearing any previous bad and rehash flag */
1206 124276 : info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 1));
1207 : }
1208 : }
1209 :
1210 : /* if a silent (even if corrected) or input/output error was found */
1211 : /* mark the block as bad to have check/fix to handle it */
1212 : /* because our correction is in memory only and not yet written */
1213 126072 : if (silent_error_on_this_block || io_error_on_this_block) {
1214 : /* set the error status keeping the other info */
1215 852 : info_set(&state->infoarr, blockcur, info_set_bad(info));
1216 : }
1217 :
1218 : /* finally schedule parity write */
1219 : /* Note that the calls to io_parity_write() are mandatory */
1220 : /* even if the parity doesn't need to be updated */
1221 : /* This because we want to keep track of the time usage */
1222 126072 : state_usage_misc(state);
1223 :
1224 : /* write start */
1225 126072 : io_write_preset(&io, blockcur, !parity_going_to_be_updated);
1226 :
1227 : /* write the parity */
1228 705324 : for (l = 0; l < state->level; ++l) {
1229 : unsigned levcur;
1230 :
1231 579252 : io_parity_write(&io, &levcur, waiting_map, &waiting_mac);
1232 :
1233 : /* until now is parity */
1234 579252 : state_usage_parity(state, waiting_map, waiting_mac);
1235 : }
1236 :
1237 : /* write finished */
1238 126072 : io_write_next(&io, blockcur, !parity_going_to_be_updated, writer_error);
1239 :
1240 : /* handle errors reported */
1241 630360 : for (j = 0; j < IO_WRITER_ERROR_MAX; ++j) {
1242 504288 : if (writer_error[j]) {
1243 0 : switch (j + IO_WRITER_ERROR_BASE) {
1244 0 : case TASK_STATE_IOERROR_CONTINUE :
1245 0 : ++io_error;
1246 0 : if (io_error >= state->opt.io_error_limit) {
1247 : /* LCOV_EXCL_START */
1248 : log_fatal(EIO, "DANGER! Too many input/output errors in a parity disk. It isn't possible to continue.\n");
1249 : log_fatal(EIO, "Stopping at block %u\n", blockcur);
1250 : goto bail;
1251 : /* LCOV_EXCL_STOP */
1252 : }
1253 0 : break;
1254 0 : case TASK_STATE_ERROR_CONTINUE :
1255 0 : ++soft_error;
1256 0 : break;
1257 0 : case TASK_STATE_IOERROR :
1258 : /* LCOV_EXCL_START */
1259 : ++io_error;
1260 : goto bail;
1261 : /* LCOV_EXCL_STOP */
1262 0 : case TASK_STATE_ERROR :
1263 : /* LCOV_EXCL_START */
1264 : ++soft_error;
1265 : goto bail;
1266 : /* LCOV_EXCL_STOP */
1267 : }
1268 : }
1269 : }
1270 :
1271 : /* mark the state as needing write */
1272 126072 : state->need_write = 1;
1273 :
1274 : /* count the number of processed block */
1275 126072 : ++countpos;
1276 :
1277 : /* progress */
1278 126072 : if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
1279 : /* LCOV_EXCL_START */
1280 : break;
1281 : /* LCOV_EXCL_STOP */
1282 : }
1283 :
1284 : /* thermal control */
1285 126072 : if (state_thermal_alarm(state)) {
1286 : /* until now is misc */
1287 0 : state_usage_misc(state);
1288 :
1289 0 : state_progress_stop(state);
1290 :
1291 : /* before spinning down flush all the caches */
1292 0 : ret = state_flush(state, &io, parity_handle, blockcur);
1293 0 : if (ret == -1) {
1294 : /* LCOV_EXCL_START */
1295 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1296 : ++io_error;
1297 : goto bail;
1298 : /* LCOV_EXCL_STOP */
1299 : }
1300 :
1301 0 : state_thermal_cooldown(state);
1302 :
1303 0 : state_progress_restart(state);
1304 :
1305 : /* drop until now */
1306 0 : state_usage_waste(state);
1307 : }
1308 :
1309 : /* autosave */
1310 126072 : if ((state->autosave != 0
1311 35436 : && autosavedone >= autosavelimit /* if we have reached the limit */
1312 0 : && autosavemissing >= autosavelimit) /* if we have at least a full step to do */
1313 : /* or if we have a forced autosave at the specified block */
1314 126072 : || (state->opt.force_autosave_at != 0 && state->opt.force_autosave_at == blockcur)
1315 : ) {
1316 1 : autosavedone = 0; /* restart the counter */
1317 :
1318 : /* until now is misc */
1319 1 : state_usage_misc(state);
1320 :
1321 1 : state_progress_stop(state);
1322 :
1323 1 : msg_progress("Autosaving...\n");
1324 :
1325 : /* before writing the new content file we ensure that */
1326 : /* the parity is really written flushing the disk cache */
1327 1 : ret = state_flush(state, &io, parity_handle, blockcur);
1328 1 : if (ret == -1) {
1329 : /* LCOV_EXCL_START */
1330 : log_fatal(EIO, "Stopping at block %u\n", blockcur);
1331 : ++io_error;
1332 : goto bail;
1333 : /* LCOV_EXCL_STOP */
1334 : }
1335 :
1336 : /* now we can safely write the content file */
1337 1 : state_write(state);
1338 :
1339 1 : state_progress_restart(state);
1340 :
1341 : /* drop until now */
1342 1 : state_usage_waste(state);
1343 : }
1344 : }
1345 :
1346 92 : end:
1347 92 : state_progress_end(state, countpos, countmax, countsize, "Nothing to sync.\n");
1348 :
1349 : /* before returning we ensure that */
1350 : /* the parity is really written flushing the disk cache */
1351 92 : ret = state_flush(state, &io, parity_handle, blockcur);
1352 92 : if (ret == -1) {
1353 : /* LCOV_EXCL_START */
1354 : log_fatal(errno, "Stopping at block %u\n", blockcur);
1355 : ++io_error;
1356 : goto bail;
1357 : /* LCOV_EXCL_STOP */
1358 : }
1359 :
1360 : /* save the new state if required */
1361 92 : if (!state->opt.kill_after_sync) {
1362 82 : if ((state->need_write || state->opt.force_content_write))
1363 64 : state_write(state);
1364 : } else {
1365 10 : log_fatal(EUSER, "WARNING! Skipped writing state due to --test-kill-after-sync option.\n");
1366 : }
1367 :
1368 92 : state_usage_print(state);
1369 :
1370 92 : if (soft_error || silent_error || io_error) {
1371 12 : msg_status("\n");
1372 12 : msg_status("%8u soft errors\n", soft_error);
1373 12 : msg_status("%8u io errors\n", io_error);
1374 12 : msg_status("%8u data errors\n", silent_error);
1375 : } else {
1376 : /* print the result only if processed something */
1377 80 : if (countpos != 0)
1378 60 : msg_status("Everything OK\n");
1379 : }
1380 :
1381 92 : if (soft_error)
1382 11 : log_fatal(ESOFT, "WARNING! Unexpected soft errors!\n");
1383 92 : if (io_error)
1384 0 : log_fatal(EIO, "DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
1385 92 : if (silent_error)
1386 1 : log_fatal(EDATA, "DANGER! Unexpected silent data errors! The failing blocks are now marked as bad!\n");
1387 92 : if (io_error || silent_error) {
1388 1 : log_fatal(ESOFT, "Use 'snapraid status' to list the bad blocks.\n");
1389 1 : log_fatal(ESOFT, "Use 'snapraid -e fix' to recover.\n");
1390 : }
1391 :
1392 92 : log_tag("summary:error_soft:%u\n", soft_error);
1393 92 : log_tag("summary:error_io:%u\n", io_error);
1394 92 : log_tag("summary:error_data:%u\n", silent_error);
1395 92 : if (soft_error + silent_error + io_error == 0)
1396 80 : log_tag("summary:exit:ok\n");
1397 12 : else if (silent_error + io_error == 0)
1398 11 : log_tag("summary:exit:warning\n");
1399 : else
1400 1 : log_tag("summary:exit:error\n");
1401 92 : log_flush();
1402 :
1403 92 : bail:
1404 : /* stop all the worker threads */
1405 92 : io_stop(&io);
1406 :
1407 644 : for (j = 0; j < diskmax; ++j) {
1408 552 : struct snapraid_file* file = handle[j].file;
1409 552 : struct snapraid_disk* disk = handle[j].disk;
1410 552 : ret = handle_close(&handle[j]);
1411 552 : if (ret == -1) {
1412 : /* LCOV_EXCL_START */
1413 : log_tag("%s:%u:%s:%s: Close error. %s.\n", es(errno), blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
1414 : log_fatal_errno(errno, disk->name);
1415 :
1416 : if (is_hw(errno)) {
1417 : ++io_error;
1418 : } else {
1419 : ++soft_error;
1420 : }
1421 : /* continue, as we are already exiting */
1422 : /* LCOV_EXCL_STOP */
1423 : }
1424 : }
1425 :
1426 92 : free(handle);
1427 92 : free(zero_alloc);
1428 92 : free(copy_alloc);
1429 92 : free(copy);
1430 92 : free(rehandle_alloc);
1431 92 : free(failed);
1432 92 : free(failed_map);
1433 92 : free(waiting_map);
1434 92 : io_done(&io);
1435 92 : free(block_enabled);
1436 :
1437 92 : if (state->opt.expect_recoverable) {
1438 1 : if (soft_error + silent_error + io_error == 0)
1439 0 : return -1;
1440 : } else {
1441 91 : if (soft_error + silent_error + io_error != 0)
1442 11 : return -1;
1443 : }
1444 :
1445 81 : if (alert < 0)
1446 0 : return -1;
1447 :
1448 81 : return 0;
1449 : }
1450 :
1451 96 : int state_sync(struct snapraid_state* state, block_off_t blockstart, block_off_t blockcount)
1452 : {
1453 : block_off_t blockmax;
1454 : block_off_t used_paritymax;
1455 : block_off_t file_paritymax;
1456 : data_off_t size;
1457 : int ret;
1458 : struct snapraid_parity_handle parity_handle[LEV_MAX];
1459 : unsigned process_error;
1460 : unsigned l;
1461 96 : int skip_sync = 0;
1462 :
1463 96 : msg_progress("Initializing...\n");
1464 :
1465 96 : blockmax = parity_allocated_size(state);
1466 96 : size = blockmax * (data_off_t)state->block_size;
1467 :
1468 : /* minimum size of the parity files we expect */
1469 96 : used_paritymax = parity_used_size(state);
1470 :
1471 : /* effective size of the parity files */
1472 96 : file_paritymax = 0;
1473 :
1474 96 : if (blockstart > blockmax) {
1475 : /* LCOV_EXCL_START */
1476 : log_fatal(EUSER, "Error in the starting block %u. It is larger than the parity size %u.\n", blockstart, blockmax);
1477 : exit(EXIT_FAILURE);
1478 : /* LCOV_EXCL_STOP */
1479 : }
1480 :
1481 : /* adjust the number of block to process */
1482 96 : if (blockcount != 0 && blockstart + blockcount < blockmax) {
1483 6 : blockmax = blockstart + blockcount;
1484 : }
1485 :
1486 627 : for (l = 0; l < state->level; ++l) {
1487 : data_off_t out_size;
1488 : block_off_t parityblocks;
1489 :
1490 : /* create the file and open for writing */
1491 531 : ret = parity_create(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
1492 531 : if (ret == -1) {
1493 : /* LCOV_EXCL_START */
1494 : log_tag("parity_%s:%u:%s: Create error. %s.\n", es(errno), 0, lev_config_name(l), strerror(errno));
1495 : log_fatal_errno(errno, lev_config_name(l));
1496 : exit(EXIT_FAILURE);
1497 : /* LCOV_EXCL_STOP */
1498 : }
1499 :
1500 : /* number of block in the parity file */
1501 531 : parity_size(&parity_handle[l], &out_size);
1502 531 : parityblocks = out_size / state->block_size;
1503 :
1504 : /* if the file is too small */
1505 531 : if (parityblocks < used_paritymax) {
1506 0 : log_fatal(ESOFT, "WARNING! The %s parity has only %u blocks instead of %u.\n", lev_name(l), parityblocks, used_paritymax);
1507 : }
1508 :
1509 : /* keep the smallest parity number of blocks */
1510 531 : if (l == 0 || file_paritymax > parityblocks)
1511 97 : file_paritymax = parityblocks;
1512 : }
1513 :
1514 : /* if we do a full parity realloc or computation, having a wrong parity size is expected */
1515 96 : if (!state->opt.force_realloc && !state->opt.force_full) {
1516 : /* if the parities are too small */
1517 86 : if (file_paritymax < used_paritymax) {
1518 : /* LCOV_EXCL_START */
1519 : log_fatal(ESOFT, "DANGER! One or more the parity files are smaller than expected!\n");
1520 : if (file_paritymax != 0) {
1521 : log_fatal(ESOFT, "If this happens because you are using an old content file,\n");
1522 : log_fatal(ESOFT, "you can 'sync' anyway using 'snapraid --force-full sync'\n");
1523 : log_fatal(ESOFT, "to force a full rebuild of the parity.\n");
1524 : } else {
1525 : log_fatal(ESOFT, "It's possible that the parity disks are not mounted.\n");
1526 : log_fatal(ESOFT, "If instead you are adding a new parity level, you can 'sync' using\n");
1527 : log_fatal(ESOFT, "'snapraid --force-full sync' to force a full rebuild of the parity.\n");
1528 : }
1529 : exit(EXIT_FAILURE);
1530 : /* LCOV_EXCL_STOP */
1531 : }
1532 : }
1533 :
1534 96 : process_error = 0;
1535 :
1536 96 : if (state->opt.prehash) {
1537 7 : msg_progress("Hashing...\n");
1538 :
1539 7 : ret = state_hash_process(state, blockstart, blockmax, &skip_sync);
1540 7 : if (ret == -1) {
1541 : /* LCOV_EXCL_START */
1542 : ++process_error;
1543 : /* continue, in case also doing the sync if ::skip_sync is not set */
1544 : /* LCOV_EXCL_STOP */
1545 : }
1546 : }
1547 :
1548 96 : if (!skip_sync) {
1549 95 : msg_progress("Resizing...\n");
1550 :
1551 : /* now change the size of all parities */
1552 620 : for (l = 0; l < state->level; ++l) {
1553 : int is_modified;
1554 :
1555 : /* change the size of the parity file, truncating or extending it */
1556 : /* from this point all the DELETED blocks after the end of the parity are invalid */
1557 : /* and they are automatically removed when we save the new content file */
1558 525 : ret = parity_chsize(&parity_handle[l], &state->parity[l], &is_modified, size, state->block_size, state->opt.skip_fallocate, state->opt.skip_space_holder);
1559 525 : if (ret == -1) {
1560 : /* LCOV_EXCL_START */
1561 : data_off_t out_size;
1562 : parity_size(&parity_handle[l], &out_size);
1563 : parity_overflow(state, out_size);
1564 : log_fatal(errno, "WARNING! Without a usable %s file, it isn't possible to sync.\n", lev_name(l));
1565 : exit(EXIT_FAILURE);
1566 : /* LCOV_EXCL_STOP */
1567 : }
1568 :
1569 525 : if (is_modified)
1570 199 : state->need_write = 1;
1571 : }
1572 :
1573 : /* after resizing parity files, refresh again the free info */
1574 95 : state_refresh(state);
1575 :
1576 : /**
1577 : * Save the new state before the sync but after the hashing phase
1578 : *
1579 : * This allows to recover after an aborted sync, and at the same time
1580 : * it allows to recover broken copied/moved files identified in the
1581 : * hashing phase.
1582 : *
1583 : * For example, think at this case:
1584 : * - Add some files at the array
1585 : * - Run a sync command, it will recompute the parity adding the new files
1586 : * - Abort the sync command before it stores the new content file
1587 : * - Delete the not yet synced files from the array
1588 : * - Run a new sync command
1589 : *
1590 : * The sync command has no way to know that the parity file was modified
1591 : * because the files triggering these changes are now deleted and they aren't
1592 : * listed in the content file.
1593 : * Instead, saving the new content file in advance, keeps track of all the parity
1594 : * that may be modified.
1595 : */
1596 95 : if (!state->opt.skip_content_write) {
1597 95 : if (state->need_write)
1598 84 : state_write(state);
1599 : } else {
1600 0 : log_fatal(EUSER, "WARNING! Skipped state write for --test-skip-content-write option.\n");
1601 : }
1602 :
1603 : /* skip degenerated cases of empty parity, or skipping all */
1604 95 : if (blockstart < blockmax) {
1605 92 : ret = state_sync_process(state, parity_handle, blockstart, blockmax);
1606 92 : if (ret == -1) {
1607 : /* LCOV_EXCL_START */
1608 : ++process_error;
1609 : /* continue, as we are already exiting */
1610 : /* LCOV_EXCL_STOP */
1611 : }
1612 : } else {
1613 3 : msg_status("Nothing to sync.\n");
1614 : }
1615 : }
1616 :
1617 627 : for (l = 0; l < state->level; ++l) {
1618 531 : ret = parity_close(&parity_handle[l]);
1619 531 : if (ret == -1) {
1620 : /* LCOV_EXCL_START */
1621 : log_tag("parity_%s:%u:%s: Close error. %s.\n", es(errno), blockmax, lev_config_name(l), strerror(errno));
1622 : log_fatal_errno(errno, lev_config_name(l));
1623 :
1624 : ++process_error;
1625 : /* continue, as we are already exiting */
1626 : /* LCOV_EXCL_STOP */
1627 : }
1628 : }
1629 :
1630 96 : if (process_error != 0)
1631 12 : return -1;
1632 84 : return 0;
1633 : }
1634 :
|