Line data Source code
1 : /*
2 : * Copyright (C) 2011 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 : */
17 :
18 : #include "portable.h"
19 :
20 : #include "support.h"
21 : #include "elem.h"
22 : #include "state.h"
23 : #include "parity.h"
24 : #include "handle.h"
25 : #include "io.h"
26 : #include "raid/raid.h"
27 :
28 : /****************************************************************************/
29 : /* hash */
30 :
/**
 * Hash pass over the blocks in the REP and CHG states.
 *
 * For every mapped disk and every block index in [blockstart, blockmax),
 * the blocks whose state is BLOCK_STATE_REP or BLOCK_STATE_CHG are read
 * from their file and hashed:
 * - for a REP block the computed hash is compared with block->hash; a
 *   mismatch is counted as a data (silent) error and sets *skip_sync;
 * - for a CHG block the computed hash is stored in block->hash, the block
 *   becomes REP and state->need_write is set.
 *
 * Open errors ENOENT/EACCES and file attribute changes (size, mtime, inode)
 * are counted as file errors and only skip the block. EIO errors and any
 * other unexpected open/read/close error jump to the bail path, which sets
 * *skip_sync and closes the handles.
 *
 * countpos is advanced only for blocks processed without error.
 *
 * @param state State of the array.
 * @param blockstart First block index to process.
 * @param blockmax One past the last block index to process.
 * @param skip_sync Set to 1 when the following sync must not run. It is
 * never reset to 0 here.
 * @return 0 if no error of any kind was counted, -1 otherwise.
 */
31 7 : static int state_hash_process(struct snapraid_state* state, block_off_t blockstart, block_off_t blockmax, int* skip_sync)
32 : {
33 : struct snapraid_handle* handle;
34 : unsigned diskmax;
35 : block_off_t i;
36 : unsigned j;
37 : void* buffer;
38 : void* buffer_alloc;
39 : data_off_t countsize;
40 : block_off_t countpos;
41 : block_off_t countmax;
42 : int ret;
43 : unsigned error;
44 : unsigned silent_error;
45 : unsigned io_error;
46 : char esc_buffer[ESC_MAX];
47 :
48 : /* maps the disks to handles */
49 7 : handle = handle_mapping(state, &diskmax);
50 :
51 : /* buffer for reading */
52 7 : buffer = malloc_nofail_direct(state->block_size, &buffer_alloc);
53 7 : if (!state->opt.skip_self)
54 0 : mtest_vector(1, state->block_size, &buffer);
55 :
56 7 : error = 0;
57 7 : silent_error = 0;
58 7 : io_error = 0;
59 :
60 : /* first count the number of blocks to process */
/* this uses the same REP/CHG filter as the processing loop below */
61 7 : countmax = 0;
62 49 : for (j = 0; j < diskmax; ++j) {
63 42 : struct snapraid_disk* disk = handle[j].disk;
64 :
65 : /* if no disk, nothing to check */
66 42 : if (!disk)
67 0 : continue;
68 :
69 236670 : for (i = blockstart; i < blockmax; ++i) {
70 : struct snapraid_block* block;
71 : unsigned block_state;
72 :
73 236628 : block = fs_par2block_find(disk, i);
74 :
75 : /* get the state of the block */
76 236628 : block_state = block_state_get(block);
77 :
78 : /* process REP and CHG blocks */
79 236628 : if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
80 223278 : continue;
81 :
82 13350 : ++countmax;
83 : }
84 : }
85 :
86 : /* drop until now */
87 7 : state_usage_waste(state);
88 :
89 7 : countsize = 0;
90 7 : countpos = 0;
91 7 : if (!state_progress_begin(state, blockstart, blockmax, countmax))
92 0 : goto end;
93 :
/* disks are processed one after the other, each one over the full block range */
94 49 : for (j = 0; j < diskmax; ++j) {
95 42 : struct snapraid_disk* disk = handle[j].disk;
96 :
97 : /* if no disk, nothing to check */
98 42 : if (!disk)
99 0 : continue;
100 :
101 236670 : for (i = blockstart; i < blockmax; ++i) {
102 : snapraid_info info;
103 : int rehash;
104 : struct snapraid_block* block;
105 : int read_size;
106 : unsigned char hash[HASH_MAX];
107 : unsigned block_state;
108 : struct snapraid_file* file;
109 : block_off_t file_pos;
110 :
111 236628 : block = fs_par2block_find(disk, i);
112 :
113 : /* get the state of the block */
114 236628 : block_state = block_state_get(block);
115 :
116 : /* process REP and CHG blocks */
117 236628 : if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
118 223288 : continue;
119 :
120 : /* get the file of this block */
121 13350 : file = fs_par2file_get(disk, i, &file_pos);
122 :
123 : /* get block specific info */
124 13350 : info = info_get(&state->infoarr, i);
125 :
126 : /* if we have to use the old hash */
127 13350 : rehash = info_get_rehash(info);
128 :
129 : /* until now is misc */
130 13350 : state_usage_misc(state);
131 :
132 : /* if the file is different than the current one, close it */
133 13350 : if (handle[j].file != 0 && handle[j].file != file) {
134 : /* keep a pointer at the file we are going to close for error reporting */
135 5366 : struct snapraid_file* report = handle[j].file;
136 5366 : ret = handle_close(&handle[j]);
137 5366 : if (ret == -1) {
138 : /* LCOV_EXCL_START */
139 : /* This one is really an unexpected error, because we are only reading */
140 : /* and closing a descriptor should never fail */
141 : if (errno == EIO) {
142 : log_tag("error:%u:%s:%s: Close EIO error. %s\n", i, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
143 : log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
144 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
145 : log_fatal("Stopping at block %u\n", i);
146 : ++io_error;
147 : goto bail;
148 : }
149 :
150 : log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
151 : log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
152 : log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
153 : log_fatal("Stopping at block %u\n", i);
154 : ++error;
155 : goto bail;
156 : /* LCOV_EXCL_STOP */
157 : }
158 : }
159 :
/* handle_open() is called for every block, also when handle[j].file is already this file */
/* NOTE(review): presumably a no-op in that case; confirm in the handle implementation */
160 13350 : ret = handle_open(&handle[j], file, state->file_mode, log_error, 0);
161 13350 : if (ret == -1) {
162 6 : if (errno == EIO) {
163 : /* LCOV_EXCL_START */
164 : log_tag("error:%u:%s:%s: Open EIO error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
165 : log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to sync.\n");
166 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
167 : log_fatal("Stopping at block %u\n", i);
168 : ++io_error;
169 : goto bail;
170 : /* LCOV_EXCL_STOP */
171 : }
172 :
173 6 : if (errno == ENOENT) {
174 2 : log_tag("error:%u:%s:%s: Open ENOENT error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
175 2 : log_error("Missing file '%s'.\n", handle[j].path);
176 2 : log_error("WARNING! You cannot modify data disk during a sync.\n");
177 2 : log_error("Rerun the sync command when finished.\n");
178 2 : ++error;
179 : /* if the file is missing, it means that it was removed during sync */
180 : /* this isn't a serious error, so we skip this block, and continue with others */
181 2 : continue;
182 : }
183 :
184 4 : if (errno == EACCES) {
185 4 : log_tag("error:%u:%s:%s: Open EACCES error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
186 4 : log_error("No access at file '%s'.\n", handle[j].path);
187 4 : log_error("WARNING! Please fix the access permission in the data disk.\n");
188 4 : log_error("Rerun the sync command when finished.\n");
189 4 : ++error;
190 : /* this isn't a serious error, so we skip this block, and continue with others */
191 4 : continue;
192 : }
193 :
194 : /* LCOV_EXCL_START */
195 : log_tag("error:%u:%s:%s: Open error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
196 : log_fatal("WARNING! Unexpected open error in a data disk, it isn't possible to sync.\n");
197 : log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
198 : log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
199 : ++error;
200 : goto bail;
201 : /* LCOV_EXCL_STOP */
202 : }
203 :
204 : /* check if the file is changed */
/* the stat data in the handle is compared with the size, mtime and inode recorded for the file */
205 13344 : if (handle[j].st.st_size != file->size
206 13343 : || handle[j].st.st_mtime != file->mtime_sec
207 13342 : || STAT_NSEC(&handle[j].st) != file->mtime_nsec
208 13342 : || handle[j].st.st_ino != file->inode
209 : ) {
210 3 : log_tag("error:%u:%s:%s: Unexpected attribute change\n", i, disk->name, esc_tag(file->sub, esc_buffer));
/* only the first differing attribute is reported: size, then time, then inode */
211 3 : if (handle[j].st.st_size != file->size) {
212 1 : log_error("Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle[j].path, file->size, (uint64_t)handle[j].st.st_size);
213 2 : } else if (handle[j].st.st_mtime != file->mtime_sec
214 1 : || STAT_NSEC(&handle[j].st) != file->mtime_nsec) {
215 1 : log_error("Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle[j].path, file->mtime_sec, file->mtime_nsec, (uint64_t)handle[j].st.st_mtime, STAT_NSEC(&handle[j].st));
216 : } else {
217 1 : log_error("Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", file->inode, (uint64_t)handle[j].st.st_ino, handle[j].path);
218 : }
219 3 : log_error("WARNING! You cannot modify files during a sync.\n");
220 3 : log_error("Rerun the sync command when finished.\n");
221 3 : ++error;
222 : /* if the file is changed, it means that it was modified during sync */
223 : /* this isn't a serious error, so we skip this block, and continue with others */
224 3 : continue;
225 : }
226 :
227 13341 : read_size = handle_read(&handle[j], file_pos, buffer, state->block_size, log_fatal, 0);
228 13341 : if (read_size == -1) {
229 : /* LCOV_EXCL_START */
230 : if (errno == EIO) {
231 : log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
232 : log_fatal("DANGER! Unexpected input/output read error in a data disk, it isn't possible to sync.\n");
233 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be read.\n", disk->dir, handle[j].path);
234 : log_fatal("Stopping at block %u\n", i);
235 : ++io_error;
236 : goto bail;
237 : }
238 :
239 : log_tag("error:%u:%s:%s: Read error at position %u. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
240 : log_fatal("WARNING! Unexpected read error in a data disk, it isn't possible to sync.\n");
241 : log_fatal("Ensure that file '%s' can be read.\n", handle[j].path);
242 : log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
243 : ++error;
244 : goto bail;
245 : /* LCOV_EXCL_STOP */
246 : }
247 :
248 : /* until now is disk */
249 13341 : state_usage_disk(state, handle, &j, 1);
250 :
251 13341 : state_usage_file(state, disk, file);
252 :
253 13341 : countsize += read_size;
254 :
255 : /* now compute the hash */
/* blocks flagged for rehash are hashed with the previous hash kind and seed */
/* only the read_size bytes actually read are hashed */
256 13341 : if (rehash) {
257 0 : memhash(state->prevhash, state->prevhashseed, hash, buffer, read_size);
258 : } else {
259 13341 : memhash(state->hash, state->hashseed, hash, buffer, read_size);
260 : }
261 :
262 : /* until now is hash */
263 13341 : state_usage_hash(state);
264 :
265 13341 : if (block_state == BLOCK_STATE_REP) {
266 : /* compare the hash */
267 13339 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
268 1 : log_tag("error:%u:%s:%s: Unexpected data change\n", i, disk->name, esc_tag(file->sub, esc_buffer));
269 1 : log_error("Data change at file '%s' at position '%u'\n", handle[j].path, file_pos);
270 1 : log_error("WARNING! Unexpected data modification of a file without parity!\n");
271 :
272 1 : if (file_flag_has(file, FILE_IS_COPY)) {
273 1 : log_error("This file was detected as a copy of another file with the same name, size,\n");
274 1 : log_error("and timestamp, but the file data isn't matching the assumed copy.\n");
275 1 : log_error("If this is a false positive, and the files are expected to be different,\n");
276 1 : log_error("you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
277 : } else {
278 0 : log_error("Try removing the file from the array and rerun the 'sync' command!\n");
279 : }
280 :
281 : /* block sync to allow a recovery before overwriting */
282 : /* the parity needed to make such recovery */
283 1 : *skip_sync = 1; /* avoid to run the next sync */
284 :
285 1 : ++silent_error;
286 1 : continue;
287 : }
288 : } else {
289 : /* the only other case is BLOCK_STATE_CHG */
290 2 : assert(block_state == BLOCK_STATE_CHG);
291 :
292 : /* copy the hash in the block */
293 2 : memcpy(block->hash, hash, BLOCK_HASH_SIZE);
294 :
295 : /* and mark the block as hashed */
296 2 : block_state_set(block, BLOCK_STATE_REP);
297 :
298 : /* mark the state as needing write */
299 2 : state->need_write = 1;
300 : }
301 :
302 : /* count the number of processed block */
303 13340 : ++countpos;
304 :
305 : /* progress */
/* note that the break below leaves only the per-block loop: */
/* the file of this disk is still closed and the per-disk loop goes on with the next disk */
306 13340 : if (state_progress(state, 0, i, countpos, countmax, countsize)) {
307 : /* LCOV_EXCL_START */
308 : *skip_sync = 1; /* avoid to run the next sync */
309 : break;
310 : /* LCOV_EXCL_STOP */
311 : }
312 : }
313 :
314 : /* close the last file in the disk */
/* no block index applies here, so blockmax is used in the messages */
315 42 : if (handle[j].file != 0) {
316 : /* keep a pointer at the file we are going to close for error reporting */
317 7 : struct snapraid_file* report = handle[j].file;
318 7 : ret = handle_close(&handle[j]);
319 7 : if (ret == -1) {
320 : /* LCOV_EXCL_START */
321 : /* This one is really an unexpected error, because we are only reading */
322 : /* and closing a descriptor should never fail */
323 : if (errno == EIO) {
324 : log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
325 : log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
326 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
327 : log_fatal("Stopping at block %u\n", blockmax);
328 : ++io_error;
329 : goto bail;
330 : }
331 :
332 : log_tag("error:%u:%s:%s: Close error. %s\n", blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
333 : log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
334 : log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
335 : log_fatal("Stopping at block %u\n", blockmax);
336 : ++error;
337 : goto bail;
338 : /* LCOV_EXCL_STOP */
339 : }
340 : }
341 : }
342 :
/* normal completion: reached by falling out of the loops or when state_progress_begin() returns 0 */
343 7 : end:
344 7 : state_progress_end(state, countpos, countmax, countsize, "Nothing to hash.\n");
345 :
346 : /* note that at this point no io_error is possible */
347 : /* because at the first one we bail out */
348 7 : assert(io_error == 0);
349 :
350 7 : if (error || io_error || silent_error) {
351 6 : msg_status("\n");
352 6 : msg_status("%8u file errors\n", error);
353 6 : msg_status("%8u io errors\n", io_error);
354 6 : msg_status("%8u data errors\n", silent_error);
355 : } else {
356 : /* print the result only if processed something */
357 1 : if (countpos != 0)
358 1 : msg_status("Everything OK\n");
359 : }
360 :
361 7 : if (error)
362 5 : log_fatal("WARNING! Unexpected file errors!\n");
363 :
364 7 : log_tag("hash_summary:error_file:%u\n", error);
365 :
366 : /* proceed without bailing out */
367 7 : goto finish;
368 :
/* reached only by goto from the fatal error paths above; the summary printed at 'end' is skipped */
369 0 : bail:
370 : /* on bail, don't run the next sync */
371 0 : *skip_sync = 1;
372 :
373 : /* close files left open */
374 0 : for (j = 0; j < diskmax; ++j) {
375 0 : struct snapraid_file* file = handle[j].file;
376 0 : struct snapraid_disk* disk = handle[j].disk;
377 0 : ret = handle_close(&handle[j]);
378 0 : if (ret == -1) {
379 0 : log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
380 0 : log_fatal("DANGER! Unexpected close error in a data disk.\n");
381 0 : ++error;
382 : /* continue, as we are already exiting */
383 : }
384 : }
385 :
386 0 : finish:
387 7 : free(handle);
388 7 : free(buffer_alloc);
389 :
/* any counted error makes the call fail, including the ones that only skipped a block */
390 7 : if (error + io_error + silent_error != 0)
391 6 : return -1;
392 1 : return 0;
393 : }
394 :
395 : /****************************************************************************/
396 : /* sync */
397 :
398 : /**
399 : * Sync plan to use.
 * It is the input of block_is_enabled(), which decides if a block index
 * has to be processed by the sync.
400 : */
401 : struct snapraid_plan {
402 : unsigned handle_max; /**< Number of entries in ::handle_map. */
403 : struct snapraid_handle* handle_map; /**< Handles of the disks. Entries without a disk are skipped. */
404 : int force_full; /**< If different than 0, every block is considered as having invalid parity. */
405 : };
406 :
407 : /**
408 : * A block that failed the hash check, or that was deleted.
 * Entries can be ordered by ::index with failed_compare_by_index().
409 : */
410 : struct failed_struct {
411 : unsigned index; /**< Index of the failed block. It's the sort key used by failed_compare_by_index(). */
412 : unsigned size; /**< Size of the block. */
413 :
414 : struct snapraid_block* block; /**< The failed block, or BLOCK_DELETED for a deleted block */
415 : };
416 :
417 : /**
418 : * Comparison function for sorting by index.
419 : */
420 852 : int failed_compare_by_index(const void* void_a, const void* void_b)
421 : {
422 852 : const struct failed_struct* a = void_a;
423 852 : const struct failed_struct* b = void_b;
424 :
425 852 : if (a->index < b->index)
426 851 : return -1;
427 1 : if (a->index > b->index)
428 1 : return 1;
429 0 : return 0;
430 : }
431 :
432 : /**
433 : * Buffer for storing the new hashes.
 * NOTE(review): presumably one entry per data disk; confirm against the user of this struct.
434 : */
435 : struct snapraid_rehash {
436 : unsigned char hash[HASH_MAX]; /**< Storage for the hash, sized for the largest hash (HASH_MAX). */
437 : struct snapraid_block* block; /**< NOTE(review): presumably the block the hash belongs to; confirm. */
438 : };
439 :
440 : /**
441 : * Check if we have to process the specified block index ::i.
442 : */
443 410085 : static int block_is_enabled(struct snapraid_plan* plan, block_off_t i)
444 : {
445 : unsigned j;
446 : int one_invalid;
447 : int one_valid;
448 :
449 : /* for each disk */
450 410085 : one_invalid = 0;
451 410085 : one_valid = 0;
452 2870595 : for (j = 0; j < plan->handle_max; ++j) {
453 : struct snapraid_block* block;
454 2460510 : struct snapraid_disk* disk = plan->handle_map[j].disk;
455 :
456 : /* if no disk, nothing to check */
457 2460510 : if (!disk)
458 4687 : continue;
459 :
460 2455823 : block = fs_par2block_find(disk, i);
461 :
462 2455823 : if (block_has_file(block))
463 2344371 : one_valid = 1;
464 :
465 2455823 : if (block_has_invalid_parity(block) || plan->force_full)
466 374320 : one_invalid = 1;
467 : }
468 :
469 : /* if none valid or none invalid, we don't need to update */
470 410085 : if (!one_invalid || !one_valid)
471 297623 : return 0;
472 :
473 112462 : return 1;
474 : }
475 :
/**
 * Data reader callback passed to io_init() by state_sync_process().
 *
 * Fills task->buffer with the content of the block at index task->position
 * for the disk of worker->handle, and reports the outcome in task->state:
 * - TASK_STATE_DONE: the buffer is ready. It's zero filled when the handle
 *   has no disk or the block has no file. Otherwise task->read_size and
 *   task->path are set from the read and from the handle.
 * - TASK_STATE_ERROR_CONTINUE: missing file (ENOENT), no access (EACCES)
 *   or changed size/mtime/inode.
 * - TASK_STATE_IOERROR_CONTINUE: EIO while reading.
 * - TASK_STATE_IOERROR: EIO while closing the previous file or opening.
 * - TASK_STATE_ERROR: any other close, open or read error.
 *
 * task->block is set whenever the handle has a disk, and task->file and
 * task->file_pos whenever the block has a file.
 */
476 674772 : static void sync_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
477 : {
478 674772 : struct snapraid_io* io = worker->io;
479 674772 : struct snapraid_state* state = io->state;
480 674772 : struct snapraid_handle* handle = worker->handle;
481 674772 : struct snapraid_disk* disk = handle->disk;
482 674772 : block_off_t blockcur = task->position;
483 674772 : unsigned char* buffer = task->buffer;
484 : int ret;
485 : char esc_buffer[ESC_MAX];
486 :
487 : /* if the disk position is not used */
488 674772 : if (!disk) {
489 : /* use an empty block */
490 0 : memset(buffer, 0, state->block_size);
491 0 : task->state = TASK_STATE_DONE;
492 92198 : return;
493 : }
494 :
495 : /* get the block */
496 674772 : task->block = fs_par2block_find(disk, blockcur);
497 :
498 : /* if the block has no file, meaning that it's EMPTY or DELETED, */
499 : /* it doesn't participate in the new parity computation */
500 674772 : if (!block_has_file(task->block)) {
501 : /* use an empty block */
502 92180 : memset(buffer, 0, state->block_size);
503 92180 : task->state = TASK_STATE_DONE;
504 92180 : return;
505 : }
506 :
507 : /* get the file of this block */
508 582592 : task->file = fs_par2file_get(disk, blockcur, &task->file_pos);
509 :
510 : /* if the file is different than the current one, close it */
511 582592 : if (handle->file != 0 && handle->file != task->file) {
512 : /* keep a pointer at the file we are going to close for error reporting */
513 240593 : struct snapraid_file* report = handle->file;
514 240593 : ret = handle_close(handle);
515 240593 : if (ret == -1) {
516 : /* LCOV_EXCL_START */
517 : /* This one is really an unexpected error, because we are only reading */
518 : /* and closing a descriptor should never fail */
519 : if (errno == EIO) {
520 : log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
521 : log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
522 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
523 : log_fatal("Stopping at block %u\n", blockcur);
524 : task->state = TASK_STATE_IOERROR;
525 : return;
526 : }
527 :
528 : log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
529 : log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
530 : log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
531 : log_fatal("Stopping at block %u\n", blockcur);
532 : task->state = TASK_STATE_ERROR;
533 : return;
534 : /* LCOV_EXCL_STOP */
535 : }
536 : }
537 :
/* handle_open() is called for every block, also when the handle is already on this file */
/* NOTE(review): presumably a no-op in that case; confirm in the handle implementation */
538 582592 : ret = handle_open(handle, task->file, state->file_mode, log_error, 0);
539 582592 : if (ret == -1) {
540 12 : if (errno == EIO) {
541 : /* LCOV_EXCL_START */
542 : log_tag("error:%u:%s:%s: Open EIO error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
543 : log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to sync.\n");
544 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
545 : log_fatal("Stopping at block %u\n", blockcur);
546 : task->state = TASK_STATE_IOERROR;
547 : return;
548 : /* LCOV_EXCL_STOP */
549 : }
550 :
551 12 : if (errno == ENOENT) {
552 4 : log_tag("error:%u:%s:%s: Open ENOENT error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
553 4 : log_error("Missing file '%s'.\n", handle->path);
554 4 : log_error("WARNING! You cannot modify data disk during a sync.\n");
555 4 : log_error("Rerun the sync command when finished.\n");
556 : /* if the file is missing, it means that it was removed during sync */
557 : /* this isn't a serious error, so we skip this block, and continue with others */
558 4 : task->state = TASK_STATE_ERROR_CONTINUE;
559 4 : return;
560 : }
561 :
562 8 : if (errno == EACCES) {
563 8 : log_tag("error:%u:%s:%s: Open EACCES error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
564 8 : log_error("No access at file '%s'.\n", handle->path);
565 8 : log_error("WARNING! Please fix the access permission in the data disk.\n");
566 8 : log_error("Rerun the sync command when finished.\n");
567 : /* this isn't a serious error, so we skip this block, and continue with others */
568 8 : task->state = TASK_STATE_ERROR_CONTINUE;
569 8 : return;
570 : }
571 :
572 : /* LCOV_EXCL_START */
573 : log_tag("error:%u:%s:%s: Open error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
574 : log_fatal("WARNING! Unexpected open error in a data disk, it isn't possible to sync.\n");
575 : log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
576 : log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
577 : task->state = TASK_STATE_ERROR;
578 : return;
579 : /* LCOV_EXCL_STOP */
580 : }
581 :
582 : /* check if the file is changed */
/* the stat data in the handle is compared with the size, mtime and inode recorded for the file */
583 582580 : if (handle->st.st_size != task->file->size
584 582578 : || handle->st.st_mtime != task->file->mtime_sec
585 582576 : || STAT_NSEC(&handle->st) != task->file->mtime_nsec
586 582576 : || handle->st.st_ino != task->file->inode
587 : ) {
588 6 : log_tag("error:%u:%s:%s: Unexpected attribute change\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer));
/* only the first differing attribute is reported: size, then time, then inode */
589 6 : if (handle->st.st_size != task->file->size) {
590 2 : log_error("Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle->path, task->file->size, (uint64_t)handle->st.st_size);
591 4 : } else if (handle->st.st_mtime != task->file->mtime_sec
592 2 : || STAT_NSEC(&handle->st) != task->file->mtime_nsec) {
593 2 : log_error("Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle->path, task->file->mtime_sec, task->file->mtime_nsec, (uint64_t)handle->st.st_mtime, STAT_NSEC(&handle->st));
594 : } else {
595 2 : log_error("Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", task->file->inode, (uint64_t)handle->st.st_ino, handle->path);
596 : }
597 6 : log_error("WARNING! You cannot modify files during a sync.\n");
598 6 : log_error("Rerun the sync command when finished.\n");
599 : /* if the file is changed, it means that it was modified during sync */
600 : /* this isn't a serious error, so we skip this block, and continue with others */
601 6 : task->state = TASK_STATE_ERROR_CONTINUE;
602 6 : return;
603 : }
604 :
605 582574 : task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
606 582574 : if (task->read_size == -1) {
607 : /* LCOV_EXCL_START */
/* differently than the EIO on close and open, the EIO on read is reported as a "continue" error */
608 : if (errno == EIO) {
609 : log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
610 : log_error("Input/Output error in file '%s' at position '%u'\n", handle->path, task->file_pos);
611 : task->state = TASK_STATE_IOERROR_CONTINUE;
612 : return;
613 : }
614 :
615 : log_tag("error:%u:%s:%s: Read error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
616 : log_fatal("WARNING! Unexpected read error in a data disk, it isn't possible to sync.\n");
617 : log_fatal("Ensure that file '%s' can be read.\n", handle->path);
618 : log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
619 : task->state = TASK_STATE_ERROR;
620 : return;
621 : /* LCOV_EXCL_STOP */
622 : }
623 :
624 : /* store the path of the opened file */
625 582574 : pathcpy(task->path, sizeof(task->path), handle->path);
626 :
627 582574 : task->state = TASK_STATE_DONE;
628 : }
629 :
630 538757 : static void sync_parity_writer(struct snapraid_worker* worker, struct snapraid_task* task)
631 : {
632 538757 : struct snapraid_io* io = worker->io;
633 538757 : struct snapraid_state* state = io->state;
634 538757 : struct snapraid_parity_handle* parity_handle = worker->parity_handle;
635 538757 : unsigned level = parity_handle->level;
636 538757 : block_off_t blockcur = task->position;
637 538757 : unsigned char* buffer = task->buffer;
638 : int ret;
639 :
640 : /* write parity */
641 538757 : ret = parity_write(parity_handle, blockcur, buffer, state->block_size);
642 538757 : if (ret == -1) {
643 : /* LCOV_EXCL_START */
644 : if (errno == EIO) {
645 : log_tag("parity_error:%u:%s: Write EIO error. %s\n", blockcur, lev_config_name(level), strerror(errno));
646 : log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(level), blockcur);
647 : task->state = TASK_STATE_IOERROR_CONTINUE;
648 : return;
649 : }
650 :
651 : log_tag("parity_error:%u:%s: Write error. %s\n", blockcur, lev_config_name(level), strerror(errno));
652 : log_fatal("WARNING! Unexpected write error in the %s disk, it isn't possible to sync.\n", lev_name(level));
653 : log_fatal("Ensure that disk '%s' has some free space available.\n", lev_config_name(level));
654 : log_fatal("Stopping at block %u\n", blockcur);
655 : task->state = TASK_STATE_ERROR;
656 : return;
657 : /* LCOV_EXCL_STOP */
658 : }
659 :
660 538757 : task->state = TASK_STATE_DONE;
661 : }
662 :
663 83 : static int state_sync_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax)
664 : {
665 : struct snapraid_io io;
666 : struct snapraid_plan plan;
667 : struct snapraid_handle* handle;
668 : void* rehandle_alloc;
669 : struct snapraid_rehash* rehandle;
670 : unsigned diskmax;
671 : block_off_t blockcur;
672 : unsigned j;
673 : void* zero_alloc;
674 : void** zero;
675 : void* copy_alloc;
676 : void** copy;
677 : unsigned buffermax;
678 : data_off_t countsize;
679 : block_off_t countpos;
680 : block_off_t countmax;
681 : block_off_t autosavedone;
682 : block_off_t autosavelimit;
683 : block_off_t autosavemissing;
684 : int ret;
685 : unsigned error;
686 : unsigned silent_error;
687 : unsigned io_error;
688 : time_t now;
689 : struct failed_struct* failed;
690 : int* failed_map;
691 : unsigned l;
692 : unsigned* waiting_map;
693 : unsigned waiting_mac;
694 : char esc_buffer[ESC_MAX];
695 : bit_vect_t* block_enabled;
696 :
697 : /* the sync process assumes that all the hashes are correct */
698 : /* including the ones from CHG and DELETED blocks */
699 83 : assert(state->clear_past_hash != 0);
700 :
701 : /* get the present time */
702 83 : now = time(0);
703 :
704 : /* maps the disks to handles */
705 83 : handle = handle_mapping(state, &diskmax);
706 :
707 : /* rehash buffers */
708 83 : rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
709 :
710 : /* we need 1 * data + 1 * parity */
711 83 : buffermax = diskmax + state->level;
712 :
713 : /* initialize the io threads */
714 83 : io_init(&io, state, state->opt.io_cache, buffermax, sync_data_reader, handle, diskmax, 0, sync_parity_writer, parity_handle, state->level);
715 :
716 : /* allocate the copy buffer */
717 83 : copy = malloc_nofail_vector_align(diskmax, diskmax, state->block_size, ©_alloc);
718 :
719 : /* allocate and fill the zero buffer */
720 83 : zero = malloc_nofail_align(state->block_size, &zero_alloc);
721 83 : memset(zero, 0, state->block_size);
722 83 : raid_zero(zero);
723 :
724 83 : failed = malloc_nofail(diskmax * sizeof(struct failed_struct));
725 83 : failed_map = malloc_nofail(diskmax * sizeof(unsigned));
726 :
727 : /* possibly waiting disks */
728 83 : waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
729 83 : waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
730 :
731 83 : error = 0;
732 83 : silent_error = 0;
733 83 : io_error = 0;
734 :
735 83 : msg_progress("Selecting...\n");
736 :
737 : /* first count the number of blocks to process */
738 83 : countmax = 0;
739 83 : plan.handle_max = diskmax;
740 83 : plan.handle_map = handle;
741 83 : plan.force_full = state->opt.force_full;
742 83 : block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
743 410168 : for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
744 410085 : if (!block_is_enabled(&plan, blockcur))
745 297623 : continue;
746 112462 : bit_vect_set(block_enabled, blockcur);
747 112462 : ++countmax;
748 : }
749 :
750 : /* compute the autosave size for all disk, even if not read */
751 : /* this makes sense because the speed should be almost the same */
752 : /* if the disks are read in parallel */
753 83 : autosavelimit = state->autosave / (diskmax * state->block_size);
754 83 : autosavemissing = countmax; /* blocks to do */
755 83 : autosavedone = 0; /* blocks done */
756 :
757 : /* drop until now */
758 83 : state_usage_waste(state);
759 :
760 83 : countsize = 0;
761 83 : countpos = 0;
762 :
763 83 : msg_progress("Syncing...\n");
764 :
765 : /* start all the worker threads */
766 83 : io_start(&io, blockstart, blockmax, block_enabled);
767 :
768 83 : if (!state_progress_begin(state, blockstart, blockmax, countmax))
769 0 : goto end;
770 :
771 112462 : while (1) {
772 : unsigned failed_count;
773 : int error_on_this_block;
774 : int silent_error_on_this_block;
775 : int io_error_on_this_block;
776 : int fixed_error_on_this_block;
777 : int parity_needs_to_be_updated;
778 : int parity_going_to_be_updated;
779 : snapraid_info info;
780 : int rehash;
781 : void** buffer;
782 : int writer_error[IO_WRITER_ERROR_MAX];
783 :
784 : /* go to the next block */
785 112545 : blockcur = io_read_next(&io, &buffer);
786 112545 : if (blockcur >= blockmax)
787 83 : break;
788 :
789 : /* until now is scheduling */
790 112462 : state_usage_sched(state);
791 :
792 : /* one more block processed for autosave */
793 112462 : ++autosavedone;
794 112462 : --autosavemissing;
795 :
796 : /* by default process the block, and skip it if something goes wrong */
797 112462 : error_on_this_block = 0;
798 112462 : silent_error_on_this_block = 0;
799 112462 : io_error_on_this_block = 0;
800 112462 : fixed_error_on_this_block = 0;
801 :
802 : /* keep track of the number of failed blocks */
803 112462 : failed_count = 0;
804 :
805 : /* get block specific info */
806 112462 : info = info_get(&state->infoarr, blockcur);
807 :
808 : /* if we have to use the old hash */
809 112462 : rehash = info_get_rehash(info);
810 :
811 : /* if the parity requires to be updated */
812 : /* It could happens that all the blocks are EMPTY/BLK and CHG but with the hash */
813 : /* still matching because the specific CHG block was not modified. */
814 : /* In such case, we can avoid to update parity, because it would be the same as before */
815 : /* Note that CHG/DELETED blocks already present in the content file loaded */
816 : /* have the hash cleared (::clear_past_hash flag), and then they won't never match the hash. */
817 : /* We are treating only CHG blocks created at runtime. */
818 112462 : parity_needs_to_be_updated = state->opt.force_full || state->opt.force_parity_update;
819 :
820 : /* if the parity is going to be updated */
821 112462 : parity_going_to_be_updated = 0;
822 :
823 : /* if the block is marked as bad, we force the parity update */
824 : /* because the bad block may be the result of a wrong parity */
825 112462 : if (info_get_bad(info))
826 0 : parity_needs_to_be_updated = 1;
827 :
828 : /* for each disk, process the block */
829 787234 : for (j = 0; j < diskmax; ++j) {
830 : struct snapraid_task* task;
831 : int read_size;
832 : unsigned char hash[HASH_MAX];
833 : struct snapraid_block* block;
834 : unsigned block_state;
835 : struct snapraid_disk* disk;
836 : struct snapraid_file* file;
837 : block_off_t file_pos;
838 : unsigned diskcur;
839 :
840 : /* until now is misc */
841 674772 : state_usage_misc(state);
842 :
843 674772 : task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
844 :
845 : /* until now is disk */
846 674772 : state_usage_disk(state, handle, waiting_map, waiting_mac);
847 :
848 : /* get the results */
849 674772 : disk = task->disk;
850 674772 : block = task->block;
851 674772 : file = task->file;
852 674772 : file_pos = task->file_pos;
853 674772 : read_size = task->read_size;
854 :
855 : /* by default no rehash in case of "continue" */
856 674772 : rehandle[diskcur].block = 0;
857 :
858 : /* if the disk position is not used */
859 674772 : if (!disk)
860 93051 : continue;
861 :
862 674772 : state_usage_file(state, disk, file);
863 :
864 : /* get the state of the block */
865 674772 : block_state = block_state_get(block);
866 :
867 : /* if the block has invalid parity, */
868 : /* we have to take care of it in case of recover */
869 674772 : if (block_has_invalid_parity(block)) {
870 : /* store it in the failed set, because */
871 : /* the parity may be still computed with the previous content */
872 178420 : failed[failed_count].index = diskcur;
873 178420 : failed[failed_count].size = state->block_size;
874 178420 : failed[failed_count].block = block;
875 178420 : ++failed_count;
876 :
877 : /* if the block has invalid parity, we have to update the parity */
878 : /* to include this block change */
879 : /* This also apply to CHG blocks, but we are going to handle */
880 : /* later this case to do the updates only if really needed */
881 178420 : if (block_state != BLOCK_STATE_CHG)
882 88225 : parity_needs_to_be_updated = 1;
883 :
884 : /* note that DELETE blocks are skipped in the next check */
885 : /* and we have to store them in the failed blocks */
886 : /* before skipping */
887 :
888 : /* follow */
889 : }
890 :
891 : /* if the block is not used */
892 674772 : if (!block_has_file(block))
893 92180 : continue;
894 :
895 : /* handle error conditions */
896 582592 : if (task->state == TASK_STATE_IOERROR) {
897 : /* LCOV_EXCL_START */
898 : ++io_error;
899 : goto bail;
900 : /* LCOV_EXCL_STOP */
901 : }
902 582592 : if (task->state == TASK_STATE_ERROR) {
903 : /* LCOV_EXCL_START */
904 : ++error;
905 : goto bail;
906 : /* LCOV_EXCL_STOP */
907 : }
908 582592 : if (task->state == TASK_STATE_ERROR_CONTINUE) {
909 18 : ++error;
910 18 : error_on_this_block = 1;
911 18 : continue;
912 : }
913 582574 : if (task->state == TASK_STATE_IOERROR_CONTINUE) {
914 0 : ++io_error;
915 0 : if (io_error >= state->opt.io_error_limit) {
916 : /* LCOV_EXCL_START */
917 : log_fatal("DANGER! Unexpected input/output read error in a data disk, it isn't possible to sync.\n");
918 : log_fatal("Ensure that disk '%s' is sane and that file '%s' can be read.\n", disk->dir, task->path);
919 : log_fatal("Stopping at block %u\n", blockcur);
920 : goto bail;
921 : /* LCOV_EXCL_STOP */
922 : }
923 :
924 : /* otherwise continue */
925 0 : io_error_on_this_block = 1;
926 0 : continue;
927 : }
928 582574 : if (task->state != TASK_STATE_DONE) {
929 : /* LCOV_EXCL_START */
930 : log_fatal("Internal inconsistency in task state\n");
931 : os_abort();
932 : /* LCOV_EXCL_STOP */
933 : }
934 :
935 582574 : countsize += read_size;
936 :
937 : /* now compute the hash */
938 582574 : if (rehash) {
939 27243 : memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
940 :
941 : /* compute the new hash, and store it */
942 27243 : rehandle[diskcur].block = block;
943 27243 : memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
944 : } else {
945 555331 : memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
946 : }
947 :
948 : /* until now is hash */
949 582574 : state_usage_hash(state);
950 :
951 582574 : if (block_has_updated_hash(block)) {
952 : /* compare the hash */
953 492397 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
954 : /* if the file has invalid parity, it's a REP changed during the sync */
955 853 : if (block_has_invalid_parity(block)) {
956 1 : log_tag("error:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
957 1 : log_error("Data change at file '%s' at position '%u'\n", task->path, file_pos);
958 1 : log_error("WARNING! Unexpected data modification of a file without parity!\n");
959 :
960 1 : if (file_flag_has(file, FILE_IS_COPY)) {
961 1 : log_error("This file was detected as a copy of another file with the same name, size,\n");
962 1 : log_error("and timestamp, but the file data isn't matching the assumed copy.\n");
963 1 : log_error("If this is a false positive, and the files are expected to be different,\n");
964 1 : log_error("you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
965 : } else {
966 0 : log_error("Try removing the file from the array and rerun the 'sync' command!\n");
967 : }
968 :
969 1 : ++error;
970 :
971 : /* if the file is changed, it means that it was modified during sync */
972 : /* this isn't a serious error, so we skip this block, and continue with others */
973 1 : error_on_this_block = 1;
974 1 : continue;
975 852 : } else { /* otherwise it's a BLK with silent error */
976 852 : unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
977 852 : log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
978 852 : log_error("Data error in file '%s' at position '%u', diff bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
979 :
980 : /* save the failed block for the fix */
981 852 : failed[failed_count].index = diskcur;
982 852 : failed[failed_count].size = read_size;
983 852 : failed[failed_count].block = block;
984 852 : ++failed_count;
985 :
986 : /* silent errors are very rare, and are not a signal that a disk */
987 : /* is going to fail. So, we just continue marking the block as bad */
988 : /* just like in scrub */
989 852 : ++silent_error;
990 852 : silent_error_on_this_block = 1;
991 852 : continue;
992 : }
993 : }
994 : } else {
995 : /* if until now the parity doesn't need to be updated */
996 90177 : if (!parity_needs_to_be_updated) {
997 : /* for sure it's a CHG block, because EMPTY are processed before with "continue" */
998 : /* and BLK and REP have "block_has_updated_hash()" as 1, and all the others */
999 : /* have "parity_needs_to_be_updated" already at 1 */
1000 32959 : assert(block_state_get(block) == BLOCK_STATE_CHG);
1001 :
1002 : /* if the hash represents the data unequivocally */
1003 32959 : if (hash_is_unique(block->hash)) {
1004 : /* check if the hash is changed */
1005 6285 : if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
1006 : /* the block is different, and we must update parity */
1007 5078 : parity_needs_to_be_updated = 1;
1008 : }
1009 : } else {
1010 : /* if the hash is already invalid, we update parity */
1011 26674 : parity_needs_to_be_updated = 1;
1012 : }
1013 : }
1014 :
1015 : /* copy the hash in the block, but doesn't mark the block as hashed */
1016 : /* this allow in case of skipped block to do not save the failed computation */
1017 90177 : memcpy(block->hash, hash, BLOCK_HASH_SIZE);
1018 :
1019 : /* note that in case of rehash, this is the wrong hash, */
1020 : /* but it will be overwritten later */
1021 : }
1022 : }
1023 :
1024 : /* if we have only silent errors we can try to fix them on-the-fly */
1025 : /* note the fix is not written to disk, but used only to */
1026 : /* compute the new parity */
1027 112462 : if (!error_on_this_block && !io_error_on_this_block && silent_error_on_this_block) {
1028 : unsigned failed_mac;
1029 852 : int something_to_recover = 0;
1030 :
1031 : /* sort the failed vector */
1032 : /* because with threads it may be in any order */
1033 : /* but RAID requires the indexes to be sorted */
1034 852 : qsort(failed, failed_count, sizeof(failed[0]), failed_compare_by_index);
1035 :
1036 : /* setup the blocks to recover */
1037 852 : failed_mac = 0;
1038 2556 : for (j = 0; j < failed_count; ++j) {
1039 1704 : unsigned char* block_buffer = buffer[failed[j].index];
1040 1704 : unsigned char* block_copy = copy[failed[j].index];
1041 1704 : unsigned block_state = block_state_get(failed[j].block);
1042 :
1043 : /* we try to recover only if at least one BLK is present */
1044 1704 : if (block_state == BLOCK_STATE_BLK)
1045 852 : something_to_recover = 1;
1046 :
1047 : /* save a copy of the content just read */
1048 : /* that it's going to be overwritten by the recovering function */
1049 1704 : memcpy(block_copy, block_buffer, state->block_size);
1050 :
1051 1704 : if (block_state == BLOCK_STATE_CHG
1052 852 : && hash_is_zero(failed[j].block->hash)
1053 : ) {
1054 : /* if the block was filled with 0, restore this state */
1055 : /* and avoid to recover it */
1056 0 : memset(block_buffer, 0, state->block_size);
1057 : } else {
1058 : /* if we have too many failures, we cannot recover */
1059 1704 : if (failed_mac >= state->level)
1060 0 : break;
1061 :
1062 : /* otherwise it has to be recovered */
1063 1704 : failed_map[failed_mac++] = failed[j].index;
1064 : }
1065 : }
1066 :
1067 : /* if we have something to recover and enough parity */
1068 852 : if (something_to_recover && j == failed_count) {
1069 : /* until now is misc */
1070 852 : state_usage_misc(state);
1071 :
1072 : /* read the parity */
1073 : /* we are sure that parity exists because */
1074 : /* we have at least one BLK block */
1075 5964 : for (l = 0; l < state->level; ++l) {
1076 5112 : ret = parity_read(&parity_handle[l], blockcur, buffer[diskmax + l], state->block_size, log_error);
1077 5112 : if (ret == -1) {
1078 : /* LCOV_EXCL_START */
1079 : if (errno == EIO) {
1080 : log_tag("parity_error:%u:%s: Read EIO error. %s\n", blockcur, lev_config_name(l), strerror(errno));
1081 : if (io_error >= state->opt.io_error_limit) {
1082 : log_fatal("DANGER! Unexpected input/output read error in the %s disk, it isn't possible to sync.\n", lev_name(l));
1083 : log_fatal("Ensure that disk '%s' is sane and can be read.\n", lev_config_name(l));
1084 : log_fatal("Stopping at block %u\n", blockcur);
1085 : ++io_error;
1086 : goto bail;
1087 : }
1088 :
1089 : log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(l), blockcur);
1090 : ++io_error;
1091 : io_error_on_this_block = 1;
1092 : continue;
1093 : }
1094 :
1095 : log_tag("parity_error:%u:%s: Read error. %s\n", blockcur, lev_config_name(l), strerror(errno));
1096 : log_fatal("WARNING! Unexpected read error in the %s disk, it isn't possible to sync.\n", lev_name(l));
1097 : log_fatal("Ensure that disk '%s' can be read.\n", lev_config_name(l));
1098 : log_fatal("Stopping at block %u\n", blockcur);
1099 : ++error;
1100 : goto bail;
1101 : /* LCOV_EXCL_STOP */
1102 : }
1103 :
1104 : /* until now is parity */
1105 5112 : state_usage_parity(state, &l, 1);
1106 : }
1107 :
1108 : /* if no error in parity read */
1109 852 : if (!io_error_on_this_block) {
1110 : /* try to fix the data */
1111 : /* note that this is a simple fix algorithm, that doesn't take into */
1112 : /* account the case of a wrong parity */
1113 : /* only 'fix' supports the most advanced fixing */
1114 852 : raid_rec(failed_mac, failed_map, diskmax, state->level, state->block_size, buffer);
1115 :
1116 : /* until now is raid */
1117 852 : state_usage_raid(state);
1118 :
1119 : /* check the result and prepare the data */
1120 2556 : for (j = 0; j < failed_count; ++j) {
1121 : unsigned char hash[HASH_MAX];
1122 1704 : unsigned char* block_buffer = buffer[failed[j].index];
1123 1704 : unsigned char* block_copy = copy[failed[j].index];
1124 1704 : unsigned block_state = block_state_get(failed[j].block);
1125 :
1126 1704 : if (block_state == BLOCK_STATE_BLK) {
1127 852 : unsigned size = failed[j].size;
1128 :
1129 : /* compute the hash of the recovered block */
1130 852 : if (rehash) {
1131 0 : memhash(state->prevhash, state->prevhashseed, hash, block_buffer, size);
1132 : } else {
1133 852 : memhash(state->hash, state->hashseed, hash, block_buffer, size);
1134 : }
1135 :
1136 : /* until now is hash */
1137 852 : state_usage_hash(state);
1138 :
1139 : /* if the hash doesn't match */
1140 852 : if (memcmp(hash, failed[j].block->hash, BLOCK_HASH_SIZE) != 0) {
1141 : /* we have not recovered */
1142 0 : break;
1143 : }
1144 :
1145 : /* pad with 0 if needed */
1146 852 : if (size < state->block_size)
1147 331 : memset(block_buffer + size, 0, state->block_size - size);
1148 : } else {
1149 : /* otherwise restore the content */
1150 : /* because we are not interested in the old state */
1151 : /* that it's recovered for CHG, REP and DELETED blocks */
1152 852 : memcpy(block_buffer, block_copy, state->block_size);
1153 : }
1154 : }
1155 :
1156 : /* if all is processed, we have fixed it */
1157 852 : if (j == failed_count)
1158 852 : fixed_error_on_this_block = 1;
1159 : }
1160 : }
1161 : }
1162 :
1163 : /* if we have read all the data required and it's correct, proceed with the parity */
1164 112462 : if (!error_on_this_block && !io_error_on_this_block
1165 112443 : && (!silent_error_on_this_block || fixed_error_on_this_block)
1166 : ) {
1167 : /* update the parity only if really needed */
1168 112443 : if (parity_needs_to_be_updated) {
1169 : /* compute the parity */
1170 111477 : raid_gen(diskmax, state->level, state->block_size, buffer);
1171 :
1172 : /* until now is raid */
1173 111477 : state_usage_raid(state);
1174 :
1175 : /* mark that the parity is going to be written */
1176 111477 : parity_going_to_be_updated = 1;
1177 : }
1178 :
1179 : /* for each disk, mark the blocks as processed */
1180 787101 : for (j = 0; j < diskmax; ++j) {
1181 : struct snapraid_block* block;
1182 :
1183 674658 : if (!handle[j].disk)
1184 0 : continue;
1185 :
1186 674658 : block = fs_par2block_find(handle[j].disk, blockcur);
1187 :
1188 674658 : if (block == BLOCK_NULL) {
1189 : /* nothing to do */
1190 81225 : continue;
1191 : }
1192 :
1193 : /* if it's a deleted block */
1194 593433 : if (block_state_get(block) == BLOCK_STATE_DELETED) {
1195 : /* the parity is now updated without this block, so it's now empty */
1196 10953 : fs_deallocate(handle[j].disk, blockcur);
1197 10953 : continue;
1198 : }
1199 :
1200 : /* now all the blocks have the hash and the parity computed */
1201 582480 : block_state_set(block, BLOCK_STATE_BLK);
1202 : }
1203 :
1204 : /* we update the info block only if we really have updated the parity */
1205 : /* because otherwise the time/justsynced info would be misleading as we didn't */
1206 : /* wrote the parity at this time */
1207 : /* we also update the info block only if no silent error was found */
1208 : /* because has no sense to refresh the time for data that we know bad */
1209 112443 : if (parity_needs_to_be_updated
1210 111477 : && !silent_error_on_this_block
1211 : ) {
1212 : /* if rehash is needed */
1213 110625 : if (rehash) {
1214 : /* store all the new hash already computed */
1215 32249 : for (j = 0; j < diskmax; ++j) {
1216 27642 : if (rehandle[j].block)
1217 27243 : memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
1218 : }
1219 : }
1220 :
1221 : /* update the time info of the block */
1222 : /* we are also clearing any previous bad and rehash flag */
1223 110625 : info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 1));
1224 : }
1225 : }
1226 :
1227 : /* if a silent (even if corrected) or input/output error was found */
1228 : /* mark the block as bad to have check/fix to handle it */
1229 : /* because our correction is in memory only and not yet written */
1230 112462 : if (silent_error_on_this_block || io_error_on_this_block) {
1231 : /* set the error status keeping the other info */
1232 852 : info_set(&state->infoarr, blockcur, info_set_bad(info));
1233 : }
1234 :
1235 : /* finally schedule parity write */
1236 : /* Note that the calls to io_parity_write() are mandatory */
1237 : /* even if the parity doesn't need to be updated */
1238 : /* This because we want to keep track of the time usage */
1239 112462 : state_usage_misc(state);
1240 :
1241 : /* write start */
1242 112462 : io_write_preset(&io, blockcur, !parity_going_to_be_updated);
1243 :
1244 : /* write the parity */
1245 657129 : for (l = 0; l < state->level; ++l) {
1246 : unsigned levcur;
1247 :
1248 544667 : io_parity_write(&io, &levcur, waiting_map, &waiting_mac);
1249 :
1250 : /* until now is parity */
1251 544667 : state_usage_parity(state, waiting_map, waiting_mac);
1252 : }
1253 :
1254 : /* write finished */
1255 112462 : io_write_next(&io, blockcur, !parity_going_to_be_updated, writer_error);
1256 :
1257 : /* handle errors reported */
1258 562310 : for (j = 0; j < IO_WRITER_ERROR_MAX; ++j) {
1259 449848 : if (writer_error[j]) {
1260 0 : switch (j + IO_WRITER_ERROR_BASE) {
1261 0 : case TASK_STATE_IOERROR_CONTINUE :
1262 0 : ++io_error;
1263 0 : if (io_error >= state->opt.io_error_limit) {
1264 : /* LCOV_EXCL_START */
1265 : log_fatal("DANGER! Unexpected input/output write error in a parity disk, it isn't possible to sync.\n");
1266 : log_fatal("Stopping at block %u\n", blockcur);
1267 : goto bail;
1268 : /* LCOV_EXCL_STOP */
1269 : }
1270 0 : break;
1271 0 : case TASK_STATE_ERROR_CONTINUE :
1272 0 : ++error;
1273 0 : break;
1274 0 : case TASK_STATE_IOERROR :
1275 : /* LCOV_EXCL_START */
1276 : ++io_error;
1277 : goto bail;
1278 : /* LCOV_EXCL_STOP */
1279 0 : case TASK_STATE_ERROR :
1280 : /* LCOV_EXCL_START */
1281 : ++error;
1282 : goto bail;
1283 : /* LCOV_EXCL_STOP */
1284 : }
1285 : }
1286 : }
1287 :
1288 : /* mark the state as needing write */
1289 112462 : state->need_write = 1;
1290 :
1291 : /* count the number of processed block */
1292 112462 : ++countpos;
1293 :
1294 : /* progress */
1295 112462 : if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
1296 : /* LCOV_EXCL_START */
1297 : break;
1298 : /* LCOV_EXCL_STOP */
1299 : }
1300 :
1301 : /* thermal control */
1302 112462 : if (state_thermal_alarm(state)) {
1303 : /* until now is misc */
1304 0 : state_usage_misc(state);
1305 :
1306 0 : state_progress_stop(state);
1307 :
1308 : /* before spinning down flush all the caches */
1309 0 : ret = state_flush(state, &io, parity_handle, blockcur);
1310 0 : if (ret == -1) {
1311 : /* LCOV_EXCL_START */
1312 : log_fatal("Stopping at block %u\n", blockcur);
1313 : ++error;
1314 : goto bail;
1315 : /* LCOV_EXCL_STOP */
1316 : }
1317 :
1318 0 : state_thermal_cooldown(state);
1319 :
1320 0 : state_progress_restart(state);
1321 :
1322 : /* drop until now */
1323 0 : state_usage_waste(state);
1324 : }
1325 :
1326 : /* autosave */
1327 112462 : if ((state->autosave != 0
1328 26021 : && autosavedone >= autosavelimit /* if we have reached the limit */
1329 0 : && autosavemissing >= autosavelimit) /* if we have at least a full step to do */
1330 : /* or if we have a forced autosave at the specified block */
1331 112462 : || (state->opt.force_autosave_at != 0 && state->opt.force_autosave_at == blockcur)
1332 : ) {
1333 1 : autosavedone = 0; /* restart the counter */
1334 :
1335 : /* until now is misc */
1336 1 : state_usage_misc(state);
1337 :
1338 1 : state_progress_stop(state);
1339 :
1340 1 : msg_progress("Autosaving...\n");
1341 :
1342 : /* before writing the new content file we ensure that */
1343 : /* the parity is really written flushing the disk cache */
1344 1 : ret = state_flush(state, &io, parity_handle, blockcur);
1345 1 : if (ret == -1) {
1346 : /* LCOV_EXCL_START */
1347 : log_fatal("Stopping at block %u\n", blockcur);
1348 : ++error;
1349 : goto bail;
1350 : /* LCOV_EXCL_STOP */
1351 : }
1352 :
1353 : /* now we can safely write the content file */
1354 1 : state_write(state);
1355 :
1356 1 : state_progress_restart(state);
1357 :
1358 : /* drop until now */
1359 1 : state_usage_waste(state);
1360 : }
1361 : }
1362 :
1363 83 : end:
1364 83 : state_progress_end(state, countpos, countmax, countsize, "Nothing to sync.\n");
1365 :
1366 : /* before returning we ensure that */
1367 : /* the parity is really written flushing the disk cache */
1368 83 : ret = state_flush(state, &io, parity_handle, blockcur);
1369 83 : if (ret == -1) {
1370 : /* LCOV_EXCL_START */
1371 : log_fatal("Stopping at block %u\n", blockcur);
1372 : ++error;
1373 : goto bail;
1374 : /* LCOV_EXCL_STOP */
1375 : }
1376 :
1377 : /* save the new state if required */
1378 83 : if (!state->opt.kill_after_sync) {
1379 73 : if ((state->need_write || state->opt.force_content_write))
1380 56 : state_write(state);
1381 : } else {
1382 10 : log_fatal("WARNING! Skipped writing state due to --test-kill-after-sync option.\n");
1383 : }
1384 :
1385 83 : state_usage_print(state);
1386 :
1387 83 : if (error || silent_error || io_error) {
1388 12 : msg_status("\n");
1389 12 : msg_status("%8u file errors\n", error);
1390 12 : msg_status("%8u io errors\n", io_error);
1391 12 : msg_status("%8u data errors\n", silent_error);
1392 : } else {
1393 : /* print the result only if processed something */
1394 71 : if (countpos != 0)
1395 52 : msg_status("Everything OK\n");
1396 : }
1397 :
1398 83 : if (error)
1399 11 : log_fatal("WARNING! Unexpected file errors!\n");
1400 83 : if (io_error)
1401 0 : log_fatal("DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
1402 83 : if (silent_error)
1403 1 : log_fatal("DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
1404 83 : if (io_error || silent_error) {
1405 1 : log_fatal("Use 'snapraid status' to list the bad blocks.\n");
1406 1 : log_fatal("Use 'snapraid -e fix' to recover.\n");
1407 : }
1408 :
1409 83 : log_tag("summary:error_file:%u\n", error);
1410 83 : log_tag("summary:error_io:%u\n", io_error);
1411 83 : log_tag("summary:error_data:%u\n", silent_error);
1412 83 : if (error + silent_error + io_error == 0)
1413 71 : log_tag("summary:exit:ok\n");
1414 : else
1415 12 : log_tag("summary:exit:error\n");
1416 83 : log_flush();
1417 :
1418 83 : bail:
1419 : /* stop all the worker threads */
1420 83 : io_stop(&io);
1421 :
1422 581 : for (j = 0; j < diskmax; ++j) {
1423 498 : struct snapraid_file* file = handle[j].file;
1424 498 : struct snapraid_disk* disk = handle[j].disk;
1425 498 : ret = handle_close(&handle[j]);
1426 498 : if (ret == -1) {
1427 : /* LCOV_EXCL_START */
1428 : log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
1429 : log_fatal("DANGER! Unexpected close error in a data disk.\n");
1430 : ++error;
1431 : /* continue, as we are already exiting */
1432 : /* LCOV_EXCL_STOP */
1433 : }
1434 : }
1435 :
1436 83 : free(handle);
1437 83 : free(zero_alloc);
1438 83 : free(copy_alloc);
1439 83 : free(copy);
1440 83 : free(rehandle_alloc);
1441 83 : free(failed);
1442 83 : free(failed_map);
1443 83 : free(waiting_map);
1444 83 : io_done(&io);
1445 83 : free(block_enabled);
1446 :
1447 83 : if (state->opt.expect_recoverable) {
1448 1 : if (error + silent_error + io_error == 0)
1449 0 : return -1;
1450 : } else {
1451 82 : if (error + silent_error + io_error != 0)
1452 11 : return -1;
1453 : }
1454 72 : return 0;
1455 : }
1456 :
1457 87 : int state_sync(struct snapraid_state* state, block_off_t blockstart, block_off_t blockcount)
1458 : {
1459 : block_off_t blockmax;
1460 : block_off_t used_paritymax;
1461 : block_off_t file_paritymax;
1462 : data_off_t size;
1463 : int ret;
1464 : struct snapraid_parity_handle parity_handle[LEV_MAX];
1465 : unsigned unrecoverable_error;
1466 : unsigned l;
1467 87 : int skip_sync = 0;
1468 :
1469 87 : msg_progress("Initializing...\n");
1470 :
1471 87 : blockmax = parity_allocated_size(state);
1472 87 : size = blockmax * (data_off_t)state->block_size;
1473 :
1474 : /* minimum size of the parity files we expect */
1475 87 : used_paritymax = parity_used_size(state);
1476 :
1477 : /* effective size of the parity files */
1478 87 : file_paritymax = 0;
1479 :
1480 87 : if (blockstart > blockmax) {
1481 : /* LCOV_EXCL_START */
1482 : log_fatal("Error in the starting block %u. It is larger than the parity size %u.\n", blockstart, blockmax);
1483 : exit(EXIT_FAILURE);
1484 : /* LCOV_EXCL_STOP */
1485 : }
1486 :
1487 : /* adjust the number of block to process */
1488 87 : if (blockcount != 0 && blockstart + blockcount < blockmax) {
1489 6 : blockmax = blockstart + blockcount;
1490 : }
1491 :
1492 574 : for (l = 0; l < state->level; ++l) {
1493 : data_off_t out_size;
1494 : block_off_t parityblocks;
1495 :
1496 : /* create the file and open for writing */
1497 487 : ret = parity_create(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
1498 487 : if (ret == -1) {
1499 : /* LCOV_EXCL_START */
1500 : log_fatal("WARNING! Without an accessible %s file, it isn't possible to sync.\n", lev_name(l));
1501 : exit(EXIT_FAILURE);
1502 : /* LCOV_EXCL_STOP */
1503 : }
1504 :
1505 : /* number of block in the parity file */
1506 487 : parity_size(&parity_handle[l], &out_size);
1507 487 : parityblocks = out_size / state->block_size;
1508 :
1509 : /* if the file is too small */
1510 487 : if (parityblocks < used_paritymax) {
1511 0 : log_fatal("WARNING! The %s parity has only %u blocks instead of %u.\n", lev_name(l), parityblocks, used_paritymax);
1512 : }
1513 :
1514 : /* keep the smallest parity number of blocks */
1515 487 : if (l == 0 || file_paritymax > parityblocks)
1516 88 : file_paritymax = parityblocks;
1517 : }
1518 :
1519 : /* if we do a full parity realloc or computation, having a wrong parity size is expected */
1520 87 : if (!state->opt.force_realloc && !state->opt.force_full) {
1521 : /* if the parities are too small */
1522 80 : if (file_paritymax < used_paritymax) {
1523 : /* LCOV_EXCL_START */
1524 : log_fatal("DANGER! One or more the parity files are smaller than expected!\n");
1525 : if (file_paritymax != 0) {
1526 : log_fatal("If this happens because you are using an old content file,\n");
1527 : log_fatal("you can 'sync' anyway using 'snapraid --force-full sync'\n");
1528 : log_fatal("to force a full rebuild of the parity.\n");
1529 : } else {
1530 : log_fatal("It's possible that the parity disks are not mounted.\n");
1531 : log_fatal("If instead you are adding a new parity level, you can 'sync' using\n");
1532 : log_fatal("'snapraid --force-full sync' to force a full rebuild of the parity.\n");
1533 : }
1534 : exit(EXIT_FAILURE);
1535 : /* LCOV_EXCL_STOP */
1536 : }
1537 : }
1538 :
1539 87 : unrecoverable_error = 0;
1540 :
1541 87 : if (state->opt.prehash) {
1542 7 : msg_progress("Hashing...\n");
1543 :
1544 7 : ret = state_hash_process(state, blockstart, blockmax, &skip_sync);
1545 7 : if (ret == -1) {
1546 : /* LCOV_EXCL_START */
1547 : ++unrecoverable_error;
1548 : /* continue, in case also doing the sync if ::skip_sync is not set */
1549 : /* LCOV_EXCL_STOP */
1550 : }
1551 : }
1552 :
1553 87 : if (!skip_sync) {
1554 86 : msg_progress("Resizing...\n");
1555 :
1556 : /* now change the size of all parities */
1557 567 : for (l = 0; l < state->level; ++l) {
1558 : int is_modified;
1559 :
1560 : /* change the size of the parity file, truncating or extending it */
1561 : /* from this point all the DELETED blocks after the end of the parity are invalid */
1562 : /* and they are automatically removed when we save the new content file */
1563 481 : ret = parity_chsize(&parity_handle[l], &state->parity[l], &is_modified, size, state->block_size, state->opt.skip_fallocate, state->opt.skip_space_holder);
1564 481 : if (ret == -1) {
1565 : /* LCOV_EXCL_START */
1566 : data_off_t out_size;
1567 : parity_size(&parity_handle[l], &out_size);
1568 : parity_overflow(state, out_size);
1569 : log_fatal("WARNING! Without a usable %s file, it isn't possible to sync.\n", lev_name(l));
1570 : exit(EXIT_FAILURE);
1571 : /* LCOV_EXCL_STOP */
1572 : }
1573 :
1574 481 : if (is_modified)
1575 175 : state->need_write = 1;
1576 : }
1577 :
1578 : /* after resizing parity files, refresh again the free info */
1579 86 : state_refresh(state);
1580 :
1581 : /**
1582 : * Save the new state before the sync but after the hashing phase
1583 : *
1584 : * This allows to recover after an aborted sync, and at the same time
1585 : * it allows to recover broken copied/moved files identified in the
1586 : * hashing phase.
1587 : *
1588 : * For example, think at this case:
1589 : * - Add some files at the array
1590 : * - Run a sync command, it will recompute the parity adding the new files
1591 : * - Abort the sync command before it stores the new content file
1592 : * - Delete the not yet synced files from the array
1593 : * - Run a new sync command
1594 : *
1595 : * The sync command has no way to know that the parity file was modified
1596 : * because the files triggering these changes are now deleted and they aren't
1597 : * listed in the content file.
1598 : * Instead, saving the new content file in advance, keeps track of all the parity
1599 : * that may be modified.
1600 : */
1601 86 : if (!state->opt.skip_content_write) {
1602 86 : if (state->need_write)
1603 77 : state_write(state);
1604 : } else {
1605 0 : log_fatal("WARNING! Skipped state write for --test-skip-content-write option.\n");
1606 : }
1607 :
1608 : /* skip degenerated cases of empty parity, or skipping all */
1609 86 : if (blockstart < blockmax) {
1610 83 : ret = state_sync_process(state, parity_handle, blockstart, blockmax);
1611 83 : if (ret == -1) {
1612 : /* LCOV_EXCL_START */
1613 : ++unrecoverable_error;
1614 : /* continue, as we are already exiting */
1615 : /* LCOV_EXCL_STOP */
1616 : }
1617 : } else {
1618 3 : msg_status("Nothing to sync.\n");
1619 : }
1620 : }
1621 :
1622 574 : for (l = 0; l < state->level; ++l) {
1623 487 : ret = parity_close(&parity_handle[l]);
1624 487 : if (ret == -1) {
1625 : /* LCOV_EXCL_START */
1626 : log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
1627 : ++unrecoverable_error;
1628 : /* continue, as we are already exiting */
1629 : /* LCOV_EXCL_STOP */
1630 : }
1631 : }
1632 :
1633 : /* abort if required */
1634 87 : if (unrecoverable_error != 0)
1635 12 : return -1;
1636 75 : return 0;
1637 : }
1638 :
|