LCOV - lcov.info - cmdline/scrub.c

LCOV - code coverage report

Current view:	top level - cmdline - scrub.c (source / functions)		Hit	Total	Coverage
Test:	lcov.info	Lines:	299	378	79.1 %
Date:	2025-10-28 11:59:11	Functions:	6	6	100.0 %

          Line data    Source code

       1             : /*
       2             :  * Copyright (C) 2013 Andrea Mazzoleni
       3             :  *
       4             :  * This program is free software: you can redistribute it and/or modify
       5             :  * it under the terms of the GNU General Public License as published by
       6             :  * the Free Software Foundation, either version 3 of the License, or
       7             :  * (at your option) any later version.
       8             :  *
       9             :  * This program is distributed in the hope that it will be useful,
      10             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  * GNU General Public License for more details.
      13             :  *
      14             :  * You should have received a copy of the GNU General Public License
      15             :  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
      16             :  */
      17             : 
      18             : #include "portable.h"
      19             : 
      20             : #include "support.h"
      21             : #include "elem.h"
      22             : #include "state.h"
      23             : #include "parity.h"
      24             : #include "handle.h"
      25             : #include "io.h"
      26             : #include "raid/raid.h"
      27             : 
      28             : /****************************************************************************/
      29             : /* scrub */
      30             : 
      31             : /**
      32             :  * Per-disk buffer storing a newly computed hash and the block it belongs to.
      33             :  */
      34             : struct snapraid_rehash {
      35             :         unsigned char hash[HASH_MAX]; /**< Hash computed with state->hash and state->hashseed. */
      36             :         struct snapraid_block* block; /**< Block the hash was computed for, or 0 if the slot holds no new hash. */
      37             : };
      38             : 
      39             : /**
      40             :  * Scrub plan to use: it drives which blocks block_is_enabled() selects.
      41             :  */
      42             : struct snapraid_plan {
      43             :         struct snapraid_state* state; /**< State whose info array provides the per-block info. */
      44             :         int plan; /**< One of the SCRUB_*. */
      45             :         time_t timelimit; /**< Time limit, blocks with a newer time are skipped. Valid only with SCRUB_AUTO. */
      46             :         block_off_t lastlimit; /**< Number of blocks allowed with time exactly at ::timelimit. */
      47             :         block_off_t countlast; /**< Counter of blocks with time exactly at ::timelimit. Reset before the selection and incremented by block_is_enabled(). */
      48             : };
      49             : 
      50             : /**
      51             :  * Check if we have to process the specified block index ::i. Returns 1 to scrub it, 0 to skip it, and may increment plan->countlast.
      52             :  */
      53       56085 : static int block_is_enabled(struct snapraid_plan* plan, block_off_t i)
      54             : {
      55             :         time_t blocktime;
      56             :         snapraid_info info;
      57             : 
      58             :         /* don't scrub unused blocks in all plans */
      59       56085 :         info = info_get(&plan->state->infoarr, i);
      60       56085 :         if (info == 0)
      61           0 :                 return 0;
      62             : 
      63             :         /* bad blocks are always scrubbed in all plans */
      64       56085 :         if (info_get_bad(info))
      65        1772 :                 return 1;
      66             : 
      67       54313 :         switch (plan->plan) {
      68        8522 :         case SCRUB_FULL :
      69             :                 /* in 'full' plan everything is scrubbed */
      70        8522 :                 return 1;
      71        9215 :         case SCRUB_EVEN :
      72             :                 /* in 'even' plan, scrub only even blocks */
      73        9215 :                 return i % 2 == 0;
      74        4687 :         case SCRUB_NEW :
      75             :                 /* in 'new' plan, only blocks just synced and never scrubbed */
      76        4687 :                 return info_get_justsynced(info);
      77        3767 :         case SCRUB_BAD :
      78             :                 /* in 'bad' plan, only bad blocks (already reported) */
      79        3767 :                 return 0;
      80             :         }
      81             :         /* any other plan value falls through to the time based selection below */
      82             :         /* if it's too new */
      83       28122 :         blocktime = info_get_time(info);
      84       28122 :         if (blocktime > plan->timelimit) {
      85             :                 /* skip it */
      86        4787 :                 return 0;
      87             :         }
      88             : 
      89             :         /* if the time is less than the limit, always include */
      90             :         /* otherwise, check if we reached the last limit count */
      91       23335 :         if (blocktime == plan->timelimit) {
      92             :                 /* if we reached the count limit */
      93       15066 :                 if (plan->countlast >= plan->lastlimit) {
      94             :                         /* skip it */
      95       12470 :                         return 0;
      96             :                 }
      97             :                 /* one more block accepted with time exactly at the limit */
      98        2596 :                 ++plan->countlast;
      99             :         }
     100             :         /* older than the limit, or at the limit and still inside the allowed count */
     101       10865 :         return 1;
     102             : }
     103             : /** Data reader passed to io_init(): loads the data block at task->position into task->buffer (zero filled if the position is unused) and reports the outcome in task->state. */
     104      154602 : static void scrub_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
     105             : {
     106      154602 :         struct snapraid_io* io = worker->io;
     107      154602 :         struct snapraid_state* state = io->state;
     108      154602 :         struct snapraid_handle* handle = worker->handle;
     109      154602 :         struct snapraid_disk* disk = handle->disk;
     110      154602 :         block_off_t blockcur = task->position;
     111      154602 :         unsigned char* buffer = task->buffer;
     112             :         int ret;
     113             :         char esc_buffer[ESC_MAX];
     114             : 
     115             :         /* if the disk position is not used */
     116      154602 :         if (!disk) {
     117             :                 /* use an empty block */
     118           0 :                 memset(buffer, 0, state->block_size);
     119           0 :                 task->state = TASK_STATE_DONE;
     120        1585 :                 return;
     121             :         }
     122             : 
     123             :         /* get the block */
     124      154602 :         task->block = fs_par2block_find(disk, blockcur);
     125             : 
     126             :         /* if the block is not used */
     127      154602 :         if (!block_has_file(task->block)) {
     128             :                 /* use an empty block */
     129        1585 :                 memset(buffer, 0, state->block_size);
     130        1585 :                 task->state = TASK_STATE_DONE;
     131        1585 :                 return;
     132             :         }
     133             : 
     134             :         /* get the file of this block, and the position of the block inside the file */
     135      153017 :         task->file = fs_par2file_get(disk, blockcur, &task->file_pos);
     136             : 
     137             :         /* if the file is different than the current one, close it */
     138      153017 :         if (handle->file != 0 && handle->file != task->file) {
     139             :                 /* keep a pointer at the file we are going to close for error reporting */
     140       72572 :                 struct snapraid_file* report = handle->file;
     141       72572 :                 ret = handle_close(handle);
     142       72572 :                 if (ret == -1) {
     143             :                         /* LCOV_EXCL_START */
     144             :                         /* This one is really an unexpected error, because we are only reading */
     145             :                         /* and closing a descriptor should never fail */
     146             :                         if (errno == EIO) {
     147             :                                 log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
     148             :                                 log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to scrub.\n");
     149             :                                 log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
     150             :                                 log_fatal("Stopping at block %u\n", blockcur);
     151             :                                 task->state = TASK_STATE_IOERROR;
     152             :                                 return;
     153             :                         }
     154             : 
     155             :                         log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
     156             :                         log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to scrub.\n");
     157             :                         log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
     158             :                         log_fatal("Stopping at block %u\n", blockcur);
     159             :                         task->state = TASK_STATE_ERROR;
     160             :                         return;
     161             :                         /* LCOV_EXCL_STOP */
     162             :                 }
     163             :         }
     164             :         /* open the file of this block; NOTE(review): presumably handle_open() reuses a handle already open on the same file - confirm */
     165      153017 :         ret = handle_open(handle, task->file, state->file_mode, log_error, 0);
     166      153017 :         if (ret == -1) {
     167           0 :                 if (errno == EIO) {
     168             :                         /* LCOV_EXCL_START */
     169             :                         log_tag("error:%u:%s:%s: Open EIO error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
     170             :                         log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to scrub.\n");
     171             :                         log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
     172             :                         log_fatal("Stopping at block %u\n", blockcur);
     173             :                         task->state = TASK_STATE_IOERROR;
     174             :                         return;
     175             :                         /* LCOV_EXCL_STOP */
     176             :                 }
     177             :                 /* any other open error is reported as an error on this block only */
     178           0 :                 log_tag("error:%u:%s:%s: Open error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
     179           0 :                 task->state = TASK_STATE_ERROR_CONTINUE;
     180           0 :                 return;
     181             :         }
     182             : 
     183             :         /* check if the file is changed, comparing size and modification time with the ones stored for the file */
     184      153017 :         if (handle->st.st_size != task->file->size
     185      153017 :                 || handle->st.st_mtime != task->file->mtime_sec
     186      153017 :                 || STAT_NSEC(&handle->st) != task->file->mtime_nsec
     187             :                 /* don't check the inode to support filesystem without persistent inodes */
     188             :         ) {
     189             :                 /* report that the block and the file are not synced */
     190           0 :                 task->is_timestamp_different = 1;
     191             :                 /* follow */
     192             :         }
     193             : 
     194             :         /* note that we intentionally don't abort if the file has different attributes */
     195             :         /* from the last sync, as we are expected to return errors if running */
     196             :         /* in an unsynced array. This is just like the check command. */
     197             : 
     198      153017 :         task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
     199      153017 :         if (task->read_size == -1) {
     200           0 :                 if (errno == EIO) {
     201           0 :                         log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
     202           0 :                         log_error("Input/Output error in file '%s' at position '%u'\n", handle->path, task->file_pos);
     203           0 :                         task->state = TASK_STATE_IOERROR_CONTINUE;
     204           0 :                         return;
     205             :                 }
     206             : 
     207           0 :                 log_tag("error:%u:%s:%s: Read error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
     208           0 :                 task->state = TASK_STATE_ERROR_CONTINUE;
     209           0 :                 return;
     210             :         }
     211             : 
     212             :         /* store the path of the opened file */
     213      153017 :         pathcpy(task->path, sizeof(task->path), handle->path);
     214             : 
     215      153017 :         task->state = TASK_STATE_DONE;
     216             : }
     217             : /** Parity reader passed to io_init(): loads the parity block at task->position into task->buffer and reports the outcome in task->state. */
     218      154602 : static void scrub_parity_reader(struct snapraid_worker* worker, struct snapraid_task* task)
     219             : {
     220      154602 :         struct snapraid_io* io = worker->io;
     221      154602 :         struct snapraid_state* state = io->state;
     222      154602 :         struct snapraid_parity_handle* parity_handle = worker->parity_handle;
     223      154602 :         unsigned level = parity_handle->level;
     224      154602 :         block_off_t blockcur = task->position;
     225      154602 :         unsigned char* buffer = task->buffer;
     226             :         int ret;
     227             : 
     228             :         /* read the parity */
     229      154602 :         ret = parity_read(parity_handle, blockcur, buffer, state->block_size, log_error);
     230      154602 :         if (ret == -1) {
     231           0 :                 if (errno == EIO) {
     232           0 :                         log_tag("parity_error:%u:%s: Read EIO error. %s\n", blockcur, lev_config_name(level), strerror(errno));
     233           0 :                         log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(level), blockcur);
     234           0 :                         task->state = TASK_STATE_IOERROR_CONTINUE;
     235           0 :                         return;
     236             :                 }
     237             :                 /* any other read error is reported as an error on this block only */
     238           0 :                 log_tag("parity_error:%u:%s: Read error. %s\n", blockcur, lev_config_name(level), strerror(errno));
     239           0 :                 task->state = TASK_STATE_ERROR_CONTINUE;
     240           0 :                 return;
     241             :         }
     242             : 
     243      154602 :         task->state = TASK_STATE_DONE;
     244             : }
     245             : 
     246          11 : static int state_scrub_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax, struct snapraid_plan* plan, time_t now)
     247             : {
     248             :         struct snapraid_io io;
     249             :         struct snapraid_handle* handle;
     250             :         void* rehandle_alloc;
     251             :         struct snapraid_rehash* rehandle;
     252             :         unsigned diskmax;
     253             :         block_off_t blockcur;
     254             :         unsigned j;
     255             :         unsigned buffermax;
     256             :         data_off_t countsize;
     257             :         block_off_t countpos;
     258             :         block_off_t countmax;
     259             :         block_off_t autosavedone;
     260             :         block_off_t autosavelimit;
     261             :         block_off_t autosavemissing;
     262             :         int ret;
     263             :         unsigned error;
     264             :         unsigned silent_error;
     265             :         unsigned io_error;
     266             :         unsigned l;
     267             :         unsigned* waiting_map;
     268             :         unsigned waiting_mac;
     269             :         char esc_buffer[ESC_MAX];
     270             :         bit_vect_t* block_enabled;
     271             : 
     272             :         /* maps the disks to handles */
     273          11 :         handle = handle_mapping(state, &diskmax);
     274             : 
     275             :         /* rehash buffers */
     276          11 :         rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
     277             : 
     278             :         /* we need 1 * data + 2 * parity */
     279          11 :         buffermax = diskmax + 2 * state->level;
     280             : 
     281             :         /* initialize the io threads */
     282          11 :         io_init(&io, state, state->opt.io_cache, buffermax, scrub_data_reader, handle, diskmax, scrub_parity_reader, 0, parity_handle, state->level);
     283             : 
     284             :         /* possibly waiting disks */
     285          11 :         waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
     286          11 :         waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
     287             : 
     288          11 :         error = 0;
     289          11 :         silent_error = 0;
     290          11 :         io_error = 0;
     291             : 
     292          11 :         msg_progress("Selecting...\n");
     293             : 
     294             :         /* first count the number of blocks to process */
     295          11 :         countmax = 0;
     296          11 :         plan->countlast = 0;
     297          11 :         block_enabled = calloc_nofail(1, bit_vect_size(blockmax)); /* preinitialize to 0 */
     298       56096 :         for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
     299       56085 :                 if (!block_is_enabled(plan, blockcur))
     300       30318 :                         continue;
     301       25767 :                 bit_vect_set(block_enabled, blockcur);
     302       25767 :                 ++countmax;
     303             :         }
     304             : 
     305             :         /* compute the autosave size for all disk, even if not read */
     306             :         /* this makes sense because the speed should be almost the same */
     307             :         /* if the disks are read in parallel */
     308          11 :         autosavelimit = state->autosave / (diskmax * state->block_size);
     309          11 :         autosavemissing = countmax; /* blocks to do */
     310          11 :         autosavedone = 0; /* blocks done */
     311             : 
     312             :         /* drop until now */
     313          11 :         state_usage_waste(state);
     314             : 
     315          11 :         countsize = 0;
     316          11 :         countpos = 0;
     317             : 
     318          11 :         msg_progress("Scrubbing...\n");
     319             : 
     320             :         /* start all the worker threads */
     321          11 :         io_start(&io, blockstart, blockmax, block_enabled);
     322             : 
     323          11 :         if (!state_progress_begin(state, blockstart, blockmax, countmax))
     324           0 :                 goto end;
     325             : 
     326       25767 :         while (1) {
     327             :                 unsigned char* buffer_recov[LEV_MAX];
     328             :                 snapraid_info info;
     329             :                 int error_on_this_block;
     330             :                 int silent_error_on_this_block;
     331             :                 int io_error_on_this_block;
     332             :                 int block_is_unsynced;
     333             :                 int rehash;
     334             :                 void** buffer;
     335             : 
     336             :                 /* go to the next block */
     337       25778 :                 blockcur = io_read_next(&io, &buffer);
     338       25778 :                 if (blockcur >= blockmax)
     339          11 :                         break;
     340             : 
     341             :                 /* until now is scheduling */
     342       25767 :                 state_usage_sched(state);
     343             : 
     344             :                 /* one more block processed for autosave */
     345       25767 :                 ++autosavedone;
     346       25767 :                 --autosavemissing;
     347             : 
     348             :                 /* by default process the block, and skip it if something goes wrong */
     349       25767 :                 error_on_this_block = 0;
     350       25767 :                 silent_error_on_this_block = 0;
     351       25767 :                 io_error_on_this_block = 0;
     352             : 
     353             :                 /* if all the blocks at this address are synced */
     354             :                 /* if not, parity is not even checked */
     355       25767 :                 block_is_unsynced = 0;
     356             : 
     357             :                 /* get block specific info */
     358       25767 :                 info = info_get(&state->infoarr, blockcur);
     359             : 
     360             :                 /* if we have to use the old hash */
     361       25767 :                 rehash = info_get_rehash(info);
     362             : 
     363             :                 /* for each disk, process the block */
     364      180369 :                 for (j = 0; j < diskmax; ++j) {
     365             :                         struct snapraid_task* task;
     366             :                         int read_size;
     367             :                         unsigned char hash[HASH_MAX];
     368             :                         struct snapraid_block* block;
     369             :                         int file_is_unsynced;
     370             :                         struct snapraid_disk* disk;
     371             :                         struct snapraid_file* file;
     372             :                         block_off_t file_pos;
     373             :                         unsigned diskcur;
     374             : 
     375             :                         /* if the file on this disk is synced */
     376             :                         /* if not, silent errors are assumed as expected error */
     377      154602 :                         file_is_unsynced = 0;
     378             : 
     379             :                         /* until now is misc */
     380      154602 :                         state_usage_misc(state);
     381             : 
     382             :                         /* get the next task */
     383      154602 :                         task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
     384             : 
     385             :                         /* until now is disk */
     386      154602 :                         state_usage_disk(state, handle, waiting_map, waiting_mac);
     387             : 
     388             :                         /* get the task results */
     389      154602 :                         disk = task->disk;
     390      154602 :                         block = task->block;
     391      154602 :                         file = task->file;
     392      154602 :                         file_pos = task->file_pos;
     393      154602 :                         read_size = task->read_size;
     394             : 
     395             :                         /* by default no rehash in case of "continue" */
     396      154602 :                         rehandle[diskcur].block = 0;
     397             : 
     398             :                         /* if the disk position is not used */
     399      154602 :                         if (!disk)
     400        2505 :                                 continue;
     401             : 
     402      154602 :                         state_usage_file(state, disk, file);
     403             : 
     404             :                         /* if the block is unsynced, errors are expected */
     405      154602 :                         if (block_has_invalid_parity(block)) {
     406             :                                 /* report that the block and the file are not synced */
     407           0 :                                 block_is_unsynced = 1;
     408           0 :                                 file_is_unsynced = 1;
     409             :                                 /* follow */
     410             :                         }
     411             : 
     412             :                         /* if the block is not used */
     413      154602 :                         if (!block_has_file(block))
     414        1585 :                                 continue;
     415             : 
     416             :                         /* if the block is unsynced, errors are expected */
     417      153017 :                         if (task->is_timestamp_different) {
     418             :                                 /* report that the block and the file are not synced */
     419           0 :                                 block_is_unsynced = 1;
     420           0 :                                 file_is_unsynced = 1;
     421             :                                 /* follow */
     422             :                         }
     423             : 
     424             :                         /* handle error conditions */
     425      153017 :                         if (task->state == TASK_STATE_IOERROR) {
     426             :                                 /* LCOV_EXCL_START */
     427             :                                 ++io_error;
     428             :                                 goto bail;
     429             :                                 /* LCOV_EXCL_STOP */
     430             :                         }
     431      153017 :                         if (task->state == TASK_STATE_ERROR) {
     432             :                                 /* LCOV_EXCL_START */
     433             :                                 ++error;
     434             :                                 goto bail;
     435             :                                 /* LCOV_EXCL_STOP */
     436             :                         }
     437      153017 :                         if (task->state == TASK_STATE_ERROR_CONTINUE) {
     438           0 :                                 ++error;
     439           0 :                                 error_on_this_block = 1;
     440           0 :                                 continue;
     441             :                         }
     442      153017 :                         if (task->state == TASK_STATE_IOERROR_CONTINUE) {
     443           0 :                                 ++io_error;
     444           0 :                                 if (io_error >= state->opt.io_error_limit) {
     445             :                                         /* LCOV_EXCL_START */
     446             :                                         log_fatal("DANGER! Too many input/output read error in a data disk, it isn't possible to scrub.\n");
     447             :                                         log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, task->path);
     448             :                                         log_fatal("Stopping at block %u\n", blockcur);
     449             :                                         goto bail;
     450             :                                         /* LCOV_EXCL_STOP */
     451             :                                 }
     452             : 
     453             :                                 /* otherwise continue */
     454           0 :                                 io_error_on_this_block = 1;
     455           0 :                                 continue;
     456             :                         }
     457      153017 :                         if (task->state != TASK_STATE_DONE) {
     458             :                                 /* LCOV_EXCL_START */
     459             :                                 log_fatal("Internal inconsistency in task state\n");
     460             :                                 os_abort();
     461             :                                 /* LCOV_EXCL_STOP */
     462             :                         }
     463             : 
     464      153017 :                         countsize += read_size;
     465             : 
     466             :                         /* now compute the hash */
     467      153017 :                         if (rehash) {
     468       27247 :                                 memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
     469             : 
     470             :                                 /* compute the new hash, and store it */
     471       27247 :                                 rehandle[diskcur].block = block;
     472       27247 :                                 memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
     473             :                         } else {
     474      125770 :                                 memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
     475             :                         }
     476             : 
     477             :                         /* until now is hash */
     478      153017 :                         state_usage_hash(state);
     479             : 
     480      153017 :                         if (block_has_updated_hash(block)) {
     481             :                                 /* compare the hash */
     482      153017 :                                 if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
     483         920 :                                         unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
     484             : 
     485         920 :                                         log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
     486             : 
     487             :                                         /* it's a silent error only if we are dealing with synced files */
     488         920 :                                         if (file_is_unsynced) {
     489           0 :                                                 ++error;
     490           0 :                                                 error_on_this_block = 1;
     491             :                                         } else {
     492         920 :                                                 log_error("Data error in file '%s' at position '%u', diff bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
     493         920 :                                                 ++silent_error;
     494         920 :                                                 silent_error_on_this_block = 1;
     495             :                                         }
     496         920 :                                         continue;
     497             :                                 }
     498             :                         }
     499             :                 }
     500             : 
     501             :                 /* buffers for parity read and not computed */
     502      180369 :                 for (l = 0; l < state->level; ++l)
     503      154602 :                         buffer_recov[l] = buffer[diskmax + state->level + l];
     504       25767 :                 for (; l < LEV_MAX; ++l)
     505           0 :                         buffer_recov[l] = 0;
     506             : 
     507             :                 /* until now is misc */
     508       25767 :                 state_usage_misc(state);
     509             : 
     510             :                 /* read the parity */
     511      180369 :                 for (l = 0; l < state->level; ++l) {
     512             :                         struct snapraid_task* task;
     513             :                         unsigned levcur;
     514             : 
     515      154602 :                         task = io_parity_read(&io, &levcur, waiting_map, &waiting_mac);
     516             : 
     517             :                         /* until now is parity */
     518      154602 :                         state_usage_parity(state, waiting_map, waiting_mac);
     519             : 
     520             :                         /* handle error conditions */
     521      154602 :                         if (task->state == TASK_STATE_IOERROR) {
     522             :                                 /* LCOV_EXCL_START */
     523             :                                 ++io_error;
     524             :                                 goto bail;
     525             :                                 /* LCOV_EXCL_STOP */
     526             :                         }
     527      154602 :                         if (task->state == TASK_STATE_ERROR) {
     528             :                                 /* LCOV_EXCL_START */
     529             :                                 ++error;
     530             :                                 goto bail;
     531             :                                 /* LCOV_EXCL_STOP */
     532             :                         }
     533      154602 :                         if (task->state == TASK_STATE_ERROR_CONTINUE) {
     534           0 :                                 ++error;
     535           0 :                                 error_on_this_block = 1;
     536             : 
     537             :                                 /* if continuing on error, clear the missing buffer */
     538           0 :                                 buffer_recov[levcur] = 0;
     539           0 :                                 continue;
     540             :                         }
     541      154602 :                         if (task->state == TASK_STATE_IOERROR_CONTINUE) {
     542           0 :                                 ++io_error;
     543           0 :                                 if (io_error >= state->opt.io_error_limit) {
     544             :                                         /* LCOV_EXCL_START */
     545             :                                         log_fatal("DANGER! Too many input/output read error in the %s disk, it isn't possible to scrub.\n", lev_name(levcur));
     546             :                                         log_fatal("Ensure that disk '%s' is sane and can be read.\n", lev_config_name(levcur));
     547             :                                         log_fatal("Stopping at block %u\n", blockcur);
     548             :                                         goto bail;
     549             :                                         /* LCOV_EXCL_STOP */
     550             :                                 }
     551             : 
     552             :                                 /* otherwise continue */
     553           0 :                                 io_error_on_this_block = 1;
     554             : 
     555             :                                 /* if continuing on error, clear the missing buffer */
     556           0 :                                 buffer_recov[levcur] = 0;
     557           0 :                                 continue;
     558             :                         }
     559      154602 :                         if (task->state != TASK_STATE_DONE) {
     560             :                                 /* LCOV_EXCL_START */
     561             :                                 log_fatal("Internal inconsistency in task state\n");
     562             :                                 os_abort();
     563             :                                 /* LCOV_EXCL_STOP */
     564             :                         }
     565             :                 }
     566             : 
     567             :                 /* if we have read all the data required and it's correct, proceed with the parity check */
     568       25767 :                 if (!error_on_this_block && !silent_error_on_this_block && !io_error_on_this_block) {
     569             : 
     570             :                         /* compute the parity */
     571       24847 :                         raid_gen(diskmax, state->level, state->block_size, buffer);
     572             : 
     573             :                         /* compare the parity */
     574      173929 :                         for (l = 0; l < state->level; ++l) {
     575      149082 :                                 if (buffer_recov[l] && memcmp(buffer[diskmax + l], buffer_recov[l], state->block_size) != 0) {
     576           0 :                                         unsigned diff = memdiff(buffer[diskmax + l], buffer_recov[l], state->block_size);
     577             : 
     578           0 :                                         log_tag("parity_error:%u:%s: Data error, diff bits %u/%u\n", blockcur, lev_config_name(l), diff, state->block_size * 8);
     579             : 
     580             :                                         /* it's a silent error only if we are dealing with synced blocks */
     581           0 :                                         if (block_is_unsynced) {
     582           0 :                                                 ++error;
     583           0 :                                                 error_on_this_block = 1;
     584             :                                         } else {
     585           0 :                                                 log_fatal("Data error in parity '%s' at position '%u', diff bits %u/%u\n", lev_config_name(l), blockcur, diff, state->block_size * 8);
     586           0 :                                                 ++silent_error;
     587           0 :                                                 silent_error_on_this_block = 1;
     588             :                                         }
     589             :                                 }
     590             :                         }
     591             : 
     592             :                         /* until now is raid */
     593       24847 :                         state_usage_raid(state);
     594             :                 }
     595             : 
     596       25767 :                 if (silent_error_on_this_block || io_error_on_this_block) {
     597             :                         /* set the error status keeping other info */
     598         920 :                         info_set(&state->infoarr, blockcur, info_set_bad(info));
     599       24847 :                 } else if (error_on_this_block) {
     600             :                         /* do nothing, as this is a generic error */
     601             :                         /* likely caused by a not synced array */
     602             :                 } else {
     603             :                         /* if rehash is needed */
     604       24847 :                         if (rehash) {
     605             :                                 /* store all the new hash already computed */
     606       32256 :                                 for (j = 0; j < diskmax; ++j) {
     607       27648 :                                         if (rehandle[j].block)
     608       27247 :                                                 memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
     609             :                                 }
     610             :                         }
     611             : 
     612             :                         /* update the time info of the block */
     613             :                         /* and clear any other flag */
     614       24847 :                         info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 0));
     615             :                 }
     616             : 
     617             :                 /* mark the state as needing write */
     618       25767 :                 state->need_write = 1;
     619             : 
     620             :                 /* count the number of processed block */
     621       25767 :                 ++countpos;
     622             : 
     623             :                 /* progress */
     624       25767 :                 if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
     625             :                         /* LCOV_EXCL_START */
     626             :                         break;
     627             :                         /* LCOV_EXCL_STOP */
     628             :                 }
     629             : 
     630             :                 /* thermal control */
     631       25767 :                 if (state_thermal_alarm(state)) {
     632             :                         /* until now is misc */
     633           0 :                         state_usage_misc(state);
     634             : 
     635           0 :                         state_progress_stop(state);
     636             : 
     637           0 :                         state_thermal_cooldown(state);
     638             : 
     639           0 :                         state_progress_restart(state);
     640             : 
     641             :                         /* drop until now */
     642           0 :                         state_usage_waste(state);
     643             :                 }
     644             : 
     645             :                 /* autosave */
     646       25767 :                 if (state->autosave != 0
     647           0 :                         && autosavedone >= autosavelimit /* if we have reached the limit */
     648           0 :                         && autosavemissing >= autosavelimit /* if we have at least a full step to do */
     649             :                 ) {
     650           0 :                         autosavedone = 0; /* restart the counter */
     651             : 
     652             :                         /* until now is misc */
     653           0 :                         state_usage_misc(state);
     654             : 
     655           0 :                         state_progress_stop(state);
     656             : 
     657           0 :                         msg_progress("Autosaving...\n");
     658           0 :                         state_write(state);
     659             : 
     660           0 :                         state_progress_restart(state);
     661             : 
     662             :                         /* drop until now */
     663           0 :                         state_usage_waste(state);
     664             :                 }
     665             :         }
     666             : 
     667          11 : end:
     668          11 :         state_progress_end(state, countpos, countmax, countsize, "Nothing to scrub. Use the -p PLAN option to select a different plan, like -p full.\n");
     669             : 
     670             :         /* save the new state if required */
     671          11 :         if (state->need_write || state->opt.force_content_write)
     672           9 :                 state_write(state);
     673             : 
     674          11 :         state_usage_print(state);
     675             : 
     676          11 :         if (error || silent_error || io_error) {
     677           1 :                 msg_status("\n");
     678           1 :                 msg_status("%8u file errors\n", error);
     679           1 :                 msg_status("%8u io errors\n", io_error);
     680           1 :                 msg_status("%8u data errors\n", silent_error);
     681             :         } else {
     682             :                 /* print the result only if processed something */
     683          10 :                 if (countpos != 0)
     684           8 :                         msg_status("Everything OK\n");
     685             :         }
     686             : 
     687          11 :         if (error)
     688           0 :                 log_fatal("WARNING! Unexpected file errors!\n");
     689          11 :         if (io_error)
     690           0 :                 log_fatal("DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
     691          11 :         if (silent_error)
     692           1 :                 log_fatal("DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
     693          11 :         if (io_error || silent_error) {
     694           1 :                 log_fatal("Use 'snapraid status' to list the bad blocks.\n");
     695           1 :                 log_fatal("Use 'snapraid -e fix' to recover them.\n");
     696           1 :                 log_fatal("Use 'snapraid -p bad scrub' to recheck after fixing to clear the bad state.\n");
     697             :         }
     698             : 
     699          11 :         log_tag("summary:error_file:%u\n", error);
     700          11 :         log_tag("summary:error_io:%u\n", io_error);
     701          11 :         log_tag("summary:error_data:%u\n", silent_error);
     702          11 :         if (error + silent_error + io_error == 0)
     703          10 :                 log_tag("summary:exit:ok\n");
     704             :         else
     705           1 :                 log_tag("summary:exit:error\n");
     706          11 :         log_flush();
     707             : 
     708          11 : bail:
     709             :         /* stop all the worker threads */
     710          11 :         io_stop(&io);
     711             : 
     712          77 :         for (j = 0; j < diskmax; ++j) {
     713          66 :                 struct snapraid_file* file = handle[j].file;
     714          66 :                 struct snapraid_disk* disk = handle[j].disk;
     715          66 :                 ret = handle_close(&handle[j]);
     716          66 :                 if (ret == -1) {
     717             :                         /* LCOV_EXCL_START */
     718             :                         log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
     719             :                         log_fatal("DANGER! Unexpected close error in a data disk.\n");
     720             :                         ++error;
     721             :                         /* continue, as we are already exiting */
     722             :                         /* LCOV_EXCL_STOP */
     723             :                 }
     724             :         }
     725             : 
     726          11 :         free(handle);
     727          11 :         free(rehandle_alloc);
     728          11 :         free(waiting_map);
     729          11 :         io_done(&io);
     730          11 :         free(block_enabled);
     731             : 
     732          11 :         if (state->opt.expect_recoverable) {
     733           1 :                 if (error + silent_error + io_error == 0)
     734           0 :                         return -1;
     735             :         } else {
     736          10 :                 if (error + silent_error + io_error != 0)
     737           0 :                         return -1;
     738             :         }
     739          11 :         return 0;
     740             : }
     741             : 
     742             : /**
     743             :  * Return a * b / c approximated to the upper value.
     744             :  */
     745           2 : static uint32_t md(uint32_t a, uint32_t b, uint32_t c)
     746             : {
     747           2 :         uint64_t v = a;
     748             : 
     749           2 :         v *= b;
     750           2 :         v += c - 1;
     751           2 :         v /= c;
     752             : 
     753           2 :         return v;
     754             : }
     755             : 
     756          11 : int state_scrub(struct snapraid_state* state, int plan, int olderthan)
     757             : {
     758             :         block_off_t blockmax;
     759             :         block_off_t countlimit;
     760             :         block_off_t i;
     761             :         block_off_t count;
     762             :         time_t recentlimit;
     763             :         int ret;
     764             :         struct snapraid_parity_handle parity_handle[LEV_MAX];
     765             :         struct snapraid_plan ps;
     766             :         time_t* timemap;
     767             :         unsigned error;
     768             :         time_t now;
     769             :         unsigned l;
     770             : 
     771             :         /* get the present time */
     772          11 :         now = time(0);
     773             : 
     774          11 :         msg_progress("Initializing...\n");
     775             : 
     776          11 :         if ((plan == SCRUB_BAD || plan == SCRUB_NEW || plan == SCRUB_FULL)
     777           4 :                 && olderthan >= 0) {
     778             :                 /* LCOV_EXCL_START */
     779             :                 log_fatal("You can specify -o, --older-than only with a numeric percentage.\n");
     780             :                 exit(EXIT_FAILURE);
     781             :                 /* LCOV_EXCL_STOP */
     782             :         }
     783             : 
     784          11 :         blockmax = parity_allocated_size(state);
     785             : 
     786             :         /* preinitialize to avoid warnings */
     787          11 :         countlimit = 0;
     788          11 :         recentlimit = 0;
     789             : 
     790          11 :         ps.state = state;
     791          11 :         if (state->opt.force_scrub_even) {
     792           1 :                 ps.plan = SCRUB_EVEN;
     793          10 :         } else if (plan == SCRUB_FULL) {
     794           2 :                 ps.plan = SCRUB_FULL;
     795           2 :                 msg_info("Scrub plan: full. All data blocks will be checked.\n");
     796           8 :         } else if (plan == SCRUB_NEW) {
     797           1 :                 ps.plan = SCRUB_NEW;
     798           1 :                 msg_info("Scrub plan: new. Only blocks that have never been scrubbed will be checked.\n");
     799           7 :         } else if (plan == SCRUB_BAD) {
     800           1 :                 ps.plan = SCRUB_BAD;
     801           1 :                 msg_info("Scrub plan: bad. Only blocks previously marked as bad will be checked.\n");
     802           6 :         } else if (state->opt.force_scrub_at) {
     803             :                 /* scrub the specified amount of blocks */
     804           4 :                 ps.plan = SCRUB_AUTO;
     805           4 :                 countlimit = state->opt.force_scrub_at;
     806           4 :                 recentlimit = now;
     807             :         } else {
     808           2 :                 ps.plan = SCRUB_AUTO;
     809           2 :                 if (plan >= 0) {
     810           1 :                         countlimit = md(blockmax, plan, 100);
     811             :                 } else {
     812             :                         /* by default scrub 8.33% of the array (100/12=8.(3)) */
     813           1 :                         countlimit = md(blockmax, 1, 12);
     814             :                 }
     815             : 
     816           2 :                 if (olderthan >= 0) {
     817           1 :                         recentlimit = now - olderthan * 24 * 3600;
     818             :                 } else {
     819             :                         /* by default use a 10 day time limit */
     820           1 :                         recentlimit = now - 10 * 24 * 3600;
     821             :                 }
     822             : 
     823           2 :                 if (plan >= 0) {
     824           1 :                         if (olderthan >= 0)
     825           0 :                                 msg_info("Scrub plan: auto. %d%% of the array, older than %d days, will be checked.\n", plan, olderthan);
     826             :                         else
     827           1 :                                 msg_info("Scrub plan: auto. %d%% of the array, older than 10 days, will be checked.\n", plan);
     828             :                 } else {
     829           1 :                         if (olderthan >= 0)
     830           1 :                                 msg_info("Scrub plan: auto. 8.3%% of the array, older than %d days, will be checked.\n", olderthan);
     831             :                         else
     832           0 :                                 msg_info("Scrub plan: auto. 8.3%% of the array, older than 10 days, will be checked.\n");
     833             :                 }
     834             :         }
     835             : 
     836             :         /* identify the time limit */
     837             :         /* we sort all the block times, and we identify the time limit for which we reach the quota */
     838             :         /* this allow to process first the oldest blocks */
     839          11 :         timemap = malloc_nofail(blockmax * sizeof(time_t));
     840             : 
     841             :         /* copy the info in the temp vector */
     842          11 :         count = 0;
     843          11 :         log_tag("block_count:%u\n", blockmax);
     844       56096 :         for (i = 0; i < blockmax; ++i) {
     845       56085 :                 snapraid_info info = info_get(&state->infoarr, i);
     846             : 
     847             :                 /* skip unused blocks */
     848       56085 :                 if (info == 0)
     849           0 :                         continue;
     850             : 
     851       56085 :                 timemap[count++] = info_get_time(info);
     852             :         }
     853             : 
     854          11 :         if (!count) {
     855             :                 /* LCOV_EXCL_START */
     856             :                 log_fatal("The array appears to be empty.\n");
     857             :                 exit(EXIT_FAILURE);
     858             :                 /* LCOV_EXCL_STOP */
     859             :         }
     860             : 
     861             :         /* sort it */
     862          11 :         qsort(timemap, count, sizeof(time_t), time_compare);
     863             : 
     864             :         /* output the info map */
     865          11 :         i = 0;
     866          11 :         log_tag("info_count:%u\n", count);
     867          29 :         while (i < count) {
     868          18 :                 unsigned j = i + 1;
     869       56085 :                 while (j < count && timemap[i] == timemap[j])
     870       56067 :                         ++j;
     871          18 :                 log_tag("info_time:%" PRIu64 ":%u\n", (uint64_t)timemap[i], j - i);
     872          18 :                 i = j;
     873             :         }
     874             : 
     875             :         /* compute the limits from count/recentlimit */
     876          11 :         if (ps.plan == SCRUB_AUTO) {
     877             :                 /* no more than the full count */
     878           6 :                 if (countlimit > count)
     879           2 :                         countlimit = count;
     880             : 
     881             :                 /* decrease until we reach the specific recentlimit */
     882          53 :                 while (countlimit > 0 && timemap[countlimit - 1] > recentlimit)
     883          47 :                         --countlimit;
     884             : 
     885             :                 /* if there is something to scrub */
     886           6 :                 if (countlimit > 0) {
     887             :                         /* get the most recent time we want to scrub */
     888           5 :                         ps.timelimit = timemap[countlimit - 1];
     889             : 
     890             :                         /* count how many entries for this exact time we have to scrub */
     891             :                         /* if the blocks have all the same time, we end with countlimit == lastlimit */
     892           5 :                         ps.lastlimit = 1;
     893        2596 :                         while (countlimit > ps.lastlimit && timemap[countlimit - ps.lastlimit - 1] == ps.timelimit)
     894        2591 :                                 ++ps.lastlimit;
     895             :                 } else {
     896             :                         /* if nothing to scrub, disable also other limits */
     897           1 :                         ps.timelimit = 0;
     898           1 :                         ps.lastlimit = 0;
     899             :                 }
     900             : 
     901           6 :                 log_tag("count_limit:%u\n", countlimit);
     902           6 :                 log_tag("time_limit:%" PRIu64 "\n", (uint64_t)ps.timelimit);
     903           6 :                 log_tag("last_limit:%u\n", ps.lastlimit);
     904             :         }
     905             : 
     906             :         /* free the temp vector */
     907          11 :         free(timemap);
     908             : 
     909             :         /* open the file for reading */
     910          77 :         for (l = 0; l < state->level; ++l) {
     911          66 :                 ret = parity_open(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
     912          66 :                 if (ret == -1) {
     913             :                         /* LCOV_EXCL_START */
     914             :                         log_fatal("WARNING! Without an accessible %s file, it isn't possible to scrub.\n", lev_name(l));
     915             :                         exit(EXIT_FAILURE);
     916             :                         /* LCOV_EXCL_STOP */
     917             :                 }
     918             :         }
     919             : 
     920          11 :         error = 0;
     921             : 
     922          11 :         ret = state_scrub_process(state, parity_handle, 0, blockmax, &ps, now);
     923          11 :         if (ret == -1) {
     924           0 :                 ++error;
     925             :                 /* continue, as we are already exiting */
     926             :         }
     927             : 
     928          77 :         for (l = 0; l < state->level; ++l) {
     929          66 :                 ret = parity_close(&parity_handle[l]);
     930          66 :                 if (ret == -1) {
     931             :                         /* LCOV_EXCL_START */
     932             :                         log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
     933             :                         ++error;
     934             :                         /* continue, as we are already exiting */
     935             :                         /* LCOV_EXCL_STOP */
     936             :                 }
     937             :         }
     938             : 
     939             :         /* abort if required */
     940          11 :         if (error != 0)
     941           0 :                 return -1;
     942          11 :         return 0;
     943             : }
     944             :

Generated by: LCOV version 1.0