LCOV - code coverage report
Current view: top level - cmdline - thermal.c (source / functions) Hit Total Coverage
Test: lcov.info Lines: 159 193 82.4 %
Date: 2025-10-28 11:59:11 Functions: 7 8 87.5 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2025 Andrea Mazzoleni
       3             :  *
       4             :  * This program is free software: you can redistribute it and/or modify
       5             :  * it under the terms of the GNU General Public License as published by
       6             :  * the Free Software Foundation, either version 3 of the License, or
       7             :  * (at your option) any later version.
       8             :  *
       9             :  * This program is distributed in the hope that it will be useful,
      10             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  * GNU General Public License for more details.
      13             :  *
      14             :  * You should have received a copy of the GNU General Public License
      15             :  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
      16             :  */
      17             : 
      18             : #include "portable.h"
      19             : 
      20             : #include "thermal.h"
      21             : #include "state.h"
      22             : #include "io.h"
      23             : 
      24             : #include <math.h>
      25             : 
      26          12 : struct snapraid_thermal* thermal_alloc(uint64_t dev, const char* name)
      27             : {
      28          12 :         struct snapraid_thermal* thermal = malloc_nofail(sizeof(struct snapraid_thermal));
      29             : 
      30          12 :         thermal->device = dev;
      31          12 :         thermal->latest_temperature = 0;
      32          12 :         thermal->count = 0;
      33          12 :         pathcpy(thermal->name, sizeof(thermal->name), name);
      34             : 
      35          12 :         return thermal;
      36             : }
      37             : 
      38          12 : void thermal_free(struct snapraid_thermal* thermal)
      39             : {
      40          12 :         free(thermal);
      41          12 : }
      42             : 
      43             : /*
      44             :  * Fit exponential heating model to data using least squares
      45             :  */
      46          12 : struct snapraid_thermal_params fit_thermal_model(const struct snapraid_thermal_point* points, int n_points, double t_ambient)
      47             : {
      48             :         struct snapraid_thermal_params model;
      49             :         double t_steady_try;
      50             :         double k_try;
      51             : 
      52          12 :         memset(&model, 0, sizeof(model));
      53             : 
      54          12 :         model.t_ambient = t_ambient;
      55             : 
      56             :         /* at least four points to have a result */
      57          12 :         if (n_points < 4)
      58           0 :                 return model;
      59             : 
      60          12 :         double last_temp = points[n_points - 1].temperature;
      61             : 
      62             :         /* iterative refinement to find best k_heat and t_steady */
      63          12 :         double best_error = 1e10;
      64          12 :         double best_k = 0;
      65          12 :         double best_t_steady = 0;
      66             : 
      67             :         /* grid search for parameters */
      68         576 :         for (t_steady_try = last_temp + 2.0; t_steady_try <= last_temp + 25.0; t_steady_try += 0.5) {
      69       15228 :                 for (k_try = 0.00001; k_try <= 0.001; k_try *= 1.2) {
      70       14664 :                         double error = 0.0;
      71             :                         int i;
      72             : 
      73             :                         /* calculate error for this parameter set */
      74       87984 :                         for (i = 0; i < n_points; i++) {
      75       73320 :                                 double t = points[i].time;
      76       73320 :                                 double t_predicted = t_steady_try - (t_steady_try - points[0].temperature) * exp(-k_try * t);
      77       73320 :                                 double diff = points[i].temperature - t_predicted;
      78       73320 :                                 error += diff * diff;
      79             :                         }
      80             : 
      81       14664 :                         if (error < best_error) {
      82         312 :                                 best_error = error;
      83         312 :                                 best_k = k_try;
      84         312 :                                 best_t_steady = t_steady_try;
      85             :                         }
      86             :                 }
      87             :         }
      88             : 
      89          12 :         model.k_heat = best_k;
      90          12 :         model.t_steady = best_t_steady;
      91             : 
      92             :         /* calculate quality metrics */
      93          12 :         double sum_squared_residuals = 0.0;
      94          12 :         double sum_total = 0.0;
      95          12 :         double mean_temp = 0.0;
      96          12 :         model.max_error = 0.0;
      97             :         int i;
      98             : 
      99             :         /* calculate mean temperature */
     100          72 :         for (i = 0; i < n_points; i++)
     101          60 :                 mean_temp += points[i].temperature;
     102          12 :         mean_temp /= n_points;
     103             : 
     104             :         /* calculate R-squared and errors */
     105          72 :         for (i = 0; i < n_points; i++) {
     106          60 :                 double t = points[i].time;
     107          60 :                 double t_predicted = model.t_steady - (model.t_steady - points[0].temperature) * exp(-model.k_heat * t);
     108          60 :                 double residual = points[i].temperature - t_predicted;
     109             : 
     110          60 :                 sum_squared_residuals += residual * residual;
     111          60 :                 sum_total += (points[i].temperature - mean_temp) * (points[i].temperature - mean_temp);
     112             : 
     113          60 :                 double abs_error = fabs(residual);
     114          60 :                 if (abs_error > model.max_error) {
     115          36 :                         model.max_error = abs_error;
     116             :                 }
     117             :         }
     118             : 
     119          12 :         model.rmse = sqrt(sum_squared_residuals / n_points);
     120          12 :         model.r_squared = 1.0 - (sum_squared_residuals / sum_total);
     121             : 
     122          12 :         return model;
     123             : }
     124             : 
     125       72510 : static int smart_temp(devinfo_t* devinfo)
     126             : {
     127       72510 :         uint64_t t = devinfo->smart[SMART_TEMPERATURE_CELSIUS];
     128             : 
     129             :         /* validate temperature */
     130       72510 :         if (t == SMART_UNASSIGNED)
     131           0 :                 return -1;
     132       72510 :         if (t == 0)
     133           0 :                 return -1;
     134       72510 :         if (t > 100)
     135           0 :                 return -1;
     136             : 
     137       72510 :         return t;
     138             : }
     139             : 
     140        2609 : void state_thermal(struct snapraid_state* state, time_t now)
     141             : {
     142             :         tommy_node* i;
     143             :         unsigned j;
     144             :         tommy_list high;
     145             :         tommy_list low;
     146             :         int ret;
     147             : 
     148        2609 :         if (state->thermal_temperature_limit == 0)
     149         193 :                 return;
     150             : 
     151        2416 :         tommy_list_init(&high);
     152        2416 :         tommy_list_init(&low);
     153             : 
     154             :         /* for all disks */
     155       16912 :         for (i = state->disklist; i != 0; i = i->next) {
     156       14496 :                 struct snapraid_disk* disk = i->data;
     157             :                 devinfo_t* entry;
     158             : 
     159       14496 :                 entry = calloc_nofail(1, sizeof(devinfo_t));
     160             : 
     161       14496 :                 entry->device = disk->device;
     162       14496 :                 pathcpy(entry->name, sizeof(entry->name), disk->name);
     163       14496 :                 pathcpy(entry->mount, sizeof(entry->mount), disk->dir);
     164       14496 :                 pathcpy(entry->smartctl, sizeof(entry->smartctl), disk->smartctl);
     165       14496 :                 memcpy(entry->smartignore, disk->smartignore, sizeof(entry->smartignore));
     166             : 
     167       14496 :                 tommy_list_insert_tail(&high, &entry->node, entry);
     168             :         }
     169             : 
     170             :         /* for all parities */
     171       16912 :         for (j = 0; j < state->level; ++j) {
     172             :                 devinfo_t* entry;
     173             :                 unsigned s;
     174             : 
     175       72480 :                 for (s = 0; s < state->parity[j].split_mac; ++s) {
     176       57984 :                         entry = calloc_nofail(1, sizeof(devinfo_t));
     177             : 
     178       57984 :                         entry->device = state->parity[j].split_map[s].device;
     179       57984 :                         pathcpy(entry->name, sizeof(entry->name), lev_config_name(j));
     180       57984 :                         pathcpy(entry->mount, sizeof(entry->mount), state->parity[j].split_map[s].path);
     181       57984 :                         pathcpy(entry->smartctl, sizeof(entry->smartctl), state->parity[j].smartctl);
     182       57984 :                         memcpy(entry->smartignore, state->parity[j].smartignore, sizeof(entry->smartignore));
     183       57984 :                         pathcut(entry->mount); /* remove the parity file */
     184             : 
     185       57984 :                         tommy_list_insert_tail(&high, &entry->node, entry);
     186             :                 }
     187             :         }
     188             : 
     189        2416 :         if (state->opt.fake_device) {
     190        2416 :                 ret = devtest(&high, &low, DEVICE_SMART);
     191             :         } else {
     192           0 :                 ret = devquery(&high, &low, DEVICE_SMART, 0 /* only disks in the array */);
     193             :         }
     194             : 
     195             :         /* on error, just disable thermal gathering */
     196        2416 :         if (ret != 0)
     197           0 :                 return;
     198             : 
     199             :         /* if the list is empty, it's not supported in this platform */
     200        2416 :         if (tommy_list_empty(&low))
     201           0 :                 return;
     202             : 
     203             :         /* if ambient temperature is not set, set it now with the lowest HD temperature */
     204        2416 :         if (state->thermal_ambient_temperature == 0) {
     205           1 :                 state->thermal_ambient_temperature = ambient_temperature();
     206             : 
     207          31 :                 for (i = tommy_list_head(&low); i != 0; i = i->next) {
     208          30 :                         devinfo_t* devinfo = i->data;
     209             : 
     210          30 :                         int temp = smart_temp(devinfo);
     211          30 :                         if (temp < 0)
     212           0 :                                 continue;
     213             : 
     214          30 :                         log_tag("thermal:system:candidate:%d\n", temp);
     215             : 
     216          30 :                         if (state->thermal_ambient_temperature == 0 || state->thermal_ambient_temperature > temp)
     217           1 :                                 state->thermal_ambient_temperature = temp;
     218             :                 }
     219             : 
     220           1 :                 log_tag("thermal:system:final:%d\n", state->thermal_ambient_temperature);
     221             :         }
     222             : 
     223        2416 :         int highest_temperature = 0;
     224       74896 :         for (i = tommy_list_head(&low); i != 0; i = i->next) {
     225             :                 tommy_node* t;
     226             :                 struct snapraid_thermal* found;
     227       72480 :                 devinfo_t* devinfo = i->data;
     228             :                 unsigned k;
     229             : 
     230       72480 :                 int temperature = smart_temp(devinfo);
     231       72480 :                 if (temperature < 0)
     232           0 :                         continue;
     233             : 
     234             :                 /* search of the entry */
     235       72480 :                 found = 0;
     236      601584 :                 for (t = tommy_list_head(&state->thermallist); t != 0; t = t->next) {
     237      601572 :                         struct snapraid_thermal* thermal = t->data;
     238      601572 :                         if (thermal->device == devinfo->device) {
     239       72468 :                                 found = thermal;
     240       72468 :                                 break;
     241             :                         }
     242             :                 }
     243             : 
     244             :                 /* if not found, create it */
     245       72480 :                 if (found == 0) {
     246          12 :                         found = thermal_alloc(devinfo->device, devinfo->name);
     247          12 :                         tommy_list_insert_tail(&state->thermallist, &found->node, found);
     248             :                 }
     249             : 
     250       72480 :                 found->latest_temperature = temperature;
     251             : 
     252       72480 :                 if (highest_temperature < temperature)
     253        2416 :                         highest_temperature = temperature;
     254             : 
     255       72480 :                 log_tag("thermal:current:%s:%" PRIu64 ":%d\n", devinfo->name, devinfo->device, temperature);
     256             : 
     257       72480 :                 if (state->thermal_stop_gathering)
     258           0 :                         continue;
     259             : 
     260       72480 :                 if (found->count + 1 >= THERMAL_MAX) /* keep one extra space at the end */
     261           0 :                         continue;
     262             : 
     263             :                 /* only monotone increasing temperature */
     264       72480 :                 if (found->count > 0 && found->data[found->count - 1].temperature >= temperature)
     265       72468 :                         continue;
     266             : 
     267             :                 /* insert the new data point */
     268          12 :                 found->data[found->count].temperature = temperature;
     269          12 :                 found->data[found->count].time = now - state->thermal_first;
     270          12 :                 ++found->count;
     271             : 
     272          12 :                 if (state->opt.fake_device) {
     273             :                         /* fill with fake data */
     274          12 :                         found->data[0].time = 0;
     275          12 :                         found->data[0].temperature = 27;
     276          12 :                         found->data[1].time = 100;
     277          12 :                         found->data[1].temperature = 28;
     278          12 :                         found->data[2].time = 300;
     279          12 :                         found->data[2].temperature = 29;
     280          12 :                         found->data[3].time = 700;
     281          12 :                         found->data[3].temperature = 30;
     282          12 :                         found->data[4].time = 1500;
     283          12 :                         found->data[4].temperature = 31;
     284          12 :                         found->count = 5;
     285             :                 }
     286             : 
     287             :                 /* log the new data */
     288          12 :                 log_tag("thermal:heat:%s:%" PRIu64 ":%u:", devinfo->name, devinfo->device, found->count);
     289          72 :                 for (k = 0; k < found->count; ++k)
     290          60 :                         log_tag("%s%d/%d", k > 0 ? "," : "", (int)found->data[k].temperature, (int)found->data[k].time);
     291          12 :                 log_tag("\n");
     292             : 
     293             :                 /* estimate parameters */
     294          12 :                 found->params = fit_thermal_model(found->data, found->count, state->thermal_ambient_temperature);
     295             : 
     296          12 :                 log_tag("thermal:params:%s:%" PRIu64 ":%g:%g:%g:%g:%g:%g\n", devinfo->name, devinfo->device,
     297             :                         found->params.k_heat, found->params.t_ambient, found->params.t_steady,
     298             :                         found->params.rmse, found->params.r_squared, found->params.max_error);
     299             :         }
     300             : 
     301             :         /* always update the highest temperature */
     302        2416 :         state->thermal_highest_temperature = highest_temperature;
     303             : 
     304        2416 :         log_tag("thermal:highest:%d\n", highest_temperature);
     305        2416 :         log_flush();
     306             : 
     307        2416 :         tommy_list_foreach(&high, free);
     308        2416 :         tommy_list_foreach(&low, free);
     309             : }
     310             : 
     311     1060633 : int state_thermal_alarm(struct snapraid_state* state)
     312             : {
     313             :         /* if no limit, there is no thermal support */
     314     1060633 :         if (state->thermal_temperature_limit == 0)
     315     1058217 :                 return 0;
     316             : 
     317        2416 :         if (state->thermal_highest_temperature <= state->thermal_temperature_limit)
     318        2416 :                 return 0;
     319             : 
     320           0 :         return 1;
     321             : }
     322             : 
     323           0 : void state_thermal_cooldown(struct snapraid_state* state)
     324             : {
     325           0 :         int sleep_time = state->thermal_cooldown_time;
     326             : 
     327           0 :         if (sleep_time == 0)
     328           0 :                 sleep_time = 5 * 60; /* default sleep time */
     329           0 :         if (sleep_time < 5 * 60)
     330           0 :                 sleep_time = 5 * 60; /* minimum sleep time */
     331             : 
     332             :         /* from now on, stop any further data gathering as the heating is interrupted */
     333           0 :         state->thermal_stop_gathering = 1;
     334             : 
     335           0 :         log_tag("thermal:spindown\n");
     336           0 :         state_device(state, DEVICE_DOWN, 0);
     337             : 
     338           0 :         msg_progress("Cooldown...\n");
     339             : 
     340           0 :         log_tag("thermal:cooldown:%d\n", sleep_time);
     341           0 :         printf("Waiting for %d minutes...\n", sleep_time / 60);
     342             : 
     343           0 :         log_flush();
     344             : 
     345             :         /* every 30 seconds spin down any disk that was spunup */
     346           0 :         while (sleep_time > 0) {
     347           0 :                 state_device(state, DEVICE_DOWNIFUP, 0);
     348             : 
     349           0 :                 sleep(30);
     350           0 :                 sleep_time -= 30;
     351             :         }
     352             : 
     353           0 :         if (!global_interrupt) { /* don't wake-up if we are interrupting */
     354           0 :                 log_tag("thermal:spinup\n");
     355             : 
     356             :                 /* spinup */
     357           0 :                 state_device(state, DEVICE_UP, 0);
     358             : 
     359             :                 /* log new thermal info */
     360           0 :                 state_thermal(state, 0);
     361             :         }
     362           0 : }
     363             : 
     364         222 : int state_thermal_begin(struct snapraid_state* state, time_t now)
     365             : {
     366         222 :         if (state->thermal_temperature_limit == 0)
     367         221 :                 return 1;
     368             : 
     369             :         /* initial thermal measure */
     370           1 :         state->thermal_first = now;
     371           1 :         state->thermal_latest = now;
     372           1 :         state_thermal(state, now);
     373             : 
     374           1 :         if (state->thermal_ambient_temperature != 0) {
     375           1 :                 printf("System temperature is %u degrees\n", state->thermal_ambient_temperature);
     376             : 
     377           1 :                 if (state->thermal_temperature_limit != 0 && state->thermal_temperature_limit <= state->thermal_ambient_temperature) {
     378             :                         /* LCOV_EXCL_START */
     379             :                         log_fatal("DANGER! System temperature of %d degrees is higher than the temperature limit of %d degrees. Unable to proceeed!\n", state->thermal_ambient_temperature, state->thermal_temperature_limit);
     380             :                         log_flush();
     381             :                         return 0;
     382             :                         /* LCOV_EXCL_STOP */
     383             :                 }
     384             :         }
     385             : 
     386           1 :         if (state_thermal_alarm(state)) {
     387             :                 /* LCOV_EXCL_START */
     388             :                 log_fatal("DANGER! Hard disk temperature of %d degrees is already outside the operating range. Unable to proceeed!\n", state->thermal_highest_temperature);
     389             :                 log_flush();
     390             :                 return 0;
     391             :                 /* LCOV_EXCL_STOP */
     392             :         }
     393             : 
     394           1 :         return 1;
     395             : }
     396             : 

Generated by: LCOV version 1.0