LCOV - code coverage report
Current view: top level - cmdline - thermal.c (source / functions) Hit Total Coverage
Test: lcov.info Lines: 155 187 82.9 %
Date: 2026-04-29 15:04:44 Functions: 6 7 85.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-3.0-or-later
       2             : // Copyright (C) 2025 Andrea Mazzoleni
       3             : 
       4             : #include "portable.h"
       5             : 
       6             : #include "thermal.h"
       7             : #include "state.h"
       8             : #include "io.h"
       9             : 
      10             : #include <math.h>
      11             : 
      12          30 : struct snapraid_thermal* thermal_alloc(uint64_t dev, const char* name)
      13             : {
      14          30 :         struct snapraid_thermal* thermal = malloc_nofail(sizeof(struct snapraid_thermal));
      15             : 
      16          30 :         thermal->device = dev;
      17          30 :         thermal->latest_temperature = 0;
      18          30 :         thermal->count = 0;
      19          30 :         pathcpy(thermal->name, sizeof(thermal->name), name);
      20             : 
      21          30 :         return thermal;
      22             : }
      23             : 
      24          30 : void thermal_free(struct snapraid_thermal* thermal)
      25             : {
      26          30 :         free(thermal);
      27          30 : }
      28             : 
      29             : /*
      30             :  * Fit exponential heating model to data using least squares
      31             :  */
      32          30 : struct snapraid_thermal_params fit_thermal_model(const struct snapraid_thermal_point* points, int n_points, double t_ambient)
      33             : {
      34             :         struct snapraid_thermal_params model;
      35             :         double t_steady_try;
      36             :         double k_try;
      37             : 
      38          30 :         memset(&model, 0, sizeof(model));
      39             : 
      40          30 :         model.t_ambient = t_ambient;
      41             : 
      42             :         /* at least four points to have a result */
      43          30 :         if (n_points < 4)
      44           0 :                 return model;
      45             : 
      46          30 :         double last_temp = points[n_points - 1].temperature;
      47             : 
      48             :         /* iterative refinement to find best k_heat and t_steady */
      49          30 :         double best_error = 1e10;
      50          30 :         double best_k = 0;
      51          30 :         double best_t_steady = 0;
      52             : 
      53             :         /* grid search for parameters */
      54        1440 :         for (t_steady_try = last_temp + 2.0; t_steady_try <= last_temp + 25.0; t_steady_try += 0.5) {
      55       38070 :                 for (k_try = 0.00001; k_try <= 0.001; k_try *= 1.2) {
      56       36660 :                         double error = 0.0;
      57             :                         int i;
      58             : 
      59             :                         /* calculate error for this parameter set */
      60      219960 :                         for (i = 0; i < n_points; i++) {
      61      183300 :                                 double t = points[i].time;
      62      183300 :                                 double t_predicted = t_steady_try - (t_steady_try - points[0].temperature) * exp(-k_try * t);
      63      183300 :                                 double diff = points[i].temperature - t_predicted;
      64      183300 :                                 error += diff * diff;
      65             :                         }
      66             : 
      67       36660 :                         if (error < best_error) {
      68         780 :                                 best_error = error;
      69         780 :                                 best_k = k_try;
      70         780 :                                 best_t_steady = t_steady_try;
      71             :                         }
      72             :                 }
      73             :         }
      74             : 
      75          30 :         model.k_heat = best_k;
      76          30 :         model.t_steady = best_t_steady;
      77             : 
      78             :         /* calculate quality metrics */
      79          30 :         double sum_squared_residuals = 0.0;
      80          30 :         double sum_total = 0.0;
      81          30 :         double mean_temp = 0.0;
      82          30 :         model.max_error = 0.0;
      83             :         int i;
      84             : 
      85             :         /* calculate mean temperature */
      86         180 :         for (i = 0; i < n_points; i++)
      87         150 :                 mean_temp += points[i].temperature;
      88          30 :         mean_temp /= n_points;
      89             : 
      90             :         /* calculate R-squared and errors */
      91         180 :         for (i = 0; i < n_points; i++) {
      92         150 :                 double t = points[i].time;
      93         150 :                 double t_predicted = model.t_steady - (model.t_steady - points[0].temperature) * exp(-model.k_heat * t);
      94         150 :                 double residual = points[i].temperature - t_predicted;
      95             : 
      96         150 :                 sum_squared_residuals += residual * residual;
      97         150 :                 sum_total += (points[i].temperature - mean_temp) * (points[i].temperature - mean_temp);
      98             : 
      99         150 :                 double abs_error = fabs(residual);
     100         150 :                 if (abs_error > model.max_error) {
     101          90 :                         model.max_error = abs_error;
     102             :                 }
     103             :         }
     104             : 
     105          30 :         model.rmse = sqrt(sum_squared_residuals / n_points);
     106          30 :         model.r_squared = 1.0 - (sum_squared_residuals / sum_total);
     107             : 
     108          30 :         return model;
     109             : }
     110             : 
     111        2642 : void state_thermal(struct snapraid_state* state, time_t now)
     112             : {
     113             :         tommy_node* i;
     114             :         unsigned j;
     115             :         tommy_list high;
     116             :         tommy_list low;
     117             :         int ret;
     118             : 
     119        2642 :         if (state->thermal_temperature_limit == 0)
     120         226 :                 return;
     121             : 
     122        2416 :         tommy_list_init(&high);
     123        2416 :         tommy_list_init(&low);
     124             : 
     125             :         /* for all disks */
     126       16912 :         for (i = state->disklist; i != 0; i = i->next) {
     127       14496 :                 struct snapraid_disk* disk = i->data;
     128             :                 devinfo_t* entry;
     129             : 
     130       14496 :                 entry = calloc_nofail(1, sizeof(devinfo_t));
     131             : 
     132       14496 :                 entry->device = disk->mount_device;
     133       14496 :                 device_name_set(entry, disk->name, 0);
     134       14496 :                 pathcpy(entry->mount, sizeof(entry->mount), disk->mount_point);
     135       14496 :                 pathcpy(entry->smartctl, sizeof(entry->smartctl), disk->smartctl);
     136       14496 :                 memcpy(entry->smartignore, disk->smartignore, sizeof(entry->smartignore));
     137             : 
     138       14496 :                 tommy_list_insert_tail(&high, &entry->node, entry);
     139             :         }
     140             : 
     141             :         /* for all parities */
     142       16912 :         for (j = 0; j < state->level; ++j) {
     143             :                 devinfo_t* entry;
     144             :                 unsigned s;
     145             : 
     146       72480 :                 for (s = 0; s < state->parity[j].split_mac; ++s) {
     147       57984 :                         entry = calloc_nofail(1, sizeof(devinfo_t));
     148             : 
     149       57984 :                         entry->device = state->parity[j].split_map[s].device;
     150       57984 :                         device_name_set(entry, lev_config_name(j), s);
     151       57984 :                         pathcpy(entry->mount, sizeof(entry->mount), state->parity[j].split_map[s].path);
     152       57984 :                         pathcpy(entry->smartctl, sizeof(entry->smartctl), state->parity[j].smartctl);
     153       57984 :                         memcpy(entry->smartignore, state->parity[j].smartignore, sizeof(entry->smartignore));
     154       57984 :                         pathcut(entry->mount); /* remove the parity file */
     155             : 
     156       57984 :                         tommy_list_insert_tail(&high, &entry->node, entry);
     157             :                 }
     158             :         }
     159             : 
     160             :         /* with a GUI always gives time reference */
     161        2416 :         if (state->opt.gui)
     162           0 :                 log_tag("unixtime:%" PRIi64 "\n", (int64_t)now);
     163             : 
     164        2416 :         if (state->opt.fake_device) {
     165        2416 :                 ret = devtest(&high, &low, DEVICE_SMART);
     166             :         } else {
     167           0 :                 ret = devquery(&high, &low, DEVICE_SMART);
     168             :         }
     169             : 
     170             :         /* on error, just disable thermal gathering */
     171        2416 :         if (ret != 0)
     172           0 :                 return;
     173             : 
     174             :         /* if the list is empty, it's not supported in this platform */
     175        2416 :         if (tommy_list_empty(&low))
     176           0 :                 return;
     177             : 
     178             :         /* report new attribute */
     179        2416 :         state_attr(state, &low);
     180             : 
     181             :         /* if ambient temperature is not set, set it now with the lowest HD temperature */
     182        2416 :         if (state->thermal_ambient_temperature == 0) {
     183           1 :                 state->thermal_ambient_temperature = ambient_temperature();
     184             : 
     185          31 :                 for (i = tommy_list_head(&low); i != 0; i = i->next) {
     186          30 :                         devinfo_t* devinfo = i->data;
     187             : 
     188          30 :                         int temp = smart_temp(devinfo);
     189          30 :                         if (temp < 0)
     190           0 :                                 continue;
     191             : 
     192          30 :                         log_tag("thermal:system:candidate:%d\n", temp);
     193             : 
     194          30 :                         if (state->thermal_ambient_temperature == 0 || state->thermal_ambient_temperature > temp)
     195           1 :                                 state->thermal_ambient_temperature = temp;
     196             :                 }
     197             : 
     198           1 :                 log_tag("thermal:system:final:%d\n", state->thermal_ambient_temperature);
     199             :         }
     200             : 
     201        2416 :         int highest_temperature = 0;
     202       74896 :         for (i = tommy_list_head(&low); i != 0; i = i->next) {
     203             :                 tommy_node* t;
     204             :                 struct snapraid_thermal* found;
     205       72480 :                 devinfo_t* devinfo = i->data;
     206             :                 unsigned k;
     207             : 
     208       72480 :                 int temperature = smart_temp(devinfo);
     209       72480 :                 if (temperature < 0)
     210           0 :                         continue;
     211             : 
     212             :                 /* search of the entry */
     213       72480 :                 found = 0;
     214     1123440 :                 for (t = tommy_list_head(&state->thermallist); t != 0; t = t->next) {
     215     1123410 :                         struct snapraid_thermal* thermal = t->data;
     216     1123410 :                         if (thermal->device == devinfo->device) {
     217       72450 :                                 found = thermal;
     218       72450 :                                 break;
     219             :                         }
     220             :                 }
     221             : 
     222             :                 /* if not found, create it */
     223       72480 :                 if (found == 0) {
     224          30 :                         found = thermal_alloc(devinfo->device, devinfo->name);
     225          30 :                         tommy_list_insert_tail(&state->thermallist, &found->node, found);
     226             :                 }
     227             : 
     228       72480 :                 found->latest_temperature = temperature;
     229             : 
     230       72480 :                 if (highest_temperature < temperature)
     231        2416 :                         highest_temperature = temperature;
     232             : 
     233       72480 :                 log_tag("thermal:current:%s:%" PRIu64 ":%d\n", devinfo->name, devinfo->device, temperature);
     234             : 
     235       72480 :                 if (state->thermal_stop_gathering)
     236           0 :                         continue;
     237             : 
     238       72480 :                 if (found->count + 1 >= THERMAL_MAX) /* keep one extra space at the end */
     239           0 :                         continue;
     240             : 
     241             :                 /* only monotone increasing temperature */
     242       72480 :                 if (found->count > 0 && found->data[found->count - 1].temperature >= temperature)
     243       72450 :                         continue;
     244             : 
     245             :                 /* insert the new data point */
     246          30 :                 found->data[found->count].temperature = temperature;
     247          30 :                 found->data[found->count].time = now - state->thermal_first;
     248          30 :                 ++found->count;
     249             : 
     250          30 :                 if (state->opt.fake_device) {
     251             :                         /* fill with fake data */
     252          30 :                         found->data[0].time = 0;
     253          30 :                         found->data[0].temperature = 27;
     254          30 :                         found->data[1].time = 100;
     255          30 :                         found->data[1].temperature = 28;
     256          30 :                         found->data[2].time = 300;
     257          30 :                         found->data[2].temperature = 29;
     258          30 :                         found->data[3].time = 700;
     259          30 :                         found->data[3].temperature = 30;
     260          30 :                         found->data[4].time = 1500;
     261          30 :                         found->data[4].temperature = 31;
     262          30 :                         found->count = 5;
     263             :                 }
     264             : 
     265             :                 /* log the new data */
     266          30 :                 log_tag("thermal:heat:%s:%" PRIu64 ":%u:", devinfo->name, devinfo->device, found->count);
     267         180 :                 for (k = 0; k < found->count; ++k)
     268         150 :                         log_tag("%s%d/%d", k > 0 ? "," : "", (int)found->data[k].temperature, (int)found->data[k].time);
     269          30 :                 log_tag("\n");
     270             : 
     271             :                 /* estimate parameters */
     272          30 :                 found->params = fit_thermal_model(found->data, found->count, state->thermal_ambient_temperature);
     273             : 
     274          30 :                 log_tag("thermal:params:%s:%" PRIu64 ":%g:%g:%g:%g:%g:%g\n", devinfo->name, devinfo->device,
     275             :                         found->params.k_heat, found->params.t_ambient, found->params.t_steady,
     276             :                         found->params.rmse, found->params.r_squared, found->params.max_error);
     277             :         }
     278             : 
     279             :         /* always update the highest temperature */
     280        2416 :         state->thermal_highest_temperature = highest_temperature;
     281             : 
     282        2416 :         log_tag("thermal:highest:%d\n", highest_temperature);
     283        2416 :         log_flush();
     284             : 
     285        2416 :         tommy_list_foreach(&high, free);
     286        2416 :         tommy_list_foreach(&low, free);
     287             : }
     288             : 
     289     1121242 : int state_thermal_alarm(struct snapraid_state* state)
     290             : {
     291             :         /* if no limit, there is no thermal support */
     292     1121242 :         if (state->thermal_temperature_limit == 0)
     293     1118826 :                 return 0;
     294             : 
     295        2416 :         if (state->thermal_highest_temperature <= state->thermal_temperature_limit)
     296        2416 :                 return 0;
     297             : 
     298           0 :         return 1;
     299             : }
     300             : 
     301           0 : void state_thermal_cooldown(struct snapraid_state* state)
     302             : {
     303           0 :         int sleep_time = state->thermal_cooldown_time;
     304             : 
     305           0 :         if (sleep_time == 0)
     306           0 :                 sleep_time = 5 * 60; /* default sleep time */
     307           0 :         if (sleep_time < 3 * 60)
     308           0 :                 sleep_time = 3 * 60; /* minimum sleep time */
     309             : 
     310             :         /* from now on, stop any further data gathering as the heating is interrupted */
     311           0 :         state->thermal_stop_gathering = 1;
     312             : 
     313           0 :         log_tag("thermal:spindown\n");
     314           0 :         state_device(state, DEVICE_DOWN, 0);
     315             : 
     316           0 :         msg_progress("Cooldown...\n");
     317             : 
     318           0 :         log_tag("thermal:cooldown:%d\n", sleep_time);
     319           0 :         printf("Waiting for %d minutes...\n", sleep_time / 60);
     320             : 
     321           0 :         log_flush();
     322             : 
     323             :         /* every 30 seconds spin down any disk that was spunup */
     324           0 :         while (sleep_time > 0) {
     325           0 :                 state_device(state, DEVICE_DOWNIFUP, 0);
     326             : 
     327           0 :                 sleep(30);
     328           0 :                 sleep_time -= 30;
     329             :         }
     330             : 
     331           0 :         if (!global_interrupt) { /* don't wake-up if we are interrupting */
     332           0 :                 log_tag("thermal:spinup\n");
     333             : 
     334             :                 /* spinup */
     335           0 :                 state_device(state, DEVICE_UP, 0);
     336             : 
     337             :                 /* log new thermal info */
     338           0 :                 state_thermal(state, 0);
     339             :         }
     340           0 : }
     341             : 
     342         254 : int state_thermal_begin(struct snapraid_state* state, time_t now)
     343             : {
     344         254 :         if (state->thermal_temperature_limit == 0)
     345         253 :                 return 1;
     346             : 
     347             :         /* initial thermal measure */
     348           1 :         state->thermal_first = now;
     349           1 :         state->thermal_latest = now;
     350           1 :         state_thermal(state, now);
     351             : 
     352           1 :         if (state->thermal_ambient_temperature != 0) {
     353           1 :                 printf("System temperature is %u degrees\n", state->thermal_ambient_temperature);
     354             : 
     355           1 :                 if (state->thermal_temperature_limit != 0 && state->thermal_temperature_limit <= state->thermal_ambient_temperature) {
     356             :                         /* LCOV_EXCL_START */
     357             :                         log_fatal(EENVIRONMENT, "DANGER! System temperature of %d degrees is higher than the temperature limit of %d degrees. Unable to proceed!\n", state->thermal_ambient_temperature, state->thermal_temperature_limit);
     358             :                         log_flush();
     359             :                         return 0;
     360             :                         /* LCOV_EXCL_STOP */
     361             :                 }
     362             :         }
     363             : 
     364           1 :         if (state_thermal_alarm(state)) {
     365             :                 /* LCOV_EXCL_START */
     366             :                 log_fatal(EENVIRONMENT, "DANGER! Hard disk temperature of %d degrees is already outside the operating range. Unable to proceed!\n", state->thermal_highest_temperature);
     367             :                 log_flush();
     368             :                 return 0;
     369             :                 /* LCOV_EXCL_STOP */
     370             :         }
     371             : 
     372           1 :         return 1;
     373             : }
     374             : 

Generated by: LCOV version 1.0