Line data Source code
1 : /*
2 : * Copyright (C) 2025 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 : */
17 :
18 : #include "portable.h"
19 :
20 : #include "thermal.h"
21 : #include "state.h"
22 : #include "io.h"
23 :
24 : #include <math.h>
25 :
26 12 : struct snapraid_thermal* thermal_alloc(uint64_t dev, const char* name)
27 : {
28 12 : struct snapraid_thermal* thermal = malloc_nofail(sizeof(struct snapraid_thermal));
29 :
30 12 : thermal->device = dev;
31 12 : thermal->latest_temperature = 0;
32 12 : thermal->count = 0;
33 12 : pathcpy(thermal->name, sizeof(thermal->name), name);
34 :
35 12 : return thermal;
36 : }
37 :
/*
 * Release the memory of a thermal record.
 *
 * The pointer is passed straight to free().
 */
void thermal_free(struct snapraid_thermal* thermal)
{
	free(thermal);
}
42 :
43 : /*
44 : * Fit exponential heating model to data using least squares
45 : */
46 12 : struct snapraid_thermal_params fit_thermal_model(const struct snapraid_thermal_point* points, int n_points, double t_ambient)
47 : {
48 : struct snapraid_thermal_params model;
49 : double t_steady_try;
50 : double k_try;
51 :
52 12 : memset(&model, 0, sizeof(model));
53 :
54 12 : model.t_ambient = t_ambient;
55 :
56 : /* at least four points to have a result */
57 12 : if (n_points < 4)
58 0 : return model;
59 :
60 12 : double last_temp = points[n_points - 1].temperature;
61 :
62 : /* iterative refinement to find best k_heat and t_steady */
63 12 : double best_error = 1e10;
64 12 : double best_k = 0;
65 12 : double best_t_steady = 0;
66 :
67 : /* grid search for parameters */
68 576 : for (t_steady_try = last_temp + 2.0; t_steady_try <= last_temp + 25.0; t_steady_try += 0.5) {
69 15228 : for (k_try = 0.00001; k_try <= 0.001; k_try *= 1.2) {
70 14664 : double error = 0.0;
71 : int i;
72 :
73 : /* calculate error for this parameter set */
74 87984 : for (i = 0; i < n_points; i++) {
75 73320 : double t = points[i].time;
76 73320 : double t_predicted = t_steady_try - (t_steady_try - points[0].temperature) * exp(-k_try * t);
77 73320 : double diff = points[i].temperature - t_predicted;
78 73320 : error += diff * diff;
79 : }
80 :
81 14664 : if (error < best_error) {
82 312 : best_error = error;
83 312 : best_k = k_try;
84 312 : best_t_steady = t_steady_try;
85 : }
86 : }
87 : }
88 :
89 12 : model.k_heat = best_k;
90 12 : model.t_steady = best_t_steady;
91 :
92 : /* calculate quality metrics */
93 12 : double sum_squared_residuals = 0.0;
94 12 : double sum_total = 0.0;
95 12 : double mean_temp = 0.0;
96 12 : model.max_error = 0.0;
97 : int i;
98 :
99 : /* calculate mean temperature */
100 72 : for (i = 0; i < n_points; i++)
101 60 : mean_temp += points[i].temperature;
102 12 : mean_temp /= n_points;
103 :
104 : /* calculate R-squared and errors */
105 72 : for (i = 0; i < n_points; i++) {
106 60 : double t = points[i].time;
107 60 : double t_predicted = model.t_steady - (model.t_steady - points[0].temperature) * exp(-model.k_heat * t);
108 60 : double residual = points[i].temperature - t_predicted;
109 :
110 60 : sum_squared_residuals += residual * residual;
111 60 : sum_total += (points[i].temperature - mean_temp) * (points[i].temperature - mean_temp);
112 :
113 60 : double abs_error = fabs(residual);
114 60 : if (abs_error > model.max_error) {
115 36 : model.max_error = abs_error;
116 : }
117 : }
118 :
119 12 : model.rmse = sqrt(sum_squared_residuals / n_points);
120 12 : model.r_squared = 1.0 - (sum_squared_residuals / sum_total);
121 :
122 12 : return model;
123 : }
124 :
125 72510 : static int smart_temp(devinfo_t* devinfo)
126 : {
127 72510 : uint64_t t = devinfo->smart[SMART_TEMPERATURE_CELSIUS];
128 :
129 : /* validate temperature */
130 72510 : if (t == SMART_UNASSIGNED)
131 0 : return -1;
132 72510 : if (t == 0)
133 0 : return -1;
134 72510 : if (t > 100)
135 0 : return -1;
136 :
137 72510 : return t;
138 : }
139 :
140 2609 : void state_thermal(struct snapraid_state* state, time_t now)
141 : {
142 : tommy_node* i;
143 : unsigned j;
144 : tommy_list high;
145 : tommy_list low;
146 : int ret;
147 :
148 2609 : if (state->thermal_temperature_limit == 0)
149 193 : return;
150 :
151 2416 : tommy_list_init(&high);
152 2416 : tommy_list_init(&low);
153 :
154 : /* for all disks */
155 16912 : for (i = state->disklist; i != 0; i = i->next) {
156 14496 : struct snapraid_disk* disk = i->data;
157 : devinfo_t* entry;
158 :
159 14496 : entry = calloc_nofail(1, sizeof(devinfo_t));
160 :
161 14496 : entry->device = disk->device;
162 14496 : pathcpy(entry->name, sizeof(entry->name), disk->name);
163 14496 : pathcpy(entry->mount, sizeof(entry->mount), disk->dir);
164 14496 : pathcpy(entry->smartctl, sizeof(entry->smartctl), disk->smartctl);
165 14496 : memcpy(entry->smartignore, disk->smartignore, sizeof(entry->smartignore));
166 :
167 14496 : tommy_list_insert_tail(&high, &entry->node, entry);
168 : }
169 :
170 : /* for all parities */
171 16912 : for (j = 0; j < state->level; ++j) {
172 : devinfo_t* entry;
173 : unsigned s;
174 :
175 72480 : for (s = 0; s < state->parity[j].split_mac; ++s) {
176 57984 : entry = calloc_nofail(1, sizeof(devinfo_t));
177 :
178 57984 : entry->device = state->parity[j].split_map[s].device;
179 57984 : pathcpy(entry->name, sizeof(entry->name), lev_config_name(j));
180 57984 : pathcpy(entry->mount, sizeof(entry->mount), state->parity[j].split_map[s].path);
181 57984 : pathcpy(entry->smartctl, sizeof(entry->smartctl), state->parity[j].smartctl);
182 57984 : memcpy(entry->smartignore, state->parity[j].smartignore, sizeof(entry->smartignore));
183 57984 : pathcut(entry->mount); /* remove the parity file */
184 :
185 57984 : tommy_list_insert_tail(&high, &entry->node, entry);
186 : }
187 : }
188 :
189 2416 : if (state->opt.fake_device) {
190 2416 : ret = devtest(&high, &low, DEVICE_SMART);
191 : } else {
192 0 : ret = devquery(&high, &low, DEVICE_SMART, 0 /* only disks in the array */);
193 : }
194 :
195 : /* on error, just disable thermal gathering */
196 2416 : if (ret != 0)
197 0 : return;
198 :
199 : /* if the list is empty, it's not supported in this platform */
200 2416 : if (tommy_list_empty(&low))
201 0 : return;
202 :
203 : /* if ambient temperature is not set, set it now with the lowest HD temperature */
204 2416 : if (state->thermal_ambient_temperature == 0) {
205 1 : state->thermal_ambient_temperature = ambient_temperature();
206 :
207 31 : for (i = tommy_list_head(&low); i != 0; i = i->next) {
208 30 : devinfo_t* devinfo = i->data;
209 :
210 30 : int temp = smart_temp(devinfo);
211 30 : if (temp < 0)
212 0 : continue;
213 :
214 30 : log_tag("thermal:system:candidate:%d\n", temp);
215 :
216 30 : if (state->thermal_ambient_temperature == 0 || state->thermal_ambient_temperature > temp)
217 1 : state->thermal_ambient_temperature = temp;
218 : }
219 :
220 1 : log_tag("thermal:system:final:%d\n", state->thermal_ambient_temperature);
221 : }
222 :
223 2416 : int highest_temperature = 0;
224 74896 : for (i = tommy_list_head(&low); i != 0; i = i->next) {
225 : tommy_node* t;
226 : struct snapraid_thermal* found;
227 72480 : devinfo_t* devinfo = i->data;
228 : unsigned k;
229 :
230 72480 : int temperature = smart_temp(devinfo);
231 72480 : if (temperature < 0)
232 0 : continue;
233 :
234 : /* search of the entry */
235 72480 : found = 0;
236 601584 : for (t = tommy_list_head(&state->thermallist); t != 0; t = t->next) {
237 601572 : struct snapraid_thermal* thermal = t->data;
238 601572 : if (thermal->device == devinfo->device) {
239 72468 : found = thermal;
240 72468 : break;
241 : }
242 : }
243 :
244 : /* if not found, create it */
245 72480 : if (found == 0) {
246 12 : found = thermal_alloc(devinfo->device, devinfo->name);
247 12 : tommy_list_insert_tail(&state->thermallist, &found->node, found);
248 : }
249 :
250 72480 : found->latest_temperature = temperature;
251 :
252 72480 : if (highest_temperature < temperature)
253 2416 : highest_temperature = temperature;
254 :
255 72480 : log_tag("thermal:current:%s:%" PRIu64 ":%d\n", devinfo->name, devinfo->device, temperature);
256 :
257 72480 : if (state->thermal_stop_gathering)
258 0 : continue;
259 :
260 72480 : if (found->count + 1 >= THERMAL_MAX) /* keep one extra space at the end */
261 0 : continue;
262 :
263 : /* only monotone increasing temperature */
264 72480 : if (found->count > 0 && found->data[found->count - 1].temperature >= temperature)
265 72468 : continue;
266 :
267 : /* insert the new data point */
268 12 : found->data[found->count].temperature = temperature;
269 12 : found->data[found->count].time = now - state->thermal_first;
270 12 : ++found->count;
271 :
272 12 : if (state->opt.fake_device) {
273 : /* fill with fake data */
274 12 : found->data[0].time = 0;
275 12 : found->data[0].temperature = 27;
276 12 : found->data[1].time = 100;
277 12 : found->data[1].temperature = 28;
278 12 : found->data[2].time = 300;
279 12 : found->data[2].temperature = 29;
280 12 : found->data[3].time = 700;
281 12 : found->data[3].temperature = 30;
282 12 : found->data[4].time = 1500;
283 12 : found->data[4].temperature = 31;
284 12 : found->count = 5;
285 : }
286 :
287 : /* log the new data */
288 12 : log_tag("thermal:heat:%s:%" PRIu64 ":%u:", devinfo->name, devinfo->device, found->count);
289 72 : for (k = 0; k < found->count; ++k)
290 60 : log_tag("%s%d/%d", k > 0 ? "," : "", (int)found->data[k].temperature, (int)found->data[k].time);
291 12 : log_tag("\n");
292 :
293 : /* estimate parameters */
294 12 : found->params = fit_thermal_model(found->data, found->count, state->thermal_ambient_temperature);
295 :
296 12 : log_tag("thermal:params:%s:%" PRIu64 ":%g:%g:%g:%g:%g:%g\n", devinfo->name, devinfo->device,
297 : found->params.k_heat, found->params.t_ambient, found->params.t_steady,
298 : found->params.rmse, found->params.r_squared, found->params.max_error);
299 : }
300 :
301 : /* always update the highest temperature */
302 2416 : state->thermal_highest_temperature = highest_temperature;
303 :
304 2416 : log_tag("thermal:highest:%d\n", highest_temperature);
305 2416 : log_flush();
306 :
307 2416 : tommy_list_foreach(&high, free);
308 2416 : tommy_list_foreach(&low, free);
309 : }
310 :
311 1060633 : int state_thermal_alarm(struct snapraid_state* state)
312 : {
313 : /* if no limit, there is no thermal support */
314 1060633 : if (state->thermal_temperature_limit == 0)
315 1058217 : return 0;
316 :
317 2416 : if (state->thermal_highest_temperature <= state->thermal_temperature_limit)
318 2416 : return 0;
319 :
320 0 : return 1;
321 : }
322 :
323 0 : void state_thermal_cooldown(struct snapraid_state* state)
324 : {
325 0 : int sleep_time = state->thermal_cooldown_time;
326 :
327 0 : if (sleep_time == 0)
328 0 : sleep_time = 5 * 60; /* default sleep time */
329 0 : if (sleep_time < 5 * 60)
330 0 : sleep_time = 5 * 60; /* minimum sleep time */
331 :
332 : /* from now on, stop any further data gathering as the heating is interrupted */
333 0 : state->thermal_stop_gathering = 1;
334 :
335 0 : log_tag("thermal:spindown\n");
336 0 : state_device(state, DEVICE_DOWN, 0);
337 :
338 0 : msg_progress("Cooldown...\n");
339 :
340 0 : log_tag("thermal:cooldown:%d\n", sleep_time);
341 0 : printf("Waiting for %d minutes...\n", sleep_time / 60);
342 :
343 0 : log_flush();
344 :
345 : /* every 30 seconds spin down any disk that was spunup */
346 0 : while (sleep_time > 0) {
347 0 : state_device(state, DEVICE_DOWNIFUP, 0);
348 :
349 0 : sleep(30);
350 0 : sleep_time -= 30;
351 : }
352 :
353 0 : if (!global_interrupt) { /* don't wake-up if we are interrupting */
354 0 : log_tag("thermal:spinup\n");
355 :
356 : /* spinup */
357 0 : state_device(state, DEVICE_UP, 0);
358 :
359 : /* log new thermal info */
360 0 : state_thermal(state, 0);
361 : }
362 0 : }
363 :
364 222 : int state_thermal_begin(struct snapraid_state* state, time_t now)
365 : {
366 222 : if (state->thermal_temperature_limit == 0)
367 221 : return 1;
368 :
369 : /* initial thermal measure */
370 1 : state->thermal_first = now;
371 1 : state->thermal_latest = now;
372 1 : state_thermal(state, now);
373 :
374 1 : if (state->thermal_ambient_temperature != 0) {
375 1 : printf("System temperature is %u degrees\n", state->thermal_ambient_temperature);
376 :
377 1 : if (state->thermal_temperature_limit != 0 && state->thermal_temperature_limit <= state->thermal_ambient_temperature) {
378 : /* LCOV_EXCL_START */
379 : log_fatal("DANGER! System temperature of %d degrees is higher than the temperature limit of %d degrees. Unable to proceeed!\n", state->thermal_ambient_temperature, state->thermal_temperature_limit);
380 : log_flush();
381 : return 0;
382 : /* LCOV_EXCL_STOP */
383 : }
384 : }
385 :
386 1 : if (state_thermal_alarm(state)) {
387 : /* LCOV_EXCL_START */
388 : log_fatal("DANGER! Hard disk temperature of %d degrees is already outside the operating range. Unable to proceeed!\n", state->thermal_highest_temperature);
389 : log_flush();
390 : return 0;
391 : /* LCOV_EXCL_STOP */
392 : }
393 :
394 1 : return 1;
395 : }
396 :
|