Line data Source code
1 : // SPDX-License-Identifier: GPL-3.0-or-later
2 : // Copyright (C) 2025 Andrea Mazzoleni
3 :
4 : #include "portable.h"
5 :
6 : #include "thermal.h"
7 : #include "state.h"
8 : #include "io.h"
9 :
10 : #include <math.h>
11 :
12 30 : struct snapraid_thermal* thermal_alloc(uint64_t dev, const char* name)
13 : {
14 30 : struct snapraid_thermal* thermal = malloc_nofail(sizeof(struct snapraid_thermal));
15 :
16 30 : thermal->device = dev;
17 30 : thermal->latest_temperature = 0;
18 30 : thermal->count = 0;
19 30 : pathcpy(thermal->name, sizeof(thermal->name), name);
20 :
21 30 : return thermal;
22 : }
23 :
/*
 * Release a thermal record.
 *
 * The record is handed directly to free().
 */
void thermal_free(struct snapraid_thermal* thermal)
{
	void* block = thermal;

	free(block);
}
28 :
29 : /*
30 : * Fit exponential heating model to data using least squares
31 : */
32 30 : struct snapraid_thermal_params fit_thermal_model(const struct snapraid_thermal_point* points, int n_points, double t_ambient)
33 : {
34 : struct snapraid_thermal_params model;
35 : double t_steady_try;
36 : double k_try;
37 :
38 30 : memset(&model, 0, sizeof(model));
39 :
40 30 : model.t_ambient = t_ambient;
41 :
42 : /* at least four points to have a result */
43 30 : if (n_points < 4)
44 0 : return model;
45 :
46 30 : double last_temp = points[n_points - 1].temperature;
47 :
48 : /* iterative refinement to find best k_heat and t_steady */
49 30 : double best_error = 1e10;
50 30 : double best_k = 0;
51 30 : double best_t_steady = 0;
52 :
53 : /* grid search for parameters */
54 1440 : for (t_steady_try = last_temp + 2.0; t_steady_try <= last_temp + 25.0; t_steady_try += 0.5) {
55 38070 : for (k_try = 0.00001; k_try <= 0.001; k_try *= 1.2) {
56 36660 : double error = 0.0;
57 : int i;
58 :
59 : /* calculate error for this parameter set */
60 219960 : for (i = 0; i < n_points; i++) {
61 183300 : double t = points[i].time;
62 183300 : double t_predicted = t_steady_try - (t_steady_try - points[0].temperature) * exp(-k_try * t);
63 183300 : double diff = points[i].temperature - t_predicted;
64 183300 : error += diff * diff;
65 : }
66 :
67 36660 : if (error < best_error) {
68 780 : best_error = error;
69 780 : best_k = k_try;
70 780 : best_t_steady = t_steady_try;
71 : }
72 : }
73 : }
74 :
75 30 : model.k_heat = best_k;
76 30 : model.t_steady = best_t_steady;
77 :
78 : /* calculate quality metrics */
79 30 : double sum_squared_residuals = 0.0;
80 30 : double sum_total = 0.0;
81 30 : double mean_temp = 0.0;
82 30 : model.max_error = 0.0;
83 : int i;
84 :
85 : /* calculate mean temperature */
86 180 : for (i = 0; i < n_points; i++)
87 150 : mean_temp += points[i].temperature;
88 30 : mean_temp /= n_points;
89 :
90 : /* calculate R-squared and errors */
91 180 : for (i = 0; i < n_points; i++) {
92 150 : double t = points[i].time;
93 150 : double t_predicted = model.t_steady - (model.t_steady - points[0].temperature) * exp(-model.k_heat * t);
94 150 : double residual = points[i].temperature - t_predicted;
95 :
96 150 : sum_squared_residuals += residual * residual;
97 150 : sum_total += (points[i].temperature - mean_temp) * (points[i].temperature - mean_temp);
98 :
99 150 : double abs_error = fabs(residual);
100 150 : if (abs_error > model.max_error) {
101 90 : model.max_error = abs_error;
102 : }
103 : }
104 :
105 30 : model.rmse = sqrt(sum_squared_residuals / n_points);
106 30 : model.r_squared = 1.0 - (sum_squared_residuals / sum_total);
107 :
108 30 : return model;
109 : }
110 :
111 2642 : void state_thermal(struct snapraid_state* state, time_t now)
112 : {
113 : tommy_node* i;
114 : unsigned j;
115 : tommy_list high;
116 : tommy_list low;
117 : int ret;
118 :
119 2642 : if (state->thermal_temperature_limit == 0)
120 226 : return;
121 :
122 2416 : tommy_list_init(&high);
123 2416 : tommy_list_init(&low);
124 :
125 : /* for all disks */
126 16912 : for (i = state->disklist; i != 0; i = i->next) {
127 14496 : struct snapraid_disk* disk = i->data;
128 : devinfo_t* entry;
129 :
130 14496 : entry = calloc_nofail(1, sizeof(devinfo_t));
131 :
132 14496 : entry->device = disk->mount_device;
133 14496 : device_name_set(entry, disk->name, 0);
134 14496 : pathcpy(entry->mount, sizeof(entry->mount), disk->mount_point);
135 14496 : pathcpy(entry->smartctl, sizeof(entry->smartctl), disk->smartctl);
136 14496 : memcpy(entry->smartignore, disk->smartignore, sizeof(entry->smartignore));
137 :
138 14496 : tommy_list_insert_tail(&high, &entry->node, entry);
139 : }
140 :
141 : /* for all parities */
142 16912 : for (j = 0; j < state->level; ++j) {
143 : devinfo_t* entry;
144 : unsigned s;
145 :
146 72480 : for (s = 0; s < state->parity[j].split_mac; ++s) {
147 57984 : entry = calloc_nofail(1, sizeof(devinfo_t));
148 :
149 57984 : entry->device = state->parity[j].split_map[s].device;
150 57984 : device_name_set(entry, lev_config_name(j), s);
151 57984 : pathcpy(entry->mount, sizeof(entry->mount), state->parity[j].split_map[s].path);
152 57984 : pathcpy(entry->smartctl, sizeof(entry->smartctl), state->parity[j].smartctl);
153 57984 : memcpy(entry->smartignore, state->parity[j].smartignore, sizeof(entry->smartignore));
154 57984 : pathcut(entry->mount); /* remove the parity file */
155 :
156 57984 : tommy_list_insert_tail(&high, &entry->node, entry);
157 : }
158 : }
159 :
160 : /* with a GUI always gives time reference */
161 2416 : if (state->opt.gui)
162 0 : log_tag("unixtime:%" PRIi64 "\n", (int64_t)now);
163 :
164 2416 : if (state->opt.fake_device) {
165 2416 : ret = devtest(&high, &low, DEVICE_SMART);
166 : } else {
167 0 : ret = devquery(&high, &low, DEVICE_SMART);
168 : }
169 :
170 : /* on error, just disable thermal gathering */
171 2416 : if (ret != 0)
172 0 : return;
173 :
174 : /* if the list is empty, it's not supported in this platform */
175 2416 : if (tommy_list_empty(&low))
176 0 : return;
177 :
178 : /* report new attribute */
179 2416 : state_attr(state, &low);
180 :
181 : /* if ambient temperature is not set, set it now with the lowest HD temperature */
182 2416 : if (state->thermal_ambient_temperature == 0) {
183 1 : state->thermal_ambient_temperature = ambient_temperature();
184 :
185 31 : for (i = tommy_list_head(&low); i != 0; i = i->next) {
186 30 : devinfo_t* devinfo = i->data;
187 :
188 30 : int temp = smart_temp(devinfo);
189 30 : if (temp < 0)
190 0 : continue;
191 :
192 30 : log_tag("thermal:system:candidate:%d\n", temp);
193 :
194 30 : if (state->thermal_ambient_temperature == 0 || state->thermal_ambient_temperature > temp)
195 1 : state->thermal_ambient_temperature = temp;
196 : }
197 :
198 1 : log_tag("thermal:system:final:%d\n", state->thermal_ambient_temperature);
199 : }
200 :
201 2416 : int highest_temperature = 0;
202 74896 : for (i = tommy_list_head(&low); i != 0; i = i->next) {
203 : tommy_node* t;
204 : struct snapraid_thermal* found;
205 72480 : devinfo_t* devinfo = i->data;
206 : unsigned k;
207 :
208 72480 : int temperature = smart_temp(devinfo);
209 72480 : if (temperature < 0)
210 0 : continue;
211 :
212 : /* search of the entry */
213 72480 : found = 0;
214 1123440 : for (t = tommy_list_head(&state->thermallist); t != 0; t = t->next) {
215 1123410 : struct snapraid_thermal* thermal = t->data;
216 1123410 : if (thermal->device == devinfo->device) {
217 72450 : found = thermal;
218 72450 : break;
219 : }
220 : }
221 :
222 : /* if not found, create it */
223 72480 : if (found == 0) {
224 30 : found = thermal_alloc(devinfo->device, devinfo->name);
225 30 : tommy_list_insert_tail(&state->thermallist, &found->node, found);
226 : }
227 :
228 72480 : found->latest_temperature = temperature;
229 :
230 72480 : if (highest_temperature < temperature)
231 2416 : highest_temperature = temperature;
232 :
233 72480 : log_tag("thermal:current:%s:%" PRIu64 ":%d\n", devinfo->name, devinfo->device, temperature);
234 :
235 72480 : if (state->thermal_stop_gathering)
236 0 : continue;
237 :
238 72480 : if (found->count + 1 >= THERMAL_MAX) /* keep one extra space at the end */
239 0 : continue;
240 :
241 : /* only monotone increasing temperature */
242 72480 : if (found->count > 0 && found->data[found->count - 1].temperature >= temperature)
243 72450 : continue;
244 :
245 : /* insert the new data point */
246 30 : found->data[found->count].temperature = temperature;
247 30 : found->data[found->count].time = now - state->thermal_first;
248 30 : ++found->count;
249 :
250 30 : if (state->opt.fake_device) {
251 : /* fill with fake data */
252 30 : found->data[0].time = 0;
253 30 : found->data[0].temperature = 27;
254 30 : found->data[1].time = 100;
255 30 : found->data[1].temperature = 28;
256 30 : found->data[2].time = 300;
257 30 : found->data[2].temperature = 29;
258 30 : found->data[3].time = 700;
259 30 : found->data[3].temperature = 30;
260 30 : found->data[4].time = 1500;
261 30 : found->data[4].temperature = 31;
262 30 : found->count = 5;
263 : }
264 :
265 : /* log the new data */
266 30 : log_tag("thermal:heat:%s:%" PRIu64 ":%u:", devinfo->name, devinfo->device, found->count);
267 180 : for (k = 0; k < found->count; ++k)
268 150 : log_tag("%s%d/%d", k > 0 ? "," : "", (int)found->data[k].temperature, (int)found->data[k].time);
269 30 : log_tag("\n");
270 :
271 : /* estimate parameters */
272 30 : found->params = fit_thermal_model(found->data, found->count, state->thermal_ambient_temperature);
273 :
274 30 : log_tag("thermal:params:%s:%" PRIu64 ":%g:%g:%g:%g:%g:%g\n", devinfo->name, devinfo->device,
275 : found->params.k_heat, found->params.t_ambient, found->params.t_steady,
276 : found->params.rmse, found->params.r_squared, found->params.max_error);
277 : }
278 :
279 : /* always update the highest temperature */
280 2416 : state->thermal_highest_temperature = highest_temperature;
281 :
282 2416 : log_tag("thermal:highest:%d\n", highest_temperature);
283 2416 : log_flush();
284 :
285 2416 : tommy_list_foreach(&high, free);
286 2416 : tommy_list_foreach(&low, free);
287 : }
288 :
289 1121242 : int state_thermal_alarm(struct snapraid_state* state)
290 : {
291 : /* if no limit, there is no thermal support */
292 1121242 : if (state->thermal_temperature_limit == 0)
293 1118826 : return 0;
294 :
295 2416 : if (state->thermal_highest_temperature <= state->thermal_temperature_limit)
296 2416 : return 0;
297 :
298 0 : return 1;
299 : }
300 :
301 0 : void state_thermal_cooldown(struct snapraid_state* state)
302 : {
303 0 : int sleep_time = state->thermal_cooldown_time;
304 :
305 0 : if (sleep_time == 0)
306 0 : sleep_time = 5 * 60; /* default sleep time */
307 0 : if (sleep_time < 3 * 60)
308 0 : sleep_time = 3 * 60; /* minimum sleep time */
309 :
310 : /* from now on, stop any further data gathering as the heating is interrupted */
311 0 : state->thermal_stop_gathering = 1;
312 :
313 0 : log_tag("thermal:spindown\n");
314 0 : state_device(state, DEVICE_DOWN, 0);
315 :
316 0 : msg_progress("Cooldown...\n");
317 :
318 0 : log_tag("thermal:cooldown:%d\n", sleep_time);
319 0 : printf("Waiting for %d minutes...\n", sleep_time / 60);
320 :
321 0 : log_flush();
322 :
323 : /* every 30 seconds spin down any disk that was spunup */
324 0 : while (sleep_time > 0) {
325 0 : state_device(state, DEVICE_DOWNIFUP, 0);
326 :
327 0 : sleep(30);
328 0 : sleep_time -= 30;
329 : }
330 :
331 0 : if (!global_interrupt) { /* don't wake-up if we are interrupting */
332 0 : log_tag("thermal:spinup\n");
333 :
334 : /* spinup */
335 0 : state_device(state, DEVICE_UP, 0);
336 :
337 : /* log new thermal info */
338 0 : state_thermal(state, 0);
339 : }
340 0 : }
341 :
342 254 : int state_thermal_begin(struct snapraid_state* state, time_t now)
343 : {
344 254 : if (state->thermal_temperature_limit == 0)
345 253 : return 1;
346 :
347 : /* initial thermal measure */
348 1 : state->thermal_first = now;
349 1 : state->thermal_latest = now;
350 1 : state_thermal(state, now);
351 :
352 1 : if (state->thermal_ambient_temperature != 0) {
353 1 : printf("System temperature is %u degrees\n", state->thermal_ambient_temperature);
354 :
355 1 : if (state->thermal_temperature_limit != 0 && state->thermal_temperature_limit <= state->thermal_ambient_temperature) {
356 : /* LCOV_EXCL_START */
357 : log_fatal(EENVIRONMENT, "DANGER! System temperature of %d degrees is higher than the temperature limit of %d degrees. Unable to proceed!\n", state->thermal_ambient_temperature, state->thermal_temperature_limit);
358 : log_flush();
359 : return 0;
360 : /* LCOV_EXCL_STOP */
361 : }
362 : }
363 :
364 1 : if (state_thermal_alarm(state)) {
365 : /* LCOV_EXCL_START */
366 : log_fatal(EENVIRONMENT, "DANGER! Hard disk temperature of %d degrees is already outside the operating range. Unable to proceed!\n", state->thermal_highest_temperature);
367 : log_flush();
368 : return 0;
369 : /* LCOV_EXCL_STOP */
370 : }
371 :
372 1 : return 1;
373 : }
374 :
|