Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : // Copyright (C) 2013 Andrea Mazzoleni
3 :
4 : #ifndef __RAID_CPU_H
5 : #define __RAID_CPU_H
6 :
7 : #ifdef CONFIG_X86
8 :
/**
 * Execute the CPUID instruction and return the four result registers.
 *
 * @param func_eax Value loaded into EAX before CPUID (the leaf number).
 * @param sub_ecx Value loaded into ECX before CPUID (the sub-leaf number).
 * @param reg Output array of at least four elements; on return it holds
 *   EAX in reg[0], EBX in reg[1], ECX in reg[2] and EDX in reg[3].
 */
static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
{
	asm volatile (
#if defined(__i386__) && defined(__PIC__)
		/*
		 * Allow compilation in 32-bit PIC mode, where EBX cannot be
		 * named as an output: swap EBX with a compiler-chosen register
		 * (%1) around CPUID, so EBX is restored and %1 ends up holding
		 * the EBX result.
		 */
		"xchgl %%ebx, %1\n"
		"cpuid\n"
		"xchgl %%ebx, %1\n"
		: "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
		: "0" (func_eax), "2" (sub_ecx)
#else
		"cpuid\n"
		: "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
		/* "0" and "2" tie the inputs to the EAX and ECX output operands */
		: "0" (func_eax), "2" (sub_ecx)
#endif
	);
}
26 :
/**
 * Execute XGETBV with ECX=0 to read the Extended Control Register.
 *
 * @param reg Output array indexed like the one filled by raid_cpuid():
 *   reg[0] receives EAX and reg[3] receives EDX.
 *   reg[1] and reg[2] are not written.
 *
 * Callers in this file invoke it only after checking the CPUID leaf 1
 * ECX mask that includes the OSXSAVE bit.
 */
static inline void raid_xgetbv(uint32_t* reg)
{
	/* get the value of the Extended Control Register ecx=0 */
	asm volatile (
		/*
		 * Uses a direct encoding of the XGETBV instruction as only recent
		 * assemblers support it.
		 * The next line is equivalent to: "xgetbv\n"
		 */
		".byte 0x0f, 0x01, 0xd0\n"
		: "=a" (reg[0]), "=d" (reg[3])
		: "c" (0)
	);
}
41 :
/*
 * Size in bytes of a vendor buffer passed to raid_cpu_info():
 * 12 characters of CPUID vendor string plus the terminating NUL.
 */
#define CPU_VENDOR_MAX 13
43 :
/**
 * Read the CPU vendor string and the family/model identifiers.
 *
 * @param vendor Output buffer of at least CPU_VENDOR_MAX bytes. It receives
 *   the 12 character vendor string from CPUID leaf 0 followed by a NUL.
 * @param family Output: the CPU family computed from CPUID leaf 1 EAX.
 * @param model Output: the CPU model computed from CPUID leaf 1 EAX.
 */
static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
{
	uint32_t reg[4];
	unsigned f, ef, m, em;

	raid_cpuid(0, 0, reg);

	/*
	 * The vendor string is stored in EBX, EDX, ECX order.
	 *
	 * Copy it with memcpy() instead of storing through a uint32_t pointer:
	 * the caller's buffer is a plain char array, so a casted store would
	 * violate strict aliasing and could be misaligned.
	 */
	memcpy(vendor, &reg[1], sizeof(reg[1]));
	memcpy(vendor + 4, &reg[3], sizeof(reg[3]));
	memcpy(vendor + 8, &reg[2], sizeof(reg[2]));
	vendor[12] = 0;

	raid_cpuid(1, 0, reg);

	f = (reg[0] >> 8) & 0xF; /* base family */
	ef = (reg[0] >> 20) & 0xFF; /* extended family */
	m = (reg[0] >> 4) & 0xF; /* base model */
	em = (reg[0] >> 16) & 0xF; /* extended model */

	if (strcmp(vendor, "AuthenticAMD") == 0 && f < 15) {
		/* AMD uses the extended fields only when the base family is 15 */
		*family = f;
		*model = m;
	} else {
		*family = f + ef;
		*model = m + (em << 4);
	}
}
76 :
/**
 * Check that all the requested CPUID leaf 1 feature bits are set.
 *
 * @param cpuid_1_ecx Bits that must all be set in CPUID leaf 1 ECX.
 * @param cpuid_1_edx Bits that must all be set in CPUID leaf 1 EDX.
 * @return 1 if every requested bit is present, 0 otherwise.
 */
static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
{
	uint32_t leaf1[4];

	raid_cpuid(1, 0, leaf1);

	/* a mask matches when no requested bit is missing from the register */
	return (~leaf1[2] & cpuid_1_ecx) == 0
		&& (~leaf1[3] & cpuid_1_edx) == 0;
}
89 :
/**
 * Check CPUID leaf 1, XCR0 and CPUID leaf 7 feature bits in sequence.
 *
 * @param cpuid_1_ecx Bits that must all be set in CPUID leaf 1 ECX.
 * @param cpuid_7_ebx Bits that must all be set in CPUID leaf 7 (sub-leaf 0) EBX.
 * @param cpuid_7_ecx Bits that must all be set in CPUID leaf 7 (sub-leaf 0) ECX.
 * @param xcr0 Bits that must all be set in the EAX value returned by
 *   raid_xgetbv().
 * @return 1 if every requested bit is present, 0 otherwise.
 */
static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t cpuid_7_ecx, uint32_t xcr0)
{
	uint32_t out[4];

	/*
	 * Leaf 1 goes first: XGETBV is executed only if the leaf 1 mask
	 * (which the callers use to require OSXSAVE) is satisfied.
	 */
	raid_cpuid(1, 0, out);
	if ((~out[2] & cpuid_1_ecx) != 0)
		return 0;

	/* state components enabled by the OS */
	raid_xgetbv(out);
	if ((~out[0] & xcr0) != 0)
		return 0;

	/* structured extended features */
	raid_cpuid(7, 0, out);

	return (~out[1] & cpuid_7_ebx) == 0
		&& (~out[2] & cpuid_7_ecx) == 0;
}
111 :
/**
 * Check if the processor supports SSE2.
 *
 * Follows "11.6.2 Checking for SSE/SSE2 Support" of the Intel 64 and IA-32
 * Architectures Software Developer's Manual (325462-048US, September 2013):
 * SSE2 is available if CPUID.01H:EDX.SSE2[bit 26] = 1.
 *
 * @return 1 if SSE2 is supported, 0 otherwise.
 */
static inline int raid_cpu_has_sse2(void)
{
	const uint32_t ecx_required = 0;
	const uint32_t edx_required = (uint32_t)1 << 26; /* SSE2 */

	return raid_cpu_match_sse(ecx_required, edx_required);
}
130 :
/**
 * Check if the processor supports SSSE3 (and SSE2).
 *
 * Follows "12.7.2 Checking for SSSE3 Support" of the Intel 64 and IA-32
 * Architectures Software Developer's Manual (325462-048US, September 2013):
 * after the SSE/SSE2 check of section 11.6.2, SSSE3 is available if
 * CPUID.01H:ECX.SSSE3[bit 9] = 1.
 *
 * @return 1 if both SSSE3 and SSE2 are supported, 0 otherwise.
 */
static inline int raid_cpu_has_ssse3(void)
{
	const uint32_t ecx_required = (uint32_t)1 << 9; /* SSSE3 */
	const uint32_t edx_required = (uint32_t)1 << 26; /* SSE2 */

	return raid_cpu_match_sse(ecx_required, edx_required);
}
147 :
/**
 * Check if the processor supports the CRC32 instruction.
 *
 * Follows "12.12.3 Checking for SSE4.2 Support" of the Intel 64 and IA-32
 * Architectures Software Developer's Manual (325462-048US, September 2013):
 * the CRC32 instruction requires CPUID.01H:ECX.SSE4_2[bit 20] = 1.
 *
 * @return 1 if CRC32 is supported, 0 otherwise.
 */
static inline int raid_cpu_has_crc32(void)
{
	const uint32_t ecx_required = (uint32_t)1 << 20; /* SSE4.2, provides CRC32 */
	const uint32_t edx_required = 0;

	return raid_cpu_match_sse(ecx_required, edx_required);
}
163 :
/**
 * Check if the processor and the OS support AVX2.
 *
 * Follows the Intel Architecture Instruction Set Extensions Programming
 * Reference (319433-022, October 2014).
 *
 * 14.3 Detection of AVX instructions:
 * 1) CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
 * 2) XGETBV reports XCR0[2:1] = '11b' (XMM and YMM state enabled by the OS)
 * 3) CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported)
 *
 * 14.7.1 Detection of AVX2:
 * after AVX is confirmed, CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5] = 1.
 *
 * @return 1 if AVX2 can be used, 0 otherwise.
 */
static inline int raid_cpu_has_avx2(void)
{
	/* Leaf 1, ECX: OSXSAVE and AVX */
	const uint32_t leaf1_ecx = ((uint32_t)1 << 27) | ((uint32_t)1 << 28);
	/* Leaf 7, EBX: AVX2 */
	const uint32_t leaf7_ebx = (uint32_t)1 << 5;
	/* Leaf 7, ECX: nothing required */
	const uint32_t leaf7_ecx = 0;
	/* XCR0: OS saves XMM and YMM registers */
	const uint32_t xcr0_bits = (uint32_t)3 << 1;

	return raid_cpu_match_avx(leaf1_ecx, leaf7_ebx, leaf7_ecx, xcr0_bits);
}
187 :
/**
 * Check if the processor and the OS support AVX2 together with GFNI.
 *
 * The detection steps are:
 * 1) CPUID.1:ECX bits 27 (OSXSAVE) and 28 (AVX) are set
 * 2) XCR0[2:1] = '11b' (XMM and YMM state enabled)
 * 3) CPUID.7.0:EBX[5] is set (AVX2)
 * 4) CPUID.7.0:ECX[8] is set (GFNI)
 *
 * @return 1 if AVX2 and GFNI can be used, 0 otherwise.
 */
static inline int raid_cpu_has_avx2gfni(void)
{
	/* Leaf 1, ECX: OSXSAVE and AVX */
	const uint32_t leaf1_ecx = ((uint32_t)1 << 27) | ((uint32_t)1 << 28);
	/* Leaf 7, EBX: AVX2 */
	const uint32_t leaf7_ebx = (uint32_t)1 << 5;
	/* Leaf 7, ECX: GFNI */
	const uint32_t leaf7_ecx = (uint32_t)1 << 8;
	/* XCR0: OS saves XMM and YMM registers */
	const uint32_t xcr0_bits = (uint32_t)3 << 1;

	return raid_cpu_match_avx(leaf1_ecx, leaf7_ebx, leaf7_ecx, xcr0_bits);
}
203 :
/**
 * Check if the processor and the OS support AVX512F and AVX512BW.
 *
 * Follows "2.2 Detection of 512-bit Instruction Groups of Intel AVX-512
 * Family" of the Intel Architecture Instruction Set Extensions Programming
 * Reference (319433-022, October 2014):
 * 1) CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
 * 2) XGETBV reports XCR0[7:5] = '111b' (OPMASK state, upper 256-bit of
 *    ZMM0-ZMM15 and ZMM16-ZMM31 state enabled by the OS) and
 *    XCR0[2:1] = '11b' (XMM and YMM state enabled by the OS)
 * 3) CPUID.0x7.0:EBX.AVX512F[bit 16] = 1 and
 *    CPUID.0x7.0:EBX.AVX512BW[bit 30] = 1
 *
 * AVX and AVX2 are intentionally not checked, because the documentation
 * doesn't require it.
 *
 * @return 1 if AVX512F and AVX512BW can be used, 0 otherwise.
 */
static inline int raid_cpu_has_avx512bw(void)
{
	/* Leaf 1, ECX: OSXSAVE (XGETBV available) */
	const uint32_t leaf1_ecx = (uint32_t)1 << 27;
	/* Leaf 7, EBX: AVX512F and AVX512BW */
	const uint32_t leaf7_ebx = ((uint32_t)1 << 16) | ((uint32_t)1 << 30);
	/* Leaf 7, ECX: nothing required */
	const uint32_t leaf7_ecx = 0;
	/* XCR0: OS saves XMM, YMM and ZMM registers */
	const uint32_t xcr0_bits = ((uint32_t)3 << 1) | ((uint32_t)7 << 5);

	return raid_cpu_match_avx(leaf1_ecx, leaf7_ebx, leaf7_ecx, xcr0_bits);
}
228 :
/**
 * Check if the processor and the OS support AVX512F together with GFNI.
 *
 * The checks are:
 * 1) CPUID.1:ECX[27] is set (OSXSAVE, XGETBV available)
 * 2) XCR0[2:1] = '11b' and XCR0[7:5] = '111b' (XMM, YMM and ZMM state
 *    saved by the OS)
 * 3) CPUID.7.0:EBX[16] is set (AVX512F)
 * 4) CPUID.7.0:ECX[8] is set (GFNI)
 *
 * @return 1 if AVX512F and GFNI can be used, 0 otherwise.
 */
static inline int raid_cpu_has_avx512gfni(void)
{
	/* Leaf 1, ECX: OSXSAVE (XGETBV available) */
	const uint32_t leaf1_ecx = (uint32_t)1 << 27;
	/* Leaf 7, EBX: AVX512F (Foundation) */
	const uint32_t leaf7_ebx = (uint32_t)1 << 16;
	/* Leaf 7, ECX: GFNI */
	const uint32_t leaf7_ecx = (uint32_t)1 << 8;
	/* XCR0: OS saves XMM, YMM and ZMM registers */
	const uint32_t xcr0_bits = ((uint32_t)3 << 1) | ((uint32_t)7 << 5);

	return raid_cpu_match_avx(leaf1_ecx, leaf7_ebx, leaf7_ecx, xcr0_bits);
}
238 :
/**
 * Tell if a family/model pair identifies one of the known Intel Atom CPUs.
 *
 * @param family CPU family as reported by raid_cpu_info().
 * @param model CPU model as reported by raid_cpu_info().
 * @return 1 if the family is 6 and the model is in the Atom list, 0 otherwise.
 */
static inline int raid_cpu_is_atom(unsigned family, unsigned model)
{
	/* all the listed Atom models belong to family 6 */
	if (family != 6)
		return 0;

	/*
	 * Model list taken from:
	 *
	 * x86 Architecture CPUID
	 * http://www.sandpile.org/x86/cpuid.htm
	 *
	 * The hexadecimal model is reported next to each case.
	 * The Atom ?C (14 nm) (DVN) has an unknown model and is not listed.
	 */
	switch (model) {
	case 28: /* 1C: Atom (45 nm) with 512 KB on-die L2 */
	case 38: /* 26: Atom (45 nm) with 512 KB on-die L2 */
	case 54: /* 36: Atom (32 nm) with 512 KB on-die L2 */
	case 39: /* 27: Atom (32 nm) with 512 KB on-die L2 */
	case 53: /* 35: Atom (?? nm) with ??? KB on-die L2 */
	case 74: /* 4A: Atom 2C (22 nm) 1 MB L2 + PowerVR (TGR) */
	case 90: /* 5A: Atom 4C (22 nm) 2 MB L2 + PowerVR (ANN) */
	case 55: /* 37: Atom 4C (22 nm) 2 MB L2 + Intel Gen7 (BYT) */
	case 76: /* 4C: Atom 4C (14 nm) 2 MB L2 + Intel Gen8 (BSW) */
	case 93: /* 5D: Atom 4C (28 nm TSMC) 1 MB L2 + Mali (SoFIA) */
	case 77: /* 4D: Atom 8C (22 nm) 4 MB L2 (AVN) */
		return 1;
	default:
		return 0;
	}
}
270 :
271 : /**
272 : * Check if the processor has a slow MULT implementation.
273 : * If yes, it's better to use a hash not based on multiplication.
274 : */
275 5 : static inline int raid_cpu_has_slowmult(void)
276 : {
277 : char vendor[CPU_VENDOR_MAX];
278 : unsigned family;
279 : unsigned model;
280 :
281 : /*
282 : * In some cases Murmur3 based on MUL instruction,
283 : * is a LOT slower than Spooky2 based on SHIFTs.
284 : */
285 5 : raid_cpu_info(vendor, &family, &model);
286 :
287 5 : if (strcmp(vendor, "GenuineIntel") == 0) {
288 : /*
289 : * Intel Atom (Model 28)
290 : * murmur3:378 MB/s, spooky2:3413 MB/s (x86)
291 : *
292 : * Intel Atom (Model 77)
293 : * murmur3:1311 MB/s, spooky2:4056 MB/s (x64)
294 : */
295 5 : if (raid_cpu_is_atom(family, model))
296 0 : return 1;
297 : }
298 :
299 5 : return 0;
300 : }
301 :
302 : /**
303 : * Check if the processor has a slow extended set of SSE registers.
304 : * If yes, it's better to limit the unrolling to the first 8 registers.
305 : */
306 648 : static inline int raid_cpu_has_slowextendedreg(void)
307 : {
308 : char vendor[CPU_VENDOR_MAX];
309 : unsigned family;
310 : unsigned model;
311 :
312 : /*
313 : * In some cases the PAR2 implementation using 16 SSE registers
314 : * is a LITTLE slower than the one using only the first 8 registers.
315 : * This doesn't happen for PARZ.
316 : */
317 648 : raid_cpu_info(vendor, &family, &model);
318 :
319 648 : if (strcmp(vendor, "AuthenticAMD") == 0) {
320 : /*
321 : * AMD Bulldozer
322 : * par2_sse2:4922 MB/s, par2_sse2e:4465 MB/s
323 : */
324 0 : if (family == 21)
325 0 : return 1;
326 : }
327 :
328 648 : if (strcmp(vendor, "GenuineIntel") == 0) {
329 : /*
330 : * Intel Atom (Model 77)
331 : * par2_sse2:5686 MB/s, par2_sse2e:5250 MB/s
332 : * parz_sse2:3100 MB/s, parz_sse2e:3400 MB/s
333 : * par3_sse3:1921 MB/s, par3_sse3e:1813 MB/s
334 : * par4_sse3:1175 MB/s, par4_sse3e:1113 MB/s
335 : * par5_sse3:876 MB/s, par5_sse3e:675 MB/s
336 : * par6_sse3:705 MB/s, par6_sse3e:529 MB/s
337 : *
338 : * Intel Atom (Model 77) "Avoton C2750"
339 : * par2_sse2:5661 MB/s, par2_sse2e:5382 MB/s
340 : * parz_sse2:3110 MB/s, parz_sse2e:3450 MB/s
341 : * par3_sse3:1769 MB/s, par3_sse3e:1856 MB/s
342 : * par4_sse3:1221 MB/s, par4_sse3e:1141 MB/s
343 : * par5_sse3:910 MB/s, par5_sse3e:675 MB/s
344 : * par6_sse3:720 MB/s, par6_sse3e:534 MB/s
345 : */
346 648 : if (raid_cpu_is_atom(family, model))
347 0 : return 1;
348 : }
349 :
350 648 : return 0;
351 : }
352 : #endif
353 :
354 : #endif
355 :
|