Line data Source code
1 : /*
2 : * Copyright (C) 2013 Andrea Mazzoleni
3 : *
4 : * This program is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : */
14 :
15 : #ifndef __RAID_CPU_H
16 : #define __RAID_CPU_H
17 :
18 : #ifdef CONFIG_X86
19 :
/**
 * Execute the CPUID instruction.
 *
 * Uses the __asm__/__volatile__ spellings so that the header also compiles
 * when the including file is built in a strict ISO mode (-std=c99/-std=c11),
 * where the plain "asm" keyword is not available.
 *
 * @param func_eax Value loaded in EAX before CPUID (the leaf).
 * @param sub_ecx Value loaded in ECX before CPUID (the sub-leaf).
 * @param reg Array of at least 4 elements. On return it contains
 *   reg[0]=EAX, reg[1]=EBX, reg[2]=ECX, reg[3]=EDX.
 */
static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
{
	__asm__ __volatile__ (
#if defined(__i386__) && defined(__PIC__)
		/*
		 * In 32 bit PIC mode EBX cannot be listed as an output,
		 * so swap it with a scratch register around CPUID.
		 * The second xchgl restores EBX and leaves the CPUID result in %1.
		 */
		"xchgl %%ebx, %1\n"
		"cpuid\n"
		"xchgl %%ebx, %1\n"
		: "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
		: "0" (func_eax), "2" (sub_ecx)
#else
		"cpuid\n"
		: "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
		: "0" (func_eax), "2" (sub_ecx)
#endif
	);
}
37 :
/**
 * Execute the XGETBV instruction with ECX=0 (Extended Control Register 0).
 *
 * Only two slots of the output array are written: reg[0] receives EAX
 * and reg[3] receives EDX. reg[1] and reg[2] are left untouched.
 *
 * Uses the __asm__/__volatile__ spellings so that the header also compiles
 * when the including file is built in a strict ISO mode (-std=c99/-std=c11).
 *
 * NOTE(review): the caller is expected to have verified that XGETBV is
 * usable (CPUID.1:ECX.OSXSAVE) before calling this; the only caller in
 * this file does so.
 *
 * @param reg Array of at least 4 elements, using the same layout
 *   of raid_cpuid().
 */
static inline void raid_xgetbv(uint32_t *reg)
{
	__asm__ __volatile__ (
		/*
		 * Use a direct encoding of the XGETBV instruction
		 * as only recent assemblers support the mnemonic.
		 * The next line is equivalent to: "xgetbv\n"
		 */
		".byte 0x0f, 0x01, 0xd0\n"
		: "=a" (reg[0]), "=d" (reg[3])
		: "c" (0)
	);
}
50 :
51 : #define CPU_VENDOR_MAX 13
52 :
/**
 * Get the CPU vendor string and the family/model numbers.
 *
 * @param vendor Destination buffer of at least CPU_VENDOR_MAX (13) bytes.
 *   It receives the 12 vendor characters followed by a 0 terminator.
 * @param family Receives the family computed from CPUID leaf 1.
 * @param model Receives the model computed from CPUID leaf 1.
 */
static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
{
	uint32_t reg[4];
	unsigned f, ef, m, em;

	raid_cpuid(0, 0, reg);

	/*
	 * The vendor string is stored in the EBX, EDX, ECX order.
	 *
	 * Copy it with memcpy() instead of storing through a (uint32_t*) cast
	 * of the char buffer, to avoid violating the strict aliasing rule
	 * and to not depend on the alignment of the caller's buffer.
	 */
	memcpy(vendor, &reg[1], sizeof(reg[1]));
	memcpy(vendor + 4, &reg[3], sizeof(reg[3]));
	memcpy(vendor + 8, &reg[2], sizeof(reg[2]));
	vendor[12] = 0;

	raid_cpuid(1, 0, reg);

	/* split EAX of leaf 1 in the base/extended family and model fields */
	f = (reg[0] >> 8) & 0xF;
	ef = (reg[0] >> 20) & 0xFF;
	m = (reg[0] >> 4) & 0xF;
	em = (reg[0] >> 16) & 0xF;

	if (strcmp(vendor, "AuthenticAMD") == 0) {
		/* for AMD the extended fields are used only if the base family is 15 */
		if (f < 15) {
			*family = f;
			*model = m;
		} else {
			*family = f + ef;
			*model = m + (em << 4);
		}
	} else {
		/* for any other vendor always combine the extended fields */
		*family = f + ef;
		*model = m + (em << 4);
	}
}
85 :
/**
 * Check if all the requested feature bits of CPUID leaf 1 are set.
 *
 * @param cpuid_1_ecx Mask of bits required in ECX of CPUID leaf 1.
 * @param cpuid_1_edx Mask of bits required in EDX of CPUID leaf 1.
 * @return 1 if every bit of both masks is reported by the CPU, 0 otherwise.
 */
static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
{
	uint32_t leaf1[4];
	int ecx_ok;
	int edx_ok;

	raid_cpuid(1, 0, leaf1);

	/* a mask matches only if none of its bits is missing */
	ecx_ok = (leaf1[2] & cpuid_1_ecx) == cpuid_1_ecx;
	edx_ok = (leaf1[3] & cpuid_1_edx) == cpuid_1_edx;

	return ecx_ok && edx_ok;
}
98 :
/**
 * Check if the requested AVX-class features are available.
 *
 * The checks are done in a fixed order, stopping at the first failure:
 * CPUID leaf 1 ECX, then XGETBV, then CPUID leaf 7 (sub-leaf 0) EBX.
 * XGETBV is executed only after the leaf 1 check has passed; the callers
 * in this file always include OSXSAVE (bit 27) in cpuid_1_ecx.
 *
 * @param cpuid_1_ecx Mask of bits required in ECX of CPUID leaf 1.
 * @param cpuid_7_ebx Mask of bits required in EBX of CPUID leaf 7.
 * @param xcr0 Mask of bits required in the EAX value returned by XGETBV.
 * @return 1 if all the three masks are satisfied, 0 otherwise.
 */
static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t xcr0)
{
	uint32_t out[4];
	int ok = 0;

	raid_cpuid(1, 0, out);
	if ((out[2] & cpuid_1_ecx) == cpuid_1_ecx) {
		raid_xgetbv(out);
		if ((out[0] & xcr0) == xcr0) {
			raid_cpuid(7, 0, out);
			ok = (out[1] & cpuid_7_ebx) == cpuid_7_ebx;
		}
	}

	return ok;
}
117 :
/**
 * Check if the processor supports SSE2.
 *
 * Reference:
 * Intel® 64 and IA-32 Architectures Software Developer's Manual
 * 325462-048US September 2013
 * 11.6.2 Checking for SSE/SSE2 Support
 *
 * The manual asks to:
 * 1. Check that the processor supports the CPUID instruction
 *    (bit 21 of the EFLAGS register).
 * 2. Check that CPUID.01H:EDX.SSE[bit 25] = 1 and/or
 *    CPUID.01H:EDX.SSE2[bit 26] = 1.
 *
 * Here only the SSE2 bit of step 2 is tested.
 */
static inline int raid_cpu_has_sse2(void)
{
	const uint32_t need_ecx = 0; /* nothing required in ECX */
	const uint32_t need_edx = (uint32_t)1 << 26; /* SSE2 */

	return raid_cpu_match_sse(need_ecx, need_edx);
}
136 :
/**
 * Check if the processor supports SSSE3 (and SSE2).
 *
 * Reference:
 * Intel® 64 and IA-32 Architectures Software Developer's Manual
 * 325462-048US September 2013
 * 12.7.2 Checking for SSSE3 Support
 *
 * The manual asks to follow the steps of Section 11.6.2,
 * "Checking for SSE/SSE2 Support", and then to check that
 * CPUID.01H:ECX.SSSE3[bit 9] = 1.
 */
static inline int raid_cpu_has_ssse3(void)
{
	const uint32_t need_ecx = (uint32_t)1 << 9; /* SSSE3 */
	const uint32_t need_edx = (uint32_t)1 << 26; /* SSE2 */

	return raid_cpu_match_sse(need_ecx, need_edx);
}
153 :
/**
 * Check if the processor supports the CRC32 instruction.
 *
 * Reference:
 * Intel® 64 and IA-32 Architectures Software Developer's Manual
 * 325462-048US September 2013
 * 12.12.3 Checking for SSE4.2 Support
 *
 * Before using the CRC32 instruction, an application must check that
 * the processor supports SSE4.2 (CPUID.01H:ECX.SSE4_2[bit 20] = 1).
 */
static inline int raid_cpu_has_crc32(void)
{
	const uint32_t need_ecx = (uint32_t)1 << 20; /* CRC32 */
	const uint32_t need_edx = 0; /* nothing required in EDX */

	return raid_cpu_match_sse(need_ecx, need_edx);
}
169 :
/**
 * Check if the processor and the OS support AVX2.
 *
 * Reference:
 * Intel Architecture Instruction Set Extensions Programming Reference
 * 319433-022 October 2014
 *
 * 14.3 Detection of AVX instructions
 * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use).
 * 2) Issue XGETBV and verify that XCR0[2:1] = `11b'
 *    (XMM state and YMM state are enabled by OS).
 * 3) Detect CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported).
 *    (Step 3 can be done in any order relative to 1 and 2)
 *
 * 14.7.1 Detection of AVX2
 * Hardware support for AVX2 is indicated by
 * CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]=1.
 * Application software must identify that hardware supports AVX, and after
 * that it must also detect support for AVX2 by checking that same bit.
 */
static inline int raid_cpu_has_avx2(void)
{
	const uint32_t need_1_ecx = ((uint32_t)1 << 27) | ((uint32_t)1 << 28); /* OSXSAVE and AVX */
	const uint32_t need_7_ebx = (uint32_t)1 << 5; /* AVX2 */
	const uint32_t need_xcr0 = (uint32_t)3 << 1; /* OS saves XMM and YMM registers */

	return raid_cpu_match_avx(need_1_ecx, need_7_ebx, need_xcr0);
}
192 :
/**
 * Check if the processor and the OS support AVX512F and AVX512BW.
 *
 * Reference:
 * Intel Architecture Instruction Set Extensions Programming Reference
 * 319433-022 October 2014
 *
 * 2.2 Detection of 512-bit Instruction Groups of Intel AVX-512 Family
 * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use).
 * 2) Execute XGETBV and verify that XCR0[7:5] = `111b' (OPMASK state,
 *    upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled by OS)
 *    and that XCR0[2:1] = `11b' (XMM state and YMM state are enabled by OS).
 * 3) Verify both CPUID.0x7.0:EBX.AVX512F[bit 16] = 1 and
 *    CPUID.0x7.0:EBX.AVX512BW[bit 30] = 1.
 *
 * Note that AVX and AVX2 are intentionally not checked,
 * because the documentation doesn't require that.
 */
static inline int raid_cpu_has_avx512bw(void)
{
	const uint32_t need_1_ecx = (uint32_t)1 << 27; /* XSAVE/XGETBV */
	const uint32_t need_7_ebx = ((uint32_t)1 << 16) | ((uint32_t)1 << 30); /* AVX512F and AVX512BW */
	const uint32_t need_xcr0 = ((uint32_t)3 << 1) | ((uint32_t)7 << 5); /* OS saves XMM, YMM and ZMM registers */

	return raid_cpu_match_avx(need_1_ecx, need_7_ebx, need_xcr0);
}
214 :
/**
 * Check if the family/model pair identifies an Intel Atom CPU.
 *
 * Only family 6 is considered. The list of models is taken from:
 * x86 Architecture CPUID
 * http://www.sandpile.org/x86/cpuid.htm
 *
 * @param family CPU family, as reported by raid_cpu_info().
 * @param model CPU model, as reported by raid_cpu_info().
 * @return 1 if the model is one of the known Atom models, 0 otherwise.
 */
static inline int raid_cpu_is_atom(unsigned family, unsigned model)
{
	if (family != 6)
		return 0;

	switch (model) {
	case 28: /* 1C Atom (45 nm) with 512 KB on-die L2 */
	case 38: /* 26 Atom (45 nm) with 512 KB on-die L2 */
	case 54: /* 36 Atom (32 nm) with 512 KB on-die L2 */
	case 39: /* 27 Atom (32 nm) with 512 KB on-die L2 */
	case 53: /* 35 Atom (?? nm) with ??? KB on-die L2 */
	case 74: /* 4A Atom 2C (22 nm) 1 MB L2 + PowerVR (TGR) */
	case 90: /* 5A Atom 4C (22 nm) 2 MB L2 + PowerVR (ANN) */
	case 55: /* 37 Atom 4C (22 nm) 2 MB L2 + Intel Gen7 (BYT) */
	case 76: /* 4C Atom 4C (14 nm) 2 MB L2 + Intel Gen8 (BSW) */
	case 93: /* 5D Atom 4C (28 nm TSMC) 1 MB L2 + Mali (SoFIA) */
	case 77: /* 4D Atom 8C (22 nm) 4 MB L2 (AVN) */
		return 1;
	default:
		/* the model of Atom ?C (14 nm) ? MB L2 (DVN) is unknown and not listed */
		return 0;
	}
}
246 :
247 : /**
248 : * Check if the processor has a slow MULT implementation.
249 : * If yes, it's better to use a hash not based on multiplication.
250 : */
251 4 : static inline int raid_cpu_has_slowmult(void)
252 : {
253 : char vendor[CPU_VENDOR_MAX];
254 : unsigned family;
255 : unsigned model;
256 :
257 : /*
258 : * In some cases Murmur3 based on MUL instruction,
259 : * is a LOT slower than Spooky2 based on SHIFTs.
260 : */
261 4 : raid_cpu_info(vendor, &family, &model);
262 :
263 4 : if (strcmp(vendor, "GenuineIntel") == 0) {
264 : /*
265 : * Intel Atom (Model 28)
266 : * murmur3:378 MB/s, spooky2:3413 MB/s (x86)
267 : *
268 : * Intel Atom (Model 77)
269 : * murmur3:1311 MB/s, spooky2:4056 MB/s (x64)
270 : */
271 4 : if (raid_cpu_is_atom(family, model))
272 0 : return 1;
273 : }
274 :
275 4 : return 0;
276 : }
277 :
278 : /**
279 : * Check if the processor has a slow extended set of SSE registers.
280 : * If yes, it's better to limit the unroll to the firsrt 8 registers.
281 : */
282 545 : static inline int raid_cpu_has_slowextendedreg(void)
283 : {
284 : char vendor[CPU_VENDOR_MAX];
285 : unsigned family;
286 : unsigned model;
287 :
288 : /*
289 : * In some cases the PAR2 implementation using 16 SSE registers
290 : * is a LITTLE slower than the one using only the first 8 registers.
291 : * This doesn't happen for PARZ.
292 : */
293 545 : raid_cpu_info(vendor, &family, &model);
294 :
295 545 : if (strcmp(vendor, "AuthenticAMD") == 0) {
296 : /*
297 : * AMD Bulldozer
298 : * par2_sse2:4922 MB/s, par2_sse2e:4465 MB/s
299 : */
300 0 : if (family == 21)
301 0 : return 1;
302 : }
303 :
304 545 : if (strcmp(vendor, "GenuineIntel") == 0) {
305 : /*
306 : * Intel Atom (Model 77)
307 : * par2_sse2:5686 MB/s, par2_sse2e:5250 MB/s
308 : * parz_sse2:3100 MB/s, parz_sse2e:3400 MB/s
309 : * par3_sse3:1921 MB/s, par3_sse3e:1813 MB/s
310 : * par4_sse3:1175 MB/s, par4_sse3e:1113 MB/s
311 : * par5_sse3:876 MB/s, par5_sse3e:675 MB/s
312 : * par6_sse3:705 MB/s, par6_sse3e:529 MB/s
313 : *
314 : * Intel Atom (Model 77) "Avoton C2750"
315 : * par2_sse2:5661 MB/s, par2_sse2e:5382 MB/s
316 : * parz_sse2:3110 MB/s, parz_sse2e:3450 MB/s
317 : * par3_sse3:1769 MB/s, par3_sse3e:1856 MB/s
318 : * par4_sse3:1221 MB/s, par4_sse3e:1141 MB/s
319 : * par5_sse3:910 MB/s, par5_sse3e:675 MB/s
320 : * par6_sse3:720 MB/s, par6_sse3e:534 MB/s
321 : */
322 545 : if (raid_cpu_is_atom(family, model))
323 0 : return 1;
324 : }
325 :
326 545 : return 0;
327 : }
328 : #endif
329 :
330 : #endif
331 :
|