LCOV - code coverage report
Current view: top level - raid - cpu.h (source / functions) Hit Total Coverage
Test: lcov.info Lines: 64 76 84.2 %
Date: 2017-11-06 22:14:04 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2013 Andrea Mazzoleni
       3             :  *
       4             :  * This program is free software: you can redistribute it and/or modify
       5             :  * it under the terms of the GNU General Public License as published by
       6             :  * the Free Software Foundation, either version 2 of the License, or
       7             :  * (at your option) any later version.
       8             :  *
       9             :  * This program is distributed in the hope that it will be useful,
      10             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  * GNU General Public License for more details.
      13             :  */
      14             : 
      15             : #ifndef __RAID_CPU_H
      16             : #define __RAID_CPU_H
      17             : 
      18             : #ifdef CONFIG_X86
      19             : 
      20        2672 : static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
      21             : {
      22        2672 :         asm volatile (
      23             : #if defined(__i386__) && defined(__PIC__)
      24             :                 /* allow compilation in PIC mode saving ebx */
      25             :                 "xchgl %%ebx, %1\n"
      26             :                 "cpuid\n"
      27             :                 "xchgl %%ebx, %1\n"
      28             :                 : "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
      29             :                 : "0" (func_eax), "2" (sub_ecx)
      30             : #else
      31             :                 "cpuid\n"
      32        2672 :                 : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
      33             :                 : "0" (func_eax), "2" (sub_ecx)
      34             : #endif
      35             :         );
      36        2672 : }
      37             : 
      38          31 : static inline void raid_xgetbv(uint32_t* reg)
      39             : {
      40             :         /* get the value of the Extended Control Register ecx=0 */
      41          31 :         asm volatile (
      42             :                 /* uses a direct encoding of the XGETBV instruction as only recent */
      43             :                 /* assemblers support it. */
      44             :                 /* the next line is equivalent to: "xgetbv\n" */
      45             :                 ".byte 0x0f, 0x01, 0xd0\n"
      46          31 :                 : "=a" (reg[0]), "=d" (reg[3])
      47             :                 : "c" (0)
      48             :         );
      49          31 : }
      50             : 
      51             : #define CPU_VENDOR_MAX 13
      52             : 
      53         553 : static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
      54             : {
      55             :         uint32_t reg[4];
      56             :         unsigned f, ef, m, em;
      57             : 
      58         553 :         raid_cpuid(0, 0, reg);
      59             : 
      60         553 :         ((uint32_t*)vendor)[0] = reg[1];
      61         553 :         ((uint32_t*)vendor)[1] = reg[3];
      62         553 :         ((uint32_t*)vendor)[2] = reg[2];
      63         553 :         vendor[12] = 0;
      64             : 
      65         553 :         raid_cpuid(1, 0, reg);
      66             : 
      67         553 :         f = (reg[0] >> 8) & 0xF;
      68         553 :         ef = (reg[0] >> 20) & 0xFF;
      69         553 :         m = (reg[0] >> 4) & 0xF;
      70         553 :         em = (reg[0] >> 16) & 0xF;
      71             : 
      72         553 :         if (strcmp(vendor, "AuthenticAMD") == 0) {
      73           0 :                 if (f < 15) {
      74           0 :                         *family = f;
      75           0 :                         *model = m;
      76             :                 } else {
      77           0 :                         *family = f + ef;
      78           0 :                         *model = m + (em << 4);
      79             :                 }
      80             :         } else {
      81         553 :                 *family = f + ef;
      82         553 :                 *model = m + (em << 4);
      83             :         }
      84         553 : }
      85             : 
      86        1193 : static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
      87             : {
      88             :         uint32_t reg[4];
      89             : 
      90        1193 :         raid_cpuid(1, 0, reg);
      91        1193 :         if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
      92          18 :                 return 0;
      93        1175 :         if ((reg[3] & cpuid_1_edx) != cpuid_1_edx)
      94           0 :                 return 0;
      95             : 
      96        1175 :         return 1;
      97             : }
      98             : 
      99         342 : static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t xcr0)
     100             : {
     101             :         uint32_t reg[4];
     102             : 
     103         342 :         raid_cpuid(1, 0, reg);
     104         342 :         if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
     105         311 :                 return 0;
     106             : 
     107          31 :         raid_xgetbv(reg);
     108          31 :         if ((reg[0] & xcr0) != xcr0)
     109           0 :                 return 0;
     110             : 
     111          31 :         raid_cpuid(7, 0, reg);
     112          31 :         if ((reg[1] & cpuid_7_ebx) != cpuid_7_ebx)
     113           0 :                 return 0;
     114             : 
     115          31 :         return 1;
     116             : }
     117             : 
     118         307 : static inline int raid_cpu_has_sse2(void)
     119             : {
     120             :         /*
     121             :          * Intel® 64 and IA-32 Architectures Software Developer's Manual
     122             :          * 325462-048US September 2013
     123             :          *
     124             :          * 11.6.2 Checking for SSE/SSE2 Support
     125             :          * Before an application attempts to use the SSE and/or SSE2 extensions, it should check
     126             :          * that they are present on the processor:
     127             :          * 1. Check that the processor supports the CPUID instruction. Bit 21 of the EFLAGS
     128             :          * register can be used to check processor's support the CPUID instruction.
     129             :          * 2. Check that the processor supports the SSE and/or SSE2 extensions (true if
     130             :          * CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
     131             :          */
     132         307 :         return raid_cpu_match_sse(
     133             :                 0,
     134             :                 1 << 26); /* SSE2 */
     135             : }
     136             : 
     137         591 : static inline int raid_cpu_has_ssse3(void)
     138             : {
     139             :         /*
     140             :          * Intel® 64 and IA-32 Architectures Software Developer's Manual
     141             :          * 325462-048US September 2013
     142             :          *
     143             :          * 12.7.2 Checking for SSSE3 Support
     144             :          * Before an application attempts to use the SSSE3 extensions, the application should
     145             :          * follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
     146             :          * Next, use the additional step provided below:
     147             :          * Check that the processor supports SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
     148             :          */
     149         591 :         return raid_cpu_match_sse(
     150             :                 1 << 9, /* SSSE3 */
     151             :                 1 << 26); /* SSE2 */
     152             : }
     153             : 
     154         295 : static inline int raid_cpu_has_crc32(void)
     155             : {
     156             :         /*
     157             :          * Intel® 64 and IA-32 Architectures Software Developer's Manual
     158             :          * 325462-048US September 2013
     159             :          *
     160             :          * 12.12.3 Checking for SSE4.2 Support
     161             :          * ...
     162             :          * Before an application attempts to use the CRC32 instruction, it must check
     163             :          * that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
     164             :          */
     165         295 :         return raid_cpu_match_sse(
     166             :                 1 << 20, /* CRC32 */
     167             :                 0);
     168             : }
     169             : 
     170         342 : static inline int raid_cpu_has_avx2(void)
     171             : {
     172             :         /*
     173             :          * Intel Architecture Instruction Set Extensions Programming Reference
     174             :          * 319433-022 October 2014
     175             :          *
     176             :          * 14.3 Detection of AVX instructions
     177             :          * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
     178             :          * 2) Issue XGETBV and verify that XCR0[2:1] = `11b' (XMM state and YMM state are enabled by OS).
     179             :          * 3) detect CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported).
     180             :          * (Step 3 can be done in any order relative to 1 and 2)
     181             :          *
     182             :          * 14.7.1 Detection of AVX2
     183             :          * Hardware support for AVX2 is indicated by CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]=1.
     184             :          * Application Software must identify that hardware supports AVX, after that it must
     185             :          * also detect support for AVX2 by checking CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5].
     186             :          */
     187         342 :         return raid_cpu_match_avx(
     188             :                 (1 << 27) | (1 << 28), /* OSXSAVE and AVX */
     189             :                 1 << 5, /* AVX2 */
     190             :                 3 << 1); /* OS saves XMM and YMM registers */
     191             : }
     192             : 
     193             : static inline int raid_cpu_has_avx512bw(void)
     194             : {
     195             :         /*
     196             :          * Intel Architecture Instruction Set Extensions Programming Reference
     197             :          * 319433-022 October 2014
     198             :          *
     199             :          * 2.2 Detection of 512-bit Instruction Groups of Intel AVX-512 Family
     200             :          * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
     201             :          * 2) Execute XGETBV and verify that XCR0[7:5] = `111b' (OPMASK state, upper 256-bit of
     202             :          * ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled by OS) and that XCR0[2:1] = `11b'
     203             :          * (XMM state and YMM state are enabled by OS).
     204             :          * 3) Verify both CPUID.0x7.0:EBX.AVX512F[bit 16] = 1, CPUID.0x7.0:EBX.AVX512BW[bit 30] = 1.
     205             :          */
     206             : 
     207             :         /* note that intentionally we don't check for AVX and AVX2 */
     208             :         /* because the documentation doesn't require that */
     209             :         return raid_cpu_match_avx(
     210             :                 1 << 27, /* OSXSAVE (XGETBV enabled) */
     211             :                 (1 << 16) | (1 << 30), /* AVX512F and AVX512BW */
     212             :                 (3 << 1) | (7 << 5)); /* OS saves XMM, YMM and ZMM registers */
     213             : }
     214             : 
     215             : /**
     216             :  * Check if it's an Intel Atom CPU.
     217             :  */
     218         549 : static inline int raid_cpu_is_atom(unsigned family, unsigned model)
     219             : {
     220         549 :         if (family != 6)
     221           3 :                 return 0;
     222             : 
     223             :         /*
     224             :          * x86 Architecture CPUID
     225             :          * http://www.sandpile.org/x86/cpuid.htm
     226             :          *
     227             :          * Intel Atom
     228             :          * 1C (28) Atom (45 nm) with 512 KB on-die L2
     229             :          * 26 (38) Atom (45 nm) with 512 KB on-die L2
     230             :          * 36 (54) Atom (32 nm) with 512 KB on-die L2
     231             :          * 27 (39) Atom (32 nm) with 512 KB on-die L2
     232             :          * 35 (53) Atom (?? nm) with ??? KB on-die L2
     233             :          * 4A (74) Atom 2C (22 nm) 1 MB L2 + PowerVR (TGR)
     234             :          * 5A (90) Atom 4C (22 nm) 2 MB L2 + PowerVR (ANN)
     235             :          * 37 (55) Atom 4C (22 nm) 2 MB L2 + Intel Gen7 (BYT)
     236             :          * 4C (76) Atom 4C (14 nm) 2 MB L2 + Intel Gen8 (BSW)
     237             :          * 5D (93) Atom 4C (28 nm TSMC) 1 MB L2 + Mali (SoFIA)
     238             :          * 4D (77) Atom 8C (22 nm) 4 MB L2 (AVN)
     239             :          * ?? Atom ?C (14 nm) ? MB L2 (DVN)
     240             :          */
     241        1092 :         return model == 28 || model == 38 || model == 54
     242         546 :                 || model == 39 || model == 53 || model == 74
     243         546 :                 || model == 90 || model == 55 || model == 76
     244        1092 :                 || model == 93 || model == 77;
     245             : }
     246             : 
     247             : /**
     248             :  * Check if the processor has a slow MULT implementation.
     249             :  * If yes, it's better to use a hash not based on multiplication.
     250             :  */
     251           4 : static inline int raid_cpu_has_slowmult(void)
     252             : {
     253             :         char vendor[CPU_VENDOR_MAX];
     254             :         unsigned family;
     255             :         unsigned model;
     256             : 
     257             :         /*
     258             :          * In some cases Murmur3, which is based on the MUL instruction,
     259             :          * is a LOT slower than Spooky2, which is based on SHIFTs.
     260             :          */
     261           4 :         raid_cpu_info(vendor, &family, &model);
     262             : 
     263           4 :         if (strcmp(vendor, "GenuineIntel") == 0) {
     264             :                 /*
     265             :                  * Intel Atom (Model 28)
     266             :                  * murmur3:378 MB/s, spooky2:3413 MB/s (x86)
     267             :                  *
     268             :                  * Intel Atom (Model 77)
     269             :                  * murmur3:1311 MB/s, spooky2:4056 MB/s (x64)
     270             :                  */
     271           4 :                 if (raid_cpu_is_atom(family, model))
     272           0 :                         return 1;
     273             :         }
     274             : 
     275           4 :         return 0;
     276             : }
     277             : 
     278             : /**
     279             :  * Check if the processor has a slow extended set of SSE registers.
     280             :  * If yes, it's better to limit the unroll to the first 8 registers.
     281             :  */
     282         545 : static inline int raid_cpu_has_slowextendedreg(void)
     283             : {
     284             :         char vendor[CPU_VENDOR_MAX];
     285             :         unsigned family;
     286             :         unsigned model;
     287             : 
     288             :         /*
     289             :          * In some cases the PAR2 implementation using 16 SSE registers
     290             :          * is a LITTLE slower than the one using only the first 8 registers.
     291             :          * This doesn't happen for PARZ.
     292             :          */
     293         545 :         raid_cpu_info(vendor, &family, &model);
     294             : 
     295         545 :         if (strcmp(vendor, "AuthenticAMD") == 0) {
     296             :                 /*
     297             :                  * AMD Bulldozer
     298             :                  * par2_sse2:4922 MB/s, par2_sse2e:4465 MB/s
     299             :                  */
     300           0 :                 if (family == 21)
     301           0 :                         return 1;
     302             :         }
     303             : 
     304         545 :         if (strcmp(vendor, "GenuineIntel") == 0) {
     305             :                 /*
     306             :                  * Intel Atom (Model 77)
     307             :                  * par2_sse2:5686 MB/s, par2_sse2e:5250 MB/s
     308             :                  * parz_sse2:3100 MB/s, parz_sse2e:3400 MB/s
     309             :                  * par3_sse3:1921 MB/s, par3_sse3e:1813 MB/s
     310             :                  * par4_sse3:1175 MB/s, par4_sse3e:1113 MB/s
     311             :                  * par5_sse3:876 MB/s, par5_sse3e:675 MB/s
     312             :                  * par6_sse3:705 MB/s, par6_sse3e:529 MB/s
     313             :                  *
     314             :                  * Intel Atom (Model 77) "Avoton C2750"
     315             :                  * par2_sse2:5661 MB/s, par2_sse2e:5382 MB/s
     316             :                  * parz_sse2:3110 MB/s, parz_sse2e:3450 MB/s
     317             :                  * par3_sse3:1769 MB/s, par3_sse3e:1856 MB/s
     318             :                  * par4_sse3:1221 MB/s, par4_sse3e:1141 MB/s
     319             :                  * par5_sse3:910 MB/s, par5_sse3e:675 MB/s
     320             :                  * par6_sse3:720 MB/s, par6_sse3e:534 MB/s
     321             :                  */
     322         545 :                 if (raid_cpu_is_atom(family, model))
     323           0 :                         return 1;
     324             :         }
     325             : 
     326         545 :         return 0;
     327             : }
     328             : #endif
     329             : 
     330             : #endif
     331             : 

Generated by: LCOV version 1.13