LCOV - code coverage report
Current view: top level - raid - cpu.h (source / functions) Hit Total Coverage
Test: lcov.info Lines: 73 84 86.9 %
Date: 2026-04-29 15:04:44 Functions: 15 15 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : // Copyright (C) 2013 Andrea Mazzoleni
       3             : 
       4             : #ifndef __RAID_CPU_H
       5             : #define __RAID_CPU_H
       6             : 
       7             : #ifdef CONFIG_X86
       8             : 
       9        3629 : static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
      10             : {
      11        3629 :         asm volatile (
      12             : #if defined(__i386__) && defined(__PIC__)
      13             :                 /* allow compilation in PIC mode saving ebx */
      14             :                 "xchgl %%ebx, %1\n"
      15             :                 "cpuid\n"
      16             :                 "xchgl %%ebx, %1\n"
      17             :                 : "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
      18             :                 : "0" (func_eax), "2" (sub_ecx)
      19             : #else
      20             :                 "cpuid\n"
      21        3629 :                 : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
      22             :                 : "0" (func_eax), "2" (sub_ecx)
      23             : #endif
      24             :         );
      25        3629 : }
      26             : 
      27          93 : static inline void raid_xgetbv(uint32_t* reg)
      28             : {
      29             :         /* get the value of the Extended Control Register ecx=0 */
      30          93 :         asm volatile (
      31             :                 /*
      32             :                  * Uses a direct encoding of the XGETBV instruction as only recent
      33             :                  * assemblers support it.
      34             :                  * the next line is equivalent to: "xgetbv\n"
      35             :                  */
      36             :                 ".byte 0x0f, 0x01, 0xd0\n"
      37          93 :                 : "=a" (reg[0]), "=d" (reg[3])
      38             :                 : "c" (0)
      39             :         );
      40          93 : }
      41             : 
      42             : #define CPU_VENDOR_MAX 13
      43             : 
      44         658 : static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
      45             : {
      46             :         uint32_t reg[4];
      47             :         unsigned f, ef, m, em;
      48             : 
      49         658 :         raid_cpuid(0, 0, reg);
      50             : 
      51         658 :         ((uint32_t*)vendor)[0] = reg[1];
      52         658 :         ((uint32_t*)vendor)[1] = reg[3];
      53         658 :         ((uint32_t*)vendor)[2] = reg[2];
      54         658 :         vendor[12] = 0;
      55             : 
      56         658 :         raid_cpuid(1, 0, reg);
      57             : 
      58         658 :         f = (reg[0] >> 8) & 0xF;
      59         658 :         ef = (reg[0] >> 20) & 0xFF;
      60         658 :         m = (reg[0] >> 4) & 0xF;
      61         658 :         em = (reg[0] >> 16) & 0xF;
      62             : 
      63         658 :         if (strcmp(vendor, "AuthenticAMD") == 0) {
      64           0 :                 if (f < 15) {
      65           0 :                         *family = f;
      66           0 :                         *model = m;
      67             :                 } else {
      68           0 :                         *family = f + ef;
      69           0 :                         *model = m + (em << 4);
      70             :                 }
      71             :         } else {
      72         658 :                 *family = f + ef;
      73         658 :                 *model = m + (em << 4);
      74             :         }
      75         658 : }
      76             : 
      77        1417 : static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
      78             : {
      79             :         uint32_t reg[4];
      80             : 
      81        1417 :         raid_cpuid(1, 0, reg);
      82        1417 :         if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
      83          18 :                 return 0;
      84        1399 :         if ((reg[3] & cpuid_1_edx) != cpuid_1_edx)
      85           0 :                 return 0;
      86             : 
      87        1399 :         return 1;
      88             : }
      89             : 
      90         818 : static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t cpuid_7_ecx, uint32_t xcr0)
      91             : {
      92             :         uint32_t reg[4];
      93             : 
      94         818 :         raid_cpuid(1, 0, reg);
      95         818 :         if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
      96         725 :                 return 0;
      97             : 
      98          93 :         raid_xgetbv(reg);
      99          93 :         if ((reg[0] & xcr0) != xcr0)
     100          15 :                 return 0;
     101             : 
     102          78 :         raid_cpuid(7, 0, reg);
     103          78 :         if ((reg[1] & cpuid_7_ebx) != cpuid_7_ebx)
     104           0 :                 return 0;
     105             : 
     106          78 :         if ((reg[2] & cpuid_7_ecx) != cpuid_7_ecx)
     107           3 :                 return 0;
     108             : 
     109          75 :         return 1;
     110             : }
     111             : 
     112         366 : static inline int raid_cpu_has_sse2(void)
     113             : {
     114             :         /*
     115             :          * Intel 64 and IA-32 Architectures Software Developer's Manual
     116             :          * 325462-048US September 2013
     117             :          *
     118             :          * 11.6.2 Checking for SSE/SSE2 Support
     119             :          * Before an application attempts to use the SSE and/or SSE2 extensions, it should check
     120             :          * that they are present on the processor:
     121             :          * 1. Check that the processor supports the CPUID instruction. Bit 21 of the EFLAGS
     122             :          * register can be used to check if the processor supports the CPUID instruction.
     123             :          * 2. Check that the processor supports the SSE and/or SSE2 extensions (true if
     124             :          * CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
     125             :          */
     126         366 :         return raid_cpu_match_sse(
     127             :                 0,
     128             :                 1 << 26); /* SSE2 */
     129             : }
     130             : 
     131         703 : static inline int raid_cpu_has_ssse3(void)
     132             : {
     133             :         /*
     134             :          * Intel 64 and IA-32 Architectures Software Developer's Manual
     135             :          * 325462-048US September 2013
     136             :          *
     137             :          * 12.7.2 Checking for SSSE3 Support
     138             :          * Before an application attempts to use the SSSE3 extensions, the application should
     139             :          * follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
     140             :          * Next, use the additional step provided below:
     141             :          * Check that the processor supports SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
     142             :          */
     143         703 :         return raid_cpu_match_sse(
     144             :                 1 << 9, /* SSSE3 */
     145             :                 1 << 26); /* SSE2 */
     146             : }
     147             : 
     148         348 : static inline int raid_cpu_has_crc32(void)
     149             : {
     150             :         /*
     151             :          * Intel 64 and IA-32 Architectures Software Developer's Manual
     152             :          * 325462-048US September 2013
     153             :          *
     154             :          * 12.12.3 Checking for SSE4.2 Support
     155             :          * ...
     156             :          * Before an application attempts to use the CRC32 instruction, it must check
     157             :          * that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
     158             :          */
     159         348 :         return raid_cpu_match_sse(
     160             :                 1 << 20, /* CRC32 */
     161             :                 0);
     162             : }
     163             : 
     164         407 : static inline int raid_cpu_has_avx2(void)
     165             : {
     166             :         /*
     167             :          * Intel Architecture Instruction Set Extensions Programming Reference
     168             :          * 319433-022 October 2014
     169             :          *
     170             :          * 14.3 Detection of AVX instructions
     171             :          * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use1)
     172             :          * 2) Issue XGETBV and verify that XCR0[2:1] = `11b' (XMM state and YMM state are enabled by OS).
     173             :          * 3) detect CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported).
     174             :          * (Step 3 can be done in any order relative to 1 and 2)
     175             :          *
     176             :          * 14.7.1 Detection of AVX2
     177             :          * Hardware support for AVX2 is indicated by CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]=1.
     178             :          * Application Software must identify that hardware supports AVX, after that it must
     179             :          * also detect support for AVX2 by checking CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5].
     180             :          */
     181         407 :         return raid_cpu_match_avx(
     182             :                 (1 << 27) | (1 << 28), /* Leaf 1, ECX: XSAVE and AVX */
     183             :                 1 << 5, /* Leaf 7, EBX: AVX2 */
     184             :                 0, /* Leaf 7, ECX: */
     185             :                 3 << 1); /* XCR0: OS saves XMM and YMM registers */
     186             : }
     187             : 
     188           5 : static inline int raid_cpu_has_avx2gfni(void)
     189             : {
     190             :         /*
     191             :          * Detection of AVX2 + GFNI:
     192             :          * 1) Verify OSXSAVE and XGETBV (CPUID.1:ECX[27])
     193             :          * 2) Verify XCR0[2:1] = '11b' (XMM and YMM state enabled)
     194             :          * 3) Verify AVX2 support (CPUID.7.0:EBX[5])
     195             :          * 4) Verify GFNI support (CPUID.7.0:ECX[8])
     196             :          */
     197           5 :         return raid_cpu_match_avx(
     198             :                 (1 << 27) | (1 << 28), /* Leaf 1, ECX: XSAVE and AVX */
     199             :                 1 << 5, /* Leaf 7, EBX: AVX2 */
     200             :                 1 << 8, /* Leaf 7, ECX: GFNI */
     201             :                 3 << 1); /* XCR0: OS saves XMM and YMM registers */
     202             : }
     203             : 
     204         401 : static inline int raid_cpu_has_avx512bw(void)
     205             : {
     206             :         /*
     207             :          * Intel Architecture Instruction Set Extensions Programming Reference
     208             :          * 319433-022 October 2014
     209             :          *
     210             :          * 2.2 Detection of 512-bit Instruction Groups of Intel AVX-512 Family
     211             :          * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
     212             :          * 2) Execute XGETBV and verify that XCR0[7:5] = `111b' (OPMASK state, upper 256-bit of
     213             :          * ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled by OS) and that XCR0[2:1] = `11b'
     214             :          * (XMM state and YMM state are enabled by OS).
     215             :          * 3) Verify both CPUID.0x7.0:EBX.AVX512F[bit 16] = 1, CPUID.0x7.0:EBX.AVX512BW[bit 30] = 1.
     216             :          */
     217             : 
     218             :         /*
     219             :          * Note that intentionally we don't check for AVX and AVX2
     220             :          * because the documentation doesn't require that
     221             :          */
     222         401 :         return raid_cpu_match_avx(
     223             :                 1 << 27, /* Leaf 1, ECX: XSAVE/XGETBV */
     224             :                 (1 << 16) | (1 << 30), /* Leaf 7, EBX: AVX512F and AVX512BW */
     225             :                 0, /* Leaf 7, ECX: */
     226             :                 (3 << 1) | (7 << 5)); /* XCR0: OS saves XMM, YMM and ZMM registers */
     227             : }
     228             : 
     229           5 : static inline int raid_cpu_has_avx512gfni(void)
     230             : {
     231           5 :         return raid_cpu_match_avx(
     232             :                 1 << 27, /* Leaf 1, ECX: XSAVE/XGETBV */
     233             :                 1 << 16,  /* Leaf 7, EBX: AVX512F (Foundation) */
     234             :                 1 << 8, /* Leaf 7, ECX: GFNI */
     235             :                 (3 << 1) | (7 << 5) /* XCR0: OS saves XMM, YMM and ZMM registers */
     236             :         );
     237             : }
     238             : 
     239             : /**
     240             :  * Check if it's an Intel Atom CPU.
     241             :  */
     242         653 : static inline int raid_cpu_is_atom(unsigned family, unsigned model)
     243             : {
     244         653 :         if (family != 6)
     245           3 :                 return 0;
     246             : 
     247             :         /*
     248             :          * x86 Architecture CPUID
     249             :          * http://www.sandpile.org/x86/cpuid.htm
     250             :          *
     251             :          * Intel Atom
     252             :          * 1C (28) Atom (45 nm) with 512 KB on-die L2
     253             :          * 26 (38) Atom (45 nm) with 512 KB on-die L2
     254             :          * 36 (54) Atom (32 nm) with 512 KB on-die L2
     255             :          * 27 (39) Atom (32 nm) with 512 KB on-die L2
     256             :          * 35 (53) Atom (?? nm) with ??? KB on-die L2
     257             :          * 4A (74) Atom 2C (22 nm) 1 MB L2 + PowerVR (TGR)
     258             :          * 5A (90) Atom 4C (22 nm) 2 MB L2 + PowerVR (ANN)
     259             :          * 37 (55) Atom 4C (22 nm) 2 MB L2 + Intel Gen7 (BYT)
     260             :          * 4C (76) Atom 4C (14 nm) 2 MB L2 + Intel Gen8 (BSW)
     261             :          * 5D (93) Atom 4C (28 nm TSMC) 1 MB L2 + Mali (SoFIA)
     262             :          * 4D (77) Atom 8C (22 nm) 4 MB L2 (AVN)
     263             :          * ?? Atom ?C (14 nm) ? MB L2 (DVN)
     264             :          */
     265         650 :         return model == 28 || model == 38 || model == 54
     266         650 :                 || model == 39 || model == 53 || model == 74
     267         650 :                 || model == 90 || model == 55 || model == 76
     268        1300 :                 || model == 93 || model == 77;
     269             : }
     270             : 
     271             : /**
     272             :  * Check if the processor has a slow MULT implementation.
     273             :  * If yes, it's better to use a hash not based on multiplication.
     274             :  */
     275           5 : static inline int raid_cpu_has_slowmult(void)
     276             : {
     277             :         char vendor[CPU_VENDOR_MAX];
     278             :         unsigned family;
     279             :         unsigned model;
     280             : 
     281             :         /*
     282             :          * In some cases Murmur3 based on MUL instruction,
     283             :          * is a LOT slower than Spooky2 based on SHIFTs.
     284             :          */
     285           5 :         raid_cpu_info(vendor, &family, &model);
     286             : 
     287           5 :         if (strcmp(vendor, "GenuineIntel") == 0) {
     288             :                 /*
     289             :                  * Intel Atom (Model 28)
     290             :                  * murmur3:378 MB/s, spooky2:3413 MB/s (x86)
     291             :                  *
     292             :                  * Intel Atom (Model 77)
     293             :                  * murmur3:1311 MB/s, spooky2:4056 MB/s (x64)
     294             :                  */
     295           5 :                 if (raid_cpu_is_atom(family, model))
     296           0 :                         return 1;
     297             :         }
     298             : 
     299           5 :         return 0;
     300             : }
     301             : 
     302             : /**
     303             :  * Check if the processor has a slow extended set of SSE registers.
     304             :  * If yes, it's better to limit the unrolling to the first 8 registers.
     305             :  */
     306         648 : static inline int raid_cpu_has_slowextendedreg(void)
     307             : {
     308             :         char vendor[CPU_VENDOR_MAX];
     309             :         unsigned family;
     310             :         unsigned model;
     311             : 
     312             :         /*
     313             :          * In some cases the PAR2 implementation using 16 SSE registers
     314             :          * is a LITTLE slower than the one using only the first 8 registers.
     315             :          * This doesn't happen for PARZ.
     316             :          */
     317         648 :         raid_cpu_info(vendor, &family, &model);
     318             : 
     319         648 :         if (strcmp(vendor, "AuthenticAMD") == 0) {
     320             :                 /*
     321             :                  * AMD Bulldozer
     322             :                  * par2_sse2:4922 MB/s, par2_sse2e:4465 MB/s
     323             :                  */
     324           0 :                 if (family == 21)
     325           0 :                         return 1;
     326             :         }
     327             : 
     328         648 :         if (strcmp(vendor, "GenuineIntel") == 0) {
     329             :                 /*
     330             :                  * Intel Atom (Model 77)
     331             :                  * par2_sse2:5686 MB/s, par2_sse2e:5250 MB/s
     332             :                  * parz_sse2:3100 MB/s, parz_sse2e:3400 MB/s
     333             :                  * par3_sse3:1921 MB/s, par3_sse3e:1813 MB/s
     334             :                  * par4_sse3:1175 MB/s, par4_sse3e:1113 MB/s
     335             :                  * par5_sse3:876 MB/s, par5_sse3e:675 MB/s
     336             :                  * par6_sse3:705 MB/s, par6_sse3e:529 MB/s
     337             :                  *
     338             :                  * Intel Atom (Model 77) "Avoton C2750"
     339             :                  * par2_sse2:5661 MB/s, par2_sse2e:5382 MB/s
     340             :                  * parz_sse2:3110 MB/s, parz_sse2e:3450 MB/s
     341             :                  * par3_sse3:1769 MB/s, par3_sse3e:1856 MB/s
     342             :                  * par4_sse3:1221 MB/s, par4_sse3e:1141 MB/s
     343             :                  * par5_sse3:910 MB/s, par5_sse3e:675 MB/s
     344             :                  * par6_sse3:720 MB/s, par6_sse3e:534 MB/s
     345             :                  */
     346         648 :                 if (raid_cpu_is_atom(family, model))
     347           0 :                         return 1;
     348             :         }
     349             : 
     350         648 :         return 0;
     351             : }
     352             : #endif
     353             : 
     354             : #endif
     355             : 

Generated by: LCOV version 1.0