AVX512をEC2で試す

今回はEC2のc5.largeインスタンスを使います。

CPU情報は以下の通りです。

$ cat /proc/cpuinfo
processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model       : 85
model name  : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
stepping    : 4
microcode   : 0x200005e
cpu MHz     : 3408.548
cache size  : 25344 KB
physical id : 0
siblings    : 2
core id     : 0
cpu cores   : 1
apicid      : 0
initial apicid  : 0
fpu     : yes
fpu_exception   : yes
cpuid level : 13
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke
bugs        : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds
bogomips    : 5999.99
clflush size    : 64
cache_alignment : 64
address sizes   : 46 bits physical, 48 bits virtual
power management:

processor   : 1
vendor_id   : GenuineIntel
cpu family  : 6
model       : 85
model name  : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
stepping    : 4
microcode   : 0x200005e
cpu MHz     : 3411.384
cache size  : 25344 KB
physical id : 0
siblings    : 2
core id     : 0
cpu cores   : 1
apicid      : 1
initial apicid  : 1
fpu     : yes
fpu_exception   : yes
cpuid level : 13
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke
bugs        : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds
bogomips    : 5999.99
clflush size    : 64
cache_alignment : 64
address sizes   : 46 bits physical, 48 bits virtual
power management:

c5の上位インスタンスはIntel DL Boostに対応していますが、下位インスタンスは対応していません。

[f:id:taku-woohar:20190922175408p:plain]
Intel DL Boost対応表
(出典:2019/7/24のIntel AIのブログ)

実行するコードは以下の通りです。

vector.h

/*
 * Returns the sum of the element-wise products vec1[i] * vec2[i].
 *
 * vec1, vec2: input float arrays.
 * num:        number of elements to process.
 *
 * NOTE(review): the implementation consumes the inputs 16 floats at a time;
 * confirm how lengths that are not a multiple of 16 are handled.
 */
float sumProduct(float* vec1, float* vec2, int num);

vector.c

#include "vector.h"
#include <immintrin.h>

float sumProduct(float* vec1, float* vec2, int num)
{
    __m512 avx_sum = _mm512_setzero_ps();
    for (int i = 0;i < num;i += 16) {
        const __m512 a512 = _mm512_loadu_ps((double*)&vec1[i]);
        const __m512 b512 = _mm512_loadu_ps((double*)&vec2[i]);
        avx_sum = _mm512_fmadd_ps(a512, b512, avx_sum);
    }

    float __attribute__((aligned(32))) out[16] = {};
    _mm512_storeu_ps(out, avx_sum);
    float sum = 0;
    for (int i = 0;i < 16;i++) {
        sum += out[i];
    }
    return sum;
}

test.c

#include<stdio.h>
#include "vector.h"

/* Feeds two identical 16-element vectors to sumProduct and prints the result. */
int main(void)
{
    float lhs[] = {
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f
    };
    float rhs[] = {
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f
    };

    /* Element count derived from the array itself (16). */
    const int count = (int)(sizeof lhs / sizeof lhs[0]);
    const float dot = sumProduct(lhs, rhs, count);

    printf("%f\n", dot);
    return 0;
}

以下のコマンドでビルドして実行します。

gcc -O2 -mavx512f vector.c -c  -o vector
gcc -O0 -mavx512f test.c -o test vector
./test
408.000000

参考サイト

https://colfaxresearch.com/knl-avx512/