今回はEC2のc5.largeインスタンスを使います。
CPU情報は
$ cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 85 model name : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz stepping : 4 microcode : 0x200005e cpu MHz : 3408.548 cache size : 25344 KB physical id : 0 siblings : 2 core id : 0 cpu cores : 1 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds bogomips : 5999.99 clflush size : 64 cache_alignment : 64 address sizes : 46 bits physical, 48 bits virtual power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 85 model name : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz stepping : 4 microcode : 0x200005e cpu MHz : 3411.384 cache size : 25344 KB physical id : 0 siblings : 2 core id : 0 cpu cores : 1 apicid : 1 initial apicid : 1 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds bogomips : 5999.99 clflush size : 64 cache_alignment : 64 address sizes : 46 bits physical, 48 bits virtual power management:
c5の上位インスタンスではIntel DL Boostに対応していますが、下位インスタンスでは対応していません。
実行のコードです。
vector.h
float sumProduct(float* vec1, float* vec2, int num);
vector.c
#include "vector.h" #include <immintrin.h> float sumProduct(float* vec1, float* vec2, int num) { __m512 avx_sum = _mm512_setzero_ps(); for (int i = 0;i < num;i += 16) { const __m512 a512 = _mm512_loadu_ps((double*)&vec1[i]); const __m512 b512 = _mm512_loadu_ps((double*)&vec2[i]); avx_sum = _mm512_fmadd_ps(a512, b512, avx_sum); } float __attribute__((aligned(32))) out[16] = {}; _mm512_storeu_ps(out, avx_sum); float sum = 0; for (int i = 0;i < 16;i++) { sum += out[i]; } return sum; }
test.c
#include<stdio.h> #include "vector.h" int main(void) { float a[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; float b[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; printf("%f\n", sumProduct(a, b, 16)); return 0; }
以下のコマンドでビルドします。
gcc -O2 -mavx512f vector.c -c -o vector gcc -O0 -mavx512f test.c -o test vector ./test 408.000000