EC2 c5.12xlarge インスタンスでは使えるはずと思って試しました。 Intelのサイトを参考に
sudo apt install gcc-8 clang-8
#include <immintrin.h> #include <stdio.h> int main() { int8_t __attribute__((aligned(64))) op1_int8[64]; int8_t __attribute__((aligned(64))) op2_int8[64]; int __attribute__((aligned(64))) op3_int[16]; int16_t __attribute__((aligned(64))) op4_int16[32]; __m512i v1_int8; __m512i v2_int8; __m512i v3_int; __m512i v4_int16; printf("size of int8_t is %zu\n", sizeof(int8_t)); printf("size of int is %zu\n", sizeof(int)); printf("size of int16_t is %zu\n", sizeof(int16_t)); for (int i = 0;i < 64;i++) { op1_int8[i] = i; op2_int8[i] = i; } for (int i = 0;i < 16;i++) { v3_int[i] = 0; } for (int i = 0;i < 32;i++) { v4_int16[i] = 0; } v1_int8 =_mm512_load_si512(&op1_int8); v2_int8 =_mm512_load_si512(&op2_int8); v3_int =_mm512_load_si512(&op3_int); v4_int16 =_mm512_load_si512(&op4_int16); __m512i result = _mm512_dpbusds_epi32(v3_int, v1_int8, v2_int8); int* presult = (int*) &result; for (int i = 0; i < 16; i++) { printf("%d = %d\n", i, presult[i]); } return 0; }
gcc-8 -mavx512f -march=icelake-server main.c -o main
実行結果が以下の通り
size of int8_t is 1 size of int is 4 size of int16_t is 2 0 = 15 1 = 126 2 = 1135245462 3 = 33463 4 = 1135244750 5 = 34583 6 = 2607 7 = 3486 8 = 1135208014 9 = 38359 10 = 1133029965 11 = 41015 12 = 1135254878 13 = 44183 14 = 13230 15 = 15134
明らかに計算結果が違っています。 原因調査中・・・
AVX-512 Vector Neural Network Instructions (VNNI) - x86 - WikiChip