Intel DL boostを使おうとしたけど

EC2 c5.12xlarge インスタンスでは使えるはずと思って試しました。 Intelのサイトを参考に

sudo apt install gcc-8 clang-8
#include <immintrin.h>
#include <stdio.h>

int main()
{
    int8_t __attribute__((aligned(64)))   op1_int8[64];
    int8_t __attribute__((aligned(64)))    op2_int8[64];
    int    __attribute__((aligned(64)))   op3_int[16];
    int16_t __attribute__((aligned(64)))   op4_int16[32];

    __m512i  v1_int8;
    __m512i  v2_int8;
    __m512i  v3_int;
    __m512i  v4_int16;

    printf("size of int8_t is %zu\n", sizeof(int8_t));
    printf("size of int is %zu\n", sizeof(int));
    printf("size of int16_t is %zu\n", sizeof(int16_t));

    for (int i = 0;i < 64;i++) {
        op1_int8[i] = i;
        op2_int8[i] = i;
    }
    for (int i = 0;i < 16;i++) {
        v3_int[i] = 0;
    }
    for (int i = 0;i < 32;i++) {
        v4_int16[i] = 0;
    }

    v1_int8 =_mm512_load_si512(&op1_int8);
    v2_int8 =_mm512_load_si512(&op2_int8);
    v3_int =_mm512_load_si512(&op3_int);
    v4_int16 =_mm512_load_si512(&op4_int16);

    __m512i result = _mm512_dpbusds_epi32(v3_int, v1_int8, v2_int8);
    int* presult = (int*) &result;

    for (int i = 0; i < 16; i++) {
        printf("%d = %d\n", i, presult[i]);
    }

    return 0;
}

gccコンパイル

gcc-8 -mavx512f -march=icelake-server  main.c -o main

実行結果が以下の通り

size of int8_t is 1
size of int is 4
size of int16_t is 2
0 = 15
1 = 126
2 = 1135245462
3 = 33463
4 = 1135244750
5 = 34583
6 = 2607
7 = 3486
8 = 1135208014
9 = 38359
10 = 1133029965
11 = 41015
12 = 1135254878
13 = 44183
14 = 13230
15 = 15134

明らかに計算結果が違っています。 原因調査中・・・

AVX-512 Vector Neural Network Instructions (VNNI) - x86 - WikiChip