Intel DL Boost を使おうとした続き

先日の記事の件ですが、問題を解決できました。

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Number of elements held by one 512-bit register for each element width. */
enum {
    LANES_8  = 64,
    LANES_16 = 32,
    LANES_32 = 16
};

/* _mm512_load_si512 / _mm512_store_si512 require 64-byte aligned addresses. */
#define ZMM_ALIGNED __attribute__((aligned(64)))

/*
 * Print LANES_32 32-bit results, one "index = value" line per lane.
 *
 * lanes: pointer to at least LANES_32 ints.
 */
static void print_lanes(const int *lanes)
{
    for (int i = 0; i < LANES_32; i++) {
        printf("%d = %d\n", i, lanes[i]);
    }
}

/*
 * Run the VNNI instruction vpdpbusds and the older vpmaddwd + vpaddd pair
 * on ramp inputs (element i holds the value i) and print every 32-bit lane.
 *
 * The two sections intentionally print different numbers:
 *   - vpdpbusds sums FOUR u8 * s8 products per 32-bit lane
 *     (lane 0 = 0*0 + 1*1 + 2*2 + 3*3 = 14);
 *   - vpmaddwd sums TWO s16 * s16 products per 32-bit lane
 *     (lane 0 = 0*0 + 1*1 = 1).
 * So this is not a like-for-like emulation of vpdpbusds.
 *
 * Needs a compiler/CPU with AVX-512 VNNI and AVX-512BW enabled
 * (e.g. gcc -mavx512vnni -mavx512bw).
 *
 * Returns 0.
 */
int main(void)
{
    /* vpdpbusds reads its first multiplicand as unsigned bytes and the
     * second as signed bytes, so the buffers are typed accordingly. */
    uint8_t ZMM_ALIGNED a_u8[LANES_8];
    int8_t  ZMM_ALIGNED b_s8[LANES_8];
    int16_t ZMM_ALIGNED a_s16[LANES_16];
    int16_t ZMM_ALIGNED b_s16[LANES_16];
    int     ZMM_ALIGNED acc_init[LANES_32] = {0};  /* accumulator starts at 0 */
    int     ZMM_ALIGNED out[LANES_32];

    for (int i = 0; i < LANES_8; i++) {
        a_u8[i] = (uint8_t)i;
        b_s8[i] = (int8_t)i;
    }
    for (int i = 0; i < LANES_16; i++) {
        a_s16[i] = (int16_t)i;
        b_s16[i] = (int16_t)i;
    }

    const __m512i acc = _mm512_load_si512(acc_init);

    printf("vpdpbusds\n");
    {
        const __m512i va = _mm512_load_si512(a_u8);
        const __m512i vb = _mm512_load_si512(b_s8);

        /* acc + sum of 4 adjacent byte products, with signed saturation. */
        _mm512_store_si512(out, _mm512_dpbusds_epi32(acc, va, vb));
        print_lanes(out);
    }

    printf("vpmaddwd + vpaddd\n");
    {
        const __m512i va = _mm512_load_si512(a_s16);
        const __m512i vb = _mm512_load_si512(b_s16);

        /* Sum of 2 adjacent 16-bit products, then a plain 32-bit add. */
        const __m512i pair_sums = _mm512_madd_epi16(va, vb);
        _mm512_store_si512(out, _mm512_add_epi32(pair_sums, acc));
        print_lanes(out);
    }

    return 0;
}

実行結果

vpdpbusds
0 = 14
1 = 126
2 = 366
3 = 734
4 = 1230
5 = 1854
6 = 2606
7 = 3486
8 = 4494
9 = 5630
10 = 6894
11 = 8286
12 = 9806
13 = 11454
14 = 13230
15 = 15134
vpmaddwd + vpaddd
0 = 1
1 = 13
2 = 41
3 = 85
4 = 145
5 = 221
6 = 313
7 = 421
8 = 545
9 = 685
10 = 841
11 = 1013
12 = 1201
13 = 1405
14 = 1625
15 = 1861

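なんか vpmaddwd + vpaddd の使い方を間違っている気がしますが、気にしない方針で。(vpdpbusds は 8bit 同士の積を 4 個ずつ、vpmaddwd は 16bit 同士の積を 2 個ずつ 32bit の各レーンに足し込むので、同じ連番を入力しても結果は一致しません。例えばレーン 0 は前者が 0+1+4+9=14、後者が 0+1=1 です。)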