pangbryant
pangbryant

Reputation: 101

Inline assembly causes Segmentation fault (core dumped)

I'm attempting to convert the Intel intrinsics into inline assembly.

The code is going to calculate a 4x4 Matrix. The size of A and B are 4 x kc and kc x 4, respectively.

Here is the complete function:

 #define MR 4
 #define NR 4
 // compute C := beta * C + alpha * AB
 static void  dgemm_micro_kernel(int kc,
               double alpha, const double *A, const double *B,
               double beta,
               double *C, int incRowC, int incColC)
{ 
    double AB[MR*NR] __attribute__ ((aligned (32)));

    int i, j, p;
    register __m256d ab_00_10_20_30, ab_01_11_21_31, ab_02_12_22_32, ab_03_13_23_33;
    register __m256d a_0123, b_0000, b_1111, b_2222, b_3333;


    ab_00_10_20_30 = _mm256_setzero_pd();
    ab_01_11_21_31 = _mm256_setzero_pd();
    ab_02_12_22_32 = _mm256_setzero_pd();
    ab_03_13_23_33 = _mm256_setzero_pd();

    for (p = 0; p < kc; p++)
    {
        a_0123 = _mm256_load_pd(A);
        b_0000 = _mm256_broadcast_sd(B);
        b_1111 = _mm256_broadcast_sd(B + 1);
        b_2222 = _mm256_broadcast_sd(B + 2);
        b_3333 = _mm256_broadcast_sd(B + 3);

        // Col 1
        ab_00_10_20_30 = _mm256_fmadd_pd(a_0123, b_0000, ab_00_10_20_30);
        // Col 2
        ab_01_11_21_31 = _mm256_fmadd_pd(a_0123, b_1111, ab_01_11_21_31);
        // Col 3
        ab_02_12_22_32 = _mm256_fmadd_pd(a_0123, b_2222, ab_02_12_22_32);
        // Col 4
        ab_03_13_23_33 = _mm256_fmadd_pd(a_0123, b_3333, ab_03_13_23_33);

        A += MR;
        B += NR;
  }
    _mm256_store_pd(AB +  0, ab_00_10_20_30);
    _mm256_store_pd(AB +  4, ab_01_11_21_31);
    _mm256_store_pd(AB +  8, ab_02_12_22_32);
    _mm256_store_pd(AB + 12, ab_03_13_23_33);

    // Updata C := beta * C
    if (beta == 0.0)
   {
        // C == 0
        for (j = 0; j < NR; j++)
        {
            for (i = 0; i < MR; i++)
           {
                C[i * incRowC + j * incColC] = 0.0;
          }
      }
    }
    else if (beta != 1.0)
    {
        // C := beta * C
        for (j = 0; j < NR; j++)
        {
            for (i = 0; i < MR; i++)
            {
                C[i * incRowC + j * incColC] *= beta;
            }
        }
    }

    // Updata C := C + alpha * AB
    if (alpha == 1.0)
    {
        for (j = 0; j < NR; j++)
        {
            for (i = 0; i < MR; i++)
            {
                C[i * incRowC + j * incColC] += AB[j * MR + i];
            }
        }
    }
    else
    {
        for (j = 0; j < NR; j++)
        {
            for (i = 0; i < MR; i++)
            {
                C[i * incRowC + j * incColC] += alpha * AB[j * MR + i];
            }
        }
    }
}

Here is my inline assembly (just post the related part):

double AB[16] __attribute__ ((aligned(32)));
__asm__ volatile
(
    "movl           %0,         %%esi               \n\t"   // kc
    "movq           %1,         %%rax               \n\t"   // A
    "movq           %2,         %%rbx               \n\t"   // B
    "movq           %3,         %%rcx               \n\t"   // AB
    "                                               \n\t"
    "vxorpd         %%ymm0,     %%ymm0,     %%ymm0  \n\t"   // SET ZERO
    "vxorpd         %%ymm1,     %%ymm1,     %%ymm1  \n\t"
    "vxorpd         %%ymm2,     %%ymm2,     %%ymm2  \n\t"
    "vxorpd         %%ymm3,     %%ymm3,     %%ymm3  \n\t"
    "                                               \n\t"
    "testl           %%esi,      %%esi               \n\t"   // CHECK
    "je             .DWRITEAB                       \n\t"
    "                                               \n\t"
    ".DLOOP:                                        \n\t"   // LOOP
    "vmovapd        (%%rax),    %%ymm4              \n\t"   // load a_0123
    "vbroadcastsd   (%%rbx),    %%ymm5              \n\t"   // load b_0000
    "vbroadcastsd   8(%%rbx),   %%ymm6              \n\t"   // load b_1111
    "vbroadcastsd   16(%%rbx),  %%ymm7              \n\t"   // load b_2222
    "vbroadcastsd   24(%%rbx),  %%ymm8              \n\t"   // load b_3333
    "                                               \n\t"
    "vfmadd132pd    %%ymm4,     %%ymm5,     %%ymm0  \n\t"   // Col 1
    "vfmadd132pd    %%ymm4,     %%ymm6,     %%ymm1  \n\t"   // Col 2
    "vfmadd132pd    %%ymm4,     %%ymm7,     %%ymm2  \n\t"   // Col 3
    "vfmadd132pd    %%ymm4,     %%ymm8,     %%ymm3  \n\t"   // Col 4
    "                                               \n\t"
    "addq           $32,        %%rax               \n\t"
    "addq           $32,        %%rbx               \n\t"
    "                                               \n\t"
    "decl           %%esi                           \n\t"
    "jne            .DLOOP                          \n\t"
    "                                               \n\t"
    ".DWRITEAB:                                     \n\t"
    "vmovapd        %%ymm0,     (%%rcx)             \n\t"
    "vmovapd        %%ymm1,     32(%%rcx)           \n\t"
    "vmovapd        %%ymm2,     64(%%rcx)           \n\t"
    "vmovapd        %%ymm3,     96(%%rcx)           \n\t"
    "                                               \n\t"
    : // output
    : // input
        "m" (kc), // 0
        "m" (A),  // 1
        "m" (B),  // 2
        "m" (AB) // 3
    : // clober list
        "rax", "rbx", "rcx", "esi",
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5", "xmm6", "xmm7", "xmm8", "memory"
);

Then I compile and run it, the output shows Segmentation fault (core dumped). However, the intrinsic version works well. What's wrong with my inline assembly code?

Upvotes: 0

Views: 297

Answers (1)

prl
prl

Reputation: 12435

AB is an array, but you are using it as a pointer. Also, it is an output, but it is listed as an input.

The easiest change to fix this is to use lea instead of mov to load the address of AB into rcx. Also put "=m"(AB) as an output.

A better solution is to let the compiler do register allocation and remove the clobbers for eax, ebx, ecx, and esi. By using an "r" constraint, the compiler converts the array to a pointer to its first element and puts the pointer in the register. You can avoid the memory clobber by listing the array operands twice.

WARNING, this isn’t quite right, because it doesn’t properly indicate that the assembly code changes its input registers. Since you haven’t shown the entire function, I can’t tell if that will cause a problem (but it’s definitely wrong).

asm ("..."
    : // output
      "=m"(AB)
    : // input
      "r"(kc), "r"(A), "r"(B), "r"(AB),
      "m"(*(double (*)[4*kc])A), "m"(*(double (*)[4*kc])B)
    : // clobber list
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
      "xmm5", "xmm6", "xmm7", "xmm8"
);

This requires changing all the references to the parameters in the assembly code to use %1, %2, %3, and %4.

Upvotes: 2

Related Questions