What is the right way to emit masked vmovapd (AVX-512) instructions using ASM?

Question

I've been trying to write some AVX512 code to transpose a 8x8 matrix of doubles, that is already in 8 zmm registers.

One of the tricks I was trying was to replace 2 shuffles with 1 shuffle and 2 mask_movs, so that there is less port 5 pressure - https://gcc.godbolt.org/z/HxZThj. The example code loads and stores the matrix but for my actual use case I have the matrix in zmm registers and need the transpose to stay in zmm registers. Clang however decides to just output 3 shuffles instead!

void Transpose(double* in, double* out) {
  __m512d __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
  __m512d __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
  __m512d row0 = _mm512_load_pd(in + 0 * 8);  //  0  1  2  3  4  5  6  7
  __m512d row1 = _mm512_load_pd(in + 1 * 8);  //  8  9 10 11 12 13 14 15
  __m512d row2 = _mm512_load_pd(in + 2 * 8);  // 16 17 18 19 20 21 22 23
  __m512d row3 = _mm512_load_pd(in + 3 * 8);  // 24 25 26 27 28 29 30 31
  __m512d row4 = _mm512_load_pd(in + 4 * 8);  // 32 33 34 35 36 37 38 39
  __m512d row5 = _mm512_load_pd(in + 5 * 8);  // 40 41 42 43 44 45 46 47
  __m512d row6 = _mm512_load_pd(in + 6 * 8);  // 48 49 50 51 52 53 54 55
  __m512d row7 = _mm512_load_pd(in + 7 * 8);  // 56 57 58 59 60 61 62 63

// IACA_START
  __t0 = _mm512_unpacklo_pd(row0, row1);  // 0  8  2  10  4 12  6 14
  __t1 = _mm512_unpackhi_pd(row0, row1);  // 1  9  3  11  5 13  7 15
  __t2 = _mm512_unpacklo_pd(row2, row3);  // 16 24 18 26 20 28 22 30
  __t3 = _mm512_unpackhi_pd(row2, row3);  // 17 25 19 27 21 29 23 31
  __t4 = _mm512_unpacklo_pd(row4, row5);  // 32 40 34 42 36 44 38 46
  __t5 = _mm512_unpackhi_pd(row4, row5);  // 33 41 35 43 37 45 39 47
  __t6 = _mm512_unpacklo_pd(row6, row7);  // 48 56 50 58 52 60 54 62
  __t7 = _mm512_unpackhi_pd(row6, row7);  // 49 57 51 59 53 61 55 63


  __tt0 = _mm512_permutex2var_pd(
      __t0, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t2);
  __tt1 = _mm512_permutex2var_pd(
      __t0, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t2);
  __tt2 = _mm512_permutex2var_pd(
      __t1, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t3);
  __tt3 = _mm512_permutex2var_pd(
      __t1, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t3);
  __tt4 = _mm512_permutex2var_pd(
      __t4, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t6);
  __tt5 = _mm512_permutex2var_pd(
      __t4, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t6);
  __tt6 = _mm512_permutex2var_pd(
      __t5, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t7);
  __tt7 = _mm512_permutex2var_pd(
      __t5, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t7);
  // 0 8 16 24 4 12 20 28
  // 2 10 18 26 6 14 22 30
  // 1 9 17 25 5 13 21 29
  // 3 11 19 27 7 15 23 31
  // 32 40 48 56 36 44 52 60
  // 34 42 50 58 38 46 54 62
  // 33 41 49 57 37 45 53 61
  // 35 43 51 59 39 47 55 63


//   __t0 = _mm512_shuffle_f64x2(__tt0, __tt4, 0x44);  // 0  8 16 24 32 40 48 56 
//   __t1 = _mm512_shuffle_f64x2(__tt2, __tt6, 0x44);  // 1  9 17 25 33 41 49 57 
//   __t2 = _mm512_shuffle_f64x2(__tt1, __tt5, 0x44);  // 2 10 18 26 34 42 50 58 
//   __t3 = _mm512_shuffle_f64x2(__tt3, __tt7, 0x44);  // 3 11 19 27 35 43 51 59 
//   __t4 = _mm512_shuffle_f64x2(__tt0, __tt4, 0xee);  // 4 12 20 28 36 44 52 60 
//   __t5 = _mm512_shuffle_f64x2(__tt2, __tt6, 0xee);  // 5 13 21 29 37 45 53 61 
//   __t6 = _mm512_shuffle_f64x2(__tt1, __tt5, 0xee);  // 6 14 22 30 38 46 54 62 
//   __t7 = _mm512_shuffle_f64x2(__tt3, __tt7, 0xee);  // 7 15 23 31 39 47 55 63 

 // Tried to replace a pair of shuffles, with 1 shuffle and 2 blends.
 // 2 blends should go to port 0 and be better overall.
 // Clang has other ideas and does port 5 shuffles instead :(
 // Can I convince Clang to do as I say some how?

 {
    __m512d v = _mm512_shuffle_f64x2(__tt0, __tt4, 0x4e);       // 4 12 20 28 32 40 48 56
    __t0 = _mm512_mask_mov_pd(__tt0, 0b11110000, v);
    __t4 = _mm512_mask_mov_pd(__tt4, 0b00001111, v);
  }

 {
    __m512d v = _mm512_shuffle_f64x2(__tt1, __tt5, 0x4e);  // 6 14 22 30 34 42 50 58
    __t2 = _mm512_mask_mov_pd( __tt1, 0b11110000, v); // 2 10 18 26 34 42 50 58
    __t6 = _mm512_mask_mov_pd(__tt5, 0b00001111, v); // 34 42 50 58
  }

  {
    __m512d v = _mm512_shuffle_f64x2(__tt2, __tt6, 0x4e);  // 5 13 21 29 33 41 49 57
    __t1 = _mm512_mask_mov_pd(__tt2, 0b11110000, v);
    __t5 = _mm512_mask_mov_pd(__tt6, 0b00001111, v);
  }

  {
    __m512d v = _mm512_shuffle_f64x2(__tt3, __tt7, 0x4e);  // 7 15 23 31 35 43 51 59
    __t3 = _mm512_mask_mov_pd(__tt3, 0b11110000, v);
    __t7 = _mm512_mask_mov_pd(__tt7, 0b00001111, v);
  }

// IACA_END

  _mm512_store_pd(out + 0 * 8, __t0);
  _mm512_store_pd(out + 1 * 8, __t1);
  _mm512_store_pd(out + 2 * 8, __t2);
  _mm512_store_pd(out + 3 * 8, __t3);
  _mm512_store_pd(out + 4 * 8, __t4);
  _mm512_store_pd(out + 5 * 8, __t5);
  _mm512_store_pd(out + 6 * 8, __t6);
  _mm512_store_pd(out + 7 * 8, __t7);
}

So my next attempt was to write an inline assembly version - https://gcc.godbolt.org/z/LR6aQy. The standalone mov_stuff function looks fine to me, however the program doesn't work. Looking at the assembly, it seems wrong too.

__m512d mov_stuff(__m512d src, __mmask8 mask, __m512d a) {
    asm volatile ("vmovapd %[A], %[SRC] %{%[MASK]%}	"
       :  [SRC] "=v" (src)              //output
       :  [A] "v" (a), [MASK] "Yk" (mask));   //inputs
       return src;
}

// Transpose of 8x8 matrix.
// Load stores only done to generate relevant code.
// In actual code the matrix can stay completely in registers
// for multiple iterations.
// Only interested in the register ops, hence the IACA annotations there.
// Severely port 5 limited.
void Transpose(double* in, double* out) {
  __m512d __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
  __m512d __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
  __m512d row0 = _mm512_load_pd(in + 0 * 8);  //  0  1  2  3  4  5  6  7
  __m512d row1 = _mm512_load_pd(in + 1 * 8);  //  8  9 10 11 12 13 14 15
  __m512d row2 = _mm512_load_pd(in + 2 * 8);  // 16 17 18 19 20 21 22 23
  __m512d row3 = _mm512_load_pd(in + 3 * 8);  // 24 25 26 27 28 29 30 31
  __m512d row4 = _mm512_load_pd(in + 4 * 8);  // 32 33 34 35 36 37 38 39
  __m512d row5 = _mm512_load_pd(in + 5 * 8);  // 40 41 42 43 44 45 46 47
  __m512d row6 = _mm512_load_pd(in + 6 * 8);  // 48 49 50 51 52 53 54 55
  __m512d row7 = _mm512_load_pd(in + 7 * 8);  // 56 57 58 59 60 61 62 63

// IACA_START
  __t0 = _mm512_unpacklo_pd(row0, row1);  // 0  8  2  10  4 12  6 14
  __t1 = _mm512_unpackhi_pd(row0, row1);  // 1  9  3  11  5 13  7 15
  __t2 = _mm512_unpacklo_pd(row2, row3);  // 16 24 18 26 20 28 22 30
  __t3 = _mm512_unpackhi_pd(row2, row3);  // 17 25 19 27 21 29 23 31
  __t4 = _mm512_unpacklo_pd(row4, row5);  // 32 40 34 42 36 44 38 46
  __t5 = _mm512_unpackhi_pd(row4, row5);  // 33 41 35 43 37 45 39 47
  __t6 = _mm512_unpacklo_pd(row6, row7);  // 48 56 50 58 52 60 54 62
  __t7 = _mm512_unpackhi_pd(row6, row7);  // 49 57 51 59 53 61 55 63


  __tt0 = _mm512_permutex2var_pd(
      __t0, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t2);
  __tt1 = _mm512_permutex2var_pd(
      __t0, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t2);
  __tt2 = _mm512_permutex2var_pd(
      __t1, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t3);
  __tt3 = _mm512_permutex2var_pd(
      __t1, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t3);
  __tt4 = _mm512_permutex2var_pd(
      __t4, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t6);
  __tt5 = _mm512_permutex2var_pd(
      __t4, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t6);
  __tt6 = _mm512_permutex2var_pd(
      __t5, _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13), __t7);
  __tt7 = _mm512_permutex2var_pd(
      __t5, _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15), __t7);
  // 0 8 16 24 4 12 20 28
  // 2 10 18 26 6 14 22 30
  // 1 9 17 25 5 13 21 29
  // 3 11 19 27 7 15 23 31
  // 32 40 48 56 36 44 52 60
  // 34 42 50 58 38 46 54 62
  // 33 41 49 57 37 45 53 61
  // 35 43 51 59 39 47 55 63


  // Does not work and asm looks wrong.
 {
    __m512d v = _mm512_shuffle_f64x2(__tt0, __tt4, 0x4e);       // 4 12 20 28 32 40 48 56
    __t0 = mov_stuff(__tt0, 0b11110000, v);
    __t4 = mov_stuff(__tt4, 0b00001111, v);
  }

 {
    __m512d v = _mm512_shuffle_f64x2(__tt1, __tt5, 0x4e);  // 6 14 22 30 34 42 50 58
    __t2 = mov_stuff( __tt1, 0b11110000, v); // 2 10 18 26 34 42 50 58
    __t6 = mov_stuff(__tt5, 0b00001111, v); // 34 42 50 58
  }

  {
    __m512d v = _mm512_shuffle_f64x2(__tt2, __tt6, 0x4e);  // 5 13 21 29 33 41 49 57
    __t1 = mov_stuff(__tt2, 0b11110000, v);
    __t5 = mov_stuff(__tt6, 0b00001111, v);
  }

  {
    __m512d v = _mm512_shuffle_f64x2(__tt3, __tt7, 0x4e);  // 7 15 23 31 35 43 51 59
    __t3 = mov_stuff(__tt3, 0b11110000, v);
    __t7 = mov_stuff(__tt7, 0b00001111, v);
  }

// IACA_END

  _mm512_store_pd(out + 0 * 8, __t0);
  _mm512_store_pd(out + 1 * 8, __t1);
  _mm512_store_pd(out + 2 * 8, __t2);
  _mm512_store_pd(out + 3 * 8, __t3);
  _mm512_store_pd(out + 4 * 8, __t4);
  _mm512_store_pd(out + 5 * 8, __t5);
  _mm512_store_pd(out + 6 * 8, __t6);
  _mm512_store_pd(out + 7 * 8, __t7);
}

To isolate the problem I tried to write a test program to see if I could get my assembly version to work - https://gcc.godbolt.org/z/TY7iv6. In this test program the standalone versions of mov_stuff_non_asm() and mov_stuff_asm() look identical, however when I use them in a program, the asm version compiles to what looks to me like garbage.

__m512d mov_stuff_non_asm(__m512d src, __mmask8 mask, __m512d a)
{
  return _mm512_mask_mov_pd(src, mask, a);    
}

// Trying to emulate mov_stuff_non_asm here.
// Assembly on its own looks identical to the non-asm version.
// But in a full program it compiles to rubbish as seen in the main_asm
// program.
__m512d mov_stuff_asm(__m512d src, __mmask8 mask, __m512d a)
{
    asm volatile ("vmovapd %[A], %[SRC] %{%[MASK]%}	"
       :  [SRC] "=v" (src)              //output
       :  [A] "v" (a), [MASK] "Yk" (mask));   //inputs
       return src;

}

int main_asm() {
  __mmask8 upper_lower = 0b11110000;
  __mmask8 lower_upper = 0b00001111;

  __m512d t0 = _mm512_setr_pd(0, 8, 16, 24, 4, 12, 20, 28);
  __m512d t4 = _mm512_setr_pd(32, 40, 48, 56, 36, 44, 52, 60);
  __m512d v = _mm512_shuffle_f64x2(t0, t4, 0x4e); // 4 12 20 28 32 40 48 56
  __m512d new_t0 = mov_stuff_asm(t0, upper_lower, v);

  DoStuff(new_t0);
}

int main_non_asm() {
  __mmask8 upper_lower = 0b11110000;
  __mmask8 lower_upper = 0b00001111;

  __m512d t0 = _mm512_setr_pd(0, 8, 16, 24, 4, 12, 20, 28);
  __m512d t4 = _mm512_setr_pd(32, 40, 48, 56, 36, 44, 52, 60);
  __m512d v = _mm512_shuffle_f64x2(t0, t4, 0x4e); // 4 12 20 28 32 40 48 56
  __m512d new_t0 = mov_stuff_non_asm(t0, upper_lower, v);

  DoStuff(new_t0);
}

What am I doing wrong? Is there any good documentation on how to write masked AVX-512 operations using inline asm. Or maybe if I can coax Clang in some other way to do what I really want it to do?

David Wohlferd · Accepted Answer

Turning my comment into an answer since it seems to have solved the problem.

Looking at your asm, you are using the constraint [SRC] "=v" (src). The '=' in this case indicates that this variable will be assigned the value of SRC on exit from the asm, but that the input value is ignored (ie an output-only variable). Since the input value is ignored, clang's optimizers can discard any code that would have calculated the value before this point (since you've told it it's not going to get used).

Changing the '=' to '+' says that the existing value in SRC is updated rather than output, which is what I believe you intended here.

What is the right way to emit masked vmovapd (AVX-512) instructions using ASM?

Answers (1)

Related Questions