Reputation: 1107
I'm currently writing some code targeting Intel's forthcoming AVX-512 SIMD instructions, which support 512-bit operations.
Now, assuming there's a matrix represented by 16 SIMD registers, each holding 16 32-bit integers (corresponding to a row), how can I transpose the matrix with purely SIMD instructions?
There are already solutions for transposing 4x4 and 8x8 matrices with SSE and AVX2, respectively, but I couldn't figure out how to extend them to 16x16 with AVX-512.
Any ideas?
Upvotes: 20
Views: 12365
Reputation: 33679
I recently got access to Xeon Phi Knights Landing hardware, which has AVX512. Specifically, the hardware I am using is an Intel(R) Xeon Phi(TM) CPU 7250 @ 1.40GHz (http://ark.intel.com/products/94035/Intel-Xeon-Phi-Processor-7250-16GB-1_40-GHz-68-core). This is not an auxiliary card; the Xeon Phi is the main computer.
I tested the AVX512 gather instructions against my method here https://stackoverflow.com/a/29587984/2542702, and it appears that gather is still slower. My code in that answer worked on the first try with no errors.
I have not written intrinsics in about three months or thought much about optimization in that time, so maybe my test is not robust enough. There is certainly some overhead, but nevertheless I feel confident that the results clearly show that gather is slower in this case.
I only tested with ICC 17.0.0 because the currently installed OS is CentOS 7.2 with Linux kernel 3.10 and GCC 4.8.5, and GCC 4.8 does not support AVX512. I may persuade the HPC group at my work to upgrade.
I looked at the assembly to make sure it was generating AVX512 instructions, but I have not analyzed it carefully.
//icc -O3 -xCOMMON-AVX512 tran.c -fopenmp
#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>
void tran(int* mat, int* matT) {
int i,j;
__m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf;
__m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf;
r0 = _mm512_load_epi32(&mat[ 0*16]);
r1 = _mm512_load_epi32(&mat[ 1*16]);
r2 = _mm512_load_epi32(&mat[ 2*16]);
r3 = _mm512_load_epi32(&mat[ 3*16]);
r4 = _mm512_load_epi32(&mat[ 4*16]);
r5 = _mm512_load_epi32(&mat[ 5*16]);
r6 = _mm512_load_epi32(&mat[ 6*16]);
r7 = _mm512_load_epi32(&mat[ 7*16]);
r8 = _mm512_load_epi32(&mat[ 8*16]);
r9 = _mm512_load_epi32(&mat[ 9*16]);
ra = _mm512_load_epi32(&mat[10*16]);
rb = _mm512_load_epi32(&mat[11*16]);
rc = _mm512_load_epi32(&mat[12*16]);
rd = _mm512_load_epi32(&mat[13*16]);
re = _mm512_load_epi32(&mat[14*16]);
rf = _mm512_load_epi32(&mat[15*16]);
t0 = _mm512_unpacklo_epi32(r0,r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29
t1 = _mm512_unpackhi_epi32(r0,r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31
t2 = _mm512_unpacklo_epi32(r2,r3); // 32 48 33 49 ...
t3 = _mm512_unpackhi_epi32(r2,r3); // 34 50 35 51 ...
t4 = _mm512_unpacklo_epi32(r4,r5); // 64 80 65 81 ...
t5 = _mm512_unpackhi_epi32(r4,r5); // 66 82 67 83 ...
t6 = _mm512_unpacklo_epi32(r6,r7); // 96 112 97 113 ...
t7 = _mm512_unpackhi_epi32(r6,r7); // 98 114 99 115 ...
t8 = _mm512_unpacklo_epi32(r8,r9); // 128 ...
t9 = _mm512_unpackhi_epi32(r8,r9); // 130 ...
ta = _mm512_unpacklo_epi32(ra,rb); // 160 ...
tb = _mm512_unpackhi_epi32(ra,rb); // 162 ...
tc = _mm512_unpacklo_epi32(rc,rd); // 192 ...
td = _mm512_unpackhi_epi32(rc,rd); // 194 ...
te = _mm512_unpacklo_epi32(re,rf); // 224 ...
tf = _mm512_unpackhi_epi32(re,rf); // 226 ...
r0 = _mm512_unpacklo_epi64(t0,t2); // 0 16 32 48 ...
r1 = _mm512_unpackhi_epi64(t0,t2); // 1 17 33 49 ...
r2 = _mm512_unpacklo_epi64(t1,t3); // 2 18 34 50 ...
r3 = _mm512_unpackhi_epi64(t1,t3); // 3 19 35 51 ...
r4 = _mm512_unpacklo_epi64(t4,t6); // 64 80 96 112 ...
r5 = _mm512_unpackhi_epi64(t4,t6); // 65 81 97 113 ...
r6 = _mm512_unpacklo_epi64(t5,t7); // 66 82 98 114 ...
r7 = _mm512_unpackhi_epi64(t5,t7); // 67 83 99 115 ...
r8 = _mm512_unpacklo_epi64(t8,ta); // 128 144 160 176 ...
r9 = _mm512_unpackhi_epi64(t8,ta); // 129 145 161 177 ...
ra = _mm512_unpacklo_epi64(t9,tb); // 130 146 162 178 ...
rb = _mm512_unpackhi_epi64(t9,tb); // 131 147 163 179 ...
rc = _mm512_unpacklo_epi64(tc,te); // 192 208 224 240 ...
rd = _mm512_unpackhi_epi64(tc,te); // 193 209 225 241 ...
re = _mm512_unpacklo_epi64(td,tf); // 194 210 226 242 ...
rf = _mm512_unpackhi_epi64(td,tf); // 195 211 227 243 ...
t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ...
t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ...
t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ...
t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ...
t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ...
t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ...
t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ...
t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ...
t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ...
t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ...
ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ...
tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ...
tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ...
td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ...
te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ...
tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ...
r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240
r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 65 81 97 113 ... 241
r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 66 82 98 114 ... 242
r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 67 83 99 115 ... 243
r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ...
r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ...
r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ...
r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ...
r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ...
r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ...
ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ...
rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ...
rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ...
rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ...
re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ...
rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 95 111 127 ... 255
_mm512_store_epi32(&matT[ 0*16], r0);
_mm512_store_epi32(&matT[ 1*16], r1);
_mm512_store_epi32(&matT[ 2*16], r2);
_mm512_store_epi32(&matT[ 3*16], r3);
_mm512_store_epi32(&matT[ 4*16], r4);
_mm512_store_epi32(&matT[ 5*16], r5);
_mm512_store_epi32(&matT[ 6*16], r6);
_mm512_store_epi32(&matT[ 7*16], r7);
_mm512_store_epi32(&matT[ 8*16], r8);
_mm512_store_epi32(&matT[ 9*16], r9);
_mm512_store_epi32(&matT[10*16], ra);
_mm512_store_epi32(&matT[11*16], rb);
_mm512_store_epi32(&matT[12*16], rc);
_mm512_store_epi32(&matT[13*16], rd);
_mm512_store_epi32(&matT[14*16], re);
_mm512_store_epi32(&matT[15*16], rf);
}
void gather(int *mat, int *matT) {
int i,j;
int index[16] __attribute__((aligned(64)));
__m512i vindex;
for(i=0; i<16; i++) index[i] = 16*i;
for(i=0; i<256; i++) mat[i] = i;
vindex = _mm512_load_epi32(index);
for(i=0; i<16; i++)
_mm512_store_epi32(&matT[16*i], _mm512_i32gather_epi32(vindex, &mat[i], 4));
}
int verify(int *mat) {
int i,j;
int error = 0;
for(i=0; i<16; i++) {
for(j=0; j<16; j++) {
if(mat[j*16+i] != i*16+j) error++;
}
}
return error;
}
void print_mat(int *mat) {
int i,j;
for(i=0; i<16; i++) {
for(j=0; j<16; j++) printf("%2X ", mat[i*16+j]);
puts("");
}
puts("");
}
int main(void) {
int i,j, rep;
int mat[256] __attribute__((aligned(64)));
int matT[256] __attribute__((aligned(64)));
double dtime;
rep = 10000000;
for(i=0; i<256; i++) mat[i] = i;
print_mat(mat);
gather(mat, matT);
for(i=0; i<256; i++) mat[i] = i;
dtime = -omp_get_wtime();
for(i=0; i<rep; i++) gather(mat, matT);
dtime += omp_get_wtime();
printf("errors %d\n", verify(matT));
printf("dtime %f\n", dtime);
print_mat(matT);
tran(mat,matT);
dtime = -omp_get_wtime();
for(i=0; i<rep; i++) tran(mat, matT);
dtime += omp_get_wtime();
printf("errors %d\n", verify(matT));
printf("dtime %f\n", dtime);
print_mat(matT);
}
The gather function in this case takes 1.5 s and the tran function 1.15 s. If anyone sees an error or has any suggestions for my test, please let me know. I am only beginning to get experience with AVX512 and Knights Landing.
I tried to remove some of the overhead and succeeded; nevertheless, gather still appears to be slower:
#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>
void tran(int* mat, int* matT, int rep) {
int i;
__m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf;
__m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf;
for(i=0; i<rep; i++) {
r0 = _mm512_load_epi32(&mat[ 0*16]);
r1 = _mm512_load_epi32(&mat[ 1*16]);
r2 = _mm512_load_epi32(&mat[ 2*16]);
r3 = _mm512_load_epi32(&mat[ 3*16]);
r4 = _mm512_load_epi32(&mat[ 4*16]);
r5 = _mm512_load_epi32(&mat[ 5*16]);
r6 = _mm512_load_epi32(&mat[ 6*16]);
r7 = _mm512_load_epi32(&mat[ 7*16]);
r8 = _mm512_load_epi32(&mat[ 8*16]);
r9 = _mm512_load_epi32(&mat[ 9*16]);
ra = _mm512_load_epi32(&mat[10*16]);
rb = _mm512_load_epi32(&mat[11*16]);
rc = _mm512_load_epi32(&mat[12*16]);
rd = _mm512_load_epi32(&mat[13*16]);
re = _mm512_load_epi32(&mat[14*16]);
rf = _mm512_load_epi32(&mat[15*16]);
t0 = _mm512_unpacklo_epi32(r0,r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29
t1 = _mm512_unpackhi_epi32(r0,r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31
t2 = _mm512_unpacklo_epi32(r2,r3); // 32 48 33 49 ...
t3 = _mm512_unpackhi_epi32(r2,r3); // 34 50 35 51 ...
t4 = _mm512_unpacklo_epi32(r4,r5); // 64 80 65 81 ...
t5 = _mm512_unpackhi_epi32(r4,r5); // 66 82 67 83 ...
t6 = _mm512_unpacklo_epi32(r6,r7); // 96 112 97 113 ...
t7 = _mm512_unpackhi_epi32(r6,r7); // 98 114 99 115 ...
t8 = _mm512_unpacklo_epi32(r8,r9); // 128 ...
t9 = _mm512_unpackhi_epi32(r8,r9); // 130 ...
ta = _mm512_unpacklo_epi32(ra,rb); // 160 ...
tb = _mm512_unpackhi_epi32(ra,rb); // 162 ...
tc = _mm512_unpacklo_epi32(rc,rd); // 192 ...
td = _mm512_unpackhi_epi32(rc,rd); // 194 ...
te = _mm512_unpacklo_epi32(re,rf); // 224 ...
tf = _mm512_unpackhi_epi32(re,rf); // 226 ...
r0 = _mm512_unpacklo_epi64(t0,t2); // 0 16 32 48 ...
r1 = _mm512_unpackhi_epi64(t0,t2); // 1 17 33 49 ...
r2 = _mm512_unpacklo_epi64(t1,t3); // 2 18 34 50 ...
r3 = _mm512_unpackhi_epi64(t1,t3); // 3 19 35 51 ...
r4 = _mm512_unpacklo_epi64(t4,t6); // 64 80 96 112 ...
r5 = _mm512_unpackhi_epi64(t4,t6); // 65 81 97 113 ...
r6 = _mm512_unpacklo_epi64(t5,t7); // 66 82 98 114 ...
r7 = _mm512_unpackhi_epi64(t5,t7); // 67 83 99 115 ...
r8 = _mm512_unpacklo_epi64(t8,ta); // 128 144 160 176 ...
r9 = _mm512_unpackhi_epi64(t8,ta); // 129 145 161 177 ...
ra = _mm512_unpacklo_epi64(t9,tb); // 130 146 162 178 ...
rb = _mm512_unpackhi_epi64(t9,tb); // 131 147 163 179 ...
rc = _mm512_unpacklo_epi64(tc,te); // 192 208 224 240 ...
rd = _mm512_unpackhi_epi64(tc,te); // 193 209 225 241 ...
re = _mm512_unpacklo_epi64(td,tf); // 194 210 226 242 ...
rf = _mm512_unpackhi_epi64(td,tf); // 195 211 227 243 ...
t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ...
t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ...
t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ...
t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ...
t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ...
t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ...
t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ...
t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ...
t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ...
t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ...
ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ...
tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ...
tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ...
td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ...
te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ...
tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ...
r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240
r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 65 81 97 113 ... 241
r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 66 82 98 114 ... 242
r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 67 83 99 115 ... 243
r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ...
r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ...
r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ...
r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ...
r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ...
r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ...
ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ...
rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ...
rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ...
rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ...
re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ...
rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 95 111 127 ... 255
_mm512_store_epi32(&matT[ 0*16], r0);
_mm512_store_epi32(&matT[ 1*16], r1);
_mm512_store_epi32(&matT[ 2*16], r2);
_mm512_store_epi32(&matT[ 3*16], r3);
_mm512_store_epi32(&matT[ 4*16], r4);
_mm512_store_epi32(&matT[ 5*16], r5);
_mm512_store_epi32(&matT[ 6*16], r6);
_mm512_store_epi32(&matT[ 7*16], r7);
_mm512_store_epi32(&matT[ 8*16], r8);
_mm512_store_epi32(&matT[ 9*16], r9);
_mm512_store_epi32(&matT[10*16], ra);
_mm512_store_epi32(&matT[11*16], rb);
_mm512_store_epi32(&matT[12*16], rc);
_mm512_store_epi32(&matT[13*16], rd);
_mm512_store_epi32(&matT[14*16], re);
_mm512_store_epi32(&matT[15*16], rf);
}
}
void gather(int *mat, int *matT, int rep) {
int i,j;
int index[16] __attribute__((aligned(64)));
__m512i vindex;
for(i=0; i<16; i++) index[i] = 16*i;
for(i=0; i<256; i++) mat[i] = i;
vindex = _mm512_load_epi32(index);
for(i=0; i<rep; i++) {
_mm512_store_epi32(&matT[ 0*16], _mm512_i32gather_epi32(vindex, &mat[ 0], 4));
_mm512_store_epi32(&matT[ 1*16], _mm512_i32gather_epi32(vindex, &mat[ 1], 4));
_mm512_store_epi32(&matT[ 2*16], _mm512_i32gather_epi32(vindex, &mat[ 2], 4));
_mm512_store_epi32(&matT[ 3*16], _mm512_i32gather_epi32(vindex, &mat[ 3], 4));
_mm512_store_epi32(&matT[ 4*16], _mm512_i32gather_epi32(vindex, &mat[ 4], 4));
_mm512_store_epi32(&matT[ 5*16], _mm512_i32gather_epi32(vindex, &mat[ 5], 4));
_mm512_store_epi32(&matT[ 6*16], _mm512_i32gather_epi32(vindex, &mat[ 6], 4));
_mm512_store_epi32(&matT[ 7*16], _mm512_i32gather_epi32(vindex, &mat[ 7], 4));
_mm512_store_epi32(&matT[ 8*16], _mm512_i32gather_epi32(vindex, &mat[ 8], 4));
_mm512_store_epi32(&matT[ 9*16], _mm512_i32gather_epi32(vindex, &mat[ 9], 4));
_mm512_store_epi32(&matT[10*16], _mm512_i32gather_epi32(vindex, &mat[10], 4));
_mm512_store_epi32(&matT[11*16], _mm512_i32gather_epi32(vindex, &mat[11], 4));
_mm512_store_epi32(&matT[12*16], _mm512_i32gather_epi32(vindex, &mat[12], 4));
_mm512_store_epi32(&matT[13*16], _mm512_i32gather_epi32(vindex, &mat[13], 4));
_mm512_store_epi32(&matT[14*16], _mm512_i32gather_epi32(vindex, &mat[14], 4));
_mm512_store_epi32(&matT[15*16], _mm512_i32gather_epi32(vindex, &mat[15], 4));
}
}
int verify(int *mat) {
int i,j;
int error = 0;
for(i=0; i<16; i++) {
for(j=0; j<16; j++) {
if(mat[j*16+i] != i*16+j) error++;
}
}
return error;
}
void print_mat(int *mat) {
int i,j;
for(i=0; i<16; i++) {
for(j=0; j<16; j++) printf("%2X ", mat[i*16+j]);
puts("");
}
puts("");
}
int main(void) {
int i,j, rep;
int mat[256] __attribute__((aligned(64)));
int matT[256] __attribute__((aligned(64)));
double dtime;
rep = 10000000;
for(i=0; i<256; i++) mat[i] = i;
print_mat(mat);
gather(mat, matT,1);
for(i=0; i<256; i++) mat[i] = i;
dtime = -omp_get_wtime();
gather(mat, matT, rep);
dtime += omp_get_wtime();
printf("errors %d\n", verify(matT));
printf("dtime %f\n", dtime);
print_mat(matT);
tran(mat,matT,1);
dtime = -omp_get_wtime();
tran(mat, matT, rep);
dtime += omp_get_wtime();
printf("errors %d\n", verify(matT));
printf("dtime %f\n", dtime);
print_mat(matT);
}
The gather function took 1.13 s and the tran function 0.8 s.
According to Agner Fog's microarchitecture manual, shuffle and permute instructions have poor performance on KNL. The shuffle and unpack instructions used in my original answer https://stackoverflow.com/a/29587984/2542702 have a reciprocal throughput of 2. I managed to greatly improve the performance by using vpermq instead, which has a reciprocal throughput of 1. In addition, I improved the first 1/4 of the transpose using vinserti64x4 (see tran_new2 below). Here are the times: the tran function takes 0.8 s and the tran_new2 function 0.46 s.
void tran_new2(int* mat, int* matT, int rep) {
__m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf;
__m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf;
int mask;
int64_t idx1[8] __attribute__((aligned(64))) = {2, 3, 0, 1, 6, 7, 4, 5};
int64_t idx2[8] __attribute__((aligned(64))) = {1, 0, 3, 2, 5, 4, 7, 6};
int32_t idx3[16] __attribute__((aligned(64))) = {1, 0, 3, 2, 5 ,4 ,7 ,6 ,9 ,8 , 11, 10, 13, 12 ,15, 14};
__m512i vidx1 = _mm512_load_epi64(idx1);
__m512i vidx2 = _mm512_load_epi64(idx2);
__m512i vidx3 = _mm512_load_epi32(idx3);
int i;
for(i=0; i<rep; i++) {
t0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 0*16+0])), _mm256_load_si256((__m256i*)&mat[ 8*16+0]), 1);
t1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 1*16+0])), _mm256_load_si256((__m256i*)&mat[ 9*16+0]), 1);
t2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 2*16+0])), _mm256_load_si256((__m256i*)&mat[10*16+0]), 1);
t3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 3*16+0])), _mm256_load_si256((__m256i*)&mat[11*16+0]), 1);
t4 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 4*16+0])), _mm256_load_si256((__m256i*)&mat[12*16+0]), 1);
t5 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 5*16+0])), _mm256_load_si256((__m256i*)&mat[13*16+0]), 1);
t6 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 6*16+0])), _mm256_load_si256((__m256i*)&mat[14*16+0]), 1);
t7 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 7*16+0])), _mm256_load_si256((__m256i*)&mat[15*16+0]), 1);
t8 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 0*16+8])), _mm256_load_si256((__m256i*)&mat[ 8*16+8]), 1);
t9 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 1*16+8])), _mm256_load_si256((__m256i*)&mat[ 9*16+8]), 1);
ta = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 2*16+8])), _mm256_load_si256((__m256i*)&mat[10*16+8]), 1);
tb = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 3*16+8])), _mm256_load_si256((__m256i*)&mat[11*16+8]), 1);
tc = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 4*16+8])), _mm256_load_si256((__m256i*)&mat[12*16+8]), 1);
td = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 5*16+8])), _mm256_load_si256((__m256i*)&mat[13*16+8]), 1);
te = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 6*16+8])), _mm256_load_si256((__m256i*)&mat[14*16+8]), 1);
tf = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_load_si256((__m256i*)&mat[ 7*16+8])), _mm256_load_si256((__m256i*)&mat[15*16+8]), 1);
mask= 0xcc;
r0 = _mm512_mask_permutexvar_epi64(t0, (__mmask8)mask, vidx1, t4);
r1 = _mm512_mask_permutexvar_epi64(t1, (__mmask8)mask, vidx1, t5);
r2 = _mm512_mask_permutexvar_epi64(t2, (__mmask8)mask, vidx1, t6);
r3 = _mm512_mask_permutexvar_epi64(t3, (__mmask8)mask, vidx1, t7);
r8 = _mm512_mask_permutexvar_epi64(t8, (__mmask8)mask, vidx1, tc);
r9 = _mm512_mask_permutexvar_epi64(t9, (__mmask8)mask, vidx1, td);
ra = _mm512_mask_permutexvar_epi64(ta, (__mmask8)mask, vidx1, te);
rb = _mm512_mask_permutexvar_epi64(tb, (__mmask8)mask, vidx1, tf);
mask= 0x33;
r4 = _mm512_mask_permutexvar_epi64(t4, (__mmask8)mask, vidx1, t0);
r5 = _mm512_mask_permutexvar_epi64(t5, (__mmask8)mask, vidx1, t1);
r6 = _mm512_mask_permutexvar_epi64(t6, (__mmask8)mask, vidx1, t2);
r7 = _mm512_mask_permutexvar_epi64(t7, (__mmask8)mask, vidx1, t3);
rc = _mm512_mask_permutexvar_epi64(tc, (__mmask8)mask, vidx1, t8);
rd = _mm512_mask_permutexvar_epi64(td, (__mmask8)mask, vidx1, t9);
re = _mm512_mask_permutexvar_epi64(te, (__mmask8)mask, vidx1, ta);
rf = _mm512_mask_permutexvar_epi64(tf, (__mmask8)mask, vidx1, tb);
mask = 0xaa;
t0 = _mm512_mask_permutexvar_epi64(r0, (__mmask8)mask, vidx2, r2);
t1 = _mm512_mask_permutexvar_epi64(r1, (__mmask8)mask, vidx2, r3);
t4 = _mm512_mask_permutexvar_epi64(r4, (__mmask8)mask, vidx2, r6);
t5 = _mm512_mask_permutexvar_epi64(r5, (__mmask8)mask, vidx2, r7);
t8 = _mm512_mask_permutexvar_epi64(r8, (__mmask8)mask, vidx2, ra);
t9 = _mm512_mask_permutexvar_epi64(r9, (__mmask8)mask, vidx2, rb);
tc = _mm512_mask_permutexvar_epi64(rc, (__mmask8)mask, vidx2, re);
td = _mm512_mask_permutexvar_epi64(rd, (__mmask8)mask, vidx2, rf);
mask = 0x55;
t2 = _mm512_mask_permutexvar_epi64(r2, (__mmask8)mask, vidx2, r0);
t3 = _mm512_mask_permutexvar_epi64(r3, (__mmask8)mask, vidx2, r1);
t6 = _mm512_mask_permutexvar_epi64(r6, (__mmask8)mask, vidx2, r4);
t7 = _mm512_mask_permutexvar_epi64(r7, (__mmask8)mask, vidx2, r5);
ta = _mm512_mask_permutexvar_epi64(ra, (__mmask8)mask, vidx2, r8);
tb = _mm512_mask_permutexvar_epi64(rb, (__mmask8)mask, vidx2, r9);
te = _mm512_mask_permutexvar_epi64(re, (__mmask8)mask, vidx2, rc);
tf = _mm512_mask_permutexvar_epi64(rf, (__mmask8)mask, vidx2, rd);
mask = 0xaaaa;
r0 = _mm512_mask_permutexvar_epi32(t0, (__mmask16)mask, vidx3, t1);
r2 = _mm512_mask_permutexvar_epi32(t2, (__mmask16)mask, vidx3, t3);
r4 = _mm512_mask_permutexvar_epi32(t4, (__mmask16)mask, vidx3, t5);
r6 = _mm512_mask_permutexvar_epi32(t6, (__mmask16)mask, vidx3, t7);
r8 = _mm512_mask_permutexvar_epi32(t8, (__mmask16)mask, vidx3, t9);
ra = _mm512_mask_permutexvar_epi32(ta, (__mmask16)mask, vidx3, tb);
rc = _mm512_mask_permutexvar_epi32(tc, (__mmask16)mask, vidx3, td);
re = _mm512_mask_permutexvar_epi32(te, (__mmask16)mask, vidx3, tf);
mask = 0x5555;
r1 = _mm512_mask_permutexvar_epi32(t1, (__mmask16)mask, vidx3, t0);
r3 = _mm512_mask_permutexvar_epi32(t3, (__mmask16)mask, vidx3, t2);
r5 = _mm512_mask_permutexvar_epi32(t5, (__mmask16)mask, vidx3, t4);
r7 = _mm512_mask_permutexvar_epi32(t7, (__mmask16)mask, vidx3, t6);
r9 = _mm512_mask_permutexvar_epi32(t9, (__mmask16)mask, vidx3, t8);
rb = _mm512_mask_permutexvar_epi32(tb, (__mmask16)mask, vidx3, ta);
rd = _mm512_mask_permutexvar_epi32(td, (__mmask16)mask, vidx3, tc);
rf = _mm512_mask_permutexvar_epi32(tf, (__mmask16)mask, vidx3, te);
_mm512_store_epi32(&matT[ 0*16], r0);
_mm512_store_epi32(&matT[ 1*16], r1);
_mm512_store_epi32(&matT[ 2*16], r2);
_mm512_store_epi32(&matT[ 3*16], r3);
_mm512_store_epi32(&matT[ 4*16], r4);
_mm512_store_epi32(&matT[ 5*16], r5);
_mm512_store_epi32(&matT[ 6*16], r6);
_mm512_store_epi32(&matT[ 7*16], r7);
_mm512_store_epi32(&matT[ 8*16], r8);
_mm512_store_epi32(&matT[ 9*16], r9);
_mm512_store_epi32(&matT[10*16], ra);
_mm512_store_epi32(&matT[11*16], rb);
_mm512_store_epi32(&matT[12*16], rc);
_mm512_store_epi32(&matT[13*16], rd);
_mm512_store_epi32(&matT[14*16], re);
_mm512_store_epi32(&matT[15*16], rf);
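// swap the local source and destination pointers so the next iteration transposes the previous
// result (presumably so the repeat loop cannot be collapsed into a single transpose)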
int* tmp = mat;
mat = matT;
matT = tmp;
}
}
Upvotes: 8
Reputation: 33679
For two-operand SIMD instructions, you can show that the number of operations necessary to transpose an nxn matrix is n*log_2(n), whereas using scalar operations it's O(n^2). In fact, later I'll show that the number of read and write operations using the scalar registers is 2*n*(n-1). Below is a table showing the number of operations to transpose 4x4, 8x8, 16x16, and 32x32 matrices using SSE, AVX, AVX512, and AVX1024 compared to the scalar operations:
n                4(SSE)   8(AVX)   16(AVX512)   32(AVX1024)
SIMD ops              8       24           64           160
SIMD +r/w ops        16       40           96           224
Scalar r/w ops       24      112          480          1984
where SIMD +r/w ops includes the read and write operations (n*log_2(n) + 2*n).
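As a sanity check, the entries in the table follow directly from these formulas; a quick sketch that just prints them for each register width:

#include <stdio.h>
// Reproduce the counts in the table above:
//   SIMD ops       = n*log2(n)
//   SIMD +r/w ops  = n*log2(n) + 2*n
//   scalar r/w ops = 2*n*(n-1)
int main(void) {
    int n, log2n;
    for(n=4; n<=32; n*=2) {
        for(log2n=0; (1<<log2n)<n; log2n++); // integer log2(n)
        printf("n=%2d  SIMD %4d  SIMD+r/w %4d  scalar r/w %4d\n",
               n, n*log2n, n*log2n + 2*n, 2*n*(n-1));
    }
    return 0;
}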
The reason the SIMD transpose can be done in n*log_2(n) operations is that the algorithm is:
permute n 32-bit rows
permute n 64-bit rows
...
permute n simd_width/2-bit rows
For example, for 4x4 there are 4 rows, and therefore you have to permute 32-bit lanes 4 times and then 64-bit lanes 4 times. For 16x16 you have to permute 32-bit lanes, 64-bit lanes, 128-bit lanes, and finally 256-bit lanes, 16 times each: that is log_2(16) = 4 levels of 16 permutes, which gives the 64 operations in the table.
I already showed that 8x8 can be done with 24 operations with AVX. So the question is how to do this for 16x16 using AVX512 in 64 operations? The general algorithm is:
interleave 32-bit lanes using
8x _mm512_unpacklo_epi32
8x _mm512_unpackhi_epi32
interleave 64-bit lanes using
8x _mm512_unpacklo_epi64
8x _mm512_unpackhi_epi64
permute 128-bit lanes using
16x _mm512_shuffle_i32x4
permute 256-bit lanes again using
16x _mm512_shuffle_i32x4
Here is untested code doing this:
//given __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf;
__m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf;
t0 = _mm512_unpacklo_epi32(r0,r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29
t1 = _mm512_unpackhi_epi32(r0,r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31
t2 = _mm512_unpacklo_epi32(r2,r3); // 32 48 33 49 ...
t3 = _mm512_unpackhi_epi32(r2,r3); // 34 50 35 51 ...
t4 = _mm512_unpacklo_epi32(r4,r5); // 64 80 65 81 ...
t5 = _mm512_unpackhi_epi32(r4,r5); // 66 82 67 83 ...
t6 = _mm512_unpacklo_epi32(r6,r7); // 96 112 97 113 ...
t7 = _mm512_unpackhi_epi32(r6,r7); // 98 114 99 115 ...
t8 = _mm512_unpacklo_epi32(r8,r9); // 128 ...
t9 = _mm512_unpackhi_epi32(r8,r9); // 130 ...
ta = _mm512_unpacklo_epi32(ra,rb); // 160 ...
tb = _mm512_unpackhi_epi32(ra,rb); // 162 ...
tc = _mm512_unpacklo_epi32(rc,rd); // 192 ...
td = _mm512_unpackhi_epi32(rc,rd); // 194 ...
te = _mm512_unpacklo_epi32(re,rf); // 224 ...
tf = _mm512_unpackhi_epi32(re,rf); // 226 ...
r0 = _mm512_unpacklo_epi64(t0,t2); // 0 16 32 48 ...
r1 = _mm512_unpackhi_epi64(t0,t2); // 1 17 33 49 ...
r2 = _mm512_unpacklo_epi64(t1,t3); // 2 18 34 50 ...
r3 = _mm512_unpackhi_epi64(t1,t3); // 3 19 35 51 ...
r4 = _mm512_unpacklo_epi64(t4,t6); // 64 80 96 112 ...
r5 = _mm512_unpackhi_epi64(t4,t6); // 65 81 97 113 ...
r6 = _mm512_unpacklo_epi64(t5,t7); // 66 82 98 114 ...
r7 = _mm512_unpackhi_epi64(t5,t7); // 67 83 99 115 ...
r8 = _mm512_unpacklo_epi64(t8,ta); // 128 144 160 176 ...
r9 = _mm512_unpackhi_epi64(t8,ta); // 129 145 161 177 ...
ra = _mm512_unpacklo_epi64(t9,tb); // 130 146 162 178 ...
rb = _mm512_unpackhi_epi64(t9,tb); // 131 147 163 179 ...
rc = _mm512_unpacklo_epi64(tc,te); // 192 208 224 240 ...
rd = _mm512_unpackhi_epi64(tc,te); // 193 209 225 241 ...
re = _mm512_unpacklo_epi64(td,tf); // 194 210 226 242 ...
rf = _mm512_unpackhi_epi64(td,tf); // 195 211 227 243 ...
t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ...
t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ...
t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ...
t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ...
t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ...
t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ...
t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ...
t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ...
t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ...
t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ...
ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ...
tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ...
tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ...
td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ...
te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ...
tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ...
r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240
r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 65 81 97 113 ... 241
r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 66 82 98 114 ... 242
r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 67 83 99 115 ... 243
r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ...
r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ...
r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ...
r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ...
r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ...
r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ...
ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ...
rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ...
rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ...
rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ...
re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ...
rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 95 111 127 ... 255
I got the idea for using _mm512_shuffle_i32x4 by looking at transposing a 4x4 matrix using _mm_shuffle_ps (which is what MSVC uses in _MM_TRANSPOSE4_PS, but not GCC and ICC).
__m128 tmp0, tmp1, tmp2, tmp3;
tmp0 = _mm_shuffle_ps(row0, row1, 0x88); // 0 2 4 6
tmp1 = _mm_shuffle_ps(row0, row1, 0xdd); // 1 3 5 7
tmp2 = _mm_shuffle_ps(row2, row3, 0x88); // 8 a c e
tmp3 = _mm_shuffle_ps(row2, row3, 0xdd); // 9 b d f
row0 = _mm_shuffle_ps(tmp0, tmp2, 0x88); // 0 4 8 c
row1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); // 1 5 9 d
row2 = _mm_shuffle_ps(tmp0, tmp2, 0xdd); // 2 6 a e
row3 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); // 3 7 b f
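For reference, a minimal standalone harness around this 4x4 sequence could look something like the following (just a sketch; the aligned float arrays and the print loop are only there to exercise the shuffles):

#include <stdio.h>
#include <x86intrin.h>
int main(void) {
    float a[16] __attribute__((aligned(16)));
    float b[16] __attribute__((aligned(16)));
    int i, j;
    for(i=0; i<16; i++) a[i] = (float)i;
    // load the four rows of the 4x4 matrix
    __m128 row0 = _mm_load_ps(&a[ 0]);
    __m128 row1 = _mm_load_ps(&a[ 4]);
    __m128 row2 = _mm_load_ps(&a[ 8]);
    __m128 row3 = _mm_load_ps(&a[12]);
    // the same shuffle sequence as above
    __m128 tmp0 = _mm_shuffle_ps(row0, row1, 0x88); // 0 2 4 6
    __m128 tmp1 = _mm_shuffle_ps(row0, row1, 0xdd); // 1 3 5 7
    __m128 tmp2 = _mm_shuffle_ps(row2, row3, 0x88); // 8 a c e
    __m128 tmp3 = _mm_shuffle_ps(row2, row3, 0xdd); // 9 b d f
    row0 = _mm_shuffle_ps(tmp0, tmp2, 0x88);        // 0 4 8 c
    row1 = _mm_shuffle_ps(tmp1, tmp3, 0x88);        // 1 5 9 d
    row2 = _mm_shuffle_ps(tmp0, tmp2, 0xdd);        // 2 6 a e
    row3 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);        // 3 7 b f
    // store and print the transposed matrix
    _mm_store_ps(&b[ 0], row0);
    _mm_store_ps(&b[ 4], row1);
    _mm_store_ps(&b[ 8], row2);
    _mm_store_ps(&b[12], row3);
    for(i=0; i<4; i++) {
        for(j=0; j<4; j++) printf("%2.0f ", b[i*4+j]);
        puts("");
    }
    return 0;
}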
The same idea applies to _mm512_shuffle_i32x4, but now the lanes are 128-bit instead of 32-bit and there are 16 rows instead of 4.
Finally, to compare to scalar operations, I modified Example 9.5a from Agner Fog's Optimizing C++ manual:
#define SIZE 16
void transpose(int a[SIZE][SIZE]) { // function to transpose matrix
// define a macro to swap two array elements:
#define swapd(x,y) {temp=x; x=y; y=temp;}
int r, c; int temp;
for (r = 1; r < SIZE; r++) {
for (c = 0; c < r; c++) {
swapd(a[r][c], a[c][r]);
}
}
}
This does n*(n-1)/2 swaps (because the diagonal does not need to be swapped). The swaps in the assembly for 16x16 look like:
mov r8d, DWORD PTR [rax+68]
mov r9d, DWORD PTR [rdx+68]
mov DWORD PTR [rax+68], r9d
mov DWORD PTR [rdx+68], r8d
so the number of read/write operations using the scalar registers is 2*n*(n-1): each swap is two loads and two stores, and there are n*(n-1)/2 swaps, which for 16x16 gives the 480 scalar read/write operations in the table.
Upvotes: 27