Ralf

Reputation: 1295

C/C++: emulating restrict keyword by copying arguments to local variables

I encountered the following question when using vector intrinsics (AVX), but the question probably also applies to sequential programming. It concerns the use of the restrict keyword. The keyword is available in C99, but not in C++ (although most compilers provide extensions such as __restrict). My question is: Can I avoid using restrict by copying data from non-restrict pointer arguments to local variables? It works in my example, but is this behavior guaranteed?
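
(For reference, the compiler-specific spellings can be wrapped in a small shim like the following. This is only a hypothetical sketch; RESTRICT and vecAddPortable are made-up names, not part of my code below:)

#if defined(__GNUC__) || defined(_MSC_VER)
#  define RESTRICT __restrict  // GCC, Clang, and MSVC accept __restrict in C and C++
#else
#  define RESTRICT             // unknown compiler: drop the qualifier
#endif

void vecAddPortable(float *RESTRICT a, float *RESTRICT b,
                    float *RESTRICT c, int n);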

Here's my code with 4 different versions of SIMD vector addition. The first version vecAdd1() passes the arguments as restrict pointers. All other versions use normal (non-restrict) pointers as arguments. The second version vecAdd2() has no further code modifications. The third version vecAdd3() copies the data pointer of each struct into a local variable. The fourth version vecAdd4() also does the same for the size n.

#define _POSIX_C_SOURCE 200112L // expose posix_memalign under -std=c99
#include <stdio.h>
#include <stdlib.h> // posix_memalign
#include <x86intrin.h>

#define N 8 // 8 floats per AVX vector
#define SIZE 1000 // 1000 floats per data vector

typedef struct { int n; float *data; } Vec;

void vecCreate(int size, Vec *v) {
  v->n = size;
  posix_memalign((void**)&(v->data), 32, size * sizeof(float));
}


void vecAdd1(Vec * restrict a, Vec * restrict b, Vec * restrict c) {
  __m256 va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = _mm256_load_ps(a->data + i);
    vb = _mm256_load_ps(b->data + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(c->data + i, vc);
  }
}

void vecAdd2(Vec *a, Vec *b, Vec *c) {
  __m256 va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = _mm256_load_ps(a->data + i);
    vb = _mm256_load_ps(b->data + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(c->data + i, vc);
  }
}

void vecAdd3(Vec *a, Vec *b, Vec *c) {
  __m256 va, vb, vc;
  float *pa = a->data, *pb = b->data, *pc = c->data;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = _mm256_load_ps(pa + i);
    vb = _mm256_load_ps(pb + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(pc + i, vc);
  }
}

void vecAdd4(Vec *a, Vec *b, Vec *c) {
  __m256 va, vb, vc;
  float *pa = a->data, *pb = b->data, *pc = c->data;
  int ae = a->n - N;
  for (int i = 0; i <= ae; i += N) {
    va = _mm256_load_ps(pa + i);
    vb = _mm256_load_ps(pb + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(pc + i, vc);
  }
}



int
main()
{
  Vec a, b, c;
  vecCreate(1000, &a);
  vecCreate(1000, &b);
  vecCreate(1000, &c);
  vecAdd1(&a, &b, &c);
  vecAdd2(&a, &b, &c);
  vecAdd3(&a, &b, &c);
  vecAdd4(&a, &b, &c);
  printf("%g\n", c.data[123]);
  return 0;
}

(Just a comment: the subtraction of N together with the <= comparison limits processing to the part where entire SIMD vectors can be loaded and stored. I omitted the sequential postamble; see the sketch below.)
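
(For completeness, the omitted postamble could look like the following scalar sketch; vecAddTail is a made-up name and this code is not part of the measured program:)

void vecAddTail(Vec *a, Vec *b, Vec *c) {
  int i = (a->n / N) * N; // first index not covered by the SIMD loop
  for (; i < a->n; i++)
    c->data[i] = a->data[i] + b->data[i];
}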

Here's the compiler invocation:

gcc -O3 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -march=native -masm=intel -save-temps -std=c99 -Wall -o vecadd vecadd.c

I'm using gcc version 7.5.0. In the following I only show the relevant portions of the assembly code from vecadd.s.

In vecAdd1(), the loop has a very efficient implementation: load one SIMD vector, add the second, store to result, advance pointer, check for loop end:

.L5:
    vmovaps ymm0, YMMWORD PTR [rdi+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [rsi+rax]
    vmovaps YMMWORD PTR [rcx+rax], ymm0
    add rax, 32
    cmp rdx, rax
    jne .L5

If I leave out the restrict keyword in the argument list in vecAdd2(), the loop becomes very inefficient: within the loop, the three data pointers and the size n are reloaded on every iteration before the SIMD vectors are loaded, processed, and stored and the loop condition is checked:

.L10:
    mov r10, QWORD PTR 8[rdi]
    mov r9, QWORD PTR 8[rsi]
    add r8d, 8
    mov rcx, QWORD PTR 8[rdx]
    vmovaps ymm0, YMMWORD PTR [r10+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [r9+rax]
    vmovaps YMMWORD PTR [rcx+rax], ymm0
    mov ecx, DWORD PTR [rdi]
    add rax, 32
    sub ecx, 7
    cmp ecx, r8d
    jg  .L10

In version vecAdd3(), the data pointers are not reloaded (they are loaded once before the loop), but the size n is reloaded:

.L15:
    vmovaps ymm0, YMMWORD PTR -32[r8+rax*4]
    mov ecx, eax
    vaddps  ymm0, ymm0, YMMWORD PTR -32[rsi+rax*4]
    vmovaps YMMWORD PTR -32[r9+rax*4], ymm0
    mov edx, DWORD PTR [rdi]
    add rax, 8
    sub edx, 7
    cmp edx, ecx
    jg  .L15

Only when I copy all data pointers and n to local variables in vecAdd4() does the code look like that of vecAdd1():

.L20:
    vmovaps ymm0, YMMWORD PTR [rcx+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [rsi+rax]
    vmovaps YMMWORD PTR [r8+rax], ymm0
    add rax, 32
    cmp rdx, rax
    jne .L20

So, to repeat my question: Assume I want to avoid the compiler-specific restrict replacements in C++. I therefore use non-restrict arguments, but copy them to local (also non-restrict) variables. Is it guaranteed that the compiler won't assume that the local variables can alias (even if the function arguments can)?

(Side question: Is it relevant for this question that I copy struct components?)

Upvotes: 2

Views: 526

Answers (4)

Ralf

Reputation: 1295

Just to summarize the current state of research:

  • vecAdd1() uses restrict and the default intrinsics (__m256 with may_alias)
  • vecAdd2() doesn't use restrict and uses the default intrinsics (__m256 with may_alias)
  • vecAdd2x() doesn't use restrict and uses modified intrinsics (x__m256 without may_alias)

Here's the code:

#define _POSIX_C_SOURCE 200112L // expose posix_memalign under -std=c99
#include <stdio.h>
#include <stdlib.h> // posix_memalign
#include <x86intrin.h>

typedef float x__m256 __attribute__ ((__vector_size__ (32)));

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
x_mm256_store_ps (float *__P, x__m256 __A)
{
  *(x__m256 *)__P = __A;
}

extern __inline x__m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
x_mm256_load_ps (float const *__P)
{
  return *(x__m256 *)__P;
}

extern __inline x__m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
x_mm256_add_ps (x__m256 __A, x__m256 __B)
{
  return (x__m256) ((__v8sf)__A + (__v8sf)__B);
}

#define N 8 // 8 floats per AVX vector
#define SIZE 1000 // 1000 floats per data vector

typedef struct { int n; float *data; } Vec;

void vecCreate(int size, Vec *v) {
  v->n = size;
  posix_memalign((void**)&(v->data), 32, size * sizeof(float));
}

// restrict pointer arguments
void vecAdd1(Vec * restrict a, Vec * restrict b, Vec * restrict c) {
  __m256 va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = _mm256_load_ps(a->data + i);
    vb = _mm256_load_ps(b->data + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(c->data + i, vc);
  }
}

// like vecAdd1, but without restrict
void vecAdd2(Vec *a, Vec *b, Vec *c) {
  __m256 va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = _mm256_load_ps(a->data + i);
    vb = _mm256_load_ps(b->data + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(c->data + i, vc);
  }
}

// like vecAdd2, but with x__m256 and x_mm256
void vecAdd2x(Vec *a, Vec *b, Vec *c) {
  x__m256 va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = x_mm256_load_ps(a->data + i);
    vb = x_mm256_load_ps(b->data + i);
    vc = x_mm256_add_ps(va, vb);
    x_mm256_store_ps(c->data + i, vc);
  }
}

int
main() {
  Vec a, b, c;
  vecCreate(1000, &a);
  vecCreate(1000, &b);
  vecCreate(1000, &c);
  vecAdd1(&a, &b, &c);
  vecAdd2(&a, &b, &c);
  vecAdd2x(&a, &b, &c);
  printf("%g\n", c.data[123]);
  return 0;
}

Compile with

gcc -O3 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -march=native -masm=intel -save-temps -std=c99 -Wall -o vecadd vecadd.c

vecAdd1() and vecAdd2x() lead to efficient assembly instructions such as

.L5:
    vmovaps ymm0, YMMWORD PTR [rdi+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [rsi+rax]
    vmovaps YMMWORD PTR [rcx+rax], ymm0
    add rax, 32
    cmp rdx, rax
    jne .L5

whereas the code generated for vecAdd2() reloads data pointers and n in each iteration:

.L10:
    mov r10, QWORD PTR 8[rdi]
    mov r9, QWORD PTR 8[rsi]
    add r8d, 8
    mov rcx, QWORD PTR 8[rdx]
    vmovaps ymm0, YMMWORD PTR [r10+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [r9+rax]
    vmovaps YMMWORD PTR [rcx+rax], ymm0
    mov ecx, DWORD PTR [rdi]
    add rax, 32
    sub ecx, 7
    cmp ecx, r8d
    jg  .L10

Upvotes: 1

Ralf

Reputation: 1295

This is only an attempt at a partial answer. I added the following version vecAdd5() where I copy the Vec pointers from the argument list to local variables, rather than the data pointers and n from the struct:

void vecAdd5(Vec *a, Vec *b, Vec *c) {
  __m256 va, vb, vc;
  Vec *aa = a, *bb = b, *cc = c;
  for (int i = 0; i <= (aa->n - N); i += N) {
    va = _mm256_load_ps(aa->data + i);
    vb = _mm256_load_ps(bb->data + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(cc->data + i, vc);
  }
}

This leads to inefficient assembly code similar to vecAdd2():

.L25:
    mov r10, QWORD PTR 8[rdi]
    mov r9, QWORD PTR 8[rsi]
    add r8d, 8
    mov rcx, QWORD PTR 8[rdx]
    vmovaps ymm0, YMMWORD PTR [r10+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [r9+rax]
    vmovaps YMMWORD PTR [rcx+rax], ymm0
    mov ecx, DWORD PTR [rdi]
    add rax, 32
    sub ecx, 7
    cmp r8d, ecx
    jl  .L25

So, just copying pointers to local variables does not tell the compiler to forget about aliasing. I'm not sure if I interpret this correctly: In vecAdd4(), the compiler still assumes that the Vec pointer arguments can alias (and copying them as in vecAdd5() doesn't change this), so the components of one struct could change through a pointer to another one. However, it does not assume that the data pointers can alias once they are copied to local variables?

(I'm more confused than before: nowhere in the code of vecAdd2() is a data component or n actually modified, so why would the compiler assume that they can change and reload them from the struct? We only read n, never write it, and we never write to data itself, only to the floats it points to.)


Edit in response to Peter Cordes:

If I transfer the processing to a function which receives an int and three float pointers, the assembly code is efficient:

void floatAdd6(int n, float *a, float *b, float *c) {
  __m256 va, vb, vc;
  for (int i = 0; i <= (n - N); i += N) {
    va = _mm256_load_ps(a + i);
    vb = _mm256_load_ps(b + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(c + i, vc);
  }
}

void vecAdd6(Vec *a, Vec *b, Vec *c) {
  floatAdd6(a->n, a->data, b->data, c->data);
}

Here's the assembly output for vecAdd6() (floatAdd6() looks the same):

.L50:
    vmovaps ymm0, YMMWORD PTR [r8+rax]
    vaddps  ymm0, ymm0, YMMWORD PTR [rsi+rax]
    vmovaps YMMWORD PTR [rcx+rax], ymm0
    add rax, 32
    cmp rdx, rax
    jne .L50

2nd edit in response to Peter Cordes:

I think Peter Cordes is right in his explanation that the may_alias attribute of __m256 is causing the problem. I replaced __m256 with my own data type and a sequential implementation (which is auto-vectorized by the compiler, though):

typedef struct { float v[N]; } emuVec;

emuVec loadEmuVec(float *d) {
  emuVec r;
  for (int i = 0; i < N; i++) r.v[i] = d[i];
  return r;
}

emuVec addEmuVec(emuVec a, emuVec b) {
  emuVec c;
  for (int i = 0; i < N; i++) c.v[i] = a.v[i] + b.v[i];
  return c;        
}

void storeEmuVec(float *d, emuVec a) {
  for (int i = 0; i < N; i++) d[i] = a.v[i];
}

void vecAdd7a(Vec *a, Vec *b, Vec *c) {
  emuVec va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = loadEmuVec(a->data + i);
    vb = loadEmuVec(b->data + i);
    vc = addEmuVec(va, vb);
    storeEmuVec(c->data + i, vc);
  }
}

void vecAdd7b(Vec * restrict a, Vec * restrict b, Vec *restrict c) {
  emuVec va, vb, vc;
  for (int i = 0; i <= (a->n - N); i += N) {
    va = loadEmuVec(a->data + i);
    vb = loadEmuVec(b->data + i);
    vc = addEmuVec(va, vb);
    storeEmuVec(c->data + i, vc);
  }
}

Here the code produced for vecAdd7a() (without restrict) and vecAdd7b() (with restrict) is exactly the same. The assembly instructions are somewhat different compared to vecAdd1(), but the code doesn't show the inefficient reloading of data pointers and size n inside the loop; it just uses an additional counter and unaligned rather than aligned loads and stores:

.L71:
    vmovups ymm0, YMMWORD PTR [rcx+rax]
    add edx, 1
    vaddps  ymm0, ymm0, YMMWORD PTR [rdi+rax]
    vmovups YMMWORD PTR [r9+rax], ymm0
    add rax, 32
    cmp esi, edx
    ja  .L71

Somewhat confusing is that there is a second loop, and the code seems to decide at run time which version to use (presumably the auto-vectorizer's runtime overlap check):

.L73:
    vmovups ymm0, YMMWORD PTR [rcx]
    add rax, 32
    add rcx, 32
    add rdx, 32
    vaddps  ymm0, ymm0, YMMWORD PTR -32[rax]
    vmovups YMMWORD PTR -32[rdx], ymm0
    cmp rax, rsi
    jne .L73

but still there's no sign of reloading pointers and size inside the loop.

So my interpretation would be the following: If the compiler sees the may_alias attribute of __m256, it entirely abandons any analysis of whether the code actually could modify one Vec through a pointer to another Vec (in the intrinsics using __m256). I really wonder why: It should be clear to the compiler that an access to floats via a data pointer could never actually modify the pointer itself nor the struct Vec in which it is contained, and therefore could never lead to aliasing.
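
(To illustrate why such aliasing is not impossible from the compiler's point of view, here is a contrived, hypothetical caller that is not part of my test program; pool and evil are made-up names:)

static Vec pool[4] __attribute__ ((aligned (32))); // Vec objects in ordinary memory
Vec evil = { 8, (float *)pool };                   // data points AT the Vec objects
// In vecAdd2(&a, &b, &evil), the may_alias store through evil.data
// overwrites the Vecs in pool. Since a and b could just as well point
// into pool, the compiler has to assume that a->n, a->data, and b->data
// may change after every store, and reloads them.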


3rd edit: I modified the replacement data type emuVec by adding a may_alias attribute:

typedef struct { float v[N]; } __attribute__ ((__may_alias__)) emuVec;

but this doesn't lead to changes in the assembly code; there's no difference between vecAdd7a() and vecAdd7b(). This casts some doubt on the conjecture that may_alias confuses the compiler. Could the pessimistic aliasing assumptions made by the compiler instead be caused by the use of intrinsics?

Upvotes: 0

John Bollinger

Reputation: 181149

Can I avoid using restrict by copying data from non-restrict pointer arguments to local variables?

You can avoid using restrict simply by not using restrict. There are no circumstances that require restrict-qualification. It's more the other way around: restrict qualification places requirements on other code.

The advantage of restrict is that it allows the compiler to make stronger assumptions than it otherwise could, so that, at its sole discretion, it can perform optimizations that would otherwise risk non-conforming behavior.

The compiler can often make similar assumptions about local variables, and about locals with respect to globals and the targets of pointer arguments, and in that sense yes, making local copies can sometimes enable the compiler to perform the same stronger optimizations that restrict affords, at the expense of making the copy in the first place.
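For illustration, here is a minimal sketch of that effect, independent of the question's code (glob and addg are made-up names):

int glob;

void addg(int *p, int n) {
  int g = glob;   // local copy: stores through p cannot change g
  for (int i = 0; i < n; i++)
    p[i] += g;    // with "p[i] += glob" instead, glob would have to be
                  // reloaded on every iteration, since p might point at glob
}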

It should also be observed that restrict-qualification is shallow. For example, the restrict-qualification in your vecAdd1() case requires the caller to ensure that the (pointer) arguments associated with parameters a, b, and c do not alias each other, but it does not require that the data pointers stored in the pointed-to Vec objects point to different or non-overlapping arrays.
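To make that shallowness concrete, consider the following caller of the question's vecAdd1() (a hypothetical sketch):

Vec a, b, c;
vecCreate(SIZE, &a);
vecCreate(SIZE, &b);
c.n = a.n;
c.data = a.data;     // c's float array IS a's float array
vecAdd1(&a, &b, &c); // legal: a, b, c are three distinct Vec objects,
                     // even though the output floats overlap an input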

It works in my example, but is this behavior guaranteed?

As general considerations,

  • If you write conforming code, then you can rely on a conforming implementation to exhibit conforming externally-visible behavior. In this sense, however, externally-visible behavior does not include running time. The C language provides no basis ever to rely on particular optimizations being performed by compilers.

  • Removing restrict qualification can change a non-conforming program into a conforming one, but the reverse is not the case.

But with respect to the example code, restrict qualification does not gain you anything useful. The compiler might be able to optimize more aggressively if it could assume that the vector data pointed to by a->data, b->data, and c->data do not overlap, but neither the restrict qualification nor any of the pointer copying in any of the example code permits it to make such an assumption. It is conceivable that a compiler could use other means to come to such a non-aliasing conclusion, but nothing about your vecAddX() variations contributes to that.

Upvotes: 2

yugr

Reputation: 21954

In general, local variables cannot provide the same optimization opportunities as aliasing annotations like restrict.

E.g., imagine that the loop in vecAdd4 is unrolled by the compiler (the step and the offsets are in floats, so the second vector starts at i + N):

  for (int i = 0; i <= ae - N; i += 2*N) {
    va_1 = _mm256_load_ps(pa + i);
    vb_1 = _mm256_load_ps(pb + i);
    vc_1 = _mm256_add_ps(va_1, vb_1);
    _mm256_store_ps(pc + i, vc_1);
    va_2 = _mm256_load_ps(pa + i + N);
    vb_2 = _mm256_load_ps(pb + i + N);
    vc_2 = _mm256_add_ps(va_2, vb_2);
    _mm256_store_ps(pc + i + N, vc_2);
  }

In this case it's unable to move the va_2 and vb_2 loads before the first _mm256_store_ps intrinsic (to hide latency) because of potential aliasing between pc + i and pa + i + N (or pb + i + N).

Assuming that a, b, c, pa, pb, and pc all do not alias, I'd suggest marking them as such:

void vecAdd5(Vec * restrict a, Vec * restrict b, Vec * restrict c) {
  __m256 va, vb, vc;
  float * restrict pa = a->data, * restrict pb = b->data, * restrict pc = c->data;
  for (int i = 0; i <= a->n - N; i += N) {
    va = _mm256_load_ps(pa + i);
    vb = _mm256_load_ps(pb + i);
    vc = _mm256_add_ps(va, vb);
    _mm256_store_ps(pc + i, vc);
  }
}

This achieves the same assembly as vecAdd4 with much less manual work:

        vmovaps ymm1, YMMWORD PTR [rcx+rax*4]
        vaddps  ymm0, ymm1, YMMWORD PTR [rsi+rax*4]
        vmovaps YMMWORD PTR [rdi+rax*4], ymm0
        add     rax, 8
        cmp     edx, eax
        jg      .L3

Upvotes: 2
