Calmarius
Calmarius

Reputation: 19441

Are zero initializers faster than memset?

I maintain legacy C code that, in many places, declares a small array such as int a[32]; followed by memset(a, 0, sizeof a); to zero-initialize it.

I'm thinking of refactoring this into int a[32] = {0}; and removing the memset.

The question is: Does using a zero initializer generally result in faster code than calling memset?

Upvotes: 7

Views: 1520

Answers (1)

Toby Speight
Toby Speight

Reputation: 30891

TL;DR: Use the initializer - it's never worse than memset().

It depends on your compiler. It shouldn't be any slower than calling memset() (because calling memset() is one option available to the compiler).

The initializer is easier to read than imperatively overwriting the array; it also adapts well if the element type is changed to something where all-bit-zero isn't what you want.


As an experiment, let's see what GCC does with this:

#include <string.h>

int f1(void)            /* variant 1: zero the array with an initializer; (void) gives a real prototype */
{
    int a[32] = {0};    /* a[0] is set explicitly; the remaining 31 elements are implicitly zeroed */
    return a[31];       /* read the last element so the array is actually used */
}

int f2(void)            /* variant 2: zero the array with an explicit memset(); (void) gives a real prototype */
{
    int a[32];          /* contents are indeterminate until the memset() below */
    memset(a, 0, sizeof a);     /* sizeof a is the size of the whole array, not of a pointer */
    return a[31];       /* read the last element so the array is actually used */
}

Compiling with gcc -S -std=c11 gives:

# f1, unoptimized build (gcc -S -std=c11), AT&T syntax.
# The `= {0}` initializer is expanded inline as a `rep stosq` fill; the
# function contains no call instruction.
# The .cfi_* directives only describe the call frame for unwinding, and each
# `.loc 1 N 0` ties the instructions that follow to line N of 40786375.c.
f1:
.LFB0:
    .file 1 "40786375.c"
    .loc 1 4 0
    .cfi_startproc
    # Frame-pointer prologue.
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    # Only 8 bytes are reserved, yet the array occupies -128(%rbp)..-1(%rbp),
    # so 120 bytes of it lie below %rsp. f1 makes no calls, so this looks like
    # use of the System V red zone.
    subq    $8, %rsp
    .loc 1 5 0
    # rdx = &a[0]
    leaq    -128(%rbp), %rdx
    # Fill value 0 (writing %eax also clears the upper half of %rax).
    movl    $0, %eax
    # 16 quadwords * 8 bytes = 128 bytes = 32 four-byte ints.
    movl    $16, %ecx
    # rdi = destination pointer for the string store.
    movq    %rdx, %rdi
    # Store %rax at (%rdi) %rcx times, advancing %rdi by 8 each time.
    rep stosq
    .loc 1 6 0
    # Return value: a[31], at -128 + 31*4 = -4 from %rbp.
    movl    -4(%rbp), %eax
    .loc 1 7 0
    # leave = movq %rbp, %rsp ; popq %rbp
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
# f2, unoptimized build (gcc -S -std=c11), AT&T syntax.
# The memset() in the C source is emitted literally as a call through the PLT.
# The .cfi_* directives only describe the call frame for unwinding, and each
# `.loc 1 N 0` ties the instructions that follow to a line of the C file.
f2:
.LFB1:
    .loc 1 10 0
    .cfi_startproc
    # Frame-pointer prologue.
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    # Adding -128 reserves the full 128 bytes for a[] (same effect as
    # subtracting 128). Because f2 makes a call, the array cannot be left
    # below %rsp as it is in the unoptimized f1.
    addq    $-128, %rsp
    .loc 1 12 0
    # Set up memset(a, 0, 128) in the System V argument registers:
    # rax = &a[0], later copied into rdi (1st argument).
    leaq    -128(%rbp), %rax
    # edx = byte count (3rd argument): 128 bytes.
    movl    $128, %edx
    # esi = fill byte (2nd argument): 0.
    movl    $0, %esi
    # rdi = destination (1st argument).
    movq    %rax, %rdi
    call    memset@PLT
    .loc 1 13 0
    # Return value: a[31], at -128 + 31*4 = -4 from %rbp.
    movl    -4(%rbp), %eax
    .loc 1 14 0
    # leave = movq %rbp, %rsp ; popq %rbp
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc

showing that f1() uses rep stosq for the initializer, whereas f2() has the function call, exactly like the C code. It's quite likely that memset() has a more efficient vectorized implementation for large arrays, but for small arrays like this, any benefits would likely be outweighed by the function call overhead.

If we declare a as volatile (so that the optimizer cannot simply discard the array and return a constant), we get to see what happens with optimizations enabled (gcc -S -std=c11 -O3):

# f1 with `a` declared volatile, optimized build (gcc -S -std=c11 -O3).
# Frameless: no %rbp frame pointer; the array is addressed relative to %rsp.
# The initializer is still an inline `rep stosq` fill.
f1:
.LFB4:
    .cfi_startproc
    # Reserve 16 bytes. The array starts at -120(%rsp) and is 128 bytes long,
    # so 120 bytes of it sit below %rsp and 8 bytes sit in the reserved area.
    subq    $16, %rsp
    .cfi_def_cfa_offset 24
    # Fill value 0 (also clears the upper half of %rax).
    xorl    %eax, %eax
    # 16 quadwords * 8 bytes = 128 bytes.
    movl    $16, %ecx
    # rdi = &a[0], the destination of the fill.
    leaq    -120(%rsp), %rdi
    # Store %rax at (%rdi) %rcx times, advancing %rdi by 8 each time.
    rep stosq
    # Return value: a[31], at -120 + 31*4 = 4 from %rsp.
    movl    4(%rsp), %eax
    # Release the 16 reserved bytes.
    addq    $16, %rsp
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
# f2 with `a` declared volatile, optimized build (gcc -S -std=c11 -O3).
# The memset() call no longer appears: the array is zeroed by an inline
# `rep stosq`, as in the optimized f1. The only difference from that listing
# is that the address is formed in %rdx and then copied to %rdi.
f2:
.LFB5:
    .cfi_startproc
    # Reserve 16 bytes. The array starts at -120(%rsp) and is 128 bytes long,
    # so 120 bytes of it sit below %rsp and 8 bytes sit in the reserved area.
    subq    $16, %rsp
    .cfi_def_cfa_offset 24
    # Fill value 0 (also clears the upper half of %rax).
    xorl    %eax, %eax
    # 16 quadwords * 8 bytes = 128 bytes.
    movl    $16, %ecx
    # rdx = &a[0] ...
    leaq    -120(%rsp), %rdx
    # ... copied into rdi, the destination of the fill.
    movq    %rdx, %rdi
    # Store %rax at (%rdi) %rcx times, advancing %rdi by 8 each time.
    rep stosq
    # Return value: a[31], at -120 + 31*4 = 4 from %rsp.
    movl    4(%rsp), %eax
    # Release the 16 reserved bytes.
    addq    $16, %rsp
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc

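You can see that the two functions now compile to essentially the same code: both zero the array with an inline rep stosq, and neither calls memset(). The only difference is that f2() computes the array address in %rdx and then copies it to %rdi, rather than computing it in %rdi directly.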

Upvotes: 5

Related Questions