Daniele
Daniele

Reputation: 734

Sum array of float in assembly

I'm implementing a function in x86 assembly, called from a C program, that sums an array of floats. The first argument of the function is a pointer to the array and the second is the number of elements. When I run the code on Linux, I get a segmentation fault. What did I do wrong?

# floatsum: sums an array of floats with SSE packed adds, four floats per
# iteration. Stack arguments after the prologue:
#    8(%ebp) = pointer to the float array
#   12(%ebp) = number of elements
.text
.globl floatsum

floatsum:
# Set up the stack frame.
push %ebp
movl %esp, %ebp

# %eax = array pointer, %edx = element count.
movl  8(%ebp), %eax
movl 12(%ebp), %edx
# Turn the element count into a count of 4-float (16-byte) groups.
# Any remainder of count/4 is not visited by the loop below.
shrl $2, %edx

# Clear the four-lane accumulator.
xorps %xmm0, %xmm0
loop:
# Exit once every group has been consumed.
testl %edx, %edx
je end  
# Load four floats. movaps raises a general-protection fault unless the
# memory operand is 16-byte aligned.
movaps (%eax), %xmm1
# Accumulate lane-wise, then step to the next 16-byte group.
addps %xmm1, %xmm0
addl $16, %eax
decl %edx
jmp loop 

# Horizontal reduction: combine the four lane sums so that every lane of
# %xmm0 ends up holding the same total.
end:
                            #         3       2      1       0
movaps %xmm0, %xmm1         # xmm0:   w       z      y       x 
                            # xmm1:   z       w      x       y
shufps $0xb1, %xmm1, %xmm1  #        10      11      00      01  = 0xb1
addps  %xmm1, %xmm0         # xmm0:  w+z     z+w     y+x     x+y
movaps %xmm0, %xmm1         # xmm1:  w+z     z+w     y+x     x+y
                            # xmm1:  x+y     y+x     z+w     w+z
shufps $0x1b, %xmm1, %xmm1  #        00      01      10      11  = 0x1b
addps  %xmm1, %xmm0         # xmm0:  w+z+x+y z+w+y+x y+x+z+w x+y+w+z
                            #
# Disabled: these two instructions would copy lane 0 of %xmm0 into %eax and
# push it, placing the sum on top of the stack for the flds below.
#movd %xmm0, %eax
#pushl %eax

finst:

# Load the 32-bit float on top of the stack onto the x87 register stack,
# then drop that stack slot.
# NOTE(review): with the movd/pushl pair above commented out, nothing has
# been pushed since the prologue, so (%esp) here is the saved %ebp and not
# the computed sum.
# Presumably the C caller reads the float result from st(0) - confirm
# against the calling convention in use.
flds (%esp)
popl %eax

# Tear down the stack frame and return.
movl %ebp, %esp
popl %ebp
ret

// C Code

#include <stdio.h>
#include <stdlib.h>


/* Sum of a float array, implemented in the accompanying assembly listing
 * under the global label floatsum. array points to the first element and
 * number_of_items is the number of elements. */
float
floatsum(float *array, size_t number_of_items);

/* Reference C implementation: adds the first number_of_items elements of
 * array in index order, accumulating in a float, and returns the total.
 * Returns 0 when number_of_items is 0. */
float
floatsum_c(float *array, size_t number_of_items){
    float total = 0.0;
    const float *cursor = array;
    size_t remaining = number_of_items;

    /* Count down the remaining elements while walking forward. */
    while (remaining-- > 0) {
        total += *cursor++;
    }
    return total;
}

/* Allocates number_of_items floats with calloc and fills them with
 * 1, 2, 3, ... (element i holds 1 + i). Returns NULL if the allocation
 * fails; otherwise the caller releases the array with free(). */
float *
create_array(size_t number_of_items){
    float *values = calloc(number_of_items, sizeof(float));
    size_t idx = 0;

    if (!values) {
        return values;
    }

    while (idx < number_of_items) {
        values[idx] = 1.0 + (float)idx;
        idx++;
    }
    return values;
}

/* Builds an 8-element test array, prints its sum as computed by the C
 * reference routine and by the assembly routine, then frees the array.
 * Prints nothing if the array could not be allocated. Always returns 0. */
int
main(int argc, char **argv){
    const size_t number_of_items = 8;
    float *a = create_array(number_of_items);
    float result;

    if (a == NULL) {
        return 0;
    }

    result = floatsum_c(a, number_of_items);
    printf("Sum (c version): %f\n", result);

    result = floatsum(a, number_of_items);
    printf("Sum (asm version): %f\n", result);

    free(a);
    return 0;
}

Upvotes: 3

Views: 1951

Answers (1)

Michael Petch
Michael Petch

Reputation: 47593

As Paul mentioned, this is likely an alignment issue. It is clear from your C code that your float array is not guaranteed to be aligned on a 16-byte boundary. The failing instruction is:

movaps (%eax), %xmm1

The reason is that MOVAPS has this requirement:

When the source or destination operand is a memory operand, the operand must be aligned on a 16-byte (128-bit version) or 32-byte (VEX.256 encoded version) boundary or a general-protection exception (#GP) will be generated.

Since you are working with 128-bit vector registers you need 16-byte alignment. You have two choices:

  • Change MOVAPS to MOVUPS so that unaligned memory access can be done
  • Modify your C code to create an array of floats aligned on a 16-byte boundary

First solution would require:

movaps (%eax), %xmm1

to be changed to:

movups (%eax), %xmm1

The second solution is to avoid using calloc and utilize a function that allows you to create objects with 16-byte alignment. If using C11 then you can use the function aligned_alloc and memset to zero the array. Your create_array could look like:

/*
 * Allocate a 16-byte-aligned array of number_of_items floats and fill it
 * with 1, 2, 3, ... (element i holds 1 + i).
 *
 * The requested byte count is rounded up to a multiple of 16 because C11
 * aligned_alloc requires the size to be an integral multiple of the
 * alignment. Requests whose byte count would overflow size_t are rejected.
 *
 * Returns NULL on overflow or allocation failure; otherwise the caller
 * releases the array with free().
 */
float *
create_array(size_t number_of_items)
{
    enum { ARRAY_ALIGNMENT = 16 };
    float *array = NULL;
    size_t bytes;
    size_t i;

    /* Reject counts for which the rounded-up byte size would wrap around. */
    if (number_of_items > ((size_t)-1 - (ARRAY_ALIGNMENT - 1)) / sizeof(*array))
        return NULL;

    /* Round the byte count up to the next multiple of the alignment. */
    bytes = (number_of_items * sizeof(*array) + (ARRAY_ALIGNMENT - 1))
            & ~(size_t)(ARRAY_ALIGNMENT - 1);

    array = aligned_alloc(ARRAY_ALIGNMENT, bytes);
    if(array){
        /* Zero the block as calloc would have, including the padding. */
        memset (array, 0x00, bytes);
        for(i=0; i<number_of_items; i++){
            array[i]=1.0+(float)i;
        }
    }
    return array;
}

If you are not using C11 you can utilize the POSIX function posix_memalign and memset on Linux. The code could look something like:

/*
 * Allocate a 16-byte-aligned array of number_of_items floats with
 * posix_memalign and fill it with 1, 2, 3, ... (element i holds 1 + i).
 *
 * posix_memalign stores its result through a void **, so a real void *
 * temporary is used instead of casting the address of a float *.
 * Requests whose byte count would overflow size_t are rejected.
 *
 * Returns NULL on overflow or allocation failure; otherwise the caller
 * releases the array with free().
 */
float *
create_array(size_t number_of_items)
{
    void *mem = NULL;
    float *array = NULL;
    size_t bytes;
    size_t i;

    /* Reject counts for which the byte size would wrap around. */
    if (number_of_items > (size_t)-1 / sizeof(*array))
        return NULL;
    bytes = number_of_items * sizeof(*array);

    /* posix_memalign returns 0 on success; mem may still be NULL when the
     * requested size is 0, so check both before touching the block. */
    if (!posix_memalign(&mem, 16, bytes) && mem){
        array = mem;
        memset (array, 0x00, bytes);
        for(i=0; i<number_of_items; i++){
            array[i]=1.0+(float)i;
        }
    }
    return array;
}

You will have to uncomment these lines as well:

#movd %xmm0, %eax
#pushl %eax

so that they appear this way:

movd %xmm0, %eax
pushl %eax

Note: Although I use memset to zero out the float array like calloc would have, it isn't actually needed in your code since you initialize all the elements to specific values afterwards. In your case the call to memset can be removed.

Upvotes: 7

Related Questions