Ferdinand Wehle
Ferdinand Wehle

Reputation: 1

Converting large CSV file into 2D array in C

I have recently switched to C due to its exceptional computing speeds. I am currently trying to import a very large CSV table (500.000 x 27) into an array, but am facing major difficulties in trying to assign the values from the imported cell to the respective array cell. I hope the following code helps in trying to understand my problem. I would be glad for any help!

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Reads WethData.csv (up to `row` lines of `col` comma-separated numbers)
// into a heap-allocated table of row pointers, one malloc'ed row each.
// Exits with status 1 if the file cannot be opened, memory runs out, the
// file has more than `row` lines, or a line has fewer than `col` fields.
int main() {

    int row = 520241, col = 27, j = 0, i = 0;

    // the table of row pointers alone is about 4 MB, so it is taken from
    // the heap instead of being a variable-length array on the stack
    double **arr = malloc(row * sizeof *arr);
    if (arr == NULL) {
        printf("Out of memory \n");
        exit(1);
    }
    for (i = 0; i < row; i++) {
        arr[i] = malloc(col * sizeof *arr[i]);
        if (arr[i] == NULL) {
            printf("Out of memory \n");
            exit(1);
        }
    }

    FILE *f;
    f = fopen("WethData.csv","r");
    if (f == NULL) { //control
        printf("Unable to open file \n");
        exit(1);
    }

    char line[485]; //a longer line would be split over several fgets calls
    char *sp; //string pointer

    i = 0; //index of the row being filled
    while (fgets(line, sizeof line, f) != NULL) {
        //control: checked BEFORE storing, so arr[row] is never written
        if (i >= row) {
            puts("Row count exceeded");
            exit(1);
        }

        //the braces matter: without them only strtok is repeated and the
        //assignment runs once, after the loop, with j == col
        sp = strtok(line, ",");
        for (j = 0; j < col; j++) {
            if (sp == NULL) { //atof(NULL) would crash
                printf("Too few columns in row %d \n", i);
                exit(1);
            }
            arr[i][j] = atof(sp);
            sp = strtok(NULL, ",");
        }
        i++;
    }

    fclose(f);

    for (i = 0; i < row; i++)
        free(arr[i]);
    free(arr);

    return 0;
}


Upvotes: 0

Views: 366

Answers (1)

Craig Estey
Craig Estey

Reputation: 33601

There are a few issues ...

Putting arr on the stack can be problematic:

int row = 520241, col = 27;
double *arr[row];

Here, arr takes up 520241 * 8 bytes which is 4,161,928 bytes. Most archs have a default stack size of 4MB or 8MB. Your allocation is already right at the 4MB mark (before counting anything else on the stack), and the method doesn't scale to really large .csv files.

Also, pre-allocating for a fixed number of rows can be wasteful. Better to allow the array to grow dynamically in both dimensions.

You don't really have a two dimensional array as C defines them. You have a one dimensional array of pointers to double elements. This is how one has to define a 2D array in languages that don't have 2D arrays (e.g. python).

Note that you're using atof to decode numbers (on the return value of strtok). We can skip the strtok altogether if we use strtod to decode the numbers.

Creating a control struct for a dynamic 2D array can be helpful.

Anyway, here is some refactored code. It is annotated:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

// sysfault -- report a fatal error and terminate
//   Takes printf-style arguments (format string first), writes the message
//   to stderr so it cannot get mixed into data printed on stdout, then
//   exits with status 1.  Wrapped in do/while(0) so it behaves as a single
//   statement after an unbraced if.
#define sysfault(...) \
    do { \
        fprintf(stderr,__VA_ARGS__); \
        exit(1); \
    } while (0)

// arr_t -- control struct for a dynamically sized 2D array of doubles
typedef struct {
    int rowmax;                         // number of rows stored in data
    int colmax;                         // number of columns in every row
    double *data;                       // rowmax * colmax values, row after row
} arr_t;

// colcalc -- count the comma-separated fields in one line of csv text
//   A line with no commas has one field, so the result is always
//   (number of commas) + 1.
int
colcalc(const char *line)
{
    int fields = 1;

    for (const char *p = line;  *p != '\0';  ++p) {
        if (*p == ',')
            ++fields;
    }

    return fields;
}

// arrload -- read in array from .csv file
//
// Reads f one line at a time and decodes every comma-separated field with
// strtod.  The column count is taken from the first line; the row count is
// however many lines the file holds.  A field with no digits decodes as 0.
// Lines may end in "\n", "\r\n", or (last line only) nothing at all.
//
// Returns a calloc'ed arr_t whose data member is a realloc'ed buffer of
// rowmax * colmax doubles stored row after row.  For an empty file data
// stays NULL and rowmax/colmax stay 0.  The caller owns both allocations.
//
// Allocation failures and malformed lines are reported through sysfault,
// which terminates the program.
arr_t *
arrload(FILE *f)
{
    char line[10000];
    arr_t *arr;

    arr = calloc(1,sizeof(*arr));
    if (arr == NULL)
        sysfault("calloc failure -- %s\n",strerror(errno));

    size_t rowcap = 0;
    double *rowptr;
    double *newdata;

    while (fgets(line, sizeof(line), f) != NULL) {
        // a full buffer that does not end in a newline is only the first
        // part of a line -- unless it is the unterminated last line of the
        // file, which the getc probe detects
        size_t len = strlen(line);
        if (len == sizeof(line) - 1 && line[len - 1] != '\n' &&
            getc(f) != EOF)
            sysfault("line too long -- rowmax=%d\n",arr->rowmax);

        // dynamically calculate number of columns by probing the first row
        if (arr->colmax == 0)
            arr->colmax = colcalc(line);

        // expand the array size (doubling keeps the realloc count small)
        if ((size_t) arr->rowmax >= rowcap) {
            rowcap = (rowcap == 0) ? 100 : rowcap * 2;
            newdata = realloc(arr->data,
                sizeof(*arr->data) * rowcap * arr->colmax);
            if (newdata == NULL)
                sysfault("realloc failure -- %s\n",strerror(errno));
            arr->data = newdata;
        }

        // point to the current row we want to fill
        rowptr = &arr->data[(size_t) arr->rowmax * arr->colmax];

        char *cp = line;
        for (int colidx = 0;  colidx < arr->colmax;  ++colidx) {
            if (*cp == 0)
                sysfault("line too short -- rowmax=%d colidx=%d\n",
                    arr->rowmax,colidx);

            // decode the number
            rowptr[colidx] = strtod(cp,&cp);

            // check the trailing character
            switch (*cp) {
            case ',':  // middle of the line number
                ++cp;
                break;

            case '\n':  // last number on line
                ++cp;
                break;

            case '\r':  // last number on a CRLF-terminated line
                if (cp[1] != '\n' && cp[1] != 0)
                    sysfault("syntax error -- '%s'\n",cp);
                cp += (cp[1] == '\n') ? 2 : 1;
                break;

            case 0:  // broken last line (still okay)
                break;

            default:
                sysfault("syntax error -- '%s'\n",cp);
                break;
            }
        }

        ++arr->rowmax;
    }

    // trim array to actual size used.  With no rows there is nothing to
    // trim, and a zero-size realloc may return NULL, which would be
    // mistaken for a failure
    if (arr->rowmax > 0) {
        newdata = realloc(arr->data,
            sizeof(*arr->data) * arr->rowmax * arr->colmax);
        if (newdata == NULL)
            sysfault("realloc trim failure -- %s\n",strerror(errno));
        arr->data = newdata;
    }

    return arr;
}

// arrprt -- print the whole array, one text line per row
//   Every element is written as " %g"; each row is closed with a newline.
void
arrprt(const arr_t *arr)
{
    const int nrows = arr->rowmax;
    const int ncols = arr->colmax;
    const double *cur = arr->data;

    for (int r = 0;  r < nrows;  ++r) {
        // cur walks the flat buffer, so once ncols values are printed it
        // already sits on the first element of the next row
        for (const double *end = cur + ncols;  cur < end;  ++cur)
            printf(" %g",*cur);
        printf("\n");
    }
}

int
main(void)
{

    FILE *f;

    f = fopen("WethData.csv", "r");
    if (f == NULL)
        sysfault("unable to open file -- %s\n",strerror(errno));

    arr_t *arr = arrload(f);

    fclose(f);

    arrprt(arr);

    return 0;
}

UPDATE:

After going through your code I still have some uncertainties. For one the use of the arrprt function?

That function just loops through the array and prints the elements. As an example of array element access. More on this below.

For another your use of the "->" operator?

This is the arrow operator. It is basic C coding. There are other SO questions that detail this. But, I'd look at a good C book (e.g. K&R). Here are some simple examples. They all do the same thing (e.g. print the array element that has index 3):

// plain array: index it directly
double data[5];

printf("%g\n",data[3]);

// pointer to the first element: index it, or offset it and dereference
double *dptr = data;
printf("%g\n",dptr[3]);
printf("%g\n",*(dptr + 3));

// the same kind of array as a struct member
struct arr {
    double data[5];
};

// struct object: reach the member with "."
struct arr arr;
printf("%g\n",arr.data[3]);

// pointer to a struct: reach the member with "->"
struct arr *ap = &arr;
printf("%g\n",ap->data[3]);

And lastly how would I access a specific box in the array (e.g. row 300, col 5).

There are a few different ways to do this. I've improved my example code to show the different ways. You can define a few macros with the -D compiler option to get the different ways (e.g. -DUSEMACRO and/or -DUSEFNC).

You could also use cpp to get the macro output. I've added the USECPP as a convenience. So, to look at macro output (e.g.):

cpp -DUSECPP x.c > xstd.i
cpp -DUSECPP -DUSEMACRO x.c > xmac.i
cpp -DUSECPP -DUSEFNC x.c > xfnc.i
cpp -DUSECPP -DUSEMACRO -DUSEFNC x.c > xmacfnc.i

You can also add -DUSEPRT=n where n is 1-4

Anyway, here is the full code:

#ifndef USECPP
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#endif

// USECPP -- when defined, leave out the #includes and the colcalc, arrload
//   and main functions (meant for looking at preprocessor output)
//
// USEMACRO -- select access method
//   0 - index calculation in arrdata function
//   1 - index calculation in ARRDATA macro
//
// USEFNC -- select access method for arrprt2/arrprt3 functions
//   0 - arrprt2 finds the row end by pointer arithmetic,
//       arrprt3 uses the ARRDATA macro
//   1 - both call arrdata
//
// USEPRT -- select arrprt* function to use
//   1-4 - main calls arrprt1..arrprt4 (any other value prints nothing)
//   defaults to 1 when not already defined

#ifndef USEPRT
#define USEPRT      1
#endif

// sysfault -- report a fatal error and terminate
//   Takes printf-style arguments (format string first), writes the message
//   to stderr so it cannot get mixed into data printed on stdout, then
//   exits with status 1.  Wrapped in do/while(0) so it behaves as a single
//   statement after an unbraced if.
#define sysfault(...) \
    do { \
        fprintf(stderr,__VA_ARGS__); \
        exit(1); \
    } while (0)

// arr_t -- control struct for a dynamically sized 2D array of doubles
typedef struct {
    int rowmax;                         // number of rows stored in data
    int colmax;                         // number of columns in every row
    double *data;                       // rowmax * colmax values, row after row
} arr_t;

// ARRDATA -- access individual array elements
//   Expands to an lvalue for element (_rowidx,_colidx) of the arr_t that
//   _arr points to.  With USEMACRO set the flat index is computed inline
//   (note that _arr appears twice in that expansion); otherwise the index
//   calculation is left to the arrdata function.
#if USEMACRO
#define ARRDATA(_arr,_rowidx,_colidx) \
    (_arr)->data[((_rowidx) * (_arr)->colmax) + (_colidx)]
#else
#define ARRDATA(_arr,_rowidx,_colidx) \
    *arrdata(_arr,_rowidx,_colidx)
#endif

// arrdata -- point to given data address
//   Returns the address of element (rowidx,colidx) inside arr's flat data
//   buffer, where rows are stored one after another.  No bounds checking
//   is done: arrprt2 (with USEFNC set) passes colidx == arr->colmax to get
//   the address one past the end of a row.
static inline double *
arrdata(const arr_t *arr,int rowidx,int colidx)
{

#if USEMACRO
    // same calculation, spelled through the ARRDATA macro
    return &ARRDATA(arr,rowidx,colidx);
#else
    return &arr->data[(rowidx * arr->colmax) + colidx];
#endif
}

// ARRDEF -- define 2D array (_advanced_ usage)
//   Declares _sym as a pointer to rows of (_arr)->colmax doubles that
//   points at (_arr)->data, so elements can be written _sym[row][col].
//   The row length is only known at run time (a variably modified type)
//   and the cast relies on the __typeof__ compiler extension.
#define ARRDEF(_arr,_sym) \
    double (*_sym)[(_arr)->colmax] = (__typeof__(_sym)) (_arr)->data

// colcalc -- count the comma-separated fields in one line of csv text
//   Each comma found adds one field to the initial single field.
#ifndef USECPP
int
colcalc(const char *line)
{
    int fields = 1;
    const char *comma;

    // hop from comma to comma until strchr runs out of them
    for (comma = strchr(line,',');  comma != NULL;
        comma = strchr(comma + 1,','))
        fields += 1;

    return fields;
}
#endif

// arrload -- read in array from .csv file
//
// Reads f one line at a time and decodes every comma-separated field with
// strtod.  The column count is taken from the first line; the row count is
// however many lines the file holds.  A field with no digits decodes as 0.
// Lines may end in "\n", "\r\n", or (last line only) nothing at all.
//
// Returns a calloc'ed arr_t whose data member is a realloc'ed buffer of
// rowmax * colmax doubles stored row after row.  For an empty file data
// stays NULL and rowmax/colmax stay 0.  The caller owns both allocations.
//
// Allocation failures and malformed lines are reported through sysfault,
// which terminates the program.
#ifndef USECPP
arr_t *
arrload(FILE *f)
{
    char line[10000];
    arr_t *arr;

    arr = calloc(1,sizeof(*arr));
    if (arr == NULL)
        sysfault("calloc failure -- %s\n",strerror(errno));

    size_t rowcap = 0;
    double *rowptr;
    double *newdata;

    while (fgets(line, sizeof(line), f) != NULL) {
        // a full buffer that does not end in a newline is only the first
        // part of a line -- unless it is the unterminated last line of the
        // file, which the getc probe detects
        size_t len = strlen(line);
        if (len == sizeof(line) - 1 && line[len - 1] != '\n' &&
            getc(f) != EOF)
            sysfault("line too long -- rowmax=%d\n",arr->rowmax);

        // dynamically calculate number of columns by probing the first row
        if (arr->colmax == 0)
            arr->colmax = colcalc(line);

        // expand the array size (doubling keeps the realloc count small)
        if ((size_t) arr->rowmax >= rowcap) {
            rowcap = (rowcap == 0) ? 100 : rowcap * 2;
            newdata = realloc(arr->data,
                sizeof(*arr->data) * rowcap * arr->colmax);
            if (newdata == NULL)
                sysfault("realloc failure -- %s\n",strerror(errno));
            arr->data = newdata;
        }

        // point to the current row we want to fill
        rowptr = arrdata(arr,arr->rowmax,0);

        char *cp = line;
        for (int colidx = 0;  colidx < arr->colmax;  ++colidx) {
            if (*cp == 0)
                sysfault("line too short -- rowmax=%d colidx=%d\n",
                    arr->rowmax,colidx);

            // decode the number
            rowptr[colidx] = strtod(cp,&cp);

            // check the trailing character
            switch (*cp) {
            case ',':  // middle of the line number
                ++cp;
                break;

            case '\n':  // last number on line
                ++cp;
                break;

            case '\r':  // last number on a CRLF-terminated line
                if (cp[1] != '\n' && cp[1] != 0)
                    sysfault("syntax error -- '%s'\n",cp);
                cp += (cp[1] == '\n') ? 2 : 1;
                break;

            case 0:  // broken last line (still okay)
                break;

            default:
                sysfault("syntax error -- '%s'\n",cp);
                break;
            }
        }

        ++arr->rowmax;
    }

    // trim array to actual size used.  With no rows there is nothing to
    // trim, and a zero-size realloc may return NULL, which would be
    // mistaken for a failure
    if (arr->rowmax > 0) {
        newdata = realloc(arr->data,
            sizeof(*arr->data) * arr->rowmax * arr->colmax);
        if (newdata == NULL)
            sysfault("realloc trim failure -- %s\n",strerror(errno));
        arr->data = newdata;
    }

    return arr;
}
#endif

// arrprt1 -- print array (original)
//   Keeps a pointer to the start of the current row and indexes the
//   columns off it.  Each element is printed as " %g" and each row is
//   closed with a newline.
void
arrprt1(const arr_t *arr)
{
    const double *rowptr = arr->data;

    // rowptr moves forward by one full row (colmax elements) per iteration
    for (int rowidx = 0;  rowidx < arr->rowmax;  ++rowidx,
        rowptr += arr->colmax) {
        for (int colidx = 0;  colidx < arr->colmax;  ++colidx)
            printf(" %g",rowptr[colidx]);
        printf("\n");
    }
}

// arrprt2 -- print array (use arrdata function and pointers)
//   For every row, gets the row's start address from arrdata and then
//   walks a pointer up to the one-past-the-end address of that row.
//   Output format is the same as arrprt1: " %g" per element, newline per
//   row.
void
arrprt2(const arr_t *arr)
{
    const double *rowptr;
    const double *rowend;

    for (int rowidx = 0;  rowidx < arr->rowmax;  ++rowidx) {
        // point to start of the row
        rowptr = arrdata(arr,rowidx,0);

        // point to one past the end of the row (either of these work)
#if USEFNC
        rowend = arrdata(arr,rowidx,arr->colmax);
#else
        rowend = rowptr + arr->colmax;
#endif

        // rowend is only compared against, never dereferenced
        for (;  rowptr < rowend;  ++rowptr)
            printf(" %g",*rowptr);
        printf("\n");
    }
}

// arrprt3 -- print array (slow, use arrdata/ARRDATA each time)
//   Looks every element up from its (row,column) pair, so the flat index
//   is recomputed for each access.  USEFNC picks between calling arrdata
//   directly and going through the ARRDATA macro.
void
arrprt3(const arr_t *arr)
{

    for (int rowidx = 0;  rowidx < arr->rowmax;  ++rowidx) {
        for (int colidx = 0;  colidx < arr->colmax;  ++colidx) {
#if USEFNC
            printf(" %g",*arrdata(arr,rowidx,colidx));
#else
            printf(" %g",ARRDATA(arr,rowidx,colidx));
#endif
        }
        printf("\n");
    }
}

// arrprt4 -- print array (slow, use ARRDEF)
//   Uses ARRDEF to declare a local pointer-to-row named data over the
//   array's buffer, which allows ordinary data[row][col] subscripting.
void
arrprt4(const arr_t *arr)
{
    // local 2D view of arr->data; this "data" is not the struct member
    ARRDEF(arr,data);

    for (int rowidx = 0;  rowidx < arr->rowmax;  ++rowidx) {
        for (int colidx = 0;  colidx < arr->colmax;  ++colidx)
            printf(" %g",data[rowidx][colidx]);
        printf("\n");
    }
}

#ifndef USECPP
int
main(void)
{

    FILE *f;

    f = fopen("WethData.csv", "r");
    if (f == NULL)
        sysfault("unable to open file -- %s\n",strerror(errno));

    arr_t *arr = arrload(f);

    fclose(f);

    switch (USEPRT) {
    case 1:
        arrprt1(arr);
        break;
    case 2:
        arrprt2(arr);
        break;
    case 3:
        arrprt3(arr);
        break;
    case 4:
        arrprt4(arr);
        break;
    }

    return 0;
}
#endif

UPDATE #2:

Here are the outputs of the modified source, run through cpp from the cpp commands above.

Here is xstd.i:

typedef struct {
    int rowmax;
    int colmax;
    double *data;
} arr_t;

static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{

    return &arr->data[(rowidx * arr->colmax) + colidx];

}

void
arrprt1(const arr_t *arr)
{
    const double *rowptr = arr->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", rowptr[colidx]);
        printf("\n");
    }
}

void
arrprt2(const arr_t *arr)
{
    const double *rowptr;
    const double *rowend;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {

        rowptr = arrdata(arr, rowidx, 0);

        rowend = rowptr + arr->colmax;

        for (; rowptr < rowend; ++rowptr)
            printf(" %g", *rowptr);
        printf("\n");
    }
}

void
arrprt3(const arr_t *arr)
{

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx) {

            printf(" %g", *arrdata(arr, rowidx, colidx));

        }
        printf("\n");
    }
}

void
arrprt4(const arr_t *arr)
{
    double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", data[rowidx][colidx]);
        printf("\n");
    }
}

Here is xmac.i:

typedef struct {
    int rowmax;
    int colmax;
    double *data;
} arr_t;

static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{

    return &(arr)->data[((rowidx) * (arr)->colmax) + (colidx)];

}

void
arrprt1(const arr_t *arr)
{
    const double *rowptr = arr->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", rowptr[colidx]);
        printf("\n");
    }
}

void
arrprt2(const arr_t *arr)
{
    const double *rowptr;
    const double *rowend;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {

        rowptr = arrdata(arr, rowidx, 0);

        rowend = rowptr + arr->colmax;

        for (; rowptr < rowend; ++rowptr)
            printf(" %g", *rowptr);
        printf("\n");
    }
}

void
arrprt3(const arr_t *arr)
{

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx) {

            printf(" %g", (arr)->data[((rowidx) * (arr)->colmax) + (colidx)]);

        }
        printf("\n");
    }
}

void
arrprt4(const arr_t *arr)
{
    double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", data[rowidx][colidx]);
        printf("\n");
    }
}

Here is xfnc.i:

typedef struct {
    int rowmax;
    int colmax;
    double *data;
} arr_t;

static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{

    return &arr->data[(rowidx * arr->colmax) + colidx];

}

void
arrprt1(const arr_t *arr)
{
    const double *rowptr = arr->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", rowptr[colidx]);
        printf("\n");
    }
}

void
arrprt2(const arr_t *arr)
{
    const double *rowptr;
    const double *rowend;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {

        rowptr = arrdata(arr, rowidx, 0);

        rowend = arrdata(arr, rowidx, arr->colmax);

        for (; rowptr < rowend; ++rowptr)
            printf(" %g", *rowptr);
        printf("\n");
    }
}

void
arrprt3(const arr_t *arr)
{

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx) {

            printf(" %g", *arrdata(arr, rowidx, colidx));

        }
        printf("\n");
    }
}

void
arrprt4(const arr_t *arr)
{
    double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", data[rowidx][colidx]);
        printf("\n");
    }
}

Here is xmacfnc.i:

typedef struct {
    int rowmax;
    int colmax;
    double *data;
} arr_t;

static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{

    return &(arr)->data[((rowidx) * (arr)->colmax) + (colidx)];

}

void
arrprt1(const arr_t *arr)
{
    const double *rowptr = arr->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", rowptr[colidx]);
        printf("\n");
    }
}

void
arrprt2(const arr_t *arr)
{
    const double *rowptr;
    const double *rowend;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {

        rowptr = arrdata(arr, rowidx, 0);

        rowend = arrdata(arr, rowidx, arr->colmax);

        for (; rowptr < rowend; ++rowptr)
            printf(" %g", *rowptr);
        printf("\n");
    }
}

void
arrprt3(const arr_t *arr)
{

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx) {

            printf(" %g", *arrdata(arr, rowidx, colidx));

        }
        printf("\n");
    }
}

void
arrprt4(const arr_t *arr)
{
    double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;

    for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
        for (int colidx = 0; colidx < arr->colmax; ++colidx)
            printf(" %g", data[rowidx][colidx]);
        printf("\n");
    }
}

Upvotes: 2

Related Questions