Reputation: 1
I have recently switched to C due to its exceptional computing speed. I am currently trying to import a very large CSV table (500,000 x 27) into an array, but am facing major difficulties assigning the value of each imported cell to the respective array cell. I hope the following code helps in understanding my problem. I would be glad for any help!
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Load WethData.csv into a table of doubles.
 *
 * The file is expected to hold at most `row` lines, each with `col`
 * comma-separated numbers. The program exits with status 1 if memory cannot
 * be allocated, the file cannot be opened, a line has too few fields, or the
 * file has more than `row` lines. Returns 0 on success.
 */
int main(void) {
    int row = 520241, col = 27, j = 0, i = 0;

    /* Keep the table of row pointers on the heap: at roughly 4 MB it is too
       large to place safely on the stack. */
    double **arr = malloc((size_t)row * sizeof *arr);
    if (arr == NULL) {
        printf("Out of memory \n");
        exit(1);
    }
    for (i = 0; i < row; i++) {
        arr[i] = malloc((size_t)col * sizeof *arr[i]);
        if (arr[i] == NULL) {
            printf("Out of memory \n");
            exit(1);
        }
    }

    FILE *f;
    f = fopen("WethData.csv","r");
    if (f == NULL) { //control
        printf("Unable to open file \n");
        exit(1);
    }

    char line[485]; //a longer line is split by fgets and then parsed wrongly
    char *sp; //string pointer

    i = 0; //loop to store every row
    while (fgets(line, sizeof line, f) != NULL) {
        //control: check BEFORE storing, otherwise arr[row] is written out of bounds
        if (i >= row) {
            puts("Row count exceeded");
            exit(1); //stops loop
        }

        sp = strtok(line, ",");
        //the braces are essential: without them only the strtok call belongs
        //to the loop and a single value gets stored past the end of the row
        for (j = 0; j < col; j++) {
            if (sp == NULL) { //fewer than col fields on this line
                printf("Line %d has fewer than %d columns \n", i + 1, col);
                exit(1);
            }
            arr[i][j] = atof(sp);
            sp = strtok(NULL, ",");
        }
        i++;
    }
    fclose(f);

    for (i = 0; i < row; i++)
        free(arr[i]);
    free(arr);
    return 0;
}
Upvotes: 0
Views: 366
Reputation: 33601
There are a few issues ...
Putting arr
on the stack can be problematic:
int row = 520241, col = 27;
double *arr[row];
Here, arr
takes up 520241 * 8
bytes which is 4,161,928 bytes (just under 4MB). Most archs have a default stack size of 4MB or 8MB. While your allocation is (barely) below this, the method doesn't scale to really large .csv
files.
Also, pre-allocating for a fixed number of rows can be wasteful. Better to allow the array to grow dynamically in both dimensions.
You don't really have a two dimensional array as C defines them. You have a one dimensional array of pointers to double
elements. This is how one has to define a 2D array in languages that don't have 2D arrays (e.g. python
).
Note that you're using atof
to decode numbers (on the return value of strtok
). We can skip the strtok
altogether if we use strtod
to decode the numbers.
Creating a control struct for a dynamic 2D array can be helpful.
Anyway, here is some refactored code. It is annotated:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
// sysfault -- report a fatal error and terminate
// Takes printf-style arguments (format string first), writes the message to
// stderr so it cannot get mixed into the array output on stdout, then exits
// with status 1. Uses the standard __VA_ARGS__ form of variadic macro, and is
// wrapped in do/while(0) so it behaves as a single statement after an if.
#define sysfault(...) \
    do { \
        fprintf(stderr,__VA_ARGS__); \
        exit(1); \
    } while (0)
// arr_t -- control struct for a dynamically sized 2D array of doubles
typedef struct {
// number of rows stored so far (arrload adds one per line it reads)
int rowmax;
// number of columns per row (arrload takes it from the first line via colcalc)
int colmax;
// flat buffer holding the rows back to back: row r starts at data[r * colmax]
double *data;
} arr_t;
// colcalc -- number of comma separated columns in one line of a csv file
// The result is one more than the number of ',' characters, so a line
// without any comma (even an empty one) counts as a single column.
int
colcalc(const char *line)
{
    int ncols = 1;

    // hop from comma to comma; each one found starts another column
    for (const char *comma = strchr(line,','); comma != NULL;
        comma = strchr(comma + 1,','))
        ++ncols;

    return ncols;
}
// arrload -- read in array from .csv file
//
// Reads f line by line until fgets returns NULL. The number of columns is
// taken from the first line (via colcalc); every line must then supply at
// least that many numeric fields, and any extra fields are ignored. Lines may
// end in "\n" or "\r\n", and the last line may lack a newline.
//
// Returns a calloc'ed arr_t whose data buffer holds rowmax * colmax doubles,
// rows stored back to back. For a file with no lines rowmax is 0 and data is
// NULL. The caller owns both allocations (free arr->data, then arr).
// Allocation failures and malformed input are reported through sysfault.
arr_t *
arrload(FILE *f)
{
    char line[10000];
    arr_t *arr;

    arr = calloc(1,sizeof(*arr));
    if (arr == NULL)
        sysfault("calloc failure -- %s\n",strerror(errno));

    int rowcap = 0;
    double *rowptr;
    double *newdata;

    while (fgets(line, sizeof(line), f) != NULL) {
        // dynamically calculate number of columns by probing the first row
        if (arr->colmax == 0)
            arr->colmax = colcalc(line);

        // expand the array size -- double the capacity each time so the
        // number of reallocations stays small even for very large files
        if (arr->rowmax >= rowcap) {
            rowcap = (rowcap == 0) ? 100 : rowcap * 2;
            // keep the old pointer until realloc is known to have succeeded
            newdata = realloc(arr->data,
                sizeof(*arr->data) * rowcap * arr->colmax);
            if (newdata == NULL)
                sysfault("realloc failure -- %s\n",strerror(errno));
            arr->data = newdata;
        }

        // point to the current row we want to fill
        rowptr = &arr->data[(size_t) arr->rowmax * arr->colmax];

        char *cp = line;
        for (int colidx = 0; colidx < arr->colmax; ++colidx) {
            if (*cp == 0)
                sysfault("line too short -- rowmax=%d colidx=%d\n",
                    arr->rowmax,colidx);

            // decode the number (cp is advanced past what was consumed)
            rowptr[colidx] = strtod(cp,&cp);

            // accept DOS style "\r\n" line endings
            if ((cp[0] == '\r') && (cp[1] == '\n'))
                ++cp;

            // check the trailing character
            switch (*cp) {
            case ',':  // middle of the line number
                ++cp;
                break;

            case '\n':  // last number on line
                ++cp;
                break;

            case 0:  // broken last line (still okay)
                break;

            default:
                sysfault("syntax error -- '%s'\n",cp);
                break;
            }
        }

        ++arr->rowmax;
    }

    // trim array to actual size used -- skipped when no row was read, because
    // realloc with a size of zero may return NULL without that being an error
    if (arr->rowmax > 0) {
        newdata = realloc(arr->data,
            sizeof(*arr->data) * arr->rowmax * arr->colmax);
        if (newdata == NULL)
            sysfault("realloc trim failure -- %s\n",strerror(errno));
        arr->data = newdata;
    }

    return arr;
}
// arrprt -- dump the whole array to stdout, one text line per row
// Every element is written with " %g", so each value is preceded by a space.
void
arrprt(const arr_t *arr)
{
    const int nrows = arr->rowmax;
    const int ncols = arr->colmax;

    // rows are stored back to back, so one pointer can walk the whole buffer
    const double *cell = arr->data;

    int r = 0;
    while (r < nrows) {
        for (int c = 0; c < ncols; ++c, ++cell)
            printf(" %g",*cell);
        printf("\n");
        ++r;
    }
}
int
main(void)
{
FILE *f;
f = fopen("WethData.csv", "r");
if (f == NULL)
sysfault("unable to open file -- %s\n",strerror(errno));
arr_t *arr = arrload(f);
fclose(f);
arrprt(arr);
return 0;
}
UPDATE:
After going through your code I still have some uncertainties. For one the use of the arrprt function?
That function just loops through the array and prints the elements. As an example of array element access. More on this below.
For another, your use of the "->" operator?
This is the arrow operator. It is basic C coding. There are other SO questions that detail this. But, I'd look at a good C book (e.g. K&R). Here are some simple examples. They all do the same thing (e.g. print the array element that has index 3):
double data[5];
printf("%g\n",data[3]);
double *dptr = data;
printf("%g\n",dptr[3]);
printf("%g\n",*(dptr + 3));
struct arr {
double data[5];
};
struct arr arr;
printf("%g\n",arr.data[3]);
struct arr *ap = &arr;
printf("%g\n",ap->data[3]);
And lastly how would I access a specific box in the array (e.g. row 300, col 5).
There are a few different ways to do this. I've improved my example code to show the different ways. You can define a few macros with the -D
compiler option to get the different ways (e.g. -DUSEMACRO
and/or -DUSEFNC
).
You could also use cpp
to get the macro output. I've added the USECPP
as a convenience. So, to look at macro output (e.g.):
cpp -DUSECPP x.c > xstd.i
cpp -DUSECPP -DUSEMACRO x.c > xmac.i
cpp -DUSECPP -DUSEFNC x.c > xfnc.i
cpp -DUSECPP -DUSEMACRO -DUSEFNC x.c > xmacfnc.i
You can also add -DUSEPRT=n
where n
is 1-4
Anyway, here is the full code:
#ifndef USECPP
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#endif
// USEMACRO -- select access method
// 0 - index calculation in arrdata function
// 1 - index calculation in ARRDATA macro
//
// USEFNC -- select access method for arrprt2/arrprt3 functions
// 0 - use ARRDATA macro
// 1 - use arrdata
//
// USEPRT -- select arrprt* function to use
// 1-4 - call arrprt1..arrprt4 (default is 1); any other value prints nothing
#ifndef USEPRT
#define USEPRT 1
#endif
// sysfault -- report a fatal error and terminate
// Takes printf-style arguments (format string first), writes the message to
// stderr so it cannot get mixed into the array output on stdout, then exits
// with status 1. Uses the standard __VA_ARGS__ form of variadic macro, and is
// wrapped in do/while(0) so it behaves as a single statement after an if.
#define sysfault(...) \
    do { \
        fprintf(stderr,__VA_ARGS__); \
        exit(1); \
    } while (0)
// arr_t -- control struct for a dynamically sized 2D array of doubles
typedef struct {
// number of rows stored so far (arrload adds one per line it reads)
int rowmax;
// number of columns per row (arrload takes it from the first line via colcalc)
int colmax;
// flat buffer holding the rows back to back: row r starts at data[r * colmax]
double *data;
} arr_t;
// ARRDATA -- access individual array elements
// Expands to an lvalue for element (_rowidx,_colidx) of the arr_t that _arr
// points to. With USEMACRO set, the offset (_rowidx * colmax) + _colidx is
// computed inline and _arr is evaluated twice; otherwise the macro
// dereferences the pointer returned by arrdata.
// NOTE(review): neither expansion is wrapped in outer parentheses, so be
// careful when combining ARRDATA(...) with other operators.
#if USEMACRO
#define ARRDATA(_arr,_rowidx,_colidx) \
(_arr)->data[((_rowidx) * (_arr)->colmax) + (_colidx)]
#else
#define ARRDATA(_arr,_rowidx,_colidx) \
*arrdata(_arr,_rowidx,_colidx)
#endif
// arrdata -- point to given data address
// Returns the address of element (rowidx,colidx), i.e. arr->data offset by
// (rowidx * arr->colmax) + colidx. The indexes are not range checked; arrprt2
// passes colidx == arr->colmax to obtain a one-past-the-end-of-row pointer.
// Both preprocessor branches compute the same address: with USEMACRO the
// index arithmetic comes from ARRDATA, otherwise it is written out here.
static inline double *
arrdata(const arr_t *arr,int rowidx,int colidx)
{
#if USEMACRO
return &ARRDATA(arr,rowidx,colidx);
#else
return &arr->data[(rowidx * arr->colmax) + colidx];
#endif
}
// ARRDEF -- define 2D array (_advanced_ usage)
// Declares _sym as a pointer to rows of (_arr)->colmax doubles and aims it at
// (_arr)->data, so that elements can be written as _sym[rowidx][colidx]
// (see arrprt4). The row length is only known at run time, so this is a
// pointer to a variable length array.
// NOTE: __typeof__ is a compiler extension (GCC/Clang), not ISO C11.
#define ARRDEF(_arr,_sym) \
double (*_sym)[(_arr)->colmax] = (__typeof__(_sym)) (_arr)->data
// colcalc -- get number of columns in csv file
#ifndef USECPP
// Returns the column count of one csv line: the number of ',' characters
// plus one, so a line without any comma (even an empty one) yields 1.
int
colcalc(const char *line)
{
    int commas = 0;

    // scan every character up to the terminating NUL
    for (; *line != '\0'; ++line) {
        if (*line == ',')
            ++commas;
    }

    return commas + 1;
}
#endif
// arrload -- read in array from .csv file
#ifndef USECPP
// Reads f line by line until fgets returns NULL. The number of columns is
// taken from the first line (via colcalc); every line must then supply at
// least that many numeric fields, and any extra fields are ignored. Lines may
// end in "\n" or "\r\n", and the last line may lack a newline.
//
// Returns a calloc'ed arr_t whose data buffer holds rowmax * colmax doubles,
// rows stored back to back. For a file with no lines rowmax is 0 and data is
// NULL. The caller owns both allocations (free arr->data, then arr).
// Allocation failures and malformed input are reported through sysfault.
arr_t *
arrload(FILE *f)
{
    char line[10000];
    arr_t *arr;

    arr = calloc(1,sizeof(*arr));
    if (arr == NULL)
        sysfault("calloc failure -- %s\n",strerror(errno));

    int rowcap = 0;
    double *rowptr;
    double *newdata;

    while (fgets(line, sizeof(line), f) != NULL) {
        // dynamically calculate number of columns by probing the first row
        if (arr->colmax == 0)
            arr->colmax = colcalc(line);

        // expand the array size -- double the capacity each time so the
        // number of reallocations stays small even for very large files
        if (arr->rowmax >= rowcap) {
            rowcap = (rowcap == 0) ? 100 : rowcap * 2;
            // keep the old pointer until realloc is known to have succeeded
            newdata = realloc(arr->data,
                sizeof(*arr->data) * rowcap * arr->colmax);
            if (newdata == NULL)
                sysfault("realloc failure -- %s\n",strerror(errno));
            arr->data = newdata;
        }

        // point to the current row we wish to fill
#if 0
        rowptr = &arr->data[arr->rowmax * arr->colmax];
#else
        rowptr = arrdata(arr,arr->rowmax,0);
#endif

        char *cp = line;
        for (int colidx = 0; colidx < arr->colmax; ++colidx) {
            if (*cp == 0)
                sysfault("line too short -- rowmax=%d colidx=%d\n",
                    arr->rowmax,colidx);

            // decode the number (cp is advanced past what was consumed)
            rowptr[colidx] = strtod(cp,&cp);

            // accept DOS style "\r\n" line endings
            if ((cp[0] == '\r') && (cp[1] == '\n'))
                ++cp;

            // check the trailing character
            switch (*cp) {
            case ',':  // middle of the line number
                ++cp;
                break;

            case '\n':  // last number on line
                ++cp;
                break;

            case 0:  // broken last line (still okay)
                break;

            default:
                sysfault("syntax error -- '%s'\n",cp);
                break;
            }
        }

        ++arr->rowmax;
    }

    // trim array to actual size used -- skipped when no row was read, because
    // realloc with a size of zero may return NULL without that being an error
    if (arr->rowmax > 0) {
        newdata = realloc(arr->data,
            sizeof(*arr->data) * arr->rowmax * arr->colmax);
        if (newdata == NULL)
            sysfault("realloc trim failure -- %s\n",strerror(errno));
        arr->data = newdata;
    }

    return arr;
}
#endif
// arrprt1 -- print array (original)
// Writes the array to stdout, one text line per row, each element as " %g".
// Element access is done with a row pointer plus a column index.
void
arrprt1(const arr_t *arr)
{
// rowptr points at the first element of the row being printed
const double *rowptr = arr->data;
// the outer loop advances rowptr by one full row (colmax elements) per pass
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx,
rowptr += arr->colmax) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g",rowptr[colidx]);
printf("\n");
}
}
// arrprt2 -- print array (use arrdata function and pointers)
// Writes the array to stdout, one text line per row, each element as " %g".
// Each row is walked with a pointer from its first element up to (but not
// including) a one-past-the-end pointer.
void
arrprt2(const arr_t *arr)
{
const double *rowptr;
const double *rowend;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
// point to start of the row
rowptr = arrdata(arr,rowidx,0);
// point to one past the end of the row (either of these work)
// USEFNC selects whether the end pointer comes from arrdata or from
// plain pointer arithmetic on rowptr
#if USEFNC
rowend = arrdata(arr,rowidx,arr->colmax);
#else
rowend = rowptr + arr->colmax;
#endif
// rowend is only compared against, never dereferenced
for (; rowptr < rowend; ++rowptr)
printf(" %g",*rowptr);
printf("\n");
}
}
// arrprt3 -- print array (slow, use arrdata/ARRDATA each time)
// Writes the array to stdout, one text line per row, each element as " %g".
// The full (rowidx,colidx) offset is recomputed for every single element.
void
arrprt3(const arr_t *arr)
{
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx) {
// USEFNC picks the arrdata function call, otherwise the ARRDATA macro is used
#if USEFNC
printf(" %g",*arrdata(arr,rowidx,colidx));
#else
printf(" %g",ARRDATA(arr,rowidx,colidx));
#endif
}
printf("\n");
}
}
// arrprt4 -- print array (slow, use ARRDEF)
// Writes the array to stdout, one text line per row, each element as " %g".
// Shows true 2D subscripting on top of the flat buffer.
void
arrprt4(const arr_t *arr)
{
// declares the local "data" as a pointer to rows of arr->colmax doubles
// that refers to arr->data
ARRDEF(arr,data);
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g",data[rowidx][colidx]);
printf("\n");
}
}
#ifndef USECPP
int
main(void)
{
FILE *f;
f = fopen("WethData.csv", "r");
if (f == NULL)
sysfault("unable to open file -- %s\n",strerror(errno));
arr_t *arr = arrload(f);
fclose(f);
switch (USEPRT) {
case 1:
arrprt1(arr);
break;
case 2:
arrprt2(arr);
break;
case 3:
arrprt3(arr);
break;
case 4:
arrprt4(arr);
break;
}
return 0;
}
#endif
UPDATE #2:
Here are the outputs of the modified source, run through cpp
from the cpp
commands above.
Here is xstd.i
:
typedef struct {
int rowmax;
int colmax;
double *data;
} arr_t;
static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{
return &arr->data[(rowidx * arr->colmax) + colidx];
}
void
arrprt1(const arr_t *arr)
{
const double *rowptr = arr->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", rowptr[colidx]);
printf("\n");
}
}
void
arrprt2(const arr_t *arr)
{
const double *rowptr;
const double *rowend;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
rowptr = arrdata(arr, rowidx, 0);
rowend = rowptr + arr->colmax;
for (; rowptr < rowend; ++rowptr)
printf(" %g", *rowptr);
printf("\n");
}
}
void
arrprt3(const arr_t *arr)
{
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx) {
printf(" %g", *arrdata(arr, rowidx, colidx));
}
printf("\n");
}
}
void
arrprt4(const arr_t *arr)
{
double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", data[rowidx][colidx]);
printf("\n");
}
}
Here is xmac.i
:
typedef struct {
int rowmax;
int colmax;
double *data;
} arr_t;
static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{
return &(arr)->data[((rowidx) * (arr)->colmax) + (colidx)];
}
void
arrprt1(const arr_t *arr)
{
const double *rowptr = arr->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", rowptr[colidx]);
printf("\n");
}
}
void
arrprt2(const arr_t *arr)
{
const double *rowptr;
const double *rowend;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
rowptr = arrdata(arr, rowidx, 0);
rowend = rowptr + arr->colmax;
for (; rowptr < rowend; ++rowptr)
printf(" %g", *rowptr);
printf("\n");
}
}
void
arrprt3(const arr_t *arr)
{
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx) {
printf(" %g", (arr)->data[((rowidx) * (arr)->colmax) + (colidx)]);
}
printf("\n");
}
}
void
arrprt4(const arr_t *arr)
{
double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", data[rowidx][colidx]);
printf("\n");
}
}
Here is xfnc.i
:
typedef struct {
int rowmax;
int colmax;
double *data;
} arr_t;
static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{
return &arr->data[(rowidx * arr->colmax) + colidx];
}
void
arrprt1(const arr_t *arr)
{
const double *rowptr = arr->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", rowptr[colidx]);
printf("\n");
}
}
void
arrprt2(const arr_t *arr)
{
const double *rowptr;
const double *rowend;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
rowptr = arrdata(arr, rowidx, 0);
rowend = arrdata(arr, rowidx, arr->colmax);
for (; rowptr < rowend; ++rowptr)
printf(" %g", *rowptr);
printf("\n");
}
}
void
arrprt3(const arr_t *arr)
{
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx) {
printf(" %g", *arrdata(arr, rowidx, colidx));
}
printf("\n");
}
}
void
arrprt4(const arr_t *arr)
{
double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", data[rowidx][colidx]);
printf("\n");
}
}
Here is xmacfnc.i
:
typedef struct {
int rowmax;
int colmax;
double *data;
} arr_t;
static inline double *
arrdata(const arr_t *arr, int rowidx, int colidx)
{
return &(arr)->data[((rowidx) * (arr)->colmax) + (colidx)];
}
void
arrprt1(const arr_t *arr)
{
const double *rowptr = arr->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx, rowptr += arr->colmax) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", rowptr[colidx]);
printf("\n");
}
}
void
arrprt2(const arr_t *arr)
{
const double *rowptr;
const double *rowend;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
rowptr = arrdata(arr, rowidx, 0);
rowend = arrdata(arr, rowidx, arr->colmax);
for (; rowptr < rowend; ++rowptr)
printf(" %g", *rowptr);
printf("\n");
}
}
void
arrprt3(const arr_t *arr)
{
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx) {
printf(" %g", *arrdata(arr, rowidx, colidx));
}
printf("\n");
}
}
void
arrprt4(const arr_t *arr)
{
double (*data)[(arr)->colmax] = (__typeof__(data)) (arr)->data;
for (int rowidx = 0; rowidx < arr->rowmax; ++rowidx) {
for (int colidx = 0; colidx < arr->colmax; ++colidx)
printf(" %g", data[rowidx][colidx]);
printf("\n");
}
}
Upvotes: 2