Reputation: 2610
I have a dataset that I read row by row in Cython. Each row is returned as a string. I would like to convert that string into an array of numbers (ints and floats) whose length equals the number of columns in the row, where the columns are separated by the delimiter ';'.
For example:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.c_[np.random.rand(3,2), np.random.randint(0,10,(3,2))], columns=['a','b','c','d'])
filename = r'H:\mydata.csv'
df.to_csv(filename, sep=';', index=False)
Now I want to iterate over the rows in random order in Cython and do some computations on each row.
import numpy as np
from readc_csv import row_pos, read_file_and_compute

filename = r'H:\mydata.csv'
# positions of the start of each data row in the file, excluding the header
# (the last entry is the end-of-file position, hence the [:-1])
row_position = row_pos(filename)[:-1]
# shuffle the row positions and cast to C int to match the int[:] signature
rows = np.random.choice(row_position, size=len(row_position), replace=False).astype(np.intc)
read_file_and_compute(filename, rows)
The readc_csv.pyx file looks as follows
from libc.stdio cimport FILE, fopen, fgets, fclose, fseek, SEEK_SET, ftell
import numpy as np
cimport numpy as np

def row_pos(str filename):
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        list pos = []
    cfile = fopen(fname, "r")
    # ftell after each fgets gives the offset of the *next* line, so pos holds
    # the start of every data row (the header has just been read past) plus,
    # as its last entry, the end-of-file position
    while fgets(line, 50, cfile) != NULL:
        pos.append(ftell(cfile))
    fclose(cfile)
    return pos
def read_file_and_compute(str filename, int [:] rows):
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        size_t j
        int n = rows.shape[0]
    cfile = fopen(fname, "r")
    for j in range(n):
        r = rows[j]
        fseek(cfile, r, SEEK_SET)
        fgets(line, 50, cfile)
        # line is now e.g.
        # '0.659933520847;0.471779123704;1.0;2.0\n'
        # I want to convert it into an array with 4 elements,
        # each element corresponding to one of the numbers we
        # see in the string, and do some computations
    fclose(cfile)
    return
(Note: the Cython code is not yet optimized.) Background information: this is part of a script I want to write for stochastic gradient descent on a data set that is too large to be read into memory. I want to perform the inner loop over the randomly ordered samples in Cython, hence I need to be able to read the data from a given row of a CSV file in Cython.
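For context, the per-row computation would be an SGD update. A toy sketch of the kind of step I have in mind (plain Python, a made-up linear least-squares model; w, lr and sgd_step are placeholders, the real update would sit where the comment inside read_file_and_compute is):
import numpy as np

w = np.zeros(3)   # placeholder weights: columns a, b, c as features
lr = 0.01         # placeholder learning rate

def sgd_step(row_values):
    # row_values is one parsed row, e.g. [a, b, c, d], with column d as the target
    global w
    x = np.asarray(row_values[:-1])
    y = row_values[-1]
    grad = (w @ x - y) * x   # gradient of 0.5*(w.x - y)**2 with respect to w
    w -= lr * grad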
Upvotes: 1
Views: 2274
Reputation: 2610
I found the C functions strtok and atof, which can be cimported from libc.string and libc.stdlib respectively. They do the trick. Continuing the above example, the read_file_and_compute function could then look something like this:
# additional cimports needed on top of those in readc_csv.pyx above
from libc.stdlib cimport malloc, free, atof
from libc.string cimport strtok

def read_file_and_compute(str filename, int [:] rows, int col_n):
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        char *token
        double *col = <double *>malloc(col_n * sizeof(double))
        size_t j, i
        int count
        double num
        int n = rows.shape[0]
    cfile = fopen(fname, "r")
    for j in range(n):
        r = rows[j]
        fseek(cfile, r, SEEK_SET)
        fgets(line, 50, cfile)
        token = strtok(line, b';')  # splits the string at the delimiter ';'
        count = 0
        while token != NULL and count < col_n:
            num = atof(token)       # converts the token into a double
            col[count] = num
            token = strtok(NULL, b';\n')
            count += 1
        # now do some computations on col ...
    fclose(cfile)
    free(col)
    return
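The call from Python then needs the row positions as C ints plus the number of columns. A minimal usage sketch, assuming the extension is built on the fly with pyximport (a regular setup.py with cythonize works just as well):
import numpy as np
import pyximport
pyximport.install(setup_args={'include_dirs': np.get_include()})

from readc_csv import row_pos, read_file_and_compute

filename = r'H:\mydata.csv'
row_position = row_pos(filename)[:-1]
rows = np.random.choice(row_position, size=len(row_position), replace=False).astype(np.intc)
read_file_and_compute(filename, rows, 4)  # the example file has 4 columns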
There are more C functions for converting strings into other types; see here.
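For instance, atoi and strtol (integers) and strtod (doubles, with an end pointer telling you where parsing stopped) can be cimported the same way. A small sketch of how they behave in a .pyx file:
from libc.stdlib cimport atoi, strtol, strtod

def conversion_examples():
    cdef:
        char* int_text = b"42"
        char* float_text = b"3.14;rest"
        char* end
        int i
        long l
        double d
    i = atoi(int_text)              # string -> int, no error reporting
    l = strtol(int_text, NULL, 10)  # string -> long, base 10
    d = strtod(float_text, &end)    # string -> double; end now points at the ';'
    return i, l, d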
Upvotes: 2