user3820991
user3820991

Reputation: 2610

cython - convert string into integers and floats

I have a dataset which I read row by row in Cython. Each row is returned as a string. What I would like to do is to convert the string into an array of numbers (ints and floats) with length equal to the number of columns in each row (which is given by the delimiter ';').

For example

import pandas as pd
import numpy as np

# Build a small example frame: two columns of uniform random floats and two
# columns of random integers in [0, 10). np.c_ joins them into a single float
# array, which is why the integer columns show up as e.g. "1.0" in the file.
df = pd.DataFrame(
    np.c_[np.random.rand(3, 2), np.random.randint(0, 10, (3, 2))],
    columns=['a', 'b', 'c', 'd'],
)

filename = r'H:\mydata.csv'
# Pass the variable, not the string literal 'filename', so the CSV is written
# to the same path that the reading code opens afterwards.
df.to_csv(filename, sep=';', index=False)

Now I want to iterate randomly over the rows in cython and do some computations on each row.

import numpy as np
from readc_csv import row_pos, read_file_and_compute

filename = r'H:\mydata.csv'

# Start offset of every data row (the header is excluded); the trailing
# entry returned by row_pos is dropped.
row_position = row_pos(filename)[:-1]

# Draw every offset exactly once, in random order.
# NOTE(review): read_file_and_compute declares `int [:] rows`, so the array
# presumably has to hold C ints -- confirm the dtype NumPy picks here.
n_rows = len(row_position)
rows = np.random.choice(row_position, size=n_rows, replace=False)
read_file_and_compute(filename, rows)

The readc_csv.pyx file looks as follows

from libc.stdio cimport FILE, fopen, fgets, fclose, fseek, SEEK_SET, ftell
from libc.stdlib cimport malloc, free, atof
from libc.string cimport strlen, strtok
import numpy as np
cimport numpy as np

def row_pos(str filename):
    """Return the file offset reached after each line of ``filename``.

    The file is scanned once with ``fgets``; after every complete line the
    current ``ftell`` position is recorded. Entry ``k`` is therefore the
    offset at which line ``k + 1`` starts, and the last entry is the offset
    at the end of the file.

    Parameters
    ----------
    filename : str
        Path of the file to scan; it is encoded as UTF-8 before being passed
        to ``fopen``.

    Returns
    -------
    list
        One offset per line in the file, in file order.

    Raises
    ------
    IOError
        If the file cannot be opened.
    """
    filename_byte_string = filename.encode("UTF-8")

    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        size_t length
        bint partial = False
        list pos = []

    cfile = fopen(fname, "r")
    if cfile == NULL:
        # Without this check fgets would be handed a NULL stream.
        raise IOError("could not open file: %s" % filename)

    while fgets(line, sizeof(line), cfile) != NULL:
        length = strlen(line)
        # A line longer than the buffer arrives in several chunks; only the
        # chunk that ends in '\n' completes the line, so only then is the
        # current position the start of the next row.
        partial = length == 0 or line[length - 1] != b'\n'
        if not partial:
            pos.append(ftell(cfile))

    if partial:
        # Last line has no trailing newline: still record the end-of-file
        # offset so every line contributes exactly one entry.
        pos.append(ftell(cfile))

    fclose(cfile)

    return pos


def read_file_and_compute(str filename, int [:] rows):
    """Read the lines of ``filename`` that start at the given file offsets.

    For every entry of ``rows`` the file is positioned with ``fseek`` and one
    line is read with ``fgets`` into a 50-byte buffer. The per-row
    computation itself is still a placeholder.

    Parameters
    ----------
    filename : str
        Path of the file to read; it is encoded as UTF-8 before being passed
        to ``fopen``.
    rows : int[:]
        File offsets to seek to, visited in the order given.

    Raises
    ------
    IOError
        If the file cannot be opened, or if seeking to or reading at one of
        the offsets fails.
    """
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        Py_ssize_t j
        long offset
        Py_ssize_t n = rows.shape[0]

    cfile = fopen(fname, "r")
    if cfile == NULL:
        # Without this check fseek/fgets would be handed a NULL stream.
        raise IOError("could not open file: %s" % filename)

    # try/finally so the file is closed even when a row cannot be read.
    try:
        for j in range(n):
            offset = rows[j]
            if fseek(cfile, offset, SEEK_SET) != 0:
                raise IOError("could not seek to offset %d" % offset)
            if fgets(line, sizeof(line), cfile) == NULL:
                # Otherwise the buffer would still hold the previous row.
                raise IOError("could not read a row at offset %d" % offset)

            # line is now e.g.
            # '0.659933520847;0.471779123704;1.0;2.0\n'
            # I want to convert it into an array with 4 elements
            # each element corresponding to one of the numbers we
            # see in the string
            # and do some computations
    finally:
        fclose(cfile)

    return

(Note: The Cython code is not yet optimized.) Background information: This is part of a script I want to write for stochastic gradient descent on a data set that is too large to be read into memory. I want to perform the inner loop over the randomly ordered samples in Cython. Hence I need to be able to read the data from a given row in a csv-file in Cython.

Upvotes: 1

Views: 2274

Answers (1)

user3820991
user3820991

Reputation: 2610

I found the c-functions strtok and atof which can be imported from libc.string and libc.stdlib. They do the trick.

Continuing the above example, the read_file_and_compute function could then look something like this

def read_file_and_compute(str filename, int [:] rows, int col_n):
    """Read selected rows of a ';'-separated file and parse them as doubles.

    For every entry of ``rows`` the file is positioned with ``fseek``, one
    line is read with ``fgets`` into a 50-byte buffer, split with ``strtok``
    and each field converted with ``atof`` into a scratch array of ``col_n``
    doubles. The computation on the parsed row is still a placeholder.

    Parameters
    ----------
    filename : str
        Path of the file to read; it is encoded as UTF-8 before being passed
        to ``fopen``.
    rows : int[:]
        File offsets to seek to, visited in the order given.
    col_n : int
        Number of fields to parse per row; further fields are ignored.

    Raises
    ------
    ValueError
        If ``col_n`` is negative, or a row holds fewer than ``col_n`` fields.
    MemoryError
        If the scratch array cannot be allocated.
    IOError
        If the file cannot be opened, or if seeking to or reading at one of
        the offsets fails.
    """
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        char *token
        double *col
        Py_ssize_t j
        int count
        long offset
        Py_ssize_t n = rows.shape[0]

    if col_n < 0:
        raise ValueError("col_n must be non-negative, got %d" % col_n)

    # Allocate at least one slot: malloc(0) may legitimately return NULL,
    # which would be indistinguishable from an allocation failure.
    col = <double *>malloc(max(col_n, 1) * sizeof(double))
    if col == NULL:
        raise MemoryError()

    cfile = fopen(fname, "r")
    if cfile == NULL:
        free(col)
        raise IOError("could not open file: %s" % filename)

    # try/finally so the file and the scratch array are released even when a
    # row cannot be read or parsed.
    try:
        for j in range(n):
            offset = rows[j]
            if fseek(cfile, offset, SEEK_SET) != 0:
                raise IOError("could not seek to offset %d" % offset)
            if fgets(line, sizeof(line), cfile) == NULL:
                # Otherwise the buffer would still hold the previous row.
                raise IOError("could not read a row at offset %d" % offset)

            # Split at ';' and at the trailing newline; the same delimiter
            # set is used for the first and all following tokens.
            token = strtok(line, ";\n")
            count = 0
            while token != NULL and count < col_n:
                col[count] = atof(token)  # converts the field to a double
                token = strtok(NULL, ";\n")
                count += 1

            if count < col_n:
                # The remaining slots would still hold the previous row.
                raise ValueError(
                    "row at offset %d has %d fields, expected %d"
                    % (offset, count, col_n)
                )

            # now do some computations on col ...
    finally:
        fclose(cfile)
        free(col)

    return

There are more functions for conversion of strings into different types, see here.

Upvotes: 2

Related Questions