Reputation: 2045
I have a dataset like the one below (rows 0–5 shown); every row has a different number of elements, e.g. 2 or 3:
whole milk,margarine
yogurt,brown bread,coffee
pork,yogurt,coffee
bottled water,bottled beer
whole milk
salty snack
I have tried
gro_arr = np.genfromtxt("groceries.csv", dtype=str, delimiter=",")
but it shows the following error:
Some errors were detected !
Line #2 (got 3 columns instead of 4)
Line #3 (got 1 columns instead of 4)
Line #6 (got 5 columns instead of 4)
Line #7 (got 1 columns instead of 4)
How can I solve this?
Upvotes: 1
Views: 68
Reputation: 7353
You can read in the lines in two different ways:
I have made a class with the necessary methods to do this in a few lines, so you can use it as-is with hardly any changes. The usage example and its output come first; the class definition follows.
# Sample input: one basket per line, with a varying number of items per line.
sample_text = """whole milk,margarine
yogurt,brown bread,coffee
pork,yogurt,coffee
bottled water,bottled beer
whole milk
salty snack
"""

converter = TextToColumns(
    file='source.txt',
    sep=',',
    text2columns=True,
    savedata=False,
    usedummydata=True,
    dummydata=sample_text,
)

# Write the sample text to 'source.txt' so there is a file to read back.
converter.make_dummydata()

# Read the file and convert it to an array (result shown under "Output").
converter.toarray()
Output:
array([['whole milk', 'margarine', ''],
['yogurt', 'brown bread', 'coffee'],
['pork', 'yogurt', 'coffee'],
['bottled water', 'bottled beer', ''],
['whole milk', '', ''],
['salty snack', '', '']], dtype='<U13')
import numpy as np
import os
class TextToColumns(object):
    """Read delimited text and convert it into columns.

    Each non-blank line of the file is split on a user-specified separator.
    Because lines may hold different numbers of fields, the rows can
    optionally be right-padded with empty strings so that they form a
    rectangular 2D array.

    Parameters
    ----------
    file: path to the file
    sep: (str) separator. Default: ","
    text2columns: (bool) if True, adds empty strings as a padding to create a
        2D array. Default: True
    savedata: (bool) if True, saves the data read-in after splitting with the
        separator, as a part of the object. Default: False
    usedummydata: (bool) if True, uses dummy data to write to a file.
        Default: False
    dummydata: (str) the string to use as dummy data.
        Default: ''

    Example:
        # test-data
        s = '''whole milk,margarine
        yogurt,brown bread,coffee
        pork,yogurt,coffee
        bottled water,bottled beer
        whole milk
        salty snack
        '''

        # Text-to-column transformation
        tc = TextToColumns(file = 'source.txt',
                           sep = ',',
                           text2columns = True,
                           savedata = False,
                           usedummydata = True,
                           dummydata = s)
        tc.make_dummydata()
        tc.toarray()

        # Uncomment next line to clear any dummy data created
        # tc.clear_dummydata()
    """

    # Fallback content written by make_dummydata() when no dummy data was
    # supplied, either to the constructor or to the method itself.
    _DEFAULT_DUMMYDATA = (
        "whole milk,margarine\n"
        "yogurt,brown bread,coffee\n"
        "pork,yogurt,coffee\n"
        "bottled water,bottled beer\n"
        "whole milk\n"
        "salty snack\n"
    )

    def __init__(self, file,
                 sep: str = ',',
                 text2columns: bool = True,
                 savedata: bool = False,
                 usedummydata: bool = False,
                 dummydata: str = ''):
        self.file = file  # e.g. 'source.txt'
        self.sep = sep
        self.text2columns = text2columns
        self.savedata = savedata
        self.usedummydata = usedummydata
        self.dummydata = dummydata
        # Width of the widest row seen by the most recent split_lines() call.
        self._max_length = 0

    def __repr__(self):
        return "TextToColumns object"

    def make_dummydata(self, dummydata=''):
        """Write dummy data to ``self.file``, overwriting any existing file.

        The text written is, in order of precedence: the ``dummydata``
        argument, the ``dummydata`` given to the constructor, or a built-in
        six-line grocery sample.
        """
        if self.dummydata == '':
            self.dummydata = self._DEFAULT_DUMMYDATA
        if dummydata == '':
            dummydata = self.dummydata
        with open(self.file, 'w') as f:
            f.write(dummydata)

    def clear_dummydata(self):
        """Delete ``self.file`` if it exists."""
        if os.path.isfile(self.file):
            os.remove(self.file)

    def readlines(self):
        """Alias for :meth:`toarray` called without arguments."""
        return self.toarray()

    def read_file(self):
        """Return the raw lines of ``self.file`` (line endings included).

        Raises
        ------
        ValueError: if ``self.file`` is not an existing regular file.
        """
        if not os.path.isfile(self.file):
            raise ValueError('Invalid file path.')
        with open(self.file, 'r') as f:
            return f.readlines()

    def split_lines(self, lines=None):
        """Split each line on ``self.sep`` and strip whitespace from fields.

        Parameters
        ----------
        lines: iterable of str, or None to read them from ``self.file``.

        Returns
        -------
        list of lists of str, one inner list per non-blank input line.
        """
        if lines is None:
            lines = self.read_file()

        data = []
        for line in lines:
            # str.split() never returns an empty list, so a blank line would
            # otherwise become a spurious [''] row; skip it explicitly.
            if not line.strip():
                continue
            data.append([field.strip() for field in line.split(self.sep)])

        self._max_length = max((len(row) for row in data), default=0)
        if self.savedata:
            self.data = data
        return data

    def toarray(self, data=None):
        """Return the split rows, padded into a 2D array when requested.

        Parameters
        ----------
        data: list of lists of str, or None to obtain it from
            :meth:`split_lines`.

        Returns
        -------
        numpy.ndarray of str if ``self.text2columns`` is True (short rows are
        right-padded with ''), otherwise the unpadded list of lists.
        """
        if data is None:
            data = self.split_lines()
        if not self.text2columns:
            return data

        # Derive the width from the rows actually being padded, so that
        # caller-supplied data works without a prior split_lines() call.
        width = max((len(row) for row in data), default=0)
        padded_data = [list(row) + [''] * (width - len(row)) for row in data]
        if self.savedata:
            self.padded_data = padded_data
        return np.array(padded_data)
Upvotes: 1
Reputation: 896
You could read the `csv` file using `open`, then read each line and `split` it by the comma delimiter.
# Read a ragged CSV file into a list of rows, one list of fields per line.
allLines = []
with open("groceries.csv", 'r') as f:
    # Iterating the file yields every line up to the real end of file, so a
    # blank line in the middle no longer ends the read early.
    for line in f:
        # Strip only the newline: slicing with [:-1] would cut off the last
        # real character when the file does not end with a newline.
        line = line.rstrip("\n")
        if not line:
            continue  # skip blank lines instead of stopping at them
        allLines.append(line.split(","))

for row in allLines:
    print(row)
Outputs:
['whole milk', 'margarine']
['yogurt', 'brown bread', 'coffee']
['pork', 'yogurt', 'coffee']
['bottled water', 'bottled beer']
['whole milk']
['salty snac']
Hope this helps.
Upvotes: 0