Pandas KeyError when working on split data frame

Question

I want to perform some operations on a pandas data frame that is split into chunks. After splitting the data frame, I iterate over the chunks; the first iteration runs fine, but then I get an error (see the traceback below). I have gone through some questions like these: 1 and 2, but they don't quite address my issue. Please help me resolve this, as I don't fully understand it.

import pandas as pd

# Sample rows, one tuple per record. The four string fields line up with the
# column labels passed to the DataFrame constructor below.
tupList = [
    ('Eisenstadt', 'Paris', '1', '2'),
    ('London', 'Berlin', '1', '3'),
    ('Berlin', 'stuttgat', '1', '4'),
    ('Liverpool', 'Southampton', '1', '5'),
    ('Tirana', 'Blackpool', '1', '6'),
    ('blackpool', 'tirana', '1', '7'),
    ('Paris', 'Lyon', '1', '8'),
    ('Manchester', 'Nice', '1', '10'),
    ('Orleans', 'Madrid', '1', '12'),
    ('Lisbon', 'Stockholm', '1', '12'),
]


# Build the frame with the default integer row labels (0..9).
cities = pd.DataFrame(
    data=tupList,
    columns=['Origin', 'Destination', 'O_Code', 'D_code'],
)


def splitDataFrameIntoSmaller(df, chunkSize = 3):
    """Split ``df`` into consecutive row chunks of at most ``chunkSize`` rows.

    Every chunk except possibly the last holds exactly ``chunkSize`` rows; the
    last one holds the remainder. No empty chunk is produced, so a frame whose
    length is an exact multiple of ``chunkSize`` yields ``len(df) / chunkSize``
    chunks and an empty frame yields an empty list.

    The chunks are positional slices of ``df`` and therefore keep the row
    labels of the original frame (the second chunk of a default-indexed frame
    is labelled 3, 4, 5, not 0, 1, 2). Call ``reset_index(drop=True)`` on a
    chunk if zero-based labels are needed.

    Parameters:
        df: the DataFrame to split.
        chunkSize: maximum number of rows per chunk; must be at least 1.

    Returns:
        A list of DataFrames, in row order.

    Raises:
        ValueError: if ``chunkSize`` is smaller than 1.
    """
    if chunkSize < 1:
        raise ValueError("chunkSize must be a positive integer")

    # Stepping through the start offsets (instead of computing
    # ``len(df) // chunkSize + 1``) avoids the empty trailing chunk that the
    # "+ 1" produced whenever the length divided evenly.
    return [df.iloc[start:start + chunkSize]
            for start in range(0, len(df), chunkSize)]

# Split the frame into chunks of (at most) three rows. Each chunk is a slice
# of `cities`, so it keeps the original row labels: the first chunk is
# labelled 0, 1, 2 but the second one is labelled 3, 4, 5, and so on.
citiesChunks = splitDataFrameIntoSmaller(cities)

for ind, cc in enumerate(citiesChunks):
    # Add two placeholder columns to the chunk.
    # NOTE(review): assigning into a slice of another frame may trigger
    # pandas' SettingWithCopyWarning - confirm.
    cc["distance"] = 0
    cc["time"] = 0

    # `xrange` is Python 2 only. `i` always runs from 0 to len(cc) - 1.
    for i in xrange(len(cc)):
        # cc['Origin'][i] looks `i` up as a row *label*, not a position. This
        # works for the first chunk (labels 0..2) and raises KeyError: 0 on
        # the second chunk, whose labels start at 3 - this is the line shown
        # in the traceback.
        al = cc['Origin'][i]
        bl = cc['Destination'][i]
        '...' #truncating to make it readable

    # Written with the default write mode, so every iteration replaces the
    # previous contents of out.csv.
    cc.to_csv('out.csv', sep=',', encoding='utf-8')


Traceback (most recent call last):
  File ..., line 39, in 
    al = cc['Origin'][i]
  File ..., line 603, in __getitem__
    result = self.index.get_value(self, key)
  File ..., line 2169, in get_value
    tz=getattr(series.dtype, 'tz', None))
  File "pandas\index.pyx", line 98, in pandas.index.IndexEngine.get_value (pandas\index.c:3557)
  File "pandas\index.pyx", line 106, in pandas.index.IndexEngine.get_value (pandas\index.c:3240)
  File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
  File "pandas\src\hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:8564)
  File "pandas\src\hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:8508)
KeyError: 0L

jezrael · Accepted Answer

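You can first floor-divide the index values by the chunk size, so that all rows of a chunk share the same index label. Then use a list comprehension that loops over the unique labels and selects each chunk with loc. Finally, call reset_index(drop=True) on each chunk to replace the duplicated labels with a fresh 0-based index: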

# Floor-dividing the default row labels 0..9 by the chunk size (3) gives every
# run of three consecutive rows one shared label (0, 0, 0, 1, 1, 1, ...), which
# then acts as the chunk id. This overwrites the index of `cities` in place.
cities.index = cities.index // 3
print (cities)
       Origin  Destination O_Code D_code
0  Eisenstadt        Paris      1      2
0      London       Berlin      1      3
0      Berlin     stuttgat      1      4
1   Liverpool  Southampton      1      5
1      Tirana    Blackpool      1      6
1   blackpool       tirana      1      7
2       Paris         Lyon      1      8
2  Manchester         Nice      1     10
2     Orleans       Madrid      1     12
3      Lisbon    Stockholm      1     12

# Build one DataFrame per distinct chunk label. Passing the label as a list
# ([x]) makes loc return a DataFrame even when only one row carries that label
# (the last chunk here). reset_index(drop=True) then renumbers each chunk from
# 0 and discards the shared chunk label.
citiesChunks = [cities.loc[[x]].reset_index(drop=True) for x in cities.index.unique()]
#print (citiesChunks)

print (citiesChunks[0])
       Origin Destination O_Code D_code
0  Eisenstadt       Paris      1      2
1      London      Berlin      1      3
2      Berlin    stuttgat      1      4

Finally, use iterrows if you need to loop over the rows of each DataFrame:

# Write the header row to the file first, using an empty frame that has only
# the column names. The chunks are appended below without a header, so the
# file ends up with exactly one header line.
cols = ['Origin', 'Destination', 'O_Code', 'D_code', 'distance', 'time']
df = pd.DataFrame(columns=cols)
df.to_csv('out.csv', encoding='utf-8', index=False)

for ind, cc in enumerate(citiesChunks):
    # Add two placeholder columns to the chunk.
    cc["distance"] = 0
    cc["time"] = 0

    # iterrows yields (row label, row Series) pairs. Only the label `i` is used
    # here, to look values up with loc; because every chunk was re-indexed from
    # 0, these label lookups succeed for each chunk.
    for i, val in cc.iterrows():
        al = cc.loc[i, 'Origin']
        bl = cc.loc[i, 'Destination']
        '...' #truncating to make it readable

    # Append this chunk's rows to the file, without header or index column.
    cc.to_csv('out.csv', encoding='utf-8', mode='a', header=None, index=False)
    # With no path given, to_csv returns the CSV text. This print is only for
    # display and, unlike the file output, includes the header and the index.
    print (cc.to_csv(encoding='utf-8'))

,Origin,Destination,O_Code,D_code,distance,time
0,Eisenstadt,Paris,1,2,0,0
1,London,Berlin,1,3,0,0
2,Berlin,stuttgat,1,4,0,0

,Origin,Destination,O_Code,D_code,distance,time
0,Liverpool,Southampton,1,5,0,0
1,Tirana,Blackpool,1,6,0,0
2,blackpool,tirana,1,7,0,0

,Origin,Destination,O_Code,D_code,distance,time
0,Paris,Lyon,1,8,0,0
1,Manchester,Nice,1,10,0,0
2,Orleans,Madrid,1,12,0,0

,Origin,Destination,O_Code,D_code,distance,time
0,Lisbon,Stockholm,1,12,0,0

Pandas KeyError when working on split data frame

Answers (1)

Related Questions