Reputation: 4043
I have more than 1000 groups with different id
and I only need to select a specific number of groups and read the nth
number of every group. Here an example of what I need:
#These are the codes from different answers
import pandas as pd
import numpy as np
import time
import sys
df = pd.DataFrame({
'index':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'id' : ['veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192','veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192'],
'veh_x' :[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'veh_y':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884]
}
)
data=['veh0', 'veh1', 'veh2', 'veh3']
# print(df.groupby(['id']).head(1))
#first part
start = time.clock()
for i in range(0,20):
g=df.groupby(['id']).nth([i]).reset_index()
for x in data:
for idx, row in g.iterrows():
if x==row['id']:
print("code1 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start) )
#second part
#This is what I need but it is running slowly when I add it to my whole dataset
start = time.clock()
for i in range(0,20):
for x in data: #these are the selected groups
g = df[df['id'].isin([x])].groupby(['id']).nth([i]).reset_index()
for x, row in g.iterrows():
print("code2 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start) )
#Third part
start = time.clock()
for i in range(0,20):
g=df[df['id'].isin(data)].groupby('id').nth([i]).reset_index()
for x, row in g.iterrows():
print("code3 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start))
#fourth part
start = time.clock()
df2 = df[df['id'].isin(data)]
for i in range(0,20):
for x in data:
row = df2.groupby('id').nth(i)
if(x in row.index):
print("code4 group",i, " = ", x)
end = time.clock()
print ("%.2gs" % (end-start))
#fifth part
def printf(text):
print text
start = time.clock()
tmp = df.loc[df.id.isin(data)].groupby(['id']).apply(lambda x: x.reset_index(drop=True)).reset_index(level=1)
# cleanup and rename index
tmp = tmp.rename(columns={'level_1': 'group'})
# print 20 first groups
for i in range(20):
lst= tmp.loc[tmp.group == i].apply(lambda x:x, axis=1)
for x, row in lst.iterrows():
print("code5 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start))
The first part of the code read all the groups and return the nth
number of every group but I need only five or six or more. The problem is that I don't know any information about the group. I can use a counter
after that I can use break
but the code is running so slowly because I need to load more than 30000 records every iteration. Here I added data=['veh0', 'veh1', 'veh2', 'veh3']
as an example but it can be chosen randomly.
The second part is what I want but the code still running slowly. The second part takes 0.43s, the first part takes 0.14s, and the third part takes 0.077s. What is the best way of making it better?
I appreciate any help?
Upvotes: 2
Views: 294
Reputation: 6536
I think you don't need any for loops like this:
data = ['veh0', 'veh1', 'veh2', 'veh3']
# insert group index for each id group
tmp = (df.loc[df.id.isin(data)]
.groupby(['id'])
.apply(lambda x: x.reset_index(drop=True))
.reset_index(level=1))
# cleanup and rename index
tmp = tmp.reset_index(drop=True)
tmp = tmp.rename(columns={'level_1': 'group'})
# fast print 20 first groups
print(tmp.loc[tmp.group.isin(list(range(20)))]
.sort_values('group')[['group', 'id']]
.to_string(formatters={'group':'group {} ='.format},
index=False,
header=None))
# slow print with loop
for i in range(20):
lst = tmp.loc[tmp.group == i]
for x, row in lst.iterrows():
print("code5 group",i,"=",row['id'])
Upvotes: 1
Reputation: 2887
I imagine the problem is that you groupby df
in every loop iteration. I think changing the order of loops could help for that.
If I understood correctly you want to print the nth value of id
if it is present and none if it is not there. This could be done by a count operation
First get the count for all ids:
In [27]: df2 = df[df['id'].isin(data)].groupby('id')[['id']].count()
In [28]: df2
Out[28]:
id index
id
veh0 24 24
veh1 18 18
veh2 12 12
veh3 6 6
Second the output
In [44]: for i in range(0,20):
...: for x in data:
...: if( i < df2.loc[x,].values):
...: print("code4 group",i,"=", x)
...:
code4 group 0 = veh0
code4 group 0 = veh1
code4 group 0 = veh2
code4 group 0 = veh3
code4 group 1 = veh0
code4 group 1 = veh1
code4 group 1 = veh2
code4 group 1 = veh3
code4 group 2 = veh0
code4 group 2 = veh1
code4 group 2 = veh2
code4 group 2 = veh3
code4 group 3 = veh0
code4 group 3 = veh1
code4 group 3 = veh2
code4 group 3 = veh3
code4 group 4 = veh0
code4 group 4 = veh1
code4 group 4 = veh2
code4 group 4 = veh3
code4 group 5 = veh0
code4 group 5 = veh1
code4 group 5 = veh2
code4 group 5 = veh3
code4 group 6 = veh0
code4 group 6 = veh1
code4 group 6 = veh2
code4 group 7 = veh0
code4 group 7 = veh1
code4 group 7 = veh2
code4 group 8 = veh0
code4 group 8 = veh1
code4 group 8 = veh2
code4 group 9 = veh0
code4 group 9 = veh1
code4 group 9 = veh2
code4 group 10 = veh0
code4 group 10 = veh1
code4 group 10 = veh2
code4 group 11 = veh0
code4 group 11 = veh1
code4 group 11 = veh2
code4 group 12 = veh0
code4 group 12 = veh1
code4 group 13 = veh0
code4 group 13 = veh1
code4 group 14 = veh0
code4 group 14 = veh1
code4 group 15 = veh0
code4 group 15 = veh1
code4 group 16 = veh0
code4 group 16 = veh1
code4 group 17 = veh0
code4 group 17 = veh1
code4 group 18 = veh0
code4 group 19 = veh0
If I haven't missed anything this is the output you wanted. Does this solve your problems?
EDIT: To get the other columns, you could split up df2 further.
In [53]: df2 = df[df['id'].isin(data)]
In [54]: df2
Out[54]:
index id veh_x veh_y
0 0 veh0 0 0
1 1 veh0 1 1
2 2 veh0 2 2
3 2 veh1 2 2
4 3 veh0 3 3
5 3 veh1 3 3
6 4 veh0 4 4
.. ... ... ... ...
83 9 veh2 9 9
84 9 veh3 9 9
85 10 veh0 10 10
86 10 veh1 10 10
87 10 veh2 10 10
88 10 veh3 10 10
89 11 veh0 11 11
Then you need to groupby in the loop again but the DataFrame should be much smaller
In [90]: for i in range(0,20):
...: for x in data:
...: row = df2.groupby('id').nth(i)
...: if(x in row.index):
...: print("code5 group",i, " = ", x)
...:
...:
code5 group 0 = veh0
code5 group 0 = veh1
code5 group 0 = veh2
code5 group 0 = veh3
code5 group 1 = veh0
code5 group 1 = veh1
code5 group 1 = veh2
code5 group 1 = veh3
code5 group 2 = veh0
code5 group 2 = veh1
code5 group 2 = veh2
code5 group 2 = veh3
code5 group 3 = veh0
code5 group 3 = veh1
code5 group 3 = veh2
code5 group 3 = veh3
code5 group 4 = veh0
code5 group 4 = veh1
code5 group 4 = veh2
code5 group 4 = veh3
code5 group 5 = veh0
code5 group 5 = veh1
code5 group 5 = veh2
code5 group 5 = veh3
code5 group 6 = veh0
code5 group 6 = veh1
code5 group 6 = veh2
code5 group 7 = veh0
code5 group 7 = veh1
code5 group 7 = veh2
code5 group 8 = veh0
code5 group 8 = veh1
code5 group 8 = veh2
code5 group 9 = veh0
code5 group 9 = veh1
code5 group 9 = veh2
code5 group 10 = veh0
code5 group 10 = veh1
code5 group 10 = veh2
code5 group 11 = veh0
code5 group 11 = veh1
code5 group 11 = veh2
code5 group 12 = veh0
code5 group 12 = veh1
code5 group 13 = veh0
code5 group 13 = veh1
code5 group 14 = veh0
code5 group 14 = veh1
code5 group 15 = veh0
code5 group 15 = veh1
code5 group 16 = veh0
code5 group 16 = veh1
code5 group 17 = veh0
code5 group 17 = veh1
code5 group 18 = veh0
code5 group 19 = veh0
row
(or row.reset_index()
) should now contain the same data as in your examples as well. Is that what you meant?
Upvotes: 1
Reputation: 5741
If I understand you correctly you are looking to do this:
data = ['veh0', 'veh1', 'veh2', 'veh3']
n_groups = 3
df[df['id'].isin(data)].groupby('id').head(n_groups)['id']
The only way this differs with your expected output is that the groups are not numbered (i
in your example code). However you are not explicitly stating this is a requirement?
Correct me if I'm wrong.
Upvotes: 1
Reputation: 588
To the best of my understanding of your problem:
>>> import pandas as pd
>>> df = \
pd.DataFrame(
{
'id': [i for i in range (1000)]*10,
'col1': ['col1 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)],
'col2': ['col2 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)]
}
)
>>> df.head()
id col1 col2
0 0 col1 occurence 0 for id 0 col2 occurence 0 for id 0
1 1 col1 occurence 0 for id 1 col2 occurence 0 for id 1
2 2 col1 occurence 0 for id 2 col2 occurence 0 for id 2
3 3 col1 occurence 0 for id 3 col2 occurence 0 for id 3
4 4 col1 occurence 0 for id 4 col2 occurence 0 for id 4
This will give you precisely 0th, 5th and 9th data row for each id (modify list [0,5,9] in accordance to your case):
>>> df.groupby(['id']).nth([0,5,9]).reset_index()
id col1 col2
0 0 col1 occurence 0 for id 0 col2 occurence 0 for id 0
1 0 col1 occurence 5 for id 0 col2 occurence 5 for id 0
2 0 col1 occurence 9 for id 0 col2 occurence 9 for id 0
3 1 col1 occurence 0 for id 1 col2 occurence 0 for id 1
4 1 col1 occurence 5 for id 1 col2 occurence 5 for id 1
... ... ... ...
2995 998 col1 occurence 0 for id 998 col2 occurence 0 for id 998
2996 998 col1 occurence 5 for id 998 col2 occurence 5 for id 998
2997 999 col1 occurence 5 for id 999 col2 occurence 5 for id 999
2998 999 col1 occurence 0 for id 999 col2 occurence 0 for id 999
2999 999 col1 occurence 9 for id 999 col2 occurence 9 for id 999
[3000 rows x 3 columns]
EDIT: Maybe this might help you (modify list [1,300] in accordance to your case):
>>> df[df['id'].isin([1,300])].groupby(['id']).nth([0]).reset_index()
id col1 col2
0 1 col1 occurence 0 for id 1 col2 occurence 0 for id 1
1 300 col1 occurence 0 for id 300 col2 occurence 0 for id 300
Upvotes: 1