I_Al-thamary

Reputation: 4043

What is the fastest and best way to get a specific number of groups after applying groupby?

I have more than 1000 groups with different ids, and I only need to select a few specific groups and read the nth row of every group. Here is an example of what I need:

# These are the codes from different answers
import pandas as pd
import numpy as np
import time
import sys
df = pd.DataFrame({
  'index':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'id'    :   ['veh0',    'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192','veh0',    'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192'],
'veh_x' :[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'veh_y':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884]

}
)


data = ['veh0', 'veh1', 'veh2', 'veh3']
# print(df.groupby('id').head(1))
#first  part
start = time.perf_counter()  # time.clock() was removed in Python 3.8
for i in range(20):
    g = df.groupby('id').nth([i]).reset_index()
    for x in data:
        for idx, row in g.iterrows():
            if x == row['id']:
                print("code1 group", i, "=", row['id'])
end = time.perf_counter()
print("%.2gs" % (end - start))


#second part
#This is what I need but it is running slowly when I add it to my whole dataset


start = time.perf_counter()
for i in range(20):
    for x in data:  # these are the selected groups
        g = df[df['id'].isin([x])].groupby('id').nth([i]).reset_index()
        for idx, row in g.iterrows():
            print("code2 group", i, "=", row['id'])
end = time.perf_counter()
print("%.2gs" % (end - start))


#Third part

start = time.perf_counter()
for i in range(20):
    g = df[df['id'].isin(data)].groupby('id').nth([i]).reset_index()
    for idx, row in g.iterrows():
        print("code3 group", i, "=", row['id'])
end = time.perf_counter()
print("%.2gs" % (end - start))


#fourth part

start = time.perf_counter()
df2 = df[df['id'].isin(data)]
for i in range(20):
    for x in data:
        row = df2.groupby('id').nth(i)
        if x in row.index:
            print("code4 group", i, " = ", x)
end = time.perf_counter()
print("%.2gs" % (end - start))

#fifth part
def printf(text):
    print(text)

start = time.perf_counter()

tmp = (df.loc[df.id.isin(data)]
         .groupby('id')
         .apply(lambda x: x.reset_index(drop=True))
         .reset_index(level=1))

# cleanup and rename index
tmp = tmp.rename(columns={'level_1': 'group'})

# print 20 first groups
for i in range(20):
    lst = tmp.loc[tmp.group == i]
    for x, row in lst.iterrows():
        print("code5 group", i, "=", row['id'])

end = time.perf_counter()
print("%.2gs" % (end - start))

The first part of the code reads all the groups and returns the nth row of every group, but I only need five or six of them (or a few more). The problem is that I don't know anything about the groups in advance. I could use a counter and then break, but the code runs very slowly because it loads more than 30000 records on every iteration. Here I added data=['veh0', 'veh1', 'veh2', 'veh3'] as an example, but the groups can be chosen randomly.

The second part is what I want, but it still runs slowly: it takes 0.43s, while the first part takes 0.14s and the third part takes 0.077s. What is the best way to make it faster?
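For reference, one way to avoid re-grouping on every iteration is to number the rows within each group once with `groupby().cumcount()` and then filter. A minimal sketch on a toy frame standing in for the real df (the small sample data here is made up, and this is not included in the timings above):

```python
import pandas as pd

# Hypothetical small sample standing in for the real df
df = pd.DataFrame({
    'id':    ['veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2'],
    'veh_x': [0, 1, 1, 2, 2, 2],
})
data = ['veh0', 'veh1', 'veh2', 'veh3']

# Number the rows inside each id group once, then filter --
# no groupby call inside any loop.
sub = df[df['id'].isin(data)].copy()
sub['group'] = sub.groupby('id').cumcount()
wanted = sub[sub['group'] < 20]  # the first 20 rows of every selected group
for _, row in wanted.sort_values(['group', 'id']).iterrows():
    print("group", row['group'], "=", row['id'])
```

The `groupby` runs exactly once, so the cost no longer scales with the number of group indices requested.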

I would appreciate any help.

Upvotes: 2

Views: 294

Answers (4)

mjspier

Reputation: 6536

I don't think you need any for loops. Something like this:

data = ['veh0', 'veh1', 'veh2', 'veh3']

# insert group index for each id group
tmp = (df.loc[df.id.isin(data)]
       .groupby(['id'])
       .apply(lambda x: x.reset_index(drop=True))
       .reset_index(level=1))
# cleanup and rename index
tmp = tmp.reset_index(drop=True)
tmp = tmp.rename(columns={'level_1': 'group'})

# fast print 20 first groups
print(tmp.loc[tmp.group.isin(list(range(20)))]
  .sort_values('group')[['group', 'id']]
  .to_string(formatters={'group':'group {} ='.format}, 
             index=False, 
             header=None))

# slow print with loop
for i in range(20):
    lst = tmp.loc[tmp.group == i]
    for x, row in lst.iterrows():
        print("code5 group", i, "=", row['id'])
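The same `group` column can also be built without `apply`, using `cumcount`; a sketch on a made-up small frame (not the answer's exact `tmp`, but the column values should match for the same input):

```python
import pandas as pd

# Toy stand-in for the question's df
df = pd.DataFrame({
    'id':    ['veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0'],
    'veh_x': [0, 0, 1, 1, 0, 2],
})
data = ['veh0', 'veh1', 'veh2', 'veh3']

tmp = df.loc[df.id.isin(data)].copy()
tmp['group'] = tmp.groupby('id').cumcount()  # vectorised per-id row number
print(tmp.sort_values('group')[['group', 'id']]
         .to_string(index=False, header=False))
```

Skipping the per-group `apply` avoids running Python code once per group, which matters when there are thousands of ids.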

Upvotes: 1

maow

Reputation: 2887

I imagine the problem is that you group df in every loop iteration; restructuring the loops should help with that.

If I understood correctly, you want to print the nth value of each id if it is present, and nothing if it is not. This can be done with a count operation.

First, get the count for all ids:

In [27]: df2 = df[df['id'].isin(data)].groupby('id')[['id']].count()                                                                                                                               

In [28]: df2                                                                                                                                                                                                
Out[28]: 
      id
id      
veh0  24
veh1  18
veh2  12
veh3   6

Second, the output:

In [44]: for i in range(0,20): 
    ...:   for x in data: 
    ...:     if i < df2.loc[x, 'id']: 
    ...:       print("code4 group",i,"=", x) 
    ...:                                                                                                                                                                                                    
code4 group 0 = veh0
code4 group 0 = veh1
code4 group 0 = veh2
code4 group 0 = veh3
code4 group 1 = veh0
code4 group 1 = veh1
code4 group 1 = veh2
code4 group 1 = veh3
code4 group 2 = veh0
code4 group 2 = veh1
code4 group 2 = veh2
code4 group 2 = veh3
code4 group 3 = veh0
code4 group 3 = veh1
code4 group 3 = veh2
code4 group 3 = veh3
code4 group 4 = veh0
code4 group 4 = veh1
code4 group 4 = veh2
code4 group 4 = veh3
code4 group 5 = veh0
code4 group 5 = veh1
code4 group 5 = veh2
code4 group 5 = veh3
code4 group 6 = veh0
code4 group 6 = veh1
code4 group 6 = veh2
code4 group 7 = veh0
code4 group 7 = veh1
code4 group 7 = veh2
code4 group 8 = veh0
code4 group 8 = veh1
code4 group 8 = veh2
code4 group 9 = veh0
code4 group 9 = veh1
code4 group 9 = veh2
code4 group 10 = veh0
code4 group 10 = veh1
code4 group 10 = veh2
code4 group 11 = veh0
code4 group 11 = veh1
code4 group 11 = veh2
code4 group 12 = veh0
code4 group 12 = veh1
code4 group 13 = veh0
code4 group 13 = veh1
code4 group 14 = veh0
code4 group 14 = veh1
code4 group 15 = veh0
code4 group 15 = veh1
code4 group 16 = veh0
code4 group 16 = veh1
code4 group 17 = veh0
code4 group 17 = veh1
code4 group 18 = veh0
code4 group 19 = veh0

If I haven't missed anything, this is the output you wanted. Does this solve your problem?
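The count idea above can be sketched end to end on a tiny made-up frame (the ids and range are placeholders; note the extra membership guard for ids with no rows at all, which the transcript's data never hits):

```python
import pandas as pd

# Hypothetical small sample
df = pd.DataFrame({'id': ['veh0', 'veh0', 'veh0', 'veh1', 'veh1', 'veh2']})
data = ['veh0', 'veh1', 'veh2', 'veh3']

# Count rows per id once; group i exists for id x exactly when i < counts[x].
counts = df[df['id'].isin(data)].groupby('id')['id'].count()
for i in range(3):
    for x in data:
        # guard against ids that have no rows at all (veh3 here)
        if x in counts.index and i < counts[x]:
            print("group", i, "=", x)
```

The expensive `groupby` happens once, and the double loop only touches a tiny Series of per-id counts.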

EDIT: To get the other columns, you could split up df2 further.

In [53]: df2 = df[df['id'].isin(data)]                                                                                                                                                                      

In [54]: df2                                                                                                                                                                                                
Out[54]: 
    index    id  veh_x  veh_y
0       0  veh0      0      0
1       1  veh0      1      1
2       2  veh0      2      2
3       2  veh1      2      2
4       3  veh0      3      3
5       3  veh1      3      3
6       4  veh0      4      4
..    ...   ...    ...    ...
83      9  veh2      9      9
84      9  veh3      9      9
85     10  veh0     10     10
86     10  veh1     10     10
87     10  veh2     10     10
88     10  veh3     10     10
89     11  veh0     11     11

Then you need to group inside the loop again, but the DataFrame should be much smaller:

In [90]: for i in range(0,20): 
    ...:   for x in data: 
    ...:       row = df2.groupby('id').nth(i) 
    ...:       if(x in row.index): 
    ...:           print("code5 group",i, " = ", x) 
    ...:       
    ...:                                                                                                                                                                                                    
code5 group 0  =  veh0
code5 group 0  =  veh1
code5 group 0  =  veh2
code5 group 0  =  veh3
code5 group 1  =  veh0
code5 group 1  =  veh1
code5 group 1  =  veh2
code5 group 1  =  veh3
code5 group 2  =  veh0
code5 group 2  =  veh1
code5 group 2  =  veh2
code5 group 2  =  veh3
code5 group 3  =  veh0
code5 group 3  =  veh1
code5 group 3  =  veh2
code5 group 3  =  veh3
code5 group 4  =  veh0
code5 group 4  =  veh1
code5 group 4  =  veh2
code5 group 4  =  veh3
code5 group 5  =  veh0
code5 group 5  =  veh1
code5 group 5  =  veh2
code5 group 5  =  veh3
code5 group 6  =  veh0
code5 group 6  =  veh1
code5 group 6  =  veh2
code5 group 7  =  veh0
code5 group 7  =  veh1
code5 group 7  =  veh2
code5 group 8  =  veh0
code5 group 8  =  veh1
code5 group 8  =  veh2
code5 group 9  =  veh0
code5 group 9  =  veh1
code5 group 9  =  veh2
code5 group 10  =  veh0
code5 group 10  =  veh1
code5 group 10  =  veh2
code5 group 11  =  veh0
code5 group 11  =  veh1
code5 group 11  =  veh2
code5 group 12  =  veh0
code5 group 12  =  veh1
code5 group 13  =  veh0
code5 group 13  =  veh1
code5 group 14  =  veh0
code5 group 14  =  veh1
code5 group 15  =  veh0
code5 group 15  =  veh1
code5 group 16  =  veh0
code5 group 16  =  veh1
code5 group 17  =  veh0
code5 group 17  =  veh1
code5 group 18  =  veh0
code5 group 19  =  veh0

row (or row.reset_index()) should now contain the same data as in your examples as well. Is that what you meant?

Upvotes: 1

gosuto

Reputation: 5741

If I understand you correctly, you are looking to do this:

data = ['veh0', 'veh1', 'veh2', 'veh3']
n_groups = 3
df[df['id'].isin(data)].groupby('id').head(n_groups)['id']

The only way this differs from your expected output is that the groups are not numbered (i in your example code); however, you don't explicitly state that this is a requirement.

Correct me if I'm wrong.
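If the group number is needed after all, `cumcount` can supply it on top of the same `head` expression; a sketch on a made-up small frame:

```python
import pandas as pd

# Hypothetical small sample
df = pd.DataFrame({
    'id':    ['veh0', 'veh1', 'veh0', 'veh1', 'veh0'],
    'veh_x': [0, 0, 1, 1, 2],
})
data = ['veh0', 'veh1', 'veh2', 'veh3']
n_groups = 2

# Take the first n_groups rows per id, then number them within each id.
head = df[df['id'].isin(data)].groupby('id').head(n_groups).copy()
head['i'] = head.groupby('id').cumcount()  # restores the missing group number
print(head[['i', 'id']])
```

`head(n)` keeps the original row order, so `cumcount` on the result reproduces the per-group index `i` from the question's loops.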

Upvotes: 1

Juan Kania-Morales

Reputation: 588

To the best of my understanding of your problem:

>>> import pandas as pd
>>> df = \
pd.DataFrame(
    {
        'id': [i for i in range (1000)]*10,
        'col1': ['col1 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)],
        'col2': ['col2 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)]
    }
)
>>> df.head()

   id                       col1                       col2
0   0  col1 occurence 0 for id 0  col2 occurence 0 for id 0
1   1  col1 occurence 0 for id 1  col2 occurence 0 for id 1
2   2  col1 occurence 0 for id 2  col2 occurence 0 for id 2
3   3  col1 occurence 0 for id 3  col2 occurence 0 for id 3
4   4  col1 occurence 0 for id 4  col2 occurence 0 for id 4

This will give you precisely the 0th, 5th, and 9th data row for each id (adjust the list [0,5,9] to your case):

>>> df.groupby(['id']).nth([0,5,9]).reset_index()
       id                         col1                         col2
0       0    col1 occurence 0 for id 0    col2 occurence 0 for id 0
1       0    col1 occurence 5 for id 0    col2 occurence 5 for id 0
2       0    col1 occurence 9 for id 0    col2 occurence 9 for id 0
3       1    col1 occurence 0 for id 1    col2 occurence 0 for id 1
4       1    col1 occurence 5 for id 1    col2 occurence 5 for id 1
...   ...                          ...                          ...
2995  998  col1 occurence 0 for id 998  col2 occurence 0 for id 998
2996  998  col1 occurence 5 for id 998  col2 occurence 5 for id 998
2997  999  col1 occurence 5 for id 999  col2 occurence 5 for id 999
2998  999  col1 occurence 0 for id 999  col2 occurence 0 for id 999
2999  999  col1 occurence 9 for id 999  col2 occurence 9 for id 999

[3000 rows x 3 columns]

EDIT: Maybe this will help you (adjust the list [1,300] to your case):

>>> df[df['id'].isin([1,300])].groupby(['id']).nth([0]).reset_index()
    id                         col1                         col2
0    1    col1 occurence 0 for id 1    col2 occurence 0 for id 1
1  300  col1 occurence 0 for id 300  col2 occurence 0 for id 300

Upvotes: 1
