Reputation: 13
I have a dataframe of this type:
i1 i3 i4 i9 i14 i16 i17 i18 i19 i20 i22 i26 i27 i28
0 4 2 1 4 1 3 3 4 2 2 4 1 4 3
and I have to extract the name of the columns whose values are in the top 20% of all values in the row.
The result would be as follows:
[i1,i9,i18,i22,i27]
the I need to itereate the same procedure over other rows of the same type.
Upvotes: 1
Views: 70
Reputation: 9047
Pandas quantile
0.8 = 1 - 0.2
df.columns[df.loc[0,:] >= df.loc[0,:].quantile(0.8)]
Index(['i1', 'i9', 'i18', 'i22', 'i27'], dtype='object')
As suggested by @Nko3, more generic way to do this is to apply the formula for each row-
df.apply(lambda x: x >= x.quantile(0.8),1).dot(df.columns + ', ').str.strip(', ')
0 i1, i9, i18, i22, i27
dtype: object
Upvotes: 2
Reputation: 117731
This works:
top20 = df.ge(df.quantile(0.8, axis=1), axis=0)
top_cols = top20.apply(lambda x: x.index[x], axis=1)
Example result:
>>> df = pd.DataFrame(data=np.random.randint(1, 5, (5, 10)),
columns=[f"i{i}" for i in range(10)])
>>> df
i0 i1 i2 i3 i4 i5 i6 i7 i8 i9
0 4 4 2 4 2 4 4 3 2 4
1 2 3 4 2 2 3 1 1 4 1
2 3 2 2 1 2 2 3 1 2 4
3 3 1 1 3 3 4 2 2 1 3
4 3 2 3 2 4 1 4 2 4 2
>>> top20 = df.ge(df.quantile(0.8, axis=1), axis=0)
>>> top20
i0 i1 i2 i3 i4 i5 i6 i7 i8 i9
0 True True False True False True True False False True
1 False False True False False False False False True False
2 True False False False False False True False False True
3 True False False True True True False False False True
4 False False False False True False True False True False
>>> top20.apply(lambda x: x.index[x], axis=1)
0 Index(['i0', 'i1', 'i3', 'i5', 'i6', 'i9'], dt...
1 Index(['i2', 'i8'], dtype='object')
2 Index(['i0', 'i6', 'i9'], dtype='object')
3 Index(['i0', 'i3', 'i4', 'i5', 'i9'], dtype='o...
4 Index(['i4', 'i6', 'i8'], dtype='object')
dtype: object
Upvotes: 0