testenthu
testenthu

Reputation: 79

python Count of ID's for percentage levels

I have the table like below which is stored in a DataFrame. I want to add the tolerance_level = [0, 1, 5, 10, 20, 30, 50, 100, 200, 300, 500, 700] and get the count of ID's for columns A, B, C that fall under the tolerance_level

For example: in the below table count of ID's for column A having tolerance_level of 10% = 2 count of ID's for column A having tolerance_level of 0% = 1 count of ID's for columns A having tolerance_level of 100% = 1 and 700% =1

And count of ID's for column B having tolerance_level of 30% = 2 and so on..... And if a columns has % which is not defined in the tolerance_level then the ID falls under the nearest tolerance_level. For example, if a columns has 900% then it will be in >700% tolerance_level

ID A B C
1 0% 1% 5%
3 10% 30% 50%
6 100% 300% 500%
7 700% 900% 50%
10 10% 30% 50%

So, the result would be something like

tolerance_level A B C
0% 17 100 50
1% 10 50 70
5% 60 80 40

Upvotes: 0

Views: 94

Answers (3)

Soudipta Dutta
Soudipta Dutta

Reputation: 2152

import pandas as pd
import numpy as np

data = [
    {'ID': 1, 'A': '0%', 'B': '1%', 'C': '5%'},
    {'ID': 3, 'A': '10%', 'B': '30%', 'C': '50%'},
    {'ID': 6, 'A': '100%', 'B': '300%', 'C': '500%'},
    {'ID': 7, 'A': '700%', 'B': '900%', 'C': '50%'},
    {'ID': 10, 'A': '10%', 'B': '30%', 'C': '50%'}
]

df = pd.DataFrame(data)
print(df)
"""
  ID     A     B     C
0   1    0%    1%    5%
1   3   10%   30%   50%
2   6  100%  300%  500%
3   7  700%  900%   50%
4  10   10%   30%   50%
"""

tolerance_levels = np.array([0, 1, 5, 10, 20, 30, 50, 100, 200, 300, 500, 700])
df_numeric = df[['A', 'B', 'C']].apply(lambda x: x.str.rstrip('%').astype(float))
    
# half tolerance for each level 
half_tolerance = np.where(tolerance_levels > 0, tolerance_levels / 2, 0)

# Initialize a DataFrame to store the counts
counts_df = pd.DataFrame(index=tolerance_levels, columns=df_numeric.columns, dtype=int).fillna(0)
counts_df.index.name = 'tolerance_levels'  # Set the name of the index
print(counts_df)
"""
                   A    B    C
tolerance_levels               
0                 0.0  0.0  0.0
1                 0.0  0.0  0.0
5                 0.0  0.0  0.0
10                0.0  0.0  0.0
20                0.0  0.0  0.0
30                0.0  0.0  0.0
50                0.0  0.0  0.0
100               0.0  0.0  0.0
200               0.0  0.0  0.0
300               0.0  0.0  0.0
500               0.0  0.0  0.0
700               0.0  0.0  0.0
"""
for tl, ht in zip(tolerance_levels, half_tolerance):
    cond1 = (df_numeric >= (tl - ht) )
    cond2 = (df_numeric <= (tl + ht) )
    cond  =  (cond1 & cond2)
    counts_df.loc[tl] = cond.sum()

counts_df.index = counts_df.index.astype(str) + '%' 

print(counts_df)
"""
                   A    B    C
tolerance_levels               
0%                1.0  0.0  0.0
1%                0.0  1.0  0.0
5%                0.0  0.0  1.0
10%               2.0  0.0  1.0
20%               2.0  2.0  0.0
30%               0.0  2.0  0.0
50%               0.0  2.0  3.0
100%              1.0  0.0  3.0
200%              1.0  1.0  0.0
300%              0.0  1.0  0.0
500%              1.0  1.0  1.0
700%              1.0  1.0  1.0
"""

Upvotes: 0

jezrael
jezrael

Reputation: 863481

Use merge_asof for add new column filled by nearest values from helper DataFrame and count output by crosstab:

df = pd.DataFrame({'ID': [1, 3, 6, 7, 10], 
                   'A': ['0%', '17%', '108%', '700%', '10%'], 
                   'B': ['1%', '30%', '299%', '900%', '30%'], 
                   'C': ['5%', '50%', '500%', '51%', pd.Timestamp('12-10-2000')]})
print (df)
   ID     A     B                    C
0   1    0%    1%                   5%
1   3   17%   30%                  50%
2   6  108%  299%                 500%
3   7  700%  900%                  51%
4  10   10%   30%  2000-12-10 00:00:00

tolerance_level = [0, 1, 5, 10, 20, 30, 50, 100, 200, 300, 500, 700]
df1 = pd.DataFrame({'tolerance_level':tolerance_level})
df1['tolerance_level'] = df1['tolerance_level'].astype('float')

df2 = df.melt('ID', value_name='tol')
df2['tol'] = pd.to_numeric(df2['tol'].str.rstrip('%'), errors='coerce')

df3 = pd.merge_asof(df2.dropna(subset=['tol']).sort_values('tol'), 
                    df1, 
                    left_on='tol',
                    right_on='tolerance_level', 
                    direction='nearest')


out = (pd.crosstab(df3['tolerance_level'], df3['variable'])
         .rename_axis(columns=None)
         .rename(lambda x: f"{x}%")
         .reset_index())
print (out)
   tolerance_level  A  B  C
0               0%  1  0  0
1               1%  0  1  0
2               5%  0  0  1
3              10%  1  0  0
4              20%  1  0  0
5              30%  0  2  0
6              50%  0  0  3
7             100%  1  0  0
8             300%  0  1  0
9             500%  0  0  1
10            700%  1  1  0

Upvotes: 1

mozway
mozway

Reputation: 262234

Reshape the input with melt, form the bins from tolerance_level using pandas.cut (after removing the % with str.rstrip), then compute the crosstab:

tolerance_level = [0, 1, 5, 10, 20, 30, 50, 100, 200, 300, 500, 700]


tmp = df.melt('ID', value_name='tolerance_level')

out = pd.crosstab(pd.cut(tmp['tolerance_level'].str.rstrip('%').astype(float),
                         bins=tolerance_level+[np.inf], labels=tolerance_level,
                         right=False),
                  tmp['variable']
                 ).reset_index()

Output:

  tolerance_level  A  B  C
0               0  1  0  0
1               1  0  1  0
2               5  0  0  1
3              10  2  0  0
4              30  0  2  0
5              50  0  0  3
6             100  1  0  0
7             300  0  1  0
8             500  0  0  1
9             700  1  1  0

The above will assign a given value to bin with the the nearest lower boundary. If you want to assign to the nearest upper bound, change:

out = pd.crosstab(pd.cut(tmp['tolerance_level'].str.rstrip('%').astype(float),
                         bins=tolerance_level+[np.inf], labels=tolerance_level,
                         right=True, include_lowest=True),
                  tmp['variable']
                 ).rename_axis(columns=None).reset_index()

Output:

  tolerance_level  A  B  C
0               0  1  1  0
1               1  0  0  1
2               5  2  0  0
3              20  0  2  0
4              30  0  0  3
5              50  1  0  0
6             200  0  1  0
7             300  0  0  1
8             500  1  0  0
9             700  0  1  0

Upvotes: 1

Related Questions