navac
navac

Reputation: 87

Iterate over two pandas dataframes and compare similarly named columns by row

I have two dfs and I would like to create a modified 3rd df that adds a character string to the value of df1 based on the same position (row x column) in df2. My for loop does not seem to be working correctly, and is not adding the character string as anticipated. Where am I going wrong with my forloops? I've tried several versions of .iterrows() and .iterritems() to no avail.

data1 =  {'col1' :['2.28', '-0.38', '0.16', '-0.00', '-0.11', '0.00', '-0.00','0.92', '0.58', '0.90', '0.80'],
         'col2': ['2.23', '-0.38', '0.17', '-0.00', '-0.10', '0.00', '-0.00','0.89', '0.57', '0.89', '0.77'],
         'col3': ['2.25', '-0.31', '0.17', '0.00', '-0.10', '0.00', '-0.00','0.88', '0.55', '0.89', '0.78'],
         'col4': ['2.22', '-0.16', '0.17', '0.00', '-0.08', '0.00', '-0.00','0.85', '0.52', '0.85', '0.76']}
df1 = pd.DataFrame(data1)

data2 =  {'col1' :[0.0043463 , 0.02835221, 0.01998397, 0.95802428, 0.30099018,
                   0.88572679, 0.49670165, 0.24806438, 0.46116773, 0.26479583,
                   0.30652824],
         'col2': [0.00557248, 0.14322473, 0.01656108, 0.99548637, 0.32824552,
                  0.78092598, 0.45962261, 0.26562723, 0.47006285, 0.27588199,
                  0.32995383],
         'col3': [0.00516395, 0.1432596 , 0.01855883, 0.95806069, 0.33487223,
                  0.90526158, 0.373449  , 0.27230896, 0.48722896, 0.27608198,
                  0.323059  ],
         'col4': [0.00476079, 0.08205838, 0.03080842, 0.8642895 , 0.46951708,
                  0.75946821, 0.25630978, 0.30713967, 0.52637885, 0.29876842,
                  0.36094742]}
df2 = pd.DataFrame(data2)


final_df = pd.DataFrame()
for idx1,c in enumerate(df1): 
  for idx2,p in enumerate(df2):    
    final_df[c] = np.where(df2[p] < .05, df1[c]+'*',
                        np.where(df2[p] <.01,df1[c]+'**',
                                 np.where(df2[p] <.001 , df1[c]+'***',df1[c])))
    final_df = pd.DataFrame(final_df)
final_df 

desired output:

    col1    col2    col3    col4
0   2.28**  2.23**  2.25**  2.22**
1   -0.38*  -0.38   -0.31   -0.16
2   0.16*   0.17*   0.17*   0.17*
3   -0.00   -0.00   0.00    0.00
4   -0.11   -0.10   -0.10   -0.08

Upvotes: 0

Views: 1592

Answers (1)

mr_mooo_cow
mr_mooo_cow

Reputation: 1128

I don't think this is the best answer but I believe it achieves the result you're looking for.

Firstly, I don't see any reason to iterate through df2 since you are using np.where to compare the same column in both dataframes.

import pandas as pd
import numpy as np

data1 =  {'col1' :['2.28', '-0.38', '0.16', '-0.00', '-0.11', '0.00', '-0.00','0.92', '0.58', '0.90', '0.80'],
         'col2': ['2.23', '-0.38', '0.17', '-0.00', '-0.10', '0.00', '-0.00','0.89', '0.57', '0.89', '0.77'],
         'col3': ['2.25', '-0.31', '0.17', '0.00', '-0.10', '0.00', '-0.00','0.88', '0.55', '0.89', '0.78'],
         'col4': ['2.22', '-0.16', '0.17', '0.00', '-0.08', '0.00', '-0.00','0.85', '0.52', '0.85', '0.76']}
df1 = pd.DataFrame(data1)

data2 =  {'col1' :[0.0043463 , 0.02835221, 0.01998397, 0.95802428, 0.30099018,
                   0.88572679, 0.49670165, 0.24806438, 0.46116773, 0.26479583,
                   0.30652824],
         'col2': [0.00557248, 0.14322473, 0.01656108, 0.99548637, 0.32824552,
                  0.78092598, 0.45962261, 0.26562723, 0.47006285, 0.27588199,
                  0.32995383],
         'col3': [0.00516395, 0.1432596 , 0.01855883, 0.95806069, 0.33487223,
                  0.90526158, 0.373449  , 0.27230896, 0.48722896, 0.27608198,
                  0.323059  ],
         'col4': [0.00476079, 0.08205838, 0.03080842, 0.8642895 , 0.46951708,
                  0.75946821, 0.25630978, 0.30713967, 0.52637885, 0.29876842,
                  0.36094742]}
df2 = pd.DataFrame(data2)


final_df = pd.DataFrame()
for idx1, c in enumerate(df1):
    # just add a * for each level, and after the first modifcation reference itself(final_df) so we don't overwrite any *'s
    final_df[c] = np.where(df2[c] < .05, df1[c]+'*',df1[c])
    final_df[c] = np.where(df2[c] <.01,final_df[c]+'*', final_df[c])
    final_df[c] = np.where(df2[c] <.001 , final_df[c]+'*', final_df[c])
    final_df = pd.DataFrame(final_df)
final_df

Output:

col1 col2 col3 col4 
[0: 2.28** 2.23** 2.25** 2.22**], 
[1: -0.38* -0.38 -0.31 -0.16], 
[2: 0.16* 0.17* 0.17* 0.17*], 
[3: -0.00 -0.00 0.00 0.00], 
[4: -0.11 -0.10 -0.10 -0.08], 
[5: 0.00 0.00 0.00 0.00], 
[6: -0.00 -0.00 -0.00 -0.00], 
[7: 0.92 0.89 0.88 0.85]...

Upvotes: 1

Related Questions