jjgasse
jjgasse

Reputation: 329

Replace zeros with ones and ones with zeros in a row in pandas

I built a function that normalizes an attribute data set by creating dummy (one-hot) columns. I want to change each value from zero to one and from one to zero in every row where the number of ones is greater than the number of zeros:

def dummy_data(data, columns):
    """One-hot encode ``columns`` and invert rows holding more ones than zeros.

    Each column named in ``columns`` is replaced by its indicator columns
    (prefixed with the original column name). Afterwards, for every row in
    which the count of values equal to 1 exceeds the count of values equal
    to 0, all 0s in that row become 1 and all 1s become 0.

    The counts and the swap are applied across the whole resulting frame,
    so 0/1 values in columns that were not encoded take part as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified in place.
    columns : iterable
        Labels of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The encoded frame with the qualifying rows inverted.
    """
    for column in columns:
        # dtype=int keeps the indicators as integer 0/1; newer pandas would
        # otherwise return booleans, which an integer 0/1 replace skips.
        indicators = pd.get_dummies(data[column], prefix=column, dtype=int)
        data = pd.concat([data, indicators], axis=1).drop(column, axis=1)

    n_zero = (data == 0).sum(axis=1)
    n_uno = (data == 1).sum(axis=1)
    rows_to_flip = n_uno > n_zero

    # Swap 0 <-> 1 in a single replace: mapping in two sequential passes
    # (0 -> 1, then 1 -> 0) would undo itself, and Series.map with a partial
    # dict turns every unmapped value into NaN.
    swapped = data.replace({0: 1, 1: 0})

    # A boolean row mask avoids positional-vs-label indexing problems when
    # the frame does not carry a default RangeIndex.
    return data.mask(rows_to_flip, swapped)


# Placeholder list: fill in the names of the columns to one-hot encode.
dummy_columns = ["ATTRIBUTE1",..."ATTRIBIUTE N"]
# Rebind df to the frame returned by dummy_data.
df=dummy_data(df, dummy_columns)

The function does not replace my zero and one values.

Upvotes: 1

Views: 1246

Answers (2)

b2002
b2002

Reputation: 914

A fast method for finding and reversing ones and zeros, using NumPy's `logical_not`:

def dummy_data(data_df, dummy_columns):
    """One-hot encode ``dummy_columns`` and invert majority-one rows.

    The columns listed in ``dummy_columns`` are passed to ``pd.get_dummies``.
    For every row whose indicator values contain more ones than zeros, the
    indicator values are logically negated. Columns not listed in
    ``dummy_columns`` are carried over unchanged, in their original order,
    and placed before the indicator columns.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input frame; it is not modified.
    dummy_columns : list
        Labels of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        Untouched columns followed by the (possibly inverted) indicators.
    """
    # Select the untouched columns by iterating data_df.columns so that
    # their order is preserved (a set difference has no defined order).
    encoded = set(dummy_columns)
    static_df = data_df[[col for col in data_df.columns if col not in encoded]]

    df = pd.get_dummies(data_df[dummy_columns])
    # Work on a private copy: .values may be a view of df or read-only.
    vals = df.values.copy()

    ones_count = np.count_nonzero(vals, axis=1)
    zeros_count = vals.shape[1] - ones_count
    idx = np.where(ones_count > zeros_count)[0]

    # Negate only the rows in which ones outnumber zeros.
    vals[idx, :] = np.logical_not(vals[idx, :])
    result_df = pd.concat(
        [static_df, pd.DataFrame(vals, index=df.index, columns=df.columns)],
        axis=1,
    )

    return result_df

Upvotes: 0

jezrael
jezrael

Reputation: 862661

I think you need:

def dummy_data(data, columns):
    """One-hot encode ``columns`` and invert rows holding more ones than zeros.

    All columns in ``columns`` are encoded together with ``pd.get_dummies``
    and the originals are dropped. For every row in which the count of
    values equal to 1 exceeds the count of values equal to 0, all 0s become
    1 and all 1s become 0.

    The counts and the swap are applied across the whole resulting frame,
    so 0/1 values in columns that were not encoded take part as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified in place.
    columns : list
        Labels of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The encoded frame with the qualifying rows inverted.
    """
    # Encode all requested columns in one call. dtype=int forces integer 0/1
    # indicators; newer pandas would otherwise return boolean columns, which
    # the integer-keyed replace below leaves unchanged.
    indicators = pd.get_dummies(data[columns], dtype=int)
    data = pd.concat([data, indicators], axis=1).drop(columns, axis=1)

    # Boolean comparisons sum directly; no integer cast is needed.
    n_zero = (data == 0).sum(axis=1)
    n_uno = (data == 1).sum(axis=1)

    # Row mask: True where ones outnumber zeros.
    m = n_uno > n_zero
    # Take values from the swapped frame only in the masked rows.
    data = data.mask(m, data.replace({0: 1, 1: 0}))

    return data

Sample:

# Sample frame: four single-letter string columns (A-D), three rows.
df = pd.DataFrame({'A':list('abb'),
                   'B':list('bbb'),
                   'C':list('baa'),
                   'D':list('aaa')})

print (df)
   A  B  C  D
0  a  b  b  a
1  b  b  a  a
2  b  b  a  a

def dummy_data(data, columns):
    data =  pd.concat([data, pd.get_dummies(data[columns])], axis=1).drop(columns, axis=1)
    print (data)

   D  A_a  A_b  B_b  C_a  C_b
0  a    1    0    1    0    1
1  a    0    1    1    1    0
2  a    0    1    1    1    0

    n_zero = (data == 0).sum(axis=1)
    n_uno = (data == 1).sum(axis=1)
    m = n_uno > n_zero
    print (m)

0    True
1    True
2    True
dtype: bool

    data = data.mask(m, data.replace({0:1,1:0}))

    return data

# Columns to pass to dummy_data for encoding; D is not listed.
dummy_columns = ['A','B', 'C']
# Rebind df to the frame returned by dummy_data and display it.
df = dummy_data(df, dummy_columns)
print (df)

   D  A_a  A_b  B_b  C_a  C_b
0  a    0    1    0    1    0
1  a    1    0    0    0    1
2  a    1    0    0    0    1

Upvotes: 2

Related Questions