minalee
minalee

Reputation: 13

Add new values to dataframe when multiple match with apply()

I have a data frame df as below

d = {'letter_num' :['Nr. 1', 'Nr. 2', 'Nr. 3', 'Nr. 3']}

df = pd.DataFrame(d)
print(df)

   letter_num
0         Nr. 1
1         Nr. 2
2         Nr. 3
3         Nr. 3

letters = pd.DataFrame(d, columns=['letter_num'])

I want to add the following dictionary's key and values as new columns to the above data frame on the condition that the (digits in the) keys match the existing (digit) values in the letter_num column in the df.

labels = {'[1]': 'budget', '[2]': 'budget', '[3 a]': 'expensive', '[3 b]': 'sport'}


def apply_and_concat(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

def matcher(k):
    for i,j in labels.items():
      num =  re.search('(\d+)', i).group()
      if num in k.split(' '): 
        return i,j

apply_and_concat(df, 'letter_num', matcher, ['letters','content'])

The above code gives me an output as below:

 letter_num letters content
0   Nr. 1   [1]     budget
1   Nr. 2   [2]     budget
2   Nr. 3   [3 a]   expensive
3   Nr. 3   [3 a]   expensive


Expected Output:

 letter_num letters content
0   Nr. 1   [1]     budget
1   Nr. 2   [2]     budget
2   Nr. 3   [3 a]   expensive
3   Nr. 3   [3 b]   sport

Can somebody please help me out?

Upvotes: 1

Views: 64

Answers (1)

jezrael
jezrael

Reputation: 863216

Use a bit different approach - idea is create new DataFrame by labels, get numbers to new Series by Series.str.extract and mainly add counter of them by GroupBy.cumcount.

In this solutions are join together by Series.str.cat and set to index in both, so last is possible use DataFrame.join:

d = {'letter_num' :['Nr. 1', 'Nr. 2', 'Nr. 3', 'Nr. 3']}

letters = pd.DataFrame(d, columns=['letter_num'])

labels = {'[1]': 'budget', '[2]': 'budget', '[3 a]': 'expensive', '[3 b]': 'sport'}

df1 = pd.DataFrame({(k, v) for k, v in labels.items()}, columns=['letters','content'])
num = df1['letters'].str.extract(r'(\d+)', expand=False)
df1.index = df1.groupby(num).cumcount().astype(str).str.cat(num, sep='|')
print (df1)
    letters    content
0|3   [3 a]  expensive
0|2     [2]     budget
0|1     [1]     budget
1|3   [3 b]      sport

df = pd.DataFrame(d)

num = df['letter_num'].str.extract(r'(\d+)', expand=False)
df.index = df.groupby(num).cumcount().astype(str).str.cat(num, sep='|')
print (df)

    letter_num
0|1      Nr. 1
0|2      Nr. 2
0|3      Nr. 3
1|3      Nr. 3

df = df.join(df1).reset_index(drop=True)
print (df)
  letter_num letters    content
0      Nr. 1     [1]     budget
1      Nr. 2     [2]     budget
2      Nr. 3   [3 a]  expensive
3      Nr. 3   [3 b]      sport

Or create new columns and use DataFrame.merge with left join:

d = {'letter_num' :['Nr. 1', 'Nr. 2', 'Nr. 3', 'Nr. 3']}

letters = pd.DataFrame(d, columns=['letter_num'])

labels = {'[1]': 'budget', '[2]': 'budget', '[3 a]': 'expensive', '[3 b]': 'sport'}

df1 = pd.DataFrame({(k, v) for k, v in labels.items()}, columns=['letters','content'])
df1['num'] = df1['letters'].str.extract(r'(\d+)', expand=False)
df1['g'] = df1.groupby('num').cumcount()
print (df1)
  letters    content num  g
0   [3 a]  expensive   3  0
1     [2]     budget   2  0
2     [1]     budget   1  0
3   [3 b]      sport   3  1

df = pd.DataFrame(d)
#print (df)

df['num'] = df['letter_num'].str.extract(r'(\d+)', expand=False)
df['g'] = df.groupby('num').cumcount()
print (df)
  letter_num num  g
0      Nr. 1   1  0
1      Nr. 2   2  0
2      Nr. 3   3  0
3      Nr. 3   3  1

df = df.merge(df1, on=['num','g'], how='left')
print (df)
  letter_num num  g letters    content
0      Nr. 1   1  0     [1]     budget
1      Nr. 2   2  0     [2]     budget
2      Nr. 3   3  0   [3 a]  expensive
3      Nr. 3   3  1   [3 b]      sport

df = df.drop(['num','g'], axis=1)
print (df)
  letter_num letters    content
0      Nr. 1     [1]     budget
1      Nr. 2     [2]     budget
2      Nr. 3   [3 a]  expensive
3      Nr. 3   [3 b]      sport

Upvotes: 1

Related Questions