Add a text column to the dataframe based on characters in another column of the same dataframe

Question

The following is a small sample of my data frame:

> print(df1)
        Section                     A                      B                      
1   ./100-12-S00.xlsx           0.6032591              0.4554364
2   ./108-15-S01.xlsx           0.7109148              0.5367121
3   ./92-12-S00.xlsx            0.6032591              0.4554364
4   ./124-9-S05.xlsx            0.7109148              0.5367121
5   ./116-15-S11.xlsx           0.6032591              0.4554364
6   ./108-9-S06.xlsx            0.7109148              0.5367121
7   ./84-12-S08.xlsx            0.6032591              0.4554364
8   ./124-15-S11.xlsx           0.7109148              0.5367121
9   ./92-12-S00.xlsx            0.6032591              0.4554364
10  ./116-15-S03.xlsx           0.7109148              0.5367121
11  ./100-12-S02.xlsx           0.6032591              0.4554364
12  ./84-9-S05.xlsx             0.7109148              0.5367121

I would like to add a new column of texts to df1 based on the characters in the Section column. This must be done such that if Section includes:

> print(df2)
     Section    Names
1    S00        Baseline
2    S01        Sample1
3    S02        Sample2
4    S03        Sample3
5    S04        Sample4
6    S05        Sample5
7    S06        Sample6
8    S07        Sample7
9    S08        Sample8
10   S09        Sample9
11   S10        Sample10
12   S11        AASHTO

I want df1 to include a new column called Names based on what is in the Section column.

The expected output is:

> print(df3)
        Section                 A              B          Names                  
1   ./100-12-S00.xlsx       0.6032591      0.4554364    Baseline
2   ./108-15-S01.xlsx       0.7109148      0.5367121    Sample1
3   ./92-12-S00.xlsx        0.6032591      0.4554364    Baseline
4   ./124-9-S05.xlsx        0.7109148      0.5367121    Sample5
5   ./116-15-S11.xlsx       0.6032591      0.4554364    AASHTO
6   ./108-9-S06.xlsx        0.7109148      0.5367121    Sample6
7   ./84-12-S08.xlsx        0.6032591      0.4554364    Sample8
8   ./124-15-S11.xlsx       0.7109148      0.5367121    AASHTO
9   ./92-12-S00.xlsx        0.6032591      0.4554364    Baseline
10  ./116-15-S03.xlsx       0.7109148      0.5367121    Sample3
11  ./100-12-S02.xlsx       0.6032591      0.4554364    Sample2
12  ./84-9-S05.xlsx         0.7109148      0.5367121    Sample5

akrun · Accepted Answer

We can create a new column by extracting the substring in 'df1' and do a left join with 'df2'

library(dplyr)
library(stringr)
df1 %>%
   mutate(Section2 = str_extract(Section, "S\d+")) %>%
   left_join(df2, by = c("Section2" = "Section")) %>%
   select(-Section2)
#         Section         A         B    Names
#1  ./100-12-S00.xlsx 0.6032591 0.4554364 Baseline
#2  ./108-15-S01.xlsx 0.7109148 0.5367121  Sample1
#3   ./92-12-S00.xlsx 0.6032591 0.4554364 Baseline
#4   ./124-9-S05.xlsx 0.7109148 0.5367121  Sample5
#5  ./116-15-S11.xlsx 0.6032591 0.4554364   AASHTO
#6   ./108-9-S06.xlsx 0.7109148 0.5367121  Sample6
#7   ./84-12-S08.xlsx 0.6032591 0.4554364  Sample8
#8  ./124-15-S11.xlsx 0.7109148 0.5367121   AASHTO
#9   ./92-12-S00.xlsx 0.6032591 0.4554364 Baseline
#10 ./116-15-S03.xlsx 0.7109148 0.5367121  Sample3
#11 ./100-12-S02.xlsx 0.6032591 0.4554364  Sample2
#12   ./84-9-S05.xlsx 0.7109148 0.5367121  Sample5

Or without much effort, use regex_left_join from fuzzyjoin

library(fuzzyjoin)
regex_left_join(df1, df2, by = 'Section') %>%
    select(Section = Section.x, A, B, Names)
# A tibble: 12 x 4
#   Section               A     B Names   
#                     
# 1 ./100-12-S00.xlsx 0.603 0.455 Baseline
# 2 ./108-15-S01.xlsx 0.711 0.537 Sample1 
# 3 ./92-12-S00.xlsx  0.603 0.455 Baseline
# 4 ./124-9-S05.xlsx  0.711 0.537 Sample5 
# 5 ./116-15-S11.xlsx 0.603 0.455 AASHTO  
# 6 ./108-9-S06.xlsx  0.711 0.537 Sample6 
# 7 ./84-12-S08.xlsx  0.603 0.455 Sample8 
# 8 ./124-15-S11.xlsx 0.711 0.537 AASHTO  
# 9 ./92-12-S00.xlsx  0.603 0.455 Baseline
#10 ./116-15-S03.xlsx 0.711 0.537 Sample3 
#11 ./100-12-S02.xlsx 0.603 0.455 Sample2 
#12 ./84-9-S05.xlsx   0.711 0.537 Sample5

data

df1 <- structure(list(Section = c("./100-12-S00.xlsx", "./108-15-S01.xlsx", 
"./92-12-S00.xlsx", "./124-9-S05.xlsx", "./116-15-S11.xlsx", 
"./108-9-S06.xlsx", "./84-12-S08.xlsx", "./124-15-S11.xlsx", 
"./92-12-S00.xlsx", "./116-15-S03.xlsx", "./100-12-S02.xlsx", 
"./84-9-S05.xlsx"), A = c(0.6032591, 0.7109148, 0.6032591, 0.7109148, 
0.6032591, 0.7109148, 0.6032591, 0.7109148, 0.6032591, 0.7109148, 
0.6032591, 0.7109148), B = c(0.4554364, 0.5367121, 0.4554364, 
0.5367121, 0.4554364, 0.5367121, 0.4554364, 0.5367121, 0.4554364, 
0.5367121, 0.4554364, 0.5367121)), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))

df2 <- structure(list(Section = c("S00", "S01", "S02", "S03", "S04", 
"S05", "S06", "S07", "S08", "S09", "S10", "S11"), Names = c("Baseline", 
"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6", 
"Sample7", "Sample8", "Sample9", "Sample10", "AASHTO")), 
 class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))

Add a text column to the dataframe based on characters in another column of the same dataframe

Answers (1)

data

Related Questions