MERose
MERose

Reputation: 4421

Replace strings in all dataframe cells by corresponding entries in another data frame

I have a dataframe with a differing number of names in a cell of a dataframe which I want to replace with corresponding numbers of another dataframe. Afterwards, I want to proceed and calculate the mean and maximum but thats not part of my problem.

df_with_names <-read.table(text="
id names
1 AA,BB
2 AA,CC,DD
3 BB,CC
4 AA,BB,CC,DD
",header=TRUE,sep="")

The dataframe with the correspoding numbers looks like

df_names <-read.table(text="
name number_1 number_2
AA 20 30
BB 12 14
CC 13 29
DD 14 27
",header=TRUE,sep="")

At the end of the first step it should be

id number_1   number_2
1 20,12       30,14
2 20,13,14    30,29,27
3 12,13       14,29
4 20,12,13,14 30,14,29,27

From here I know how to proceed but I don't know how to get there.

I tried to separate the names of each row in a loop into a dataframe and then replace the names but I always fail to get the right column of df_with_names. After a while, I doubt that replace() is the function I am looking for. Who can help?

Upvotes: 4

Views: 716

Answers (6)

thelatemail
thelatemail

Reputation: 93833

I think it would actually be worth your while to rearrange your df_with_names dataset to make things more straight-forward:

spl <- strsplit(as.character(df_with_names$names), ",")
df_with_names <- data.frame(
  id=rep(df_with_names$id, sapply(spl, length)),
  name=unlist(spl)
)

#   id name
#1   1   AA
#2   1   BB
#3   2   AA
#4   2   CC
#5   2   DD
#6   3   BB
#7   3   CC
#8   4   AA
#9   4   BB
#10  4   CC
#11  4   DD

aggregate(
  . ~ id,
  data=merge(df_with_names, df_names, by="name")[-1],
  FUN=function(x) paste(x,collapse=",")
)

#  id    number_1    number_2
#1  1       20,12       30,14
#2  2    20,13,14    30,29,27
#3  3       12,13       14,29
#4  4 20,12,13,14 30,14,29,27

Upvotes: 0

rnso
rnso

Reputation: 24555

Another method:

df3 = data.frame(id=df1$id, 
                 number_1=as.character(df1$names), 
                 number_2=as.character(df1$names), stringsAsFactors=FALSE)

for(n1 in 1:nrow(df3))
    for(n2 in 1:nrow(df2)){
            df3[n1,2] = sub(df2[n2,1],df2[n2,2], df3[n1,2] )   
            df3[n1,3] = sub(df2[n2,1],df2[n2,3], df3[n1,3] )   
    }

df3
#  id    number_1    number_2
#1  1       20,12       30,14
#2  2    20,13,14    30,29,27
#3  3       12,13       14,29
#4  4 20,12,13,14 30,14,29,27

Upvotes: 0

IRTFM
IRTFM

Reputation: 263352

I don't like names like "names" or "name", so I went with "nam":

 do.call( rbind,    # reassembles the individual lists
    apply(df_with_names, 1,   # for each row in df_with_names
          function(x)  lapply(   #  lapply(..., paste) to each column
             # Next line will read each comma separated value and 
             # and match to rows of df_names[] and return cols 2:3
               df_names[ df_names$nam %in% scan(text=x[2], what="", sep=",") ,
                       2:3, drop=FALSE] ,   # construct packet of text digits
                                paste0, collapse=",") ) )


     number_1      number_2     
[1,] "20,12"       "30,14"      
[2,] "20,13,14"    "30,29,27"   
[3,] "12,13"       "14,29"      
[4,] "20,12,13,14" "30,14,29,27"

(I'm surprised that scan(text= ... a factor variable actually succeeded.)

Upvotes: 0

arvi1000
arvi1000

Reputation: 9582

Another all in one:

data2match <- strsplit(df_with_names$names, ',')

lookup <- function(lookfor, in_df, return_col, search_col=1) {
  in_df[, return_col][match(lookfor, in_df[, search_col])]
}

output <- 
  # for each number_x column....
  sapply(names(df_names)[-1],
         function(y) {
           # for each set of names
           sapply(data2match,
                  function(x) paste(sapply(x, lookup, df_names, 
                                  y, USE.NAMES=F), collapse=','))
         })

data.frame(id=1:nrow(output), output)

Produces:

  id    number_1    number_2
1  1       20,12       30,14
2  2    20,13,14    30,29,27
3  3       12,13       14,29
4  4 20,12,13,14 30,14,29,27

Upvotes: 2

eddi
eddi

Reputation: 49448

library(data.table)

dt1 = as.data.table(df_with_names)
dt2 = as.data.table(df_names)

setkey(dt2, name)

dt2[setkey(dt1[, strsplit(as.character(names), split = ","), by = id], V1)][,
    lapply(.SD, paste0, collapse = ","), keyby = id]
#   id        name    number_1    number_2
#1:  1       AA,BB       20,12       30,14
#2:  2    AA,CC,DD    20,13,14    30,29,27
#3:  3       BB,CC       12,13       14,29
#4:  4 AA,BB,CC,DD 20,12,13,14 30,14,29,27

The above first splits the names along the comma in the first data.table, then joins that with the second one (after setting keys appropriately) and collapses all of the resulting columns back with a comma.

Upvotes: 2

Brandon Bertelsen
Brandon Bertelsen

Reputation: 44648

Note: make sure both dataframes are ordered by id otherwise you may see unexpected results

listing <- df_with_names
listing <- strsplit(as.character(listing$names),",")

col1 <- lapply(listing, function(x) df_names[(df_names[[1]] %in% x),2])
col2 <- lapply(listing, function(x) df_names[(df_names[[1]] %in% x),3])

col1 <- unlist(lapply(col1, paste0, collapse = ","))
col2 <- unlist(lapply(col2, paste0, collapse = ","))

data.frame(number_1 = col1, number_2 = col2 )

     number_1    number_2
1       20,12       30,14
2    20,13,14    30,29,27
3       12,13       14,29
4 20,12,13,14 30,14,29,27

Upvotes: 0

Related Questions