Reputation: 91

rbind if column contains partial match R

I have a lists of dataframes where I have managed to successfully loop through the lists and merge the tables as I required. I have done this by something similar to the following code:

anno_files<-c("R1000A_v1.hg19_multianno.txt", "R1000B_v1.hg19_multianno.txt" ,"R1000C_v1.hg19_multianno.txt", "RC1080A_v1.hg19_multianno.txt", "RC1080B_v1.hg19_multianno.txt" )

cancer_files<-c("R1000A_v1.hg19_multianno.txt.cancervar", "R1000B_v1.hg19_multianno.txt.cancervar" ,"R1000C_v1.hg19_multianno.txt.cancervar", "RC1080A_v1.hg19_multianno.txt.cancervar", "RC1080B_v1.hg19_multianno.txt.cancervar")

# extract the sample names from the whole filename
samples<- c()
# Iterate through each filename
for (filename in anno_files) {
  # Pull out the characters prior to the first '.'
  filename_id <- unlist(strsplit(filename, "\\."))[1]
  # append to sample_names vector
  samples<- c(samples, filename_id)
}

myfilelist <- lapply(anno_files, read.delim,header=T)
myfilelist2<-lapply(cancer_files, read.csv,sep="\t",header=T)

    library(dplyr)
    for (i in 1:length(myfilelist)){
      all_annotation<-left_join(myfilelist[[i]], myfilelist2[[i]], by = c("Chr","Start","End","Ref","Alt"))
      all_annotation$sample_id<-samples[i]
      write.csv(all_annotation, paste("data/",samples[i],".merged.csv",sep = ""))
}

I would like to put in an additional line of code to rbind the tables if they have the same first part of the filename/sample name eg rbind tables "R1000A_v1", "R1000B_v1" ,"R1000C_v1" as they are from sample R1000 and rbind "RC1080A_v1", "RC1080B_v1" as they are from sample RC1080

I have managed to get out a list of the part of the sample name that I want to match:

samples2<- c()
# Iterate through each filename
for (filename in samples) {
  # Pull out the characters prior to the first 'underscore'.'
  filename2_id<-unlist(strsplit(filename, "\\_"))[1]
  filename3_id<-substr(filename2_id,1,nchar(filename2_id)-1)
  # append to sample_names vector
  samples2<- unique(c(samples2, filename3_id))
}
samples2
"R1000""RC1080"

So now within the all_annoatation loop I need to do something like:

for (j in length(samples2)){
if all_annotation$sample_id contains samples2[j] 
rbind
write.csv
}

Upvotes: 1

Answers (3)

Parfait

Reputation: 107577

Consider apply family solutions (lapply, sapply, Map which is wrapper of mapply) and avoid the bookkeeping required of for loops. And to extract needed samples use string pattern functions gsub and grep. Finally, run do.call and rbind to stack like named data frames.

anno_files <- c(
    "R1000A_v1.hg19_multianno.txt",
    "R1000B_v1.hg19_multianno.txt",
    "R1000C_v1.hg19_multianno.txt",
    "RC1080A_v1.hg19_multianno.txt",
    "RC1080B_v1.hg19_multianno.txt" 
)
anno_samples <- sapply(strsplit(anno_files, "\\."), `[`, 1)

cancer_files <- c(
    "R1000A_v1.hg19_multianno.txt.cancervar", 
    "R1000B_v1.hg19_multianno.txt.cancervar",
    "R1000C_v1.hg19_multianno.txt.cancervar", 
    "RC1080A_v1.hg19_multianno.txt.cancervar", 
    "RC1080B_v1.hg19_multianno.txt.cancervar"
) 

cancer_samples <- sapply(strsplit(cancer_files, "\\."), `[`, 1)

process_data <- function(txt_file, s_id) {
    data.frame(
        sample_id = s_id,        # FIRST COLUMN
        read.delim(txt_file),
        source = txt_file        # LAST COLUMN
    )
} 

anno_dfs <- Map(process_data, anno_files, anno_samples)
cancer_dfs <- Map(process_data, cancer_files, cancer_samples)

merged_dfs <- Map(
    # BASE R LEFT JOIN
    merge(x, y, by=c("Chr","Start","End","Ref","Alt"), all.x=TRUE),
    anno_dfs,
    cancer_dfs
) |> setNames(   # NEW PIPE AS OF R v4.1.0+
    anno_samples
)

roots <- unique(gsub("[A-Z]\\_.*$", "", anno_samples))
rbind_dfs <- lapply(
    roots,
    function(r) do.call(
        rbind.data.frame, 
        merged_dfs[grep(r, names(merged_dfs))]
    )
)

Upvotes: 1

PavoDive

Reputation: 6496

This can be easily donde with data.table.

library(data.table)

You'll need to change the csv reading line, so each object is loaded as a data.table (if your csv are big, you'll notice this is fast too):

# myfilelist <- lapply(anno_files, read.delim,header=T)
myfilelist <- lapply(anno_files, function(x) assign(gsub("(.*)(\\.txt)", "\\1", x), fread(x), .GlobalEnv))

# myfilelist2 <- lapply(cancer_files, read.csv,sep="\t",header=T)
myfilelist2 <- lapply(cancer_files, function(x) assign(gsub("(.*)(\\.txt\\.cancervar)", "\\1", x), fread(x), .GlobalEnv))

Once your objects are created, I'll assume that you know what the pattern on the name is, so you can do something like:

R100_total = rbindlist(lapply(ls(pattern = "R100"), get))
R1080_total = rbindlist(lapply(ls(pattern = "R1080"), get))

Provided you don't have any other objects named with the same pattern that you don't want to rbind.

Upvotes: 1

Brian Montgomery

Reputation: 2414

I can't really test this without all the files, but I think this is close:

for (j in seq_along(samples2)) {
  all_annotation[grepl(samples2[j], samples)]
    temp <- do.call('rbind', all_annotation[grepl(samples2[j], samples)])
    write.csv(temp, paste0(samples2[j], '.csv'))
  }  
}

Upvotes: 1

rbind if column contains partial match R

Answers (3)

Related Questions