Reputation: 91
I have a lists of dataframes where I have managed to successfully loop through the lists and merge the tables as I required. I have done this by something similar to the following code:
anno_files<-c("R1000A_v1.hg19_multianno.txt", "R1000B_v1.hg19_multianno.txt" ,"R1000C_v1.hg19_multianno.txt", "RC1080A_v1.hg19_multianno.txt", "RC1080B_v1.hg19_multianno.txt" )
cancer_files<-c("R1000A_v1.hg19_multianno.txt.cancervar", "R1000B_v1.hg19_multianno.txt.cancervar" ,"R1000C_v1.hg19_multianno.txt.cancervar", "RC1080A_v1.hg19_multianno.txt.cancervar", "RC1080B_v1.hg19_multianno.txt.cancervar")
# extract the sample names from the whole filename
samples<- c()
# Iterate through each filename
for (filename in anno_files) {
# Pull out the characters prior to the first '.'
filename_id <- unlist(strsplit(filename, "\\."))[1]
# append to sample_names vector
samples<- c(samples, filename_id)
}
myfilelist <- lapply(anno_files, read.delim,header=T)
myfilelist2<-lapply(cancer_files, read.csv,sep="\t",header=T)
library(dplyr)
for (i in 1:length(myfilelist)){
all_annotation<-left_join(myfilelist[[i]], myfilelist2[[i]], by = c("Chr","Start","End","Ref","Alt"))
all_annotation$sample_id<-samples[i]
write.csv(all_annotation, paste("data/",samples[i],".merged.csv",sep = ""))
}
I would like to put in an additional line of code to rbind the tables if they have the same first part of the filename/sample name eg rbind tables "R1000A_v1", "R1000B_v1" ,"R1000C_v1"
as they are from sample R1000 and rbind "RC1080A_v1", "RC1080B_v1"
as they are from sample RC1080
I have managed to get out a list of the part of the sample name that I want to match:
samples2<- c()
# Iterate through each filename
for (filename in samples) {
# Pull out the characters prior to the first 'underscore'.'
filename2_id<-unlist(strsplit(filename, "\\_"))[1]
filename3_id<-substr(filename2_id,1,nchar(filename2_id)-1)
# append to sample_names vector
samples2<- unique(c(samples2, filename3_id))
}
samples2
"R1000""RC1080"
So now within the all_annoatation loop I need to do something like:
for (j in length(samples2)){
if all_annotation$sample_id contains samples2[j]
rbind
write.csv
}
Upvotes: 1
Views: 197
Reputation: 107577
Consider apply family solutions (lapply
, sapply
, Map
which is wrapper of mapply
) and avoid the bookkeeping required of for
loops. And to extract needed samples use string pattern functions gsub
and grep
. Finally, run do.call
and rbind
to stack like named data frames.
anno_files <- c(
"R1000A_v1.hg19_multianno.txt",
"R1000B_v1.hg19_multianno.txt",
"R1000C_v1.hg19_multianno.txt",
"RC1080A_v1.hg19_multianno.txt",
"RC1080B_v1.hg19_multianno.txt"
)
anno_samples <- sapply(strsplit(anno_files, "\\."), `[`, 1)
cancer_files <- c(
"R1000A_v1.hg19_multianno.txt.cancervar",
"R1000B_v1.hg19_multianno.txt.cancervar",
"R1000C_v1.hg19_multianno.txt.cancervar",
"RC1080A_v1.hg19_multianno.txt.cancervar",
"RC1080B_v1.hg19_multianno.txt.cancervar"
)
cancer_samples <- sapply(strsplit(cancer_files, "\\."), `[`, 1)
process_data <- function(txt_file, s_id) {
data.frame(
sample_id = s_id, # FIRST COLUMN
read.delim(txt_file),
source = txt_file # LAST COLUMN
)
}
anno_dfs <- Map(process_data, anno_files, anno_samples)
cancer_dfs <- Map(process_data, cancer_files, cancer_samples)
merged_dfs <- Map(
# BASE R LEFT JOIN
merge(x, y, by=c("Chr","Start","End","Ref","Alt"), all.x=TRUE),
anno_dfs,
cancer_dfs
) |> setNames( # NEW PIPE AS OF R v4.1.0+
anno_samples
)
roots <- unique(gsub("[A-Z]\\_.*$", "", anno_samples))
rbind_dfs <- lapply(
roots,
function(r) do.call(
rbind.data.frame,
merged_dfs[grep(r, names(merged_dfs))]
)
)
Upvotes: 1
Reputation: 6496
This can be easily donde with data.table
.
library(data.table)
You'll need to change the csv reading line, so each object is loaded as a data.table (if your csv are big, you'll notice this is fast too):
# myfilelist <- lapply(anno_files, read.delim,header=T)
myfilelist <- lapply(anno_files, function(x) assign(gsub("(.*)(\\.txt)", "\\1", x), fread(x), .GlobalEnv))
# myfilelist2 <- lapply(cancer_files, read.csv,sep="\t",header=T)
myfilelist2 <- lapply(cancer_files, function(x) assign(gsub("(.*)(\\.txt\\.cancervar)", "\\1", x), fread(x), .GlobalEnv))
Once your objects are created, I'll assume that you know what the pattern on the name is, so you can do something like:
R100_total = rbindlist(lapply(ls(pattern = "R100"), get))
R1080_total = rbindlist(lapply(ls(pattern = "R1080"), get))
Provided you don't have any other objects named with the same pattern that you don't want to rbind
.
Upvotes: 1
Reputation: 2414
I can't really test this without all the files, but I think this is close:
for (j in seq_along(samples2)) {
all_annotation[grepl(samples2[j], samples)]
temp <- do.call('rbind', all_annotation[grepl(samples2[j], samples)])
write.csv(temp, paste0(samples2[j], '.csv'))
}
}
Upvotes: 1