Reputation: 573
I have a script that calculates copy number variation and saves the data into an existing file named "genesforcomp1", matching rows on the first column. The input files are named BRCA1.txt, BRCA2.txt, BRCA3.txt, ..., BRCA4376.txt. The other input file, "genes.txt", is the same in each cycle and is used for annotation, while "genesforcomp1" is updated with the output of each cycle. Because of the large number of files, I would like to know whether I can do this with a loop in R. Here is my script:
setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library("scales")
# library() stops right here if the package is missing; require() would only
# return FALSE and the script would fail later at the first tidyverse call.
library(tidyverse)

# Create annotation or reference table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Recode the sex chromosomes as numbers (X -> 23, Y -> 24).
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
# NOTE(review): assumes genes.txt has exactly these four columns, in this
# order (gene id, chromosome, start, end) - confirm against the file.
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# File to be analyzed ----
# Three steps: pre-processing, comparison with the annotation table, and
# post-processing.
df <- read.table("BRCA1.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
df$Chromosome <- gsub("X", "23", df$Chromosome)
df$Chromosome <- gsub("Y", "24", df$Chromosome)
# NOTE(review): assumes the segment file has exactly six columns and that the
# last one ("extra2") holds the segment value - confirm.
colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

# Keep only genes that lie entirely inside a CNV segment.
hits <- findOverlaps(genes_GR, cnv, type = "within")
df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
df_ann <- unique(df_ann)
df_ann <- df_ann[, c("GeneSymbol", "extra2")]
colnames(df_ann) <- c("Ensembl_ID", "Seg_value")

# NOTE(review): the original script also built a column
#   Seg_value2 = sign(Seg_value) * 2 * (2^abs(Seg_value) - 1)
# and then dropped it again before rescaling, so it never reached the output.
# The dead computation is omitted here; if the transformed value was the one
# meant to be saved, rescale that expression instead of Seg_value.
df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

# Append the baseline gene list. Sample rows come first, so for genes present
# in both tables the sample value is the one kept by !duplicated().
df_ann1 <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
df <- rbind.data.frame(df_ann, df_ann1)
df <- df[!duplicated(df$Ensembl_ID), ]

# Save the results into the existing file, matched on the first column ----
df1 <- read.delim("genesforcomp1", check.names = FALSE, stringsAsFactors = FALSE)
# A direct full_join() is what reduce() over a two-element list amounts to, and
# it avoids depending on whether purrr or GenomicRanges owns the name `reduce`.
df2 <- full_join(data.frame(df1), data.frame(df), by = "Ensembl_ID")
# Genes missing from either side get 0 rather than NA.
df2 <- replace(df2, is.na(df2), 0)
write.table(df2, file = "genesforcomp1", quote = FALSE, sep = "\t", row.names = FALSE)
Any suggestions or ideas of how to loop the script will be appreciated. Thanks in advance!
Upvotes: 0
Views: 168
Reputation: 4456
As your filenames follow a regular pattern, you can loop from 1 to 4376 and replace "BRCA1.txt" in your code with paste0("BRCA", i, ".txt"). There are probably ways to loop without hard-coding the pattern, but in your case you don't seem to need them.
setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library("scales")
# library() stops right here if the package is missing; require() would only
# return FALSE and the loop would fail later at the first tidyverse call.
library(tidyverse)

# Create annotation or reference table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Recode the sex chromosomes as numbers (X -> 23, Y -> 24).
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
# NOTE(review): assumes genes.txt has exactly these four columns, in this
# order (gene id, chromosome, start, end) - confirm against the file.
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# Baseline gene list appended to every sample. It is the same for each input
# file, so it is read once here instead of once per loop iteration.
df_ann1 <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)

# Annotate one CNV segment file and return one row per gene.
#
# path:     tab-separated segment file with a header row.
# genes:    annotation data frame with columns GeneSymbol, Chr, Start, End.
# genes_gr: GRanges built from `genes`.
# baseline: data frame with columns Ensembl_ID and Seg_value, appended after
#           the sample rows.
# Returns a data frame with columns Ensembl_ID and Seg_value (rescaled to
# [-1, 1] for the sample rows), de-duplicated on Ensembl_ID.
annotate_cnv_file <- function(path, genes, genes_gr, baseline) {
  seg <- read.table(path, sep = "\t", stringsAsFactors = FALSE, header = TRUE)
  seg$Chromosome <- gsub("X", "23", seg$Chromosome)
  seg$Chromosome <- gsub("Y", "24", seg$Chromosome)
  # NOTE(review): assumes the segment file has exactly six columns and that
  # the last one ("extra2") holds the segment value - confirm.
  colnames(seg) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
  cnv <- makeGRangesFromDataFrame(seg, keep.extra.columns = TRUE)

  # Keep only genes that lie entirely inside a CNV segment.
  hits <- findOverlaps(genes_gr, cnv, type = "within")
  ann <- cbind(seg[subjectHits(hits), ], genes[queryHits(hits), ])
  ann <- unique(ann)
  ann <- ann[, c("GeneSymbol", "extra2")]
  colnames(ann) <- c("Ensembl_ID", "Seg_value")

  # NOTE(review): the original loop body also built a column
  #   Seg_value2 = sign(Seg_value) * 2 * (2^abs(Seg_value) - 1)
  # and then dropped it again before rescaling, so it never reached the
  # output. The dead computation is omitted; if the transformed value was the
  # one meant to be saved, rescale that expression instead of Seg_value.
  ann$Seg_value <- rescale(ann$Seg_value, to = c(-1, 1))

  # Sample rows come first, so for genes present in both tables the sample
  # value is the one kept by !duplicated().
  merged <- rbind.data.frame(ann, baseline)
  merged[!duplicated(merged$Ensembl_ID), ]
}

# Files to be analyzed: BRCA1.txt ... BRCA4376.txt ----
n_files <- 4376
for (i in seq_len(n_files)) {
  df <- annotate_cnv_file(paste0("BRCA", i, ".txt"), genes, genes_GR, df_ann1)

  # Save the results into the existing file, matched on the first column.
  # The file is re-read and re-written on every iteration, so results written
  # so far survive if a later input file fails.
  # NOTE(review): every sample contributes a column called "Seg_value", so
  # repeated joins yield suffixed names (Seg_value.x, Seg_value.y, ...) that
  # no longer identify the sample; consider naming the column after the file.
  df1 <- read.delim("genesforcomp1", check.names = FALSE, stringsAsFactors = FALSE)
  # A direct full_join() is what reduce() over a two-element list amounts to,
  # and it avoids depending on whether purrr or GenomicRanges owns `reduce`.
  df2 <- full_join(data.frame(df1), data.frame(df), by = "Ensembl_ID")
  # Genes missing from either side get 0 rather than NA.
  df2 <- replace(df2, is.na(df2), 0)
  write.table(df2, file = "genesforcomp1", quote = FALSE, sep = "\t", row.names = FALSE)
}
Upvotes: 1