Anon
Anon

Reputation: 1547

R: Error: sample_info must be a data frame with columns sample_name, file_bam, paired_end, read_length, frag_length, lib_size

I have a list of bam files in chr16_bam folder and also an annotation file sgseq_sam.txt.

library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(org.Hs.eg.db)
library(tidyverse)
library(AnnotationDbi)
library(dplyr)
library(stringr)


# Rename the "file_bam" column values to the full path where the BAMs are stored
setwd("C:/Users/User/Downloads/")
bamPath = "C:/Users/User/Downloads/chr16_bam"
samFile <- read.delim("C:/Users/User/Downloads/sgseq_sam.txt", header=T) 
samFile <- samFile %>% 
  mutate(file_bam = paste0(bamPath, file_bam))

# Save the TxDb.Hsapiens.UCSC.hg19.knownGene object into a variable.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- keepSeqlevels(txdb, "chr16")
seqlevelsStyle(txdb) <- "NCBI"

# Read in the gene list from Supplementary Table 1
gene.list <- read.table("Table_1_Differential Expression Analysis Revealing CLCA1 to Be a Prognostic and Diagnostic Biomarker for Colorectal Cancer.xls", header=T)

# Convert the gene symbols to Entrez IDs and remove the genes that don't map to an Entrez ID.
entrez = mapIds(org.Hs.eg.db, keys=gene.list$Name, column = "ENTREZID", keytype="SYMBOL")

gene.list <- gene.list %>% 
  mutate(entrez) %>% 
  filter(!is.na(entrez))

# Convert the TxDb.Hsapiens.UCSC.hg19.knownGene object
txf_ucsc <- convertToTxFeatures(txdb)

# Cast this object to a dataframe and save as another variable name
txf_df <- as.data.frame(txf_ucsc)

# Subset by the Entrez IDs
txf_df <- txf_df %>% filter(geneName %in% gene.list$entrez)

# Find the number of common transcripts
unique <- unique(txf_df$geneName)
gene.list <- gene.list[gene.list$entrez %in% unique,]
nrow(gene.list)

# Cast the seqnames column to factor
txf_df$seqnames <- as.factor(as.character(txf_df$seqnames))

# Remove the "chr" prefix from the seqnames column 
txf_df %>% 
  rename_all(~stringr::str_replace(.,"^chr",""))

# Recast this dataframe back to a GRanges object
txf_grange <- makeGRangesFromDataFrame(txf_df, keep.extra.columns=T)

Now, I want to create a loop for each of the genes, where upon iteration, subset Granges objects in txf_grange by only the gene, use the reduce function to collapse the ranges of the genes into a single vector, run analyzeFeatures, then annotate functions, and finally plotFeatures.

for (i in txf_grange$geneName) {
  if (i=="343") {
    next
  }
  else{
    # For each of the 15 genes, subset the Granges objects by only the gene
    grange.subset <- txf_grange[i == toString(i)]
    # Collapse the ranges of the genes into a single vector
    grange.subset <- unlist(IRanges::reduce((split(grange.subset, grange.subset$geneName))))
    # Run analyzeFeatures
    for (j in 1:dim(samFile)[[1]]) {
      si <- samFile
      for (k in si) {
        for (l in list.files(path="C:/Users/User/Downloads/chr16_bam", pattern=".bam$", all.files=F, full.names=F)) {
          sgfc_pred <- analyzeFeatures(k, which=grange.subset, features=txf_ucsc, predict=T)
          # Annotate predicted features
          sgfc_pred <- annotate(sgfc_pred, txf_ucsc)
          # Plot features
          pdf(paste("plot", l, ".pdf", sep=""))
          plotFeatures(sgfc_pred, geneID=1)
          dev.off()
        }
      }
    }    
  }
}

Data

> dput(head(txf_grange))
new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = "16", class = "factor"), 
    lengths = 6L, elementMetadata = NULL, metadata = list()), 
    ranges = new("IRanges", start = c(12058964L, 12059311L, 12059311L, 
    12060052L, 12060198L, 12060198L), width = c(348L, 742L, 2117L, 
    147L, 680L, 1230L), NAMES = NULL, elementType = "ANY", elementMetadata = NULL, 
        metadata = list()), strand = new("Rle", values = structure(1L, .Label = c("+", 
    "-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = "16", 
        seqlengths = NA_integer_, is_circular = NA, genome = NA_character_), 
    elementMetadata = new("DFrame", rownames = NULL, nrows = 6L, 
        listData = list(type = structure(c(3L, 1L, 1L, 2L, 1L, 
        1L), .Label = c("J", "I", "F", "L", "U"), class = "factor"), 
            txName = structure(list(c("uc002dbv.3", "uc010buy.3", 
            "uc010buz.3"), c("uc002dbv.3", "uc010buy.3"), "uc010buz.3", 
                c("uc002dbv.3", "uc010buy.3"), "uc010buy.3", 
                "uc002dbv.3"), class = "AsIs"), geneName = structure(list(
                "608", "608", "608", "608", "608", "608"), class = "AsIs")), 
        elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    elementType = "ANY", metadata = list())

Upvotes: 1

Views: 124

Answers (1)

joey
joey

Reputation: 11

change line:

analyzeFeatures(samFile, grange.subset)

Also you do not need that many loops to run for the question. The question asks for 14 plots and you might be plotting for much more with the number of loops you have.

Upvotes: 1

Related Questions