R: Error: sample_info must be a data frame with columns sample_name, file_bam, paired_end, read_length, frag_length, lib_size

Question

I have a list of bam files in chr16_bam folder and also an annotation file sgseq_sam.txt.

library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(org.Hs.eg.db)
library(tidyverse)
library(AnnotationDbi)
library(dplyr)
library(stringr)


# Rename the "file_bam" column values to the full path where the BAMs are stored
setwd("C:/Users/User/Downloads/")
bamPath = "C:/Users/User/Downloads/chr16_bam"
samFile <- read.delim("C:/Users/User/Downloads/sgseq_sam.txt", header=T) 
samFile <- samFile %>% 
  mutate(file_bam = paste0(bamPath, file_bam))

# Save the TxDb.Hsapiens.UCSC.hg19.knownGene object into a variable.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- keepSeqlevels(txdb, "chr16")
seqlevelsStyle(txdb) <- "NCBI"

# Read in the gene list from Supplementary Table 1
gene.list <- read.table("Table_1_Differential Expression Analysis Revealing CLCA1 to Be a Prognostic and Diagnostic Biomarker for Colorectal Cancer.xls", header=T)

# Convert the gene symbols to Entrez IDs and remove the genes that don't map to an Entrez ID.
entrez = mapIds(org.Hs.eg.db, keys=gene.list$Name, column = "ENTREZID", keytype="SYMBOL")

gene.list <- gene.list %>% 
  mutate(entrez) %>% 
  filter(!is.na(entrez))

# Convert the TxDb.Hsapiens.UCSC.hg19.knownGene object
txf_ucsc <- convertToTxFeatures(txdb)

# Cast this object to a dataframe and save as another variable name
txf_df <- as.data.frame(txf_ucsc)

# Subset by the Entrez IDs
txf_df <- txf_df %>% filter(geneName %in% gene.list$entrez)

# Find the number of common transcripts
unique <- unique(txf_df$geneName)
gene.list <- gene.list[gene.list$entrez %in% unique,]
nrow(gene.list)

# Cast the seqnames column to factor
txf_df$seqnames <- as.factor(as.character(txf_df$seqnames))

# Remove the "chr" prefix from the seqnames column 
txf_df %>% 
  rename_all(~stringr::str_replace(.,"^chr",""))

# Recast this dataframe back to a GRanges object
txf_grange <- makeGRangesFromDataFrame(txf_df, keep.extra.columns=T)

Now, I want to create a loop for each of the genes, where upon iteration, subset Granges objects in txf_grange by only the gene, use the reduce function to collapse the ranges of the genes into a single vector, run analyzeFeatures, then annotate functions, and finally plotFeatures.

for (i in txf_grange$geneName) {
  if (i=="343") {
    next
  }
  else{
    # For each of the 15 genes, subset the Granges objects by only the gene
    grange.subset <- txf_grange[i == toString(i)]
    # Collapse the ranges of the genes into a single vector
    grange.subset <- unlist(IRanges::reduce((split(grange.subset, grange.subset$geneName))))
    # Run analyzeFeatures
    for (j in 1:dim(samFile)[[1]]) {
      si <- samFile
      for (k in si) {
        for (l in list.files(path="C:/Users/User/Downloads/chr16_bam", pattern=".bam$", all.files=F, full.names=F)) {
          sgfc_pred <- analyzeFeatures(k, which=grange.subset, features=txf_ucsc, predict=T)
          # Annotate predicted features
          sgfc_pred <- annotate(sgfc_pred, txf_ucsc)
          # Plot features
          pdf(paste("plot", l, ".pdf", sep=""))
          plotFeatures(sgfc_pred, geneID=1)
          dev.off()
        }
      }
    }    
  }
}

Data

> dput(head(txf_grange))
new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = "16", class = "factor"), 
    lengths = 6L, elementMetadata = NULL, metadata = list()), 
    ranges = new("IRanges", start = c(12058964L, 12059311L, 12059311L, 
    12060052L, 12060198L, 12060198L), width = c(348L, 742L, 2117L, 
    147L, 680L, 1230L), NAMES = NULL, elementType = "ANY", elementMetadata = NULL, 
        metadata = list()), strand = new("Rle", values = structure(1L, .Label = c("+", 
    "-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = "16", 
        seqlengths = NA_integer_, is_circular = NA, genome = NA_character_), 
    elementMetadata = new("DFrame", rownames = NULL, nrows = 6L, 
        listData = list(type = structure(c(3L, 1L, 1L, 2L, 1L, 
        1L), .Label = c("J", "I", "F", "L", "U"), class = "factor"), 
            txName = structure(list(c("uc002dbv.3", "uc010buy.3", 
            "uc010buz.3"), c("uc002dbv.3", "uc010buy.3"), "uc010buz.3", 
                c("uc002dbv.3", "uc010buy.3"), "uc010buy.3", 
                "uc002dbv.3"), class = "AsIs"), geneName = structure(list(
                "608", "608", "608", "608", "608", "608"), class = "AsIs")), 
        elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    elementType = "ANY", metadata = list())

R: Error: sample_info must be a data frame with columns sample_name, file_bam, paired_end, read_length, frag_length, lib_size

Answers (1)

Related Questions