NK130
NK130

Reputation: 11

Converting ENSEMBL IDs to Gene ID in a Data Frame

I have a large data-table of RNA-seq data that is listed by ensembl_gene_id, but I would like to convert to hgnc_symbol, for ease of visualization on heat maps.

So far I have the following code - but not sure how to proceed. Would it be better to convert the names from the beginning, or only on the subsetted data?

I am also more familiar with Python, where I would normally use a dictionary to map ensembl_gene_id to hgnc_symbol, but I am not sure how to go about this in R. My gut says for loops wouldn't be scalable.

Any suggestions would be appreciated.

library(biomaRt)
library(RColorBrewer)
#Load ggplot2 for graphing
#library(ggplot2)

# Load the gene expression file: mean TPM per gene (rows) across cell types
# (columns). The first column, `gene`, holds the ENSEMBL gene IDs.
GE_file <- read.csv(file = "mean_tpm_merged.csv")

# Get the header names of this file (not used again in this snippet)
headers <- names(GE_file)

# define biomart object
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Define genes of interest (as HGNC symbols)
GOI <- c("TFEB", "RAC1", "TFE3", "RAB5A")

# Query biomart: get the symbol <-> ENSEMBL ID mapping for the genes of
# interest, to be used as a lookup table ("dictionary")
IDs <- getBM(attributes = c("ensembl_gene_id","hgnc_symbol"),
                 filters = "hgnc_symbol", values = GOI,
                 mart = mart)

# Make the row names the HGNC symbols (column 2 of IDs), so the table can be
# indexed by symbol. NOTE: row names must be unique, so this fails if a symbol
# maps to more than one ENSEMBL ID.
row.names(IDs) <- IDs[,2]

# Keep only the rows of the large dataset whose ENSEMBL ID belongs to a gene
# of interest
Data_subset <- subset(GE_file, gene %in% IDs$ensembl_gene_id)

# Make the row names the ENSEMBL IDs (column 1, `gene`)
row.names(Data_subset) <- Data_subset[,1]

# Drop the first column (the gene IDs), as it is not needed for the numerical
# matrix; columns 2:16 are the 15 cell-type expression columns
Data_subset_matrix <- as.matrix(Data_subset[,2:16])

# TODO: colors should be green/red if possible, or whatever is color blind
# compatible (NOTE: a red-to-green ramp, as used below, is generally not).
# TODO: coloring should go row-wise.
# TODO: excise columns for B cells/NK cells/CD8 T cells.
my_palette <- colorRampPalette(c("red","green"))(n = 299)
# Colv = NA / Rowv = NA turn off clustering and dendrograms; scale = 'row'
# standardizes each gene across cell types before coloring. Row labels are
# the matrix row names, i.e. the ENSEMBL IDs set above.
heatmap(Data_subset_matrix, Colv = NA, Rowv = NA, scale = 'row', col = my_palette)

Some Relevant outputs:

> dput(head(GE_file))
structure(list(gene = c("ENSG00000223116", "ENSG00000233440", 
"ENSG00000207157", "ENSG00000229483", "ENSG00000252952", "ENSG00000235205"
), T.cell..CD4..naive..activated. = c(0, 0.0034414596504, 0, 
0, 0, 0), NK.cell..CD56dim.CD16. = c(0, 0, 0, 0, 0, 0.0139463278778
), T.cell..CD4..TFH = c(0, 0, 0, 0, 0, 0), T.cell..CD4..memory.TREG = c(0, 
0, 0, 0, 0, 0.000568207845073), T.cell..CD4..TH1.17 = c(0, 0.0196376949773, 
0, 0, 0, 0), B.cell..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH2 = c(0, 
0, 0, 0, 0, 0), T.cell..CD4..TH1 = c(0, 0, 0, 0, 0, 0.000571213481481
), T.cell..CD4..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH17 = c(0, 
0.00434618468012, 0, 0, 0, 0), Monocyte..classical = c(0, 0, 
0, 0, 0, 0), Monocyte..non.classical = c(0, 0, 0, 0, 0, 0), T.cell..CD4..naive.TREG = c(0, 
0, 0, 0, 0, 0.000821516453853), T.cell..CD8..naive = c(0, 0, 
0, 0, 0, 0.000508869486411), T.cell..CD8..naive..activated. = c(0, 
0.00348680689669, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")

Upvotes: 1

Views: 6388

Answers (1)

StupidWolf
StupidWolf

Reputation: 46898

Get the mapping for every gene in your table in one go:

# Connect to the Ensembl human gene dataset
mart <- useMart(dataset = "hsapiens_gene_ensembl", biomart = "ensembl")

# A single query covering every Ensembl ID in the expression table
# (its first column, `gene`); returns one row per ID/symbol pair
IDs <- getBM(
  mart = mart,
  filters = "ensembl_gene_id",
  values = GE_file$gene,
  attributes = c("ensembl_gene_id", "hgnc_symbol")
)

head(IDs)
  ensembl_gene_id hgnc_symbol
1 ENSG00000207157      RNY3P4
2 ENSG00000229483   LINC00362
3 ENSG00000233440     HMGA1P6
4 ENSG00000235205    TATDN2P3
5 ENSG00000252952    RNU6-58P

# Genes of interest, given as HGNC symbols (two of the symbols returned in
# the example mapping shown above)
GOI <- c("RNY3P4", "TATDN2P3")

Simple way: look up the Ensembl IDs of your genes of interest in the master mapping table, then subset your dataset by those IDs:

# Ensembl IDs of the genes of interest, looked up in the master mapping table
GOI_ens <- IDs$ensembl_gene_id[IDs$hgnc_symbol %in% GOI]

# Keep only the expression rows that belong to those genes
Data_subset <- subset(GE_file, gene %in% GOI_ens)

# Label every row with its gene symbol so that plots (e.g. heatmap()) show
# symbols rather than leftover row numbers. match() returns the first mapping
# row for each Ensembl ID; make.unique() keeps the row names valid if one
# symbol maps to several Ensembl IDs.
row.names(Data_subset) <- make.unique(
  IDs$hgnc_symbol[match(as.character(Data_subset$gene), IDs$ensembl_gene_id)]
)

# Drop the ID column, leaving only the numeric expression values
Data_subset <- Data_subset[, -1]

Dictionary way: this is also possible in R, but you need to make sure the mapping contains no duplicated entries:

# A dictionary needs unique keys: keep only the first mapping row for each
# Ensembl ID so every ID resolves to exactly one symbol
dedup <- !duplicated(IDs$ensembl_gene_id)

# Named character vector used as the dictionary:
# names are Ensembl IDs, values are HGNC symbols
dict <- setNames(IDs$hgnc_symbol[dedup], IDs$ensembl_gene_id[dedup])

# Look up the symbol of every row and keep the genes of interest.
# as.character() matters when `gene` is a factor, which would otherwise index
# `dict` by position instead of by name; IDs missing from `dict` give NA,
# which %in% treats as "not a gene of interest".
subset(GE_file, dict[as.character(gene)] %in% GOI)

Upvotes: 0

Related Questions