Reputation: 45
I have generated a table to show the overlap between different gene lists. Since I have eight different gene lists I have 64 results. The code I currently have is as follows:
#-------------------------------------------------------------------------------
# Set the working directory and load the data files
#-------------------------------------------------------------------------------
setwd("~/Desktop/R_Project/Gene_overlap")
getwd()
files <- list.files(pattern="*.txt", full.names = TRUE)
files
data.list <- lapply(files, function(fil) {
scan(file=fil, what=character())
})
names(data.list) <- basename(files) %>% stringr::str_remove("\\.txt$")
str(data.list)
# List of 8
# $ GSE108363_BCGdown_D: chr [1:350] "IL1B" "IL6" "IL1A" "CCL20" ...
# $ GSE108363_BCGdown_V: chr [1:267] "IL6" "CCL20" "IL1A" "CXCL5" ...
# $ GSE108363_BCGup_D : chr [1:250] "FABP4" "CMTM2" "FUCA1" "CD36" ...
# $ GSE108363_BCGup_V : chr [1:429] "FCN1" "FCGR3B" "MNDA" "CPVL" ...
# $ GSE108363_MTBdown_D: chr [1:86] "CCL20" "IL1B" "IL1A" "IL6" ...
# $ GSE108363_MTBdown_V: chr [1:244] "IL1B" "IL1A" "CCL20" "IL6" ...
# $ GSE108363_MTBup_D : chr [1:128] "FUCA1" "FGL2" "TGFBI" "CPVL" ...
# $ GSE108363_MTBup_V : chr [1:286] "FABP4" "RNASE1" "MNDA" "CPVL" ...
intersect(data.list$GSE108363_BCGdown_D, data.list$GSE108363_BCGdown_V) %>% length
sapply(data.list, length)
#-------------------------------------------------------------------------------
# Using the intersect function to see the overlaps
#-------------------------------------------------------------------------------
data.file1 <- "GSE108363_BCGdown_V.txt"
data.file2 <- "GSE108363_BCGdown_D.txt"
data.file3 <- "GSE108363_BCGup_V.txt"
data.file4 <- "GSE108363_BCGup_D.txt"
data.file5 <- "GSE108363_MTBdown_V.txt"
data.file6 <- "GSE108363_MTBdown_D.txt"
data.file7 <- "GSE108363_MTBup_V.txt"
data.file8 <- "GSE108363_MTBup_D.txt"
genevect1 <- scan(data.file1, what=character(), sep="\n")
genevect2 <- scan(data.file2, what=character(), sep="\n")
genevect3 <- scan(data.file3, what=character(), sep="\n")
genevect4 <- scan(data.file4, what=character(), sep="\n")
genevect5 <- scan(data.file5, what=character(), sep="\n")
genevect6 <- scan(data.file6, what=character(), sep="\n")
genevect7 <- scan(data.file7, what=character(), sep="\n")
genevect8 <- scan(data.file8, what=character(), sep="\n")
filelist <- list(data.file1, data.file2, data.file3, data.file4, data.file5, data.file6, data.file7, data.file8)
all(sapply(filelist, file.exists))
# read files:
gene.lists <- lapply(filelist, function(f) {
scan(file=f, what=character())
})
# set up empty matrix
x <- (length(gene.lists))^2
x
y <- rep(NA, x)
mx <- matrix(y, ncol=length(gene.lists))
mx
row.names(mx) <- sapply(filelist, basename) %>% stringr::str_remove('.txt$')
colnames(mx) <- sapply(filelist, basename) %>% stringr::str_remove('.txt$')
mx
mx.overlap.count <- mx
# seq_along(gene.lists) # 1 2 3 4 5 6 7 8
for (i in seq_along(gene.lists)) {
g1 <- gene.lists[[i]]
for (j in seq_along(gene.lists)) {
g2 <- gene.lists[[j]]
a <- intersect(g1, g2)
b <- length(a)
mx.overlap.count[j,i] <- b
}
}
mx.overlap.count
View(mx.overlap.count)
I would now like to do something similar but instead of viewing the overlap as numbers, I would like to see the extent to which overlaps exist between different gene lists as a percentage. Somehow i will need to see whether g1 or g2 is greater in order to divide b by the larger one before multiplying by 100. Any suggestions would be greatly appreciated.
Upvotes: 1
Views: 1032
Reputation: 1435
using a letter sample as you did not provide the genes list:
set.seed(1)
data.list <- lapply(sample(10:20), function(n)LETTERS[sample(1:26, n)])
overlaps <- sapply(data.list, function(g1)
sapply(data.list, function(g2)
{round(length(intersect(g1, g2)) / length(g2) * 100)}))
overlaps
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
[1,] 100 50 67 75 42 92 58 92 67 33 92
[2,] 46 100 62 69 54 77 62 69 69 54 62
[3,] 53 53 100 60 33 73 60 73 80 33 60
[4,] 53 53 53 100 47 71 53 76 53 29 82
[5,] 45 64 45 73 100 91 64 82 36 45 73
[6,] 61 56 61 67 56 100 56 89 56 33 72
[7,] 50 57 64 64 50 71 100 86 71 50 64
[8,] 55 45 55 65 45 80 60 100 60 40 80
[9,] 50 56 75 56 25 62 62 75 100 38 69
[10,] 40 70 50 50 50 60 70 80 60 100 70
[11,] 58 42 47 74 42 68 47 84 58 37 100
(I used set.seed
so you can reproduce the example). It uses a nested sapply to iterate over both gene lists individually and then calculates the percentage for each combination of gene vectors, by dividing the length of the intersect by the total length of the second gene vector. If you want to divide by the length of the longest of the 2 gene vectors, replace length(g2)
by max(length(g1), length(g2))
Upvotes: 2