user2895292
user2895292

Reputation: 63

Create probability matrices with SNP allele data for ACGT

Given the following data for 8 samples (A1-A8):

A1 A2 A3 A4 A5 A6 A7 A8
T T T T T T T C 
T C T T T T T C
A A A G G A A A

Where each column is one sample, and each row is one marker with the possible codings A, C, G, T, I wish to calculate the probability for each row the origin of any of the 4 alleles. For example, the output of the above data for row 1 should be

   A C G T
A1 0 0 0 1/7
A2 0 0 0 1/7 
A3 0 0 0 1/7
A4 0 0 0 1/7
A5 0 0 0 1/7
A6 0 0 0 1/7
A7 0 0 0 1/7
A8 0 1 0 0

since 7 of the samples possess a T in row 1, each sample has a probability of 1/7. Since only A8 possess a C, there's an 100% probability of assigning a C to A8. For row 3 the output should be

   A C G T
A1 1/6 0 0 0
A2 1/6 0 0 0 
A3 1/6 0 0 0
A4 1/2 0 0 0
A5 1/2 0 0 0
A6 1/6 0 0 0
A7 1/6 0 0 0
A8 1/6 0 0 0

The total output should be a list of i 8x4 matrices, where i equals the number of rows.

A re-workable example with code is:

states <- c("A1","A2","A3","A4","A5","A6","A7","A8") # Define the names of the states
A1 <- c("T","T","A") # Set the alleles for state A1 across 3 SNPs
A2 <- c("T","C","A") # Set the alleles for state A2 across 3 SNPs
A3 <- c("T","T","A") # Set the alleles for state A3 across 3 SNPs
A4 <- c("T","T","G") # Set the alleles for state A4 across 3 SNPs
A5 <- c("T","T","G") # Set the alleles for state A5 across 3 SNPs
A6 <- c("T","T","A") # Set the alleles for state A6 across 3 SNPs
A7 <- c("T","T","A") # Set the alleles for state A7 across 3 SNPs
A8 <- c("C","C","A") # Set the alleles for state A8 across 3 SNPs
theemissionmatrix <- matrix(t(c(A1,A2,A3,A4,A5,A6,A7,A8)), 8, 3, byrow = TRUE) # Create an 8 x 3 matrix
rownames(theemissionmatrix) <- states
theemissionmatrix # Print out the data matrix
   [,1] [,2] [,3]
A1 "T"  "T"  "A" 
A2 "T"  "C"  "A" 
A3 "T"  "T"  "A" 
A4 "T"  "T"  "G" 
A5 "T"  "T"  "G" 
A6 "T"  "T"  "A" 
A7 "T"  "T"  "A" 
A8 "C"  "C"  "A" 

test <- cbind(theemissionmatrix[,1]=="A",theemissionmatrix[,1]=="C",theemissionmatrix[,1]=="G",theemissionmatrix[,1]=="T")
colnames(test) <- c("A","C","G","T")

test
    [,1]  [,2]  [,3]  [,4]
A1 FALSE FALSE FALSE  TRUE
A2 FALSE FALSE FALSE  TRUE
A3 FALSE FALSE FALSE  TRUE
A4 FALSE FALSE FALSE  TRUE
A5 FALSE FALSE FALSE  TRUE
A6 FALSE FALSE FALSE  TRUE
A7 FALSE FALSE FALSE  TRUE
A8 FALSE  TRUE FALSE FALSE

Past this step, I'm not sure how to go about summing the total counts for each column and dividing to get the total probability.

Upvotes: 3

Views: 237

Answers (3)

Matthew Plourde
Matthew Plourde

Reputation: 44614

Here's a base R way, relying on split, table, and sweep:

res <- lapply(split(as.matrix(df), 1:nrow(df)), factor, levels=unique(unlist(df)))
lapply(res, function(row) sweep(sapply(levels(row), '==', row), 1, table(row)[row], FUN='/'))

Upvotes: 2

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193517

Here are a few alternatives (starting with "df" from @zx8754's answer):

Option 1: melt + table + prop.table

Does not display nicely when there are NaN values.

library(reshape2)
dfL <- melt(as.matrix(df))
Levs <- c("A", "C", "G", "T") 
dfL$value <- factor(dfL$value, Levs) ## Just to be sure

prop.table(table(dfL[c(2, 3, 1)]), c(2, 3))
# , , Var1 = 1
# 
#     value
# Var2 A         C G         T
#   A1   0.0000000   0.1428571
#   A2   0.0000000   0.1428571
#   A3   0.0000000   0.1428571
#   A4   0.0000000   0.1428571
#   A5   0.0000000   0.1428571
#   A6   0.0000000   0.1428571
#   A7   0.0000000   0.1428571
#   A8   1.0000000   0.0000000
# 
# , , Var1 = 2
# 
#     value
# Var2 A         C G         T
#   A1   0.0000000   0.1666667
#   A2   0.5000000   0.0000000
# ..... OUTPUT TRUNCATED

Option 2: melt + by + table

Can be easily made to display 0 where NaN would otherwise be.

dfL <- melt(as.matrix(df))
Levs <- c("A", "C", "G", "T") 
dfL$value <- factor(dfL$value, Levs) ## Just to be sure

by(dfL[-1], dfL[1], FUN = function(x) {
  A <- prop.table(table(x), 2)
  A[is.nan(A)] <- 0
  A
})
# Var1: 1
#     value
# Var2         A         C         G         T
#   A1 0.0000000 0.0000000 0.0000000 0.1428571
#   A2 0.0000000 0.0000000 0.0000000 0.1428571
#   A3 0.0000000 0.0000000 0.0000000 0.1428571
#   A4 0.0000000 0.0000000 0.0000000 0.1428571
#   A5 0.0000000 0.0000000 0.0000000 0.1428571
#   A6 0.0000000 0.0000000 0.0000000 0.1428571
#   A7 0.0000000 0.0000000 0.0000000 0.1428571
#   A8 0.0000000 1.0000000 0.0000000 0.0000000
# ------------------------------------------------------------------------ 
# Var1: 2
#     value
# Var2         A         C         G         T
#   A1 0.0000000 0.0000000 0.0000000 0.1666667
#   A2 0.0000000 0.5000000 0.0000000 0.0000000
# ..... OUTPUT TRUNCATED

Option 3: lapply + table after some restructuring of the data

Sticking entirely in base R, here's another alternative....

Levs <- c("A", "C", "G", "T")
out <- data.frame(N = names(df), t(df), row.names=NULL)
Rows <- setdiff(names(out), "N")
out[Rows] <- lapply(out[Rows], function(x) factor(x, Levs))
Tables <- lapply(seq_along(Rows), function(x) {
  A <- prop.table(table(out[, 1], out[, Rows[x]]), 2)
  A[is.nan(A)] <- 0
  A
})

Upvotes: 2

zx8754
zx8754

Reputation: 56149

Try this:

#data
df <- read.table(text="
A1 A2 A3 A4 A5 A6 A7 A8
T T T T T T T C 
T C T T T T T C
A A A G G A A A", header=TRUE, as.is=T)

#ACGT
allele <- c("A","C","G","T")

#get counts: loop samples loop alleles
lapply(1:nrow(df),function(sample){
  sapply(c("A","C","G","T"),
         function(x){
           p <- as.numeric(df[sample,]==x) / sum(df[sample,]==x)
           #check if it is `not a number`
           ifelse(is.nan(p),0,p)
           })
  })

Upvotes: 1

Related Questions