Reputation: 5719
I have this data called mydf
.
I need to match the letters (DNA letters) in column REF
and ALT
with the colnames(x)
("A","T","G","C"
) and get the corresponding numerical values pasted together as "REF,ALT"
.
However, there are some lines where I have "snp:+[0-9]"
and "flat$"
in TYPE
column.
Now for "flat$"
lines I want to:
ALT
values from as many as "snp:+[0-9]"
of the corresponding "start"
ids, including the flat line itself, if the
ALT
letters are unique (please see the script enclosed in curly
brackets for one flat line)ALT
value again as "REF,ALT"
(REF
value will be
same for both "snp:+[0-9]"
and "flat$"
with same start ID)I have done this for one flat line,but I need help making the function for flatcase
so that it will do the same for all flat lines.
How can I make a function to do this for flatcase
?
Code
normalCase <- function(x, ns) {
ref.idx <- which(ns == "REF")
ref.allele <- x[ref.idx]
ref.count <- x[which(ns == ref.allele)]
alt.idx <- which(ns == "ALT")
alt.allele <- x[alt.idx]
alt.count <- x[which(ns == alt.allele)]
paste(ref.count, alt.count, sep=",")
}
flatcase??{
g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
myt<-x[g,]
x[g,"ALT"]
unique(x[g,"ALT"])
c<-unique(x[g,"ALT"])
flat<-myt[grepl("flat$",myt[,"TYPE"]),]
c<-unique(x[g,"ALT"])
alt.count<- sum(as.numeric(flat[c]))
}
calculateAD <- function(x, mat, ns) {
if (grepl("flat$", x[which(ns == 'TYPE')])) {
flatCase(x, mat, ns)
} else {
normalCase(x, ns)
}
}
bamAD <- function(x) {
new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
colnames(new.x)[ncol(new.x)] <- "bam.AD"
new.x
}
The function I have tried for flatCase is:
flatCase <- function(x, mat, ns) {
id.idx <- which(ns == 'start')
type.idx <- which(ns == 'TYPE')
ref.idx <- which(ns == 'REF')
alt.idx <- which(ns == 'ALT')
id <- x[id.idx]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
#flat<-mat[grepl("flat$",mat[, type.idx]),]
ref.allele <- x[ref.idx]
ref.count<-x[which(ns == ref.allele)]
alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
paste(ref.count, alt.count, sep=",")
}
mydf
x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))
Result:
start A T G C REF ALT TYPE bam.AD
[1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp" "59,29"
[2,] "chr5:8529759" "24" " 1" "28" "41" "G" "C" "snp" "28,41"
[3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp" "49,41"
[4,] "chr18:547375" "94" " 1" "51" "67" "G" "C" "snp" "51,67"
[5,] "chr8:5952145" "27" "80" "25" "96" "T" "T" "snp" "80,80"
[6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp" "68,68"
[7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921" "49,15"
[8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921" "49,79"
[9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55"
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344" "13,42"
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp" "18,73"
[14,] "chr4:7799768" "36" "28" " 1" "16" "C" "A" "snp" "16,36"
[15,] "chr3:9141263" "27" "41" "93" "90" "A" "A" "snp" "27,27"
Upvotes: 0
Views: 355
Reputation: 4686
Another approach:
# create dataframe
mydf <- as.data.frame(x, stringsAsFactors=FALSE)
# create temporary values based on REF and ALT
mydf$REFval <- diag(as.matrix(mydf[, mydf$REF]))
mydf$ALTval <- diag(as.matrix(mydf[, mydf$ALT]))
Here in the next step, you said to sum ALT "if the ALT letters are unique" but didn't specify which value to use if the ALT is the same but the value is different. It didn't matter in your sample data set since the values are the same, so in my code below I assumed the last ALT value to be used.
# sum up ALT values for all start ID
require(dplyr)
mydfs <- mydf %>% group_by(start, ALT) %>%
summarize(ALTkeep=last(ALTval)) %>% # assume keep last one if same ALT
group_by(start) %>%
summarize(ALTflat=sum(as.numeric(ALTkeep)))
# merge back into main dataframe
mydf <- left_join(mydf, mydfs)
# select ALT value for bam.AD depending on "flat$" in TYPE
mydf$bam.AD <- with(mydf,
paste(REFval, ifelse(grepl("flat$", TYPE), ALTflat, ALTval), sep=","))
# optional clean up of temporary values
mydf <- mydf[, !(names(mydf) %in% c("REFval", "ALTval", "ALTflat"))]
The output as you wanted
start A T G C REF ALT TYPE bam.AD
1 chr20:5363934 95 29 14 59 C T snp 59,29
2 chr5:8529759 24 1 28 41 G C snp 28,41
3 chr14:9620689 65 49 41 96 T G snp 49,41
4 chr18:547375 94 1 51 67 G C snp 51,67
5 chr8:5952145 27 80 25 96 T T snp 80,80
6 chr14:8694382 68 94 26 30 A A snp 68,68
7 chr16:2530921 49 15 79 72 A T snp:2530921 49,15
8 chr16:2530921 49 15 79 72 A G snp:2530921 49,79
9 chr16:2530921 49 15 79 72 A T snp:2530921flat 49,94
10 chr16:2533924 42 13 19 52 G T snp:2533924flat 19,13
11 chr16:2543344 4 13 13 42 G T snp:2543344flat 13,55
12 chr16:2543344 42 23 13 42 G A snp:2543344 13,42
13 chr14:4214117 73 49 18 77 G A snp 18,73
14 chr4:7799768 36 28 1 16 C A snp 16,36
15 chr3:9141263 27 41 93 90 A A snp 27,27
Upvotes: 2
Reputation: 56915
Here's a way to do it all, vectorised.
First, note that REF is the same regardless of type. We can look it up quickly by using REF as a coordinate into the matrix, so e.g. row 1 has REF C, so if we look up coordinates (1, "C") we get the REF value for that row.
# the REFs are the same regardless of TYPE
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
Have a look at cbind(1:nrow(x), x[, 'REF'])
: this is just a list of coordinates (row number, REF)
and we use it to look up the REF number.
Then we do the same for ALT:
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
However we have to make sure that if the type is 'flat', we add all the other ALTs to the 'flat' row's ALT (only the unique ones, as you say).
First, work out which rows are flat:
which.flat <- grep('flat$', x[, 'TYPE'])
Next, for each flat row, look up ALT of the other rows with the same 'start' (that's the x[, 'start'] == x[i, 'start']
bit), and exclude rows that have duplicate ALT (that's the x[, 'ALT'] != x[i, 'ALT']
bit). Here i
is the index of the current flat line. Add them all to the flat line's ALT. The sapply
just vectorises this all for each flat line.
# add the other alts to the alt of the 'flat' line.
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
now we just paste together:
x <- cbind(x, bam.AD=paste(ref, alt, sep=','))
The result is the same as yours except for row 10 where I believe you have made a mistake - there's only one row with "chr16:2533924" and its ALT is "T" (value 13), so bam.AD
is "19,13" (you have "19,42" as if the ALT was "A", but it isn't).
If you must stick with the function form in your question (quite slow & inefficient!), it's essentially the same as what I have done (hence why you can do it without the apply
call and skip the loop entirely):
flatCase <- function(x, mat, ns) { # get alt of the flat row alt <- as.numeric(x[x['ALT']])
# get the other rows with the same 'start' and different 'ALT'
xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
if (nrow(xx) > 0) {
# grab all the alts as done before
rownames(xx) <- 1:nrow(xx)
alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
}
ref <- x[x['REF']]
return(paste(ref, alt, sep=','))
}
However as mentioned before, if you vectorise it the entire code you have above reduces to just a few lines, and is faster:
newBamAD <- function (x) {
# the version above
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
which.flat <- grep('flat$', x[, 'TYPE'])
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
cbind(x, bam.AD=paste(ref, alt, sep=','))
}
library(rbenchmark)
benchmark(
bamAD=bamAD(x),
newBamAD=newBamAD(x)
)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 bamAD 100 0.082 3.905 0.072 0.004 0 0
# 2 newBamAD 100 0.021 1.000 0.020 0.000 0 0
the vectorised version is almost 4x faster.
Upvotes: 2