Reputation: 4274
I have a dataframe which looks like this:
>head(df)
chrom pos strand ref alt A_pos A_neg C_pos C_neg G_pos G_neg T_pos T_neg
chr1 2283161 - G A 3 1 2 0 0 0 0 0
chr1 2283161 - G A 3 1 2 0 0 0 0 0
chr1 2283313 - G C 0 0 0 0 0 0 0 0
chr1 2283313 - G C 0 0 0 0 0 0 0 0
chr1 2283896 - G A 0 0 0 0 0 0 0 0
chr1 2283896 + G A 0 0 0 0 0 0 0 0
I want to extract the value from columns 6:13 (A_pos...T_neg) based on the value of the columns 'strand', 'ref' and 'alt'. For instance, in row1: strand = '-', ref = 'G' and alt = 'A', so I should extract the values from G_neg and A_neg. Again, in row6: stand = '+', ref = 'G' and alt = 'A', so I should get the values from G_pos and A_pos. I basically intend to do a chi-square test after extracting these values (These are my observed values, I have another set of expected values) but that is another story.
So the logic is somewhat like:
if(df$strand=="+")
do
print:paste(df$ref,"pos",sep="_") #extract value in column df$ref_pos
print:paste(df$alt,"pos",sep="_") #extract value in column df$alt_pos
else if(gt.merge$gene_strand=="-")
do
print:paste(df$ref,"neg",sep="_") #extract value in column df$ref_neg
print:paste(df$alt,"neg",sep="_") #extract value in column df$alt_neg
Here, I am trying to use paste on the values in 'ref' and 'alt' to get the desired column names. For instance, if strand ='+' and ref = 'G', it will fetch value from column G_pos.
The data frame is actually large and so I ruled out using for-loops. I am not sure how else can I do this to make the code as efficient as possible. Any help/suggestions would be appreciated.
Thanks!
Upvotes: 0
Views: 1541
Reputation: 3656
using the [very] fast data.table library:
library(data.table)
df = fread('df.txt') # fastread
df[,ref := ifelse(strand == "-",
paste(ref,"neg",sep = "_"),
paste(ref,"pos",sep = "_"))]
df[,alt := ifelse(strand == "-",
paste(alt,"neg",sep = "_"),
paste(alt,"pos",sep = "_"))]
df[,strand := NULL] # not required anymore
dfm = melt(df,
id.vars = c("chrom","pos","ref","alt"),
variable.name = "mycol", value.name = "value")
dfm[mycol == ref | mycol == alt,] # matching
Upvotes: 0
Reputation: 92300
Another way
testFunc <- function(x){
posneg <- if(x["strand"] == "-") {"neg"} else {"pos"}
cbind(as.numeric(x[paste0(x["ref"],"_",posneg)]), as.numeric(x[paste0(x["alt"],"_",posneg)]))
}
temp <- t(apply(df, 1, testFunc))
colnames(temp) <- c("ref", "alt")
Upvotes: 1
Reputation: 13122
Another alternative that looks valid, at least with the sample data:
tmp = ifelse(as.character(DF$strand) == "-", "neg", "pos")
sapply(DF[c("ref", "alt")],
function(x) as.integer(DF[cbind(seq_len(nrow(DF)),
match(paste(x, tmp, sep = "_"), names(DF)))]))
# ref alt
#[1,] 0 1
#[2,] 0 1
#[3,] 0 0
#[4,] 0 0
#[5,] 0 0
#[6,] 0 0
Where DF
:
DF = structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"),
pos = c(2283161L, 2283161L, 2283313L, 2283313L, 2283896L,
2283896L), strand = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("-",
"+"), class = "factor"), ref = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "G", class = "factor"), alt = structure(c(1L,
1L, 2L, 2L, 1L, 1L), .Label = c("A", "C"), class = "factor"),
A_pos = c(3L, 3L, 0L, 0L, 0L, 0L), A_neg = c(1L, 1L, 0L,
0L, 0L, 0L), C_pos = c(2L, 2L, 0L, 0L, 0L, 0L), C_neg = c(0L,
0L, 0L, 0L, 0L, 0L), G_pos = c(0L, 0L, 0L, 0L, 0L, 0L), G_neg = c(0L,
0L, 0L, 0L, 0L, 0L), T_pos = c(0L, 0L, 0L, 0L, 0L, 0L), T_neg = c(0L,
0L, 0L, 0L, 0L, 0L)), .Names = c("chrom", "pos", "strand",
"ref", "alt", "A_pos", "A_neg", "C_pos", "C_neg", "G_pos", "G_neg",
"T_pos", "T_neg"), class = "data.frame", row.names = c(NA, -6L
))
Upvotes: 1
Reputation: 52687
Not very elegant, but does the job:
strand.map <- c("-"="_neg", "+"="_pos")
cbind(
df[1:5],
do.call(
rbind,
lapply(
split(df[-(1:2)], 1:nrow(df)),
function(x)
c(
ref=x[-(1:2)][, paste0(x[[2]], strand.map[x[[1]]])],
alt=x[-(1:2)][, paste0(x[[3]], strand.map[x[[1]]])]
) ) ) )
We cycle through each row in your data frame and apply a function that pulls the value based on strand
, ref
, and alt
. This produces:
chrom pos strand ref alt ref alt
1 chr1 2283161 - G A 0 1
2 chr1 2283161 - G A 0 1
3 chr1 2283313 - G C 0 0
4 chr1 2283313 - G C 0 0
5 chr1 2283896 - G A 0 0
6 chr1 2283896 + G A 0 0
An alternate approach is to use melt
, but the format of your data makes it rather annoying because we need two melts in a row, and we need to create a unique id column so we can reconstitute the data frame once we're done computing.
df$id <- 1:nrow(df)
df.mlt <-
melt(
melt(df, id.vars=c("id", "chrom", "pos", "strand", "ref", "alt")),
measure.vars=c("ref", "alt"), value.name="base",
variable.name="alt_or_ref"
)
dcast(
subset(df.mlt, paste0(base, strand.map[strand]) == variable),
id + chrom + pos + strand ~ alt_or_ref,
value.var="value"
)
Which produces:
id chrom pos strand ref alt
1 1 chr1 2283161 - 0 1
2 2 chr1 2283161 - 0 1
3 3 chr1 2283313 - 0 0
4 4 chr1 2283313 - 0 0
5 5 chr1 2283896 - 0 0
6 6 chr1 2283896 + 0 0
Upvotes: 1