Reputation: 5719
I have a matrix called mymat
. I have a vector called geno <- c("01","N1","11","1N","10")
. I have another table called key.table
. What I want to do is I want to match the key
column in key.table
with the key
column in mymat
and If the column values in any of the matching rows have the any of the geno
elements, I want to extract that column name from mymat
along with the matching geno
element and paste it in the new column in matched.extract
in key.table
in the corresponding rows for each key
and get the result.
mymat <- structure(c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
"chr5:12118", "0N", "0N", "1N", "0N", "0N", "00", "00", "00",
"11", "10", "00", "00", "1N", "0N", "00"), .Dim = c(5L, 4L), .Dimnames = list(
c("34", "35", "36", "37", "38"), c("key", "AMLM12001KP",
"AMAS-11.3-Diagnostic", "AMLM12014N-R")))
key.table<- structure(c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
"chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125",
"chr5:12127", "chr5:12129", "9920068", "9920069", "9920070",
"9920071", "9920072", "9920073", "9920074", "9920075", "9920076",
"9920077", "9920078"), .Dim = c(11L, 2L), .Dimnames = list(c("34",
"35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
c("key", "variantId")))
result
key variantId matched.extract
34 "chr5:12111" "9920068" NA
35 "chr5:12111" "9920069" NA
36 "chr5:12113" "9920070" AMLM12001KP (1N),AMLM12014N-R (1N)
37 "chr5:12114" "9920071" AMAS-11.3-Diagnostic (11)
38 "chr5:12118" "9920072" AMAS-11.3-Diagnostic (10)
39 "chr5:12122" "9920073" NA
40 "chr5:12123" "9920074" NA
41 "chr5:12123" "9920075" NA
42 "chr5:12125" "9920076" NA
43 "chr5:12127" "9920077" NA
44 "chr5:12129" "9920078" NA
Upvotes: 6
Views: 739
Reputation: 83215
Using data.table, I would approach it like this:
library(data.table)
# convert the 'key.table' matrix to a data.table
kt <- as.data.table(key.table, keep.rownames=TRUE)
# convert the 'mymat' matrix to a data.table and melt into long format
# filter on the needed geno-types
# paste the needed values together into the requested format
mm <- melt(as.data.table(mymat, keep.rownames=TRUE),
id=c("rn","key"))[value %in% c("1N","11","10"), val := paste0(variable," (",value,")")
][, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key)
][val=="", val:=NA]
# join the 'mm' and 'kt' data.tables
kt[mm, matched := val, on=c("rn","key")]
which gives:
> kt rn key variantId matched 1: 34 chr5:12111 9920068 NA 2: 35 chr5:12111 9920069 NA 3: 36 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N) 4: 37 chr5:12114 9920071 AMAS-11.3-Diagnostic (11) 5: 38 chr5:12118 9920072 AMAS-11.3-Diagnostic (10) 6: 39 chr5:12122 9920073 NA 7: 40 chr5:12123 9920074 NA 8: 41 chr5:12123 9920075 NA 9: 42 chr5:12125 9920076 NA 10: 43 chr5:12127 9920077 NA 11: 44 chr5:12129 9920078 NA
Explanation:
kt <- as.data.table(key.table, keep.rownames=TRUE)
will convert the matrix key.table
to a data.table
(which is an enhanced data.frame
) and stores the rownames in the rn
column.mm <- melt(as.data.table(mymat, keep.rownames=TRUE), id=c("rn","key"))
will convert the matrix mymat
to a data.table
, stores the rownames in the rn
column and melts the data.table into long format.[value %in% c("1N","11","10"), val := paste0(variable," (",value,")")]
will paste the variable
-values (which were the columnnams in mymat
) with the value
-values for only in the cases where value
is 1N
, 11
or 10
.[, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key)]
will paste the non-NA
rows of val
together by the rn
& key
variables.[val=="", val:=NA]
will transform the empty rows for val
into NA
-valueskt[mm, matched := val, on=c("rn","key")]
updates the kt
-data.table by reference with the val
-values of the mm
-data.table for the matching rn
& key
variables.WARNING: When using data.table, it is better not to use key
as a variable name as key
is also a parameter in a data.table
. See ?key
for more info.
Upvotes: 7
Reputation: 17648
I'm not that familiar with the dplyr functions. You can try the base R merge function:
mm <- merge(key.table,mymat,by="key",all.x=T)
mm
function to paste the column names with the tissue type:
get.geno <- function(x,y) ifelse(!x %in% c("00","0N") & !is.na(x), paste0(y," (",x,")"), NA)
a <- t(apply(mm[,3:5], 1, get.geno, colnames(mm)[3:5]))
final dataframe:
mm$result <- apply(a, 1, function(x) paste(x[!is.na(x)] ,collapse=","))
mm[, -3:-5]
key variantId result
1 chr5:12111 9920068
2 chr5:12111 9920068
3 chr5:12111 9920069
4 chr5:12111 9920069
5 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
6 chr5:12114 9920071 AMAS-11.3-Diagnostic (11)
7 chr5:12118 9920072 AMAS-11.3-Diagnostic (10)
8 chr5:12122 9920073
9 chr5:12123 9920074
10 chr5:12123 9920075
11 chr5:12125 9920076
12 chr5:12127 9920077
13 chr5:12129 9920078
Upvotes: 4
Reputation: 22807
Not exactly sure what you want, but it might be close to this:
library(reshape2)
mymat <- structure(
c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
"chr5:12118", "0N", "0N", "1N", "0N", "0N", "00", "00", "00",
"11", "10", "00", "00", "1N", "0N", "00"), .Dim = c(5L, 4L),
.Dimnames = list(
c("34", "35", "36", "37", "38"),
c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")))
key.table<- structure(
c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
"chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125",
"chr5:12127", "chr5:12129", "9920068", "9920069", "9920070",
"9920071", "9920072", "9920073", "9920074", "9920075", "9920076",
"9920077", "9920078"), .Dim = c(11L, 2L),
.Dimnames = list(
c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
c("key", "variantId")))
# work with dataframes
mmdf <- data.frame(mymat)
ktdf <- data.frame(key.table)
tdf <- merge(mmdf,ktdf,by="key")
mltdf <- melt(tdf,id.vars=c("key","variantId"))
mltdf1 <- mltdf[mltdf$value != "0N" & mltdf$value != "00" ,]
mltdf1
Yielding:
key variantId variable value
5 chr5:12113 9920070 AMLM12001KP 1N
13 chr5:12114 9920071 AMAS.11.3.Diagnostic 11
14 chr5:12118 9920072 AMAS.11.3.Diagnostic 10
19 chr5:12113 9920070 AMLM12014N.R 1N
Upvotes: 1