Reputation: 6304
I have a data.frame
with several columns:
df <- data.frame(sgid = c("sg1","sg1","sg2","sg3"), stid = c(NA,"st1",NA,NA), spid = c(NA,NA,NA,"sp3"), sch = c("sch1","sch1","sch2","sch2"), sst = c(11,11,32,21), snd = c(21,21,46,34),
qgid = c("qg1","qg1","qg1","qg1"), qtid = c("qt1","qt1","qt1","qt1"), qpid = c("qp1","qp1","qp1","qp1"), qch = c("qch1","qch1","qch1","qch1"), qst = c(111,111,234,21), qnd = c(211,211,267,34))
This data.frame
describes mapping one sequence (i.e. query) onto a database of other sequences (i.e. search).
Search and query sequences are identified by three sets of ids:
gid
, tid
, and pid
(prefixed by s and q for search and query, respectively), and the coordinates of the matches are described by a string: ch
and two integers: st
and nd
(again, prefixed by s and q for search and query, respectively).
tid
and pid
are subsets of gid
in the case of the search, hence in the search database they are kept as separate rows. So, it is possible that the query will 'hit' both gid
and either tid
and/or pid
in different coordinates.
However, as in the case of rows 1 and 2 in df
, the query is hitting the search inside the tid
and hence the search and query coordinates of rows 1 and 2 are identical.
So I'm looking for is a function
(probably through dplyr::group
and dplyr::filter
) that will return a unique df
based on the definition above.
This is my crude way of achieving this:
tmp.df <- df %>% dplyr::select(-stid,-spid) %>% unique()
uniq.df <- do.call(rbind,lapply(1:nrow(tmp.df),function(i){
tmp.df.i <- tmp.df[i,,drop=F] %>% dplyr::left_join(df)
if(!(all(is.na(tmp.df.i$stid) & is.na(tmp.df.i$spid)))){
tmp.df.i <- tmp.df.i[which(!is.na(tmp.df.i$stid) | !is.na(tmp.df.i$spid)),,drop=F]
} else{
tmp.df.i <- tmp.df.i %>%
dplyr::select(-stid,-spid) %>%
dplyr::mutate(stid=NA,spid=NA)
}
return(tmp.df.i)
}))
#organize the columns of uniq.df to the order of df:
uniq.df <- uniq.df %>% dplyr::select_(.dots = colnames(df))
> uniq.df
sgid stid spid sch sst snd qgid qtid qpid qch qst qnd
2 sg1 st1 <NA> sch1 11 21 qg1 qt1 qp1 qch1 111 211
1 sg2 <NA> <NA> sch2 32 46 qg1 qt1 qp1 qch1 234 267
11 sg3 <NA> sp3 sch2 21 34 qg1 qt1 qp1 qch1 21 34
Looking for something more elegant.
Upvotes: 0
Views: 56
Reputation: 2650
How about something like this with dplyr
:
cols <- setdiff(names(df), c("stid", "spid"))
df %>% group_by_at(cols) %>%
summarise(stid = ifelse(length(unique(stid)) == 1,
unique(stid),
unique(stid)[! is.na(unique(stid))]),
spid = ifelse(length(unique(spid)) == 1,
unique(spid),
unique(spid)[! is.na(unique(spid))]))
Or you can use the function Coalesce
from the package DescTools
(or even define you own function to select the first non NA value):
df %>% group_by_at(cols) %>%
summarise(stid = DescTools::Coalesce(stid),
spid = DescTools::Coalesce(spid))
Upvotes: 1
Reputation: 27732
data.table
solution
sample data
# sgid stid spid sch sst snd qgid qtid qpid qch qst qnd
# 1: sg1 <NA> <NA> sch1 11 21 qg1 qt1 qp1 qch1 111 211
# 2: sg1 st1 <NA> sch1 11 21 qg1 qt1 qp1 qch1 111 211
# 3: sg2 <NA> <NA> sch2 32 46 qg1 qt1 qp1 qch1 234 267
# 4: sg3 <NA> sp3 sch2 21 34 qg1 qt1 qp1 qch1 21 34
code
library( data.table )
setDT( df )
#get columns you wish to exclude from duplication-check
cols <- c( "stid", "spid" )
#keep non-duplicated rows, based on a subset of df (without the columns in `cols`)
df[ !duplicated( df[, !..cols] ), ][]
# sgid stid spid sch sst snd qgid qtid qpid qch qst qnd
# 1: sg1 <NA> <NA> sch1 11 21 qg1 qt1 qp1 qch1 111 211
# 2: sg2 <NA> <NA> sch2 32 46 qg1 qt1 qp1 qch1 234 267
# 3: sg3 <NA> sp3 sch2 21 34 qg1 qt1 qp1 qch1 21 34
alternative
if you do not wish to keep the first row of a duplicate, but the last, use:
df[ !duplicated( df[, !..cols], fromLast = TRUE ), ][] #<-- note fromlast-argument!
# sgid stid spid sch sst snd qgid qtid qpid qch qst qnd
# 1: sg1 st1 <NA> sch1 11 21 qg1 qt1 qp1 qch1 111 211
# 2: sg2 <NA> <NA> sch2 32 46 qg1 qt1 qp1 qch1 234 267
# 3: sg3 <NA> sp3 sch2 21 34 qg1 qt1 qp1 qch1 21 34
Upvotes: 1