Reputation: 309
I have a data.frame
ID code1 code2 code3
A 143 143 144
A 35 453 35
A 35 15
B 46 46 45
B 12 43 765
C 255 455 344
C 343 343 343
C 343 23 23
each code appears in one time
the id may be repeated. The real data.frame is very large
ID code1 code2 code3
A 143 144
A 35 453
A 35 15
B 46 45
B 12 43 765
C 255 455 344
C 343
C 343 23
thanks
Upvotes: 4
Views: 1937
Reputation: 118779
How about this?
df[, 2:4][t(apply(df[,2:4], 1, duplicated))] <- NA
Edit: A much faster base solution:
for (i in 2:(ncol(df)-1)) {
for (j in (i+1):ncol(df)) {
chk <- df[[i]] == df[[j]]
df[[j]][chk] <- NA
}
}
Here's a benchmark of the above two methods along with AnandaMahto's reshape2
and data.table
methods on bigger data. Using for-loop
with right i
and j
indices seems to be the fastest.
require(microbenchmark)
microbenchmark(ar.f <- arun.f(df), ar.s <- arun.s(df),
an.f <- ananda.ave(df),
an.s <- ananda.dt(copy(DT)), times=10)
# Unit: milliseconds
# expr min lq median uq max neval
# arun.f(df) 4816.3937 5197.0626 6402.454 6955.9380 7534.6912 10
# arun.s(df) 114.8372 118.7971 149.284 202.6081 297.4787 10
# ananda.ave(df) 2877.7936 3288.5935 3650.660 3985.5390 4111.9064 10
# ananda.dt(copy(DT)) 3383.1229 3861.6379 4432.751 4776.6108 5368.6504 10
set.seed(1234)
df <- cbind(data.frame(ID = rep(letters[1:20], each=1e4)), stringsAsFactors=FALSE),
matrix(sample(1:10, 6 * 1e5, replace=TRUE), ncol=3))
names(df)[2:4] <- paste0("code", 1:3)
arun.f <- function(df) {
df[, 2:4][t(apply(df[,2:4], 1, duplicated))] <- NA
df
}
arun.s <- function(df) {
for (i in 2:(ncol(df)-1)) {
for (j in (i+1):ncol(df)) {
chk <- df[[i]] == df[[j]]
df[[j]][chk] <- NA
}
}
df
}
ave
+ reshape2
solution:library(reshape2)
ananda.ave <- function(df) {
df$ID2 <- with(df, ave(ID, ID, FUN = seq_along))
m.df <- melt(df, id.vars=c("ID", "ID2"))
m.df[duplicated(m.df[setdiff(names(m.df), "variable")]), "value"] <- NA
dcast(m.df, ID + ID2 ~ variable)
}
data.table
solution:(modified a bit to be more optimised)
library(data.table)
DT <- data.table(df)
ananda.dt <- function(dt) {
temp <- dt[, list(ID2 = 1:.N, Value = unlist(.SD, use.names=FALSE)), by ="ID"]
temp[duplicated(temp), Value := NA]
out <- setnames(temp[, as.list(Value), by=list(ID, ID2)], 3:5, paste0("code", 1:3))
}
Upvotes: 4
Reputation: 193507
This solution is likely to be somewhat inefficient, but that is mostly because of transforming the data back and forth between wide and long and wide. However, you might find it easier to work with your data in a "long" form.
First, generate a second ID, since you have IDs spanning multiple rows.
mydf$ID2 <- with(mydf, ave(ID, ID, FUN = seq_along))
Second, use melt
from the "reshape2" package to make your data into a long form.
library(reshape2)
m.df <- melt(mydf, id.vars=c("ID", "ID2"))
With the data in its long form, it is much easier to identify duplicates and replace them with NA
.
m.df[duplicated(m.df[setdiff(names(m.df), "variable")]), "value"] <- NA
If you are happy with your data in the long form. Stop there. If you want to get it back to its wide form, use dcast
(again from "reshape2").
dcast(m.df, ID + ID2 ~ variable)
# ID ID2 code1 code2 code3
# 1 A 1 143 NA 144
# 2 A 2 35 453 NA
# 3 A 3 NA 35 15
# 4 B 1 46 NA 45
# 5 B 2 12 43 765
# 6 C 1 255 455 344
# 7 C 2 343 NA NA
# 8 C 3 343 23 NA
For reference, this is also doable in base R, but the syntax is more clumsy (even though it might perform better than the "reshape2" equivalents).
mydf$ID2 <- with(mydf, ave(ID, ID, FUN = seq_along))
m.df <- cbind(mydf[c("ID", "ID2")],
stack(mydf[setdiff(names(mydf), c("ID", "ID2"))]))
m.df[duplicated(m.df[setdiff(names(m.df), "ind")]), "values"] <- NA
cbind(mydf[c("ID", "ID2")], unstack(m.df, values ~ ind))
data.table
solutionYou may want to explore data.table
since you mention that your data are large. Here's one possible solution (though @Arun might have a more direct solution to share).
library(data.table)
DT <- data.table(mydf)
## Creates your long data.table
temp <- DT[, list(ID2 = 1:.N, Value = unlist(.SD)), by ="ID"]
## Changes duplicates to NA and adds in the "Code" column
temp[duplicated(temp), Value := NA][, Variable := rep(names(DT)[-1],
each = nrow(DT))]
## "Reshapes" the data from long to wide
temp[, as.list(setattr(Value, 'names', Variable)), by=list(ID, ID2)]
# ID ID2 code1 code1 code1
# 1: A 1 143 NA 144
# 2: A 2 35 453 NA
# 3: A 3 NA 35 15
# 4: B 1 46 NA 45
# 5: B 2 12 43 765
# 6: C 1 255 455 344
# 7: C 2 343 NA NA
# 8: C 3 343 23 NA
Upvotes: 1