Reputation: 568
The goal is to extract rows from a dataframe/data.table which have duplicate values in some key columns (here NAME and DOB) but different values in another column (here ID).
Currently I'm doing this:
library(data.table)
# load the data
# 50-row sample customer table (a data.table): NAME + DOB identify a person;
# ID and PIN may vary across rows for the same person.
customers <- structure(list(
NAME = c("GEETA SHYAM RAO", "B V RAMANA", "GONTU VENKATARAMANAIAH",
"DAMAT RAMAKRISHNA", "MARIAM SUDHAKAR", "VELPURI LAKSHMI SUJATHA",
"MOHAMMED LIYAKHAT ALI", "VENKATESHWARAN PONNAMBALAM",
"DEVARAKONDA SATISH BABU", "GEEDI RAMULU", "KANDU OBULESU",
"J PARVATHALU(TEMP.SUB-STAFF)", "DOKKA RAJESH", "G TULASIRAM REDDY",
"MALLELA CHIRANJEEVI", "MANEPALLI VENKATA RAVAMMA",
"DOKKA JAGADEESHWAR", "K KRISHNA", "B SUDARSHAN", "B ANNAPURNA",
"CHAVVA SHIVA RAMULU", "BIKASH BAHADUR CHITRE", "DARBAR ASHOK",
"VEMULAPALLY SANGAMESHWAR RAO", "MOHAMMED ABDUL HAKEEM ANWAR",
"MANEPALLI SHIV SHANKAR RAO", "MOHD MISKEEN MOHIUDDIN",
"KOTLA CHENNAMMA", "NAYAK SURYAKANTH", "GOPIREDDY INDIRA",
"MEKALA SREEDEVI", "K KRISHNA", "B V RAMANA", "KUMMARI VENKATESHAM",
"BHAVANI CONSRUCTIONS", "UPPUTHOLLA KOTAIAH", "YEDIDHA NIRMALA DEVI",
"MARIAM SUDHAKAR", "B ANNAPURNA", "VELPURI LAKSHMI SUJATHA",
"DARBAR ASHOK", "AMMANA VISHNU VARDHAN REDDY", "ZAITOON BEE",
"MOHD CHAND PASHA", "PALERELLA RAMESH", "GEEDI SRINIVAS",
"RAMAIAH SADU", "BIMAN BALAIAH", "KOTLA CHENNAMMA",
"VENKATESHWARAN PONNAMBALAM"),
DOB = c("13-02-1971", "15-01-1960", "01-07-1970", "10-03-1977",
"24-01-1954", "28-06-1971", "26-01-1980", "14-04-1969", "23-09-1978",
"15-08-1954", "09-10-1984", "20-02-1975", "29-09-1984", "03-03-1975",
"26-01-1979", "01-01-1964", "21-01-1954", "01-05-1964", "12-03-1975",
"12-12-1962", "10-03-1982", "14-05-1983", "03-01-1950", "04-03-1962",
"12-05-1966", "01-06-1960", "10-03-1964", "15-07-1958", "26-06-1979",
"02-04-1974", "10-01-1975", "01-05-1964", "15-01-1960", "08-08-1977",
NA, "05-04-1981", "29-08-1971", "24-01-1954", "12-12-1962",
"28-06-1971", "03-01-1950", "23-06-1970", "20-02-1960", "05-07-1975",
"10-01-1979", "31-08-1982", "10-08-1983", "10-03-1964",
"15-07-1958", "14-04-1969"),
ID = c(502969, 502902, 502985, 502981, 502475, 502267, 502976,
502272, 502977, 502973, 502986, 502978, 502989, 502998, 502967,
502971, 502988, 502737, 502995, 502878, 502972, 502984, 502639,
502968, 502975, 502970, 502997, 502466, 502991, 502982, 502980,
502737, 502902, 502999, 502994, 502987, 502990, 502047, 502877,
502251, 502548, 502992, 503000, 502993, 502983, 502974, 502996,
502979, 502467, 502290),
PIN = c(500082, 500032, 500032, 500032,
500032, 500084, 500032, 500032, 500032, 500032, 500032, 500084,
500032, 500084, 500084, 500032, 5e+05, 500050, 500032, 500084,
500032, 500032, 500032, 500050, 500032, 500032, 500045, 500032,
500084, 500032, 500032, 500084, 500035, 500084, 500032, 500032,
500032, 500032, 500084, 500032, 500084, 500033, 500084, 500032,
500032, 500032, 500084, 500032, 500032, 500032)),
.Names = c("NAME", "DOB", "ID", "PIN"),
class = c("data.table", "data.frame"), row.names = c(NA,-50L))
Checkout the data:
dim(customers)
#[1] 50 4
head(customers)
NAME DOB ID PIN
#1: GEETA SHYAM RAO 13-02-1971 502969 500082
#2: B V RAMANA 15-01-1960 502902 500032
#3: GONTU VENKATARAMANAIAH 01-07-1970 502985 500032
#4: DAMAT RAMAKRISHNA 10-03-1977 502981 500032
#5: MARIAM SUDHAKAR 24-01-1954 502475 500032
#6: VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
Step1: Get rows with same values in NAME and DOB columns -
dup1 <- customers[, .(ID, PIN, .N), keyby=.(NAME, DOB)][N>1][, -"N"]
dup1
# NAME DOB ID PIN
# 1: B ANNAPURNA 12-12-1962 502878 500084
# 2: B ANNAPURNA 12-12-1962 502877 500084
# 3: B V RAMANA 15-01-1960 502902 500032
# 4: B V RAMANA 15-01-1960 502902 500035
# 5: DARBAR ASHOK 03-01-1950 502639 500032
# 6: DARBAR ASHOK 03-01-1950 502548 500084
# 7: K KRISHNA 01-05-1964 502737 500050
# 8: K KRISHNA 01-05-1964 502737 500084
# 9: KOTLA CHENNAMMA 15-07-1958 502466 500032
#10: KOTLA CHENNAMMA 15-07-1958 502467 500032
#11: MARIAM SUDHAKAR 24-01-1954 502475 500032
#12: MARIAM SUDHAKAR 24-01-1954 502047 500032
#13: VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
#14: VELPURI LAKSHMI SUJATHA 28-06-1971 502251 500032
#15: VENKATESHWARAN PONNAMBALAM 14-04-1969 502272 500032
#16: VENKATESHWARAN PONNAMBALAM 14-04-1969 502290 500032
In the above result, the ID values of "B V RAMANA" and "K KRISHNA" are same in their duplicate rows and therefore need to be removed.
Step 2: Get rows with same values in NAME, DOB and ID columns -
dup2 <- dup1[, .(PIN, .N), keyby=.(NAME, DOB, ID)][N>1][, -"N"]
dup2
# NAME DOB ID PIN
#1: B V RAMANA 15-01-1960 502902 500032
#2: B V RAMANA 15-01-1960 502902 500035
#3: K KRISHNA 01-05-1964 502737 500050
#4: K KRISHNA 01-05-1964 502737 500084
Step 3: Now remove rows in Step 2 from rows in Step 1 to get the final result -
result <- fsetdiff(dup1, dup2)
result
# NAME DOB ID PIN
# 1: B ANNAPURNA 12-12-1962 502878 500084
# 2: B ANNAPURNA 12-12-1962 502877 500084
# 3: DARBAR ASHOK 03-01-1950 502639 500032
# 4: DARBAR ASHOK 03-01-1950 502548 500084
# 5: KOTLA CHENNAMMA 15-07-1958 502466 500032
# 6: KOTLA CHENNAMMA 15-07-1958 502467 500032
# 7: MARIAM SUDHAKAR 24-01-1954 502475 500032
# 8: MARIAM SUDHAKAR 24-01-1954 502047 500032
# 9: VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
#10: VELPURI LAKSHMI SUJATHA 28-06-1971 502251 500032
#11: VENKATESHWARAN PONNAMBALAM 14-04-1969 502272 500032
#12: VENKATESHWARAN PONNAMBALAM 14-04-1969 502290 500032
In every case above, the NAME and DOB columns have duplicate values but the values in ID column for those duplicate rows are necessarily different.
That's three lines of processing code for getting the result but I'm sure there must be alternative methods. In this example, there are just four fields. With say, more than 50 fields it would be a tedious job to put all the field names in code even with copy-paste. Therefore, it would be really cool to create a re-usable function which could take as input the dataframe/data.table, the names of the columns that must be duplicated, and the name of the column whose values must differ,
and output the result as a dataframe/data.table. Ideas please.
Side note: This feature is deemed so important in fraud analytics that a commercial software "CaseWare IDEA" offers it by the name "Duplicate Key Exclusion". Checkout this feature in action: https://www.youtube.com/watch?v=XqL4j8UXsKw
Upvotes: 3
Views: 1433
Reputation: 66819
I think the OP's way is very good already. However, if `j = .N` is computed on its own (with nothing else in `j`), it will be more efficient. See `?GForce` for details.
I think the OP's two steps do not succeed in the "duplicate key exclusion" task described in the OP and linked video:
In every case above, the NAME and DOB columns have duplicate values but the values in ID column for those duplicate rows are necessarily different.
For the OP's two steps, there's...
# columns that define a "person" and the column whose values must differ
bycols = c("NAME", "DOB")
dcol = "ID"
cols = c(bycols, dcol)
# row numbers of customers whose (NAME, DOB) combination occurs more than once
w1 = customers[customers[, .N, by=bycols][N > 1L, !"N"], on=bycols, which=TRUE]
# of those rows, drop the ones whose full (NAME, DOB, ID) key is also repeated
customers[w1][!customers[w1, .N, by=cols][N > 1L, !"N"], on=cols]
For the quoted task ...
# keep one row per distinct full key, then count distinct keys per (NAME, DOB);
# N > 1 marks groups holding more than one distinct ID
mDT = customers[!duplicated(customers, by=cols), .N, by=bycols][N > 1L]
# join back to fetch every row belonging to those groups
customers[mDT[, !"N"], on=bycols]
Either way for the OP's example, we get
NAME DOB ID PIN
1: MARIAM SUDHAKAR 24-01-1954 502475 500032
2: MARIAM SUDHAKAR 24-01-1954 502047 500032
3: VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
4: VELPURI LAKSHMI SUJATHA 28-06-1971 502251 500032
5: VENKATESHWARAN PONNAMBALAM 14-04-1969 502272 500032
6: VENKATESHWARAN PONNAMBALAM 14-04-1969 502290 500032
7: B ANNAPURNA 12-12-1962 502878 500084
8: B ANNAPURNA 12-12-1962 502877 500084
9: DARBAR ASHOK 03-01-1950 502639 500032
10: DARBAR ASHOK 03-01-1950 502548 500084
11: KOTLA CHENNAMMA 15-07-1958 502466 500032
12: KOTLA CHENNAMMA 15-07-1958 502467 500032
mDT
is a summary table describing the duplicates, convenient for browsing:
> mDT
NAME DOB N
1: MARIAM SUDHAKAR 24-01-1954 2
2: VELPURI LAKSHMI SUJATHA 28-06-1971 2
3: VENKATESHWARAN PONNAMBALAM 14-04-1969 2
4: B ANNAPURNA 12-12-1962 2
5: DARBAR ASHOK 03-01-1950 2
6: KOTLA CHENNAMMA 15-07-1958 2
Edited by __San__ (Original Poster):
Dataset modified to show how to deal with the case pointed out earlier by Frank: "A group with ID = A A B would lose the two As and retain the B"
library(data.table)
# load the data
# Modified 50-row dataset: B V RAMANA now has three rows with IDs
# 502902, 502910, 502902 (the "ID = A A B" case from Frank's comment).
customers <- structure(list(
NAME = c("GEETA SHYAM RAO", "B V RAMANA", "GONTU VENKATARAMANAIAH",
"DAMAT RAMAKRISHNA", "MARIAM SUDHAKAR", "VELPURI LAKSHMI SUJATHA",
"MOHAMMED LIYAKHAT ALI", "VENKATESHWARAN PONNAMBALAM",
"DEVARAKONDA SATISH BABU", "GEEDI RAMULU", "KANDU OBULESU",
"B V RAMANA", "DOKKA RAJESH", "G TULASIRAM REDDY",
"MALLELA CHIRANJEEVI", "MANEPALLI VENKATA RAVAMMA",
"DOKKA JAGADEESHWAR", "K KRISHNA", "B SUDARSHAN", "B ANNAPURNA",
"CHAVVA SHIVA RAMULU", "BIKASH BAHADUR CHITRE", "DARBAR ASHOK",
"VEMULAPALLY SANGAMESHWAR RAO", "MOHAMMED ABDUL HAKEEM ANWAR",
"MANEPALLI SHIV SHANKAR RAO", "MOHD MISKEEN MOHIUDDIN",
"KOTLA CHENNAMMA", "NAYAK SURYAKANTH", "GOPIREDDY INDIRA",
"MEKALA SREEDEVI", "K KRISHNA", "B V RAMANA",
"KUMMARI VENKATESHAM", "BHAVANI CONSRUCTIONS",
"UPPUTHOLLA KOTAIAH", "YEDIDHA NIRMALA DEVI", "MARIAM SUDHAKAR",
"B ANNAPURNA", "VELPURI LAKSHMI SUJATHA", "DARBAR ASHOK",
"AMMANA VISHNU VARDHAN REDDY", "ZAITOON BEE", "MOHD CHAND PASHA",
"PALERELLA RAMESH", "GEEDI SRINIVAS", "RAMAIAH SADU",
"BIMAN BALAIAH", "KOTLA CHENNAMMA", "VENKATESHWARAN PONNAMBALAM"),
DOB = c("13-02-1971", "15-01-1960", "01-07-1970", "10-03-1977",
"24-01-1954", "28-06-1971", "26-01-1980", "14-04-1969",
"23-09-1978", "15-08-1954", "09-10-1984", "15-01-1960",
"29-09-1984", "03-03-1975", "26-01-1979", "01-01-1964",
"21-01-1954", "01-05-1964", "12-03-1975", "12-12-1962",
"10-03-1982", "14-05-1983", "03-01-1950", "04-03-1962",
"12-05-1966", "01-06-1960", "10-03-1964", "15-07-1958",
"26-06-1979", "02-04-1974", "10-01-1975", "01-05-1964",
"15-01-1960", "08-08-1977", NA, "05-04-1981", "29-08-1971",
"24-01-1954", "12-12-1962", "28-06-1971", "03-01-1950",
"23-06-1970", "20-02-1960", "05-07-1975", "10-01-1979",
"31-08-1982", "10-08-1983", "10-03-1964", "15-07-1958",
"14-04-1969"),
ID = c(502969, 502902, 502985, 502981, 502475, 502267, 502976,
502272, 502977, 502973, 502986, 502910, 502989, 502998, 502967,
502971, 502988, 502737, 502995, 502878, 502972, 502984, 502639,
502968, 502975, 502970, 502997, 502466, 502991, 502982, 502980,
502737, 502902, 502999, 502994, 502987, 502990, 502047, 502877,
502251, 502548, 502992, 503000, 502993, 502983, 502974, 502996,
502979, 502467, 502290),
PIN = c(500082, 500032, 500032, 500032, 500032, 500084, 500032, 500032,
500032, 500032, 500032, 500033, 500032, 500084, 500084, 500032,
5e+05, 500050, 500032, 500084, 500032, 500032, 500032, 500050,
500032, 500032, 500045, 500032, 500084, 500032, 500032, 500084,
500035, 500084, 500032, 500032, 500032, 500032, 500084, 500032,
500084, 500033, 500084, 500032, 500032, 500032, 500084, 500032,
500032, 500032)),
.Names = c("NAME", "DOB", "ID", "PIN"),
row.names = c(NA, -50L), class = c("data.table", "data.frame"))
# define function for duplicate key exclusion
# Duplicate-key exclusion: return the rows of DT whose `dup_cols`
# combination occurs with more than one distinct `unique_cols` value.
dupKeyEx <- function(DT, dup_cols, unique_cols) {
  key_cols <- c(dup_cols, unique_cols)
  # one row per distinct (dup_cols, unique_cols) combination
  distinct_keys <- DT[!duplicated(DT, by = key_cols)]
  # dup_cols groups holding two or more distinct unique_cols values
  multi <- distinct_keys[, .N, by = dup_cols][N > 1L][, !"N"]
  # pull all matching rows, de-duplicated on the full key
  unique(DT[multi, on = dup_cols], by = key_cols)
}
# call function
result <- dupKeyEx(customers, c("NAME", "DOB"), "ID")
result
The result tells us that B V RAMANA (same NAME and DOB) has been issued multiple IDs and shows those different IDs as under:
NAME DOB ID PIN
1: B ANNAPURNA 12-12-1962 502877 500084
2: B ANNAPURNA 12-12-1962 502878 500084
3: B V RAMANA 15-01-1960 502902 500032
4: B V RAMANA 15-01-1960 502910 500033
5: DARBAR ASHOK 03-01-1950 502548 500084
6: DARBAR ASHOK 03-01-1950 502639 500032
7: KOTLA CHENNAMMA 15-07-1958 502466 500032
8: KOTLA CHENNAMMA 15-07-1958 502467 500032
9: MARIAM SUDHAKAR 24-01-1954 502047 500032
10: MARIAM SUDHAKAR 24-01-1954 502475 500032
11: VELPURI LAKSHMI SUJATHA 28-06-1971 502251 500032
12: VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
13: VENKATESHWARAN PONNAMBALAM 14-04-1969 502272 500032
14: VENKATESHWARAN PONNAMBALAM 14-04-1969 502290 500032
If the OP method is used with this modified dataset, two rows with identical ID's of B V RAMANA are lost and the result will show just one (out of total three) IDs issued to B V RAMANA because with that method "a group with ID = A A B would lose the two As and retain the B" (to quote Frank). Same applies to the other data.table solution by Uwe Block. The result which does not serve the purpose is shown below.
NAME DOB ID PIN
1: B ANNAPURNA 12-12-1962 502878 500084
2: B ANNAPURNA 12-12-1962 502877 500084
3: B V RAMANA 15-01-1960 502910 500033
4: DARBAR ASHOK 03-01-1950 502639 500032
5: DARBAR ASHOK 03-01-1950 502548 500084
6: KOTLA CHENNAMMA 15-07-1958 502466 500032
7: KOTLA CHENNAMMA 15-07-1958 502467 500032
8: MARIAM SUDHAKAR 24-01-1954 502475 500032
9: MARIAM SUDHAKAR 24-01-1954 502047 500032
10: VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
11: VELPURI LAKSHMI SUJATHA 28-06-1971 502251 500032
12: VENKATESHWARAN PONNAMBALAM 14-04-1969 502272 500032
13: VENKATESHWARAN PONNAMBALAM 14-04-1969 502290 500032
Till someone thinks of a situation where the new method fails, I think we have got a correct solution for "duplicate key exclusion". Great catch and data.table solution by Frank.
For an extension to cover the case where values mismatch due to leading or trailing spaces, see the OP's follow-up at How to refer to multiple column names held in a variable inside a function, which uses stringr::str_trim()
and concludes:
# Duplicate-key exclusion with whitespace normalisation: returns rows of DT
# whose `dup_cols` combination occurs with more than one distinct
# `unique_cols` value, after trimming character key columns.
# NOTE(review): the `:=` below trims DT's character columns *in place*
# (by reference), so the caller's table is modified as a side effect.
dupKeyEx <- function(DT, dup_cols, unique_cols) {
cols <- c(dup_cols, unique_cols)
# character columns among the key columns; stray leading/trailing blanks
# there would make otherwise-equal keys compare unequal
chr_cols <- cols[sapply(DT[, ..cols], is.character)]
DT[, (chr_cols) := lapply(.SD, stringr::str_trim), .SDcols=chr_cols]
# one row per distinct full key, then count distinct keys per dup_cols group;
# N > 1 marks groups with several different unique_cols values
mDT <- DT[!duplicated(DT, by=cols), .N, by=dup_cols][N > 1L]
# join back to fetch the matching rows, de-duplicated on the full key
ans <- unique(DT[mDT[, !"N"], on=dup_cols], by=cols)
setorderv(ans, c(dup_cols, unique_cols))
return(ans)
}
Upvotes: 3
Reputation: 624
I've written a function which follows the basic ideas outlined in your question.
It finds all the unique ID
s there are for each person
(in this case NAME
and DOB
). Function arguments person
and ID
specify which columns to use.
# Return rows whose `person` columns are duplicated but whose (person, ID)
# combination is unique, using base R only.
#
# df     : a data.frame (or coercible) containing the columns below
# person : character vector of column names identifying a person
# ID     : name of the column whose values must differ within a group
my_idea <- function(df, person, ID) {
  # Count number of entries per person
  df$count <- 1
  entries <- aggregate(df["count"], by = df[person], FUN = sum)
  # Only consider people with multiple entries.
  # drop = FALSE keeps `entries` a data.frame even when `person` is a
  # single column; otherwise merge() below would fail on a bare vector.
  entries <- entries[entries$count > 1, person, drop = FALSE]
  df <- merge(df, entries)
  # Drop rows where the combined (person, ID) key is duplicated: those
  # repeat the same ID and are not "duplicate key exclusion" hits.
  key <- df[c(person, ID)]
  df <- df[!(duplicated(key) | duplicated(key, fromLast = TRUE)), ]
  # Remove the helper column before returning.
  return(df[, -match("count", names(df))])
}
my_idea(df=customers, person=c("NAME", "DOB"), ID="ID")
Upvotes: 0
Reputation: 42544
The OP has already given a data.table
solution in his question. However, here is an improved version:
library(data.table)
# define column names to check
dup_cols <- c("NAME", "DOB")
unique_cols <- "ID"
# set sort order for convenience and easy comparison of result
# note that this doesn't copy the data
# (fixed: original called setorderv(customers, dupe_cols) — `dupe_cols`
# is undefined and would raise "object 'dupe_cols' not found")
setorderv(customers, dup_cols)
# extract rows which are duplicates in dup_cols but unique in unique_cols
customers[(duplicated(customers, by = dup_cols) |
           duplicated(customers, by = dup_cols, fromLast = TRUE)) &
          !(duplicated(customers, by = c(dup_cols, unique_cols)) |
            duplicated(customers, by = c(dup_cols, unique_cols), fromLast = TRUE))]
#                          NAME        DOB     ID    PIN
#  1:                B ANNAPURNA 12-12-1962 502878 500084
#  2:                B ANNAPURNA 12-12-1962 502877 500084
#  3:               DARBAR ASHOK 03-01-1950 502639 500032
#  4:               DARBAR ASHOK 03-01-1950 502548 500084
#  5:            KOTLA CHENNAMMA 15-07-1958 502466 500032
#  6:            KOTLA CHENNAMMA 15-07-1958 502467 500032
#  7:            MARIAM SUDHAKAR 24-01-1954 502475 500032
#  8:            MARIAM SUDHAKAR 24-01-1954 502047 500032
#  9:    VELPURI LAKSHMI SUJATHA 28-06-1971 502267 500084
# 10:    VELPURI LAKSHMI SUJATHA 28-06-1971 502251 500032
# 11: VENKATESHWARAN PONNAMBALAM 14-04-1969 502272 500032
# 12: VENKATESHWARAN PONNAMBALAM 14-04-1969 502290 500032
This can be wrapped in a function as requested:
# define function
# Keep rows that repeat their dup_cols key while carrying a unique_cols
# value that is not itself repeated within the group.
filter_dupes <- function(DT, dup_cols, unique_cols) {
  full_cols <- c(dup_cols, unique_cols)
  # rows sharing their dup_cols values with at least one other row
  key_dup <- duplicated(DT, by = dup_cols) |
    duplicated(DT, by = dup_cols, fromLast = TRUE)
  # rows that are exact repeats on the combined key -- these are excluded
  full_dup <- duplicated(DT, by = full_cols) |
    duplicated(DT, by = full_cols, fromLast = TRUE)
  DT[key_dup & !full_dup]
}
# call function
result <- filter_dupes(customers, c("NAME", "DOB"), "ID")
result
returning the same result as above.
Note that the input data.table customers
already had been sorted using
setorderv(customers, dup_cols)
above and so is the result. You can set the sort order of result
also afterwards by, e.g.,
setorderv(result, c(dup_cols, unique_cols))
result
duplicated()
returns a logical vector indicating which rows of a data.table
are duplicates of a row with smaller subscripts. So,
customers[duplicated(customers, by = dup_cols)]
would only return the 2nd (and subsequent if any) occurence of any duplicate entry. Therefore, duplicated()
is called a second time but now looking from the reverse side:
customers[duplicated(customers, by = dup_cols) |
duplicated(customers, by = dup_cols, fromLast = TRUE)]
Note that the solution is taken from Finding ALL duplicate rows, including “elements with smaller subscripts”.
This exercise is repeated to find all rows which have duplicate values in the combined set of columns given by dup_cols
and unique_cols
to identify the rows to be omitted from the result. Instead of fsetdiff()
logical operators are used.
Note that unique_cols
may contain more than one column name.
Functions used for production codes always should check their arguments. Here is a version with checks included:
# Production variant of filter_dupes() with argument validation via checkmate.
filter_dupes <- function(DT, dup_cols, unique_cols) {
  # ---- argument checks ----
  checkmate::assert_data_table(DT)
  checkmate::assert_character(dup_cols, min.len = 1L, any.missing = FALSE)
  checkmate::assert_character(unique_cols, min.len = 1L, any.missing = FALSE)
  checkmate::assert_subset(dup_cols, names(DT))
  checkmate::assert_subset(unique_cols, names(DT))
  # verify that column names are disjoint (although error message isn't self-explanatory)
  checkmate::assert_set_equal(dup_cols, setdiff(dup_cols, unique_cols))
  # ---- filtering ----
  full_cols <- c(dup_cols, unique_cols)
  # rows whose dup_cols key occurs more than once anywhere in DT
  key_dup <- duplicated(DT, by = dup_cols) |
    duplicated(DT, by = dup_cols, fromLast = TRUE)
  # rows duplicated on the combined key -- excluded from the result
  full_dup <- duplicated(DT, by = full_cols) |
    duplicated(DT, by = full_cols, fromLast = TRUE)
  DT[key_dup & !full_dup]
}
Upvotes: 2
Reputation: 1975
Here you go.
# Return rows whose `duplic.vars` combination is duplicated but whose
# `differ.var` value is not shared with any other such row.
#
# df          : data.frame (or coercible, e.g. a data.table)
# duplic.vars : character vector of column names that must be duplicated
# differ.var  : name of the column whose values must differ
process.df <- function(df, duplic.vars, differ.var) {
  df <- as.data.frame(df)
  #
  # Rows whose duplic.vars combination occurs at least twice.
  # duplicated() from both ends replaces the original O(n^2) per-row scan
  # and, like %in%/match(), treats NAs in the key columns as equal.
  keys <- df[, duplic.vars, drop = FALSE]
  is.dup <- duplicated(keys) | duplicated(keys, fromLast = TRUE)
  new.df <- df[is.dup, ]
  #
  # Drop rows whose differ.var value is repeated among the retained rows.
  # Logical subsetting is used instead of new.df[-which(...), ]: when no
  # value is repeated, which() returns integer(0) and the negative index
  # would wrongly drop EVERY row.
  repeated.vals <- new.df[, differ.var][duplicated(new.df[, differ.var])]
  new.df <- new.df[!(new.df[, differ.var] %in% repeated.vals), ]
  #
  # return
  return(new.df)
}
res <- process.df(customers, duplic.vars = c("NAME", "DOB"), differ.var = "ID")
res[order(res[,1]),]
Upvotes: 0