Reputation:

Keep less recent duplicate row in R

So, I have a dataset with bill numbers, day, month, year, and aggregate value. There are a bunch of bull number duplicates, and I want to keep the first ones. If there is a duplicate with the same day, month, and year, I want to keep the one with the highest amount in aggregate value.

For example, if the dataset now looks like this:

Bill Number   Day   Month    Year   Ag. Value
   1           10     4       1998     10
   1           11     4       1998     14
   2           23     11      2001     12
   2           23     11      2001     9
   3           11     3       2005     8
   3           12     3       2005     9
   3           13     3       2005     4

I want the result to look like this:

Bill Number  Day  Month  Year  Ag. Value
    1         10    4     1998    10
    2         23    11    2001    12
    3         11    3     2005    8

I'm not sure if there is a command I can use and just introduce all these arguments or if I should do it in stages, but either way I'm not sure how to begin. I used duplicate() and unique() and then got stuck.

Thanks!

Upvotes: 2

Answers (4)

Alessio

Reputation: 920

I used some loops and condition checks, and tried with a test set besides the "base" set you mentioned.

library(tidyverse)

#base dataset
billNumber <- c(1,1,2,2,3,3,3)
day <- c(10,11,23,23,11,12,13)
month <- c(4,4,11,11,3,3,3)
year <- c(1998,1998,2001,2001,2005,2005,2005)
agValue <- c(10,14,12,9,8,9,4)

#test dataset
billNumber <- c(1,1,2,2,3,3,3,4,4,4)
day <- c(10,11,23,23,11,12,13,15,15,15)
month <- c(4,4,11,11,3,3,3,6,6,6)
year <- c(1998,1998,2001,2001,2005,2005,2005,2020,2020,2020)
agValue <- c(10,14,9,12,8,9,4,13,15,8)

#build the dataset
df <- data.frame(billNumber,day,month,year,agValue)

#add a couple of working columns
df_full <- df %>%
  mutate(
    concat = paste(df$billNumber,df$day,df$month,df$year,sep="-"),
    flag = ""
  )

df_full

billNumber day month year agValue       concat flag
1          1  10     4 1998      10  1-10-4-1998     
2          1  11     4 1998      14  1-11-4-1998     
3          2  23    11 2001      12 2-23-11-2001     
4          2  23    11 2001       9 2-23-11-2001     
5          3  11     3 2005       8  3-11-3-2005     
6          3  12     3 2005       9  3-12-3-2005     
7          3  13     3 2005       4  3-13-3-2005     

#separate records with one/multi occurence as defined in the question
row_single <- df_full %>% count(concat) %>% filter(n == 1)
df_full_single <- df_full[df_full$concat %in% row_single$concat,]

row_multi <- df_full %>% count(concat) %>% filter(n > 1)
df_full_multi <- df_full[df_full$concat %in% row_multi$concat,]

#flag the rows with single occurence
df_full_single[1,]$flag = "Y"

for (row in 2:nrow(df_full_single)) {

  if (df_full_single[row,]$billNumber == df_full_single[row-1,]$billNumber) {
    df_full_single[row,]$flag = "N"    
  } else 
  {
    df_full_single[row,]$flag = "Y"
  }
}

df_full_single


#flag the rows with multi occurences
df_full_multi[1,]$flag = "Y"

for (row in 2:nrow(df_full_multi)) {

  if (
      (df_full_multi[row,]$billNumber == df_full_multi[row-1,]$billNumber) &
      (df_full_multi[row,]$agValue > df_full_multi[row-1,]$agValue)
     ) {
    df_full_multi[row,]$flag = "Y"    
    df_full_multi[row-1,]$flag = "N"
  } else 
  {
    df_full_multi[row,]$flag = "N"
  }
}

df_full_multi

#rebuild full dataset and retrieve the desired output

df_full_final <- rbind(df_full_single,df_full_multi)

df_full_final <- df_full_final[df_full_final$flag == "Y",c(1,2,3,4,5)]

df_full_final <- df_full_final[order(df_full_final$billNumber),]

df_full_final

billNumber day month year agValue
1          1  10     4 1998      10
3          2  23    11 2001      12
5          3  11     3 2005       8

Upvotes: 0

Hasan Mahmud Sujan

Reputation: 1

This answer uses dplyr package and satisfies your condition: "If there is a duplicate with the same day, month, and year, I want to keep the one with the highest amount in aggregate value."

library(data.table)
library(dplyr)

myData <- fread("Bill_Number   Day   Month    Year   Ag_Value
        1           10     4       1998     10
        1           11     4       1998     14
        2           23     11      2001     12
        2           23     11      2001     9
        3           11     3       2005     8
        3           12     3       2005     9
        3           13     3       2005     4", header = TRUE)

myData <- as.tibble(myData) #tibble form
sData <- arrange(myData, Bill_Number, Year, Month, Day, desc(Ag_Value)) #sort the data with the required manner 
fData <- distinct(sData, Bill_Number, .keep_all = 1) #final data
fData
# A tibble: 3 x 5
  Bill_Number   Day Month  Year Ag_Value
       <int> <int> <int> <int>    <int>
1           1    10     4  1998       10
2           2    23    11  2001       12
3           3    11     3  2005        8

Upvotes: 0

Wimpel

Reputation: 27732

library( data.table )

dt <- fread("Bill_Number   Day   Month    Year   Ag_Value
1           10     4       1998     10
1           11     4       1998     14
2           23     11      2001     12
2           23     11      2001     9
3           11     3       2005     8
3           12     3       2005     9
3           13     3       2005     4", header = TRUE)

dt[ !duplicated( Bill_Number), ]  

#    Bill_Number Day Month Year Ag_Value
# 1:           1  10     4 1998       10
# 2:           2  23    11 2001       12
# 3:           3  11     3 2005        8

dt[, .SD[1], by = .(Bill_Number) ]  #other approach, a bit slower

Upvotes: 3

nope

Reputation: 111

duplicated() gives you the entries which are identical to earlier one (i.e. ones with smaller subscripts). Therefore, sorting your bill numbers by date (earliest to the top) and then removing duplicates should do the trick. Aggregating your columns day, month and year into one date-column might be helpful.

Upvotes: 1

Keep less recent duplicate row in R

Answers (4)

Related Questions