filter rows based on all previous row data in another column

Question

I have a data table which i would like to filter based on multiple conditions looking at all previous columns. If the New_ID.1 row number is before the same id in the New_ID column, remove the row where New_ID= New_ID.1 from previous row. For example, I would remove the New_ID 581 in row 3 because New_ID.1 is in row 1. However I don't want to remove row six New_ID 551 since row 3 New_ID.551 would be removed first. Essentially, I think i need to loop through and create a new filtered table for each row and repeat process?

orig_df<- structure(list(New_ID = c(557L, 588L, 581L, 580L, 591L, 551L, 
300L, 112L), New_ID.1 = c(581L, 591L, 551L, 300L, 112L, 584L, 
416L, 115L), distance = c(3339.15537217173, 3432.33715484179, 
5268.69104753613, 5296.72042763528, 5271.94917463488, 5258.66546295312, 
5286.99982045171, 5277.81914818968), X.x = c(903604.940384474, 
819515.728302034, 903663.550206032, 866828.860223065, 819525.350044447, 
903720.790105847, 866881.654186025, 819585.173276271), Y.x = c(1027706.41509243, 
1026880.34660449, 1024367.77412815, 1023962.99139374, 1023448.02293581, 
1019099.39402149, 1018666.53407908, 1018176.41319296), X.y = c(903663.550206032, 
819525.350044447, 903720.790105847, 866881.654186025, 819585.173276271, 
903801.327345876, 866919.184271939, 819630.672367509), Y.y = c(1024367.77412815, 
1023448.02293581, 1019099.39402149, 1018666.53407908, 1018176.41319296, 
1013841.34531459, 1013379.66746509, 1012898.79016799), Y_filter = c(3338.64096427278, 
3432.32366867992, 5268.38010666054, 5296.45731465891, 5271.60974284587, 
5258.04870690871, 5286.86661398865, 5277.62302497006), X_filter = c(58.609821557533, 
9.62174241337925, 57.2398998149438, 52.7939629601315, 59.8232318238588, 
80.5372400298947, 37.5300859131385, 45.4990912381327), row.number = 1:8), row.names = c(NA, 
-8L), class = c("tbl_df", "tbl", "data.frame"))

End result would retain rows 1,2,4,6 and 8 from original data

output_table<-structure(list(New_ID = c(557L, 588L, 580L, 551L, 112L), New_ID.1 = c(581L, 
591L, 300L, 584L, 115L), distance = c(3339.15537217173, 3432.33715484179, 
5296.72042763528, 5258.66546295312, 5277.81914818968), X.x = c(903604.940384474, 
819515.728302034, 866828.860223065, 903720.790105847, 819585.173276271
), Y.x = c(1027706.41509243, 1026880.34660449, 1023962.99139374, 
1019099.39402149, 1018176.41319296), X.y = c(903663.550206032, 
819525.350044447, 866881.654186025, 903801.327345876, 819630.672367509
), Y.y = c(1024367.77412815, 1023448.02293581, 1018666.53407908, 
1013841.34531459, 1012898.79016799), Y_filter = c(3338.64096427278, 
3432.32366867992, 5296.45731465891, 5258.04870690871, 5277.62302497006
), X_filter = c(58.609821557533, 9.62174241337925, 52.7939629601315, 
80.5372400298947, 45.4990912381327), row.number = c(1L, 2L, 4L, 
6L, 8L)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", 
"data.frame"))

Below is a simpler problem that might be of help.

Original data
A|B
C|D
B|E
E|F

Updated data table
A|B
C|D
E|F

Alex · Accepted Answer

I think looping through the rows and saving the ids that you already encountered should be enough?

orig_df <- as.data.frame(orig_df)
included_rows <- rep(FALSE, nrow(orig_df))
seen_ids <- c()
for(i in 1:nrow(orig_df)){
    # Skip row if we have seen either ID already
    if(orig_df[i, 'New_ID']   %in% seen_ids) next
    if(orig_df[i, 'New_ID.1'] %in% seen_ids) next
    # If both ids are new, we save them as seen and include the entry
    seen_ids <- c(seen_ids, orig_df[i, 'New_ID'] , orig_df[i, 'New_ID.1'] )
    included_rows[i] <-  TRUE
}
filtered_df <- orig_df[included_rows,]

filter rows based on all previous row data in another column

Answers (1)

Related Questions