sammyramz
sammyramz

Reputation: 563

Conditionally remove a row based on another id code

In a dataset which contains many ids, I am only trying to manipulate rows which have id 7 or 9, and leave everything else untouched.

I am trying to conditionally remove a row from 7 or 9 in all instances where there isn't a variable that corresponds to it. So, if in the case of the dput example below, I want to remove the ninth row from id=9 because id=7 does not have an itemcode=2. Vice versa for id=7, I am trying to remove its itemcode=9 because id=9 does not have it.

id  client    item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
...
7   7     Bob  eighth        8  100    13    18    15    NA    NA    NA    NA
8   7     Bob   ninth        9  100    11    21    10    NA    NA    NA    NA
9   9 Bob_new   first        1  100    NA    NA    NA    23    18    25    18

Code:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", 
"Bob_new", "Mark"), class = "factor"), item = structure(c(3L, 
9L, 4L, 2L, 8L, 7L, 1L, 5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L
), .Label = c("eighth", "fifth", "first", "fourth", "ninth", 
"second", "seventh", "sixth", "third"), class = "factor"), itemcode = c(1L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L
), unit = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = structure(c(5L, 
6L, 1L, 4L, 2L, 5L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("11", "12", "13", "22", "24", "25", "NA"), class = "factor"), 
X2002 = structure(c(4L, 8L, 1L, 3L, 7L, 2L, 5L, 6L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("13", "14", "15", 
"17", "18", "21", "22", "24", "NA"), class = "factor"), X2003 = structure(c(5L, 
1L, 4L, 2L, 6L, 1L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L), .Label = c("10", "11", "15", "19", "23", "24", "NA"), class = "factor"), 
X2004 = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 4L, 
2L, 6L, 1L, 3L, 4L, 3L, 4L), .Label = c("11", "14", "15", 
"20", "23", "25", "NA"), class = "factor"), X2005 = structure(c(6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 3L, 1L, 4L, 
3L), .Label = c("11", "13", "18", "19", "25", "NA"), class = "factor"), 
X2006 = structure(c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 6L, 
1L, 2L, 5L, 3L, 7L, 8L, 4L), .Label = c("10", "15", "18", 
"19", "20", "22", "23", "25", "NA"), class = "factor"), X2007 =     structure(c(8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 7L, 6L, 2L, 4L, 1L, 5L, 5L, 
3L), .Label = c("12", "13", "16", "18", "19", "21", "24", 
"NA"), class = "factor")), .Names = c("id", "client", "item", 
"itemcode", "unit", "X2001", "X2002", "X2003", "X2004", "X2005", 
"X2006", "X2007"), class = "data.frame", row.names = c(NA, -17L
))

———————————————————————————————————————— ANOTHER SCENARIO:

before:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"), 
item = structure(c(3L, 9L, 10L, 9L, 4L, 2L, 8L, 7L, 7L, 1L, 
5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L), .Label = c("eighth", 
"fifth", "first", "fourth", "ninth", "second", "seventh", 
"sixth", "third", "third "), class = "factor"), itemcode = c(1L, 
3L, 3L, 3L, 4L, 5L, 6L, 7L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 1L), type = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B", "C"), class = "factor"), unit = c(100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L, 
25L, 30L, 26L, 11L, 22L, 12L, 25L, 24L, 13L, 11L, NA, NA, 
NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 24L, 12L, 96L, 
13L, 15L, 22L, 21L, 14L, 18L, 21L, NA, NA, NA, NA, NA, NA, 
NA, NA, NA), X2003 = c(23L, 10L, 46L, 94L, 19L, 11L, 24L, 
19L, 10L, 15L, 10L, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
X2004 = c(NA, NA, 43L, 83L, NA, NA, NA, 6L, NA, NA, NA, 23L, 
20L, 14L, 25L, 11L, 15L, 20L, 15L, 20L), X2005 = c(NA, NA, 
97L, 86L, NA, NA, NA, 17L, NA, NA, NA, 18L, 13L, 19L, 18L, 
25L, 18L, 11L, 19L, 18L), X2006 = c(NA, NA, 11L, 91L, NA, 
NA, NA, 11L, NA, NA, NA, 25L, 22L, 10L, 15L, 20L, 18L, 23L, 
25L, 19L), X2007 = c(NA, NA, 19L, 27L, NA, NA, NA, 15L, NA, 
NA, NA, 18L, 24L, 21L, 13L, 18L, 12L, 19L, 19L, 16L)), .Names = c("id", 
"client", "item", "itemcode", "type", "unit", "X2001", "X2002", 
"X2003", "X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA, 
-20L))

after:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"), 
item = structure(c(2L, 7L, 3L, 1L, 5L, 4L, 2L, 6L, 3L, 1L, 
5L, 4L, 2L), .Label = c("fifth", "first", "fourth", "seventh", 
"sixth", "third", "third "), class = "factor"), itemcode = c(1L, 
3L, 4L, 5L, 6L, 7L, 1L, 3L, 4L, 5L, 6L, 7L, 1L), type = structure(c(1L, 
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B"), class = "factor"), unit = c(100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L, 
10L, 11L, 22L, 12L, 17L, NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 
87L, 13L, 15L, 22L, 19L, NA, NA, NA, NA, NA, NA, NA), X2003 = c(23L, 
47L, 19L, 11L, 24L, 17L, NA, NA, NA, NA, NA, NA, NA), X2004 = c(NA, 
28L, NA, NA, NA, 28L, 23L, 14L, 25L, 11L, 15L, 20L, 20L), 
X2005 = c(NA, 43L, NA, NA, NA, 16L, 18L, 19L, 18L, 25L, 18L, 
11L, 18L), X2006 = c(NA, 69L, NA, NA, NA, 5L, 25L, 10L, 15L, 
20L, 18L, 23L, 19L), X2007 = c(NA, 72L, NA, NA, NA, 20L, 
18L, 21L, 13L, 18L, 12L, 19L, 16L)), .Names = c("id", "client", 
"item", "itemcode", "type", "unit", "X2001", "X2002", "X2003", 
"X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA, 
-13L))

I could implement the said filter code to remove items which do not exist in its corresponding place (id 7 and 9).

But if there are sub levels for items, like type of item. I am also trying to remove items if they don't have a type similar in the corresponding field.

Upvotes: 0

Views: 91

Answers (4)

sammyramz
sammyramz

Reputation: 563

library(dplyr)
df$remove <- paste(df$itemcode, df$type)
df<-invisible(filter(df,
                          remove %in% intersect(remove[type==7], 
                                                remove[type==9])|!type %in%     c(7,9) ))
#Remove the additional column after filter
df$remove <- NULL

Upvotes: 1

akrun
akrun

Reputation: 887068

You could use filter from dplyr

library(dplyr)
filter(df_all, itemcode %in% intersect(itemcode[id==7], 
              itemcode[id==9])|!id %in% c(7,9) )
#    id  client    item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
#1   7     Bob   first        1  100    24    17    23    NA    NA    NA    NA
#2   7     Bob   third        3  100    25    24    10    NA    NA    NA    NA
#3   7     Bob  fourth        4  100    11    13    19    NA    NA    NA    NA
#4   7     Bob   fifth        5  100    22    15    11    NA    NA    NA    NA
#5   7     Bob   sixth        6  100    12    22    24    NA    NA    NA    NA
#6   7     Bob seventh        7  100    24    14    10    NA    NA    NA    NA
#7   7     Bob  eighth        8  100    13    18    15    NA    NA    NA    NA
#8   9 Bob_new   first        1  100    NA    NA    NA    23    18    25    18
#9   9 Bob_new   third        3  100    NA    NA    NA    14    19    10    21
#10  9 Bob_new  fourth        4  100    NA    NA    NA    25    18    15    13
#11  9 Bob_new   fifth        5  100    NA    NA    NA    11    25    20    18
#12  9 Bob_new   sixth        6  100    NA    NA    NA    15    18    18    12
#13  9 Bob_new seventh        7  100    NA    NA    NA    20    11    23    19
#14  9 Bob_new  eighth        8  100    NA    NA    NA    15    19    25    19
#15 10    Mark   first        1  100    NA    NA    NA    20    18    19    16

Update

Based on the new dataset, perhaps this helps

library(dplyr)
library(tidyr)
dfnew %>%
      unite(itemtype, itemcode,type) %>% 
      filter(itemtype %in% intersect(itemtype[id==7], 
             itemtype[id==9])|!id %in% c(7,9)) %>% 
      separate(itemtype, c('itemcode', 'type'))
#    id  client    item itemcode type unit X2001 X2002 X2003 X2004 X2005 X2006
# 1   7     Bob   first        1    A  100    24    17    23    NA    NA    NA
# 2   7     Bob  third         3    B  100    30    12    46    43    97    11
# 3   7     Bob  fourth        4    A  100    11    13    19    NA    NA    NA
# 4   7     Bob   fifth        5    A  100    22    15    11    NA    NA    NA
# 5   7     Bob   sixth        6    A  100    12    22    24    NA    NA    NA
# 6   7     Bob seventh        7    A  100    25    21    19     6    17    11
# 7   9 Bob_new   first        1    A  100    NA    NA    NA    23    18    25
# 8   9 Bob_new   third        3    B  100    NA    NA    NA    14    19    10
# 9   9 Bob_new  fourth        4    A  100    NA    NA    NA    25    18    15
# 10  9 Bob_new   fifth        5    A  100    NA    NA    NA    11    25    20
# 11  9 Bob_new   sixth        6    A  100    NA    NA    NA    15    18    18
# 12  9 Bob_new seventh        7    A  100    NA    NA    NA    20    11    23
# 13 10    Mark   first        1    A  100    NA    NA    NA    20    18    19
 #   X2007
 #1     NA
 #2     19
 #3     NA
 #4     NA
 #5     NA
 #6     15
 #7     18
 #8     21
 #9     13
 #10    18
 #11    12
 #12    19
 #13    16

Upvotes: 1

Rich Scriven
Rich Scriven

Reputation: 99331

You could do something like this, which runs setdiff in both directions. The cl() function wasn't really necessary, but I really don't like writing the same expression over and over again.

f <- function(x, y) setdiff(union(x, y), x)
cl <- function(var) substitute(df$itemcode[df$id == x], list(x = var))

So now you can call f() on c(id7, id9) and then reverse it and get the c(id9, id7) result.

do.call(f, x <- list(cl(7), cl(9)))
# [1] 2
do.call(f, rev(x))
# [1] 9

Upvotes: 0

bergant
bergant

Reputation: 7232

If I understand the problem: every itemcode in id=9 subset must have identical itemcode in id=7 subset (and reverse). If it is not the case then we filter the row with the non-pair itemcode out, but leave everything with id not in 7 or 9. Here is one way of doing it:

First get common item codes:

items_9 <- df_all$itemcode[ df_all$id==9 ] 
items_7 <- df_all$itemcode[ df_all$id==7 ] 
items_common <- items_9[ items_9 %in% items_7 ]

select everything with common itemcodes for 7 and 9 and the rest:

df_new <- df_all[ 
  which( 
    ( df_all$id %in% c(7, 9) & 
      df_all$itemcode %in% items_common
    ) |
    !df_all$id %in% c(7,9)
  )
,]

Upvotes: 1

Related Questions