Max
Max

Reputation: 121

Check if value in two columns, otherwise remove it from dataframe

I want to create a function that checks whether a certain value is present in two columns. If the answer is affirmative, I want it to do nothing. If a value is only present in one of the columns, I want to remove it from the other one, too. The end result should be two identical columns.

I tried to use the %in% function, however I don´t seem to be able to get quite where I want.

testfunction <- function(dataframe, conditionvariable, idvariable){
  #catches all levels of condition variable
  conditions <- unique(dataframe$Group)
  #We only want to compare two conditions
  dataGroupA <- dataframe %>% filter(Group == conditions[1])
  dataGroupB <- dataframe %>% filter(Group == conditions[2])
  
  #check for each participant number if there are values in both groups. If there are not, remove the participant from the dataset. Finally, return the new data set with the removed participants.
  for(ppnumber in dataGroupA){
    for(ppnumber in dataGroupB){
      
      if(ppnumber %in% dataGroupA){
        if(!ppnumber %in% dataGroupB){
          dataframeUpdated <- dataframe %>% filter(participant == ppnumber)
}      
}#close if Clause
      
      if(ppnumber %in% dataGroupB){
        if(!ppnumber %in% dataGroupA){
          dataframeUpdated <- dataframe %>% filter(participant == ppnumber)
}      
}#close if Clause
      else{
        dataframeUpdated <- dataframe
      }#close else loop
    }#close for loop groupB
  } #close for loop groupA
} #close function


#Sample Data
group <- c(1,2,1,2,1,2,1,2)
participant <- c(1,1,1,2,2,4,3,4)
latency <- c(1.1, 1.4, 1.3, 1.5, 1.3, 0.5, 0.7, 1.5)
dataframe <- as.data.frame(cbind(participant, group, latency))

#expected output:

dataframeExpected <- dataframe[1:5,]

#PP1 and PP2 both have some data in both groups/conditions. 
#PP3 and PP4 do only have data in one condition, hence they should be removed.

Any help is appreciated.

The expected output is a dataframe that only contains the data of participants of which there is data in both groups (group being a within-subject factor).

There was a solution provided, which only works sometimes. Here is code reproducing when it does not work:

#first some data
dfn <- structure(list(X = c(11L, 12L, 18L, 20L, 27L, 29L, 33L, 36L, 
37L, 47L, 51L, 55L, 57L, 58L, 61L, 68L, 71L, 76L, 77L, 78L, 80L, 
84L, 86L, 87L, 90L, 92L, 95L, 98L, 100L, 101L, 103L, 106L, 109L, 
114L, 116L, 117L, 119L, 122L, 123L, 130L, 131L, 132L, 136L, 138L, 
140L, 141L, 143L, 144L, 146L, 147L, 149L, 154L, 158L, 160L, 161L, 
163L, 167L, 168L, 169L, 170L, 174L, 177L, 178L, 180L, 182L, 186L, 
193L, 195L, 196L, 197L, 198L, 200L, 201L, 203L, 204L, 205L, 206L, 
209L, 210L, 213L, 215L, 216L, 217L, 218L, 220L, 222L, 224L, 226L, 
227L, 229L, 230L, 231L, 232L, 234L, 238L, 239L, 240L, 241L, 242L, 
246L, 249L, 251L, 252L, 253L, 254L, 255L, 256L, 260L, 261L, 262L, 
263L, 266L, 267L, 269L, 271L, 272L, 273L, 274L, 275L, 276L, 279L, 
280L, 282L, 284L, 285L, 286L, 287L, 288L, 289L, 291L, 293L, 294L, 
295L, 298L, 300L, 301L, 304L, 305L, 306L, 308L, 309L, 311L, 312L, 
313L, 315L, 316L, 318L, 319L, 320L, 321L, 324L, 325L, 326L, 327L, 
328L, 330L, 331L, 332L, 333L, 334L, 336L, 339L, 340L, 341L, 342L, 
344L, 345L, 347L, 348L, 349L, 350L, 353L, 354L, 356L, 357L, 358L, 
360L, 362L, 366L, 368L, 369L, 370L, 371L, 372L, 379L, 380L, 381L, 
384L, 385L, 387L, 388L, 389L, 392L, 395L, 399L, 401L, 402L, 404L, 
405L, 406L), participant = c(31L, 31L, 31L, 31L, 31L, 31L, 31L, 
31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 
31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 
31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 
31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 
31L, 31L, 31L, 31L, 31L, 31L, 31L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L, 
33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L), latency = c(1.241730792, 
2.158541302, 0.891793295, 1.725196538, 1.141727408, 1.291906705, 
2.391879296, 1.475077386, 1.102099018, 1.208499439, 1.608445841, 
1.275157173, 1.475068786, 1.225057273, 1.29179213, 1.375061518, 
1.62511187, 1.458515113, 1.175086097, 1.325173291, 1.141661382, 
1.541854966, 0.891640437, 1.708388748, 2.508667999, 1.791765498, 
1.208331599, 1.758546351, 1.325188826, 1.275036217, 1.475070173, 
2.008490728, 1.591703522, 1.101920637, 1.341740835, 1.491791409, 
1.541778953, 1.391753069, 1.541872166, 1.102017734, 1.258391273, 
0.974974463, 0.90828122, 1.291834021, 1.425102049, 2.308734192, 
1.508330934, 1.158520772, 1.641787886, 0.724940479, 2.008608631, 
1.641750434, 0.708203154, 1.758491421, 1.658470282, 0.691591222, 
1.341717254, 1.608539054, 1.458390551, 2.325271775, 2.241825948, 
1.791821259, 1.891815766, 1.208526903, 1.325015161, 0.758281414, 
1.10965261976, 1.37513309242, 1.22519903524, 1.30854166216, 1.10197473366, 
1.15839843002, 1.20852218725, 1.19183313276, 1.10203687582, 1.30849283617, 
1.19171578394, 0.958308434856, 1.42508124255, 1.10204575328, 
0.908274006988, 1.12500395324, 1.40856585321, 1.1019153657, 0.958443538575, 
0.941628812658, 1.47504576049, 0.924951964663, 1.1019153657, 
1.10196752073, 1.54169378451, 1.22501177649, 1.10191869475, 0.90829092963, 
0.908266239217, 0.958333402688, 1.45836142215, 0.85834499886, 
1.10202744353, 0.958313705843, 0.908268736001, 0.808449281176, 
1.10205435331, 1.10191786249, 0.79179351713, 0.858266211475, 
1.10196668847, 1.42508151998, 1.1249706628, 1.72516352544, 1.15850995301, 
0.974938121387, 1.29183679471, 1.191713842, 0.808320835546, 1.1916525321, 
0.808365500225, 0.808249815932, 1.22504423468, 1.30837049379, 
0.741602346754, 1.12511603107, 1.42505932635, 0.974968360207, 
1.22501649264, 1.275043985, 0.94168374189, 1.10199997892, 1.49183746052, 
1.30836716475, 1.39183823729, 1.30836938411, 2.07520588753, 0.958485983891, 
1.65851910788, 0.924998848706, 1.72524425476, 1.10198028207, 
0.94175892281, 1.10193367545, 1.10194754647, 1.10203992745, 1.10204769522, 
1.22500012484, 1.10193728191, 0.891663462462, 0.925117029782, 
1.10200219828, 1.29175023955, 1.3083338743, 0.875121163346, 1.10196585621, 
1.77526425677, 1.2251629706, 0.891650423704, 1.10204408875, 1.10197695303, 
1.32506981284, 1.10202855321, 1.10197279172, 1.4085669629, 1.15834682983, 
1.52508545934, 0.791753568597, 0.908250148836, 1.40841188491, 
1.40849982717, 0.975095696156, 1.10194144322, 1.20839290936, 
1.52507602705, 1.10189705596, 0.891815488823, 1.10202994032, 
0.841608228066, 1.37504098886, 1.49169708581, 1.10196890783, 
1.50853206341, 0.924902583838, 0.891599100936, 1.10198915952, 
0.874946388514, 0.758285575417, 1.45891598546, 1.22515409314, 
1.82517939388, 1.5250998852, 1.4585664358, 1.10193228835, 1.10204436617, 
1.30839546162, 0.924993577719, 1.17497485184, 1.12509245034, 
1.10189400434, 1.14180120167, 1.10199248857, 0.908294258675, 
1.15838012028), Group = c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 
1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 
1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 
2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L)), row.names = c(NA, 
200L), class = "data.frame")
#There clearly is more than 1 value per group per participant.

groups <- aggregate(
  Group ~ participant,
  dfn,
  function(x) identical(sort(unique(x)), unique(dfn$Group))
)

compliant_participants <- groups[which(groups$Group),][["participant"]]
data_cleaned <- dfn[dfn$participant %in% compliant_participants,]
#The code falsely states that these participants are not compliant.

Upvotes: 0

Views: 57

Answers (1)

bcarlsen
bcarlsen

Reputation: 1441

> groups <- aggregate(
    group ~ participant,
    dataframe,
    function(x) identical(sort(unique(x)), sort(unique(dataframe$group)))
  )

> compliant_participants <- groups[which(groups$group),][["participant"]]

> dataframe[participant %in% compliant_participants,]

  participant group latency
1           1     1     1.1
2           1     2     1.4
3           1     1     1.3
4           2     2     1.5
5           2     1     1.3

Upvotes: 1

Related Questions