Lili
Lili

Reputation: 587

Discard cases without two visits in R

I have this dataset:

df <- data.frame(PatientID = c("0002" ,"0002", "0005", "0005" ,"0009" ,"0009" ,"0018", "0018" ,"0020" ,"0027", "0039" ,"0039" ,"0042", "0043" ,"0043" ,"0045", "0046", "0046" ,"0048" ,"0048", "0055"),
                 Timepoint= c("A", "B", "A", "B", "A", "B", "A", "B", "A", "A", "A", "B", "A", "A", "B",  "A",  "A", "B", "A", "B", "A"),
                 A = c(NA , 977.146 , NA , 964.315 ,NA , 952.311 , NA , 950.797 , 958.975  ,960.712  ,NA , 947.465 , 902.852 , NA,  985.124  ,NA , 930.141 ,1007.790 , 948.848, 1027.110 , 999.414),
                 B = c(998.988 , NA , 998.680 , NA , NA ,1020.560 , 947.751 ,1029.560 , 955.540 , 911.606 , 964.039   ,    NA,  988.087 , 902.367 , 959.338 ,1029.050 , 925.162 , 987.374 ,1066.400  ,957.512 , 917.597),
                 C = c( NA , 987.140 , 961.810 , 929.466 , 978.166, 1005.820  ,925.752 , 969.469 , 943.398  ,936.034,  965.292 , 996.404 , 920.610 , 967.047  ,986.565 , 913.517 , 893.428 , 921.606 , NA , 929.590  ,950.493), 
                 D = c(975.634 , 987.140 , 961.810 , 929.466 , 978.166, 1005.820 , 925.752 , 969.469  ,943.398 , NA , 965.292 , 996.404 , NA , 967.047 , 986.565 , NA , 893.428 , 921.606 , 976.192 , 929.590 , 950.493),
                 E = c(1006.330, 1028.070 , NA , 954.274 ,1005.910  ,949.969 , 992.820 , 977.048  ,934.407 , 948.913 , NA , NA , NA,  961.375  ,955.296 , 961.128  ,998.119 ,1009.110 , 994.891 ,1000.170  ,982.763),
                 G= c(NA , 958.990 , NA , NA , 924.680 , 955.927 , NA , 949.384  ,973.348 , 984.392 , 943.894 , 961.468 , 995.368 , 994.997 , NA , 979.454 , 952.605 , NA  ,   NA, NA , 956.507), stringsAsFactors = F)

I would like to discard from this dataset all patients who don't have visit A and B. I attach an example:

enter image description here

Thank you so much!!

Upvotes: 1

Views: 104

Answers (3)

ThomasIsCoding
ThomasIsCoding

Reputation: 101189

You can try ave + subset like below

subset(
    df,
    ave(Timepoint %in% c("A", "B"), PatientID, FUN = all)
)

which gives

   PatientID Timepoint        A        B        C        D        E       G
1       0002         A       NA  998.988       NA  975.634 1006.330      NA
2       0002         B  977.146       NA  987.140  987.140 1028.070 958.990
3       0005         A       NA  998.680  961.810  961.810       NA      NA
4       0005         B  964.315       NA  929.466  929.466  954.274      NA
5       0009         A       NA       NA  978.166  978.166 1005.910 924.680
6       0009         B  952.311 1020.560 1005.820 1005.820  949.969 955.927
7       0018         A       NA  947.751  925.752  925.752  992.820      NA
8       0018         B  950.797 1029.560  969.469  969.469  977.048 949.384
9       0020         A  958.975  955.540  943.398  943.398  934.407 973.348
10      0027         A  960.712  911.606  936.034       NA  948.913 984.392
11      0039         A       NA  964.039  965.292  965.292       NA 943.894
12      0039         B  947.465       NA  996.404  996.404       NA 961.468
13      0042         A  902.852  988.087  920.610       NA       NA 995.368
14      0043         A       NA  902.367  967.047  967.047  961.375 994.997
15      0043         B  985.124  959.338  986.565  986.565  955.296      NA
16      0045         A       NA 1029.050  913.517       NA  961.128 979.454
17      0046         A  930.141  925.162  893.428  893.428  998.119 952.605
18      0046         B 1007.790  987.374  921.606  921.606 1009.110      NA
19      0048         A  948.848 1066.400       NA  976.192  994.891      NA
20      0048         B 1027.110  957.512  929.590  929.590 1000.170      NA
21      0055         A  999.414  917.597  950.493  950.493  982.763 956.507

Upvotes: 2

hello_friend
hello_friend

Reputation: 5788

Base R solution:

# Function to union all data.frames in  a list: 
# unionListOfDataFrames  => Function
unionListOfDataFrames <- function(dfList){
  # data.frame => GlobalEnv() 
  return(data.frame(
    do.call(
      rbind, 
      dfList
    ),
    stringsAsFactors = FALSE,
    row.names = NULL
    )
  )
}

# Store a vector of the timepoints each patient much have:
# timePoints => character vector
timePoints <- c("A", "B")

# Split-Apply-Combine: data.frame => stdout(console)
# Split data.frame into a list of data.frames on PatientId
with(df, split(df, PatientID)) |>
  # Map a function over the list to check if each of the time 
  # points is present in a given patients' (x's) data.frame
  # subset out observations (patients) that don't meet this condition
  lapply(function(x){
    x[all(timePoints %in% x$Timepoint)]
  }
  # Pipe into the combine (data.frame list row-bind) function:
  ) |>
  unionListOfDataFrames()

OR:

# Store a vector of the timepoints each patient much have:
# timePoints => character vector
timePoints <- c("A", "B")

# Subset the data: 
subset(
  df, 
  as.logical(
    # For each patient check if they are valid (meet condition):
    ave(
      Timepoint, 
      PatientID, 
      FUN = function(x){
        all(timePoints %in% x)
      }
    )
  )
)

Upvotes: 2

Gregor de Cillia
Gregor de Cillia

Reputation: 7645

If you want to filter for all patients that had exactly two visits, this can easily be done with dplyr. First you group by the patient id, and then you use the number of rows per patient as a filter condition.

library(dplyr)

df %>% 
  group_by(PatientID) %>% 
  filter(n() == 2)

A more rigorous way would be to check for each patient wether A and B occurs in the Timepoint column. This makes sure that patients who had two "A-Visits" are not included in the output.

df %>% 
  group_by(PatientID) %>% 
  filter("A" %in% Timepoint & "B" %in% Timepoint)

As mentioned by @IceCreamToucan in the comments, this syntax can become very tedious if a large number of visits is required. In this case it is more elegant to put the required visits in a vector and check the condition like so.

required_visits <- c("A", "B")

df %>% 
  group_by(PatientID) %>% 
  filter(all(required_visits %in% Timepoint))

Upvotes: 3

Related Questions