Reputation: 587
I have this dataset:
df <- data.frame(PatientID = c("0002" ,"0002", "0005", "0005" ,"0009" ,"0009" ,"0018", "0018" ,"0020" ,"0027", "0039" ,"0039" ,"0042", "0043" ,"0043" ,"0045", "0046", "0046" ,"0048" ,"0048", "0055"),
Timepoint= c("A", "B", "A", "B", "A", "B", "A", "B", "A", "A", "A", "B", "A", "A", "B", "A", "A", "B", "A", "B", "A"),
A = c(NA , 977.146 , NA , 964.315 ,NA , 952.311 , NA , 950.797 , 958.975 ,960.712 ,NA , 947.465 , 902.852 , NA, 985.124 ,NA , 930.141 ,1007.790 , 948.848, 1027.110 , 999.414),
B = c(998.988 , NA , 998.680 , NA , NA ,1020.560 , 947.751 ,1029.560 , 955.540 , 911.606 , 964.039 , NA, 988.087 , 902.367 , 959.338 ,1029.050 , 925.162 , 987.374 ,1066.400 ,957.512 , 917.597),
C = c( NA , 987.140 , 961.810 , 929.466 , 978.166, 1005.820 ,925.752 , 969.469 , 943.398 ,936.034, 965.292 , 996.404 , 920.610 , 967.047 ,986.565 , 913.517 , 893.428 , 921.606 , NA , 929.590 ,950.493),
D = c(975.634 , 987.140 , 961.810 , 929.466 , 978.166, 1005.820 , 925.752 , 969.469 ,943.398 , NA , 965.292 , 996.404 , NA , 967.047 , 986.565 , NA , 893.428 , 921.606 , 976.192 , 929.590 , 950.493),
E = c(1006.330, 1028.070 , NA , 954.274 ,1005.910 ,949.969 , 992.820 , 977.048 ,934.407 , 948.913 , NA , NA , NA, 961.375 ,955.296 , 961.128 ,998.119 ,1009.110 , 994.891 ,1000.170 ,982.763),
G= c(NA , 958.990 , NA , NA , 924.680 , 955.927 , NA , 949.384 ,973.348 , 984.392 , 943.894 , 961.468 , 995.368 , 994.997 , NA , 979.454 , 952.605 , NA , NA, NA , 956.507), stringsAsFactors = F)
I would like to discard from this dataset all patients who don't have visit A and B. I attach an example:
Thank you so much!!
Upvotes: 1
Views: 104
Reputation: 101189
You can try ave
+ subset
like below
subset(
df,
ave(Timepoint %in% c("A", "B"), PatientID, FUN = all)
)
which gives
PatientID Timepoint A B C D E G
1 0002 A NA 998.988 NA 975.634 1006.330 NA
2 0002 B 977.146 NA 987.140 987.140 1028.070 958.990
3 0005 A NA 998.680 961.810 961.810 NA NA
4 0005 B 964.315 NA 929.466 929.466 954.274 NA
5 0009 A NA NA 978.166 978.166 1005.910 924.680
6 0009 B 952.311 1020.560 1005.820 1005.820 949.969 955.927
7 0018 A NA 947.751 925.752 925.752 992.820 NA
8 0018 B 950.797 1029.560 969.469 969.469 977.048 949.384
9 0020 A 958.975 955.540 943.398 943.398 934.407 973.348
10 0027 A 960.712 911.606 936.034 NA 948.913 984.392
11 0039 A NA 964.039 965.292 965.292 NA 943.894
12 0039 B 947.465 NA 996.404 996.404 NA 961.468
13 0042 A 902.852 988.087 920.610 NA NA 995.368
14 0043 A NA 902.367 967.047 967.047 961.375 994.997
15 0043 B 985.124 959.338 986.565 986.565 955.296 NA
16 0045 A NA 1029.050 913.517 NA 961.128 979.454
17 0046 A 930.141 925.162 893.428 893.428 998.119 952.605
18 0046 B 1007.790 987.374 921.606 921.606 1009.110 NA
19 0048 A 948.848 1066.400 NA 976.192 994.891 NA
20 0048 B 1027.110 957.512 929.590 929.590 1000.170 NA
21 0055 A 999.414 917.597 950.493 950.493 982.763 956.507
Upvotes: 2
Reputation: 5788
Base R solution:
# Function to union all data.frames in a list:
# unionListOfDataFrames => Function
unionListOfDataFrames <- function(dfList){
# data.frame => GlobalEnv()
return(data.frame(
do.call(
rbind,
dfList
),
stringsAsFactors = FALSE,
row.names = NULL
)
)
}
# Store a vector of the timepoints each patient much have:
# timePoints => character vector
timePoints <- c("A", "B")
# Split-Apply-Combine: data.frame => stdout(console)
# Split data.frame into a list of data.frames on PatientId
with(df, split(df, PatientID)) |>
# Map a function over the list to check if each of the time
# points is present in a given patients' (x's) data.frame
# subset out observations (patients) that don't meet this condition
lapply(function(x){
x[all(timePoints %in% x$Timepoint)]
}
# Pipe into the combine (data.frame list row-bind) function:
) |>
unionListOfDataFrames()
OR:
# Store a vector of the timepoints each patient much have:
# timePoints => character vector
timePoints <- c("A", "B")
# Subset the data:
subset(
df,
as.logical(
# For each patient check if they are valid (meet condition):
ave(
Timepoint,
PatientID,
FUN = function(x){
all(timePoints %in% x)
}
)
)
)
Upvotes: 2
Reputation: 7645
If you want to filter for all patients that had exactly two visits, this can easily be done with dplyr. First you group by the patient id, and then you use the number of rows per patient as a filter condition.
library(dplyr)
df %>%
group_by(PatientID) %>%
filter(n() == 2)
A more rigorous way would be to check for each patient wether A and B occurs in the Timepoint column. This makes sure that patients who had two "A-Visits" are not included in the output.
df %>%
group_by(PatientID) %>%
filter("A" %in% Timepoint & "B" %in% Timepoint)
As mentioned by @IceCreamToucan in the comments, this syntax can become very tedious if a large number of visits is required. In this case it is more elegant to put the required visits in a vector and check the condition like so.
required_visits <- c("A", "B")
df %>%
group_by(PatientID) %>%
filter(all(required_visits %in% Timepoint))
Upvotes: 3