Reputation: 169
I've tried a number of different methods for this including this stack but nothing is working quite correctly.
My dataframe "SiteVisit" (a small subset dput is at bottom) is made up of columns Date (class = Date), TagID (class = numeric), SiteVisits (a list of character vectors), and NumSites (class = numeric). Each row lists all sites where an individual organism (TagID) is found for each Date.
I'd like to assign whether a tag spent the entire day "inside", "outside", or "transiting" based on the sites it visited. It can only be "inside" if it never visits an outside site, and it can only be "outside" if it never visits an inside site.
First, I'd like to determine whether ALL the sites for a TagID for a Date are included in this list:
inside <- list(c("Release","IC1", "IC2", "IC3","RGD1"))
If TRUE SiteVisit$Location = "INSIDE"
ELSE test whether ALL sites for a TagID for a Date are contained within this list:
outside <- list(c("ORS1","WC1","WC2","WC3","RGU1","ORN1","ORN2","ORS3","GL1","CVP1","CLRS"))
If TRUE SiteVisit$Location = "OUTSIDE"
ELSE SiteVisit$Location = "TRANSITING"
I've tried a number of different dplyr and base versions to accomplish this, but none seem to get it right. I think it's because I'm not correctly checking through each element of SiteVisit$SiteVisits.
My current attempts are:
# NOTE(review): all() collapses its argument to a single TRUE/FALSE, so the
# ifelse() calls below produce one label, which mutate() then recycles to
# every row (of each group, if the data is grouped) instead of computing one
# label per row.
SiteVisit <- SiteVisit %>%
mutate(Location = ifelse(all(SiteVisits[[]] %in% inside), "INSIDE",
ifelse(all(SiteVisits[[]] %in% outside),"OUTSIDE","TRANSITING")))
which yields all "INSIDE"
and
# NOTE(review): SiteVisits[] is the whole list column, and all() reduces the
# comparison to a single TRUE/FALSE, so the ifelse() calls below yield one
# label that mutate() recycles to every row (of each group, if the data is
# grouped) rather than one label per row.
SiteVisit <- SiteVisit %>%
mutate(Location = ifelse(all(SiteVisits[] %in% inside), "INSIDE",
ifelse(all(SiteVisits[] %in% outside),"OUTSIDE","TRANSITING")))
which yields all "TRANSITING"
also, attempting to do this in a for loop doesn't quite work
# NOTE(review): the assignment targets the whole Inside column, not row i, so
# every iteration overwrites all rows with a single TRUE/FALSE and only the
# result for the last row survives the loop.
for (i in 1: nrow(SiteVisit)) {SiteVisit$Inside <-
all(SiteVisit$SiteVisits[[i]] %in% inside)}
yields all FALSE while
all(SiteVisit$SiteVisits[[2]] %in% inside)
is TRUE
Here's a small subset of my dataframe "SiteVisit" dput:
structure(list(Date = structure(c(15828, 15828, 15847, 15847,
15847, 15847, 15847, 15847, 15848, 15848, 15848, 15848, 15848,
15848, 15848, 15848, 15849, 15849, 15849, 15849, 15849, 15849,
15849, 15850, 15850, 15850, 15850, 15850, 15850, 15850, 15851,
15851, 15851, 15851, 15851, 15851, 15851, 15851, 15852, 15852,
15852, 15852, 15852, 15852, 15852, 15853, 15853, 15853, 15853,
15853, 15853, 15853, 15853, 15853, 15854, 15854, 15854, 15854,
15854, 15854, 15854, 15854, 15855, 15855, 15855, 15855, 15855,
15855, 15855, 15855, 15855, 15855, 15855, 15855, 15855, 15855,
15856, 15856, 15856, 15856, 15856, 15856, 15856, 15856, 15856,
15856, 15856, 15856, 15856, 15857, 15857, 15857, 15857, 15857,
15857, 15857, 15857, 15857, 15857, 15857), class = "Date"), TagID = c(5717.06,
6277.06, 5073.06, 5717.06, 11121.1, 11191.1, 11387.1, 11415.1,
5717.06, 6277.06, 11121.1, 11191.1, 11219.1, 11289.1, 11387.1,
11415.1, 5717.06, 11121.1, 11191.1, 11219.1, 11289.1, 11387.1,
11415.1, 5717.06, 11121.1, 11191.1, 11219.1, 11289.1, 11387.1,
11415.1, 5717.06, 11121.1, 11191.1, 11219.1, 11289.1, 11317.1,
11387.1, 11415.1, 5717.06, 6277.06, 11191.1, 11219.1, 11289.1,
11387.1, 11415.1, 5717.06, 6277.06, 9015.01, 9833.06, 11191.1,
11219.1, 11289.1, 11387.1, 11415.1, 5717.06, 6277.06, 9015.01,
11191.1, 11219.1, 11289.1, 11387.1, 11415.1, 5641.22, 5717.06,
6221.06, 6277.06, 7909.22, 9015.01, 9833.06, 11121.1, 11191.1,
11219.1, 11289.1, 11317.1, 11387.1, 11415.1, 5717.06, 6277.06,
6529.06, 8119.01, 8545.06, 9015.01, 9497.06, 9833.06, 11191.1,
11219.1, 11289.1, 11387.1, 11415.1, 5717.06, 6277.06, 6529.06,
9015.01, 9497.06, 9833.06, 11191.1, 11219.1, 11289.1, 11387.1,
11415.1), SiteVisits = list("Release", "Release", c("IC2", "IC1",
"Release"), "IC3", "WC2", "RGD1", c("WC1", "WC3"), "WC3", "IC3",
"IC3", "WC2", "RGD1", "IC2", "IC1", "WC1", "WC3", "IC3",
"WC2", "RGD1", c("IC2", "IC1"), "IC1", "WC1", "WC3", "IC3",
"WC2", "RGD1", "IC2", "IC1", "WC1", "WC3", "IC3", "WC2",
"RGD1", "IC2", "IC1", "WC1", "WC1", "WC3", "IC3", "IC3",
"RGD1", "IC2", "IC1", "WC1", "WC3", "IC3", "IC3", c("IC3",
"Release"), c("IC3", "IC2", "IC1", "Release"), "RGD1", "IC2",
"IC1", "WC1", "WC3", "IC3", "IC3", c("IC3", "IC2"), "RGD1",
"IC2", "IC1", "WC1", "WC3", "Release", "IC3", "Release",
"IC3", c("RGD1", "Release"), c("IC3", "IC2"), c("IC3", "IC1"
), "WC2", "RGD1", "IC2", "IC1", "WC1", "WC1", "WC3", "IC3",
"IC3", c("RGD1", "Release"), c("RGD1", "Release"), "Release",
c("IC3", "IC2", "IC1"), "Release", c("IC3", "IC2", "IC1",
"RGD1"), "RGD1", "IC2", "IC1", "WC1", "WC3", "IC3", "IC3",
"RGD1", c("IC3", "IC2", "IC1"), "RGD1", c("IC3", "IC1", "RGD1"
), "RGD1", "IC2", c("IC2", "IC1"), "WC1", "WC3"), NumSites = c(1L,
1L, 3L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L,
3L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L,
2L, 1L, 1L)), row.names = c(NA, -100L), groups = structure(list(
Date = structure(c(15828, 15847, 15848, 15849, 15850, 15851,
15852, 15853, 15854, 15855, 15856, 15857), class = "Date"),
.rows = list(1:2, 3:8, 9:16, 17:23, 24:30, 31:38, 39:45,
46:54, 55:62, 63:76, 77:89, 90:100)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Upvotes: 2
Views: 183
Reputation: 128
Want an answer that's about 1/100th as fast? (Not a typo: this is way worse than manotheshark's answer, but it works on your data structured as it was.) Edit: an earlier version of this answer said 1/10th, which was a typo; 1/100th is the correct figure.
# Label each row of SiteVisit as "INSIDE", "OUTSIDE" or "TRANSITING" according
# to the sites listed in that row's SiteVisits entry. Results are written to
# the Location column of SiteVisit_test, a copy of SiteVisit.

# Flatten the site sets once, outside the loop. unlist() makes this work
# whether `inside`/`outside` are stored as lists (as in the question) or as
# plain character vectors.
inside_sites <- unlist(inside)
outside_sites <- unlist(outside)

# Work on a copy and preallocate the result column so it is not created
# implicitly by the first indexed assignment.
SiteVisit_test <- SiteVisit
SiteVisit_test$Location <- rep(NA_character_, nrow(SiteVisit))

# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(SiteVisit))) {
  # Sites visited in row i, extracted once and reused by both tests.
  sites <- unlist(SiteVisit[i, ]$SiteVisits)
  SiteVisit_test$Location[i] <- if (all(sites %in% inside_sites)) {
    "INSIDE"
  } else if (all(sites %in% outside_sites)) {
    "OUTSIDE"
  } else {
    # Mixed inside/outside visits; label as requested in the question.
    "TRANSITING"
  }
}
Benchmarks for the 2 approaches:
# Time the row-by-row for loop against the lapply() approach on the same data.
microbenchmark(
# for_statement: explicit loop over the rows of SiteVisit, writing one label
# into SiteVisit_test$Location per iteration.
for_statement = for (i in 1:nrow(SiteVisit)) {
SiteVisit_test$Location[i] <- if (all(unlist(SiteVisit[i, ]$SiteVisits) %in% unlist(inside))) {
"INSIDE"
} else if (all(unlist(SiteVisit[i, ]$SiteVisits) %in% unlist(outside))) {
"OUTSIDE"
} else {"TRANSITIONING"}
},
# lapply_statemnt: a single lapply() pass over the SiteVisits list column.
# NOTE(review): inside2/outside2 are not defined in this snippet -- presumably
# the character-vector versions of inside/outside; TODO confirm.
lapply_statemnt = lapply(SiteVisit$SiteVisits, function(x) ifelse(all(x %in% inside2), "INSIDE", ifelse(all(x %in% outside2), "OUTSIDE", "TRANSIT")))
)
Unit: microseconds
expr min lq mean median uq max neval
for_statement 28874.4 30082.0 32411.968 31008.3 33108.90 48878.1 100
lapply_statemnt 268.4 284.2 346.201 295.5 310.85 4114.9 100
I don't really get why the lapply approach is so much faster here... probably because I'm unlisting for every i in the loop.
Upvotes: 0
Reputation: 4357
The following works once `inside` and `outside` are stored as plain character vectors rather than lists:
# Site sets stored as plain character vectors, so that `%in%` compares each
# visited site against the individual site names (a list() wrapper would hold
# the whole vector as a single element instead).
inside <- c("Release", "IC1", "IC2", "IC3", "RGD1")
outside <- c(
  "ORS1", "WC1", "WC2", "WC3", "RGU1", "ORN1",
  "ORN2", "ORS3", "GL1", "CVP1", "CLRS"
)

# Classify each row from its vector of visited sites: all sites inside ->
# "INSIDE", all sites outside -> "OUTSIDE", anything else -> "TRANSITING".
# vapply() with character(1) returns a plain character vector, so Location is
# an ordinary character column rather than the list column lapply() would
# create; plain if/else is used because each condition is a single TRUE/FALSE.
df1$Location <- vapply(
  df1$SiteVisits,
  function(sites) {
    if (all(sites %in% inside)) {
      "INSIDE"
    } else if (all(sites %in% outside)) {
      "OUTSIDE"
    } else {
      "TRANSITING"
    }
  },
  character(1),
  USE.NAMES = FALSE
)
Upvotes: 2