Reputation: 359
I am trying to create a stratified random sample of files based on two grouping variables (a sensor location and a date). Not all sensors have the same number of observations (batteries die on the last day).
The main thing I am working on is using dplyr to process my data, by getting a number of observations for each sensor-day combination and filtering out those that have fewer than I want for my eventual stratified sample.
This is the head of the data I am working with:
# Sample of the recordings table: one row per audio file.
# Bound to `foo` so that the pipelines below, which read `foo`, run as posted.
# The constant columns are written with rep() instead of repeated literals;
# the resulting object is the same as the original dput() output.
foo <- structure(
  list(
    fullPath = c(
      "S4A00440_20180508_123353.flac",
      "S4A00440_20180508_123353.wav", "S4A00440_20180508_130000.flac",
      "S4A00440_20180508_133000.flac", "S4A00440_20180508_140000.flac",
      "S4A00440_20180508_143000.flac", "S4A00440_20180508_150000.flac",
      "S4A00440_20180508_153000.flac", "S4A00440_20180508_160000.flac",
      "S4A00440_20180508_163000.flac", "S4A00440_20180508_170000.flac",
      "S4A00440_20180508_173000.flac", "S4A00440_20180508_180000.flac",
      "S4A00440_20180508_183000.flac", "S4A00440_20180508_190000.flac",
      "S4A00440_20180508_193000.flac", "S4A00440_20180508_200000.flac",
      "S4A00440_20180508_203000.flac", "S4A00440_20180508_210000.flac",
      "S4A00440_20180508_213000.flac", "S4A00440_20180508_220000.flac",
      "S4A00440_20180508_223000.flac", "S4A00440_20180508_230000.flac",
      "S4A00440_20180508_233000.flac", "S4A00466_20180508_130000.flac",
      "S4A00466_20180508_130000.wav", "S4A00466_20180508_133000.flac",
      "S4A00466_20180508_140000.flac", "S4A00466_20180508_143000.flac",
      "S4A00466_20180508_150000.flac", "S4A00466_20180508_153000.flac",
      "S4A00466_20180508_160000.flac", "S4A00466_20180508_163000.flac",
      "S4A00466_20180508_170000.flac", "S4A00466_20180508_173000.flac",
      "S4A00466_20180508_180000.flac", "S4A00466_20180508_183000.flac",
      "S4A00466_20180508_190000.flac", "S4A00466_20180508_193000.flac",
      "S4A00466_20180508_200000.flac", "S4A00466_20180508_203000.flac",
      "S4A00466_20180508_210000.flac", "S4A00466_20180508_213000.flac",
      "S4A00466_20180508_220000.flac", "S4A00466_20180508_223000.flac",
      "S4A00466_20180508_230000.flac", "S4A00466_20180508_233000.flac"
    ),
    # 24 files from the first sensor, 23 from the second.
    sensorName = rep(c("S4A00440", "S4A00466"), times = c(24L, 23L)),
    # Every row carries the same day: 2018-05-08 00:00:00 UTC.
    Date = structure(
      rep(1525737600, 47L),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    )
  ),
  # Original row positions in the full table, kept as in the dput() output.
  row.names = c(1114:1137, 1395:1417),
  class = "data.frame"
)
How I tried to do the sampling
# Attempted stratified sample: count the recordings for each sensor/day
# combination, keep the combinations with more than 12 recordings, then
# try to draw 12 rows.
foo_strat <- foo %>%
  select(fullPath, sensorName, Date) %>%
  group_by(sensorName, Date) %>%
  # n() is the number of rows in the current group.
  summarise(num_recs = n()) %>%
  dplyr::filter(num_recs > 12) %>%
  sample_n(12)
However, I'm getting this error:
Error: 'size' must be less or equal than 7 (size of data), set replace = TRUE to use sampling with replacement
I checked this issue which suggests that what I'm trying to do may just not be the behavior of the function, in which case I'm not sure what would be the best way to proceed
Upvotes: 0
Views: 1890
Reputation: 887691
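summarise() collapses each group to a single row (and drops the last grouping level), so only one summary row per sensor/date combination is left and there are not enough rows for sample_n() to draw 12 from. Instead, we can apply filter() directly to the grouped data with n() (which gives the number of rows in the current group), so the original rows are kept for sampling: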
library(dplyr)

# Number of recordings to draw from every sensor/day group.
n_per_group <- 12

# Stratified sample: keep the original rows (no summarise()), drop the
# sensor/day groups that are too small, then sample within each group.
foo %>%
  select(fullPath, sensorName, Date) %>%
  group_by(sensorName, Date) %>%
  # n() is the row count of the current group. Use `>=` rather than `>` so a
  # group with exactly `n_per_group` recordings is kept: it can still be
  # sampled without replacement.
  filter(n() >= n_per_group) %>%
  # On a grouped data frame, sample_n() draws within each group.
  sample_n(n_per_group)
# A tibble: 24 x 3
# Groups: sensorName, Date [2]
# fullPath sensorName Date
# <chr> <chr> <dttm>
# 1 S4A00440_20180508_193000.flac S4A00440 2018-05-08 00:00:00
# 2 S4A00440_20180508_160000.flac S4A00440 2018-05-08 00:00:00
# 3 S4A00440_20180508_183000.flac S4A00440 2018-05-08 00:00:00
# 4 S4A00440_20180508_190000.flac S4A00440 2018-05-08 00:00:00
# 5 S4A00440_20180508_200000.flac S4A00440 2018-05-08 00:00:00
# 6 S4A00440_20180508_150000.flac S4A00440 2018-05-08 00:00:00
# 7 S4A00440_20180508_230000.flac S4A00440 2018-05-08 00:00:00
# 8 S4A00440_20180508_223000.flac S4A00440 2018-05-08 00:00:00
# 9 S4A00440_20180508_163000.flac S4A00440 2018-05-08 00:00:00
#10 S4A00440_20180508_133000.flac S4A00440 2018-05-08 00:00:00
# … with 14 more rows
Upvotes: 0