Reputation: 1067
I need to write a function involving subsetting a df by a variable n
bins. Like, if n
is 2, then subsample the df some number of times in two bins (from the first half, then from the second half). If n
is 3, subsample in 3 bins (first 1/3, second 1/3, third 1/3). I've been doing this for different lengths of n manually so far, and I know there must be a better way to do it. I want to write it into a function with n
as an input, but I can't make it work so far. Code below.
# create df
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
# real df has some NAs, so we'll add some here
df[c(20,32),2] <- NA
this df is 46 years of sampling. I want to pretend instead of 46 samples, I only took 2, but at one random year in the first half (1:23), and one random year in the second half (24:46).
# to subset in 2 groups, say, 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/2),200,replace = T), # first sample in first half of vector
secondsample = sample((nrow(df)/2):nrow(df),200, replace = T) )# second sample in second half of vector
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
group = i
))
print(i)
}
(If we can make it skip samples on "NA" sample years, that would be extra good).
So, if I wanted to do this for three points instead of two, I'd repeat the process like this:
# to subset in 3 groups 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/3),200,replace = T), # first sample in first 1/3
secondsample = sample(round(nrow(df)/3):round(nrow(df)*(2/3)),200, replace = T), # second sample in second 1/3
thirdsample = sample(round(nrow(df)*(2/3)):nrow(df), 200, replace=T) # third sample in last 1/3
)
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
third = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
third = samplelist[i,3],
group = i
))
print(i)
}
but, I want to do this many times, sampling up to ~20 times (so in 20 bins), so this manual method is not sustainable. Can you help me write a function to say "pick one sample from n bins x times"?
btw, this is the plot I am making with the complete df:
plot_df %>%
ggplot(aes(x=year,y=sample)) +
geom_point(color="grey40") +
stat_smooth(geom="line",
method = "lm",
alpha=.3,
aes(color=group,
group=group),
se=F,
show.legend = F) +
geom_line(color="grey40") +
geom_smooth(data = plot_df %>% filter(group %in% c("full")),
method = "lm",
alpha=.7,
color="black",
size=2,
#se=F,
# fill="grey40
show.legend = F
) +
theme_classic()
Upvotes: 2
Views: 160
Reputation: 1382
Here's a function using loops, closer to what you started doing:
df <- data.frame(year = c(1:46),
sample = seq(from=10, to=30, length.out = 46) +
rnorm(46,mean=0,sd=2))
df[c(20,32), 2] <- NA
my_function <- function(n, sample_size, data = df) {
plot_df <- data %>% mutate(group = 'full')
sample_matrix <- matrix(data = NA, nrow = sample_size, ncol = n)
first_row <- 1 # First subset has 1 as first row, no matter how many subsets
for (i in 1:n) {
last_row <- round(first_row + nrow(df)/n - 1) # Determine last row of i-th subset
sample_matrix[, i] <- sample(first_row:last_row, sample_size, replace = T) # Store sample directly in matrix
first_row <- i + last_row # Determine first row for next i
group_name <- paste("group", i, sep = "_") # Column name for i-th group
plot_df[[group_name]] <- "all" # Column for i-th group
}
for (j in 1:sample_size) {
# Creating a new data frame for new observations
new_obs <- df[sample_matrix[j,], ]
new_obs[["group"]] <- j
for (group_n in 1:n) {
new_obs[[paste0("group_", group_n)]] <- sample_matrix[j, group_n]
}
plot_df <- rbind(plot_df, new_obs)
plot_df <<- plot_df
}
}
my_function(2, 200, data = df)
Upvotes: 1
Reputation: 124148
If I got you right, the following function splits your df in n bins, draws x samples from each and puts the results back into cols of a df:
library(tidyverse)
set.seed(42)
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
get_df_sample <- function(df, n, x) {
df %>%
# bin df in n bins of (approx.) equal length
mutate(bin = ggplot2::cut_number(seq_len(nrow(.)), n, labels = seq_len(n))) %>%
# split by bin
split(.$bin) %>%
# sample x times from each bin
map(~ .x[sample(seq_len(nrow(.x)), x, replace = TRUE),]) %>%
# keep only column "sample"
map(~ select(.x, sample)) %>%
# Rename: Add number of df-bin from which sample is drawn
imap(~ rename(.x, !!sym(paste0("sample_", .y)) := sample)) %>%
# bind
bind_cols() %>%
# Add group = rownames
rownames_to_column(var = "group")
}
get_df_sample(df, 3, 200) %>%
head()
#> sample_1 sample_2 sample_3 group
#> 1 12.58631 18.27561 24.74263 1
#> 2 19.46218 24.24423 23.44881 2
#> 3 12.92179 18.47367 27.40558 3
#> 4 15.22020 18.47367 26.29243 4
#> 5 12.58631 24.24423 24.43108 5
#> 6 19.46218 23.36464 27.40558 6
Created on 2020-03-24 by the reprex package (v0.3.0)
Upvotes: 1