Sampling without replacement - dplyr

Question

I am trying to sample rows from data exported from Excel without replacement, but I keep getting duplicates. Changing the `replace` argument (I also tried `replace = TRUE`) gives the same issue. Sorry, I am new to R. Code below:

library(dplyr)
library(purrr)
library(tidyr)

# Load the raw records; string columns are converted to factors on read.
DuT <- read.csv(file = "APATA.csv", stringsAsFactors = TRUE)

# Count the rows (transformers, per the original note) under each DSS.NAME;
# the result has one row per name with the count in `no_rows`.
Df1 <- dplyr::summarise(
  group_by(DuT, DSS.NAME),
  no_rows = dplyr::n()
)
str(Df1)

# Create the SampleSize column: translate each group's row count into a
# sample size using fixed, inclusive bands. Counts that match no band
# (below 1 or above 1500) get 0.
Df1$SampleSize <- with(Df1, dplyr::case_when(
  # ifelse() propagates NA, so keep a missing count missing.
  is.na(no_rows) ~ NA_real_,
  no_rows >= 1 & no_rows <= 10 ~ 10,
  no_rows >= 11 & no_rows <= 49 ~ 12,
  no_rows >= 50 & no_rows <= 99 ~ 17,
  no_rows >= 100 & no_rows <= 199 ~ 24,
  no_rows >= 200 & no_rows <= 299 ~ 27,
  no_rows >= 300 & no_rows <= 499 ~ 32,
  # NOTE(review): same size as the 300-499 band -- confirm this is intended.
  no_rows >= 500 & no_rows <= 799 ~ 32,
  no_rows >= 800 & no_rows <= 999 ~ 44,
  no_rows >= 1000 & no_rows <= 1299 ~ 49,
  no_rows >= 1300 & no_rows <= 1500 ~ 57,
  TRUE ~ 0
))

# Total number of rows requested across all groups.
sum(Df1$SampleSize)

# Sample based on name and SampleSize column:
# nest the rows of every DSS.NAME, draw `n` rows from each nested table
# without replacement, then flatten the draws back into one data frame.
Df2 <- DuT %>%
  group_by(DSS.NAME) %>%
  arrange(DSS.NAME) %>%
  tidyr::nest() %>%
  ungroup() %>%
  # Attach each group's sample size by DSS.NAME instead of by row position,
  # so the sizes cannot be paired with the wrong group if the row order of
  # the nested table and Df1 ever differ.
  left_join(select(Df1, DSS.NAME, n = SampleSize), by = "DSS.NAME") %>%
  # NOTE(review): with replace = FALSE, sample_n() stops with an error when
  # `n` is larger than the number of rows in a group; the 1-10 band assigns
  # a size of 10, so any group with fewer than 10 rows triggers this.
  mutate(samp = purrr::map2(data, n, sample_n, replace = FALSE)) %>%
  select(-data, -n) %>%
  tidyr::unnest(samp)

# Write the sampled rows and the per-name sample-size table to disk.
write.csv(Df2, "APATA_SAMPLED.csv", row.names = FALSE)
write.csv(Df1, "APATA_SAMPLING SIZE.csv", row.names = FALSE)

dput(The file I want to sample)

dput(Expected output, with Town 1 — which has fewer rows than its sample size of 12 — showing all of its rows)

Niz · Accepted Answer

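You can split the data so that sampling without replacement still works when the sample size is greater than the population size: groups whose sample size is at least their number of rows keep all of their rows, and the remaining groups are sampled as usual. Like this: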

#Sample based on  name and sampleSize column

    # Nest the rows of each DSS.NAME and record, per group, the requested
    # sample size (SS) and the number of rows available (nofRows).
    Df2 <- DuT %>%
      group_by(DSS.NAME) %>%
      arrange(DSS.NAME) %>%
      tidyr::nest() %>%
      ungroup() %>%
      # Join the sample size on DSS.NAME rather than assigning
      # Df1$SampleSize by position, so each size stays with its own group
      # even if the two tables are ordered differently.
      left_join(select(Df1, DSS.NAME, SS = SampleSize), by = "DSS.NAME") %>%
      # number of rows of each nested table
      mutate(nofRows = map_dbl(data, nrow))

    # Split the groups: SS >= popsize (take everything) vs SS < popsize.
    Df2_i  <- Df2[Df2$SS >= Df2$nofRows, ]
    Df2_ii <- Df2[Df2$SS < Df2$nofRows, ]

    # SS >= popsize: draw nofRows rows without replacement, i.e. every row
    # of the group exactly once (in shuffled order).
    Df3_i <- Df2_i %>%
      mutate(samp = purrr::map2(data, nofRows, sample_n, replace = FALSE)) %>%
      select(-data) %>%
      tidyr::unnest(samp)

    # SS < popsize: draw SS rows without replacement.
    Df3_ii <- Df2_ii %>%
      mutate(samp = purrr::map2(data, SS, sample_n, replace = FALSE)) %>%
      select(-data) %>%
      tidyr::unnest(samp)

    # Join the two halves; the helper columns SS and nofRows are kept.
    Df <- rbind(Df3_i, Df3_ii)

Note: `SS` is the sample size and `nofRows` is the number of rows (population size) of each nested table. I hope this helps.

Sampling without replacement - dplyr

Answers (1)

Related Questions