Reputation: 61
I'm new to resampling techniques and bootstrapping in R, so any help will be greatly appreciated. I have a data frame that consists of different individuals, some of which have been measured more than once. I would like to randomly sample a SINGLE measurement from each individual in the population, and then calculate the population mean and standard deviation. I would like to repeat this procedure many times (500 or so) and obtain two new data frames, one with all the population means and another with all the population standard deviations for each variable (Dim.1 through Dim.4). From these, I can then extract the global mean and standard deviation for my downstream analysis. Here is what the data frame looks like ("ID" is each individual's unique number, and you can see that the number of repeated measures varies between individuals).
ID Dim.1 Dim.2 Dim.3 Dim.4
41 0.4001945 1.15899378 0.269197195 0.184791153
14 2.1615710 1.15712356 -0.096055808 0.450943821
63 0.4325496 0.75521068 0.085588532 -0.233144806
53 1.2459718 0.97450610 -0.069171367 -0.613423267
63 1.3380629 0.22606572 -0.061178395 -0.304960508
42 1.6048214 0.94184036 0.232863647 -0.201738198
57 1.3306709 0.80440736 -0.955949551 -0.734022636
53 0.7019118 0.87285991 -0.042557052 -0.146748989
51 0.7235493 0.29946448 0.474477629 0.305810371
53 1.2431220 1.20252749 -0.073627812 0.237740020
41 1.1788653 0.55536570 -0.017354302 0.119014260
14 2.5769809 0.18551630 0.634304132 0.617288243
67 1.0445458 1.47107481 0.024383348 0.111808376
31 0.9759513 1.31091796 -0.008660192 0.189962355
63 1.8621687 0.97137412 0.317014897 -0.390871248
76 0.5905190 1.49817641 -0.374503265 0.142478388
90 2.4323563 0.87696545 0.467220123 0.513197279
67 2.2378032 0.35682721 0.400233674 -0.926848226
41 1.7098808 0.40470067 0.050950910 -0.153059068
97 1.5351169 1.11597681 0.011878347 -0.092047152
63 1.2647155 0.80006707 0.730022680 -0.089726522
57 1.7200676 0.01358165 0.450075592 0.038352174
76 0.6949196 1.36741272 -1.286488394 0.477345585
123 2.4235534 1.69165605 0.528863655 0.447856674
76 -2.4022432 -0.27531557 -1.850999153 2.194893741
117 1.6955740 -1.86088122 1.502655438 0.856026945
117 0.7130716 1.44198379 -1.495098987 -1.021981479
131 0.8425548 1.22970621 -0.160634720 0.005202717
117 1.0913048 1.19834030 -0.240309947 0.279379075
90 2.5787954 0.21638781 0.973339314 0.853752379
105 1.4989440 1.31525062 0.233114414 0.082557111
45 0.4749492 0.36264159 0.016554066 0.434416650
14 1.9841503 -0.18133091 -0.517021686 0.131796394
Here is the `dput` version (note that the ID column is named "AnID" in this output):
structure(list(AnID = structure(c(3L, 1L, 9L, 7L, 9L, 4L, 8L,
7L, 6L, 7L, 3L, 1L, 10L, 2L, 9L, 11L, 12L, 10L, 3L, 13L, 9L,
8L, 11L, 16L, 11L, 15L, 15L, 17L, 15L, 12L, 14L, 5L, 1L), .Label = c("14",
"31", "41", "42", "45", "51", "53", "57", "63", "67", "76", "90",
"97", "105", "117", "123", "131"), class = "factor"), Dim.1 = c(0.400194544195721,
2.16157096683054, 0.432549610256816, 1.24597182598991, 1.33806287869605,
1.60482137307563, 1.33067093524332, 0.701911835019105, 0.723549265733465,
1.24312199041168, 1.17886527411877, 2.57698094739979, 1.04454579781695,
0.975951278566957, 1.86216869726173, 0.590519015534528, 2.43235630542313,
2.23780317751189, 1.70988079418724, 1.53511692947232, 1.26471553939687,
1.72006761902848, 0.694919562457936, 2.42355344632234, -2.40224317003857,
1.69557401848893, 0.713071563313831, 0.84255475961074, 1.09130484807346,
2.57879543707134, 1.49894397171646, 0.474949215360165, 1.9841503256016
), Dim.2 = c(1.15899377720071, 1.15712355628702, 0.755210676050028,
0.974506103663373, 0.226065715930444, 0.941840360304357, 0.804407356238532,
0.872859912826886, 0.299464475124326, 1.2025274866889, 0.55536570304097,
0.185516296049789, 1.47107481283135, 1.31091795925695, 0.971374119614307,
1.49817640676682, 0.876965451353274, 0.356827207847936, 0.404700668672103,
1.11597680662439, 0.800067070614603, 0.0135816493815426, 1.36741271705742,
1.69165605426992, -0.275315573666507, -1.86088122056554, 1.44198379044125,
1.229706212058, 1.19834030462339, 0.216387812905091, 1.31525061699366,
0.362641590025834, -0.181330912913297), Dim.3 = c(0.269197195180612,
-0.0960558078596061, 0.0855885321454752, -0.0691713671666404,
-0.0611783947257435, 0.232863646917399, -0.955949551451659, -0.0425570523689114,
0.474477629049467, -0.0736278121798866, -0.0173543018324465,
0.634304131880689, 0.0243833483864922, -0.00866019164798527,
0.317014896588811, -0.374503264871839, 0.467220123029729, 0.400233673552903,
0.0509509097106227, 0.0118783465387495, 0.730022679967163, 0.450075591988245,
-1.28648839432794, 0.528863655457902, -1.85099915345691, 1.50265543792412,
-1.49509898726221, -0.160634720376254, -0.24030994662375, 0.973339313851613,
0.233114414466102, 0.0165540663395682, -0.517021685999838), Dim.4 = c(0.184791153018369,
0.45094382124022, -0.233144806193005, -0.613423266807646, -0.304960507895512,
-0.201738198311526, -0.734022636110577, -0.146748988783387, 0.305810371055691,
0.237740020179384, 0.11901425952943, 0.61728824337695, 0.111808376374363,
0.189962354663836, -0.390871248426407, 0.14247838773032, 0.513197279323348,
-0.926848226311571, -0.153059067639092, -0.0920471522899872,
-0.0897265219239891, 0.0383521738356584, 0.477345585143069, 0.447856673901548,
2.19489374105159, 0.856026944966164, -1.02198147948597, 0.00520271670521917,
0.279379074573862, 0.853752378937349, 0.0825571109781094, 0.434416649778733,
0.131796393683415)), .Names = c("AnID", "Dim.1", "Dim.2", "Dim.3",
"Dim.4"), class = "data.frame", row.names = c("20", "26", "36",
"46", "49", "52", "75", "93", "94", "110", "118", "124", "132",
"143", "157", "168", "185", "199", "210", "211", "215", "225",
"240", "245", "248", "250", "254", "270", "272", "281", "297",
"322", "337"))
Upvotes: 0
Views: 282
Reputation: 4907
Here you go:
# Draw one row uniformly at random from a table.
#
# df: a data.frame or data.table (boot_dat() passes the per-ID subset `.SD`).
#
# Returns a one-row subset of `df`, or `df` itself when it has no rows.
boot_id <- function(df) {
  n_rows <- nrow(df)
  # Nothing to draw from: hand the empty table back. The previous
  # `1:nrow(df)` evaluated to c(1, 0) here and randomly produced either an
  # all-NA row or an empty result.
  if (n_rows == 0L) {
    return(df)
  }
  # sample.int() always draws from 1..n_rows, including when n_rows is 1.
  df[sample.int(n_rows, size = 1L), ]
}
# Repeatedly sample one measurement per individual and summarise each draw.
#
# df: a data.table with an "ID" column identifying the individual; every
#     other column is treated as a variable to summarise (assumed numeric --
#     confirm against your data).
# n:  number of resampling iterations (default 500); 0 gives a 0-row matrix.
# f:  summary computed across individuals in each iteration, "mean" or "sd".
#
# Returns a numeric matrix with `n` rows (one per iteration) and one column
# per non-ID variable, with columns named after those variables.
boot_dat <- function(df, n = 500, f = c("mean", "sd")) {
  f <- match.arg(f)
  # `.SD` and `by=` below only work on a data.table; fail early and clearly.
  stopifnot(
    inherits(df, "data.table"),
    "ID" %in% names(df),
    is.numeric(n), length(n) == 1L, !is.na(n), n >= 0
  )

  value_cols <- setdiff(names(df), "ID")
  # Pick the per-column summary once instead of branching in every iteration.
  summarise_cols <- if (f == "mean") {
    colMeans
  } else {
    function(x) vapply(x, stats::sd, numeric(1))
  }

  # Preallocate as numeric and keep the variable names on the columns.
  res <- matrix(
    NA_real_,
    nrow = n,
    ncol = length(value_cols),
    dimnames = list(NULL, value_cols)
  )
  # seq_len() yields an empty loop for n = 0 (1:0 would iterate over 1, 0).
  for (i in seq_len(n)) {
    # One randomly chosen row per individual.
    df2 <- df[, boot_id(.SD), by = "ID"]
    # Drop the grouping column so only the measured variables remain.
    df2$ID <- NULL
    res[i, ] <- summarise_cols(df2)
  }
  res
}
library(data.table)

# Start from the data frame given in the question:
# dt <- <your structure>
dt <- data.table(dt)
# Short column names; boot_dat() groups on the column called "ID".
setnames(dt, c("ID", "d1", "d2", "d3", "d4"))
setkey(dt, ID)

# One matrix of per-iteration means and one of per-iteration standard
# deviations, each using boot_dat()'s default number of iterations.
dat_means <- boot_dat(df = dt, f = "mean")
dat_sds <- boot_dat(df = dt, f = "sd")
Upvotes: 1