Reputation: 369
I have sample data:
testSample <- data.table(a = rnorm(n = 20, mean = 2, sd = 1),
b = sample(c(0,1), replace=TRUE, size=20))
testSample
a b
1: 3.1731458 0
2: 1.0687438 1
3: 2.9078655 1
4: 1.5675078 0
5: 2.7825992 0
6: 1.3672285 1
7: 3.6178619 0
8: 2.9067640 1
9: 2.5021129 0
10: 2.7672849 1
11: 2.3501007 1
12: -0.2923344 0
13: 0.3920071 1
14: 2.5113855 0
15: 2.2192234 1
16: 0.5913632 0
17: 0.8864734 1
18: 1.9187394 0
19: 1.1238824 1
20: 1.5001240 1
In column 'b' there are runs of alternating 0
and 1
. Along each consecutive run of 1
, I want a new column 'c' to be filled with the number from the column "a" at the index of the first 1
in each run.
When 'b' is 0
, 'c' should be NA
Desired output where I filled in the new 'c' column manually:
a b c
1: 3.1731458 0 NA
2: 1.0687438 1 1.0687438 # <- run of 1.
3: 2.9078655 1 1.0687438 # <- All rows filled with the first 'a' value in the run
4: 1.5675078 0 NA
5: 2.7825992 0 NA
6: 1.3672285 1 1.3672285 # <-
7: 3.6178619 0 NA
8: 2.9067640 1 2.9067640 # <-
9: 2.5021129 0 NA
10: 2.7672849 1 2.7672849 # <- run of 1
11: 2.3501007 1 2.7672849 # <- All rows filled with the first 'a' value in the run
12: -0.2923344 0 NA
13: 0.3920071 1 0.3920071
14: 2.5113855 0 NA
15: 2.2192234 1 2.2192234
16: 0.5913632 0 NA
17: 0.8864734 1 0.8864734
18: 1.9187394 0 NA
19: 1.1238824 1 1.1238824
20: 1.5001240 1 1.1238824
Upvotes: 3
Views: 85
Reputation: 25225
Here is another option:
mtd2 = DT2[, cc := {
ri <- rowid(rleid(b))
bool <- ri>1L
v <- replace(a, bool, NA_real_)
v <- nafill(v, "locf")
replace(v, b==0L, NA_real_)
}]
timing code:
microbenchmark::microbenchmark(times=3L,
mtd0 = DT0[, grouper := rleid(b)][b == 1L, cc := a[1L], by = .(grouper)],
mtd1 = DT1[, cc := head(a, 1L), by = rleid(b) ][ b == 0L, cc := NA_real_ ],
mtd2 = DT2[, cc := {
ri <- rowid(rleid(b))
bool <- ri>1L
v <- replace(a, bool, NA_real_)
v <- nafill(v, "locf")
replace(v, b==0L, NA_real_)
}],
mtd3 = DT3[, cc := {
cs = cumsum(b)
nafill(a * NA_real_^(cs - cummax((!b) * cs) > 1), "locf") * NA_real_^(b == 0L)
}]
)
all.equal(DT0$cc, DT1$cc)
#[1] TRUE
all.equal(DT0$cc, DT2$cc)
#[1] TRUE
all.equal(DT0$cc, DT3$cc)
#[1] TRUE
timings for nr <- 1e6L
:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0 198.31551 202.87683 211.27864 207.43815 217.76021 228.08227 3
mtd1 3559.34608 3575.83858 3648.31707 3592.33108 3692.80257 3793.27405 3
mtd2 62.99026 63.58249 64.05060 64.17471 64.58078 64.98684 3
mtd3 48.19877 49.60878 51.08868 51.01879 52.53364 54.04849 3
timings for nr <- 1e7L
:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0 1912.1486 2019.0890 2069.4102 2126.0294 2148.0410 2170.0527 3
mtd2 712.6978 774.7994 806.8337 836.9009 853.9016 870.9023 3
mtd3 515.7079 525.7400 531.4712 535.7722 539.3529 542.9336 3
data:
library(data.table)
set.seed(47L)
nr <- 1e6L
testSample <- data.table(a = rnorm(n = nr, mean = 2, sd = 1),
b = sample(c(0,1), replace=TRUE, size=nr))
DT0 <- copy(testSample)
DT1 <- copy(testSample)
DT2 <- copy(testSample)
Upvotes: 3
Reputation: 56219
Using Gregor's data, similar answer:
testSample[, c := head(a, 1), by = rleid(b) ][ b == 0, c := NA ]
testSample
# a b c
# 1: 3.9946963 0 NA
# 2: 2.7111425 0 NA
# 3: 2.1854053 1 2.1854053
# 4: 1.7182350 0 NA
# 5: 2.1087755 0 NA
# 6: 0.9142625 1 0.9142625
# 7: 1.0145178 1 0.9142625
# 8: 2.0151309 1 0.9142625
# 9: 1.7479541 1 0.9142625
# 10: 0.5342497 1 0.9142625
# 11: 1.0775438 0 NA
# 12: 2.0396024 0 NA
# 13: 2.4938202 0 NA
# 14: 0.1717708 1 0.1717708
# 15: 2.0914729 0 NA
# 16: 2.6707792 1 2.6707792
# 17: 1.9189219 0 NA
# 18: 3.2642411 0 NA
# 19: 1.2966118 1 1.2966118
# 20: 1.9594218 0 NA
Upvotes: 3
Reputation: 145985
set.seed(47)
testSample <- data.table(a = rnorm(n = 20, mean = 2, sd = 1),
b = sample(c(0,1), replace=TRUE, size=20))
testSample[, grouper := rleid(b)][b == 1, c := a[1], by = .(grouper)]
testSample
# a b grouper c
# 1: 3.9946963 0 1 NA
# 2: 2.7111425 0 1 NA
# 3: 2.1854053 1 2 2.1854053
# 4: 1.7182350 0 3 NA
# 5: 2.1087755 0 3 NA
# 6: 0.9142625 1 4 0.9142625
# 7: 1.0145178 1 4 0.9142625
# 8: 2.0151309 1 4 0.9142625
# 9: 1.7479541 1 4 0.9142625
# 10: 0.5342497 1 4 0.9142625
# 11: 1.0775438 0 5 NA
# 12: 2.0396024 0 5 NA
# 13: 2.4938202 0 5 NA
# 14: 0.1717708 1 6 0.1717708
# 15: 2.0914729 0 7 NA
# 16: 2.6707792 1 8 2.6707792
# 17: 1.9189219 0 9 NA
# 18: 3.2642411 0 9 NA
# 19: 1.2966118 1 10 1.2966118
# 20: 1.9594218 0 11 NA
You can, of course, drop the grouper
column when you're done with it.
Upvotes: 3