Reputation: 369

Along runs of certain values, fill with first value from a second column

I have sample data:

testSample <- data.table(a = rnorm(n = 20, mean = 2, sd = 1),
                         b = sample(c(0,1), replace=TRUE, size=20))

testSample
             a b
 1:  3.1731458 0
 2:  1.0687438 1
 3:  2.9078655 1
 4:  1.5675078 0
 5:  2.7825992 0
 6:  1.3672285 1
 7:  3.6178619 0
 8:  2.9067640 1
 9:  2.5021129 0
10:  2.7672849 1
11:  2.3501007 1
12: -0.2923344 0
13:  0.3920071 1
14:  2.5113855 0
15:  2.2192234 1
16:  0.5913632 0
17:  0.8864734 1
18:  1.9187394 0
19:  1.1238824 1
20:  1.5001240 1

In column 'b' there are runs of alternating 0 and 1. Along each consecutive run of 1, I want a new column 'c' to be filled with the number from the column "a" at the index of the first 1 in each run.

When 'b' is 0, 'c' should be NA

Desired output where I filled in the new 'c' column manually:

             a b         c
 1:  3.1731458 0        NA
 2:  1.0687438 1 1.0687438 # <- run of 1.  
 3:  2.9078655 1 1.0687438 # <- All rows filled with the first 'a' value in the run 
 4:  1.5675078 0        NA
 5:  2.7825992 0        NA
 6:  1.3672285 1 1.3672285 # <-  
 7:  3.6178619 0        NA
 8:  2.9067640 1 2.9067640 # <-  
 9:  2.5021129 0        NA
10:  2.7672849 1 2.7672849 # <- run of 1  
11:  2.3501007 1 2.7672849 # <- All rows filled with the first 'a' value in the run
12: -0.2923344 0        NA
13:  0.3920071 1 0.3920071
14:  2.5113855 0        NA
15:  2.2192234 1 2.2192234
16:  0.5913632 0        NA
17:  0.8864734 1 0.8864734
18:  1.9187394 0        NA
19:  1.1238824 1 1.1238824
20:  1.5001240 1 1.1238824

Upvotes: 3

Answers (3)

chinsoon12

Reputation: 25225

Here is another option:

mtd2 = DT2[, cc := {
    ri <- rowid(rleid(b))  
    bool <- ri>1L
    v <- replace(a, bool, NA_real_)
    v <- nafill(v, "locf")
    replace(v, b==0L, NA_real_)
}]

timing code:

microbenchmark::microbenchmark(times=3L,
    mtd0 = DT0[, grouper := rleid(b)][b == 1L, cc := a[1L], by = .(grouper)],
    
    mtd1 = DT1[, cc := head(a, 1L), by = rleid(b) ][ b == 0L, cc := NA_real_ ],
    
    mtd2 = DT2[, cc := {
        ri <- rowid(rleid(b))  
        bool <- ri>1L
        v <- replace(a, bool, NA_real_)
        v <- nafill(v, "locf")
        replace(v, b==0L, NA_real_)
    }],

    mtd3 = DT3[, cc := {
        cs = cumsum(b)
        nafill(a * NA_real_^(cs - cummax((!b) * cs) > 1), "locf") * NA_real_^(b == 0L)
    }]
)        

all.equal(DT0$cc, DT1$cc)
#[1] TRUE
all.equal(DT0$cc, DT2$cc)
#[1] TRUE
all.equal(DT0$cc, DT3$cc)
#[1] TRUE

timings for nr <- 1e6L:

Unit: milliseconds
 expr        min         lq       mean     median         uq        max neval
 mtd0  198.31551  202.87683  211.27864  207.43815  217.76021  228.08227     3
 mtd1 3559.34608 3575.83858 3648.31707 3592.33108 3692.80257 3793.27405     3
 mtd2   62.99026   63.58249   64.05060   64.17471   64.58078   64.98684     3
 mtd3   48.19877   49.60878   51.08868   51.01879   52.53364   54.04849     3

timings for nr <- 1e7L:

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
 mtd0 1912.1486 2019.0890 2069.4102 2126.0294 2148.0410 2170.0527     3
 mtd2  712.6978  774.7994  806.8337  836.9009  853.9016  870.9023     3
 mtd3  515.7079  525.7400  531.4712  535.7722  539.3529  542.9336     3

data:

library(data.table)
set.seed(47L)
nr <- 1e6L
testSample <- data.table(a = rnorm(n = nr, mean = 2, sd = 1),
    b = sample(c(0,1), replace=TRUE, size=nr))
DT0 <- copy(testSample)
DT1 <- copy(testSample)
DT2 <- copy(testSample)

Upvotes: 3

zx8754

Reputation: 56219

Using Gregor's data, similar answer:

testSample[, c := head(a, 1), by = rleid(b) ][ b == 0, c := NA ]

testSample
#             a b         c
#  1: 3.9946963 0        NA
#  2: 2.7111425 0        NA
#  3: 2.1854053 1 2.1854053
#  4: 1.7182350 0        NA
#  5: 2.1087755 0        NA
#  6: 0.9142625 1 0.9142625
#  7: 1.0145178 1 0.9142625
#  8: 2.0151309 1 0.9142625
#  9: 1.7479541 1 0.9142625
# 10: 0.5342497 1 0.9142625
# 11: 1.0775438 0        NA
# 12: 2.0396024 0        NA
# 13: 2.4938202 0        NA
# 14: 0.1717708 1 0.1717708
# 15: 2.0914729 0        NA
# 16: 2.6707792 1 2.6707792
# 17: 1.9189219 0        NA
# 18: 3.2642411 0        NA
# 19: 1.2966118 1 1.2966118
# 20: 1.9594218 0        NA

Upvotes: 3

Gregor Thomas

Reputation: 145985

set.seed(47)
testSample <- data.table(a = rnorm(n = 20, mean = 2, sd = 1),
                         b = sample(c(0,1), replace=TRUE, size=20))

testSample[, grouper := rleid(b)][b == 1, c := a[1], by = .(grouper)]
testSample
#            a b grouper         c
#  1: 3.9946963 0       1        NA
#  2: 2.7111425 0       1        NA
#  3: 2.1854053 1       2 2.1854053
#  4: 1.7182350 0       3        NA
#  5: 2.1087755 0       3        NA
#  6: 0.9142625 1       4 0.9142625
#  7: 1.0145178 1       4 0.9142625
#  8: 2.0151309 1       4 0.9142625
#  9: 1.7479541 1       4 0.9142625
# 10: 0.5342497 1       4 0.9142625
# 11: 1.0775438 0       5        NA
# 12: 2.0396024 0       5        NA
# 13: 2.4938202 0       5        NA
# 14: 0.1717708 1       6 0.1717708
# 15: 2.0914729 0       7        NA
# 16: 2.6707792 1       8 2.6707792
# 17: 1.9189219 0       9        NA
# 18: 3.2642411 0       9        NA
# 19: 1.2966118 1      10 1.2966118
# 20: 1.9594218 0      11        NA

You can, of course, drop the grouper column when you're done with it.

Upvotes: 3

Along runs of certain values, fill with first value from a second column

Answers (3)

Related Questions