Reputation: 35
I am trying to shift the data by group session_id based on the logical condition column in the data table. Shift the data down to rows that are TRUE. The amount of kw available is equal to amount of TRUE statement. If there is already kw at TRUE rows, keep it there.
I have tried na.locf filled them up but that wouldn't solve the problem because some of the data is not the same
tib = structure(list(Row = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58), session_id = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), hour_of_day = c(14.75,
15, 15.25, 15.5, 15.75, 16, 16.25, 16.5, 16.75, 17, 17.25, 17.5,
22, 22.25, 22.5, 22.75, 23, 23.25, 23.5, 23.75, 0, 0.25, 0.5,
0.75, 1, 1.25, 1.5, 14.25, 14.5, 14.75, 15, 15.25, 15.5, 15.75,
16, 16.25, 16.5, 16.75, 17, 17.25, 17.5, 17.75, 18, 18.25, 18.5,
18.75, 19, 22, 22.25, 22.5, 22.75, 23, 23.25, 23.5, 23.75, 0,
0.25, 0.5), kw = c(1.306767902, 1.399948473, 1.399948473, 1.399948473,
1.399948473, 1.399948473, 1.399948473, 1.399948473, 1.399948473,
1.399948473, 1.399948473, 0.560427373, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.199276308, 3.600133009,
3.600133009, 3.600133009, 3.600133009, 3.600133009, 3.600133009,
2.101325635, NA, NA, NA, NA, 0, 1.399975856, 1.399975856, 1.399975856,
1.399975856, 1.399975856, 1.399975856, 0.413944861, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), check = c(TRUE, TRUE, TRUE,
TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -58L), spec = structure(list(
cols = list(Row = structure(list(), class = c("collector_double",
"collector")), session_id = structure(list(), class = c("collector_double",
"collector")), hour_of_day = structure(list(), class = c("collector_double",
"collector")), kw = structure(list(), class = c("collector_double",
"collector")), check = structure(list(), class = c("collector_logical",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
I want to have data like this
out = structure(list(Row = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58), session_id = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), hour_of_day = c(14.75,
15, 15.25, 15.5, 15.75, 16, 16.25, 16.5, 16.75, 17, 17.25, 17.5,
22, 22.25, 22.5, 22.75, 23, 23.25, 23.5, 23.75, 0, 0.25, 0.5,
0.75, 1, 1.25, 1.5, 14.25, 14.5, 14.75, 15, 15.25, 15.5, 15.75,
16, 16.25, 16.5, 16.75, 17, 17.25, 17.5, 17.75, 18, 18.25, 18.5,
18.75, 19, 22, 22.25, 22.5, 22.75, 23, 23.25, 23.5, 23.75, 0,
0.25, 0.5), kw = c(1.306767902, 1.399948473, 1.399948473, 1.399948473,
1.399948473, NA, NA, NA, NA, NA, NA, NA, 1.399948473, 1.399948473,
1.399948473, 1.399948473, 1.399948473, 1.399948473, 0.560427373,
NA, NA, NA, NA, NA, NA, NA, NA, 1.199276308, 3.600133009, 3.600133009,
3.600133009, 3.600133009, 3.600133009, 3.600133009, 2.101325635,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 1.399975856,
1.399975856, 1.399975856, 1.399975856, 1.399975856, 1.399975856,
0.413944861, NA, NA, NA), check = c(TRUE, TRUE, TRUE, TRUE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, FALSE, FALSE, FALSE)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -58L), spec = structure(list(
cols = list(Row = structure(list(), class = c("collector_double",
"collector")), session_id = structure(list(), class = c("collector_double",
"collector")), hour_of_day = structure(list(), class = c("collector_double",
"collector")), kw = structure(list(), class = c("collector_double",
"collector")), check = structure(list(), class = c("collector_logical",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Thank you very much
Upvotes: 2
Views: 250
Reputation: 66819
You can do:
# fill lines where check is TRUE with non-missing kw values
tib$kw[tib$check == TRUE] <- na.omit(tib$kw)
# overwrite values where check is FALSE with missing
tib$kw[tib$check == FALSE] <- NA
# verify it works in the example
identical(tib$kw, out$kw)
# [1] TRUE
This overwrites the kw column in place, which means if there is a bug in this code, you lose your original data. This code may not be right, for example, because it ignored grouping by session_id. So while testing the code, I'd suggest making a whole new column:
tib$v <- tib$kw
tib$v[tib$check == TRUE] <- na.omit(tib$kw)
tib$v[tib$check == FALSE] <- NA
The amount of kw available is equal to amount of TRUE statement.
This suggests that ignoring session_id as above should be okay as long as data is grouped by session_id ahead of time. If the operation does need to be done by session_id, though, I'd suggest dplyr or data.table:
library(data.table)
dt_res = data.table(tib)
dt_res[, kw := {
v = kw
v[check == TRUE] = na.omit(kw)
v[check == FALSE] = NA
v
}, by=session_id]
# or
library(dplyr)
dplyr_res = tib %>% group_by(session_id) %>% mutate(kw = {
v = kw
v[check == TRUE] = na.omit(kw)
v[check == FALSE] = NA
v
})
# check 'em
identical(dplyr_res$kw, out$kw)
identical(dt_res$kw, out$kw)
Upvotes: 2