Reputation: 99
I have problems in getting data table to create 28 new columns by calculating their value from other column names that are variables and, at the same time, fulfill some conditions.
What I am trying to achieve is to distinguish four cases:
As you can see I tried many different variants (if clause 2 tries to do it in one DT way), if-clause 4 tries to get the values beforehand (which, of course, does not work, since my_ic_value has all the thousands of values in it instead only the one row). Other things I unsuccessfully tried were
all <- all[, ratio_col:= get(ic_col) / get(nc_col)), by=.(m_island_id_1, m_owner_id_1)]
Also just using the colum names does not work, because they are strings and cannot be divided:
all <- all[, ratio_col:= (ic_col / nc_col), by=.(m_island_id_1, m_owner_id_1)]
I hope the problem is somewhat clear and I am looking forward to all the data.table experts showing me how hopelessly messy my code is and how short and fast it can be done! ;-)
Thank you for your help!
Here is some data (a very small portion):
structure(list(m_island_id_1 = c("020d49b9580f071075cfb6f9da7a426669a56d2d",
"020d49b9580f071075cfb6f9da7a426669a56d2d", "020d49b9580f071075cfb6f9da7a426669a56d2d"
), m_owner_id_1 = c("01d96c16a2caf720617b266ae5d053243cef0920",
"4e25ef4cf04f023d96134856098b8104fa6a6add", "7818c5081257a837e7cfad2c1aba4d90dcd18a69"
), Sum_nc = c(2L, 2L, 2L), Sum_ic = c(4L, 4L, 4L), Sum_ic_2 = c(0L,
0L, 0L), Sum_ic_3 = c(0L, 0L, 0L), Sum_ic_4 = c(0L, 0L, 0L),
Sum_ic_5 = c(0L, 0L, 0L), Sum_ic_6 = c(0L, 0L, 0L), Sum_ic_7 = c(0L,
0L, 0L), Sum_ic_8 = c(0L, 0L, 0L), Sum_ic_9 = c(0L, 0L, 0L
), Sum_ic_10 = c(0L, 0L, 0L), Sum_ic_11 = c(0L, 0L, 0L),
Sum_ic_12 = c(0L, 0L, 0L), Sum_ic_13 = c(0L, 0L, 0L), Sum_ic_14 = c(0L,
0L, 0L), Sum_ic_15 = c(0L, 0L, 0L), Sum_ic_16 = c(0L, 0L,
0L), Sum_ic_17 = c(1L, 1L, 1L), Sum_ic_18 = c(1L, 1L, 1L),
Sum_ic_19 = c(2L, 2L, 2L), Sum_ic_20 = c(0L, 0L, 0L), Sum_ic_21 = c(0L,
0L, 0L), Sum_ic_22 = c(0L, 0L, 0L), Sum_ic_23 = c(0L, 0L,
0L), Sum_ic_24 = c(0L, 0L, 0L), Sum_ic_25 = c(0L, 0L, 0L),
Sum_ic_26 = c(0L, 0L, 0L), Sum_ic_27 = c(0L, 0L, 0L), Sum_ic_28 = c(0L,
0L, 0L), Sum_nc_2 = c(0L, 0L, 0L), Sum_nc_3 = c(0L, 0L, 0L
), Sum_nc_4 = c(0L, 0L, 0L), Sum_nc_5 = c(0L, 0L, 0L), Sum_nc_6 = c(0L,
0L, 0L), Sum_nc_7 = c(0L, 0L, 0L), Sum_nc_8 = c(0L, 0L, 0L
), Sum_nc_9 = c(0L, 0L, 0L), Sum_nc_10 = c(0L, 0L, 0L), Sum_nc_11 = c(0L,
0L, 0L), Sum_nc_12 = c(0L, 0L, 0L), Sum_nc_13 = c(0L, 0L,
0L), Sum_nc_14 = c(0L, 0L, 0L), Sum_nc_15 = c(0L, 0L, 0L),
Sum_nc_16 = c(1L, 1L, 1L), Sum_nc_17 = c(0L, 0L, 0L), Sum_nc_18 = c(1L,
1L, 1L), Sum_nc_19 = c(0L, 0L, 0L), Sum_nc_20 = c(0L, 0L,
0L), Sum_nc_21 = c(0L, 0L, 0L), Sum_nc_22 = c(0L, 0L, 0L),
Sum_nc_23 = c(0L, 0L, 0L), Sum_nc_24 = c(0L, 0L, 0L), Sum_nc_25 = c(0L,
0L, 0L), Sum_nc_26 = c(0L, 0L, 0L), Sum_nc_27 = c(0L, 0L,
0L), Sum_nc_28 = c(0L, 0L, 0L), inhabited = c(TRUE, TRUE,
FALSE), Sum_time_ic = c(3L, 1L, 0L), groupsize = c(2L, 2L,
2L), Ratio_nc_to_ic_16 = c(0, 0, 0), Ratio_nc_to_ic_17 = c(TRUE,
TRUE, TRUE), Ratio_nc_to_ic_18 = c(1, 1, 1), Ratio_nc_to_ic_19 = c(TRUE,
TRUE, TRUE), ratio_col = c(NaN, NaN, NaN), Ratio_nc_to_ic = c(NA,
NA, NA), Ratio_nc_to_ic_ = c(2, 2, 2), Ratio_nc_to_ic_2 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_3 = c(NaN, NaN, NaN), Ratio_nc_to_ic_4 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_5 = c(NaN, NaN, NaN), Ratio_nc_to_ic_6 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_7 = c(NaN, NaN, NaN), Ratio_nc_to_ic_8 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_9 = c(NaN, NaN, NaN), Ratio_nc_to_ic_10 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_11 = c(NaN, NaN, NaN), Ratio_nc_to_ic_12 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_13 = c(NaN, NaN, NaN), Ratio_nc_to_ic_14 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_15 = c(NaN, NaN, NaN), Ratio_nc_to_ic_20 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_21 = c(NaN, NaN, NaN), Ratio_nc_to_ic_22 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_23 = c(NaN, NaN, NaN), Ratio_nc_to_ic_24 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_25 = c(NaN, NaN, NaN), Ratio_nc_to_ic_26 = c(NaN,
NaN, NaN), Ratio_nc_to_ic_27 = c(NaN, NaN, NaN), Ratio_nc_to_ic_28 = c(NaN,
NaN, NaN)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x000001f487d51ef0>)
Here is the non-working code:
for (j in 2:28){
ic_col <- paste0("Sum_ic_", j)
nc_col <- paste0("Sum_nc_", j)
ratio_col <- paste0("Ratio_nc_to_ic_", j)
my_ic_value <- all[,get(ic_col)]
my_nc_value <- all[,get(nc_col)]
if ((ic_col == 0) & (nc_col == 0)) {
all <- all[, ratio_col:= NA, by=.(m_island_id_1, m_owner_id_1)] # no incumbents, no newcomers
}
if ((ic_col > 0) & (nc_col == 0)) {
all <- all[, .(ratio_col= ic_col), by=.(m_island_id_1, m_owner_id_1)] # Incumbents without newcomers
}
if ((ic_col == 0) & (nc_col > 0)) {
all <- all[, ratio_col:= 0, by=.(m_island_id_1, m_owner_id_1)] # Newcomers without incumbents
}
if ((ic_col >= 1) & (nc_col >= 1)) {
all <- all[, ratio_col:= (my_ic_value / my_nc_value), by=.(m_island_id_1, m_owner_id_1)] # Newcomers with incumbents
}
}
Upvotes: 0
Views: 149
Reputation: 25225
It seems like calculations are to be performed row-wise, hence you can drop the by
(please correct me if I am wrong). And using matrix operations can cover the last 2 cases. And then using some indexing to handle cases 1 and 2 as follows:
DT[, (ratio_cols) := {
icmat <- as.matrix(.SD[, mget(ic_cols)])
ncmat <- as.matrix(.SD[, mget(nc_cols)])
m <- icmat / ncmat
#when nc_col==0
m[is.infinite(m)] <- icmat[is.infinite(m)]
#when ic_col==0 & nc_col==0
m[icmat==0 & ncmat==0] <- NA_real_
as.data.table(m)
}]
data:
#where dat is your sample data and ratio cols are removed here
DT <- dat[, .SD, .SDcols=m_island_id_1:groupsize]
#clean up names
rename <- c("Sum_nc","Sum_ic")
setnames(DT, rename, paste0(rename, "_1"))
#input ic, nc and output ratio columns
jcols <- 1:28
ic_cols <- paste0("Sum_ic_", jcols)
nc_cols <- paste0("Sum_nc_", jcols)
ratio_cols <- paste0("Ratio_nc_to_ic_", jcols)
Upvotes: 1