Reputation: 105
I am new to R. I have a large dataframe with millions of rows that looks like below:
Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3
64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9
55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4
70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7
26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2
47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8
35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1
I am looking for three output fields depending on the values in code1, code2, and code3.
> Output1 : If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2 : If code1 is 'a' or 'b', then Output1 = Whole*P_2, else Output2 = Whole* Q_2
> Output3 : If code1 is 'a' or 'b', then Output1 = Whole*P_3, else Output3 = Whole* Q_3
Would appreciate if this code below could be corrected:
df1 %>%
for (i in 1:6) {
if (paste0("code", i) %in% c("a", "b")) {
mutate (paste0("Output", i) = Whole * paste0("P_", i) )
} else {
mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
}
}
Upvotes: 1
Views: 775
Reputation: 160407
library(dplyr)
df1 %>%
mutate(
Output1 = Whole * if_else(code1 %in% c('a', 'b'), P_1, Q_1),
Output2 = Whole * if_else(code1 %in% c('a', 'b'), P_2, Q_2),
Output3 = Whole * if_else(code1 %in% c('a', 'b'), P_3, Q_3)
)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
# 2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
# 3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
# 4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
# 5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
# 6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
If your data is more generic (not hard-coded or many more than "3" sets of columns), then we can reshape the data, do the assignment, and shape it back.
library(tidyr)
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
pivot_wider(
id_cols = Whole,
names_from = set,
values_from = c(code, P, Q, Output),
names_sep = "_"
)
# # A tibble: 6 x 13
# Whole code_1 code_2 code_3 P_1 P_2 P_3 Q_1 Q_2 Q_3 Output_1 Output_2 Output_3
# <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 64 a b d 0.2 0.3 0.1 0.1 0.2 0.9 12.8 19.2 57.6
# 2 55 a c b 0.5 0.1 0.4 0.3 0.3 0.4 27.5 16.5 22
# 3 70 b d NULL 0.4 0.2 0.7 0.1 0.5 0.7 28 35 49
# 4 26 c a b 0.7 0.2 0.2 0.5 0.6 0.2 13 5.2 5.2
# 5 47 a d NULL 0.8 0.1 0.6 0.7 0.2 0.8 37.6 9.4 37.6
# 6 35 d b a 0.2 0.8 0.2 0.8 0.1 0.1 28 28 7
As a side note, in general I would recommend keeping it in the "long" format and not re-widening it. This "long" format is often preferred for other tidy-like functions (including ggplot2
), and is easily extended to arbitrary counts. That would result in data like this:
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q))
# # A tibble: 18 x 6
# Whole set code P Q Output
# <int> <chr> <chr> <dbl> <dbl> <dbl>
# 1 64 1 a 0.2 0.1 12.8
# 2 64 2 b 0.3 0.2 19.2
# 3 64 3 d 0.1 0.9 57.6
# 4 55 1 a 0.5 0.3 27.5
# 5 55 2 c 0.1 0.3 16.5
# 6 55 3 b 0.4 0.4 22
# 7 70 1 b 0.4 0.1 28
# 8 70 2 d 0.2 0.5 35
# 9 70 3 NULL 0.7 0.7 49
# 10 26 1 c 0.7 0.5 13
# 11 26 2 a 0.2 0.6 5.2
# 12 26 3 b 0.2 0.2 5.2
# 13 47 1 a 0.8 0.7 37.6
# 14 47 2 d 0.1 0.2 9.4
# 15 47 3 NULL 0.6 0.8 37.6
# 16 35 1 d 0.2 0.8 28
# 17 35 2 b 0.8 0.1 28
# 18 35 3 a 0.2 0.1 7
(Much shorter.)
Upvotes: 1
Reputation: 24770
Depending on how many rows you have, this data.table approach may be faster.
library(data.table)
setDT(df1)[,Logical := (code1 == "a" | code1 == "b")][
,`:=`(Output1 = numeric(),Output2 = numeric(), Output3 = numeric())
][Logical == TRUE,`:=`(Output1 = Whole * P_1,
Output2 = Whole * P_2,
Output3 = Whole * P_3)
][Logical == FALSE,`:=`(Output1 = Whole * Q_1,
Output2 = Whole * Q_2,
Output3 = Whole * Q_3)
][,.(Output1,Output2,Output3)]
Output1 Output2 Output3
1: 12.8 19.2 6.4
2: 27.5 5.5 22.0
3: 28.0 14.0 49.0
4: 13.0 15.6 5.2
5: 37.6 4.7 28.2
6: 28.0 3.5 3.5
Upvotes: 0
Reputation: 886938
We can use map2
. Get the names of the columns that have 'P', 'Q'. followed by digits after the _
. then loop over the corresponding columns with map2
, apply the transformation logic and bind the columns with the original dataset
library(dplyr)
library(purrr)
library(stringr)
ps <- names(df1)[str_detect(names(df1), "^P_\\d+$")]
qs <- names(df1)[str_detect(names(df1), "^Q_\\d+$")]
map2_dfc(ps, qs, ~ df1 %>%
transmute(Output = Whole *
case_when(code1 %in% c('a', 'b') ~ !! rlang::sym(.x),
TRUE ~ !! rlang::sym(.y)))) %>%
rename_all(~ str_remove(., fixed("..."))) %>%
bind_cols(df1, .)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
#2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
#3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
#4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
#5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
#6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
df1 <- structure(list(Whole = c(64L, 55L, 70L, 26L, 47L, 35L), code1 = c("a",
"a", "b", "c", "a", "d"), P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2
), Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8), code2 = c("b", "c",
"d", "a", "d", "b"), P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8), Q_2 = c(0.2,
0.3, 0.5, 0.6, 0.2, 0.1), code3 = c("d", "b", "NULL", "b", "NULL",
"a"), P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2), Q_3 = c(0.9, 0.4,
0.7, 0.2, 0.8, 0.1)), class = "data.frame", row.names = c(NA,
-6L))
Upvotes: 0