Reputation: 453
Reprex
data <- data.frame(
subjid = c(3,4,7,8,11,14,18,19,20,21),
pct_pvr_3 = c(10,NA,NA,NA,20,NA,70,NA,NA,NA),
pct_pvr_2 = c(90,NA,90,30,30,50,30,100,NA,NA),
pct_pvr_1 = c(NA,NA,10,20,30,NA,NA,NA,NA,NA),
pct_pvr_0 = c(NA,NA,NA,50,NA,NA,NA,NA,NA,NA)
)
Problem: I want to create a new column hscore
with the following formula:
hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3))
However, as many of the columns are NA
, the resulting column is full of NA
s. I think I may be able to get around this issue with a bunch of if_else
statements. Before going that route, I wanted to see if there were any simpler solutions using dplyr
or other packages.
Current Code:
data <- data %>%
mutate(hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3)))
Current Output:
# A tibble: 56 x 6
subjid pct_pvr_0 pct_pvr_1 pct_pvr_2 pct_pvr_3 hscore
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3 NA NA 90 10 NA
2 4 NA NA NA NA NA
3 7 NA 10 90 NA NA
4 8 50 20 30 NA NA
5 11 NA 30 50 20 NA
6 14 NA NA 100 NA NA
7 18 NA NA 30 70 NA
8 19 NA NA NA NA NA
9 20 NA NA NA NA NA
10 21 NA NA NA NA NA
dplyr
Solution: Replaced NA
s as 0
data <- data %>%
replace(is.na(.),0) %>%
mutate(hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3)))
data
subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
1 3 10 90 0 0 210
2 4 0 0 0 0 0
3 7 0 90 10 0 190
4 8 0 30 20 50 80
5 11 20 30 30 0 150
6 14 0 50 0 0 100
7 18 70 30 0 0 270
8 19 0 100 0 0 200
9 20 0 0 0 0 0
10 21 0 0 0 0 0
Upvotes: 4
Views: 2470
Reputation: 6446
how about that?
data %>%
mutate(
across(everything(), ~replace_na(.x, 0))
) %>%
mutate(hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3))) %>%
select(subjid, hscore) %>%
right_join(data)
yields:
subjid hscore pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0
1 3 210 10 90 NA NA
2 4 0 NA NA NA NA
3 7 190 NA 90 10 NA
4 8 80 NA 30 20 50
5 11 150 20 30 30 NA
6 14 100 NA 50 NA NA
7 18 270 70 30 NA NA
8 19 200 NA 100 NA NA
9 20 0 NA NA NA NA
10 21 0 NA NA NA NA
Upvotes: 4
Reputation: 1141
This would be a straight forward {tidyverse}
solution. rowwise()
is a real lifesaver!
library(tidyverse)
df <- tibble(
subjid = c(3,4,7,8,11,14,18,19,20,21),
pct_pvr_3 = c(10,NA,NA,NA,20,NA,70,NA,NA,NA),
pct_pvr_2 = c(90,NA,90,30,30,50,30,100,NA,NA),
pct_pvr_1 = c(NA,NA,10,20,30,NA,NA,NA,NA,NA),
pct_pvr_0 = c(NA,NA,NA,50,NA,NA,NA,NA,NA,NA)
)
df %>%
rowwise() %>%
mutate(hscore = sum((1*pct_pvr_1),(2*pct_pvr_2),(3*pct_pvr_3), na.rm=T))
subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3 10 90 NA NA 210
2 4 NA NA NA NA 0
3 7 NA 90 10 NA 190
4 8 NA 30 20 50 80
5 11 20 30 30 NA 150
6 14 NA 50 NA NA 100
7 18 70 30 NA NA 270
8 19 NA 100 NA NA 200
9 20 NA NA NA NA 0
10 21 NA NA NA NA 0
EDIT:
If you want to keep NA
s where all columns are NA
, then you can use Paul Poncet's sumNA()
from {bazar}
.
df %>%
rowwise() %>%
mutate(hscore = bazar::sumNA((1*pct_pvr_1),(2*pct_pvr_2),(3*pct_pvr_3), na.rm=T)) -> df
It's actually pretty neat. This is the function:
function (..., na.rm = FALSE)
{
x <- unlist(list(...))
if (na.rm && length(x) && all(is.na(x)))
return(x[1] + NA)
sum(x, na.rm = na.rm)
}
Upvotes: 3
Reputation: 101327
A data.table
option
setDT(data)[
,
hscore := rowSums(
mapply(
"*",
.SD,
as.numeric(gsub(".*_", "", names(.SD)))
),
na.rm = TRUE
),
.SDcols = patterns("pct_pvr_")
][]
gives
subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
1: 3 10 90 NA NA 210
2: 4 NA NA NA NA 0
3: 7 NA 90 10 NA 190
4: 8 NA 30 20 50 80
5: 11 20 30 30 NA 150
6: 14 NA 50 NA NA 100
7: 18 70 30 NA NA 270
8: 19 NA 100 NA NA 200
9: 20 NA NA NA NA 0
10: 21 NA NA NA NA 0
Upvotes: 1
Reputation: 887088
We could also do with across
(similar to @JonSpring's solution involving coalesce
and parse_number
), but without reshaping to 'long' format. i.e. loop across
the columns that starts with 'pct', get the column name with cur_column
, then extract the numeric part with parse_number
from readr
, convert the NA to 0 with either coalesce
or replace_na
and reduce
the columns to a single column by adding (+
) the corresponding elements of each column, finally bind the single column output from transmute
with original data
library(dplyr)
library(purrr)
df %>%
transmute(across(starts_with('pct'),
~ coalesce(. * readr::parse_number(cur_column()), 0))) %>%
transmute(hscore = reduce(cur_data(), `+`)) %>%
bind_cols(df, .)
-output
# A tibble: 10 x 6
# subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 3 10 90 NA NA 210
# 2 4 NA NA NA NA 0
# 3 7 NA 90 10 NA 190
# 4 8 NA 30 20 50 80
# 5 11 20 30 30 NA 150
# 6 14 NA 50 NA NA 100
# 7 18 70 30 NA NA 270
# 8 19 NA 100 NA NA 200
# 9 20 NA NA NA NA 0
#10 21 NA NA NA NA 0
Upvotes: 1
Reputation: 145775
I'd (temporarily) replace the NA
values with 0
and frame the problem as matrix multiplication:
data$hscore = rowSums(as.matrix(replace(data[-1], is.na(data[-1]), 0)) %*% diag(3:0))
data
# subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
# 1 3 10 90 NA NA 210
# 2 4 NA NA NA NA 0
# 3 7 NA 90 10 NA 190
# 4 8 NA 30 20 50 80
# 5 11 20 30 30 NA 150
# 6 14 NA 50 NA NA 100
# 7 18 70 30 NA NA 270
# 8 19 NA 100 NA NA 200
# 9 20 NA NA NA NA 0
# 10 21 NA NA NA NA 0
Upvotes: 4
Reputation: 66445
library(tidyverse)
data %>%
left_join(data %>%
pivot_longer(-subjid) %>%
group_by(subjid) %>%
summarise(hscore = sum(coalesce(value * parse_number(name), 0)))
)
parse_number() extracts the ending number part of the column name. coalesce takes care of the NA's by replacing any NA with zero.
Joining, by = "subjid"
subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
1 3 10 90 NA NA 210
2 4 NA NA NA NA 0
3 7 NA 90 10 NA 190
4 8 NA 30 20 50 80
5 11 20 30 30 NA 150
6 14 NA 50 NA NA 100
7 18 70 30 NA NA 270
8 19 NA 100 NA NA 200
9 20 NA NA NA NA 0
10 21 NA NA NA NA 0
Upvotes: 4