Adam
Adam

Reputation: 453

Mutate over multiple columns, skip NA values

Reprex

data <- data.frame(
  subjid = c(3,4,7,8,11,14,18,19,20,21),
  pct_pvr_3 = c(10,NA,NA,NA,20,NA,70,NA,NA,NA),
  pct_pvr_2 = c(90,NA,90,30,30,50,30,100,NA,NA),
  pct_pvr_1 = c(NA,NA,10,20,30,NA,NA,NA,NA,NA),
  pct_pvr_0 = c(NA,NA,NA,50,NA,NA,NA,NA,NA,NA)
)

Problem: I want to create a new column hscore with the following formula:

hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3))

However, as many of the columns are NA, the resulting column is full of NAs. I think I may be able to get around this issue with a bunch of if_else statements. Before going that route, I wanted to see if there were any simpler solutions using dplyr or other packages.

Current Code:

data <- data %>%
     mutate(hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3)))

Current Output:

# A tibble: 56 x 6
   subjid pct_pvr_0 pct_pvr_1 pct_pvr_2 pct_pvr_3 hscore
    <dbl>     <dbl>     <dbl>     <dbl>     <dbl>  <dbl>
 1      3        NA        NA        90        10     NA
 2      4        NA        NA        NA        NA     NA
 3      7        NA        10        90        NA     NA
 4      8        50        20        30        NA     NA
 5     11        NA        30        50        20     NA
 6     14        NA        NA       100        NA     NA
 7     18        NA        NA        30        70     NA
 8     19        NA        NA        NA        NA     NA
 9     20        NA        NA        NA        NA     NA
10     21        NA        NA        NA        NA     NA

dplyr Solution: Replaced NAs as 0

data <- data %>% 
replace(is.na(.),0) %>% 
mutate(hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3)))

data
   subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
1       3        10        90         0         0    210
2       4         0         0         0         0      0
3       7         0        90        10         0    190
4       8         0        30        20        50     80
5      11        20        30        30         0    150
6      14         0        50         0         0    100
7      18        70        30         0         0    270
8      19         0       100         0         0    200
9      20         0         0         0         0      0
10     21         0         0         0         0      0

Upvotes: 4

Views: 2470

Answers (6)

Wojciech Kulma
Wojciech Kulma

Reputation: 6446

how about that?

data %>%
  mutate(
    across(everything(), ~replace_na(.x, 0))
  ) %>%
  mutate(hscore = ((0*pct_pvr_0)+(1*pct_pvr_1)+(2*pct_pvr_2)+(3*pct_pvr_3))) %>%
  select(subjid, hscore) %>%
  right_join(data)

yields:

   subjid hscore pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0
1       3    210        10        90        NA        NA
2       4      0        NA        NA        NA        NA
3       7    190        NA        90        10        NA
4       8     80        NA        30        20        50
5      11    150        20        30        30        NA
6      14    100        NA        50        NA        NA
7      18    270        70        30        NA        NA
8      19    200        NA       100        NA        NA
9      20      0        NA        NA        NA        NA
10     21      0        NA        NA        NA        NA

Upvotes: 4

Dr. Fabian Habersack
Dr. Fabian Habersack

Reputation: 1141

This would be a straight forward {tidyverse} solution. rowwise() is a real lifesaver!

library(tidyverse)

df <- tibble(
  subjid = c(3,4,7,8,11,14,18,19,20,21),
  pct_pvr_3 = c(10,NA,NA,NA,20,NA,70,NA,NA,NA),
  pct_pvr_2 = c(90,NA,90,30,30,50,30,100,NA,NA),
  pct_pvr_1 = c(NA,NA,10,20,30,NA,NA,NA,NA,NA),
  pct_pvr_0 = c(NA,NA,NA,50,NA,NA,NA,NA,NA,NA)
)

df %>%
  rowwise() %>%
  mutate(hscore = sum((1*pct_pvr_1),(2*pct_pvr_2),(3*pct_pvr_3), na.rm=T))

   subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
    <dbl>     <dbl>     <dbl>     <dbl>     <dbl>  <dbl>
 1      3        10        90        NA        NA    210
 2      4        NA        NA        NA        NA      0
 3      7        NA        90        10        NA    190
 4      8        NA        30        20        50     80
 5     11        20        30        30        NA    150
 6     14        NA        50        NA        NA    100
 7     18        70        30        NA        NA    270
 8     19        NA       100        NA        NA    200
 9     20        NA        NA        NA        NA      0
10     21        NA        NA        NA        NA      0

EDIT:

If you want to keep NAs where all columns are NA, then you can use Paul Poncet's sumNA() from {bazar}.

df %>%
  rowwise() %>%
  mutate(hscore = bazar::sumNA((1*pct_pvr_1),(2*pct_pvr_2),(3*pct_pvr_3), na.rm=T)) -> df

It's actually pretty neat. This is the function:

function (..., na.rm = FALSE) 
{
    x <- unlist(list(...))
    if (na.rm && length(x) && all(is.na(x))) 
        return(x[1] + NA)
    sum(x, na.rm = na.rm)
}

Upvotes: 3

ThomasIsCoding
ThomasIsCoding

Reputation: 101327

A data.table option

setDT(data)[
  ,
  hscore := rowSums(
    mapply(
      "*",
      .SD,
      as.numeric(gsub(".*_", "", names(.SD)))
    ),
    na.rm = TRUE
  ),
  .SDcols = patterns("pct_pvr_")
][]

gives

    subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
 1:      3        10        90        NA        NA    210
 2:      4        NA        NA        NA        NA      0
 3:      7        NA        90        10        NA    190
 4:      8        NA        30        20        50     80
 5:     11        20        30        30        NA    150
 6:     14        NA        50        NA        NA    100
 7:     18        70        30        NA        NA    270
 8:     19        NA       100        NA        NA    200
 9:     20        NA        NA        NA        NA      0
10:     21        NA        NA        NA        NA      0

Upvotes: 1

akrun
akrun

Reputation: 887088

We could also do with across (similar to @JonSpring's solution involving coalesce and parse_number), but without reshaping to 'long' format. i.e. loop across the columns that starts with 'pct', get the column name with cur_column, then extract the numeric part with parse_number from readr, convert the NA to 0 with either coalesce or replace_na and reduce the columns to a single column by adding (+) the corresponding elements of each column, finally bind the single column output from transmute with original data

library(dplyr)
library(purrr)
df %>%
  transmute(across(starts_with('pct'), 
       ~ coalesce(. * readr::parse_number(cur_column()), 0))) %>% 
  transmute(hscore = reduce(cur_data(), `+`)) %>%
  bind_cols(df, .)

-output

# A tibble: 10 x 6
#   subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
#    <dbl>     <dbl>     <dbl>     <dbl>     <dbl>  <dbl>
# 1      3        10        90        NA        NA    210
# 2      4        NA        NA        NA        NA      0
# 3      7        NA        90        10        NA    190
# 4      8        NA        30        20        50     80
# 5     11        20        30        30        NA    150
# 6     14        NA        50        NA        NA    100
# 7     18        70        30        NA        NA    270
# 8     19        NA       100        NA        NA    200
# 9     20        NA        NA        NA        NA      0
#10     21        NA        NA        NA        NA      0

Upvotes: 1

Gregor Thomas
Gregor Thomas

Reputation: 145775

I'd (temporarily) replace the NA values with 0 and frame the problem as matrix multiplication:

data$hscore = rowSums(as.matrix(replace(data[-1], is.na(data[-1]), 0)) %*% diag(3:0))
data
#    subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
# 1       3        10        90        NA        NA    210
# 2       4        NA        NA        NA        NA      0
# 3       7        NA        90        10        NA    190
# 4       8        NA        30        20        50     80
# 5      11        20        30        30        NA    150
# 6      14        NA        50        NA        NA    100
# 7      18        70        30        NA        NA    270
# 8      19        NA       100        NA        NA    200
# 9      20        NA        NA        NA        NA      0
# 10     21        NA        NA        NA        NA      0

Upvotes: 4

Jon Spring
Jon Spring

Reputation: 66445

library(tidyverse)

data %>%
  left_join(data %>%
    pivot_longer(-subjid) %>%
    group_by(subjid) %>%
    summarise(hscore = sum(coalesce(value * parse_number(name), 0)))
  )

parse_number() extracts the ending number part of the column name. coalesce takes care of the NA's by replacing any NA with zero.

Joining, by = "subjid"
   subjid pct_pvr_3 pct_pvr_2 pct_pvr_1 pct_pvr_0 hscore
1       3        10        90        NA        NA    210
2       4        NA        NA        NA        NA      0
3       7        NA        90        10        NA    190
4       8        NA        30        20        50     80
5      11        20        30        30        NA    150
6      14        NA        50        NA        NA    100
7      18        70        30        NA        NA    270
8      19        NA       100        NA        NA    200
9      20        NA        NA        NA        NA      0
10     21        NA        NA        NA        NA      0

Upvotes: 4

Related Questions