Michiel Besseling
Michiel Besseling

Reputation: 23

How can I replace NAs in multiple columns with the values from other corresponding multiple columns in the same data set?

I am trying replace NA's over multiple columns with corresponding values from other columns in the df.


df = data.frame(ID = sample(1000:9999,10),
                Age = sample(18:99,10),
                Gender = sample(c("M","F"),10, replace = TRUE),              
                Test1 = sample(60:100,10),
                Test2 = sample(60:100,10),
                Test3 = sample(60:100,10),
                Test1.x = rep(NA,10),
                Test2.x = rep(NA,10),
                Test3.x = rep(NA,10))

df$Test1[c(2,3,8)] = NA
df$Test2[c(4,10)] = NA
df$Test3[c(1,7)] = NA
df$Test1.x[c(2,3,4,8)] = sample(60:100,4)
df$Test2.x[c(4,9,10)] = sample(60:100,3)
df$Test3.x[c(1,6,7)] = sample(60:100,3)
print(df)
    ID Age Gender Test1 Test2 Test3 Test1.x Test2.x Test3.x
1  7877  40      M    78    70    NA      NA      NA      84
2  6345  54      F    NA    99    61      62      NA      NA
3  9170  41      F    NA    80    96      82      NA      NA
4  2400  83      M   100    NA   100      94      95      NA
5  5920  66      M    77    62    69      NA      NA      NA
6  2569  34      M    99    96    81      NA      NA     100
7  7879  28      M    64    71    NA      NA      NA      90
8  8652  53      F    NA    74    89      95      NA      NA
9  6357  97      F    92    86    83      NA      86      NA
10 1943  45      M    95    NA    98      NA      72      NA

I would like to replace only the NAs in the test scores with the corresponding test.x score, while using str_replace. My actual data frame contain more than 3 columns but all the corresponding column names are the same with the ".x" afterwards.

Any ideas to make this quick and easy? I'm struggling between mutating across said columns or using replace_nas.

Upvotes: 2

Views: 63

Answers (2)

harre
harre

Reputation: 7307

Within dplyr we could use coalesce with across.

library(dplyr)

df |>
  mutate(across(starts_with("Test") & !ends_with(".x"),
                ~ coalesce(., get(paste0(cur_column(), ".x")))))

Output:

     ID Age Gender Test1 Test2 Test3 Test1.x Test2.x Test3.x
1  5022  90      M    94    68    79      NA      NA      79
2  1625  41      M    71    66    89      71      NA      NA
3  6438  86      M    86    94    94      86      NA      NA
4  3249  93      F    74    90    76      68      90      NA
5  7338  70      F    64    63    70      NA      NA      NA
6  9416  27      F    78    74    75      NA      NA      64
7  4374  45      F    82   100    60      NA      NA      60
8  6226  21      F    61    82    63      61      NA      NA
9  5265  97      M    83    83    68      NA      89      NA
10 5441  95      M    70    79    99      NA      79      NA

Update 2/9:

To allow for other variable names we could do a specific solution or a more general one:

Specific:

df |>
  mutate(across(c(HW, Exam, Final) & !ends_with(".x"),
                ~ coalesce(., get(paste0(cur_column(), ".x")))))

General:

df |>
  mutate(across(ends_with(".x"),
                ~ coalesce(get(sub("\\.x", "", cur_column())), .)))

New output:

     ID Age Gender HW Exam Final HW.x Exam.x Final.x
1  5166  80      F 60   79    NA   60     79      64
2  3375  35      M NA   88    72   65     88      72
3  5722  19      F NA   65    75   81     65      75
4  3701  27      M 89   NA    61   89     89      61
5  1424  67      F 69   94    91   69     94      91
6  1407  20      F 75   72    66   75     72      66
7  2927  39      M 63   82    NA   63     82      86
8  7315  90      F NA   92    79   70     92      79
9  7420  76      F 87   83    87   87     83      87
10 9334  73      F 86   NA    64   86     82      64

New data:

df = data.frame(ID = sample(1000:9999,10),
                Age = sample(18:99,10),
                Gender = sample(c("M","F"),10, replace = TRUE),              
                HW = sample(60:100,10),
                Exam = sample(60:100,10),
                Final = sample(60:100,10),
                HW.x = rep(NA,10),
                Exam.x = rep(NA,10),
                Final.x = rep(NA,10))

df$HW[c(2,3,8)] = NA
df$Exam[c(4,10)] = NA
df$Final[c(1,7)] = NA
df$HW.x[c(2,3,4,8)] = sample(60:100,4)
df$Exam.x[c(4,9,10)] = sample(60:100,3)
df$Final.x[c(1,6,7)] = sample(60:100,3)

Upvotes: 2

akrun
akrun

Reputation: 887951

Using dplyover

library(dplyover)
df <- df %>% 
   mutate(across2(matches("Test\\d+$"), ends_with(".x"),
       coalesce, .names = "{xcol}"))

-output

df
     ID Age Gender Test1 Test2 Test3 Test1.x Test2.x Test3.x
1  7877  40      M    78    70    84      NA      NA      84
2  6345  54      F    62    99    61      62      NA      NA
3  9170  41      F    82    80    96      82      NA      NA
4  2400  83      M   100    95   100      94      95      NA
5  5920  66      M    77    62    69      NA      NA      NA
6  2569  34      M    99    96    81      NA      NA     100
7  7879  28      M    64    71    90      NA      NA      90
8  8652  53      F    95    74    89      95      NA      NA
9  6357  97      F    92    86    83      NA      86      NA
10 1943  45      M    95    72    98      NA      72      NA

Upvotes: 1

Related Questions