Reputation: 1002
I have a data frame like this:
# Load dplyr for the mutate()/across() calls below.
# library() errors on a missing package; require() only returns FALSE.
library(dplyr)

# Example data: three x_ columns and two y_ columns of N(0, 1) draws.
x_1 <- rnorm(10, 0, 1)
x_2 <- rnorm(10, 0, 1)
x_3 <- rnorm(10, 0, 1)
y_1 <- rnorm(10, 0, 1)
y_2 <- rnorm(10, 0, 1)

# Build the data.frame directly from the vectors; wrapping cbind() in
# data.frame() is redundant and would coerce mixed types through a matrix.
data <- data.frame(x_1, x_2, x_3, y_1, y_2)

# Introduce missing values in the x_ columns (rows 1-2 of x_1, row 5 of x_2).
data[1, 1] <- NA
data[2, 1] <- NA
data[5, 2] <- NA
> data
x_1 x_2 x_3 y_1 y_2
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091
2 NA 0.2346621 1.09837343 0.3731092 0.6111779
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394
I need to create a new variable which counts the number of non missing values in each row for the variables starting with "x_". To do that I used mutate
and across
functions from dplyr
.
# NOTE(review): across() applies sum(is.na(.x)) to each selected COLUMN, so
# this produces per-column NA counts (recycled down every row), not a per-row
# count of non-missing values -- which is why the output looks wrong.
data=data %>% mutate(sum_no_miss=across(.cols = starts_with("x_"),~ sum(is.na(.x))))
I ran the code without getting an error, but I am not getting the output that I want. I am getting this:
Would it be possible to tell what I'm doing wrong?
Upvotes: 2
Views: 367
Reputation: 16856
Here is a possible data.table
solution, using Reduce
and lapply
on the specific columns selected via grep
.
library(data.table)
dt <- as.data.table(data)

# Row-wise count of non-NA values across the x_ columns.
# lapply() turns each selected column into a logical "observed" vector and
# Reduce(`+`, ...) adds those vectors element-wise, giving one count per row.
# "^x_" anchors the pattern so only columns that *start* with "x_" match
# (an unanchored "x_" would also catch names like "max_1").
dt[, num_obs := Reduce(`+`, lapply(.SD, \(x) !is.na(x))), .SDcols = grep("^x_", names(dt))]
Output
x_1 x_2 x_3 y_1 y_2 num_obs
1: NA 1.30228879 -0.586898083 -1.02679871 -0.9280488 2
2: NA -1.00846632 -0.260183508 -0.78828113 -0.8712957 2
3: -0.40475601 0.22961832 0.004414558 -1.04496673 -0.1032775 3
4: 0.09559518 -0.58875533 1.360528488 -0.48798151 -0.6350380 3
5: -0.39312997 NA 0.292025300 1.13544025 -0.2487097 2
6: -1.15802973 1.01589098 0.445829196 -0.02029337 0.9758154 3
7: -0.02524740 -0.17334510 -1.455821490 -0.12165396 -0.4441740 3
8: 0.93627901 -0.92913166 0.407038460 2.04054914 -0.8347571 3
9: 1.20218530 0.54453181 0.513222262 0.05571475 -0.4858128 3
10: 0.84765702 0.07472934 1.367745731 -1.49924113 -1.3170490 3
Upvotes: 1
Reputation: 78927
Here is a base R solution:
As akrun already provided the best answer, here is a different one using base R:
We use apply
with grep
to apply the function to specific columns:
# Per-row count of non-missing values among the x_ columns.
# "^x_" anchors the match to names that *start* with "x_", and
# drop = FALSE keeps a data.frame even if only one column matches.
data$sum_no_miss <- apply(data[, grep("^x_", names(data)), drop = FALSE], 1, function(x) sum(!is.na(x)))
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA -0.5659449 -1.44792814 0.1659370 0.8040186 2
2 NA 2.6873635 -0.70704189 1.2647756 -0.1238085 2
3 -0.3239291 0.6206436 0.04374401 -0.6476829 1.5228775 3
4 0.7245148 1.6632621 -0.39304104 -0.9305281 1.1328385 3
5 -0.5994830 NA 0.06037891 -1.7654617 0.3073035 2
6 -0.1848746 0.3694963 -1.13622715 0.9252195 0.1072250 3
7 -0.1147132 0.4042102 1.56730477 0.3262673 -0.6369951 3
8 -0.8631230 0.2888508 -2.20030009 -0.9873629 0.2561348 3
9 -0.9384460 -0.8739620 -1.59174131 0.7559146 -1.4229472 3
10 -0.9352575 1.3151532 -0.11439843 -0.5451860 0.9334084 3
Upvotes: 3
Reputation: 309
First, you're close.
Edit: OP fixed this I believe, so not a relevant comment. Second, your variables are coded in a misleading way - you should rename your variable sum_is_miss
to something like sum_no_miss
if you want readers to understand that you're calculating the number of non-missing values.
Third, this is how you could calculate the number of non-missing entries per row in your dataset first using apply then just adding this column to your dataset:
library(tidyverse)

# Reproducible example data: three x_ columns, two y_ columns, with NAs
# placed in rows 1-2 of x_1 and row 5 of x_2.
x_1 <- rnorm(10, 0, 1)
x_2 <- rnorm(10, 0, 1)
x_3 <- rnorm(10, 0, 1)
y_1 <- rnorm(10, 0, 1)
y_2 <- rnorm(10, 0, 1)
# Build the data.frame directly; data.frame(cbind(...)) is redundant.
data <- data.frame(x_1, x_2, x_3, y_1, y_2)
data[1, 1] <- NA
data[2, 1] <- NA
data[5, 2] <- NA

# Per-row count of non-missing values among the x_ columns:
# select the x_ columns, then apply sum(!is.na(...)) over rows (MARGIN = 1).
sum_no_miss_vec <- apply(
  data %>% dplyr::select(starts_with("x_")),
  MARGIN = 1,
  FUN = function(r) sum(!is.na(r))
)

# Attach the count as a new column and print the result.
data2 <- data %>% mutate(sum_no_miss = sum_no_miss_vec)
data2
#> x_1 x_2 x_3 y_1 y_2 sum_no_miss
#> 1 NA 0.74234418 1.06515091 -0.313359946 -0.81266805 2
#> 2 NA 2.13222122 0.78392737 2.109171065 0.69459821 2
#> 3 0.9322299 -0.52545325 0.67377319 2.025281430 0.99975832 3
#> 4 0.9634517 0.38985353 1.20940016 -0.007232240 -1.61104902 3
#> 5 0.4454230 NA 0.02420848 -1.743636503 -0.59597234 2
#> 6 -1.7305822 2.07163152 -0.52849895 0.830802138 -1.40573549 3
#> 7 0.2382603 0.20427098 0.22184048 0.806113977 -0.36726054 3
#> 8 -0.3972436 1.61183785 -0.26835072 0.419459671 -0.05723072 3
#> 9 0.3703195 -0.05354607 -1.19558014 -0.852003930 0.64032424 3
#> 10 0.3003434 -0.82513981 0.19782771 0.001526784 0.89393655 3
Created on 2022-04-15 by the reprex package (v2.0.1)
Upvotes: 3
Reputation: 887108
We may use rowSums
which is vectorized and efficient compared to rowwise
with sum
library(dplyr)

# rowSums() collapses the logical "not missing" matrix one row at a time,
# so the whole count is vectorised -- no rowwise() grouping is needed.
mutate(data, sum_no_miss = rowSums(!is.na(select(data, starts_with("x_")))))
-output
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091 2
2 NA 0.2346621 1.09837343 0.3731092 0.6111779 2
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358 3
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099 3
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381 2
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722 3
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137 3
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816 3
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178 3
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394 3
If we want to use sum
, then need rowwise
# sum() is not vectorised over rows, so per-row evaluation needs rowwise()
# grouping; c_across() gathers the x_ values within the current row.
data %>%
  rowwise() %>%
  mutate(sum_no_miss = {
    xs <- c_across(starts_with("x_"))
    # total x_ columns minus the missing ones = non-missing count
    length(xs) - sum(is.na(xs))
  }) %>%
  ungroup()
-output
# A tibble: 10 × 6
x_1 x_2 x_3 y_1 y_2 sum_no_miss
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 NA 0.927 0.294 -1.79 1.66 2
2 NA 0.235 1.10 0.373 0.611 2
3 0.732 -0.558 -0.0852 -2.87 1.15 3
4 -0.947 0.693 -2.67 0.639 -0.511 3
5 1.54 NA 1.33 -0.558 -2.57 2
6 -0.285 -0.953 0.0375 -1.01 0.393 3
7 -1.38 -2.19 -0.0304 0.396 -1.53 3
8 -0.945 0.267 0.652 0.431 -1.53 3
9 -0.902 0.207 0.109 -1.17 -0.489 3
10 -0.974 -0.809 0.641 0.606 0.725 3
In the OP's code, the function sum
is used within across
, and across
loops over each column; thus the sum
is computed within each column (here, the per-column count of NA values) instead of across a row.
Upvotes: 4