Reputation: 153

Count NA in given columns by rows

I would like to count the NA values in selected columns for each row and save the result in a new column. I would like to achieve this with the mutate() function from dplyr.

How it should work:

loop for each row i in test{
test$SUM <-sum(is.na(test[i,1:2]))
test$SUM2 <-sum(is.na(test[i,3:4]))
test$SUM3 <-sum(is.na(test[i,5:6]))
}

Data used:

# Sample data: six numeric columns (BIEZ_01 .. BIEZ_06), six rows, with NA
# values scattered across the odd-numbered columns and BIEZ_06.
test <- data.frame(
  BIEZ_01 = c(59000, 61462,    NA, 33000, 30840, 36612),
  BIEZ_02 = c( 5060, 55401, 33000, 33000, 30840, 28884),
  BIEZ_03 = c(   NA, 60783, 20000, 20000,    NA, 19248),
  BIEZ_04 = c(22100, 59885, 15000, 15000, 20840, 10000),
  BIEZ_05 = c(   NA, 59209, 15000, 15000, 20840,    NA),
  BIEZ_06 = c( 4400,  6109,    NA,   500, 10840, 10000)
)

Upvotes: 4

Answers (3)

IceCreamToucan

Reputation: 28685

Another option

# Count NAs per row within each consecutive pair of columns of `test`.
# For the sample data this gives a 6 x 3 matrix: one row per row of `test`,
# one column per pair (columns named "1", "2", "3" after the pair index).
NA.counts <- sapply(
  # Column positions grouped in pairs: 1,2 -> "1"; 3,4 -> "2"; and so on.
  # seq_len() avoids the c(1, 0) sequence that seq(0) would produce.
  split(seq_len(ncol(test)), ceiling(seq_len(ncol(test)) / 2)),
  # List-style indexing test[x] always returns a data frame, so rowSums()
  # still works when a group holds a single column (odd column count);
  # test[, x] would collapse that case to a plain vector.
  function(x) rowSums(is.na(test[x]))
)

If you want to use tidyverse to add columns you can do

library(tidyverse)
# Append one NA-count column per consecutive pair of columns.
# cbind() on a data frame expands the named list returned by map() into the
# columns NA.counts.1, NA.counts.2, ...
test %>%
  cbind(NA.counts = map(
    # Column positions grouped in pairs: 1,2 -> "1"; 3,4 -> "2"; and so on.
    seq_len(ncol(test)) %>% split(ceiling(. / 2)),
    # test[.] keeps the data-frame shape even for a one-column group, which
    # test[, .] would collapse to a vector that rowSums() rejects.
    ~ rowSums(is.na(test[.]))
  ))


#   BIEZ_01 BIEZ_02 BIEZ_03 BIEZ_04 BIEZ_05 BIEZ_06 NA.counts.1 NA.counts.2 NA.counts.3
# 1   59000    5060      NA   22100      NA    4400           0           1           1
# 2   61462   55401   60783   59885   59209    6109           0           0           0
# 3      NA   33000   20000   15000   15000      NA           1           0           1
# 4   33000   33000   20000   15000   15000     500           0           0           0
# 5   30840   30840      NA   20840   20840   10840           0           1           0
# 6   36612   28884   19248   10000      NA   10000           0           0           1

As @Moody_Mudskipper points out, cbind isn't necessary if you want to modify the dataframe. You can add the columns with

# Add the per-pair NA counts to `test` in place as SUM1, SUM2, ...
# The number of names uses ceiling() so it always equals the number of column
# groups produced by split(), including when ncol(test) is odd.
# Run this once: a second run would also pair up the SUM columns just added.
test[paste0("SUM", seq_len(ceiling(ncol(test) / 2)))] <- map(
  # Column positions grouped in pairs: 1,2 -> "1"; 3,4 -> "2"; and so on.
  seq_len(ncol(test)) %>% split(ceiling(. / 2)),
  # Per-row NA count within the current group of columns.
  ~ rowSums(is.na(test[.]))
)

Upvotes: 1

moodymudskipper

Reputation: 47310

# Append "SUM 1", "SUM 2", ... holding the per-row NA count of each
# consecutive pair of columns.
# The column count is captured up front because `test` grows inside the loop.
n_cols <- ncol(test)
for (g in seq_len(ceiling(n_cols / 2))) {
  # Columns 2g-1 and 2g; the last group is a single column when n_cols is odd.
  cols <- (2 * g - 1):min(2 * g, n_cols)
  # paste() with its default separator gives the names "SUM 1", "SUM 2", ...
  test[[paste('SUM', g)]] <- rowSums(is.na(test[cols]))
}
#   BIEZ_01 BIEZ_02 BIEZ_03 BIEZ_04 BIEZ_05 BIEZ_06 SUM 1 SUM 2 SUM 3
# 1   59000    5060      NA   22100      NA    4400     0     1     1
# 2   61462   55401   60783   59885   59209    6109     0     0     0
# 3      NA   33000   20000   15000   15000      NA     1     0     1
# 4   33000   33000   20000   15000   15000     500     0     0     0
# 5   30840   30840      NA   20840   20840   10840     0     1     0
# 6   36612   28884   19248   10000      NA   10000     0     0     1

This is somewhat "tidy" :

library(tidyverse)

# For each consecutive pair of columns, build a small data frame holding the
# pair plus its per-row NA count (SUM1, SUM2, ...), then bind the pieces side
# by side so every SUM column sits right after the columns it summarises.
split(seq_len(ncol(test)), (seq_len(ncol(test)) - 1) %/% 2 + 1) %>%
  imap(function(cols, grp) {
    # `cols` are the column positions of this group, `grp` is its name
    # ("1", "2", ...), used as the suffix of the new column.
    piece <- test[cols]
    # Plain assignment replaces mutate_at() on a column that does not exist
    # yet: that call only worked because the supplied function ignored its
    # argument, and the scoped verbs are superseded in current dplyr.
    piece[[paste0("SUM", grp)]] <- rowSums(is.na(piece))
    piece
  }) %>%
  bind_cols()

#   BIEZ_01 BIEZ_02 SUM1 BIEZ_03 BIEZ_04 SUM2 BIEZ_05 BIEZ_06 SUM3
# 1   59000    5060    0      NA   22100    1      NA    4400    1
# 2   61462   55401    0   60783   59885    0   59209    6109    0
# 3      NA   33000    1   20000   15000    0   15000      NA    1
# 4   33000   33000    0   20000   15000    0   15000     500    0
# 5   30840   30840    0      NA   20840    1   20840   10840    0
# 6   36612   28884    0   19248   10000    0      NA   10000    1

And this would be the really tidy version :

# Long-format approach: reshape to long, count NAs per (row, column pair),
# reshape back to wide and attach the counts to the original data.
test %>%
  # Keep an explicit row identifier so counts can be matched back to rows.
  rowid_to_column("rowid") %>%
  # Stack every column except the first (rowid) into key/value pairs; the
  # later steps refer to the resulting columns as `key` and `value`.
  gather(,,-1) %>%
  # Number the distinct keys and halve, so two consecutive keys share one id.
  # NOTE(review): presumably relies on group_indices() numbering keys in
  # sorted order, which matches column order for BIEZ_01..BIEZ_06 - confirm
  # before using with other column names.
  mutate(SUM = ceiling(group_indices(.,key)/2)) %>%
  # One NA count per row and column pair.
  group_by(rowid,SUM) %>%
  summarize(sum_val = sum(is.na(value))) %>%
  ungroup %>%
  # Back to wide format, one column per SUM id; with sep="" the columns come
  # out as SUM1, SUM2, SUM3 (see the printed result below).
  spread(SUM,sum_val,sep="") %>%
  # Drop the first column (rowid) ...
  select(-1) %>%
  # ... and place the counts next to the original columns.
  bind_cols(test,.)

#   BIEZ_01 BIEZ_02 BIEZ_03 BIEZ_04 BIEZ_05 BIEZ_06 SUM1 SUM2 SUM3
# 1   59000    5060      NA   22100      NA    4400    0    1    1
# 2   61462   55401   60783   59885   59209    6109    0    0    0
# 3      NA   33000   20000   15000   15000      NA    1    0    1
# 4   33000   33000   20000   15000   15000     500    0    0    0
# 5   30840   30840      NA   20840   20840   10840    0    1    0
# 6   36612   28884   19248   10000      NA   10000    0    0    1

I also tried to use nest() to group the columns in pairs, with the idea of using map_dfc() on the nested result to mutate the new columns, but I got stuck trying to use reduce() with nest() because of the non-standard evaluation of the .key parameter... This would have been a bit shorter and more readable.

Upvotes: 1

tyumru

Reputation: 428

Here is a solution using the apply() function:

# Row-wise variant: apply() hands each row of `test` to the function as a
# vector, and the three pair counts come back as one named vector per row.
NA_counts <- apply(test, 1, function(row) {
  is_missing <- is.na(row)
  c(
    SUM1 = sum(is_missing[1:2]),
    SUM2 = sum(is_missing[3:4]),
    SUM3 = sum(is_missing[5:6])
  )
})
# apply() lays the per-row results out as columns, hence the transpose
# before binding them to the original data.
cbind(test, t(NA_counts))

Upvotes: 1

Count NA in given columns by rows

Answers (3)

Related Questions