Nahuel Patiño
Nahuel Patiño

Reputation: 89

Iterate over columns with NAs to create percentile variables with dplyr and data.table

I need quite a simple thing. To iterate over columns of a dataset to create percentil versions of said columns. I tried with dplyr and data.table but none seem to do what I need. Particulary, I need to exclude de NA values when creating the percentile versions of the columns.

Reproducible example below:

values<-c(19,
    6,
    27,
    63,
    50,
    59,
    97,
    89,
    NA,
    9,
    31,
    58,
    83,
    2,
    1,
    31,
    3,
    1,
    27,
    40,
    32,
    42,
    99,
    NA,
    12,
    16,
    23,
    98,
    44,
    25,
    13,
    70,
    64,
    NA,
    37,
    75,
    73,
    59,
    21,
    3,
    76,
    43,
    6,
    96,
    55,
    48,
    70,
    90,
    18,
    58,
    22,
    19,
    26,
    49,
    59,
    94,
    31,
    45,
    20,
    8,
    26,
    56,
    7,
    11,
    98,
    50,
    41,
    38,
    86,
    0,
    37,
    NA,
    40,
    7,
    88,
    38,
    41,
    41,
    19,
    34,
    21,
    64,
    87,
    22,
    54,
    39,
    75,
    72,
    91,
    78)

values2<- c(98,
            60,
            9,
            98,
            NA,
            88,
            NA,
            54,
            92,
            90,
            NA,
            83,
            92,
            65,
            44,
            NA,
            98,
            40,
            26,
            40,
            54,
            56,
            15,
            90,
            15,
            63,
            57,
            NA,
            85,
            69,
            73,
            43,
            24,
            27,
            82,
            75,
            29,
            98,
            29,
            5,
            91,
            88,
            28,
            12,
            53,
            NA,
            2,
            42,
            86,
            2,
            78,
            20,
            50,
            73,
            77,
            NA,
            4,
            39,
            90,
            NA,
            29,
            14,
            98,
            88,
            77,
            79,
            30,
            9,
            74,
            93,
            NA,
            16,
            27,
            16,
            18,
            40,
            NA,
            2,
            66,
            71,
            82,
            10,
            62,
            84,
            25,
            NA,
            15,
            12,
            85,
            50)


  groups<-c(1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2)

df<-as.data.frame(cbind(groups,values,values2))


library(dplyr)

for (i in c("values","values2")) {
    df<-df %>%
      group_by(groups) %>% 
      mutate(!!sym(paste( i,"_percentile", sep="")) := percent_rank(na.omit(i)))
  }


for (i in c("values","values2")) {
  df<-df %>%
    group_by(groups) %>% 
    mutate(!!sym(paste( i,"_percentile", sep="")) := rank(i)/length(i) )
}

library(data.table)

df<- as.data.table(df)

for (i in c("values","values2")) {
  df[,  paste(i,"_percentile",sep="")  := rank(get(i))/length( get(i)), by = groups ]

}


for (i in c("values","values2")) {
  df[!is.na(i),  paste(i,"_percentile",sep="")  := rank(get(i))/length( get(i)), by = groups ]

}

Upvotes: 1

Views: 277

Answers (2)

ravic_
ravic_

Reputation: 1831

My tidyverse answer has the same structure as @akrun's -- using mutate_at to add multiple columns, starts_with to select the columns. A few things worth pointing out with the more minimal example:

  1. The percent_rank function already removes NA's when it calculates, so you don't have to do the additional work to filter them out of the calc.
  2. There is one degenerate case where there's only one actual measure. (In my case, it's group "b"). percent_rank can return a NaN value there because it's scaling the min_rank. Inside the direct mutate_at, that issue seems to be avoided. (It's unclear what value that should be assigned to in your case).
  3. There's another sort-of degenerate case when there's a tie. In group "a", I have a tie for first place, and the percent_rank's are accordingly not 1.0.
library(tidyverse)
df <- tribble(
  ~groups, ~values1, ~values2,
  "a", 1, 10,
  "a", 2, 10,
  "a", NA, 8,
  "a", 3, 9,
  "a", 4, 7,
  "b", NA, 10,
  "b", 2, NA,
  "b", NA, 8
)

df %>%
  group_by(groups) %>%
  mutate_at(
    vars(starts_with("values")),
    list(percentile = ~ percent_rank(.))) 
#> # A tibble: 8 x 5
#> # Groups:   groups [2]
#>   groups values1 values2 values1_percentile values2_percentile
#>   <chr>    <dbl>   <dbl>              <dbl>              <dbl>
#> 1 a            1      10              0                   0.75
#> 2 a            2      10              0.333               0.75
#> 3 a           NA       8             NA                   0.25
#> 4 a            3       9              0.667               0.5 
#> 5 a            4       7              1                   0   
#> 6 b           NA      10             NA                   1   
#> 7 b            2      NA              0                  NA   
#> 8 b           NA       8             NA                   0

Upvotes: 0

akrun
akrun

Reputation: 887108

An option is mutate_at. After grouping by 'groups', use mutate_at to loop over the columns that starts_with ('values') as column name, replace, the values where the values are not NA with the percent_rank of the non-NA elements

library(dplyr)
df %>% 
   group_by(groups) %>%
   mutate_at(vars(starts_with('values')), 
         list(percentile = ~ replace(., !is.na(.), percent_rank(.[!is.na(.)]))))

Or with data.table

library(data.table)
nm1 <- paste(names(df1)[2:3], "_percentile")
setDT(df)[, (nm1) := lapply(.SD, function(x) replace(x, !is.na(x), 
      frank(x[!is.na(x)])/sum(!is.na(x)))), .SDcols = 2:3, by = groups]

Upvotes: 1

Related Questions