Sandhya Ghildiyal
Sandhya Ghildiyal

Reputation: 273

Function using columns in arguments of a dataframe in R

I am trying to create a function with one of column of my dataframe outplacement, because similar calculations I would need for other columns also.

numeric_fun<-function(dataset,grp_var,var){

  require("dplyr")

    dataset%>%select(grp_var,var)%>% group_by_(grp_var)%>%
  summarize(
    'q25' = quantile(var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
    'median' =round(quantile(var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],0),
    'avg' = round( mean(var, na.rm=TRUE), 0),
    'q75' = quantile(var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4] ,
    'n' = sum(!is.na(var))
  )%>% 
  mutate(
    q25 = ifelse( n < 5, "--", paste(q25,"%" )),
    median = ifelse( n < 5, "--", paste(median,"%")),
    avg = ifelse(n < 5, "--", paste(avg,"%")),
    q75 = ifelse( n < 5, "--", paste(q75,"%")),
    n = n
  ) %>% 
  rename( "Industry"=grp_var,
          "25%" = q25,
          "75%" = q75
  )

}

numeric_fun(outplacement,"Q7_1","Q8")

But this ones is not working, any other efficient ways?

output of dput

structure(list(Q7_1 = structure(c(NA, NA, NA, 5L, 5L, 14L, NA, 
1L, 9L, 13L, 1L, NA, 3L, 13L, 13L, 13L, 12L, 2L, 11L, 13L, 10L, 
11L, NA, 1L, 4L, NA, 5L, NA, 5L, 4L, 5L, 9L, 2L, 4L, 13L, 10L, 
13L, 13L, NA, 11L, NA, 1L, 11L, NA, 5L, NA, 1L, NA, 9L, 3L, 1L, 
1L, 10L, 1L, NA, 5L, NA, NA, 2L, NA, 6L, 6L, NA, 13L, 14L, NA, 
NA, 14L, 8L, 11L, 11L, 8L, 12L, 13L, NA, 3L, 11L, 3L, 11L, 1L, 
NA, 9L, NA, 10L, 6L, 1L, 5L, 3L, 1L, 13L, 4L, 14L, NA, 13L), .Label = c("Banking/Financial Services", 
"Chemicals", "Consumer Goods", "Energy", "High Tech", "Insurance/Reinsurance", 
"Life Sciences", "Logistics", "Mining & Metals", "Other Manufacturing", 
"Other Non-Manufacturing", "Retail & Wholesale", "Services (Non-Financial)", "Transportation Equipment"), class = "factor"), Q8 = c(NA, NA, 
NA, NA, NA, NA, NA, 2L, 3L, 3L, 6L, NA, 5L, 4L, 2L, 5L, 6L, 2L, 
2L, 3L, 2L, 5L, NA, 3L, 1L, NA, 3L, NA, 1L, 3L, 4L, 4L, 2L, 4L, 
1L, 3L, 2L, 3L, NA, 2L, NA, 4L, 4L, NA, 1L, NA, 3L, NA, 1L, 3L, 
5L, 2L, 3L, 1L, NA, 6L, NA, NA, 4L, NA, 1L, 5L, NA, 2L, 1L, NA, 
NA, 2L, 6L, 6L, 2L, 6L, 3L, 5L, NA, 5L, 2L, 1L, 3L, 3L, NA, 3L, 
NA, 3L, 3L, 6L, 4L, 1L, 4L, 6L, 3L, 5L, NA, 5L), Q9 = c(3L, 1L, 
NA, 1L, 3L, 3L, NA, 3L, 3L, 1L, 1L, NA, 3L, 2L, 2L, 3L, 2L, 3L, 
2L, 2L, 2L, 1L, NA, 3L, 1L, NA, 1L, NA, 1L, 2L, 1L, 2L, 3L, 1L, 
1L, 1L, 3L, 3L, NA, 3L, NA, 2L, 2L, NA, 1L, NA, 1L, NA, 1L, 2L, 
2L, 1L, 2L, 3L, NA, 1L, NA, NA, 2L, NA, 2L, 2L, NA, 2L, 2L, NA, 
NA, 1L, 3L, 1L, 3L, 3L, 1L, 3L, NA, 1L, 3L, 1L, 1L, 3L, NA, 1L, 
NA, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 1L, NA, 2L), Q10 = c(NA, 
1L, NA, 1L, NA, NA, NA, NA, NA, 1L, 1L, NA, NA, 1L, 2L, NA, 1L, 
NA, 1L, 1L, 2L, 2L, NA, NA, 2L, NA, 2L, NA, 2L, 1L, NA, 1L, NA, 
1L, 1L, 1L, NA, NA, NA, NA, NA, 2L, 1L, NA, 1L, NA, 2L, NA, 2L, 
2L, 2L, 1L, 2L, 2L, NA, 1L, NA, NA, 2L, NA, 2L, 1L, NA, 1L, 2L, 
NA, NA, 1L, 1L, NA, 1L, NA, NA, 2L, NA, NA, 1L, 1L, 1L, 2L, NA, 
1L, NA, 1L, 2L, 2L, 1L, 1L, NA, 1L, NA, 2L, NA, 1L)), row.names = c(NA, 
-94L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 
0x0000000000090788>, .Names = c("Q7_1", 
"Q8", "Q9", "Q10"))

Upvotes: 1

Views: 249

Answers (2)

G. Grothendieck
G. Grothendieck

Reputation: 270195

The problem is that the code was using character strings in inappropriate contexts. We can use sym and !! from the rlang package to translate them. Add the statements marked with ##and then use !!grp_var and !!var in place of grp_var and var everywhere. Also changed group_by_ to group_by, reformatted, changed the require to library (see next paragraph) and added library statements for data.table and rlang.

Note that library is preferable to require unless within an if. That way if the package is missing it will fail at the library statement making the cause obvious. On the other hand with require it will fail further down making it harder to debug.

library(data.table) ##
library(dplyr)


numeric_fun <- function(dataset, grp_var, var) {

  grp_var <- sym(grp_var)
  var <- sym(var)

  dataset %>%
    select(!!grp_var,!!var) %>% 
    group_by(!!grp_var) %>%
    summarize(
      'q25' = quantile(!!var, type=6, probs = seq(0, 1, 0.25), na.rm = TRUE)[2],
      'median' = round(quantile(!!var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],0),
      'avg' = round( mean(!!var, na.rm = TRUE), 0),
      'q75' = quantile(!!var, type=6, probs = seq(0, 1, 0.25), na.rm = TRUE)[4] ,
      'n' = sum(!is.na(!!var))
    ) %>% 
    mutate(
      q25 = ifelse( n < 5, "--", paste(q25, "%" )),
      median = ifelse( n < 5, "--", paste(median, "%")),
      avg = ifelse(n < 5, "--", paste(avg, "%")),
      q75 = ifelse( n < 5, "--", paste(q75, "%")),
      n = n
    ) %>% 
    rename( "Industry" = !!grp_var,
            "25%" = q25,
            "75%" = q75
    )

}

numeric_fun(outplacement,"Q7_1","Q8")

giving:

# A tibble: 14 x 6
   Industry                   `25%` median avg   `75%`      n
   <fctr>                     <chr> <chr>  <chr> <chr>  <int>
 1 Banking/Financial Services 2 %   3 %    4 %   5 %       11
 2 Chemicals                  --    --     --    --         3
 3 Consumer Goods             1 %   3 %    3 %   5 %        5
 4 Energy                     --    --     --    --         4
 5 High Tech                  1 %   4 %    3 %   4.5 %      6
 6 Insurance/Reinsurance      --    --     --    --         3
 7 Logistics                  --    --     --    --         2
 8 Mining & Metals            --    --     --    --         4
 9 Other Manufacturing        --    --     --    --         4
10 Other Non-Manufacturing    2 %   2 %    3 %   4.75 %     8
11 Retail & Wholesale         --    --     --    --         2
12 Services (Non-Financial)   2 %   3 %    3 %   5 %       12
13 Transportation Equipment   --    --     --    --         3
14 <NA>                       --    --     --    --         0

Note

dput does not work with objects having external pointers such as data.table objects so we used this:

outplacement <- 
structure(list(Q7_1 = structure(c(NA, NA, NA, 5L, 5L, 14L, NA, 
1L, 9L, 13L, 1L, NA, 3L, 13L, 13L, 13L, 12L, 2L, 11L, 13L, 10L, 
11L, NA, 1L, 4L, NA, 5L, NA, 5L, 4L, 5L, 9L, 2L, 4L, 13L, 10L, 
13L, 13L, NA, 11L, NA, 1L, 11L, NA, 5L, NA, 1L, NA, 9L, 3L, 1L, 
1L, 10L, 1L, NA, 5L, NA, NA, 2L, NA, 6L, 6L, NA, 13L, 14L, NA, 
NA, 14L, 8L, 11L, 11L, 8L, 12L, 13L, NA, 3L, 11L, 3L, 11L, 1L, 
NA, 9L, NA, 10L, 6L, 1L, 5L, 3L, 1L, 13L, 4L, 14L, NA, 13L), .Label = c("Banking/Financial Services", 
"Chemicals", "Consumer Goods", "Energy", "High Tech", "Insurance/Reinsurance", 
"Life Sciences", "Logistics", "Mining & Metals", "Other Manufacturing", 
"Other Non-Manufacturing", "Retail & Wholesale", "Services (Non-Financial)", "Transportation Equipment"), class = "factor"), Q8 = c(NA, NA, 
NA, NA, NA, NA, NA, 2L, 3L, 3L, 6L, NA, 5L, 4L, 2L, 5L, 6L, 2L, 
2L, 3L, 2L, 5L, NA, 3L, 1L, NA, 3L, NA, 1L, 3L, 4L, 4L, 2L, 4L, 
1L, 3L, 2L, 3L, NA, 2L, NA, 4L, 4L, NA, 1L, NA, 3L, NA, 1L, 3L, 
5L, 2L, 3L, 1L, NA, 6L, NA, NA, 4L, NA, 1L, 5L, NA, 2L, 1L, NA, 
NA, 2L, 6L, 6L, 2L, 6L, 3L, 5L, NA, 5L, 2L, 1L, 3L, 3L, NA, 3L, 
NA, 3L, 3L, 6L, 4L, 1L, 4L, 6L, 3L, 5L, NA, 5L), Q9 = c(3L, 1L, 
NA, 1L, 3L, 3L, NA, 3L, 3L, 1L, 1L, NA, 3L, 2L, 2L, 3L, 2L, 3L, 
2L, 2L, 2L, 1L, NA, 3L, 1L, NA, 1L, NA, 1L, 2L, 1L, 2L, 3L, 1L, 
1L, 1L, 3L, 3L, NA, 3L, NA, 2L, 2L, NA, 1L, NA, 1L, NA, 1L, 2L, 
2L, 1L, 2L, 3L, NA, 1L, NA, NA, 2L, NA, 2L, 2L, NA, 2L, 2L, NA, 
NA, 1L, 3L, 1L, 3L, 3L, 1L, 3L, NA, 1L, 3L, 1L, 1L, 3L, NA, 1L, 
NA, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 1L, NA, 2L), Q10 = c(NA, 
1L, NA, 1L, NA, NA, NA, NA, NA, 1L, 1L, NA, NA, 1L, 2L, NA, 1L, 
NA, 1L, 1L, 2L, 2L, NA, NA, 2L, NA, 2L, NA, 2L, 1L, NA, 1L, NA, 
1L, 1L, 1L, NA, NA, NA, NA, NA, 2L, 1L, NA, 1L, NA, 2L, NA, 2L, 
2L, 2L, 1L, 2L, 2L, NA, 1L, NA, NA, 2L, NA, 2L, 1L, NA, 1L, 2L, 
NA, NA, 1L, 1L, NA, 1L, NA, NA, 2L, NA, NA, 1L, 1L, 1L, 2L, NA, 
1L, NA, 1L, 2L, 2L, 1L, 1L, NA, 1L, NA, 2L, NA, 1L)), row.names = c(NA, 
-94L), class = "data.frame", .Names = c("Q7_1", "Q8", "Q9", "Q10"))

library(data.table)
outplacement <- as.data.table(outplacement)

Upvotes: 2

akrun
akrun

Reputation: 887851

Using the enquo/!! from dplyr we can remodel the function as

library(dplyr)
numeric_fun<-function(dataset,grp_var,var){

      grp_var <- enquo(grp_var)
      var <- enquo(var)


    dataset %>%
          select(!! grp_var, !!var) %>%
          group_by(!! grp_var) %>%
          summarise(
               q25 = quantile(!! var,  type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
           median =round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],0),
               avg = round( mean(!! var, na.rm=TRUE), 0),
               q75 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4] ,
               n = sum(!is.na(!!var))) %>%

         mutate(
              q25 = ifelse( n < 5, "--", paste0(q25,"%" )),
              median = ifelse( n < 5, "--", paste0(median,"%")),
              avg = ifelse(n < 5, "--", paste0(avg,"%")),
              q75 = ifelse( n < 5, "--", paste0(q75,"%"))

          ) %>% 
        rename(Industry= !!grp_var,
          `25%` = q25,
          `75%` = q75
  )



}

-run the function

numeric_fun(df1, Q7_1, Q8)
# A tibble: 14 x 6
#   Industry                   `25%` median avg   `75%`     n
#   <fctr>                     <chr> <chr>  <chr> <chr> <int>
# 1 Banking/Financial Services 2%    3%     4%    5%       11
# 2 Chemicals                  --    --     --    --        3
# 3 Consumer Goods             1%    3%     3%    5%        5
# 4 Energy                     --    --     --    --        4
# 5 High Tech                  1%    4%     3%    4.5%      6
# 6 Insurance/Reinsurance      --    --     --    --        3
# 7 Logistics                  --    --     --    --        2
# 8 Mining & Metals            --    --     --    --        4
# 9 Other Manufacturing        --    --     --    --        4
#10 Other Non-Manufacturing    2%    2%     3%    4.75%     8
#11 Retail & Wholesale         --    --     --    --        2
#12 Services (Non-Financial)   2%    3%     3%    5%       12
#13 Transportation Equipment   --    --     --    --        3
#14 <NA>                       --    --     --    --        0

Upvotes: 2

Related Questions