banky banks
banky banks

Reputation: 49

Summarising using an specific filter

I'm trying to count the amount of rows that fulfill a given condition based on a variable. My code is the following:

CD_2018%>%summarise(twenty = n([porcentaje_acumulado_clientes<0.2]),
                                             forty = n([porcentaje_acumulado_clientes<0.4]),
                                             sixty = n([porcentaje_acumulado_clientes<0.6]),
                                             eighty = n([porcentaje_acumulado_clientes<0.8])
                                             )

However, this seems not to work. I also tried using n(id[porcentaje_acumulado_clientes<0.2]), but it didnt work. I couldnt find something similar using n() in the other questions.

Data:

CD_2018 <- structure(list(Codigo_Cliente = c(54688L, 28854L, 71951L, 222564L, 
375419L, 66151L), Segmento = structure(c(5L, 5L, 3L, 5L, 3L, 
3L), .Label = c("Clasico", "Emergente", "Mi_Negocio", "Preferencial", 
"Prestige"), class = "factor"), Sal_Cons_TC_2018 = c(NA, NA, 
38820.8212, NA, NA, 277874.1813), Sal_Cons_TC_2019 = c(NA, NA, 
98899.6049, NA, NA, 284103.8359), Sal_Cons_CC_2018 = c(NA, 5932867.82, 
NA, NA, NA, NA), Sal_Cons_CC_2019 = c(NA, 2891443.57, NA, NA, 
NA, NA), Sal_Cons_CA_2018 = c(332429778.9279, 6119225.3163, 24548153.6145, 
11366326.5221, 7590637.6539, 7283427.2443), Sal_Cons_CA_2019 = c(299599510.9669, 
8183323.8751, 14419748.2394, 7848030.4762, 971111.4943, 3815088.7069
), Sal_Cons_CD_2018 = c(235849719.6407, 235830511.2803, 175709800, 
174190782.1112, 134828233.5874, 125507000), Sal_Cons_CD_2019 = c(251984964.1521, 
267979439.471, 105804400, 173706600, NA, 185157700), Sal_Cons_NE_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_NE_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Veh_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Veh_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Tie_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Tie_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Hip_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Hip_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Com_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Com_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Per_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_Per_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_LC_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Sal_Cons_PR_LC_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Prom_CC_2018 = c(NA, 
4955113.05, NA, NA, NA, NA), Prom_CC_2019 = c(NA, 4395887.25, 
NA, NA, NA, NA), Prom_CA_2018 = c(6989118, 131813.59, 1343168.3, 
42440.5, 581442.72, 133269.29), Prom_CA_2019 = c(5655918.06, 
154901.35, 97210.92, 872286.48, 467481.65, 85932.79), Prom_NE_2018 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Prom_NE_2019 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Tipo_Cliente_2018 = structure(c(2L, 
2L, 2L, 2L, 2L, 2L), .Label = c("Nuevo", "Viejo"), class = "factor"), 
    Tipo_Cliente_2019 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Nuevo", 
    "Viejo"), class = "factor"), cum = c(235849719.6407, 471680230.921, 
    647390030.921, 821580813.0322, 956409046.6196, 1081916046.6196
    ), ones = c(1, 1, 1, 1, 1, 1), clientes = c(1, 2, 3, 4, 5, 
    6), porcentaje_acumulado_CD = c(1.7023796894358, 3.4046207315895, 
    4.67290629584847, 5.93022748327579, 6.90342705617062, 7.8093453163556
    ), porcentaje_acumulado_clientes = c(0.0209292591042277, 
    0.0418585182084554, 0.0627877773126831, 0.0837170364169108, 
    0.104646295521139, 0.125575554625366)), row.names = c(NA, 
-6L), groups = structure(list(Codigo_Cliente = c(28854L, 54688L, 
66151L, 71951L, 222564L, 375419L), Segmento = structure(c(5L, 
5L, 3L, 3L, 5L, 3L), .Label = c("Clasico", "Emergente", "Mi_Negocio", 
"Preferencial", "Prestige"), class = "factor"), .rows = list(
    2L, 1L, 6L, 3L, 4L, 5L)), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

Upvotes: 0

Views: 34

Answers (2)

akrun
akrun

Reputation: 887128

We can also use cut to create a group variable and then do the group by n()

library(dplyr)
CD_2018 %>%
       group_by(grp = cut(porcentaje_acumulado_clientes, 
          breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8))) %>%
       summarise(n = n())

Upvotes: 1

Gregor Thomas
Gregor Thomas

Reputation: 145775

Change n to sum and delete brackets:

CD_2018 %>% summarise(
  twenty = sum(porcentaje_acumulado_clientes < 0.2),
  forty  = sum(porcentaje_acumulado_clientes < 0.4),
  sixty  = sum(porcentaje_acumulado_clientes < 0.6),
  eighty = sum(porcentaje_acumulado_clientes < 0.8)
)
# # A tibble: 1 x 4
#   twenty forty sixty eighty
#    <int> <int> <int>  <int>
# 1      6     6     6      6

Upvotes: 1

Related Questions