Is there a function to get counts in multiple columns from multiple datasets?

Question

I have 2 columns of postal codes. One represents my orders, and the other represents reported issues with these orders, both are in separate data sets.

I have a Postal Code column in my orders dataset:

B0E1H0
B3M0G4
B3K6R6
B3L1J7
B0E1H0
B3K3M2
B3K2Z8
B0E1H0
B3K6R6
B0E1H0

I have a postal code column in my reported issues dataset:

B3K6R6
B3K6R6
B0E1H0
B0E1H0
B3L1J7

I would like to end up with a data frame that gives me a list of unique postal codes, the count of volume, the count of issue, and the proportion of issues for each postal code, so something like this:

Postal code, Volume, Issues, Issue %
BOE1H0, 4, 2, 50%
B3K2Z8, 1, 0, 0%
B3K3M2, 1, 0, 0%
B3K6R6, 2, 2, 100%
B3L1J7, 1, 1, 100%
B3M0G4, 1, 0, 0%

I was able to get the 1st 2 rows by doing something like this:

    orders <- read.csv("G:\My Drive\R\R Data\Stuff\Text File\Orders.csv", header = TRUE)
pcvec <- as.vector(orders["Postal.Code"])
unipc <- unique(pcvec,incomparables = F)
unipcvec <- as.vector(unipc)
pccount <- count(orders, "Postal.Code")
nrow(unipc)
x <- data.frame(pccount)
x <- rename(x, c("freq" = "Volume"))
x

    Postal.Code Volume
1        B0C1H0      1
2        B0E1B0      3
3        B0E1H0      7
4        B0E1L0      1
5        B0E1N0      1
6        B0E1P0      1
7        B0E1V0      1
8        B0E1W0      1
9        B0E2K0      1

I have about 5000 rows in my volume dataset about 300 in my issues dataset, is possible to do this easily?

Apologies if I don’t have the proper terminology, please let me know if I can clarify this.

akrun · Accepted Answer

Here is one option with data.table. Convert the 'data.frame' to 'data.table' (setDT(df1), setDT(df2)), get the number of rows (.N) by 'V1', do a join on the 'V1', then get the percentage by dividing the non-common columns, while assigning the NA to 0

library(data.table)
setnames(setDT(df1)[, .N, V1][setDT(df2)[, .N, V1], 
    Issues := i.N, on = .(V1)][, Issue_perc:= Issues/N * 100][is.na(Issues), 
     c('Issues', 'Issue_perc') := 0], 'N', 'Volume')[]
#       V1 Volume Issues Issue_perc
#1: B0E1H0      4      2         50
#2: B3M0G4      1      0          0
#3: B3K6R6      2      2        100
#4: B3L1J7      1      1        100
#5: B3K3M2      1      0          0
#6: B3K2Z8      1      0          0

Or another option with dcast

dcast(rbindlist(list(df1, df2), idcol = 'grp')[, .N, .(grp, V1)],
   V1 ~ c("Volume", "Issues")[grp], value.var = "N", fill = 0)[, 
      Issue_perc := Issues/Volume * 100][]
#         V1 Issues Volume Issue_perc
#1: B0E1H0      2      4         50
#2: B3K2Z8      0      1          0
#3: B3K3M2      0      1          0
#4: B3K6R6      2      2        100
#5: B3L1J7      1      1        100
#6: B3M0G4      0      1          0

Or using base R, we create a union of elements in the 'V1' column from both datasets, then convert to factor with levels specified as the 'lvls', get the table, do a merge and transform to create the 'Issue_perc' column

lvls <- union(df1$V1, df2$V1)
transform(merge(as.data.frame(table(factor(df1$V1, levels = lvls))), 
   as.data.frame(table(factor(df2$V1, levels = lvls))), by = 'Var1'), 
    Issue_perc = Freq.y/Freq.x * 100)
#     Var1 Freq.x Freq.y Issue_perc
#1 B0E1H0      4      2         50
#2 B3K2Z8      1      0          0
#3 B3K3M2      1      0          0
#4 B3K6R6      2      2        100
#5 B3L1J7      1      1        100
#6 B3M0G4      1      0          0

or an option with tidyverse, we get the datasets into a list, map through the list, convert the 'V1' to factor with levels specified as earlier, reduce the list to a single data.frame by doing an inner_join, then create the percentage column with mutate

library(tidyverse)
list(df1, df2) %>% 
    map(~ .x %>% 
             mutate(V1 = factor(V1, levels = lvls)) %>% 
             count(V1,  .drop = FALSE)) %>%
             reduce(inner_join, by = 'V1') %>% 
             mutate(Issue_perc = n.y/n.x * 100) %>% 
             rename_at(vars(matches('n\.')), ~ c("Volume", "Issues"))
# A tibble: 6 x 4
#  V1     Volume Issues Issue_perc
#             
#1 B0E1H0      4      2         50
#2 B3M0G4      1      0          0
#3 B3K6R6      2      2        100
#4 B3L1J7      1      1        100
#5 B3K3M2      1      0          0
#6 B3K2Z8      1      0          0

Or a slightly different option is to place the datasets in a list, then bind them with a grouping column, count to get the frequency, spread to 'wide' format and then create the new 'perc' column

list(df1, df2) %>%
    bind_rows(.id = 'grp') %>%
    count(grp, V1) %>% 
    mutate(grp = c("Volume", "Issues")[as.integer(grp)]) %>% 
    spread(grp, n, fill = 0) %>% 
    mutate(Issue_perc = Issues/Volume * 100)
# A tibble: 6 x 4
#  V1     Issues Volume Issue_perc
#             
#1 B0E1H0      2      4         50
#2 B3K2Z8      0      1          0
#3 B3K3M2      0      1          0
#4 B3K6R6      2      2        100
#5 B3L1J7      1      1        100
#6 B3M0G4      0      1          0

data

df1 <- structure(list(V1 = c("B0E1H0", "B3M0G4", "B3K6R6", "B3L1J7", 
"B0E1H0", "B3K3M2", "B3K2Z8", "B0E1H0", "B3K6R6", "B0E1H0")), row.names 
= c(NA, -10L), class = "data.frame")

df2 <- structure(list(V1 = c("B3K6R6", "B3K6R6", "B0E1H0", "B0E1H0", 
"B3L1J7")), row.names = c(NA, -5L), class = "data.frame")

Is there a function to get counts in multiple columns from multiple datasets?

Answers (2)

data

Related Questions