Eli
Eli

Reputation: 43

Count characters and summarize values per group

I can seem to find a proper code for my problem. I want to create groups and summarize (sum, count or length) other columns based on different conditions.

I've tried group_by and summarize with different conditions but haven't found anything that works yet.

I have a table similar to this:

data <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
                       'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
               Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021', 
                       '1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
               Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
                       'Apple','Pear','Apple','Apple', 'Tomato'),
               Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable', 
                           'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
               Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
                            1,0.5,0.8,1.2,0.5)
               )

This would be my desired output:

desired_table <- data.frame(Name=c('Anna', 'Bella', 'Camilla'),
Shopping_days=c(3,2,2),
days_fruit=c(2,2,1),
days_vegetables=c(2,0,1),
Total_kg=c(2.5,2.3,1.7),
Fruit_kg=c(0.9,2.3,1.2),
Vegetables=c(1.6,0,0.5))

Ive tried many variations of a code similar to this one that obviously doesn't work:

data1 <- data %>%
group_by(Name) %>%
summarize(Shopping_days = length(unique(Date)),
days_fruit = length(unique(Date, Category='Fruit')),
days_vegetables = length(unique(Date, Category='Vegetables')),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg, if Category=Fruit),
Vegetables_kg = sum(Weight_kg, if Category=Vegetables))

Any help would be much appreciated.

Upvotes: 1

Views: 2400

Answers (6)

Chriss Paul
Chriss Paul

Reputation: 1101

An alternative using data.table

library(data.table)
setDT(data)
#Computing the necessary columns by Name
data[, `:=`(Shopping_days = uniqueN(Date),
            day_fruits = uniqueN(Date[Category=="Fruit"]),
            day_vegetable = uniqueN(Date[Category=="Vegetable"]),
            Total_kg = sum(Weight_kg),
            Fruit_kg = sum(Weight_kg[Category=="Fruit"]),
            Vegetable_kg = sum(Weight_kg[Category=="Vegetable"])), Name]

#Taking the first element of the computed columns by Name
data[,.SD[1, .(Shopping_days,day_fruits,day_vegetable,Total_kg,Fruit_kg,Vegetable_kg)], Name]

      Name Shopping_days day_fruits day_vegetable Total_kg Fruit_kg Vegetable_kg
1:    Anna             3          2             2      2.5      0.9          1.6
2:   Bella             2          2             0      2.3      2.3          0.0
3: Camilla             2          1             1      1.7      1.2          0.5

Upvotes: 0

hello_friend
hello_friend

Reputation: 5788

A bit ugly base solution (but still pretty generic):

# Create a lookup table to be used to ensure every category is represented:
lookup_df <- data.frame(
  Category = unique(df$Category), 
  Weight_kg.days = rep(0, length(unique(df$Category))), 
  Weight_kg.kg = rep(0, length(unique(df$Category)))
)

# Create the report:
data.frame(do.call(rbind, Map(function(r){
  s <- subset(r, !duplicated(Date), select = -Item)
  v <- do.call(data.frame, aggregate(Weight_kg~Category, s,
        FUN = function(u){c(days = length(u), kg = sum(u))}))
  w <- rbind(v, subset(lookup_df, !(Category %in% v$Category)))
  col_idx <- vapply(w, is.numeric, logical(1))
  x <- rbind(w, data.frame(Category = "Total", t(colSums(w[,col_idx]))))
  z <- do.call(cbind, lapply(with(x, split(x[,col_idx], Category)), function(y){
        setNames(y, paste0(y$Category, c("days", "kg")))}))
  cbind(Name = unique(s$Name), z)
}, with(df, split(df, Name)))), row.names = NULL)

Data:

df <- df <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
                               'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
                       Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021', 
                               '1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
                       Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
                               'Apple','Pear','Apple','Apple', 'Tomato'),
                       Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable', 
                                   'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
                       Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
                                    1,0.5,0.8,1.2,0.5)
)

Upvotes: 0

abreums
abreums

Reputation: 196

Two columns can be easy obtained just grouping by name:

r1 <- data %>%
  group_by(Name) %>%
  summarize(Shopping_days = n_distinct(Date),
            Total_kg = sum(Weight_kg))

#  A tibble: 3 x 3
#    Name    Shopping_days Total_kg
#  * <chr>           <int>    <dbl>
#  1 Anna                3      2.5
#  2 Bella               2      2.3
#  3 Camilla             2      1.7

The remaining columns can be obtained grouping by Name and Category.

r2 <- data %>% 
  group_by(Name, Category) %>% 
  summarize(
    days = n_distinct(Date),
    weight = sum(Weight_kg)
  ) 

# A tibble: 5 x 4
#  Groups:   Name [3]
#   Name    Category   days weight
#   <chr>   <chr>     <int>  <dbl>
# 1 Anna    Fruit         2    0.9
# 2 Anna    Vegetable     2    1.6
# 3 Bella   Fruit         2    2.3
# 4 Camilla Fruit         1    1.2
# 5 Camilla Vegetable     1    0.5

But end result is not in the format you would like.

So, first let`s pivot_wider r2 and than left_join with the first result, using the Name column as key:

r2 <- r2 %>% 
  pivot_wider(names_from = 'Category', values_from = c('days', 'weight'), values_fill = 0)

result = r1 %>% left_join(r2)


# A tibble: 3 x 7
#  Name    Shopping_days Total_kg days_Fruit days_Vegetable weight_Fruit weight_Vegetable
#  <chr>           <int>    <dbl>      <int>          <int>        <dbl>            <dbl>
#1 Anna                3      2.5          2              2          0.9              1.6
#2 Bella               2      2.3          2              0          2.3              0  
#3 Camilla             2      1.7          1              1          1.2              0.5

Upvotes: 0

MKR
MKR

Reputation: 1700

I don't think that there is a very simple solution since you have to group and regroup your dataframe various times to get to the desired output.

You could do this with some left_joins() using dplyr.

For example:

library(dplyr)

data %>% distinct(Name) %>% 
  left_join(
    data %>% select(Name, Date) %>% 
      distinct() %>% 
      group_by(Name) %>% 
      summarise(shopping_days = n()), by = 'Name') %>% 
  left_join(
    data %>% filter(Category == "Fruit") %>%
      select(Name, Date) %>% 
      distinct() %>% 
      group_by(Name) %>% 
      summarise(days_fruit = n()), by = 'Name') %>% 
  left_join(
    data %>% filter(Category == "Vegetable") %>%
      select(Name, Date) %>% 
      distinct() %>% 
      group_by(Name) %>% 
      summarise(days_vegetables = n()), by = 'Name') %>% 
  left_join(
    data %>% filter(Category == "Vegetable") %>% 
      group_by(Name) %>% 
      summarise(Vegetables = sum(Weight_kg)), by= 'Name') %>% 
  left_join(
    data %>% filter(Category == "Fruit") %>% 
      group_by(Name) %>% 
      summarise(Fruit_kg = sum(Weight_kg)), by= 'Name') %>% 
  left_join(
    data %>% 
      group_by(Name) %>% 
      summarise(Total_kg = sum(Weight_kg)), by= 'Name')

Here, you basically create new dataframes for each left_join(), so these could also be saved as objects themselves.

Upvotes: 0

Ronak Shah
Ronak Shah

Reputation: 388807

Using group_by and summarise :

library(dplyr)

data %>%
  group_by(Name) %>%
  summarise(Shopping_days = n_distinct(Date), 
            days_fruit = n_distinct(Item[Category == 'Fruit']), 
            days_vegetables = n_distinct(Item[Category == 'Vegetable']), 
            Total_kg = sum(Weight_kg), 
            Fruit_kg = sum(Weight_kg[Category == 'Fruit']), 
            Vegetables_kg = sum(Weight_kg[Category == 'Vegetable']))

#  Name    Shopping_days days_fruit days_vegetables Total_kg Fruit_kg Vegetables_kg
#  <chr>           <int>      <int>           <int>    <dbl>    <dbl>         <dbl>
#1 Anna                3          2               2      2.5      0.9           1.6
#2 Bella               2          2               0      2.3      2.3           0  
#3 Camilla             2          1               1      1.7      1.2           0.5

Upvotes: 5

Yuriy Saraykin
Yuriy Saraykin

Reputation: 8880

I propose to make three separate tables, and then combine them

data <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
                           'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
                   Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021', 
                           '1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
                   Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
                           'Apple','Pear','Apple','Apple', 'Tomato'),
                   Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable', 
                               'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
                   Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
                                1,0.5,0.8,1.2,0.5)
)

library(tidyverse)

tbl_kg <- data %>% 
  pivot_wider(
    id_cols = Name, 
    names_from = Category, 
    values_from = Weight_kg, 
    values_fill = 0, 
    values_fn = sum,
    names_prefix = "kg_"
    ) %>% 
  rowwise() %>% 
  mutate(kg_total = sum(c_across(where(is.numeric)))) %>% 
  ungroup()

tbl_shoping <- data %>% 
  group_by(Name) %>% 
  summarise(shopping_days = n_distinct(Date))

tbl_category <- data %>% 
  pivot_wider(
    id_cols = Name, 
    names_from = Category, 
    values_from = Category, 
    values_fn = length,
    values_fill = 0,
    names_prefix = "days_"
    )

l <- list(tbl_kg, tbl_shoping, tbl_category)

reduce(l, left_join, by = "Name")
#> # A tibble: 3 x 7
#>   Name    kg_Fruit kg_Vegetable kg_total shopping_days days_Fruit days_Vegetable
#>   <chr>      <dbl>        <dbl>    <dbl>         <int>      <int>          <int>
#> 1 Anna         0.9          1.6      2.5             3          3              2
#> 2 Bella        2.3          0        2.3             2          3              0
#> 3 Camilla      1.2          0.5      1.7             2          1              1

Created on 2021-03-24 by the reprex package (v1.0.0)

Upvotes: 0

Related Questions