Reputation: 43
I can seem to find a proper code for my problem. I want to create groups and summarize (sum, count or length) other columns based on different conditions.
I've tried group_by and summarize with different conditions but haven't found anything that works yet.
I have a table similar to this:
data <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021',
'1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
'Apple','Pear','Apple','Apple', 'Tomato'),
Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable',
'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
1,0.5,0.8,1.2,0.5)
)
This would be my desired output:
desired_table <- data.frame(Name=c('Anna', 'Bella', 'Camilla'),
Shopping_days=c(3,2,2),
days_fruit=c(2,2,1),
days_vegetables=c(2,0,1),
Total_kg=c(2.5,2.3,1.7),
Fruit_kg=c(0.9,2.3,1.2),
Vegetables=c(1.6,0,0.5))
Ive tried many variations of a code similar to this one that obviously doesn't work:
data1 <- data %>%
group_by(Name) %>%
summarize(Shopping_days = length(unique(Date)),
days_fruit = length(unique(Date, Category='Fruit')),
days_vegetables = length(unique(Date, Category='Vegetables')),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg, if Category=Fruit),
Vegetables_kg = sum(Weight_kg, if Category=Vegetables))
Any help would be much appreciated.
Upvotes: 1
Views: 2400
Reputation: 1101
An alternative using data.table
library(data.table)
setDT(data)
#Computing the necessary columns by Name
data[, `:=`(Shopping_days = uniqueN(Date),
day_fruits = uniqueN(Date[Category=="Fruit"]),
day_vegetable = uniqueN(Date[Category=="Vegetable"]),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg[Category=="Fruit"]),
Vegetable_kg = sum(Weight_kg[Category=="Vegetable"])), Name]
#Taking the first element of the computed columns by Name
data[,.SD[1, .(Shopping_days,day_fruits,day_vegetable,Total_kg,Fruit_kg,Vegetable_kg)], Name]
Name Shopping_days day_fruits day_vegetable Total_kg Fruit_kg Vegetable_kg
1: Anna 3 2 2 2.5 0.9 1.6
2: Bella 2 2 0 2.3 2.3 0.0
3: Camilla 2 1 1 1.7 1.2 0.5
Upvotes: 0
Reputation: 5788
A bit ugly base solution (but still pretty generic):
# Create a lookup table to be used to ensure every category is represented:
lookup_df <- data.frame(
Category = unique(df$Category),
Weight_kg.days = rep(0, length(unique(df$Category))),
Weight_kg.kg = rep(0, length(unique(df$Category)))
)
# Create the report:
data.frame(do.call(rbind, Map(function(r){
s <- subset(r, !duplicated(Date), select = -Item)
v <- do.call(data.frame, aggregate(Weight_kg~Category, s,
FUN = function(u){c(days = length(u), kg = sum(u))}))
w <- rbind(v, subset(lookup_df, !(Category %in% v$Category)))
col_idx <- vapply(w, is.numeric, logical(1))
x <- rbind(w, data.frame(Category = "Total", t(colSums(w[,col_idx]))))
z <- do.call(cbind, lapply(with(x, split(x[,col_idx], Category)), function(y){
setNames(y, paste0(y$Category, c("days", "kg")))}))
cbind(Name = unique(s$Name), z)
}, with(df, split(df, Name)))), row.names = NULL)
Data:
df <- df <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021',
'1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
'Apple','Pear','Apple','Apple', 'Tomato'),
Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable',
'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
1,0.5,0.8,1.2,0.5)
)
Upvotes: 0
Reputation: 196
Two columns can be easy obtained just grouping by name:
r1 <- data %>%
group_by(Name) %>%
summarize(Shopping_days = n_distinct(Date),
Total_kg = sum(Weight_kg))
# A tibble: 3 x 3
# Name Shopping_days Total_kg
# * <chr> <int> <dbl>
# 1 Anna 3 2.5
# 2 Bella 2 2.3
# 3 Camilla 2 1.7
The remaining columns can be obtained grouping by Name and Category.
r2 <- data %>%
group_by(Name, Category) %>%
summarize(
days = n_distinct(Date),
weight = sum(Weight_kg)
)
# A tibble: 5 x 4
# Groups: Name [3]
# Name Category days weight
# <chr> <chr> <int> <dbl>
# 1 Anna Fruit 2 0.9
# 2 Anna Vegetable 2 1.6
# 3 Bella Fruit 2 2.3
# 4 Camilla Fruit 1 1.2
# 5 Camilla Vegetable 1 0.5
But end result is not in the format you would like.
So, first let`s pivot_wider r2 and than left_join with the first result, using the Name column as key:
r2 <- r2 %>%
pivot_wider(names_from = 'Category', values_from = c('days', 'weight'), values_fill = 0)
result = r1 %>% left_join(r2)
# A tibble: 3 x 7
# Name Shopping_days Total_kg days_Fruit days_Vegetable weight_Fruit weight_Vegetable
# <chr> <int> <dbl> <int> <int> <dbl> <dbl>
#1 Anna 3 2.5 2 2 0.9 1.6
#2 Bella 2 2.3 2 0 2.3 0
#3 Camilla 2 1.7 1 1 1.2 0.5
Upvotes: 0
Reputation: 1700
I don't think that there is a very simple solution since you have to group and regroup your dataframe various times to get to the desired output.
You could do this with some left_joins()
using dplyr
.
For example:
library(dplyr)
data %>% distinct(Name) %>%
left_join(
data %>% select(Name, Date) %>%
distinct() %>%
group_by(Name) %>%
summarise(shopping_days = n()), by = 'Name') %>%
left_join(
data %>% filter(Category == "Fruit") %>%
select(Name, Date) %>%
distinct() %>%
group_by(Name) %>%
summarise(days_fruit = n()), by = 'Name') %>%
left_join(
data %>% filter(Category == "Vegetable") %>%
select(Name, Date) %>%
distinct() %>%
group_by(Name) %>%
summarise(days_vegetables = n()), by = 'Name') %>%
left_join(
data %>% filter(Category == "Vegetable") %>%
group_by(Name) %>%
summarise(Vegetables = sum(Weight_kg)), by= 'Name') %>%
left_join(
data %>% filter(Category == "Fruit") %>%
group_by(Name) %>%
summarise(Fruit_kg = sum(Weight_kg)), by= 'Name') %>%
left_join(
data %>%
group_by(Name) %>%
summarise(Total_kg = sum(Weight_kg)), by= 'Name')
Here, you basically create new dataframes for each left_join()
, so these could also be saved as objects themselves.
Upvotes: 0
Reputation: 388807
Using group_by
and summarise
:
library(dplyr)
data %>%
group_by(Name) %>%
summarise(Shopping_days = n_distinct(Date),
days_fruit = n_distinct(Item[Category == 'Fruit']),
days_vegetables = n_distinct(Item[Category == 'Vegetable']),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg[Category == 'Fruit']),
Vegetables_kg = sum(Weight_kg[Category == 'Vegetable']))
# Name Shopping_days days_fruit days_vegetables Total_kg Fruit_kg Vegetables_kg
# <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#1 Anna 3 2 2 2.5 0.9 1.6
#2 Bella 2 2 0 2.3 2.3 0
#3 Camilla 2 1 1 1.7 1.2 0.5
Upvotes: 5
Reputation: 8880
I propose to make three separate tables, and then combine them
data <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021',
'1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
'Apple','Pear','Apple','Apple', 'Tomato'),
Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable',
'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
1,0.5,0.8,1.2,0.5)
)
library(tidyverse)
tbl_kg <- data %>%
pivot_wider(
id_cols = Name,
names_from = Category,
values_from = Weight_kg,
values_fill = 0,
values_fn = sum,
names_prefix = "kg_"
) %>%
rowwise() %>%
mutate(kg_total = sum(c_across(where(is.numeric)))) %>%
ungroup()
tbl_shoping <- data %>%
group_by(Name) %>%
summarise(shopping_days = n_distinct(Date))
tbl_category <- data %>%
pivot_wider(
id_cols = Name,
names_from = Category,
values_from = Category,
values_fn = length,
values_fill = 0,
names_prefix = "days_"
)
l <- list(tbl_kg, tbl_shoping, tbl_category)
reduce(l, left_join, by = "Name")
#> # A tibble: 3 x 7
#> Name kg_Fruit kg_Vegetable kg_total shopping_days days_Fruit days_Vegetable
#> <chr> <dbl> <dbl> <dbl> <int> <int> <int>
#> 1 Anna 0.9 1.6 2.5 3 3 2
#> 2 Bella 2.3 0 2.3 2 3 0
#> 3 Camilla 1.2 0.5 1.7 2 1 1
Created on 2021-03-24 by the reprex package (v1.0.0)
Upvotes: 0