Reputation: 11
I have a data frame.
library(tidyverse)
df <- data.frame(
device = c('0001', '0001', '0001', '0001', '0002', '0002', '0002', '0003', '0003', '0003'),
problem = c('A', 'B', 'C', 'A', 'A', 'C', 'A', 'B', 'B', 'C'),
down = c(0, 25, 45, 12, 10, 0, 96, 66, 88, 99),
possible_up = rep(100, 10)
) %>% mutate(up = possible_up - down)
It results in:
device problem down possible_up up
0001 A 0 100 100
0001 B 25 100 75
0001 C 45 100 55
0001 A 12 100 88
0002 A 10 100 90
0002 C 0 100 100
0002 A 96 100 4
0003 B 66 100 34
0003 B 88 100 12
0003 C 99 100 1
Numeric columns represent minutes. I need to create new table (summary for each device) where each category from the problem column is represented as a column, and for each category I need to add one additional column.
Important thing to add is that I need for this to be dynamic since the number of unique values inside the problem column may vary.
I've tried something using spread function from tidyr but didn't get anything substantial.
I need for my new table to look like this:
device up_sum up % down_sum down % A mins A % B mins B % C mins C %
0001 318 79.50% 82 20.50% 12 14.63% 25 30.49% 45 54.88%
0002 194 64.67% 106 35.33% 106 100.00% 0 0.00% 0 0.00%
0003 47 15.67% 253 84.33% 0 0.00% 154 60.87% 99 39.13%
Values in column up_sum represent the sum of up column for each device, down_sum is sum of down column for each device, values in columns A %, B% and C% show percentages of down minutes for problems A, B and C.
Upvotes: 0
Views: 499
Reputation: 47350
first aggregate by device :
library(tidyverse)
df1 <- df %>% group_by(device) %>% summarize_at(vars("up","down"),lst(sum, pct = mean))
df1
#> # A tibble: 3 x 5
#> device up_sum down_sum up_pct down_pct
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 0001 318 82 79.5 20.5
#> 2 0002 194 106 64.7 35.3
#> 3 0003 47 253 15.7 84.3
then aggregate by problem:
df2 <- df %>%
group_by(device, problem) %>%
summarize(mins = sum(down)) %>%
mutate(pct = 100 * mins/sum(mins)) %>%
pivot_wider(device, names_from = "problem", values_from = c("mins", "pct"),
values_fill = list(mins=0, pct=0))
df2
#> # A tibble: 3 x 7
#> # Groups: device [3]
#> device mins_A mins_B mins_C pct_A pct_B pct_C
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0001 12 25 45 14.6 30.5 54.9
#> 2 0002 106 0 0 100 0 0
#> 3 0003 0 154 99 0 60.9 39.1
And finally join both
left_join(df1,df2, by = "device")
#> # A tibble: 3 x 11
#> device up_sum down_sum up_pct down_pct mins_A mins_B mins_C pct_A pct_B
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0001 318 82 79.5 20.5 12 25 45 14.6 30.5
#> 2 0002 194 106 64.7 35.3 106 0 0 100 0
#> 3 0003 47 253 15.7 84.3 0 154 99 0 60.9
#> # ... with 1 more variable: pct_C <dbl>
Upvotes: 0
Reputation: 1
library(tidyverse)
df <- data.frame(
device = c('0001', '0001', '0001', '0001', '0002', '0002', '0002', '0003', '0003', '0003'),
problem = c('A', 'B', 'C', 'A', 'A', 'C', 'A', 'B', 'B', 'C'),
down = c(0, 25, 45, 12, 10, 0, 96, 66, 88, 99),
possible_up = rep(100, 10)
) %>% mutate(up = possible_up - down)
##########################################
df1 = df %>%
group_by(device, problem) %>%
summarize(up_sum = sum(up),down_sum = sum(down))%>%
spread(key = problem,value = down_sum) %>%
setNames( c("device","up_sum","A mins", "B mins", "C mins") ) %>%
replace(is.na(.), 0) %>%
group_by(device) %>% summarise_all(sum)%>%
#####################################################
mutate(down_sum = rowSums(select(., contains("mins"))))%>%
mutate_at(vars(contains("mins")) , funs(P = ./down_sum * 100))%>%
#####################################################
mutate(Tot_up_down = rowSums(select(., contains("sum"))))%>%
mutate_at(vars(contains("sum")) , funs(P = ./Tot_up_down * 100))%>%
#####################################################
select(-Tot_up_down)%>%
select(device,contains("up"),contains("down"),contains("A"),contains("B"),
contains("C"))
the last line was just to leave the same order as you presented
Upvotes: 0
Reputation: 3996
A solution using the new pivot_wider
function from the development version of tidyr
:
pivot wider
does most of the heavy lifting by summarising our up and down columns. Unfortunately we are still left with the tedious work of calculating the percentages for each group.
#devtools::install_github("tidyverse/tidyr")
library(dplyr)
library(tidyr)
df %>%
pivot_wider(id_cols = c('device'), names_from = 'problem',
values_from = c('down','up'),
values_fn = list(down=sum, up = sum),
values_fill = list(down = 0, up = 0)) %>%
# calculate percentages
mutate(up_sum = rowSums(select(., starts_with('up'))),
down_sum = rowSums(select(., starts_with('down')))) %>%
group_by(device) %>%
mutate_at(vars(matches('down_[A-Z]',F)), list('perc' = ~ . / down_sum)) %>%
mutate(up_perc = up_sum / (up_sum + down_sum),
down_perc = down_sum / (up_sum + down_sum)) %>%
# re-order columns to desired output
select(device, up_sum, up_perc, down_sum, down_perc, starts_with('down_A'),
starts_with('down_B'), starts_with('down_C'))
# A tibble: 3 x 11
# Groups: device [3]
# device up_sum up_perc down_sum down_perc down_A down_A_perc down_B down_B_perc down_C down_C_perc
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0001 318 0.795 82 0.205 12 0.146 25 0.305 45 0.549
#2 0002 194 0.647 106 0.353 106 1 0 0 0 0
#3 0003 47 0.157 253 0.843 0 0 154 0.609 99 0.391
Upvotes: 0
Reputation: 6020
use summarise
then gather
and then spread
to get the results for each problem in columns. I realise that my calculations are different than yours, but the global approach stays the same.
df %>%
group_by(device,problem) %>%
summarise(up_sum = sum(up),
up_perc = sum(up)/sum(possible_up)*100,
down_sum = sum(down),
down_perc = sum(down)/sum(possible_up)*100) %>%
gather(key, value,-device,-problem) %>%
mutate(dummy = paste(problem,key,sep="_")) %>%
select(-key,-problem) %>%
spread(dummy, value)
results in:
# A tibble: 3 x 13
# Groups: device [3]
device A_down_perc A_down_sum A_up_perc A_up_sum B_down_perc B_down_sum B_up_perc B_up_sum C_down_perc
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0001 6 12 94 188 25 25 75 75 45
2 0002 53 106 47 94 NA NA NA NA 0
3 0003 NA NA NA NA 77 154 23 46 99
# ... with 3 more variables: C_down_sum <dbl>, C_up_perc <dbl>, C_up_sum <dbl>
EDIT:
With the following you can combine the total with the mins per problem:
df %>%
group_by(device,problem) %>%
summarise(mins = sum(down),
up = sum(up)) %>%
group_by(device) %>%
mutate(up = sum(up)) %>%
mutate(perc = mins/sum(mins)*100) %>%
gather(key, value,-device,-problem,-up) %>%
unite(dummy, problem, key) %>%
spread(dummy, value)
gives:
# A tibble: 3 x 8
# Groups: device [3]
device up A_mins A_perc B_mins B_perc C_mins C_perc
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0001 318 12 14.6 25 30.5 45 54.9
2 0002 194 106 100 NA NA 0 0
3 0003 47 NA NA 154 60.9 99 39.1
Upvotes: 3
Reputation: 305
with summarise
and group_by
and an additional step merging a left_join
:
library(tidyverse)
# sample df
df <- data.frame(
device = c('0001', '0001', '0001', '0001', '0002', '0002', '0002', '0003', '0003', '0003'),
problem = c('A', 'B', 'C', 'A', 'A', 'C', 'A', 'B', 'B', 'C'),
down = c(0, 25, 45, 12, 10, 0, 96, 66, 88, 99),
possible_up = rep(100, 10)
) %>% mutate(up = possible_up - down)
# first summarise the information per device
df %>%
group_by(device) %>%
summarise(up_sum = sum(up),
down_sum = sum(down),
up_pct = (100/(up_sum+down_sum)*up_sum),
down_pct = (100/(up_sum+down_sum)*down_sum)) -> device_df
# then summarise the down_time per device/problem
df %>%
group_by(device, problem) %>%
summarise(mins = sum(down)) %>%
spread(., key = "problem", value = "mins", fill = 0) %>%
mutate(A_pct = 100/(A+B+C)*A,
B_pct = 100/(A+B+C)*B,
C_pct = 100/(A+B+C)*C) -> downtime_df
# finally merge both dataframes
left_join(device_df, downtime_df)
I sometimes like to do things step by steps, altough previous answers spare you some lines
gives you:
Joining, by = "device"
# A tibble: 3 x 11
device up_sum down_sum up_pct down_pct A B C A_pct B_pct C_pct
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0001 318 82 79.5 20.5 12 25 45 14.6 30.5 54.9
2 0002 194 106 64.7 35.3 106 0 0 100 0 0
3 0003 47 253 15.7 84.3 0 154 99 0 60.9 39.1
Upvotes: 0
Reputation: 726
with summarise
and group_by
you will get the desired result. missing min return Inf
because sample data has no observation for those filters.
library(dplyr)
df %>%
group_by(device) %>%
summarise(up_sum = sum(up),
up_perc = mean(up/possible_up),
down_sum = sum(down),
down_perc = mean(down/possible_up),
A_mins = min(down[problem=="A"]),
A_perc = mean(up[problem=="A"]/possible_up[problem=="A"]),
B_mins = min(down[problem=="B"]),
B_perc = mean(up[problem=="B"]/possible_up[problem=="B"]),
C_mins = min(down[problem=="C"]),
C_perc = mean(up[problem=="C"]/possible_up[problem=="C"]),
)
# A tibble: 3 x 11
device up_sum up_perc down_sum down_perc A_mins A_perc B_mins B_perc C_mins C_perc
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0001 318 0.795 82 0.205 0 0.94 25 0.75 45 0.55
2 0002 194 0.647 106 0.353 10 0.47 Inf NaN 0 1
3 0003 47 0.157 253 0.843 Inf NaN 66 0.23 99 0.01
Upvotes: 1