hoze
hoze

Reputation: 11

R - spread column and add aditional columns at the same time

I have a data frame.

library(tidyverse)
df <- data.frame(
  device = c('0001', '0001', '0001', '0001', '0002', '0002', '0002', '0003', '0003', '0003'),
  problem = c('A', 'B', 'C', 'A', 'A', 'C', 'A', 'B', 'B', 'C'),
  down = c(0, 25, 45, 12, 10, 0, 96, 66, 88, 99),
  possible_up = rep(100, 10)
) %>% mutate(up = possible_up - down)

It results in:

device  problem  down  possible_up    up
  0001        A     0          100   100
  0001        B    25          100    75
  0001        C    45          100    55
  0001        A    12          100    88
  0002        A    10          100    90
  0002        C     0          100   100
  0002        A    96          100     4
  0003        B    66          100    34
  0003        B    88          100    12
  0003        C    99          100     1

Numeric columns represent minutes. I need to create new table (summary for each device) where each category from the problem column is represented as a column, and for each category I need to add one additional column.

Important thing to add is that I need for this to be dynamic since the number of unique values inside the problem column may vary.

I've tried something using spread function from tidyr but didn't get anything substantial.

I need for my new table to look like this:

device   up_sum    up %    down_sum    down %    A mins    A %    B mins    B %    C mins    C %
0001       318   79.50%       82       20.50%      12     14.63%    25     30.49%    45     54.88%
0002       194   64.67%      106       35.33%     106    100.00%     0      0.00%     0      0.00%
0003        47   15.67%      253       84.33%       0      0.00%   154     60.87%    99     39.13%

Values in column up_sum represent the sum of up column for each device, down_sum is sum of down column for each device, values in columns A %, B% and C% show percentages of down minutes for problems A, B and C.

Upvotes: 0

Views: 499

Answers (6)

moodymudskipper
moodymudskipper

Reputation: 47350

first aggregate by device :

library(tidyverse)
df1 <- df %>% group_by(device) %>% summarize_at(vars("up","down"),lst(sum, pct = mean))
df1
#> # A tibble: 3 x 5
#>   device up_sum down_sum up_pct down_pct
#>   <fct>   <dbl>    <dbl>  <dbl>    <dbl>
#> 1 0001      318       82   79.5     20.5
#> 2 0002      194      106   64.7     35.3
#> 3 0003       47      253   15.7     84.3

then aggregate by problem:

df2 <- df %>% 
  group_by(device, problem) %>% 
  summarize(mins = sum(down)) %>%
  mutate(pct = 100 * mins/sum(mins)) %>%
  pivot_wider(device, names_from = "problem", values_from = c("mins", "pct"), 
              values_fill = list(mins=0, pct=0))
df2
#> # A tibble: 3 x 7
#> # Groups:   device [3]
#>   device mins_A mins_B mins_C pct_A pct_B pct_C
#>   <fct>   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
#> 1 0001       12     25     45  14.6  30.5  54.9
#> 2 0002      106      0      0 100     0     0  
#> 3 0003        0    154     99   0    60.9  39.1

And finally join both

left_join(df1,df2, by = "device")
#> # A tibble: 3 x 11
#>   device up_sum down_sum up_pct down_pct mins_A mins_B mins_C pct_A pct_B
#>   <fct>   <dbl>    <dbl>  <dbl>    <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl>
#> 1 0001      318       82   79.5     20.5     12     25     45  14.6  30.5
#> 2 0002      194      106   64.7     35.3    106      0      0 100     0  
#> 3 0003       47      253   15.7     84.3      0    154     99   0    60.9
#> # ... with 1 more variable: pct_C <dbl>

Upvotes: 0

David A
David A

Reputation: 1

library(tidyverse)
df <- data.frame(
  device = c('0001', '0001', '0001', '0001', '0002', '0002', '0002', '0003',    '0003', '0003'),
  problem = c('A', 'B', 'C', 'A', 'A', 'C', 'A', 'B', 'B', 'C'),
  down = c(0, 25, 45, 12, 10, 0, 96, 66, 88, 99),
  possible_up = rep(100, 10)
) %>% mutate(up = possible_up - down)

##########################################

df1  = df %>%
  group_by(device, problem) %>%
  summarize(up_sum = sum(up),down_sum = sum(down))%>%
  spread(key = problem,value = down_sum) %>%
  setNames( c("device","up_sum","A mins", "B mins", "C mins") ) %>%
  replace(is.na(.), 0) %>%
  group_by(device) %>% summarise_all(sum)%>%
  #####################################################
  mutate(down_sum = rowSums(select(., contains("mins"))))%>%
  mutate_at(vars(contains("mins")) , funs(P = ./down_sum * 100))%>%
  #####################################################
  mutate(Tot_up_down = rowSums(select(., contains("sum"))))%>%
  mutate_at(vars(contains("sum")) , funs(P = ./Tot_up_down * 100))%>%
  #####################################################
  select(-Tot_up_down)%>%
  select(device,contains("up"),contains("down"),contains("A"),contains("B"),
         contains("C"))

the last line was just to leave the same order as you presented

Upvotes: 0

Chris
Chris

Reputation: 3996

A solution using the new pivot_wider function from the development version of tidyr:

pivot wider does most of the heavy lifting by summarising our up and down columns. Unfortunately we are still left with the tedious work of calculating the percentages for each group.

#devtools::install_github("tidyverse/tidyr")
library(dplyr)
library(tidyr)

df  %>%
  pivot_wider(id_cols = c('device'), names_from = 'problem',
              values_from = c('down','up'),
              values_fn = list(down=sum, up = sum),
              values_fill = list(down = 0, up = 0)) %>%
  # calculate percentages
  mutate(up_sum = rowSums(select(., starts_with('up'))),
         down_sum = rowSums(select(., starts_with('down')))) %>%
  group_by(device) %>%
  mutate_at(vars(matches('down_[A-Z]',F)),  list('perc' = ~ . / down_sum)) %>%
  mutate(up_perc = up_sum / (up_sum + down_sum),
         down_perc = down_sum / (up_sum + down_sum)) %>%
  # re-order columns to desired output
  select(device, up_sum, up_perc, down_sum, down_perc, starts_with('down_A'),
  starts_with('down_B'), starts_with('down_C'))




# A tibble: 3 x 11
# Groups:   device [3]
#  device up_sum up_perc down_sum down_perc down_A down_A_perc down_B down_B_perc down_C down_C_perc
#  <fct>   <dbl>   <dbl>    <dbl>     <dbl>  <dbl>       <dbl>  <dbl>       <dbl>  <dbl>       <dbl>
#1 0001      318   0.795       82     0.205     12       0.146     25       0.305     45       0.549
#2 0002      194   0.647      106     0.353    106       1          0       0          0       0    
#3 0003       47   0.157      253     0.843      0       0        154       0.609     99       0.391

Upvotes: 0

Wietze314
Wietze314

Reputation: 6020

use summarise then gather and then spread to get the results for each problem in columns. I realise that my calculations are different than yours, but the global approach stays the same.

df %>% 
  group_by(device,problem) %>%
  summarise(up_sum = sum(up),
            up_perc = sum(up)/sum(possible_up)*100,
            down_sum = sum(down),
            down_perc = sum(down)/sum(possible_up)*100) %>%
  gather(key, value,-device,-problem) %>%
  mutate(dummy = paste(problem,key,sep="_")) %>%
  select(-key,-problem) %>%
  spread(dummy, value)

results in:

# A tibble: 3 x 13
# Groups:   device [3]
  device A_down_perc A_down_sum A_up_perc A_up_sum B_down_perc B_down_sum B_up_perc B_up_sum C_down_perc
  <fct>        <dbl>      <dbl>     <dbl>    <dbl>       <dbl>      <dbl>     <dbl>    <dbl>       <dbl>
1 0001             6         12        94      188          25         25        75       75          45
2 0002            53        106        47       94          NA         NA        NA       NA           0
3 0003            NA         NA        NA       NA          77        154        23       46          99
# ... with 3 more variables: C_down_sum <dbl>, C_up_perc <dbl>, C_up_sum <dbl>

EDIT:

With the following you can combine the total with the mins per problem:

df %>% 
  group_by(device,problem) %>%
  summarise(mins = sum(down),
            up = sum(up)) %>%
  group_by(device) %>%
  mutate(up = sum(up)) %>%
  mutate(perc = mins/sum(mins)*100) %>%
  gather(key, value,-device,-problem,-up) %>%
  unite(dummy, problem, key) %>%
  spread(dummy, value)

gives:

# A tibble: 3 x 8
# Groups:   device [3]
  device    up A_mins A_perc B_mins B_perc C_mins C_perc
  <fct>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 0001     318     12   14.6     25   30.5     45   54.9
2 0002     194    106  100       NA   NA        0    0  
3 0003      47     NA   NA      154   60.9     99   39.1

Upvotes: 3

tobin_lab
tobin_lab

Reputation: 305

with summarise and group_by and an additional step merging a left_join:

library(tidyverse)

# sample df
df <- data.frame(
  device = c('0001', '0001', '0001', '0001', '0002', '0002', '0002', '0003', '0003', '0003'),
  problem = c('A', 'B', 'C', 'A', 'A', 'C', 'A', 'B', 'B', 'C'),
  down = c(0, 25, 45, 12, 10, 0, 96, 66, 88, 99),
  possible_up = rep(100, 10)
) %>% mutate(up = possible_up - down)


# first summarise the information per device
df %>% 
  group_by(device) %>% 
  summarise(up_sum = sum(up),
            down_sum = sum(down),
            up_pct = (100/(up_sum+down_sum)*up_sum),
            down_pct = (100/(up_sum+down_sum)*down_sum)) -> device_df


# then summarise the down_time per device/problem
df %>% 
  group_by(device, problem) %>% 
  summarise(mins = sum(down)) %>% 
  spread(., key = "problem", value = "mins", fill = 0) %>% 
  mutate(A_pct = 100/(A+B+C)*A,
         B_pct = 100/(A+B+C)*B,
         C_pct = 100/(A+B+C)*C) -> downtime_df


# finally merge both dataframes
left_join(device_df, downtime_df)

I sometimes like to do things step by steps, altough previous answers spare you some lines

gives you:

Joining, by = "device"
# A tibble: 3 x 11
  device up_sum down_sum up_pct down_pct     A     B     C A_pct B_pct C_pct
  <fct>   <dbl>    <dbl>  <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0001      318       82   79.5     20.5    12    25    45  14.6  30.5  54.9
2 0002      194      106   64.7     35.3   106     0     0 100     0     0  
3 0003       47      253   15.7     84.3     0   154    99   0    60.9  39.1

Upvotes: 0

heck1
heck1

Reputation: 726

with summarise and group_by you will get the desired result. missing min return Inf because sample data has no observation for those filters.

library(dplyr)
       df %>%
          group_by(device) %>%
          summarise(up_sum = sum(up),
                    up_perc = mean(up/possible_up),
                    down_sum = sum(down),
                    down_perc = mean(down/possible_up),
                    A_mins = min(down[problem=="A"]),
                    A_perc = mean(up[problem=="A"]/possible_up[problem=="A"]),
                    B_mins = min(down[problem=="B"]),
                    B_perc = mean(up[problem=="B"]/possible_up[problem=="B"]),
                    C_mins = min(down[problem=="C"]),
                    C_perc = mean(up[problem=="C"]/possible_up[problem=="C"]),
                    )

# A tibble: 3 x 11
  device up_sum up_perc down_sum down_perc A_mins A_perc B_mins B_perc C_mins C_perc
  <fct>   <dbl>   <dbl>    <dbl>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 0001      318   0.795       82     0.205      0   0.94     25   0.75     45   0.55
2 0002      194   0.647      106     0.353     10   0.47    Inf NaN         0   1   
3 0003       47   0.157      253     0.843    Inf NaN        66   0.23     99   0.01

Upvotes: 1

Related Questions