Reputation: 79
I have a data set as given in example data set employee:
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 10-Jan-19 10:45 11:45 1
2 George 10-Jan-19 15-Jan-19 06:00 09:00 3
3 George 15-Jan-19 15-Jan-19 12:00 03:00 3
4 George 5-Feb-19 18-Feb-19 12:50 14:50 2
5 Jacob 2-Feb-19 20-Feb-19 15:50 16:50 1
6 Jacob 20-Feb-19 24-Feb-19 14:30 18:30 4
7 Jacob 3-Dec-19 25-Dec-19 06:40 11:40 5
8 Jacob 25-Dec-19 30-Dec-19 09:40 12:40 3
9 Mike 02-Jun-19 02-Jun-19 6:40 07:40 1
10 Mike 02-Jun-19 02-Jun-19 2:45 3:45 1
11 Mike 02-Jun-19 20-Jun-19 10:00 12:00 2
12 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1
My desired output is:
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 15-Jan-19 10:45 03:00 7
2 George 5-Feb-19 18-Feb-19 12:50 14:50 2
3 Jacob 2-Feb-19 24-Feb-19 15:50 18:30 5
4 Jacob 3-Dec-19 30-Dec-19 06:40 12:40 8
5 Mike 2-Jun-19 20-Jun-19 6:40 12:00 4
6 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1
I was using dplyr library to summarize this but i was having some problem with that i was using this code but i'm not getting my desired output
employee <- employee %>% group_by(e.name) %>% summarise(
s.date=first(s.date),
e.date=last(e.date),
s.time=first(s.time),
e.time=last(e.time),
total.hrs=sum(total.hrs))
From my code i'm getting result as follows
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 18-Jan-19 10:45 14:50 6
2 Jacob 2-Feb-19 30-Dec-19 15:50 12:40 12
but i want my result like this
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 15-Jan-19 10:45 03:00 7
2 George 5-Feb-19 18-Feb-19 12:50 14:50 2
3 Jacob 2-Feb-19 24-Feb-19 15:50 18:30 5
4 Jacob 3-Dec-19 30-Dec-19 06:40 12:40 8
5 Mike 2-Jun-19 20-Jun-19 6:40 12:00 4
6 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1
Upvotes: 0
Views: 104
Reputation: 66819
With data.table...
library(data.table)
setDT(DT)
res = DT[, .(
s.no = first(s.no),
s.date = first(s.date),
e.date = last(e.date),
s.time = first(s.time),
e.time = last(e.time),
total.hrs = sum(total.hrs)
), by=.(e.name, .g = cumsum(s.date != shift(e.date, fill=first(s.date))))]
res[, .g := NULL]
e.name s.no s.date e.date s.time e.time total.hrs
1: George 1 1-Jan-19 15-Jan-19 10:45 03:00 7
2: George 4 5-Feb-19 18-Feb-19 12:50 14:50 2
3: Jacob 5 2-Feb-19 24-Feb-19 15:50 18:30 5
4: Jacob 7 3-Dec-19 30-Dec-19 06:40 12:40 8
5: Mike 9 02-Jun-19 20-Jun-19 6:40 12:00 4
6: Mike 12 23-Jun-19 25-Jun-19 4:00 5:00 1
Analogue in dplyr:
library(dplyr)
DT %>% group_by(e.name, .g = cumsum(s.date != lag(e.date, default=first(s.date)))) %>%
summarise(
s.no = first(s.no),
s.date = first(s.date),
e.date = last(e.date),
s.time = first(s.time),
e.time = last(e.time),
total.hrs = sum(total.hrs)
) %>% select(-.g)
# A tibble: 6 x 7
# Groups: e.name [3]
e.name s.no s.date e.date s.time e.time total.hrs
<chr> <int> <chr> <chr> <chr> <chr> <int>
1 George 1 1-Jan-19 15-Jan-19 10:45 03:00 7
2 George 4 5-Feb-19 18-Feb-19 12:50 14:50 2
3 Jacob 5 2-Feb-19 24-Feb-19 15:50 18:30 5
4 Jacob 7 3-Dec-19 30-Dec-19 06:40 12:40 8
5 Mike 9 02-Jun-19 20-Jun-19 6:40 12:00 4
6 Mike 12 23-Jun-19 25-Jun-19 4:00 5:00 1
This answer...
@Mouad's answer is more thorough in that it fixes these (as the OP should do with their real data). The approach there is also essentially the same -- group by both e.name
and the cumsum
of flags for where s.date
changes from its previous/shifted/lagged value.
Data
library(data.table)
DT = fread("s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 10-Jan-19 10:45 11:45 1
2 George 10-Jan-19 15-Jan-19 06:00 09:00 3
3 George 15-Jan-19 15-Jan-19 12:00 03:00 3
4 George 5-Feb-19 18-Feb-19 12:50 14:50 2
5 Jacob 2-Feb-19 20-Feb-19 15:50 16:50 1
6 Jacob 20-Feb-19 24-Feb-19 14:30 18:30 4
7 Jacob 3-Dec-19 25-Dec-19 06:40 11:40 5
8 Jacob 25-Dec-19 30-Dec-19 09:40 12:40 3
9 Mike 02-Jun-19 02-Jun-19 6:40 07:40 1
10 Mike 02-Jun-19 02-Jun-19 2:45 3:45 1
11 Mike 02-Jun-19 20-Jun-19 10:00 12:00 2
12 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1")
Upvotes: 2
Reputation: 2716
require(dplyr)
my_df <- read.table(text =
's.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 10-Jan-19 10:45 11:45 1
2 George 10-Jan-19 15-Jan-19 06:00 09:00 3
3 George 5-Feb-19 18-Feb-19 12:50 14:50 2
4 Jacob 2-Feb-19 20-Feb-19 15:50 16:50 1
5 Jacob 20-Feb-19 24-Feb-19 14:30 18:30 4
5 Jacob 3-Dec-19 25-Dec-19 06:40 11:40 5
6 Jacob 25-Dec-19 30-Dec-19 09:40 12:40 3',
header = TRUE, stringsAsFactors = FALSE) %>% as_tibble()
my_df <-
my_df %>%
mutate(s.date2 = as.Date(s.date, '%d-%B-%y'),
e.date2 = as.Date(e.date, '%d-%B-%y')) %>%
arrange(e.name, s.date2) %>%
group_by(e.name) %>%
mutate(lag_e.date2 = lag(e.date2,1)) %>%
ungroup %>%
mutate(new_episode = as.numeric(is.na(lag_e.date2) | s.date2 != lag_e.date2 )) %>%
mutate(episode = cumsum(new_episode)) %>%
group_by(episode) %>%
mutate(asc_rank = rank(s.date2),
desc_rank = rank(desc(s.date2)),
sum_hours = sum(total.hrs)) %>%
ungroup
then
my_df %>%
mutate(s.date_new = if_else(asc_rank ==1, s.date2, as.Date('1900-01-01')),
s.time_new = if_else(asc_rank ==1, s.time, '00:00'),
e.date_new = if_else(desc_rank ==1, e.date2, as.Date('1900-01-01')),
e.time_new = if_else(desc_rank ==1, e.time, '00:00')) %>%
select(e.name, s.date_new, e.date_new, s.time_new, e.time_new, sum_hours, episode) %>%
group_by(episode) %>%
mutate(s.date = max(s.date_new, na.rm = TRUE),
e.date = max(e.date_new, na.rm = TRUE),
s.time = max(s.time_new, na.rm = TRUE),
e.time = max(e.time_new, na.rm = TRUE),
sum_hours = max(sum_hours)) %>%
ungroup %>%
select(e.name, s.date, e.date, s.time, e.time, sum_hours) %>% distinct
# # A tibble: 4 x 6
# e.name s.date e.date s.time e.time sum_hours
# <chr> <date> <date> <chr> <chr> <dbl>
# 1 George 2019-01-01 2019-01-15 10:45 09:00 4
# 2 George 2019-02-05 2019-02-18 12:50 14:50 2
# 3 Jacob 2019-02-02 2019-02-24 15:50 18:30 5
# 4 Jacob 2019-12-03 2019-12-30 06:40 12:40 8
# >
#
Upvotes: 2