Reputation: 155
I have employee id, their clock in, and clock out timings by day. I want to calculate employee count by hour by day. Excel formula would work too.
My sample data looks like this:
Employee ID Day Clockin Clockout
1 Mon 7:00 15:00
1 Fri 7:00 15:00
2 Wed 8:00 22:00
2 Mon 10:00 18:00
2 Fri 9:00 20:00
3 Mon 7:00 8:00
My output should look like this:
Timestamp Mon Tue Wed Thu Fri
7:00 2 0 0 0 1
8:00 1 0 1 0 1
9:00 1 0 1 0 2
10:00 2 0 1 0 2
11:00 2 0 1 0 2
I tried to mutate clock in
and clock out
column but didn't work. I would prefer a solution in R. I tried solution mentioned here Calculate the days per month between checkin and checkout dates in R but doesn't seem to work in my case
Example of input
ID Day Clockin Clockout
Employee 1 Mon 7:00 15:00
Employee 2 Mon 8:00 15:00
Employee 3 Mon 12:00 14:00
Employee 4 Mon 13:00 20:00
Employee 5 Mon 15:00 22:00
Employee 6 Mon 23:00 23:30
Example of output
Mon Count
7:00 1
8:00 2
9:00 2
10:00 2
11:00 2
12:00 3
13:00 4
14:00 3
15:00 2
16:00 2
17:00 2
18:00 2
19:00 2
20:00 1
21:00 1
22:00 0
23:00 1
If you pay attention to the count, you will find it changing when anyone clocks in or clocks out through out the day.
Upvotes: 2
Views: 1233
Reputation: 155
I thought of sharing my solution here in case if it helps someone. Only difference this solution has is , I have added an additional column positions
as a filter. You can remove that filter if your problem is similar to mine as stated above. Otherwise, this filter helps with sorting hours
by position id
.
#loading libraries
library(lubridate)
library(readxl)
library(stringr)
library(tidyr)
#read data
df <- read_excel('data_sample.xlsx',
col_types = c('numeric', 'text', 'date', 'guess', 'guess','skip', 'numeric'))
#convert clocks to float for faster comparisons
df$`Clock In` = hour(df$`Clock In`) + minute(df$`Clock In`) / 60
df$`Clock Out` = hour(df$`Clock Out`) + minute(df$`Clock Out`) / 60
#remove shallow records
df <- df[!(df$`Clock In` == df$`Clock Out`),]
#24 hours strings
hours = lapply(0 : 23, function(i) str_pad(paste(toString(i), ':00', sep=""), 5, pad = '0'))
#empty presence matrix
hours_mat = c()
#fill the presence matrix with ines when exists in work place
for (r in 1:nrow(df)) {
lis = c()
for (i in 0:23) {
if (i < df[r, 'Clock In'] | i >= df[r, 'Clock Out']){
lis[i + 1] <- 0
}
else {
lis[i + 1] <- 1
}
}
hours_mat <- rbind(hours_mat, lis)
}
#convert matrix to dataframe
hours_df = as.data.frame(hours_mat)
colnames(hours_df) <- hours
#bind the matrix to the original dataframe
final_df <- cbind(df, hours_df)
#aggregate presence count over date in every position
result <- aggregate(final_df[1:nrow(final_df),7:ncol(final_df)],
by=list(Date = final_df$Date, Position = final_df$Position),
FUN=sum)
#factorize dates for converting from wide to long format
result$Date = factor(result$Date)
#wide to long format
long_result <- gather(result, Timestamp, Count, '00:00':'23:00',
factor_key = TRUE)
#long to wide format using dates
result_wide <- spread(long_result, Date, Count)
#to select a particular position, uncomment this line :
#result_wide <- result_wide[result_wide$Position == 'your required position as number']
#write the final output to "output.csv"
write.csv(result_wide, 'output.csv')
and sample data is as follows -
structure(list(`Employee Number` = c(1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1), Day = c("THU", "FRI", "SAT", "SUN",
"WED", "THU", "FRI", "SAT", "SUN", "WED", "THU", "THU", "FRI",
"SAT", "SUN", "WED", "THU", "THU"), Date = structure(c(1577923200,
1578009600, 1578096000, 1578182400, 1578441600, 1578528000, 1578614400,
1578700800, 1578787200, 1579046400, 1579132800, 1579132800, 1579219200,
1579305600, 1579392000, 1579651200, 1579737600, 1579737600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), ClockIn = structure(c(-2209021200,
-2209021200, -2209021200, -2209021200, -2209021200, -2209023000,
-2209021200, -2209021200, -2209021200, -2209021200, -2209075200,
-2209021200, -2209021200, -2209021200, -2209021200, -2209021200,
-2209075200, -2209021200), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Clock Out` = structure(c(-2208992400, -2208992400, -2208992400,
-2208992400, -2208992400, -2208994200, -2208992400, -2208992400,
-2208992400, -2208992400, -2209075200, -2208999600, -2208992400,
-2208992400, -2208992400, -2208992400, -2209075200, -2208999600
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Department = c(20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20), Position = c(35, 35, 35, 35, 35, 35, 35, 35, 35,
35, 35, 35, 35, 35, 35, 35, 35, 35)), row.names = c(NA, -18L
), class = c("tbl_df", "tbl", "data.frame"))
Upvotes: 1
Reputation: 29119
This is a bit more complicated than it seems, since if an employee works the night shift until the next day, we need to change the day that we are counting their presence after midnight.
For that matter, I wrote my own function which takes the day, clock in, and clock out and returns a sequence of time (1 hour intervals) along with the day associated with each interval. See below;
time.seq <- function(day.i, start.i, end.i, step.i = '1 hour'){
require(lubridate)
require(DescTools) ## to get the abbreviated weekdays
start.i <- as.POSIXct(start.i, format="%H:%M")
end.i <- as.POSIXct(end.i, format="%H:%M")
if (start.i > end.i) { ## accounting for working on the next day (after midnight)
end.i <- end.i + days(1)
}
out.h <- seq(start.i, end.i , step.i)
## Going to the next day based on difference in time (unit = days)
day.abb.ex <- c(day.abb, "Mon")
out.d <- day.abb.ex[which(toupper(day.abb) == day.i) +
c(0, cumsum(as.numeric(diff(floor_date(out.h, unit = "day"))/86400)))]
out <- list(DAY = out.d, HOUR = out.h)
return(out)
}
Then using that function, and similar to the logic used in the other answer, we can get the count for each hour.
library(dplyr)
library(tidyr)
df %>%
rowwise() %>%
mutate(HOUR = list(time.seq(Day, Clockin, Clockout)[["HOUR"]]),
DAY = list(time.seq(Day, Clockin, Clockout)[["DAY"]])) %>%
unnest(c(HOUR, DAY)) %>%
count(Day=DAY, Hour = format(HOUR, '%H:%M'), name = "Count") %>%
pivot_wider(names_from = Day, values_from = Count)
#> # A tibble: 22 x 8
#> Hour Fri Mon Sat Sun Thu Tue Wed
#> <chr> <int> <int> <int> <int> <int> <int> <int>
#> 1 07:00 2 NA 1 NA 1 1 1
#> 2 08:00 2 NA 1 NA 1 1 1
#> 3 09:00 2 NA 1 NA 1 1 1
#> 4 10:00 4 NA 1 NA 3 3 3
#> 5 11:00 4 NA 1 NA 3 3 3
#> 6 12:00 5 1 1 NA 4 4 4
#> 7 13:00 5 1 1 NA 4 4 4
#> 8 14:00 5 1 1 NA 4 4 4
#> 9 15:00 5 1 1 NA 4 4 4
#> 10 16:00 3 1 NA NA 3 3 3
#> # ... with 12 more rows
df <- structure(list(Employee.ID = c(462L, 462L, 559L, 559L, 559L,
559L, 560L, 560L, 560L, 560L, 560L, 715L, 715L, 715L, 715L, 715L,
791L, 791L, 791L, 791L, 802L, 802L, 802L, 802L), Day = structure(c(2L,
4L, 7L, 8L, 6L, 2L, 3L, 7L, 8L, 6L, 2L, 3L, 7L, 2L, 4L, 5L, 7L,
8L, 6L, 2L, 7L, 8L, 6L, 2L), .Label = c("", "FRI", "MON", "SAT",
"SUN", "THU", "TUE", "WED"), class = "factor"), Clockin = structure(c(5L,
5L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 2L, 2L, 2L, 2L), .Label = c("", "10:00", "12:00",
"20:00", "7:00"), class = "factor"), Clockout = structure(c(2L,
2L, 4L, 4L, 5L, 4L, 7L, 8L, 7L, 7L, 6L, 10L, 9L, 11L, 9L, 9L,
2L, 2L, 2L, 2L, 4L, 7L, 3L, 4L), .Label = c("", "15:00", "17:30",
"18:00", "18:15", "19:45", "20:00", "22:00", "4:00", "4:15",
"4:45"), class = "factor")), row.names = c(NA, 24L), class = "data.frame")
Upvotes: 2
Reputation: 887571
If we are using the hour with 'Clockin', in R
, we can get the count
of 'Day', 'Clockin' and reshape into 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
library(lubridate)
df1 %>%
transmute(Day = factor(Day, levels = c("MON", "TUE", "WED", "THU", "FRI", "SAT")),
Clockin = ymd_hms(Clockin)) %>%
count(Day, Clockin) %>%
complete(Day, Clockin = seq(min(Clockin), max(Clockin), by = "1 hour"),
fill = list(n = 0)) %>%
mutate(Clockin = format(Clockin, "%H:%M")) %>%
pivot_wider(names_from = Day, values_from = n)
# A tibble: 4 x 7
# Clockin MON TUE WED THU FRI SAT
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 07:00 0 0 0 0 1 1
#2 08:00 0 0 0 0 0 0
#3 09:00 0 0 0 0 0 0
#4 10:00 0 1 1 1 1 0
With the new data
df2 %>%
transmute(Day, Hour = map2(as.POSIXct(Clockin, format = '%H:%M'),
as.POSIXct(Clockout, format = '%H:%M'), ~ seq(.x, .y, by = '1 hour'))) %>%
unnest(c(Hour)) %>%
count(Day, Hour = format(Hour, '%H:%M'))
# A tibble: 17 x 3
# Day Hour n
# <chr> <chr> <int>
# 1 Mon 07:00 1
# 2 Mon 08:00 2
# 3 Mon 09:00 2
# 4 Mon 10:00 2
# 5 Mon 11:00 2
# 6 Mon 12:00 3
# 7 Mon 13:00 4
# 8 Mon 14:00 4
# 9 Mon 15:00 4
#10 Mon 16:00 2
#11 Mon 17:00 2
#12 Mon 18:00 2
#13 Mon 19:00 2
#14 Mon 20:00 2
#15 Mon 21:00 1
#16 Mon 22:00 1
#17 Mon 23:00 1
df1 <- structure(list(`Employee ID` = c(462L, 462L, 559L, 559L, 559L,
559L), Day = c("FRI", "SAT", "TUE", "WED", "THU", "FRI"), Clockin = c("1899-12-31 07:00:00",
"1899-12-31 07:00:00", "1899-12-31 10:00:00", "1899-12-31 10:00:00",
"1899-12-31 10:00:00", "1899-12-31 10:00:00"), Clockout = c("1899-12-31 15:00:00",
"1899-12-31 15:00:00", "1899-12-31 18:00:00", "1899-12-31 18:00:00",
"1899-12-31 18:15:00", "1899-12-31 18:00:00")), row.names = c(NA,
-6L), class = "data.frame")
df2 <- structure(list(ID = c("Employee 1", "Employee 2", "Employee 3",
"Employee 4", "Employee 5", "Employee 6"), Day = c("Mon", "Mon",
"Mon", "Mon", "Mon", "Mon"), Clockin = c("7:00", "8:00", "12:00",
"13:00", "15:00", "23:00"), Clockout = c("15:00", "15:00", "14:00",
"20:00", "22:00", "23:30")), class = "data.frame", row.names = c(NA,
-6L))
Upvotes: 1