Reputation: 613
I have recently started my adventure with R and trying to solve the following problem. I have data.frame including arrivals and departures for particular months of the year. I have to find what is the median for every month through all this years. My results should be saved in .csv. Below is just the sample, the whole observations include dates up to 2017 (1548 obs. in total):
#dput output assigned to the flights variable
flights <- structure(list(X = 1:163, ReportPeriod = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 6L), .Label = c("01/01/2006 12:00:00 AM", "01/01/2007 12:00:00 AM",
"02/01/2006 12:00:00 AM", "02/01/2007 12:00:00 AM", "03/01/2006 12:00:00 AM",
"03/01/2007 12:00:00 AM", "04/01/2006 12:00:00 AM", "05/01/2006 12:00:00 AM",
"06/01/2006 12:00:00 AM", "07/01/2006 12:00:00 AM", "08/01/2006 12:00:00 AM",
"09/01/2006 12:00:00 AM", "10/01/2006 12:00:00 AM", "11/01/2006 12:00:00 AM",
"12/01/2006 12:00:00 AM"), class = "factor"), FlightType = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 1L), .Label = c("Charter", "Commuter", "Scheduled"), class = "factor"),
Arrival_Departure = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L), .Label = c("Arrival", "Departure"
), class = "factor"), Domestic_International = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("Domestic",
"International"), class = "factor"), FlightOpsCount = c(57L,
299L, 62L, 5L, 5996L, 31L, 5995L, 31L, 13695L, 4026L, 13666L,
4085L, 22L, 101L, 21L, 100L, 5525L, 28L, 5525L, 28L, 12308L,
3381L, 12322L, 3398L, 51L, 4L, 47L, 5L, 6217L, 31L, 6216L,
30L, 13925L, 3952L, 13963L, 3961L, 57L, 2L, 52L, 5885L, 31L,
5886L, 31L, 13420L, 3884L, 13461L, 3897L, 26L, 5L, 24L, 3L,
6089L, 41L, 6089L, 41L, 14126L, 3968L, 14138L, 3984L, 18L,
1L, 18L, 5980L, 39L, 5980L, 39L, 14208L, 4030L, 14211L, 4028L,
16L, 2L, 14L, 6092L, 39L, 6092L, 39L, 14866L, 4483L, 14883L,
4506L, 15L, 1L, 16L, 6134L, 40L, 6134L, 40L, 15243L, 4303L,
15272L, 4328L, 24L, 1L, 18L, 5903L, 37L, 5903L, 37L, 13832L,
3823L, 13815L, 3865L, 48L, 285L, 50L, 6135L, 40L, 6135L,
40L, 14292L, 3605L, 14332L, 3898L, 48L, 3L, 50L, 2L, 5772L,
39L, 5772L, 39L, 13855L, 3706L, 13855L, 3718L, 46L, 6L, 44L,
3L, 6144L, 40L, 6144L, 40L, 14417L, 4086L, 14474L, 4103L,
38L, 3L, 42L, 3L, 6081L, 94L, 6081L, 94L, 14140L, 4301L,
14165L, 4308L, 14L, 7L, 16L, 5L, 5470L, 83L, 5470L, 83L,
12734L, 3779L, 12768L, 3783L, 33L)), .Names = c("X", "ReportPeriod",
"FlightType", "Arrival_Departure", "Domestic_International",
"FlightOpsCount"), class = "data.frame", row.names = c(NA, -163L
))
And here is the expected output I should get:
month,Arrival_Departure,FlightOpsCount
January,Arrival,23152
January,Departure,23173
February,Arrival,20849
February,Departure,20878
March,Arrival,23981.5
March,Departure,24005
April,Arrival,23393
April,Departure,23451.5
May,Arrival,24342.5
May,Departure,24376
June,Arrival,24622.5
June,Departure,24667
July,Arrival,25795.5
July,Departure,25837
August,Arrival,25573.5
August,Departure,25600
September,Arrival,23306
September,Departure,23315
October,Arrival,23965
October,Departure,23990
November,Arrival,22379
November,Departure,22361
December,Arrival,23594
December,Departure,23579
I decided to do it in several steps, first thing I try to do is to receive correct date format from character string:
library(dbplyr)
step_1 = as_tibble(flights)
step_2 = step_1 %>%
transmute(
date_format = as.POSIXct(strptime(ReportPeriod, format = "%m/%d/%Y")),
even_new_date = as.Date(date_format, format = "%Y"),
Arrival_Departure,
FlightOpsCount)
And this is actually tricky for me.. I don't understand how to do this correctly and why there are two ways to get date format eg 2006-01-01 vs 2005-12-31? Which one is correct in this case?
Now, suppose 2006-01-01 is correct, I could use as.POSIXct within months() function to obtain month:
step2 = step_1 %>%
transmute(
month = months(as.POSIXct(strptime(ReportPeriod, format = "%m/%d/%Y"))),
Arrival_Departure,
FlightOpsCount)
Next step would require grouping operation:
step_3 = step_2 %>%
group_by(month, Arrival_Departure) %>%
summarize(median = median(FlightOpsCount))
And when writing it to csv I'm getting ridiculously small values..
"","month","Arrival_Departure","median"
"1","April","Arrival",102.5
"2","April","Departure",3061
"3","August","Arrival",1412.5
"4","August","Departure",3667.5
"5","December","Arrival",102
"6","December","Departure",1738
"7","February","Arrival",116
"8","February","Departure",116
"9","January","Arrival",284
"10","January","Departure",1708
"11","July","Arrival",95.5
"12","July","Departure",3571
"13","June","Arrival",119
"14","June","Departure",3292
"15","March","Arrival",115
"16","March","Departure",1759
"17","May","Arrival",1609.5
"18","May","Departure",3121
"19","November","Arrival",93.5
"20","November","Departure",93.5
"21","October","Arrival",2359
"22","October","Departure",2756
"23","September","Arrival",1228
"24","September","Departure",3187.5
Can somebody guide me through it and show me the proper way to solve the problem?
I would appreciate any help.
Upvotes: 0
Views: 597
Reputation: 613
Thank you all for your help! I do receive the correct values for particular months, this is my code:
#summarize Arrival & Departures through the years
step_1 <- flights %>%
group_by(ReportPeriod, Arrival_Departure) %>%
summarise(sum = sum(FlightOpsCount)) %>%
arrange(ReportPeriod) %>%
ungroup()
#modify date format in ReportPeriod column to receive months
step_2 <- step_1 %>%
transmute(month = months(as.Date(ReportPeriod,"%m/%d/%Y")),
Arrival_Departure,
sum) %>%
group_by(month, Arrival_Departure) %>%
summarise(FlightOpsCount = median(sum)) %>%
write.csv(., "flights_output.csv", row.names = FALSE, quote = FALSE)
However, I am getting months in alphabetical instead of chronological order. I found the solution somewhere here but it does not work properly and I am only getting NAs. Obviously I am calling it before writting whatever to .csv and add ungroup() at the end of step_2.
step_3 <- step_2 %>%
mutate(month = factor(month.name[month], levels = month.name)) %>%
arrange(month)
Upvotes: 1
Reputation: 76651
I believe this is much simpler. Note that the format for months
is slightly different from yours.
library(zoo)
months <- as.yearmon(flights$ReportPeriod, "%m/%d/%Y %H:%M:%S")
agg <- aggregate(FlightOpsCount ~ months + Arrival_Departure, flights, median)
Anyway, I couldn't get medians not even near the expected values in your question. And since these are medians, it seems that there is some mistake in your expectations.
If you want another format for the dates, you can use function format.Date
.
format(as.Date(months), "%Y %B") # or "%B %Y"
For a long list of formats possible, see the help page ?strptime
.
Upvotes: 2
Reputation: 2753
Here's a data.table
approach:
library(data.table)
library(lubridate)
dat <- fread("sample_data.txt", col.names = c("dte", "flight", "typ1","typ2","flt_count"))
dat$dte <- as.POSIXct(strptime(dat$dte, format = "%m/%d/%Y %H:%M:%S"), tz = "GMT")
new_dat <- dat[, sum(flt_count), by = list(month(dte),typ1)]
I renamed the columns just for convenience. You can edit the by
arguments to group/perform operations based on other variables as well. The output of the above code snippet is:
> new_dat
month typ1 V1
1: 1 Arrival 24104
2: 1 Departure 23844
3: 2 Arrival 21365
4: 2 Departure 21394
5: 3 Arrival 24180
6: 3 Departure 24222
Which seems to be what you're looking for. data.table
is really useful for large datasets.
For writing results, use write.csv(new_dat, "new_file.csv", row_names = F)
Hope this is helpful.
Upvotes: 1
Reputation: 325
You're almost there, though I would recommend using dplyr:
# Step 1: Convert dates using as.Date function
flights$ReportPeriod <- as.Date(flights$ReportPeriod, "%m/%d/%Y")
# Step 2: Use dplyr to summarize information
require(dplyr)
flights <- flights %>%
group_by(ReportPeriod, Arrival_Departure) %>%
summarise(FlightOpsCount = median(FlightOpsCount)) %>%
as.data.frame()
# Step 3: Convert date to string for month name
flights <- flights %>%
mutate(ReportPeriod = months(ReportPeriod)) %>%
rename(month = ReportPeriod) # If you need to rename the column to be "months"
# Alternate Step 3: If you want to add in year as well
require(lubridate)
flights <- flights %>%
mutate(ReportPeriod = paste(months(ReportPeriod),
year(ReportPeriod),
sep = " ")) %>%
rename(month = ReportPeriod) # If you need to rename the column to be "months"
# Step 4: Write to csv
write.csv(flights, "file_name.csv", row.names = FALSE)
Cheers.
Upvotes: 3