Reputation: 55
I have the following extract of my dataset about the occupancy of a football match:
example <- data.frame(Date <- c("2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-04-07",
"2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-03-30", "2019-04-07",
"2019-03-21", "2019-03-30",
"2019-03-21", "2019-03-30",
"2019-03-21", "2019-03-30",
"2019-03-21"),
Block <- c("43L","43L", "43L", "15B", "15B", "15B", "43L", "43L",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B",
"15B", "15B",
"15B", "15B",
"15B"),
Preis <- as.numeric(c("24", "35", "30", "35", "45",
"40", "26", "30",
"35", "45", "40",
"34", "43", "42",
"35", "42", "45",
"36", "45", "43",
"36", "43", "40",
"35", "41",
"32", "42",
"30", "42",
"35")),
Max <- c("3", "3", "3", "10", "10","10","3", "3",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10",
"10", "10",
"10", "10",
"10"),
Actual <- c("2", "1", "2", "10", "9", "6","2", "2",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9",
"10", "9",
"10", "9",
"10"),
Temperatur <- c("15", "20", "18","15", "20", "18", "15", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20",
"15", "20",
"15", "20",
"15"),
Placesold <- c("1", "1", "1", "1", "1","1", "1", "1",
"1", "1", "1",
"1", "1", "1",
"1", "1", "1",
"1", "1", "1",
"1", "1", "1",
"1", "1",
"1", "1",
"1", "1",
"1") )
colnames(example) <- c("Date", "Block", "Price", "Max", "Actual", "Temprature", "Placesold")
In reality, the dataset contains over 100 blocks and 46 different dates. If you take a closer look at the data, you can see that different numbers of seats are sold out in block 15B and 43L on different days.
table(example$Date, example$Block)
table(example$Placesold)
15B 43L
2019-03-21 10 2
2019-03-30 9 1
2019-4-07 6 2
> table(example$Placesold)
1
30
My goal is to add the seats that were not sold to the data set. The variable Placesold should be 0 instead of 1. In addition, the average price of the sold tickets should be used instead of the price (without 0).
To clarify my goal, I have added the missing rows for the reduced data set.
result <- data.frame(Date <- c("2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07",
"2019-03-21", "2019-03-30", "2019-4-07"),
Block <- c("43L","43L", "43L",
"15B", "15B", "15B",
"43L", "43L","43L",
"15B", "15B", "15B",
"43L", "43L","43L",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B",
"15B", "15B", "15B"),
Preis <- c("24", "35", "30",
"35", "45", "40",
"26", "35","30",
"35", "45", "40",
"25", "35","30",
"34", "43", "42",
"35", "42", "45",
"36", "45", "43",
"36", "43", "40",
"35", "41", "41.67",
"32", "42", "41.67",
"30", "42", "41.67",
"35","43.11","41.67"),
Max <- c("3", "3", "3", "10", "10","10",
"3", "3", "3",
"10", "10","10",
"3", "3", "3",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10",
"10", "10","10"),
Actual <- c("2", "1", "2",
"10", "9", "6",
"2", "1","2",
"10", "9", "6",
"2", "1","2",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6",
"10", "9", "6"),
Temperatur <- c("15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18",
"15", "20", "18"),
Placesold <- c("1", "1", "1", "1", "1","1",
"1", "0", "1",
"1", "1", "1",
"0", "0", "0",
"1", "1", "1",
"1", "1", "1",
"1", "1", "1",
"1", "1", "1",
"1", "1", "0",
"1", "1", "0",
"1", "1", "0",
"1", "0", "0") )
colnames(result) <- c("Date", "Block", "Price", "Max", "Actual", "Temprature", "Placesold")
The results of the blocks and the data as well as the occurrence of the variable "Placesold" look like this:
table(result$Date, result$Block)
table(result$Placesold)
15B 43L
2019-03-21 10 3
2019-03-30 10 3
2019-4-07 10 3
> table(result$Placesold)
0 1
9 30
My first thought was to create a matrix with more rows, but to be honest I don't really know how. I hope you can help me.
Thank you very much.
Upvotes: 1
Views: 67
Reputation: 147
I use dplyr functions and base::merge
. merge
can perform cross join between data frames, vectors and other types.
Construction of each date and block pair - it includes unsold blocks of a date:
# ordered, unique vector of dates
dates <- example$Date %>% unique() %>% sort()
# ordered, unique vector of blocks
blocks <- example$Block %>% unique() %>% sort()
# insert dummy block to demonstrate effects of missing blocks
blocks <- c("11B", blocks)
# cross join of dates and blocks: each date paired with each block
# (it results a data.frame)
eachDateBlock <- merge(dates, blocks, by = NULL)
# merge generate x and y as names for the resulted data.frame
# I rename them as a preparation for left_join
eachDateBlock <- eachDateBlock %>% rename(Date = x, Block = y)
# rows from 'eachDateBlock' with matchig row in 'example' get values of variables,
# otherwise they filled by NAs
extendedData <- eachDateBlock %>%
left_join(example, by = c("Date" = "Date", "Block" = "Block"))
# NOTE: before avgPrice you need something similar conversion - I ignore
# other numeric columns here
#example$Price <- as.double(example$Price)
#example$Placesold <- as.double(example$Placesold)
# Overwrite NAs in rows of supplied unsold blocks
avgPrice <- mean(example$Price)
result <- extendedData %>% mutate(
Price = if_else(is.na(Price), avgPrice, Price),
Placesold = if_else(is.na(Placesold), 0, Placesold)
) %>% arrange(Date)
> table(result$Date, result$Block)
11B 15B 43L
2019-03-21 1 10 2
2019-03-30 1 9 1
2019-04-07 1 6 2
> table(result$Placesold)
0 1
3 30
> result
Date Block Price Max Actual Temprature Placesold
1 2019-03-21 11B 37.53333 <NA> <NA> <NA> 0
.
.
.
12 2019-03-21 43L 24.00000 3 2 15 1
13 2019-03-21 43L 26.00000 3 2 15 1
14 2019-03-30 11B 37.53333 <NA> <NA> <NA> 0
15 2019-03-30 15B 45.00000 10 9 20 1
.
.
.
24 2019-03-30 43L 35.00000 3 1 20 1
25 2019-04-07 11B 37.53333 <NA> <NA> <NA> 0
.
.
.
31 2019-04-07 15B 40.00000 10 6 18 1
32 2019-04-07 43L 30.00000 3 2 18 1
33 2019-04-07 43L 30.00000 3 2 18 1
Upvotes: 1