Louisa Fritz
Louisa Fritz

Reputation: 55

Adding rows with changing variable values in R

I have the following extract of my dataset about the occupancy of a football match:

example <- data.frame(Date <- c("2019-03-21", "2019-03-30", "2019-04-07", 
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21",  "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30", "2019-04-07",
                                "2019-03-21", "2019-03-30",
                                "2019-03-21", "2019-03-30",
                                "2019-03-21", "2019-03-30",
                                "2019-03-21"),
                         Block <- c("43L","43L", "43L", "15B", "15B", "15B", "43L", "43L",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B", "15B",
                                    "15B", "15B",
                                    "15B", "15B",
                                    "15B", "15B",
                                    "15B"),
                         Preis <- as.numeric(c("24", "35", "30", "35", "45", 
                                    "40", "26", "30",
                                    "35", "45", "40",
                                    "34", "43", "42",
                                    "35", "42", "45",
                                    "36", "45", "43",
                                    "36", "43", "40",
                                    "35", "41",
                                    "32", "42",
                                    "30", "42",
                                    "35")),
                         Max  <- c("3", "3", "3", "10", "10","10","3", "3",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10","10",
                                   "10", "10",
                                   "10", "10",
                                   "10", "10",
                                   "10"),
                         Actual <- c("2", "1", "2", "10", "9", "6","2", "2",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9", "6",
                                     "10", "9",
                                     "10", "9", 
                                     "10", "9",
                                     "10"),
                         Temperatur <- c("15", "20", "18","15", "20", "18", "15",  "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", "18",
                                         "15", "20", 
                                         "15", "20", 
                                         "15", "20", 
                                         "15"),
                      Placesold <- c("1", "1", "1", "1", "1","1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", 
                                     "1", "1",
                                     "1", "1",
                                     "1") )
colnames(example) <- c("Date", "Block", "Price", "Max", "Actual", "Temprature", "Placesold")

In reality, the dataset contains over 100 blocks and 46 different dates. If you take a closer look at the data, you can see that different numbers of seats are sold out in block 15B and 43L on different days.

table(example$Date, example$Block)
table(example$Placesold)

           15B 43L
  2019-03-21  10   2
  2019-03-30   9   1
  2019-4-07    6   2
> table(example$Placesold)

 1 
30 

My goal is to add the seats that were not sold to the data set. The variable Placesold should be 0 instead of 1. In addition, the average price of the sold tickets should be used instead of the price (without 0).

To clarify my goal, I have added the missing rows for the reduced data set.

result <- data.frame(Date <- c("2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                                "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07",
                               "2019-03-21", "2019-03-30", "2019-4-07"),
                      Block <- c("43L","43L", "43L", 
                                 "15B", "15B", "15B", 
                                 "43L", "43L","43L",
                                 "15B", "15B", "15B",
                                 "43L", "43L","43L",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B",
                                 "15B", "15B", "15B"),
                      Preis <- c("24", "35", "30", 
                                 "35", "45", "40", 
                                 "26", "35","30",
                                 "35", "45", "40",
                                 "25", "35","30",
                                 "34", "43", "42",
                                 "35", "42", "45",
                                 "36", "45", "43",
                                 "36", "43", "40",
                                 "35", "41", "41.67",
                                 "32", "42", "41.67",
                                 "30", "42", "41.67",
                                 "35","43.11","41.67"), 
                      Max  <- c("3", "3", "3", "10", "10","10",
                                "3", "3", "3",
                                "10", "10","10",
                                "3", "3", "3",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10",
                                "10", "10","10"), 
                      Actual <- c("2", "1", "2",
                                  "10", "9", "6",
                                   "2", "1","2",
                                  "10", "9", "6",
                                   "2", "1","2",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6",
                                  "10", "9", "6"),
                      Temperatur <- c("15", "20", "18",
                                      "15", "20", "18", 
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18",
                                      "15", "20", "18"),
                      Placesold <- c("1", "1", "1", "1", "1","1", 
                                     "1", "0", "1",
                                     "1", "1", "1",
                                     "0", "0", "0",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "1",
                                     "1", "1", "0",
                                     "1", "1", "0",
                                     "1", "1", "0",
                                     "1", "0", "0") )

colnames(result) <- c("Date", "Block", "Price", "Max", "Actual", "Temprature", "Placesold")

The results of the blocks and the data as well as the occurrence of the variable "Placesold" look like this:

table(result$Date, result$Block)
table(result$Placesold)

           15B 43L
  2019-03-21  10   3
  2019-03-30  10   3
  2019-4-07   10   3
> table(result$Placesold)

 0  1 
 9 30 

My first thought was to create a matrix with more rows, but to be honest I don't really know how. I hope you can help me.

Thank you very much.

Upvotes: 1

Views: 67

Answers (1)

benaja
benaja

Reputation: 147

I use dplyr functions and base::merge. merge can perform cross join between data frames, vectors and other types. Construction of each date and block pair - it includes unsold blocks of a date:

# ordered, unique vector of dates
dates <- example$Date %>% unique() %>% sort()
# ordered, unique vector of blocks
blocks <- example$Block %>% unique() %>% sort()
# insert dummy block to demonstrate effects of missing blocks
blocks <- c("11B", blocks)
# cross join of dates and blocks: each date paired with each block
# (it results a data.frame)
eachDateBlock <- merge(dates, blocks, by = NULL)
# merge generate x and y as names for the resulted data.frame
# I rename them as a preparation for left_join
eachDateBlock <- eachDateBlock %>% rename(Date = x, Block = y)

# rows from 'eachDateBlock' with matchig row in 'example' get values of variables,
# otherwise they filled by NAs
extendedData <- eachDateBlock %>%
  left_join(example, by = c("Date" = "Date", "Block" = "Block"))

# NOTE: before avgPrice you need something similar conversion - I ignore
# other numeric columns here
#example$Price <- as.double(example$Price)
#example$Placesold <- as.double(example$Placesold)


# Overwrite NAs in rows of supplied unsold blocks
avgPrice <- mean(example$Price)
result <- extendedData %>% mutate(
  Price = if_else(is.na(Price), avgPrice, Price),
  Placesold = if_else(is.na(Placesold), 0, Placesold)
) %>% arrange(Date)
> table(result$Date, result$Block)

             11B 15B 43L
  2019-03-21   1  10   2
  2019-03-30   1   9   1
  2019-04-07   1   6   2

> table(result$Placesold)  

 0  1 
 3 30 

> result
         Date Block    Price  Max Actual Temprature Placesold
1  2019-03-21   11B 37.53333 <NA>   <NA>       <NA>         0
.
.
.
12 2019-03-21   43L 24.00000    3      2         15         1
13 2019-03-21   43L 26.00000    3      2         15         1
14 2019-03-30   11B 37.53333 <NA>   <NA>       <NA>         0
15 2019-03-30   15B 45.00000   10      9         20         1
.
.
.
24 2019-03-30   43L 35.00000    3      1         20         1
25 2019-04-07   11B 37.53333 <NA>   <NA>       <NA>         0
.
.
.
31 2019-04-07   15B 40.00000   10      6         18         1
32 2019-04-07   43L 30.00000    3      2         18         1
33 2019-04-07   43L 30.00000    3      2         18         1

Upvotes: 1

Related Questions