BlackHat
BlackHat

Reputation: 755

Alternative to Nested For Loop in R

I have two data sets: competitor_data - contains competitors for a given product as well as the price and date when the competitor prices were collected.

product_price - the date of each price change.

competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                            crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
                                   "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
                            competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
                            competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)

competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                                      date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
                                                  "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
                                    price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)

product_price$date = as.Date(product_price$date)

Objective

My script below using nested for loops but it takes over 24 hours to process 5000 unique product_id:

unique_skus <- unique(product_price$productId)
all_competitive_data <- data.frame()
mid_step_data <- data.frame()

start_time <-Sys.time()
for (i in 1:length(unique_skus)){
  step1 <- subset(product_price, productId == unique_skus[i])
  transact_dates = unique(step1$date)
  for (a in 1:length(transact_dates)){
    step2 <- subset(step1, date ==transact_dates[a])
    step3 <- inner_join(step2,competitor_data, by='productId')
    if (nrow(subset(step3, date > crawl_date)) == 0){
      step3 <- step3[ order(step3$crawl_date , decreasing = FALSE ),]
      competitor_price <- head(step3,1)$competitor_price
      step2$competitor_price = competitor_price
    }
    else {
      step4 <- subset(step3, date > crawl_date)
      step4 <- step4[ order(step4$crawl_date , decreasing = TRUE ),]
      competitor_price <- head(step4,1)$competitor_price
      step2$competitor_price = competitor_price
    }
    step2$price_leader <- ifelse(step2$price <= step2$competitor_price, 1, 0)
    mid_step_data = rbind(mid_step_data,step2)
  }
  all_competitive_data <- rbind(all_competitive_data,mid_step_data)
}
Sys.time()-start_time
all_competitive_data = unique(all_competitive_data)

Is there a way to accomplish this quickly perhaps using dplyr?

Upvotes: 0

Views: 2004

Answers (2)

rawr
rawr

Reputation: 20811

competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                              crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
                                           "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
                              competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
                              competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)

competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                            date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
                                   "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
                            price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)

product_price$date = as.Date(product_price$date)

Use this function to fill a vector with NAs forward then backward

## fill in NAs
f <- function(..., lead = NA) {
  # f(NA, 1, NA, 2, NA, NA, lead = NULL)
  x <- c(lead, c(...))
  head(zoo::na.locf(zoo::na.locf(x, na.rm = FALSE), fromLast = TRUE),
       if (is.null(lead)) length(x) else -length(lead))
}

Merge the two by product and date. We pad the first price by product with an extra NA so this will effectively use the previous price when we fill in the NAs

Then do the comparison of price and competitor price. The last step is just some cleaning up to prove it is the same result

dd <- merge(product_price, competitor_data,
            by.y = c('productId', 'crawl_date'),
            by.x = c('productId', 'date'), all = TRUE)
dd$competitor_price <-
  unlist(sapply(split(dd$competitor_price, dd$productId), f))
dd$price_leader <- +(dd$price <= dd$competitor_price)
(res1 <- `rownames<-`(dd[!is.na(dd$price_leader), -4], NULL))

#    productId       date price competitor_price price_leader
# 1     banana 2014-02-22  2.09             2.50            1
# 2     banana 2014-05-03  2.04             2.35            1
# 3     banana 2014-05-05  2.12             2.35            1
# 4     banana 2014-06-22  2.31             2.22            0
# 5     banana 2014-07-05  2.29             2.52            1
# 6     banana 2014-08-31  2.01             2.52            1
# 7        fig 2014-03-09  5.21             5.32            1
# 8        fig 2014-05-21  5.22             5.32            1
# 9        fig 2014-06-19  5.36             5.56            1
# 10       fig 2014-06-22  5.91             5.56            0
# 11       fig 2014-07-03  5.36             5.86            1
# 12       fig 2014-09-08  5.56             5.96            1

res0 <- `rownames<-`(all_competitive_data[
  order(all_competitive_data$productId, all_competitive_data$date), ], NULL)

all.equal(res0, res1)
# [1] TRUE

You can change any of these steps to dplyr or data.table syntax; I don't use either one, but it should be straight-forward:

library('dplyr')
dd <- full_join(product_price, competitor_data,
                by = c(
                  'productId' = 'productId',
                  'date' = 'crawl_date'
                )
) %>% arrange(productId, date)

dd %>% group_by(productId) %>%
  mutate(
    competitor_price = f(competitor_price),
    price_leader = as.integer(price <= competitor_price)
) %>% filter(!is.na(price_leader)) %>% select(-competitor)

# Source: local data frame [12 x 5]
# Groups: productId [2]
# 
#      productId       date price competitor_price price_leader
#          <chr>     <date> <dbl>            <dbl>        <int>
#   1     banana 2014-02-22  2.09             2.50            1
#   2     banana 2014-05-03  2.04             2.35            1
#   3     banana 2014-05-05  2.12             2.35            1
#   4     banana 2014-06-22  2.31             2.22            0
#   5     banana 2014-07-05  2.29             2.52            1
#   6     banana 2014-08-31  2.01             2.52            1
#   7        fig 2014-03-09  5.21             5.32            1
#   8        fig 2014-05-21  5.22             5.32            1
#   9        fig 2014-06-19  5.36             5.56            1
#   10       fig 2014-06-22  5.91             5.56            0
#   11       fig 2014-07-03  5.36             5.86            1
#   12       fig 2014-09-08  5.56             5.96            1

Upvotes: 3

Andrew Lavers
Andrew Lavers

Reputation: 4378

The below solution uses dplyr join to match. (NOTE:I changed "crawl_date" to "date" so that dplyr join would select the matching columns automatically. It can be matched explicitly by using something like

by=c('productId'='productId', date'='crawl_date')  

as a parameter to join.

competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                              date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
                                           "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
                              competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","ga**strong text**mespot","louis vuitton","gucci","tesla"),
                              competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)

competitor_data$date = as.Date(competitor_data$date)

product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                            date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
                                   "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
                            price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)

product_price$date = as.Date(product_price$date)

require(dplyr)
joined <- product_price %>% left_join(competitor_data)
joined$leader <- as.integer(joined$price <= joined$competitor_price)

joined

The resulting data frame is

   productId       date price competitor competitor_price leader
1     banana 2014-05-05  2.12     google             1.99      0
2     banana 2014-06-22  2.31    tencent             2.52      1
3     banana 2014-07-05  2.29       <NA>               NA     NA
4     banana 2014-08-31  2.01       <NA>               NA     NA
5     banana 2014-05-03  2.04       <NA>               NA     NA
6     banana 2014-02-22  2.09       <NA>               NA     NA
7        fig 2014-05-21  5.22       <NA>               NA     NA
8        fig 2014-06-19  5.36       <NA>               NA     NA
9        fig 2014-03-09  5.21       <NA>               NA     NA
10       fig 2014-06-22  5.91       <NA>               NA     NA
11       fig 2014-07-03  5.36       <NA>               NA     NA
12       fig 2014-09-08  5.56       <NA>               NA     NA
> 

Upvotes: 0

Related Questions