Faryan
Faryan

Reputation: 409

Looping to get outliers data using dplyr in R

I have task to find outliers data, here it is my data:

# combination 1
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1")
product <- rep(0, 120)
detail <- rep(0, 120)
status <- rep(0, 120)
channel <- rep(0, 120)
transaction <-c(5664,4797,2515,1744,2166,2164,3513,6548,7620,8662,11295,11372,12094,14064,15412,13042,12779,14653,13586,12922,11321,9709,7899,5916,5791,5544,3567,1783,2900,4488,1830,4946,6735,16673,12024,8614,16545,11628,8856,13660,10913,11928,12359,9267,7672,6487,10677,4271,3351,4264,3764,3313,1492,4324,4277,4928,7752,8940,10545,10488,13766,11594,8317,12139,14274,11617,7513,8215,7687,4374,5465,4548,3419,2136,2679,2714,3072,2984,3203,6689,6113,8923,6755,6968,7711,5305,3827,4341,5915,6554,7376,6707,3685,4366,3086,1277,2218,1089,282 ,156 ,907,1691,2786,5229,6081,7133,8617,9759,12984,15060,11906,15909,21934,14993,9776,9721,8707,8080,2245,4702)
mycomb1 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

# combination 2
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5")
product <- rep(0, 120)
detail <- rep(1, 120)
status <- rep(0, 120)
channel <- rep(1, 120)
transaction <-c(5564,4588,3256,1034,2479,3678,5454,6104,8199,9261,10115,13665,12030,11996,12610,15061,15957,19130,15086,11779,14274,10614,7442,10216,4937,9178,5871,6702,3150,6505,4855,4744,10661,10485,10805,9321,14260,9831,15602,10599,14739,14117,8549,9638,9161,8282,7877,2060,2492,2816,3983,2053,4758,5717,2816,6141,8322,9745,9677,14478,11905,9580,8742,11012,5775,6773,8583,9261,10890,11950,5248,3579,3176,7268,605 ,1642,1122,6046,3241,4189,6534,7445,8518,7585,9574,5453,5467,4302,6664,8297,6801,5637,4323,2963,1872,1466,1472,1129,581 ,275 ,716 ,1963,2333,5507,7601,7478,7760,16975,11986,15282,12122,10815,16060,21552,11587,11873,7778,7058,6153,3423)
mycomb2 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

# combination 3
datex <- c(rep("07/01/2021", 22), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9")
product <- rep(1, 118)
detail <- rep(2, 118)
status <- rep(1, 118)
channel <- rep(2, 118)
transaction <- c(12,120 ,120 ,120 ,140 ,144 ,120 ,112 ,106 ,120 ,150 ,120 ,116 ,120 ,96,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,143 ,120 ,120 ,102 ,96,120 ,120 ,120 ,120 ,125 ,120 ,94,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,200 ,118 ,120 ,120 ,120 ,180 ,120 ,100 ,92,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,140 ,120 ,120 ,165 ,120 ,120 ,120 ,120 ,120 ,120 ,100 ,110 ,120 ,120 ,88,66,120 ,118 ,120 ,120 ,118 ,120 ,120 ,120 ,120 ,120 ,120 ,120 )
mycomb3 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

# my data
mydata <- rbind(mycomb1, mycomb2, mycomb3)
mydata

# A tibble: 358 x 8
#   datex      hourx seller product detail status channel transaction
#   <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
# 1 2021-07-01     0 do1          0      0      0       0        5664
# 2 2021-07-01     1 do1          0      0      0       0        4797
# 3 2021-07-01     2 do1          0      0      0       0        2515
# 4 2021-07-01     3 do1          0      0      0       0        1744
# 5 2021-07-01     4 do1          0      0      0       0        2166
# 6 2021-07-01     5 do1          0      0      0       0        2164
# 7 2021-07-01     6 do1          0      0      0       0        3513
# 8 2021-07-01     7 do1          0      0      0       0        6548
# 9 2021-07-01     8 do1          0      0      0       0        7620
#10 2021-07-01     9 do1          0      0      0       0        8662
# … with 348 more rows

this is addition function to reorder column, to make result become better.

# Function
moveme <- function (invec, movecommand){
  movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\\s+"), function(x) x[x != ""])
  movelist <- lapply(movecommand, function(x){
    Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
    ToMove <- setdiff(x, Where)
    list(ToMove, Where)
  })
  myVec <- invec
  for (i in seq_along(movelist)){
    temp <- setdiff(myVec, movelist[[i]][[1]])
    A <- movelist[[i]][[2]][1]
    if (A %in% c("before", "after")){
      ba <- movelist[[i]][[2]][2]
      if (A == "before"){
        after <- match(ba, temp)-1
      }
      else if (A == "after"){
        after <- match(ba, temp)
      }
    }
    else if (A == "first"){
      after <- 0
    }
    else if (A == "last"){
      after <- length(myVec)
    }
    myVec <- append(temp, values = movelist[[i]][[1]], after = after)
  }
  myVec
}

I want to use looping to get outliers data from many combination data that i have in mydata, this is manual procedure to get outliers data.

This is i do manually to get outliers for 1st & 2nd looping.

1ST LOOPING

## Looping 1
mydata.comb1 <- subset(mydata, seller == "do1" & product == 0 & detail == 0 & status == 0 & channel == 0)
# Checking Seasonality
library(seastests)
isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)
#TRUE
library(dplyr)
library(timetk)
mydata.comb1 %>%
  group_by(across(seller:channel)) %>%
  tk_anomaly_diagnostics(datex, transaction) %>%
  ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>% 
  left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5

looping1
# A tibble: 6 x 8
#  datex      hourx seller product detail status channel transaction
#  <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
#1 2021-07-01    14 do1          0      0      0       0       15412
#2 2021-07-02     9 do1          0      0      0       0       16673
#3 2021-07-02    12 do1          0      0      0       0       16545
#4 2021-07-02    22 do1          0      0      0       0       10677
#5 2021-07-05    16 do1          0      0      0       0       21934
#6 2021-07-05    22 do1          0      0      0       0        2245

2ND LOOPING

mydata.comb2 <- subset(mydata, seller == "do9" & product == 1 & detail == 2 & status == 1 & channel == 2)
# Checking Seasonality
library(seastests)
y <- mydata.comb2$transaction
isSeasonal(as.ts(y), test = "wo", freq = 24)
#FALSE
library(dplyr)
library(qcc)
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb2[model.anomaly.non.seasonal.indeks,]
looping2 <- model.anomaly.non.seasonal.result

looping2
#        datex hourx seller product detail status channel transaction
#6  2021-07-01     7    do9       1      2      1       2         144
#11 2021-07-01    12    do9       1      2      1       2         150
#48 2021-07-03     1    do9       1      2      1       2         200
#53 2021-07-03     6    do9       1      2      1       2         180
#94 2021-07-04    23    do9       1      2      1       2         165

Then we collect the result from the looping. Here it is the result:

myresult <- rbind(looping1, looping2)
myresult
# A tibble: 11 x 8
#   datex      hourx seller product detail status channel transaction
# * <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
# 1 2021-07-01    14 do1          0      0      0       0       15412
# 2 2021-07-02     9 do1          0      0      0       0       16673
# 3 2021-07-02    12 do1          0      0      0       0       16545
# 4 2021-07-02    22 do1          0      0      0       0       10677
# 5 2021-07-05    16 do1          0      0      0       0       21934
# 6 2021-07-05    22 do1          0      0      0       0        2245
# 7 2021-07-01     7 do9          1      2      1       2         144
# 8 2021-07-01    12 do9          1      2      1       2         150
# 9 2021-07-03     1 do9          1      2      1       2         200
#10 2021-07-03     6 do9          1      2      1       2         180
#11 2021-07-04    23 do9          1      2      1       2         165

I have 120K looping, it's going to be painfull if i do manually. So, how do i use dplyr to make this looping procedure more simple? Many Thank You.

Upvotes: 2

Views: 87

Answers (1)

Ronak Shah
Ronak Shah

Reputation: 389135

It might be possible to optimise the code further but putting the code that you already have in a function you can do -

library(dplyr)
library(seastests)
library(timetk)
library(qcc)
library(purrr)

custom_fn <- function(mydata.comb1) {
  if(isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)) {
    mydata.comb1 %>%
      group_by(across(seller:channel)) %>%
      tk_anomaly_diagnostics(datex, transaction) %>%
      ungroup -> model.anomaly.seasonal
    model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
    model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
    model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
    colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
    model.anomaly.seasonal.data3 %>% 
      left_join(mydata.comb1) -> model.anomaly.seasonal.data4
    model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
    looping1 <- model.anomaly.seasonal.data5
  } else {
    y <- mydata.comb1$transaction
    model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
    model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
    model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
    model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
    model.anomaly.non.seasonal.result <- mydata.comb1[model.anomaly.non.seasonal.indeks,]
    looping1 <- model.anomaly.non.seasonal.result
  }
  return(looping1)
}

Split the data into groups and apply this functions to every subset.

mydata %>%
  group_split(seller, product, detail, status, channel) %>%
  map_df(custom_fn)

#   datex      hourx seller product detail status channel transaction
#   <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
# 1 2020-07-01    14 do1          0      0      0       0       15412
# 2 2020-07-02     9 do1          0      0      0       0       16673
# 3 2020-07-02    12 do1          0      0      0       0       16545
# 4 2020-07-02    22 do1          0      0      0       0       10677
# 5 2020-07-05    16 do1          0      0      0       0       21934
# 6 2020-07-05    22 do1          0      0      0       0        2245
# 7 2020-07-03    16 do5          0      1      0       1        5775
# 8 2020-07-05    11 do5          0      1      0       1       16975
# 9 2020-07-05    17 do5          0      1      0       1       21552
#10 2020-07-01     7 do9          1      2      1       2         144
#11 2020-07-01    12 do9          1      2      1       2         150
#12 2020-07-03     1 do9          1      2      1       2         200
#13 2020-07-03     6 do9          1      2      1       2         180
#14 2020-07-04    23 do9          1      2      1       2         165

Upvotes: 1

Related Questions