Reputation: 409
I have task to find outliers data, here it is my data:
# combination 1
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1")
product <- rep(0, 120)
detail <- rep(0, 120)
status <- rep(0, 120)
channel <- rep(0, 120)
transaction <-c(5664,4797,2515,1744,2166,2164,3513,6548,7620,8662,11295,11372,12094,14064,15412,13042,12779,14653,13586,12922,11321,9709,7899,5916,5791,5544,3567,1783,2900,4488,1830,4946,6735,16673,12024,8614,16545,11628,8856,13660,10913,11928,12359,9267,7672,6487,10677,4271,3351,4264,3764,3313,1492,4324,4277,4928,7752,8940,10545,10488,13766,11594,8317,12139,14274,11617,7513,8215,7687,4374,5465,4548,3419,2136,2679,2714,3072,2984,3203,6689,6113,8923,6755,6968,7711,5305,3827,4341,5915,6554,7376,6707,3685,4366,3086,1277,2218,1089,282 ,156 ,907,1691,2786,5229,6081,7133,8617,9759,12984,15060,11906,15909,21934,14993,9776,9721,8707,8080,2245,4702)
mycomb1 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# combination 2
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5")
product <- rep(0, 120)
detail <- rep(1, 120)
status <- rep(0, 120)
channel <- rep(1, 120)
transaction <-c(5564,4588,3256,1034,2479,3678,5454,6104,8199,9261,10115,13665,12030,11996,12610,15061,15957,19130,15086,11779,14274,10614,7442,10216,4937,9178,5871,6702,3150,6505,4855,4744,10661,10485,10805,9321,14260,9831,15602,10599,14739,14117,8549,9638,9161,8282,7877,2060,2492,2816,3983,2053,4758,5717,2816,6141,8322,9745,9677,14478,11905,9580,8742,11012,5775,6773,8583,9261,10890,11950,5248,3579,3176,7268,605 ,1642,1122,6046,3241,4189,6534,7445,8518,7585,9574,5453,5467,4302,6664,8297,6801,5637,4323,2963,1872,1466,1472,1129,581 ,275 ,716 ,1963,2333,5507,7601,7478,7760,16975,11986,15282,12122,10815,16060,21552,11587,11873,7778,7058,6153,3423)
mycomb2 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# combination 3
datex <- c(rep("07/01/2021", 22), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9")
product <- rep(1, 118)
detail <- rep(2, 118)
status <- rep(1, 118)
channel <- rep(2, 118)
transaction <- c(12,120 ,120 ,120 ,140 ,144 ,120 ,112 ,106 ,120 ,150 ,120 ,116 ,120 ,96,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,143 ,120 ,120 ,102 ,96,120 ,120 ,120 ,120 ,125 ,120 ,94,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,200 ,118 ,120 ,120 ,120 ,180 ,120 ,100 ,92,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,140 ,120 ,120 ,165 ,120 ,120 ,120 ,120 ,120 ,120 ,100 ,110 ,120 ,120 ,88,66,120 ,118 ,120 ,120 ,118 ,120 ,120 ,120 ,120 ,120 ,120 ,120 )
mycomb3 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# my data
mydata <- rbind(mycomb1, mycomb2, mycomb3)
mydata
# A tibble: 358 x 8
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2021-07-01 0 do1 0 0 0 0 5664
# 2 2021-07-01 1 do1 0 0 0 0 4797
# 3 2021-07-01 2 do1 0 0 0 0 2515
# 4 2021-07-01 3 do1 0 0 0 0 1744
# 5 2021-07-01 4 do1 0 0 0 0 2166
# 6 2021-07-01 5 do1 0 0 0 0 2164
# 7 2021-07-01 6 do1 0 0 0 0 3513
# 8 2021-07-01 7 do1 0 0 0 0 6548
# 9 2021-07-01 8 do1 0 0 0 0 7620
#10 2021-07-01 9 do1 0 0 0 0 8662
# … with 348 more rows
this is addition function to reorder column, to make result become better.
# Function
moveme <- function (invec, movecommand){
movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\\s+"), function(x) x[x != ""])
movelist <- lapply(movecommand, function(x){
Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
ToMove <- setdiff(x, Where)
list(ToMove, Where)
})
myVec <- invec
for (i in seq_along(movelist)){
temp <- setdiff(myVec, movelist[[i]][[1]])
A <- movelist[[i]][[2]][1]
if (A %in% c("before", "after")){
ba <- movelist[[i]][[2]][2]
if (A == "before"){
after <- match(ba, temp)-1
}
else if (A == "after"){
after <- match(ba, temp)
}
}
else if (A == "first"){
after <- 0
}
else if (A == "last"){
after <- length(myVec)
}
myVec <- append(temp, values = movelist[[i]][[1]], after = after)
}
myVec
}
I want to use looping to get outliers data from many combination data that i have in mydata, this is manual procedure to get outliers data.
This is i do manually to get outliers for 1st & 2nd looping.
1ST LOOPING
## Looping 1
mydata.comb1 <- subset(mydata, seller == "do1" & product == 0 & detail == 0 & status == 0 & channel == 0)
# Checking Seasonality
library(seastests)
isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)
#TRUE
library(dplyr)
library(timetk)
mydata.comb1 %>%
group_by(across(seller:channel)) %>%
tk_anomaly_diagnostics(datex, transaction) %>%
ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>%
left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5
looping1
# A tibble: 6 x 8
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2021-07-01 14 do1 0 0 0 0 15412
#2 2021-07-02 9 do1 0 0 0 0 16673
#3 2021-07-02 12 do1 0 0 0 0 16545
#4 2021-07-02 22 do1 0 0 0 0 10677
#5 2021-07-05 16 do1 0 0 0 0 21934
#6 2021-07-05 22 do1 0 0 0 0 2245
2ND LOOPING
mydata.comb2 <- subset(mydata, seller == "do9" & product == 1 & detail == 2 & status == 1 & channel == 2)
# Checking Seasonality
library(seastests)
y <- mydata.comb2$transaction
isSeasonal(as.ts(y), test = "wo", freq = 24)
#FALSE
library(dplyr)
library(qcc)
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb2[model.anomaly.non.seasonal.indeks,]
looping2 <- model.anomaly.non.seasonal.result
looping2
# datex hourx seller product detail status channel transaction
#6 2021-07-01 7 do9 1 2 1 2 144
#11 2021-07-01 12 do9 1 2 1 2 150
#48 2021-07-03 1 do9 1 2 1 2 200
#53 2021-07-03 6 do9 1 2 1 2 180
#94 2021-07-04 23 do9 1 2 1 2 165
Then we collect the result from the looping. Here it is the result:
myresult <- rbind(looping1, looping2)
myresult
# A tibble: 11 x 8
# datex hourx seller product detail status channel transaction
# * <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2021-07-01 14 do1 0 0 0 0 15412
# 2 2021-07-02 9 do1 0 0 0 0 16673
# 3 2021-07-02 12 do1 0 0 0 0 16545
# 4 2021-07-02 22 do1 0 0 0 0 10677
# 5 2021-07-05 16 do1 0 0 0 0 21934
# 6 2021-07-05 22 do1 0 0 0 0 2245
# 7 2021-07-01 7 do9 1 2 1 2 144
# 8 2021-07-01 12 do9 1 2 1 2 150
# 9 2021-07-03 1 do9 1 2 1 2 200
#10 2021-07-03 6 do9 1 2 1 2 180
#11 2021-07-04 23 do9 1 2 1 2 165
I have 120K looping, it's going to be painfull if i do manually. So, how do i use dplyr to make this looping procedure more simple? Many Thank You.
Upvotes: 2
Views: 87
Reputation: 389135
It might be possible to optimise the code further but putting the code that you already have in a function you can do -
library(dplyr)
library(seastests)
library(timetk)
library(qcc)
library(purrr)
custom_fn <- function(mydata.comb1) {
if(isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)) {
mydata.comb1 %>%
group_by(across(seller:channel)) %>%
tk_anomaly_diagnostics(datex, transaction) %>%
ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>%
left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5
} else {
y <- mydata.comb1$transaction
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb1[model.anomaly.non.seasonal.indeks,]
looping1 <- model.anomaly.non.seasonal.result
}
return(looping1)
}
Split the data into groups and apply this functions to every subset.
mydata %>%
group_split(seller, product, detail, status, channel) %>%
map_df(custom_fn)
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2020-07-01 14 do1 0 0 0 0 15412
# 2 2020-07-02 9 do1 0 0 0 0 16673
# 3 2020-07-02 12 do1 0 0 0 0 16545
# 4 2020-07-02 22 do1 0 0 0 0 10677
# 5 2020-07-05 16 do1 0 0 0 0 21934
# 6 2020-07-05 22 do1 0 0 0 0 2245
# 7 2020-07-03 16 do5 0 1 0 1 5775
# 8 2020-07-05 11 do5 0 1 0 1 16975
# 9 2020-07-05 17 do5 0 1 0 1 21552
#10 2020-07-01 7 do9 1 2 1 2 144
#11 2020-07-01 12 do9 1 2 1 2 150
#12 2020-07-03 1 do9 1 2 1 2 200
#13 2020-07-03 6 do9 1 2 1 2 180
#14 2020-07-04 23 do9 1 2 1 2 165
Upvotes: 1