Reputation: 31
I am attempting to use the by
function to substitue outliers of many variables in a dataframe, according to a Group variable.
The followings are my efforts. But, I get an error.
# R code:
library(tidyverse)
library(dplyr)
# outlier function definition
my_outlier <- function(x){
stats <- boxplot.stats(unlist(x))
outlier <- stats$out
outlier_idx <- which(unlist(x) %in% outlier)
max <- max(x[-outlier_idx]); min <- min(x[-outlier_idx])
x <- ifelse(x>max, max,ifelse(x < min, min, x) )
return(x)
}
# use the above defined func to substitue outliers of 1 variable in a dataframe, according to a Group variable.
group_data <- as_tibble(data.frame(x=c(runif(10),2.5,-2.3,runif(10,1,2),3.5,-1.5), group=c(rep(1,12),rep(2,12)) ) )
View(group_data)
by(group_data$x, group_data$group, my_outlier, simplify=FALSE)
# use the above defined func to substitue outliers of 1+ variable in a dataframe, according to a Group variable.
group_datas <- as_tibble(data.frame(x=c(runif(10),2.5,-2.3,runif(10,1,2),3.5,-1.5),
y=c(runif(10,2,3),4,-1,runif(10,3,4),6,-1),
group=c(rep(1,12),rep(2,12)) ) )
by(group_data[,1:2], group_data$group, my_outlier)
when using my defined function to substitue outliers of 1+ variable in a dataframe, according to a Group variable, I got an error.
I don't know what part of my code caused the error.
Upvotes: 3
Views: 159
Reputation: 23129
For multivariate outliers, boxplot.stats
won't work, you can use the outlier
test from the packages outliers
:
library(outliers)
my_outlier2 <- function(x, plot=TRUE){
x <- as.matrix(x)
outlier <- rbind(outlier(x),outlier(x,opposite=TRUE))
outlier_idx <- which(duplicated(rbind(x, outlier), fromLast=TRUE))#which(apply(mapply(x, outlier, FUN="=="), MARGIN=1, FUN=all))
if (plot) { # if 2-D data, visualize
plot(x[,1], x[,2], pch=19, xlab='x', ylab='y')
points(x[outlier_idx,1], x[outlier_idx,2], col='red', pch=8, cex=2)
legend('topleft', legend=c('non-outlier', 'outlier'), pch=c(19, 8), col=c('black', 'red'))
}
x <- x[-outlier_idx,]
return(x)
}
# use the above defined func to substitue outliers of 1+ variable in a dataframe, according to a Group variable.
group_datas <- as_tibble(data.frame(x=c(runif(10),2.5,-2.3,runif(10,1,2),3.5,-1.5),
y=c(runif(10,2,3),4,-1,runif(10,3,4),6,-1),
group=c(rep(1,12),rep(2,12)) ) )
by(group_datas[,1:2], group_datas$group, my_outlier2)
Upvotes: 1