Julia
Julia

Reputation: 141

Create multiple outlier plots and a data frame with outliers in R

# Sample data: 11 observations of 19 variables (integer and double columns).
# check.names = FALSE keeps the dotted column names exactly as written.
mydat <- data.frame(
  Variable_16 = c(1474L, 1392L, 951L, 830L, 624L, 2429L, 2270L, 925L, 704L,
                  972L, 1081L),
  Variable_17 = c(0.78, 0.78, 0.55, 0.49, 0.4, 1.61, 1.52, 0.64, 0.49, 0.7,
                  0.8),
  Variable_18 = c(20175L, 16780L, 16459L, 16173L, 13291L, 14681L, 14574L,
                  14949L, 12904L, 12684L, 12544L),
  Variable_19 = c(10.65, 9.45, 9.5, 9.46, 8.5, 9.7, 9.73, 10.37, 9.05, 9.12,
                  9.32),
  Variable_20 = c(111.31, 110, 88, 86, 86, 89, 100, 65, 98, 78, 75),
  Variable_21 = c(597L, 554L, 508L, 588L, 441L, 422L, 423L, 475L, 420L, 388L,
                  377L),
  Variable_22 = c(337L, 294L, 359L, 310L, 253L, 255L, 243L, 272L, 274L, 266L,
                  246L),
  Variable_23 = c(286L, 250L, 278L, 232L, 214L, 189L, 190L, 213L, 208L, 210L,
                  193L),
  Variable_24 = c(251L, 221L, 238L, 214L, 169L, 153L, 149L, 176L, 183L, 163L,
                  156L),
  Variable_25 = c(241L, 191L, 213L, 178L, 125L, 136L, 128L, 148L, 151L, 143L,
                  136L),
  Variable_26 = c(0.315039578, 0.311936937, 0.29330254, 0.344060854,
                  0.282149712, 0.278916061, 0.282376502, 0.329403606,
                  0.294530154, 0.278936017, 0.280089153),
  Variable_27 = c(0.177836412, 0.165540541, 0.207274827, 0.181392627,
                  0.161868202, 0.168539326, 0.162216288, 0.188626907,
                  0.192145863, 0.191229331, 0.182763744),
  Variable_28 = c(0.150923483, 0.140765766, 0.160508083, 0.135751902,
                  0.136916187, 0.124917383, 0.126835781, 0.147711512,
                  0.145862553, 0.150970525, 0.143387816),
  Variable_29 = c(0.132453826, 0.124436937, 0.137413395, 0.125219427,
                  0.1081254, 0.101123596, 0.099465955, 0.122052705,
                  0.128330996, 0.117181884, 0.11589896),
  Variable_30 = c(0.127176781, 0.107545045, 0.122979215, 0.104154476,
                  0.079974408, 0.08988764, 0.085447263, 0.102635229,
                  0.105890603, 0.102803738, 0.101040119),
  Variable.binary._31 = rep(0L, 11L),
  Variable.binary._32 = rep(0L, 11L),
  Variable.binary._33 = rep(1L, 11L),
  Target = c(56.79, 21.3, 61.69, 25.32, 26.61, 30.74, 12.27, 28.28, 14.06,
             60.78, 47.08),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

I have many variables and I need to see the outliers for all of them. For example, if I do this:

# Inspect outliers for a single column
boxplot(mydat[["Variable_16"]])

I just see one plot, for Variable_16. So my question is: how can I create boxplots with outliers for all variables at once, using the lapply function, instead of creating one for each variable separately? And then, how can I write the values marked as outliers to a CSV file? My desired output:

  row.number.16 Variable_16 row.number.17 Variable_17 row.number.18 Variable_18 row.number.19
1            30        1474            45        0.78           456       20175           567
2            45        1392            54        0.78            88       16780           234
3            78         951           678        0.55            42       16459           432
4           110         830           123        0.49          1234       16173           123
  Variable_19
1       10.65
2        9.45
3        9.50
4        9.46

where row.number.16 is the number of the row in which Variable_16 contains an outlier, and Variable_16 holds that outlier value

Upvotes: 0

Views: 539

Answers (1)

Parfait
Parfait

Reputation: 107687

Simply reshape your data from wide to long (the preferred format for most analytical methods), then plot the data accordingly. To identify which values are outliers within each variable, calculate an indicator as a new column with ave + ifelse, using the same default algorithm as boxplot and boxplot.stats. Then export the long data to CSV.

# RESHAPE DATA WIDE TO LONG
# Every column of mydat is stacked into a single "value" column, with
# "variable" recording the name of the source column.
# The new row names are sized from the data (rows x columns) instead of a
# fixed 1:1E5, which runs out once the stacked result exceeds 100,000 rows.
rdf <- reshape(mydat, varying = list(names(mydat)), times = names(mydat),
               v.names = "value", timevar = "variable",
               new.row.names = seq_len(nrow(mydat) * ncol(mydat)),
               direction = "long")

# FLAG OUTLIER WITHIN EACH VARIABLE
# Uses the same rule as boxplot() / boxplot.stats() with the default coef of
# 1.5: a value is an outlier when it lies more than 1.5 times the hinge spread
# below the lower hinge or above the upper hinge. The hinges come from
# fivenum(), exactly as in boxplot.stats().
# The flag is 1 for outliers and 0 otherwise; missing values stay NA.
rdf$outlier <- with(rdf, ave(value, variable, FUN = function(v) {
  hinges <- fivenum(v, na.rm = TRUE)[c(2L, 4L)]
  reach <- 1.5 * diff(hinges)
  as.numeric(v < hinges[1L] - reach | v > hinges[2L] + reach)
}))

# Preview the first ten rows of the long data
head(rdf, n = 10L)
#       variable value id outlier
# 1  Variable_16  1474  1       0
# 2  Variable_16  1392  2       0
# 3  Variable_16   951  3       0
# 4  Variable_16   830  4       0
# 5  Variable_16   624  5       0
# 6  Variable_16  2429  6       1
# 7  Variable_16  2270  7       1
# 8  Variable_16   925  8       0
# 9  Variable_16   704  9       0
# 10 Variable_16   972 10       0

# One box per variable, drawn from the long data via the formula interface
boxplot(formula = value ~ variable, data = rdf)

# Save the long data, including the outlier flag column, as a CSV file
write.csv(x = rdf, file = "/path/to/BoxplotOutliers.csv")

To adjust boxplot for readability

# Enlarge the bottom margin so the perpendicular axis labels (las = 3) fit.
# par() returns the previous settings; they are restored after plotting so the
# custom margins do not leak into later plots.
old_par <- par(mar = c(10, 5, 4, 1))
boxplot(value ~ variable, rdf, las = 3,
        xlab = "", main = "Outlier BoxPlots by Variable")
par(old_par)

Full Data Box Plot Output

And to remove any atypical variable, use the variable field of the reshaped data for subsetting:

# Same plot with Variable_18 left out. The bottom margin is enlarged for the
# perpendicular axis labels (las = 3), and the previous par() settings are
# restored afterwards so the custom margins do not leak into later plots.
old_par <- par(mar = c(10, 5, 4, 1))
boxplot(value ~ variable, subset(rdf, variable != "Variable_18"), las = 3,
        xlab = "", main = "Outlier BoxPlots by Variable")
par(old_par)

Subsetted Data Box Plot Output

Upvotes: 2

Related Questions