Reputation: 141
# Sample data: 11 observations of 18 predictors plus the numeric Target.
# Built as a named list carrying the data.frame class and compact row names.
mydat <- structure(
  list(
    Variable_16 = c(1474L, 1392L, 951L, 830L, 624L, 2429L, 2270L, 925L,
                    704L, 972L, 1081L),
    Variable_17 = c(0.78, 0.78, 0.55, 0.49, 0.4, 1.61, 1.52, 0.64, 0.49,
                    0.7, 0.8),
    Variable_18 = c(20175L, 16780L, 16459L, 16173L, 13291L, 14681L, 14574L,
                    14949L, 12904L, 12684L, 12544L),
    Variable_19 = c(10.65, 9.45, 9.5, 9.46, 8.5, 9.7, 9.73, 10.37, 9.05,
                    9.12, 9.32),
    Variable_20 = c(111.31, 110, 88, 86, 86, 89, 100, 65, 98, 78, 75),
    Variable_21 = c(597L, 554L, 508L, 588L, 441L, 422L, 423L, 475L, 420L,
                    388L, 377L),
    Variable_22 = c(337L, 294L, 359L, 310L, 253L, 255L, 243L, 272L, 274L,
                    266L, 246L),
    Variable_23 = c(286L, 250L, 278L, 232L, 214L, 189L, 190L, 213L, 208L,
                    210L, 193L),
    Variable_24 = c(251L, 221L, 238L, 214L, 169L, 153L, 149L, 176L, 183L,
                    163L, 156L),
    Variable_25 = c(241L, 191L, 213L, 178L, 125L, 136L, 128L, 148L, 151L,
                    143L, 136L),
    Variable_26 = c(0.315039578, 0.311936937, 0.29330254, 0.344060854,
                    0.282149712, 0.278916061, 0.282376502, 0.329403606,
                    0.294530154, 0.278936017, 0.280089153),
    Variable_27 = c(0.177836412, 0.165540541, 0.207274827, 0.181392627,
                    0.161868202, 0.168539326, 0.162216288, 0.188626907,
                    0.192145863, 0.191229331, 0.182763744),
    Variable_28 = c(0.150923483, 0.140765766, 0.160508083, 0.135751902,
                    0.136916187, 0.124917383, 0.126835781, 0.147711512,
                    0.145862553, 0.150970525, 0.143387816),
    Variable_29 = c(0.132453826, 0.124436937, 0.137413395, 0.125219427,
                    0.1081254, 0.101123596, 0.099465955, 0.122052705,
                    0.128330996, 0.117181884, 0.11589896),
    Variable_30 = c(0.127176781, 0.107545045, 0.122979215, 0.104154476,
                    0.079974408, 0.08988764, 0.085447263, 0.102635229,
                    0.105890603, 0.102803738, 0.101040119),
    # The three binary indicators are constant across all 11 rows.
    Variable.binary._31 = rep(0L, 11L),
    Variable.binary._32 = rep(0L, 11L),
    Variable.binary._33 = rep(1L, 11L),
    Target = c(56.79, 21.3, 61.69, 25.32, 26.61, 30.74, 12.27, 28.28,
               14.06, 60.78, 47.08)
  ),
  class = "data.frame",
  row.names = c(NA, -11L)
)
I have many variables and I need to see the outliers for all of them. For example, if I do this:
# Check outliers: draws a boxplot for one column only
boxplot(mydat[["Variable_16"]])
I just see one plot, for Variable_16. So my question is: how can I create boxplots with outliers for all variables at once, using the lapply function, instead of creating one for each variable separately? And then, how can I write the values that are marked as outliers to a CSV file? My desired output:
row.number.16 Variable_16 row.number.17 Variable_17 row.number.18 Variable_18 row.number.19
1 30 1474 45 0.78 456 20175 567
2 45 1392 54 0.78 88 16780 234
3 78 951 678 0.55 42 16459 432
4 110 830 123 0.49 1234 16173 123
Variable_19
1 10.65
2 9.45
3 9.50
4 9.46
where row.number.16 is the number of the row in which Variable_16 contains an outlier value, and Variable_16 holds that outlier value
Upvotes: 0
Views: 539
Reputation: 107687
Simply reshape your data from wide to long (the preferred format for most analytical methods), then plot the data accordingly. To identify which values are outliers within each variable, calculate an indicator as a new column with `ave` + `ifelse`, using the same default algorithm as `boxplot` and `boxplot.stats`. Then export the long data to CSV.
# RESHAPE DATA WIDE TO LONG
# Every column of mydat is stacked into one "value" column, and "variable"
# records the source column name. The result has nrow * ncol rows, so the
# sequential row names are sized from the data rather than from a fixed cap
# (a fixed 1:1E5 fails once the long table exceeds 100,000 rows).
rdf <- reshape(
  mydat,
  varying = list(names(mydat)),
  times = names(mydat),
  v.names = "value",
  timevar = "variable",
  new.row.names = seq_len(nrow(mydat) * ncol(mydat)),
  direction = "long"
)
# FLAG OUTLIER WITHIN EACH VARIABLE
# boxplot.stats() applies the same rule boxplot() draws with: a value is an
# outlier when it lies more than 1.5 * IQR (spread of the hinges) beyond the
# nearer hinge. Its $out element holds exactly those values.
# ave() runs the check separately per variable and returns a vector aligned
# with the rows of rdf; the flag is 1 for an outlier and 0 otherwise.
rdf$outlier <- with(rdf, ave(value, variable, FUN = function(v) {
  ifelse(v %in% boxplot.stats(v)$out, 1, 0)
}))

head(rdf, 10)
#       variable value id outlier
# 1  Variable_16  1474  1       0
# 2  Variable_16  1392  2       0
# 3  Variable_16   951  3       0
# 4  Variable_16   830  4       0
# 5  Variable_16   624  5       0
# 6  Variable_16  2429  6       1
# 7  Variable_16  2270  7       1
# 8  Variable_16   925  8       0
# 9  Variable_16   704  9       0
# 10 Variable_16   972 10       0
# RUN BOXPLOT WITH FORMULA STYLE
# One box per level of `variable`, all drawn on a shared axis.
boxplot(value ~ variable, data = rdf)
# EXPORT OUTLIER FLAGGED, LONG DATA TO CSV
# Writes every row of rdf, including the 0/1 outlier column.
write.csv(x = rdf, file = "/path/to/BoxplotOutliers.csv")
To adjust the boxplot for readability:
# Enlarge the bottom margin to leave room for the long variable names,
# which las = 3 prints vertically.
par(mar = c(10, 5, 4, 1))
boxplot(
  value ~ variable,
  data = rdf,
  las = 3,
  xlab = "",
  main = "Outlier BoxPlots by Variable"
)
And to remove any atypical variable, use the `variable` field of the reshaped data for subsetting:
# Same plot as before, but with the rows of Variable_18 dropped first so
# that variable does not appear in the plot.
par(mar = c(10, 5, 4, 1))
boxplot(
  value ~ variable,
  data = rdf[rdf$variable != "Variable_18", ],
  las = 3,
  xlab = "",
  main = "Outlier BoxPlots by Variable"
)
Upvotes: 2