Reputation: 45
For reference, here is the whole code.
Sub CalculateOutliers()
Dim n As Long
Dim mean As Double
Dim SD As Double
Dim k As Long
Dim DataSet As Variant
Dim LowConf As Double
Dim HighConf As Double
'--------------------------------------------------------
DataSet = Selection.Value
'Copies highlighted data into DataSet variable
'Cell A1 is (1,1) Because it starts at 0 which is out of range
'--------------------------------------------------------
'--------------------------------------------------------
n = Selection.CountLarge
'Counts number of entries
'--------------------------------------------------------
'--------------------------------------------------------
'DEFINES 95(LowConf) AND 99(HighConf) PERCENT CONFIDENCES BASED ON
'SAMPLE SIZE
If n <= 5 Then
LowConf = 1.72
HighConf = 1.76
End If
If n = 6 Then
LowConf = 1.89
HighConf = 1.97
End If
If n = 7 Then
LowConf = 2.02
HighConf = 2.14
End If
If n = 8 Then
LowConf = 2.13
HighConf = 2.28
End If
If n = 9 Then
LowConf = 2.21
HighConf = 2.39
End If
If n = 10 Then
LowConf = 2.29
HighConf = 2.48
End If
If n = 11 Then
LowConf = 2.36
HighConf = 2.56
End If
If n = 12 Then
LowConf = 2.41
HighConf = 2.64
End If
If n = 13 Then
LowConf = 2.46
HighConf = 2.7
End If
If n = 14 Then
LowConf = 2.51
HighConf = 2.75
End If
If n = 15 Then
LowConf = 2.55
HighConf = 2.81
End If
If n = 16 Then
LowConf = 2.59
HighConf = 2.85
End If
If n = 17 Then
LowConf = 2.62
HighConf = 2.9
End If
If n = 18 Then
LowConf = 2.65
HighConf = 2.93
End If
If n = 19 Then
LowConf = 2.68
HighConf = 2.97
End If
If n = 20 Then
LowConf = 2.71
HighConf = 3
End If
If n = 21 Then
LowConf = 2.73
HighConf = 3.03
End If
If n = 22 Then
LowConf = 2.76
HighConf = 3.06
End If
If n = 23 Then
LowConf = 2.78
HighConf = 3.08
End If
If n = 24 Then
LowConf = 2.8
HighConf = 3.11
End If
If n = 25 Then
LowConf = 2.82
HighConf = 3.14
End If
If n = 26 Then
LowConf = 2.84
HighConf = 3.16
End If
If n = 27 Then
LowConf = 2.86
HighConf = 3.18
End If
If n = 28 Then
LowConf = 2.88
HighConf = 3.2
End If
If n = 29 Then
LowConf = 2.89
HighConf = 3.22
End If
If n = 30 Then
LowConf = 2.91
HighConf = 3.24
End If
If n <= 35 And n > 30 Then
LowConf = 2.98
HighConf = 3.32
End If
If n <= 40 And n > 35 Then
LowConf = 3.04
HighConf = 3.38
End If
If n <= 45 And n > 40 Then
LowConf = 3.09
HighConf = 3.44
End If
If n <= 50 And n > 45 Then
LowConf = 3.13
HighConf = 3.48
End If
If n <= 60 And n > 50 Then
LowConf = 3.2
HighConf = 3.56
End If
If n <= 70 And n > 60 Then
LowConf = 3.26
HighConf = 3.62
End If
If n <= 80 And n > 70 Then
LowConf = 3.31
HighConf = 3.67
End If
If n <= 90 And n > 80 Then
LowConf = 3.35
HighConf = 3.72
End If
If n <= 100 And n > 90 Then
LowConf = 3.38
HighConf = 3.75
End If
If n <= 150 And n > 100 Then
LowConf = 3.52
HighConf = 3.89
End If
If n <= 200 And n > 150 Then
LowConf = 3.61
HighConf = 3.98
End If
If n <= 300 And n > 200 Then
LowConf = 3.72
HighConf = 4.09
End If
If n <= 400 And n > 300 Then
LowConf = 3.8
HighConf = 4.17
End If
If n <= 500 And n > 400 Then
LowConf = 3.86
HighConf = 4.32
End If
If n > 500 Then
MsgBox "Sample size cannot exceed 500."
End If
'--------------------------------------------------------
'--------------------------------------------------------
If n < 50 Then
k = Int(n / 10)
Else
k = 5
End If
'determines k = number of possible outliers
'--------------------------------------------------------
Dim i As Long
For i = 1 To k
'--------------------------------------------------------
mean = Application.WorksheetFunction.Average(DataSet)
'Calculates mean of Data Set
'--------------------------------------------------------
'--------------------------------------------------------
SD = Application.WorksheetFunction.StDev(DataSet)
'Calculates Standard Deviation of Data Set
'--------------------------------------------------------
'--------------------------------------------------------
Dim Suspect As Double
If (Abs(Application.WorksheetFunction.Max(DataSet) - mean)) > (Abs(Application.WorksheetFunction.Min(DataSet) - mean)) Then
Suspect = Application.WorksheetFunction.Max(DataSet)
End If
If (Abs(Application.WorksheetFunction.Max(DataSet) - mean)) < (Abs(Application.WorksheetFunction.Min(DataSet) - mean)) Then
Suspect = Application.WorksheetFunction.Min(DataSet)
End If
'Defines what the most outlying value is
'--------------------------------------------------------
'--------------------------------------------------------
Dim Retest As Boolean
If (Abs(Suspect - mean) / SD) > LowConf Then
MsgBox "95% outlier: " & Suspect
Retest = True
End If
If (Abs(Suspect - mean) / SD) > HighConf Then
MsgBox "99% outlier: " & Suspect
Retest = True
End If
If Retest = True Then
'--------------------------------------------------------
Next i
End Sub
Talking about this chunk of code, I need to delete Suspect from DataSet then shrink DataSet by 1 but I am not sure how to go about this. Is there a function that will delete the Max or Min value and is there a way to resize DataSet given that it is a variant?
Dim Retest As Boolean
If (Abs(Suspect - mean) / SD) > LowConf Then
MsgBox "95% outlier: " & Suspect
Retest = True
End If
If (Abs(Suspect - mean) / SD) > HighConf Then
MsgBox "99% outlier: " & Suspect
Retest = True
End If
If Retest = True Then
Upvotes: 0
Views: 676
Reputation: 10715
Here is how I would implement the solution (similar to @Tim's comment)
Sub calculateOutliers()
Dim retest As Boolean, dataSet As Variant, n As Long, found As Range
Dim lowConf As Double, highConf As Double, suspect As Double, tmp As Double
Dim mean As Double, stdDev As Double, dataMin As Double, dataMax As Double
retest = True
While retest
dataSet = Selection.Value
n = Selection.CountLarge
If Not IsEmpty(dataSet) Then
updateConfidences dataSet, n, lowConf, highConf
retest = False
With Application.WorksheetFunction
mean = .Average(dataSet)
stdDev = .StDev(dataSet)
dataMin = .Min(dataSet)
dataMax = .Max(dataSet)
End With
suspect = IIf(Abs(dataMax - mean) > Abs(dataMin - mean), dataMax, dataMin)
tmp = Abs(suspect - mean) / stdDev
Set found = Selection.Find(What:=suspect, LookAt:=xlWhole)
If tmp > lowConf Then
With Selection
found.Interior.Color = RGB(255, 255, 0)
.Value = removeItemFromArray(dataSet, suspect)
End With
retest = True
End If
If tmp > highConf Then
found.Interior.Color = RGB(255, 0, 0)
retest = True
End If
End If
Wend
End Sub
'updates lowConf and highConf (byref)
Private Sub updateConfidences(ByRef dataSet As Variant, ByVal n As Long, _
ByRef lowConf As Double, ByRef highConf As Double)
'DEFINES 95(LowConf) AND 99(HighConf) PERCENT CONFIDENCES BASED ON SAMPLE SIZE
Select Case True
Case n <= 5: lowConf = 1.72: highConf = 1.76
Case n = 6: lowConf = 1.89: highConf = 1.97
Case n = 7: lowConf = 2.02: highConf = 2.14
Case n = 8: lowConf = 2.13: highConf = 2.28
Case n = 9: lowConf = 2.21: highConf = 2.39
Case n = 10: lowConf = 2.29: highConf = 2.48
Case n = 11: lowConf = 2.36: highConf = 2.56
Case n = 12: lowConf = 2.41: highConf = 2.64
Case n = 13: lowConf = 2.46: highConf = 2.7
Case n = 14: lowConf = 2.51: highConf = 2.75
Case n = 15: lowConf = 2.55: highConf = 2.81
Case n = 16: lowConf = 2.59: highConf = 2.85
Case n = 17: lowConf = 2.62: highConf = 2.9
Case n = 18: lowConf = 2.65: highConf = 2.93
Case n = 19: lowConf = 2.68: highConf = 2.97
Case n = 20: lowConf = 2.71: highConf = 3
Case n = 21: lowConf = 2.73: highConf = 3.03
Case n = 22: lowConf = 2.76: highConf = 3.06
Case n = 23: lowConf = 2.78: highConf = 3.08
Case n = 24: lowConf = 2.8: highConf = 3.11
Case n = 25: lowConf = 2.82: highConf = 3.14
Case n = 26: lowConf = 2.84: highConf = 3.16
Case n = 27: lowConf = 2.86: highConf = 3.18
Case n = 28: lowConf = 2.88: highConf = 3.2
Case n = 29: lowConf = 2.89: highConf = 3.22
Case n = 30: lowConf = 2.91: highConf = 3.24
Case n <= 35: lowConf = 2.98: highConf = 3.32
Case n <= 40: lowConf = 3.04: highConf = 3.38
Case n <= 45: lowConf = 3.09: highConf = 3.44
Case n <= 50: lowConf = 3.13: highConf = 3.48
Case n <= 60: lowConf = 3.2: highConf = 3.56
Case n <= 70: lowConf = 3.26: highConf = 3.62
Case n <= 80: lowConf = 3.31: highConf = 3.67
Case n <= 90: lowConf = 3.35: highConf = 3.72
Case n <= 100: lowConf = 3.38: highConf = 3.75
Case n <= 150: lowConf = 3.52: highConf = 3.89
Case n <= 200: lowConf = 3.61: highConf = 3.98
Case n <= 300: lowConf = 3.72: highConf = 4.09
Case n <= 400: lowConf = 3.8: highConf = 4.17
Case n <= 500: lowConf = 3.86: highConf = 4.32
Case n > 500: MsgBox "Sample size cannot exceed 500."
End Select
End Sub
Private Function removeItemFromArray(ByRef initialArray As Variant, _
ByVal suspect As Double) As Variant
Const dex As String = "#,###.00000"
Dim i As Long, j As Long, dim1 As Long, dim2 As Long, arrayCopy As Variant
dim1 = UBound(initialArray, 1)
dim2 = UBound(initialArray, 2)
ReDim arrayCopy(1 To dim1, 1 To dim2)
For i = 1 To dim1
For j = 1 To dim2
If Format(initialArray(i, j), dex) <> Format(suspect, dex) Then
arrayCopy(i, j) = initialArray(i, j)
End If
Next j
Next i
removeItemFromArray = arrayCopy
End Function
Upvotes: 1
Reputation: 3145
One approach is to sort the data, either in excel before reading it into the vba routine, or in vba see for example Chip Pearson. With sorted data, you can remove outliers by changing array bounds, e.g. DataSet(3) to DataSet(n-5)
has removed 2 outliers on the low side and 5 on the high side.
Hope that helps
Upvotes: 1