-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstats_outliers.py
More file actions
86 lines (70 loc) · 3.07 KB
/
stats_outliers.py
File metadata and controls
86 lines (70 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Implementations of some basic statistical methods to identify outliers in univariate data
"""
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def stat_info(data: list[int]):
    """
    Print summary statistics of ``data`` and draw its histogram.

    The mean, standard deviation (``np.std`` with its default settings),
    mode and median are computed and written to stdout together with the
    raw data. A 100-bin, density-normalised, green histogram is then drawn
    on matplotlib's current figure.

    The figure is only drawn, not displayed: this function never calls
    ``plt.show()``, so the caller has to do that to see the plot.

    :param data: list of integers
    :return: None
    """
    average = np.mean(data)
    spread = np.std(data)
    mode_result = stats.mode(data)
    middle = np.median(data)

    print("Data:", data)
    # Label/value pairs are passed as separate arguments so that print()
    # joins them with single spaces.
    summary = (
        "Basic data's statistics. Mean:", average,
        ". Standard deviation:", spread,
        ". Mode:", mode_result[0],
        ". Median:", middle,
    )
    print(*summary)

    # Histogram of the sample distribution.
    plt.hist(data, bins=100, density=True, color='g')
    plt.title("Sample Distribution of Data")
def z_score(data: list[int], threshold: float = 2) -> list[int]:
    """
    Identify outliers in ``data`` using the z-score.

    The z-score measures how far a point lies from the mean, in units of
    the standard deviation:

        z_score = (X - mean) / std_dev

    For a normal (Gaussian) distribution roughly 2.3% of the points fall in
    each tail beyond |z| > 2 (about 4.6% in total), and only about 0.13%
    per tail lie beyond |z| > 3.

    The population standard deviation (``ddof=0``) is used.

    :param data: list of integers
    :param threshold: a point is an outlier when abs(z_score) > threshold
                      (default 2)
    :return: list of outliers, in their original order; an empty list when
             ``data`` is empty or all of its values are identical
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    std_dev = values.std()
    # Constant data has zero spread: no point can be an outlier, and
    # dividing by zero would only produce NaNs and runtime warnings.
    if std_dev == 0:
        return []

    z_scores = (values - values.mean()) / std_dev
    # Return the caller's original elements rather than the float copies.
    return [x for x, z in zip(data, z_scores) if abs(z) > threshold]
def modified_z_score(data: list[int], threshold: float = 3.5) -> list[int]:
    """
    Identify outliers in ``data`` using the modified z-score.

    When the data is not Gaussian, the modified z-score is a more robust
    measure than the plain z-score: it replaces the mean and the standard
    deviation by the median and the Median Absolute Deviation (MAD), where
    MAD = median(|X - median|):

        modified_z_score = (X - median) / (1.486 * MAD)

    If MAD is 0 (more than half of the values equal the median), the Mean
    Absolute Deviation (MeanAD = mean(|X - median|)) is used instead:

        modified_z_score = (X - median) / (1.253314 * MeanAD)

    A point is reported as an outlier when its modified z-score is less
    than -threshold or greater than threshold.

    :param data: list of integers
    :param threshold: cut-off on abs(modified_z_score) (default 3.5)
    :return: list of outliers, in their original order; an empty list when
             ``data`` is empty or all of its values are identical
    """
    # Scale factors from the formulas above. NOTE(review): some references
    # use 1.4826 (= 1 / 0.6745) for the MAD factor; 1.486 is kept to match
    # the formula this module documents.
    mad_factor = 1.486
    mean_ad_factor = 1.253314

    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    median = np.median(values)
    abs_deviations = np.abs(values - median)

    # MAD is a single number (the median of the absolute deviations), not
    # the per-element deviation itself.
    mad = np.median(abs_deviations)
    if mad != 0:
        scale = mad_factor * mad
    else:
        scale = mean_ad_factor * abs_deviations.mean()

    # Every value equals the median: nothing deviates, so no outliers.
    if scale == 0:
        return []

    scores = (values - median) / scale
    # Return the caller's original elements rather than the float copies.
    return [x for x, s in zip(data, scores) if abs(s) > threshold]
def iqr_outliers(data: list[int]) -> list[int]:
    """
    Identify outliers in ``data`` using the Interquartile Range (IQR).

    The IQR is the range between the first and the third quartile
    (Q3 - Q1). A value is an outlier when it is:
        1. less than Q1 - 1.5 * IQR, or
        2. greater than Q3 + 1.5 * IQR

    The quartiles are the 25th and 75th percentiles of the data, so the
    result does not depend on the order of the input.

    :param data: list of numbers (does not need to be sorted)
    :return: list of outliers, in their original order; an empty list when
             ``data`` is empty
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    # Take Q1 and Q3 from the same percentile computation so the quartiles
    # and the IQR are consistent with each other.
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    return [x for x in data if x < lower_fence or x > upper_fence]
if __name__ == '__main__':
    # Demo: a cluster of values around 10-35 with a few extreme values on
    # both sides, run through each outlier-detection method.
    data = [-200, -150, 10, 12, 18, 14, 20, 15, 13, 11, 9, 8, 13, 14, 15, 16, 17, 22, 25, 27, 21, 20, 9, 15, 14, 18, 19, 20, 21, 22, 25, 28, 35, 21, 21, 30, 29, 150, 200, 300]
    stat_info(data)

    z_score_outliers = z_score(data)
    print("Outliers using Z-score method:", z_score_outliers)

    modified_z_score_outliers = modified_z_score(data)
    # Print the modified z-score result, not the plain z-score result.
    print("Outliers using modified Z-score method:", modified_z_score_outliers)

    # Use a distinct name so the iqr_outliers function is not shadowed.
    iqr_result = iqr_outliers(data)
    print("Outliers using IQR method:", iqr_result)