-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPCA.py
97 lines (82 loc) · 3.71 KB
/
PCA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
The implementation of PCA model for anomaly detection.
Authors:
LogPAI Team
Reference:
[1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan.
Large-Scale System Problems Detection by Mining Console Logs. ACM
Symposium on Operating Systems Principles (SOSP), 2009.
"""
import numpy as np
class PCA(object):
def __init__(self, n_components=0.95, threshold=None, c_alpha=4.4172):
""" The PCA model for anomaly detection
Attributes
----------
proj_C: The projection matrix for projecting feature vector to abnormal space
n_components: float/int, number of principal compnents or the variance ratio they cover
threshold: float, the anomaly detection threshold. When setting to None, the threshold
is automatically caculated using Q-statistics
c_alpha: float, the c_alpha parameter for caculating anomaly detection threshold using
Q-statistics. The following is lookup table for c_alpha:
c_alpha = 1.7507; # alpha = 0.08
c_alpha = 1.9600; # alpha = 0.05
c_alpha = 2.5758; # alpha = 0.01
c_alpha = 2.807; # alpha = 0.005
c_alpha = 2.9677; # alpha = 0.003
c_alpha = 3.2905; # alpha = 0.001
c_alpha = 3.4808; # alpha = 0.0005
c_alpha = 3.8906; # alpha = 0.0001
c_alpha = 4.4172; # alpha = 0.00001
"""
self.proj_C = None
self.components = None
self.n_components = n_components
self.threshold = threshold
self.c_alpha = c_alpha
def fit(self, X):
"""
Auguments
---------
X: ndarray, the event count matrix of shape num_instances-by-num_events
"""
print('====== Model summary ======')
num_instances, num_events = X.shape
X_cov = np.dot(X.T, X) / float(num_instances)
U, sigma, V = np.linalg.svd(X_cov)
n_components = self.n_components
if n_components < 1:
total_variance = np.sum(sigma)
variance = 0
for i in range(num_events):
variance += sigma[i]
if variance / total_variance >= n_components:
break
n_components = i + 1
P = U[:, :n_components]
I = np.identity(num_events, int)
self.components = P
self.proj_C = I - np.dot(P, P.T)
print('n_components: {}'.format(n_components))
print('Project matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))
if not self.threshold:
# Calculate threshold using Q-statistic. Information can be found at:
# http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
phi = np.zeros(3)
for i in range(3):
for j in range(n_components, num_events):
phi[i] += np.power(sigma[j], i + 1)
h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
+ 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]),
1.0 / h0)
print('SPE threshold: {}\n'.format(self.threshold))
def predict(self, X):
assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
y_pred = np.zeros(X.shape[0])
for i in range(X.shape[0]):
y_a = np.dot(self.proj_C, X[i, :])
SPE = np.dot(y_a, y_a)
if SPE > self.threshold:
y_pred[i] = 1
return y_pred