model_score.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import make_scorer, recall_score, classification_report, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV


def pca_variance_analysis(data: pd.DataFrame, step_size=10):
    """
    Explore how much of the data's variance is explained as the number
    of principal components is reduced.
    The results are shown in a line graph.

    :param data: pd.DataFrame with the data to analyze.
    :param step_size: int, default = 10. Number of components removed at each step.
    """
    variance_explained = []
    for n_features in tqdm.tqdm(range(len(data.columns), 0, -step_size)):
        pca = PCA(n_components=n_features)
        pca.fit(data)
        var = pca.explained_variance_ratio_.sum()
        variance_explained.append({'n_features': n_features,
                                   'variance': var})
    pd.DataFrame(variance_explained).set_index('n_features').plot(figsize=(7, 5))
    plt.title('Variance explained by number of dimensions with PCA', fontsize=18)
    plt.show()
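
# A minimal usage sketch (hypothetical data, not part of the original module).
# PCA assumes numeric, ideally standardized, features; `np` would need
# `import numpy as np`.
#
#   df = pd.DataFrame(np.random.rand(200, 60))
#   pca_variance_analysis(df, step_size=5)
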
def calculateVIF(data_: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the degree of multicollinearity among the variables in a
    dataset (the variance inflation factor, VIF).
    As a reference, significance levels of multicollinearity are:
    - VIF < 5: not significant
    - 5 < VIF < 10: moderately significant
    - 10 < VIF: highly significant

    Args:
    - data_: pandas DataFrame containing the variables.

    Returns: pandas DataFrame containing the multicollinearity level of
    each variable.
    """
    features_ = list(data_.columns)
    num_features = len(features_)
    model = LinearRegression()
    result = pd.DataFrame(index=['VIF'], columns=features_)
    result = result.fillna(0)
    for ite in range(num_features):
        x_features = features_[:]
        y_feature = features_[ite]
        x_features.remove(y_feature)
        # Regress each feature on all the others; VIF = 1 / (1 - R^2)
        model.fit(data_[x_features], data_[y_feature])
        result[y_feature] = 1 / (1 - model.score(data_[x_features], data_[y_feature]))
    return result
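
# A minimal usage sketch (hypothetical `df` of numeric features, not part of
# the original module). Note that a feature perfectly explained by the others
# (R^2 = 1) would make the formula above divide by zero.
#
#   vif_table = calculateVIF(df)
#   print(vif_table.T.sort_values(by='VIF', ascending=False))
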
def quick_score(x_, y_):
    """
    Quickly tests the score for a dataset using a simple
    Logistic Regression model.
    The function shows the results on screen.
    """
    x_train_, x_test_, y_train_, y_test_ = train_test_split(
        x_, y_, test_size=0.2
    )
    # Invoke classifier
    clf = LogisticRegression(n_jobs=-1)
    # Make a scoring callable from recall_score
    recall = make_scorer(recall_score)
    # Cross-validate on the train data
    train_cv = cross_val_score(X=x_train_, y=y_train_, estimator=clf, scoring=recall, cv=3, n_jobs=-1)
    print("TRAIN GROUP")
    print("\nCross-validation recall scores:", train_cv)
    print("Mean recall score:", train_cv.mean())
    # Now predict on the test group
    print("\nTEST GROUP")
    y_pred = clf.fit(x_train_, y_train_).predict(x_test_)
    print("\nRecall:", recall_score(y_test_, y_pred))
    # Classification report
    print('\nClassification report:\n')
    print(classification_report(y_test_, y_pred, zero_division=0))
    # Confusion matrix
    conf_matrix = confusion_matrix(y_test_, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap=plt.cm.copper)
    plt.show()
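
# A minimal usage sketch (hypothetical binary-classification data, not part of
# the original module); recall is most informative when the positive class is
# the rare or costly one.
#
#   quick_score(df.drop(columns='target'), df['target'])
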
def score_optimization(params_, clf_, train_test: list, performance: pd.DataFrame):
    """
    Receives a grid of parameters and a classifier object and optimizes
    a predictive model over them via grid search.
    Returns the best-performing trained model and the updated performance table.

    :param params_: list of dicts with parameters to try for the model.
    :param clf_: model object.
    :param train_test: list containing train and test data in the order
        [x_train, y_train, x_test, y_test].
    :param performance: pd.DataFrame where the performance of the model is stored.
    """
    x_train, y_train, x_test, y_test = train_test
    precision = make_scorer(precision_score)
    # Load GridSearchCV
    search = GridSearchCV(
        estimator=clf_,
        param_grid=params_,
        n_jobs=-1,
        scoring=precision
    )
    # Train search object
    search.fit(x_train, y_train)
    # Heading
    print('\n', '-' * 40, '\n', clf_.__class__.__name__, '\n', '-' * 40)
    # Extract best estimator
    best = search.best_estimator_
    print('Best parameters: \n\n', search.best_params_, '\n')
    # Cross-validate on the train data
    print("TRAIN GROUP")
    train_cv_ = cross_val_score(X=x_train, y=y_train,
                                estimator=best, scoring=precision, cv=3)
    print("\nCross-validation precision scores:", train_cv_)
    print("Mean precision score:", train_cv_.mean())
    # Now predict on the test group
    print("\nTEST GROUP")
    y_pred_ = best.fit(x_train, y_train).predict(x_test)
    print("\nPrecision:", precision_score(y_test, y_pred_))
    # Get classification report
    print(classification_report(y_test, y_pred_))
    # Print confusion matrix
    conf_matrix_ = confusion_matrix(y_test, y_pred_)
    sns.heatmap(conf_matrix_, annot=True, fmt='d', cmap=plt.cm.copper)
    plt.show()
    # Store results: mean CV precision, test precision, and specificity
    # (true-negative rate, TN / (TN + FP)); assumes `performance` has
    # exactly these three columns.
    performance.loc[clf_.__class__.__name__ + '_optimize', :] = [
        train_cv_.mean(),
        precision_score(y_test, y_pred_),
        conf_matrix_[0, 0] / conf_matrix_[0, :].sum(),
        # precision_optim(y_test,y_pred_)
    ]
    # Look at the parameters for the top best scores (skipping the timing columns)
    display(pd.DataFrame(search.cv_results_).iloc[:, 4:].sort_values(by='rank_test_score').head())
    display(performance)
    return best, performance
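
# A minimal end-to-end sketch (all names hypothetical, not part of the
# original module): tune a logistic regression's regularization strength on a
# pre-split dataset and append its scores to a shared performance table.
#
#   x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
#   performance = pd.DataFrame(
#       columns=['train_precision_cv', 'test_precision', 'specificity'])
#   params = [{'C': [0.01, 0.1, 1, 10]}]
#   best_model, performance = score_optimization(
#       params, LogisticRegression(max_iter=1000),
#       [x_train, y_train, x_test, y_test], performance)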