-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimpute_missing_values.py
101 lines (93 loc) · 4.82 KB
/
impute_missing_values.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import math
import warnings

import numpy as np
import pandas as pd
import tqdm
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


def impute_missing_values(df_, var_deviation_tolerance=0.97, actual_or_gaussian_residuals='actual',
                          col_floor_ceiling_dict=None, scores=False):
    """Impute missing values while limiting distortion of each variable's distribution.

    Columns with missing values are processed from fewest to most missing.
    For each one, a bagged model is fitted on the rows where the column is
    present, using every other column that currently has no missing values
    (one-hot encoded with ``pd.get_dummies``). Predictions for the missing
    rows are perturbed with residuals from the training rows and written
    into a copy of the frame.

    Parameters:
        df_: dataframe with missing values; it is copied, not modified.
        var_deviation_tolerance: target percent deviation from the original
            variable distributions. It sets the number of bagged estimators:
            ``ceil(fraction_missing / (1 / tolerance - 1))``.
        actual_or_gaussian_residuals: ``'actual'`` resamples the observed
            residuals with replacement; any other value draws from a normal
            distribution with the residuals' mean and standard deviation.
        col_floor_ceiling_dict: optional dict mapping a column name to a
            ``(min, max)`` tuple used to clip imputed values. Use
            ``float('inf')`` / ``float('-inf')`` for one-sided limits.
        scores: when true, also return the per-column model scores.

    Returns:
        ``(df, problems, regressors)`` or, when ``scores`` is true,
        ``(df, problems, column_scores, regressors)`` where
        df: copy of the input with imputed values.
        problems: columns that failed to impute (a warning carries the cause).
        column_scores: ``score`` of each model on the rows it was trained on.
        regressors: fitted bagging models, in the order they were trained.

    Note:
        Object columns are label encoded: the known values are replaced by
        their integer codes in the returned frame, and the imputed values are
        rounded numeric codes (prediction plus residual), not the original
        labels.
    """
    regressors = []  # Fitted model for each column, appended as columns are processed
    df_ = df_.copy()
    columns = df_.columns
    type_dict = df_.dtypes.to_dict()
    na_counts = df_.isna().sum()
    # Fewest-missing first, so columns imputed early can serve as features later.
    missing_columns = list(na_counts[na_counts > 0].sort_values().index)
    column_scores = {}
    problems = []
    for col in tqdm.tqdm(missing_columns):
        try:
            # Positional boolean masks avoid label lookups, so they also work
            # when the frame's index contains duplicate labels.
            na_mask = df_[col].isna().to_numpy()
            have_mask = ~na_mask
            percent_missing = na_mask.sum() / df_.shape[0]
            m = math.ceil(percent_missing / ((1 / var_deviation_tolerance) - 1))
            other_columns = [i for i in columns if i != col]
            # A usable feature has no gaps in either the rows to fit on or the
            # rows to predict, i.e. no gaps at all. Keeping this a list (not a
            # set) gives a deterministic feature order and is required by
            # pandas 2.x, which rejects set indexers.
            feature_cols = list(df_.loc[:, other_columns].dropna(axis=1).columns)
            int_df = pd.get_dummies(df_.loc[:, feature_cols], drop_first=False)
            X_have = int_df.loc[have_mask, :]
            y_have = df_.loc[have_mask, col]
            X_na = int_df.loc[na_mask, :]
            if type_dict[col] == 'object':
                le = LabelEncoder()
                y_have = le.fit_transform(y_have)
                # Single .loc assignment instead of chained indexing, which is
                # a no-op under pandas Copy-on-Write.
                df_.loc[have_mask, col] = y_have
                bag_classifier = _build_bagging_classifier(m)
                bag_classifier.fit(X_have, y_have)
                column_scores[col] = bag_classifier.score(X_have, y_have)
                residual_preds = bag_classifier.predict(X_have)
                residuals = y_have - residual_preds
                preds = bag_classifier.predict(X_na)
                regressors.append(bag_classifier)
            else:
                bag_regressor = BaggingRegressor(n_estimators=m)
                bag_regressor.fit(X_have, y_have)
                column_scores[col] = bag_regressor.score(X_have, y_have)
                residual_preds = bag_regressor.predict(X_have)
                residuals = y_have - residual_preds
                preds = bag_regressor.predict(X_na)
                regressors.append(bag_regressor)
            n_missing = len(X_na)
            if actual_or_gaussian_residuals == 'actual':
                rand_residuals = np.random.choice(residuals, n_missing, replace=True)
            else:
                rand_residuals = np.random.normal(residuals.mean(), residuals.std(), n_missing)
            preds = preds + rand_residuals
            if type_dict[col] == 'object':
                # Residual noise can land between class codes; snap back to integers.
                preds = preds.round()
            if col_floor_ceiling_dict is not None and col in col_floor_ceiling_dict:
                floor, ceiling = col_floor_ceiling_dict[col][0], col_floor_ceiling_dict[col][1]
                preds = np.clip(preds, floor, ceiling)
            df_.loc[na_mask, col] = preds
        except Exception as exc:
            # Best effort: keep going with the remaining columns, but surface
            # the cause instead of discarding it. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            warnings.warn(f'Could not impute column {col!r}: {exc!r}', RuntimeWarning, stacklevel=2)
            problems.append(col)
    if not scores:
        return df_, problems, regressors
    return df_, problems, column_scores, regressors


def _build_bagging_classifier(n_estimators):
    """Return a bagged random-forest classifier on old and new scikit-learn.

    scikit-learn 1.2 renamed ``base_estimator`` to ``estimator`` and 1.4
    removed the old keyword, so the new name is tried first and the old one
    is used only if the constructor rejects it.
    """
    forest = RandomForestClassifier()
    try:
        return BaggingClassifier(estimator=forest, n_estimators=n_estimators)
    except TypeError:
        return BaggingClassifier(base_estimator=forest, n_estimators=n_estimators)
def change_type(df_: pd.DataFrame, colname: str, astype: type) -> pd.DataFrame:
    """Cast one column of ``df_`` to ``astype`` in place and report overflow.

    The column is converted and written back into ``df_`` before any check,
    so the caller's frame is modified on both paths.

    Parameters:
        df_: dataframe holding the column; it is mutated.
        colname: name of the column to convert.
        astype: target type handed to ``Series.astype``.

    Returns:
        The same ``df_`` object when the converted column's maximum is not
        infinite. When the maximum equals ``np.inf`` a message is printed
        and ``None`` is returned instead.
    """
    converted = df_[colname].astype(astype)
    df_[colname] = converted
    overflowed = df_[colname].max() == np.inf
    if not overflowed:
        return df_
    # An infinite maximum means a value did not fit the target type.
    print(f'{colname}: Value out of range for type {astype}')
    return None