Skip to content

Commit

Permalink
Merge pull request #135 from scikit-learn-contrib/enbpi
Browse files Browse the repository at this point in the history
[ADD] EnbPI
  • Loading branch information
vtaquet authored Jun 13, 2022
2 parents f8fc330 + 3f779ec commit e49b6c4
Show file tree
Hide file tree
Showing 17 changed files with 2,245 additions and 66 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ __pycache__/
*.py[cod]
*$py.class
.DS_Store
.mypy*

# C extensions
*.so
Expand Down
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ History
0.3.3 (2022-XX-XX)
------------------
* Relax and fix typing
* Add EnbPI method for Time Series Regression

0.3.2 (2022-03-11)
------------------
Expand Down
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,11 @@ MAPIE methods belong to the field of conformal inference.
"Uncertainty Sets for Image Classifiers using Conformal Prediction."
International Conference on Learning Representations 2021.

[6] Chen Xu, Yao Xie.
"Conformal prediction for dynamic time-series"
https://arxiv.org/abs/2010.09107


📝 License
==========

Expand Down
Binary file modified doc/images/quickstart_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
intervals, through the `sklearn.model_selection.KFold()` object.
Residuals are therefore estimated using models trained on data with higher
indices than the validation data, which is inappropriate for time-series data.
Howerver, using a `sklearn.model_selection.TimeSeriesSplit` cross validation
However, using a `sklearn.model_selection.TimeSeriesSplit` cross validation
object for estimating the residuals breaks the theoretical guarantees of the
Jackknife+ and CV+ methods.
"""
Expand Down
230 changes: 230 additions & 0 deletions examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
"""
==================================================================
Estimating prediction intervals of time series forecast with EnbPI
==================================================================
This example uses
:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate
prediction intervals associated with time series forecast. It follows [6].
We use here the Victoria electricity demand dataset used in the book
"Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos.
The electricity demand features daily and weekly seasonalities and is impacted
by the temperature, considered here as a exogeneous variable.
A Random Forest model is already fitted on data. The hyper-parameters are
optimized with a :class:`sklearn.model_selection.RandomizedSearchCV` using a
sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation,
in which the training set is prior to the validation set.
The best model is then feeded into
:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the
associated prediction intervals. We compare two approaches: with or without
``partial_fit`` called at every step following [6]. It appears that
``partial_fit`` offer a coverage closer to the targeted coverage, and with
narrower PIs.
"""

from typing import cast

from matplotlib import pylab as plt
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

from mapie.metrics import (
regression_coverage_score,
regression_mean_width_score,
)
from mapie._typing import NDArray
from mapie.subsample import BlockBootstrap
from mapie.time_series_regression import MapieTimeSeriesRegressor

# Load input data and feature engineering
url_file = (
"https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/"
+ "master/examples/data/demand_temperature.csv"
)
demand_df = pd.read_csv(url_file, parse_dates=True, index_col=0)

demand_df["Date"] = pd.to_datetime(demand_df.index)
demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64")
demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64")
demand_df["Hour"] = demand_df.index.hour
n_lags = 5
for hour in range(1, n_lags):
demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour)

# Train/validation/test split
num_test_steps = 24 * 7
demand_train = demand_df.iloc[:-num_test_steps, :].copy()
demand_test = demand_df.iloc[-num_test_steps:, :].copy()
features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [
f"Lag_{hour}" for hour in range(1, n_lags)
]

X_train = demand_train.loc[
~np.any(demand_train[features].isnull(), axis=1), features
]
y_train = demand_train.loc[X_train.index, "Demand"]
X_test = demand_test.loc[:, features]
y_test = demand_test["Demand"]

perform_hyperparameters_search = False
if perform_hyperparameters_search:
# CV parameter search
n_iter = 100
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
random_state = 59
rf_model = RandomForestRegressor(random_state=random_state)
rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 100)}
cv_obj = RandomizedSearchCV(
rf_model,
param_distributions=rf_params,
n_iter=n_iter,
cv=tscv,
scoring="neg_root_mean_squared_error",
random_state=random_state,
verbose=0,
n_jobs=-1,
)
cv_obj.fit(X_train, y_train)
model = cv_obj.best_estimator_
else:
# Model: Random Forest previously optimized with a cross-validation
model = RandomForestRegressor(
max_depth=10, n_estimators=50, random_state=59
)

# Estimate prediction intervals on test set with best estimator
alpha = 0.05
cv_mapietimeseries = BlockBootstrap(
n_resamplings=100, length=48, overlapping=True, random_state=59
)

mapie_enpbi = MapieTimeSeriesRegressor(
model,
method="enbpi",
cv=cv_mapietimeseries,
agg_function="mean",
n_jobs=-1,
)

print("EnbPI, with no partial_fit, width optimization")
mapie_enpbi = mapie_enpbi.fit(X_train, y_train)
y_pred_npfit_enbpi, y_pis_npfit_enbpi = mapie_enpbi.predict(
X_test, alpha=alpha, ensemble=True, optimize_beta=True
)
coverage_npfit_enbpi = regression_coverage_score(
y_test, y_pis_npfit_enbpi[:, 0, 0], y_pis_npfit_enbpi[:, 1, 0]
)

width_npfit_enbpi = regression_mean_width_score(
y_pis_npfit_enbpi[:, 1, 0], y_pis_npfit_enbpi[:, 0, 0]
)

print("EnbPI with partial_fit, width optimization")
mapie_enpbi = mapie_enpbi.fit(X_train, y_train)
y_pred_pfit_enbpi = np.zeros(y_pred_npfit_enbpi.shape)
y_pis_pfit_enbpi = np.zeros(y_pis_npfit_enbpi.shape)

step_size = 1
(
y_pred_pfit_enbpi[:step_size],
y_pis_pfit_enbpi[:step_size, :, :],
) = mapie_enpbi.predict(
X_test.iloc[:step_size, :], alpha=alpha, ensemble=True, optimize_beta=True
)

for step in range(step_size, len(X_test), step_size):
mapie_enpbi.partial_fit(
X_test.iloc[(step - step_size):step, :],
y_test.iloc[(step - step_size):step],
)
(
y_pred_pfit_enbpi[step:step + step_size],
y_pis_pfit_enbpi[step:step + step_size, :, :],
) = mapie_enpbi.predict(
X_test.iloc[step:(step + step_size), :],
alpha=alpha,
ensemble=True,
optimize_beta=True,
)
coverage_pfit_enbpi = regression_coverage_score(
y_test, y_pis_pfit_enbpi[:, 0, 0], y_pis_pfit_enbpi[:, 1, 0]
)
width_pfit_enbpi = regression_mean_width_score(
y_pis_pfit_enbpi[:, 1, 0], y_pis_pfit_enbpi[:, 0, 0]
)

# Print results
print(
"Coverage / prediction interval width mean for MapieTimeSeriesRegressor: "
"\nEnbPI without any partial_fit:"
f"{coverage_npfit_enbpi :.3f}, {width_npfit_enbpi:.3f}"
)
print(
"Coverage / prediction interval width mean for MapieTimeSeriesRegressor: "
"\nEnbPI with partial_fit:"
f"{coverage_pfit_enbpi:.3f}, {width_pfit_enbpi:.3f}"
)

enbpi_no_pfit = {
"y_pred": y_pred_npfit_enbpi,
"y_pis": y_pis_npfit_enbpi,
"coverage": coverage_npfit_enbpi,
"width": width_npfit_enbpi,
}

enbpi_pfit = {
"y_pred": y_pred_pfit_enbpi,
"y_pis": y_pis_pfit_enbpi,
"coverage": coverage_pfit_enbpi,
"width": width_pfit_enbpi,
}

results = [enbpi_no_pfit, enbpi_pfit]

# Plot estimated prediction intervals on test set
fig, axs = plt.subplots(
nrows=2, ncols=1, figsize=(15, 12), sharex="col"
)

for i, (ax, w, result) in enumerate(
zip(axs, ["EnbPI, without partial_fit", "EnbPI with partial_fit"], results)
):
ax.set_ylabel("Hourly demand (GW)", fontsize=20)
ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1")

ax.plot(
demand_test.index,
result["y_pred"],
lw=2,
c="C2",
label="Predictions",
)

y_pis = cast(NDArray, result["y_pis"])

ax.fill_between(
demand_test.index,
y_pis[:, 0, 0],
y_pis[:, 1, 0],
color="C2",
alpha=0.2,
label="MapieTimeSeriesRegressor PIs",
)

ax.set_title(
w + "\n"
f"Coverage:{result['coverage']:.3f} Width:{result['width']:.3f}",
fontweight="bold",
size=20
)
plt.xticks(size=15, rotation=45)
plt.yticks(size=15)

axs[0].legend(prop={'size': 22})
plt.show()
23 changes: 23 additions & 0 deletions mapie/_compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,31 @@ def np_quantile_version_above_122(
return np.quantile(a, q, method=method, **kwargs) # type: ignore


def np_nanquantile_version_below_122(
a: ArrayLike,
q: ArrayLike,
method: str = "linear",
**kwargs: Any
) -> NDArray:
"""Wrapper of np.quantile function for numpy version < 1.22."""
return np.nanquantile(a, q, interpolation=method, **kwargs) # type: ignore


def np_nanquantile_version_above_122(
a: ArrayLike,
q: ArrayLike,
method: str = "linear",
**kwargs: Any
) -> NDArray:
"""Wrapper of np.quantile function for numpy version >= 1.22."""
return np.nanquantile(a, q, method=method, **kwargs) # type: ignore


numpy_version = parse_version(np.__version__)
if numpy_version < parse_version("1.22"):
np_quantile = np_quantile_version_below_122
np_nanquantile = np_nanquantile_version_below_122

else:
np_quantile = np_quantile_version_above_122
np_nanquantile = np_nanquantile_version_above_122
Loading

0 comments on commit e49b6c4

Please sign in to comment.