Merge pull request #135 from scikit-learn-contrib/enbpi

[ADD] EnbPI
scikit-learn-contrib · Jun 13, 2022 · e49b6c4 · e49b6c4
2 parents f8fc330 + 3f779ec
commit e49b6c4
Show file tree

Hide file tree

Showing 17 changed files with 2,245 additions and 66 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 .DS_Store
+.mypy*
 
 # C extensions
 *.so

diff --git a/HISTORY.rst b/HISTORY.rst
@@ -5,6 +5,7 @@ History
 0.3.3 (2022-XX-XX)
 ------------------
 * Relax and fix typing
+* Add EnbPI method for Time Series Regression
 
 0.3.2 (2022-03-11)
 ------------------

diff --git a/README.rst b/README.rst
@@ -244,6 +244,11 @@ MAPIE methods belong to the field of conformal inference.
 "Uncertainty Sets for Image Classifiers using Conformal Prediction."
 International Conference on Learning Representations 2021.
 
+[6] Chen Xu, Yao Xie.
+"Conformal prediction for dynamic time-series"
+https://arxiv.org/abs/2010.09107
+
+
 📝 License
 ==========
 

diff --git a/doc/images/quickstart_1.png b/doc/images/quickstart_1.png
diff --git a/examples/regression/1-quickstart/plot_timeseries_example.py b/examples/regression/1-quickstart/plot_timeseries_example.py
@@ -25,7 +25,7 @@
 intervals, through the `sklearn.model_selection.KFold()` object.
 Residuals are therefore estimated using models trained on data with higher
 indices than the validation data, which is inappropriate for time-series data.
-Howerver, using a `sklearn.model_selection.TimeSeriesSplit` cross validation
+However, using a `sklearn.model_selection.TimeSeriesSplit` cross validation
 object for estimating the residuals breaks the theoretical guarantees of the
 Jackknife+ and CV+ methods.
 """

diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py
@@ -0,0 +1,230 @@
+"""
+==================================================================
+Estimating prediction intervals of time series forecast with EnbPI
+==================================================================
+
+This example uses
+:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate
+prediction intervals associated with time series forecast. It follows [6].
+
+We use here the Victoria electricity demand dataset used in the book
+"Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos.
+The electricity demand features daily and weekly seasonalities and is impacted
+by the temperature, considered here as a exogeneous variable.
+
+A Random Forest model is already fitted on data. The hyper-parameters are
+optimized with a :class:`sklearn.model_selection.RandomizedSearchCV` using a
+sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation,
+in which the training set is prior to the validation set.
+The best model is then feeded into
+:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the
+associated prediction intervals. We compare two approaches: with or without
+``partial_fit`` called at every step following [6]. It appears that
+``partial_fit`` offer a coverage closer to the targeted coverage, and with
+narrower PIs.
+"""
+
+from typing import cast
+
+from matplotlib import pylab as plt
+import numpy as np
+import pandas as pd
+from scipy.stats import randint
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
+
+from mapie.metrics import (
+    regression_coverage_score,
+    regression_mean_width_score,
+)
+from mapie._typing import NDArray
+from mapie.subsample import BlockBootstrap
+from mapie.time_series_regression import MapieTimeSeriesRegressor
+
+# Load input data and feature engineering
+url_file = (
+    "https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/"
+    + "master/examples/data/demand_temperature.csv"
+)
+demand_df = pd.read_csv(url_file, parse_dates=True, index_col=0)
+
+demand_df["Date"] = pd.to_datetime(demand_df.index)
+demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64")
+demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64")
+demand_df["Hour"] = demand_df.index.hour
+n_lags = 5
+for hour in range(1, n_lags):
+    demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour)
+
+# Train/validation/test split
+num_test_steps = 24 * 7
+demand_train = demand_df.iloc[:-num_test_steps, :].copy()
+demand_test = demand_df.iloc[-num_test_steps:, :].copy()
+features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [
+    f"Lag_{hour}" for hour in range(1, n_lags)
+]
+
+X_train = demand_train.loc[
+    ~np.any(demand_train[features].isnull(), axis=1), features
+]
+y_train = demand_train.loc[X_train.index, "Demand"]
+X_test = demand_test.loc[:, features]
+y_test = demand_test["Demand"]
+
+perform_hyperparameters_search = False
+if perform_hyperparameters_search:
+    # CV parameter search
+    n_iter = 100
+    n_splits = 5
+    tscv = TimeSeriesSplit(n_splits=n_splits)
+    random_state = 59
+    rf_model = RandomForestRegressor(random_state=random_state)
+    rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 100)}
+    cv_obj = RandomizedSearchCV(
+        rf_model,
+        param_distributions=rf_params,
+        n_iter=n_iter,
+        cv=tscv,
+        scoring="neg_root_mean_squared_error",
+        random_state=random_state,
+        verbose=0,
+        n_jobs=-1,
+    )
+    cv_obj.fit(X_train, y_train)
+    model = cv_obj.best_estimator_
+else:
+    # Model: Random Forest previously optimized with a cross-validation
+    model = RandomForestRegressor(
+        max_depth=10, n_estimators=50, random_state=59
+    )
+
+# Estimate prediction intervals on test set with best estimator
+alpha = 0.05
+cv_mapietimeseries = BlockBootstrap(
+    n_resamplings=100, length=48, overlapping=True, random_state=59
+)
+
+mapie_enpbi = MapieTimeSeriesRegressor(
+    model,
+    method="enbpi",
+    cv=cv_mapietimeseries,
+    agg_function="mean",
+    n_jobs=-1,
+)
+
+print("EnbPI, with no partial_fit, width optimization")
+mapie_enpbi = mapie_enpbi.fit(X_train, y_train)
+y_pred_npfit_enbpi, y_pis_npfit_enbpi = mapie_enpbi.predict(
+    X_test, alpha=alpha, ensemble=True, optimize_beta=True
+)
+coverage_npfit_enbpi = regression_coverage_score(
+    y_test, y_pis_npfit_enbpi[:, 0, 0], y_pis_npfit_enbpi[:, 1, 0]
+)
+
+width_npfit_enbpi = regression_mean_width_score(
+    y_pis_npfit_enbpi[:, 1, 0], y_pis_npfit_enbpi[:, 0, 0]
+)
+
+print("EnbPI with partial_fit, width optimization")
+mapie_enpbi = mapie_enpbi.fit(X_train, y_train)
+y_pred_pfit_enbpi = np.zeros(y_pred_npfit_enbpi.shape)
+y_pis_pfit_enbpi = np.zeros(y_pis_npfit_enbpi.shape)
+
+step_size = 1
+(
+    y_pred_pfit_enbpi[:step_size],
+    y_pis_pfit_enbpi[:step_size, :, :],
+) = mapie_enpbi.predict(
+    X_test.iloc[:step_size, :], alpha=alpha, ensemble=True, optimize_beta=True
+)
+
+for step in range(step_size, len(X_test), step_size):
+    mapie_enpbi.partial_fit(
+        X_test.iloc[(step - step_size):step, :],
+        y_test.iloc[(step - step_size):step],
+    )
+    (
+        y_pred_pfit_enbpi[step:step + step_size],
+        y_pis_pfit_enbpi[step:step + step_size, :, :],
+    ) = mapie_enpbi.predict(
+        X_test.iloc[step:(step + step_size), :],
+        alpha=alpha,
+        ensemble=True,
+        optimize_beta=True,
+    )
+coverage_pfit_enbpi = regression_coverage_score(
+    y_test, y_pis_pfit_enbpi[:, 0, 0], y_pis_pfit_enbpi[:, 1, 0]
+)
+width_pfit_enbpi = regression_mean_width_score(
+    y_pis_pfit_enbpi[:, 1, 0], y_pis_pfit_enbpi[:, 0, 0]
+)
+
+# Print results
+print(
+    "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: "
+    "\nEnbPI without any partial_fit:"
+    f"{coverage_npfit_enbpi :.3f}, {width_npfit_enbpi:.3f}"
+)
+print(
+    "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: "
+    "\nEnbPI with partial_fit:"
+    f"{coverage_pfit_enbpi:.3f}, {width_pfit_enbpi:.3f}"
+)
+
+enbpi_no_pfit = {
+    "y_pred": y_pred_npfit_enbpi,
+    "y_pis": y_pis_npfit_enbpi,
+    "coverage": coverage_npfit_enbpi,
+    "width": width_npfit_enbpi,
+}
+
+enbpi_pfit = {
+    "y_pred": y_pred_pfit_enbpi,
+    "y_pis": y_pis_pfit_enbpi,
+    "coverage": coverage_pfit_enbpi,
+    "width": width_pfit_enbpi,
+}
+
+results = [enbpi_no_pfit, enbpi_pfit]
+
+# Plot estimated prediction intervals on test set
+fig, axs = plt.subplots(
+    nrows=2, ncols=1, figsize=(15, 12), sharex="col"
+)
+
+for i, (ax, w, result) in enumerate(
+    zip(axs, ["EnbPI, without partial_fit", "EnbPI with partial_fit"], results)
+):
+    ax.set_ylabel("Hourly demand (GW)", fontsize=20)
+    ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1")
+
+    ax.plot(
+        demand_test.index,
+        result["y_pred"],
+        lw=2,
+        c="C2",
+        label="Predictions",
+    )
+
+    y_pis = cast(NDArray, result["y_pis"])
+
+    ax.fill_between(
+        demand_test.index,
+        y_pis[:, 0, 0],
+        y_pis[:, 1, 0],
+        color="C2",
+        alpha=0.2,
+        label="MapieTimeSeriesRegressor PIs",
+    )
+
+    ax.set_title(
+        w + "\n"
+        f"Coverage:{result['coverage']:.3f}  Width:{result['width']:.3f}",
+        fontweight="bold",
+        size=20
+    )
+    plt.xticks(size=15, rotation=45)
+    plt.yticks(size=15)
+
+axs[0].legend(prop={'size': 22})
+plt.show()
diff --git a/mapie/_compatibility.py b/mapie/_compatibility.py
@@ -26,8 +26,31 @@ def np_quantile_version_above_122(
     return np.quantile(a, q, method=method, **kwargs)  # type: ignore
 
 
+def np_nanquantile_version_below_122(
+    a: ArrayLike,
+    q: ArrayLike,
+    method: str = "linear",
+    **kwargs: Any
+) -> NDArray:
+    """Wrapper of np.quantile function for numpy version < 1.22."""
+    return np.nanquantile(a, q, interpolation=method, **kwargs)  # type: ignore
+
+
+def np_nanquantile_version_above_122(
+    a: ArrayLike,
+    q: ArrayLike,
+    method: str = "linear",
+    **kwargs: Any
+) -> NDArray:
+    """Wrapper of np.quantile function for numpy version >= 1.22."""
+    return np.nanquantile(a, q, method=method, **kwargs)  # type: ignore
+
+
 numpy_version = parse_version(np.__version__)
 if numpy_version < parse_version("1.22"):
     np_quantile = np_quantile_version_below_122
+    np_nanquantile = np_nanquantile_version_below_122
+
 else:
     np_quantile = np_quantile_version_above_122
+    np_nanquantile = np_nanquantile_version_above_122