From a6b1a2bdd5bacd696415978e9f2c25494506d9f9 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Tue, 8 Feb 2022 17:10:08 +0100 Subject: [PATCH 01/32] [ADD] EnbPI Remark: "artifical" alpha=0.5 in partial_fit. Should change regression.py to avoid it --- HISTORY.rst | 1 + mapie/regression.py | 1 + mapie/tests/test_time_series_regression.py | 36 ++++ mapie/time_series_regression.py | 190 +++++++++++++++++++++ 4 files changed, 228 insertions(+) create mode 100644 mapie/tests/test_time_series_regression.py create mode 100644 mapie/time_series_regression.py diff --git a/HISTORY.rst b/HISTORY.rst index a56d5d1f7..d2aee8b2e 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -14,6 +14,7 @@ History "predict" in regression.py * Add replication of the Chen Xu's tutorial testing Jackknife+aB vs Jackknife+ * Add Jackknife+-after-Bootstrap documentation +* Add EnbPI method for Time Series Regression 0.3.1 (2021-11-19) ------------------ diff --git a/mapie/regression.py b/mapie/regression.py index 12f910cf5..674816ec3 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -182,6 +182,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): # type: ignore >>> print(y_pred) [ 5.28571429 7.17142857 9.05714286 10.94285714 12.82857143 14.71428571] """ + valid_methods_ = ["naive", "base", "plus", "minmax"] valid_agg_functions_ = [None, "median", "mean"] fit_attributes = [ diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py new file mode 100644 index 000000000..94411ed5b --- /dev/null +++ b/mapie/tests/test_time_series_regression.py @@ -0,0 +1,36 @@ +import numpy as np +from sklearn.linear_model import LinearRegression + +from mapie.time_series_regression import MapieTimeSeriesRegressor + + +def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: + """Test ``residuals_`` when ``ensemble`` is True.""" + X_toy = np.array([[0], [1], [2], [3], [4], [5]]) + y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) + mapie_ts_reg = 
MapieTimeSeriesRegressor(LinearRegression(), cv=-1).fit( + X_toy, y_toy + ) + assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) + mapie_ts_reg = mapie_ts_reg.partial_fit( + X=np.array([[6]]), y=np.array([17.5]), ensemble=True + ) + assert round(mapie_ts_reg.residuals_[-1], 2) == round( + np.abs(17.5 - 16.56), 2 + ) + + +def test_MapieTimeSeriesRegressor_partial_fit_ensemble_F() -> None: + """Test ``residuals_`` when ``ensemble`` is False.""" + X_toy = np.array([[0], [1], [2], [3], [4], [5]]) + y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) + mapie_ts_reg = MapieTimeSeriesRegressor(LinearRegression(), cv=-1).fit( + X_toy, y_toy + ) + assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) + mapie_ts_reg = mapie_ts_reg.partial_fit( + X=np.array([[6]]), y=np.array([17.5]), ensemble=False + ) + assert round(mapie_ts_reg.residuals_[-1], 2) == round( + np.abs(17.5 - 16.6), 2 + ) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py new file mode 100644 index 000000000..450841028 --- /dev/null +++ b/mapie/time_series_regression.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +from typing import Optional, Union + +import numpy as np +from sklearn.base import RegressorMixin +from sklearn.model_selection import BaseCrossValidator + +from .regression import MapieRegressor +from ._typing import ArrayLike + + +class MapieTimeSeriesRegressor(MapieRegressor): + """ + Prediction interval with out-of-fold residuals for time series. + + This class implements the EnbPI strategy and some variations + for estimating prediction intervals on single-output time series. + It is ``MapieReegressor`` with one more method ``partial_fit``. + Actually, EnbPI only corresponds to MapieRegressor if the ``cv`` argument + if of type ``Subsample`` (Jackknife+-after-Bootstrap method). 
Moreover, for + the moment we consider the absolute values of the residuals of the model, + and consequently the prediction intervals are symetryc. + + Parameters + ---------- + estimator : Optional[RegressorMixin] + Any regressor with scikit-learn API + (i.e. with fit and predict methods), by default ``None``. + If ``None``, estimator defaults to a ``LinearRegression`` instance. + + method: str, optional + Method to choose for prediction interval estimates. + Choose among: + + - "naive", based on training set residuals, + - "base", based on validation sets residuals, + - "plus", based on validation residuals and testing predictions, + - "minmax", based on validation residuals and testing predictions + (min/max among cross-validation clones). + + By default "plus". + + cv: Optional[Union[int, str, BaseCrossValidator]] + The cross-validation strategy for computing residuals. + It directly drives the distinction between jackknife and cv variants. + Choose among: + + - ``None``, to use the default 5-fold cross-validation + - integer, to specify the number of folds. + If equal to -1, equivalent to + ``sklearn.model_selection.LeaveOneOut()``. + - CV splitter: any ``sklearn.model_selection.BaseCrossValidator`` + Main variants are: + - ``sklearn.model_selection.LeaveOneOut`` (jackknife), + - ``sklearn.model_selection.KFold`` (cross-validation), + - ``subsample.Subsample`` object (bootstrap). + - ``"prefit"``, assumes that ``estimator`` has been fitted already, + and the ``method`` parameter is ignored. + All data provided in the ``fit`` method is then used + for computing residuals only. + At prediction time, quantiles of these residuals are used to provide + a prediction interval with fixed width. + The user has to take care manually that data for model fitting and + residual estimate are disjoint. + + By default ``None``. + + n_jobs: Optional[int] + Number of jobs for parallel processing using joblib + via the "locky" backend. + If ``-1`` all CPUs are used. 
+ If ``1`` is given, no parallel computing code is used at all, + which is useful for debugging. + For n_jobs below ``-1``, ``(n_cpus + 1 - n_jobs)`` are used. + None is a marker for `unset` that will be interpreted as ``n_jobs=1`` + (sequential execution). + + By default ``None``. + + agg_function : str + Determines how to aggregate predictions from perturbed models, both at + training and prediction time. + + If ``None``, it is ignored except if cv class is ``Subsample``, + in which case an error is raised. + If "mean" or "median", returns the mean or median of the predictions + computed from the out-of-folds models. + Note: if you plan to set the ``ensemble`` argument to ``True`` in the + ``predict`` method, you have to specify an aggregation function. + Otherwise an error would be raised. + + The Jackknife+ interval can be interpreted as an interval around the + median prediction, and is guaranteed to lie inside the interval, + unlike the single estimator predictions. + + When the cross-validation strategy is Subsample (i.e. for the + Jackknife+-after-Bootstrap method), this function is also used to + aggregate the training set in-sample predictions. + + If cv is ``"prefit"``, ``agg_function`` is ignored. + + By default "mean". + + verbose : int, optional + The verbosity level, used with joblib for multiprocessing. + The frequency of the messages increases with the verbosity level. + If it more than ``10``, all iterations are reported. + Above ``50``, the output is sent to stdout. + + By default ``0``. + + Attributes + ---------- + valid_methods: List[str] + List of all valid methods. + + single_estimator_ : sklearn.RegressorMixin + Estimator fitted on the whole training set. + + estimators_ : list + List of out-of-folds estimators. + + residuals_ : ArrayLike of shape (n_samples_train,) + Residuals between ``y_train`` and ``y_pred``. 
+ + k_ : ArrayLike + - Array of nans, of shape (len(y), 1) if cv is ``"prefit"`` + (defined but not used) + - Dummy array of folds containing each training sample, otherwise. + Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)). + + n_features_in_: int + Number of features passed to the fit method. + + n_samples_val_: List[int] + Number of samples passed to the fit method. + + References + ---------- + Chen Xu, and Yao Xie. + "Conformal prediction for dynamic time-series." + """ + + def __init__( + self, + estimator: Optional[RegressorMixin] = None, + method: str = "plus", + cv: Optional[Union[int, str, BaseCrossValidator]] = None, + n_jobs: Optional[int] = None, + agg_function: Optional[str] = "mean", + verbose: int = 0, + ) -> None: + super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) + + def partial_fit( + self, X: ArrayLike, y: ArrayLike, ensemble: bool = True + ) -> MapieTimeSeriesRegressor: + """ + Update the ``residuals_`` attribute when data with known labels are + available + + Parameters + ---------- + X : ArrayLike of shape (n_samples, n_features) + Input data. + + y : ArrayLike of shape (n_samples,) + Input labels. + + ensemble : bool + Boolean corresponing to the ``ensemble`` argument of ``predict`` + method, determining whether the predictions computed to determine + the new ``residuals_`` are ensembled or not. + If False, predictions are those of the model trained on the whole + training set. + + Returns + ------- + MapieTimeSeriesRegressor + The model itself. 
+ """ + y_pred, y_pis = self.predict(X, alpha=0.5, ensemble=ensemble) + new_residuals = np.abs(y - y_pred) + + cut_index = min(len(new_residuals), len(self.residuals_)) + self.residuals_ = np.concatenate( + [self.residuals_[cut_index:], new_residuals], axis=0 + ) + return self From 14552cb99d72dd6e693ce249218caa640da3367b Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Fri, 11 Feb 2022 16:16:55 +0100 Subject: [PATCH 02/32] [CHANGE] partial_fit -> partial_update --- mapie/tests/test_time_series_regression.py | 19 +++++++++---------- mapie/time_series_regression.py | 6 +++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 94411ed5b..a673d0fb9 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -3,16 +3,17 @@ from mapie.time_series_regression import MapieTimeSeriesRegressor +X_toy = np.array([[0], [1], [2], [3], [4], [5]]) +y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) -def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: - """Test ``residuals_`` when ``ensemble`` is True.""" - X_toy = np.array([[0], [1], [2], [3], [4], [5]]) - y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) + +def test_MapieTimeSeriesRegressor_partial_update_ensemble_T() -> None: + """Test ``partial_update`` when ``ensemble`` is True.""" mapie_ts_reg = MapieTimeSeriesRegressor(LinearRegression(), cv=-1).fit( X_toy, y_toy ) assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) - mapie_ts_reg = mapie_ts_reg.partial_fit( + mapie_ts_reg = mapie_ts_reg.partial_update( X=np.array([[6]]), y=np.array([17.5]), ensemble=True ) assert round(mapie_ts_reg.residuals_[-1], 2) == round( @@ -20,15 +21,13 @@ def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: ) -def test_MapieTimeSeriesRegressor_partial_fit_ensemble_F() -> None: - """Test ``residuals_`` when ``ensemble`` is False.""" - X_toy = 
np.array([[0], [1], [2], [3], [4], [5]]) - y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) +def test_MapieTimeSeriesRegressor_partial_update_ensemble_F() -> None: + """Test ``partial_update`` when ``ensemble`` is False.""" mapie_ts_reg = MapieTimeSeriesRegressor(LinearRegression(), cv=-1).fit( X_toy, y_toy ) assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) - mapie_ts_reg = mapie_ts_reg.partial_fit( + mapie_ts_reg = mapie_ts_reg.partial_update( X=np.array([[6]]), y=np.array([17.5]), ensemble=False ) assert round(mapie_ts_reg.residuals_[-1], 2) == round( diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 450841028..7722a4583 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -153,12 +153,12 @@ def __init__( ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) - def partial_fit( + def partial_update( self, X: ArrayLike, y: ArrayLike, ensemble: bool = True ) -> MapieTimeSeriesRegressor: """ Update the ``residuals_`` attribute when data with known labels are - available + available. Parameters ---------- @@ -180,7 +180,7 @@ def partial_fit( MapieTimeSeriesRegressor The model itself. 
""" - y_pred, y_pis = self.predict(X, alpha=0.5, ensemble=ensemble) + y_pred, _ = self.predict(X, alpha=0.5, ensemble=ensemble) new_residuals = np.abs(y - y_pred) cut_index = min(len(new_residuals), len(self.residuals_)) From e23a4d127136f0a5a6616bc8b83d5bcf56532120 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 28 Feb 2022 11:06:49 +0100 Subject: [PATCH 03/32] GMA & VTA's first remarks taken into account --- mapie/tests/test_time_series_regression.py | 17 ++++++----------- mapie/time_series_regression.py | 10 ++++++---- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index a673d0fb9..2d7a8ef28 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -1,5 +1,4 @@ import numpy as np -from sklearn.linear_model import LinearRegression from mapie.time_series_regression import MapieTimeSeriesRegressor @@ -7,13 +6,11 @@ y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) -def test_MapieTimeSeriesRegressor_partial_update_ensemble_T() -> None: +def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: """Test ``partial_update`` when ``ensemble`` is True.""" - mapie_ts_reg = MapieTimeSeriesRegressor(LinearRegression(), cv=-1).fit( - X_toy, y_toy - ) + mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) - mapie_ts_reg = mapie_ts_reg.partial_update( + mapie_ts_reg = mapie_ts_reg.partial_fit( X=np.array([[6]]), y=np.array([17.5]), ensemble=True ) assert round(mapie_ts_reg.residuals_[-1], 2) == round( @@ -21,13 +18,11 @@ def test_MapieTimeSeriesRegressor_partial_update_ensemble_T() -> None: ) -def test_MapieTimeSeriesRegressor_partial_update_ensemble_F() -> None: +def test_MapieTimeSeriesRegressor_partial_fit_ensemble_F() -> None: """Test ``partial_update`` when ``ensemble`` is False.""" - mapie_ts_reg = 
MapieTimeSeriesRegressor(LinearRegression(), cv=-1).fit( - X_toy, y_toy - ) + mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) - mapie_ts_reg = mapie_ts_reg.partial_update( + mapie_ts_reg = mapie_ts_reg.partial_fit( X=np.array([[6]]), y=np.array([17.5]), ensemble=False ) assert round(mapie_ts_reg.residuals_[-1], 2) == round( diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 7722a4583..4540a6dcc 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -16,11 +16,13 @@ class MapieTimeSeriesRegressor(MapieRegressor): This class implements the EnbPI strategy and some variations for estimating prediction intervals on single-output time series. - It is ``MapieReegressor`` with one more method ``partial_fit``. + It is ``MapieRegressor`` with one more method ``partial_fit``. Actually, EnbPI only corresponds to MapieRegressor if the ``cv`` argument if of type ``Subsample`` (Jackknife+-after-Bootstrap method). Moreover, for the moment we consider the absolute values of the residuals of the model, - and consequently the prediction intervals are symetryc. + and consequently the prediction intervals are symmetryc. Moreover we did + not implement the PI's optimization to the oracle interval yet. It is still + a first step before implementing the actual EnbPI. Parameters ---------- @@ -153,7 +155,7 @@ def __init__( ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) - def partial_update( + def partial_fit( self, X: ArrayLike, y: ArrayLike, ensemble: bool = True ) -> MapieTimeSeriesRegressor: """ @@ -169,7 +171,7 @@ def partial_update( Input labels. 
ensemble : bool - Boolean corresponing to the ``ensemble`` argument of ``predict`` + Boolean corresponding to the ``ensemble`` argument of ``predict`` method, determining whether the predictions computed to determine the new ``residuals_`` are ensembled or not. If False, predictions are those of the model trained on the whole From db21938318b1ec7d95390ed7fa47ad9a24632bde Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 28 Feb 2022 13:30:34 +0100 Subject: [PATCH 04/32] I made a confusion deleting the doc oc regression.py... --- mapie/time_series_regression.py | 114 -------------------------------- 1 file changed, 114 deletions(-) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 4540a6dcc..c69c954f6 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -24,120 +24,6 @@ class MapieTimeSeriesRegressor(MapieRegressor): not implement the PI's optimization to the oracle interval yet. It is still a first step before implementing the actual EnbPI. - Parameters - ---------- - estimator : Optional[RegressorMixin] - Any regressor with scikit-learn API - (i.e. with fit and predict methods), by default ``None``. - If ``None``, estimator defaults to a ``LinearRegression`` instance. - - method: str, optional - Method to choose for prediction interval estimates. - Choose among: - - - "naive", based on training set residuals, - - "base", based on validation sets residuals, - - "plus", based on validation residuals and testing predictions, - - "minmax", based on validation residuals and testing predictions - (min/max among cross-validation clones). - - By default "plus". - - cv: Optional[Union[int, str, BaseCrossValidator]] - The cross-validation strategy for computing residuals. - It directly drives the distinction between jackknife and cv variants. - Choose among: - - - ``None``, to use the default 5-fold cross-validation - - integer, to specify the number of folds. 
- If equal to -1, equivalent to - ``sklearn.model_selection.LeaveOneOut()``. - - CV splitter: any ``sklearn.model_selection.BaseCrossValidator`` - Main variants are: - - ``sklearn.model_selection.LeaveOneOut`` (jackknife), - - ``sklearn.model_selection.KFold`` (cross-validation), - - ``subsample.Subsample`` object (bootstrap). - - ``"prefit"``, assumes that ``estimator`` has been fitted already, - and the ``method`` parameter is ignored. - All data provided in the ``fit`` method is then used - for computing residuals only. - At prediction time, quantiles of these residuals are used to provide - a prediction interval with fixed width. - The user has to take care manually that data for model fitting and - residual estimate are disjoint. - - By default ``None``. - - n_jobs: Optional[int] - Number of jobs for parallel processing using joblib - via the "locky" backend. - If ``-1`` all CPUs are used. - If ``1`` is given, no parallel computing code is used at all, - which is useful for debugging. - For n_jobs below ``-1``, ``(n_cpus + 1 - n_jobs)`` are used. - None is a marker for `unset` that will be interpreted as ``n_jobs=1`` - (sequential execution). - - By default ``None``. - - agg_function : str - Determines how to aggregate predictions from perturbed models, both at - training and prediction time. - - If ``None``, it is ignored except if cv class is ``Subsample``, - in which case an error is raised. - If "mean" or "median", returns the mean or median of the predictions - computed from the out-of-folds models. - Note: if you plan to set the ``ensemble`` argument to ``True`` in the - ``predict`` method, you have to specify an aggregation function. - Otherwise an error would be raised. - - The Jackknife+ interval can be interpreted as an interval around the - median prediction, and is guaranteed to lie inside the interval, - unlike the single estimator predictions. - - When the cross-validation strategy is Subsample (i.e. 
for the - Jackknife+-after-Bootstrap method), this function is also used to - aggregate the training set in-sample predictions. - - If cv is ``"prefit"``, ``agg_function`` is ignored. - - By default "mean". - - verbose : int, optional - The verbosity level, used with joblib for multiprocessing. - The frequency of the messages increases with the verbosity level. - If it more than ``10``, all iterations are reported. - Above ``50``, the output is sent to stdout. - - By default ``0``. - - Attributes - ---------- - valid_methods: List[str] - List of all valid methods. - - single_estimator_ : sklearn.RegressorMixin - Estimator fitted on the whole training set. - - estimators_ : list - List of out-of-folds estimators. - - residuals_ : ArrayLike of shape (n_samples_train,) - Residuals between ``y_train`` and ``y_pred``. - - k_ : ArrayLike - - Array of nans, of shape (len(y), 1) if cv is ``"prefit"`` - (defined but not used) - - Dummy array of folds containing each training sample, otherwise. - Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)). - - n_features_in_: int - Number of features passed to the fit method. - - n_samples_val_: List[int] - Number of samples passed to the fit method. - References ---------- Chen Xu, and Yao Xie. 
From 666b8df52b7a6833fc39146accd6a0ea35f86050 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 28 Feb 2022 18:51:34 +0100 Subject: [PATCH 05/32] ADD examples/regression/plot_timeseries_enbpi.py --- examples/regression/plot_timeseries_enbpi.py | 162 ++++++++++++++++++ .../regression/plot_timeseries_example.py | 2 +- mapie/regression.py | 1 + 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 examples/regression/plot_timeseries_enbpi.py diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py new file mode 100644 index 000000000..a07518584 --- /dev/null +++ b/examples/regression/plot_timeseries_enbpi.py @@ -0,0 +1,162 @@ +""" +================================================================== +Estimating prediction intervals of time series forecast with EnbPI +================================================================== +This example uses +:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate +prediction intervals associated with time series forecast. The implementation +is still at its first step, based on Jackknife+-after-bootsrtap, to estimate +residuals and associated prediction intervals. + +We use here the Victoria electricity demand dataset used in the book +"Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos. +The electricity demand features daily and weekly seasonalities and is impacted +by the temperature, considered here as a exogeneous variable. + +The data is modelled by a Random Forest model with a +:class:`sklearn.model_selection.RandomizedSearchCV` using a sequential +:class:`sklearn.model_selection.TimeSeriesSplit` cross validation, in which the +training set is prior to the validation set. +The best model is then feeded into +:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the +associated prediction intervals. 
We compare two approaches: one with no +`partial_fit` call and one with `partial_fit` every 5 steps. +""" +import warnings + +import numpy as np +import pandas as pd +from matplotlib import pylab as plt +from scipy.stats import randint +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit + +from mapie.metrics import regression_coverage_score +from mapie.subsample import Subsample +from mapie.time_series_regression import MapieTimeSeriesRegressor + +warnings.simplefilter("ignore") + +# Load input data and feature engineering +demand_df = pd.read_csv( + "../data/demand_temperature.csv", parse_dates=True, index_col=0 +) +demand_df["Date"] = pd.to_datetime(demand_df.index) +demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") +demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") +demand_df["Hour"] = demand_df.index.hour +for hour in range(1, 5): + demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) + +# Train/validation/test split +num_test_steps = 24 * 7 * 2 +demand_train = demand_df.iloc[:-num_test_steps, :].copy() +demand_test = demand_df.iloc[-num_test_steps:, :].copy() +features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [ + f"Lag_{hour}" for hour in range(1, 5) +] +X_train = demand_train.loc[ + ~np.any(demand_train[features].isnull(), axis=1), features +] +y_train = demand_train.loc[X_train.index, "Demand"] +X_test = demand_test.loc[:, features] +y_test = demand_test["Demand"] + +# CV parameter search +n_iter = 10 +n_splits = 5 +tscv = TimeSeriesSplit(n_splits=n_splits) +random_state = 59 +rf_model = RandomForestRegressor(random_state=random_state) +rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 1e3)} +cv_obj = RandomizedSearchCV( + rf_model, + param_distributions=rf_params, + n_iter=n_iter, + cv=tscv, + scoring="neg_root_mean_squared_error", + random_state=random_state, + verbose=0, + n_jobs=-1, +) 
+cv_obj.fit(X_train, y_train) +best_est = cv_obj.best_estimator_ + +# Estimate prediction intervals on test set with best estimator +alpha = 0.1 +cv_Mapie = Subsample(30, random_state=random_state) +mapie = MapieTimeSeriesRegressor( + best_est, method="plus", cv=cv_Mapie, agg_function="median", n_jobs=-1 +) +mapie.fit(X_train, y_train) + +# With no partial_fit +y_pred, y_pis = mapie.predict(X_test, alpha=alpha) +coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]) +width = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() + +# With partial_fit every five hours +y_pred_5_steps, y_pis_5_steps = mapie.predict(X_test.iloc[:5, :], alpha=alpha) + +for step in range(5, len(X_test), 5): + mapie.partial_fit( + X_test.iloc[(step - 5):step, :], y_test.iloc[(step - 5):step] + ) + y_pred_step, y_pis_step = mapie.predict( + X_test.iloc[step:(step + 5), :], alpha=alpha + ) + y_pred_5_steps = np.concatenate((y_pred_5_steps, y_pred_step), axis=0) + y_pis_5_steps = np.concatenate((y_pis_5_steps, y_pis_step), axis=0) + +coverage_5_step = regression_coverage_score( + y_test, y_pis_5_steps[:, 0, 0], y_pis_5_steps[:, 1, 0] +) +width_5_step = (y_pis_5_steps[:, 1, 0] - y_pis_5_steps[:, 0, 0]).mean() + + +# Print results +print( + "Coverage and prediction interval width mean for MapieTimeSeriesRegressor:" + "\nWithout any partial_fit:" + f"{coverage:.3f}, {width:.3f}" +) + +# Plot estimated prediction intervals on test set +fig = plt.figure(figsize=(15, 5)) +ax = fig.add_subplot(1, 1, 1) +ax.set_ylabel("Hourly demand (GW)") +ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +ax.plot(demand_test.index, y_pred, lw=2, c="C2", label="Predictions") +ax.fill_between( + demand_test.index, + y_pis[:, 0, 0], + y_pis[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", +) +ax.legend() +plt.show() + +print( + "Coverage and prediction interval width mean for MapieTimeSeriesRegressor:" + "\nWith partial_fit every 5 steps:" + f"{coverage_5_step:.3f}, 
{width_5_step:.3f}" +) + +# Plot estimated prediction intervals on test set +fig = plt.figure(figsize=(15, 5)) +ax = fig.add_subplot(1, 1, 1) +ax.set_ylabel("Hourly demand (GW)") +ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +ax.plot(demand_test.index, y_pred_5_steps, lw=2, c="C2", label="Predictions") +ax.fill_between( + demand_test.index, + y_pis_5_steps[:, 0, 0], + y_pis_5_steps[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", +) +ax.legend() +plt.show() diff --git a/examples/regression/plot_timeseries_example.py b/examples/regression/plot_timeseries_example.py index 8e9f5c269..4db03d2a6 100644 --- a/examples/regression/plot_timeseries_example.py +++ b/examples/regression/plot_timeseries_example.py @@ -25,7 +25,7 @@ intervals, through the `sklearn.model_selection.KFold()` object. Residuals are therefore estimated using models trained on data with higher indices than the validation data, which is inappropriate for time-series data. -Howerver, using a `sklearn.model_selection.TimeSeriesSplit` cross validation +However, using a `sklearn.model_selection.TimeSeriesSplit` cross validation object for estimating the residuals breaks the theoretical guarantees of the Jackknife+ and CV+ methods. 
""" diff --git a/mapie/regression.py b/mapie/regression.py index 674816ec3..bf3299e97 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -660,6 +660,7 @@ def predict( for _alpha in alpha_ ] ) + y_pred_up = np.column_stack( [ np.quantile( From b23484a861e2e2f406cd766befdebbfea83114e0 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Fri, 4 Mar 2022 18:03:16 +0100 Subject: [PATCH 06/32] Implement 3 predict method --- examples/regression/plot_timeseries_enbpi.py | 31 +- mapie/regression.py | 24 +- mapie/subsample.py | 108 ++++++ mapie/time_series_regression.py | 363 ++++++++++++++++++- 4 files changed, 481 insertions(+), 45 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index a07518584..164374496 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -27,9 +27,7 @@ import numpy as np import pandas as pd from matplotlib import pylab as plt -from scipy.stats import randint from sklearn.ensemble import RandomForestRegressor -from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit from mapie.metrics import regression_coverage_score from mapie.subsample import Subsample @@ -62,31 +60,14 @@ X_test = demand_test.loc[:, features] y_test = demand_test["Demand"] -# CV parameter search -n_iter = 10 -n_splits = 5 -tscv = TimeSeriesSplit(n_splits=n_splits) -random_state = 59 -rf_model = RandomForestRegressor(random_state=random_state) -rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 1e3)} -cv_obj = RandomizedSearchCV( - rf_model, - param_distributions=rf_params, - n_iter=n_iter, - cv=tscv, - scoring="neg_root_mean_squared_error", - random_state=random_state, - verbose=0, - n_jobs=-1, -) -cv_obj.fit(X_train, y_train) -best_est = cv_obj.best_estimator_ +# Model +model = RandomForestRegressor(max_depth=15, n_estimators=673, random_state=59) # Estimate prediction intervals on test set with best estimator 
alpha = 0.1 -cv_Mapie = Subsample(30, random_state=random_state) +cv_Mapie = Subsample(30, random_state=59) mapie = MapieTimeSeriesRegressor( - best_est, method="plus", cv=cv_Mapie, agg_function="median", n_jobs=-1 + model, method="plus", cv=cv_Mapie, agg_function="median", n_jobs=-1 ) mapie.fit(X_train, y_train) @@ -100,10 +81,10 @@ for step in range(5, len(X_test), 5): mapie.partial_fit( - X_test.iloc[(step - 5):step, :], y_test.iloc[(step - 5):step] + X_test.iloc[(step - 5) : step, :], y_test.iloc[(step - 5) : step] ) y_pred_step, y_pis_step = mapie.predict( - X_test.iloc[step:(step + 5), :], alpha=alpha + X_test.iloc[step : (step + 5), :], alpha=alpha ) y_pred_5_steps = np.concatenate((y_pred_5_steps, y_pred_step), axis=0) y_pis_5_steps = np.concatenate((y_pis_5_steps, y_pis_step), axis=0) diff --git a/mapie/regression.py b/mapie/regression.py index bf3299e97..7345b9b5a 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -182,7 +182,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): # type: ignore >>> print(y_pred) [ 5.28571429 7.17142857 9.05714286 10.94285714 12.82857143 14.71428571] """ - + cv_need_agg_function = [Subsample] valid_methods_ = ["naive", "base", "plus", "minmax"] valid_agg_functions_ = [None, "median", "mean"] fit_attributes = [ @@ -249,7 +249,7 @@ def _check_agg_function( ------ ValueError If ``agg_function`` is not in [``None``, ``"mean"``, ``"median"``], - or is ``None`` while cv class is ``Subsample``. + or is ``None`` while cv class is in ``cv_need_agg_function``. """ if agg_function not in self.valid_agg_functions_: raise ValueError( @@ -257,10 +257,11 @@ def _check_agg_function( "Allowed values are None, 'mean', 'median'." ) - if isinstance(self.cv, Subsample) and (agg_function is None): + if ((agg_function is None) and + (type(self.cv) in self.cv_need_agg_function)): raise ValueError( - "You need to specify an aggregation function when " - "cv is a Subsample. 
" + f"You need to specify an aggregation function when " + "cv's type is in {self.cv_need_agg_function}. " ) if (agg_function is not None) or (self.cv == "prefit"): return agg_function @@ -543,7 +544,6 @@ def fit( check_nan_in_aposteriori_prediction(pred_matrix) y_pred = aggregate_all(agg_function, pred_matrix) - self.residuals_ = np.abs(y - y_pred) return self @@ -626,15 +626,9 @@ def predict( ) # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). - # If ``method`` is "plus": - # - if ``cv`` is not a ``Subsample``, - # we enforce y_pred_multi to be of shape - # (n_samples_test, n_samples_train), - # thanks to the folds identifier. - # - if ``cv``is a ``Subsample``, the methode - # ``aggregate_with_mask`` fits it to the right size - # thanks to the shape of k_. + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size thanks to + # the shape of k_. y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) diff --git a/mapie/subsample.py b/mapie/subsample.py index abb7b31c8..1541f2854 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -3,6 +3,7 @@ from typing import Any, Generator, Optional, Tuple, Union import numpy as np +from numpy.lib.stride_tricks import sliding_window_view from numpy.random import RandomState from sklearn.model_selection import BaseCrossValidator from sklearn.utils import check_random_state, resample @@ -99,3 +100,110 @@ def get_n_splits(self, *args: Any, **kargs: Any) -> int: Returns the number of splitting iterations in the cross-validator. """ return self.n_resamplings + + +class BlockBootstrap(BaseCrossValidator): # type: ignore + """ + Generate a sampling method, that block bootstraps the training set. + It can replace KFold, LeaveOneOut or SubSample as cv argument in the MAPIE + class. + + Parameters + ---------- + n_resamplings : int + Number of resamplings. By default ``30``. + length: int + Length of the blocks. 
+ overlapping: bool + Whether the blocks can overlapp or not. By default ``False``. + n_blocsk: int + Number of blocks in each resampling. By default ``None``, + the size of the training set divided by ``length``. + random_state: Optional + int or RandomState instance. + + + Examples + -------- + >>> import numpy as np + >>> from mapie.subsample import BlockBootstrap + >>> cv = BlockBootstrap(n_resamplings=2, length = 3, random_state=0) + >>> X = np.array([1,2,3,4,5,6,7,8,9,10]) + >>> for train_index, test_index in cv.split(X): + ... print(f"train index is {train_index}, test index is {test_index}") + train index is [5 0 3 3 7 9 3 5 2 4], test index is [8 1 6] + train index is [7 6 8 8 1 6 7 7 8 1], test index is [0 2 3 4 5 9] + """ + + def __init__( + self, + n_resamplings: int = 30, + length: int = 1, + n_blocks: Optional[int] = None, + overlapping: bool = False, + random_state: Optional[Union[int, RandomState]] = None, + ) -> None: + self.n_resamplings = n_resamplings + self.length = length + self.n_blocks = n_blocks + self.overlapping = overlapping + self.random_state = random_state + + def split( + self, X: ArrayLike + ) -> Generator[Tuple[Any, ArrayLike], None, None]: + """ + Generate indices to split data into training and test sets. + + Parameters + ---------- + X : ArrayLike of shape (n_samples, n_features) + Training data. + + Yields + ------ + train : ArrayLike of shape (n_indices_training,) + The training set indices for that split. + test : ArrayLike of shape (n_indices_test,) + The testing set indices for that split. + """ + indices = np.arange(len(X)) + if self.length > len(indices): + raise ValueError( + "The length of blocks is greater than the lenght" + "of training set." 
+ ) + + if self.overlapping: + blocks = sliding_window_view(indices, window_shape=self.length) + else: + indices = indices[len(indices) % self.length :] + blocks_number = len(indices) // self.length + blocks = np.array_split(indices, indices_or_sections=blocks_number) + + random_state = check_random_state(self.random_state) + n_blocks = ( + self.n_blocks + if self.n_blocks is not None + else (len(indices) // self.length) + 1 + ) + for k in range(self.n_resamplings): + block_indices = np.random.randint(len(blocks), size=n_blocks) + train_index = np.concatenate( + [blocks[k] for k in block_indices], axis=0 + ) + test_index = np.array( + list(set(indices) - set(train_index)), dtype=np.int64 + ) + yield train_index, test_index + + def get_n_splits(self, *args: Any, **kargs: Any) -> int: + """ + Returns the number of splitting iterations in the cross-validator. + + Returns + ------- + int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_resamplings diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index c69c954f6..d4343c970 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -1,18 +1,28 @@ from __future__ import annotations +from argparse import ArgumentDefaultsHelpFormatter -from typing import Optional, Union +from typing import Iterable, Optional, Tuple, Union, cast import numpy as np +import numpy.ma as ma from sklearn.base import RegressorMixin from sklearn.model_selection import BaseCrossValidator +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from .aggregation_functions import aggregate_all from .regression import MapieRegressor +from .subsample import Subsample, BlockBootstrap from ._typing import ArrayLike +from .utils import ( + check_alpha, + check_alpha_and_n_samples, +) class MapieTimeSeriesRegressor(MapieRegressor): """ - Prediction interval with out-of-fold residuals for time series. 
+ Prediction interval with out-of-fold residuals for time series. This class implements the EnbPI strategy and some variations for estimating prediction intervals on single-output time series. @@ -30,6 +40,18 @@ class MapieTimeSeriesRegressor(MapieRegressor): "Conformal prediction for dynamic time-series." """ + cv_need_agg_function = [BlockBootstrap, Subsample] + valid_methods_ = ["plus"] + valid_agg_functions_ = [None, "median", "mean"] + fit_attributes = [ + "single_estimator_", + "estimators_", + "k_", + "residuals_", + "n_features_in_", + "n_samples_val_", + ] + def __init__( self, estimator: Optional[RegressorMixin] = None, @@ -41,6 +63,23 @@ def __init__( ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) + def fit( + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + ) -> MapieTimeSeriesRegressor: + """ + Returns + ------- + MapieTimeSeriesRegressor + The model itself. + """ + self = super().fit(X=X, y=y, sample_weight=sample_weight) + y_pred = super().predict(X=X) + self.residuals_ = y.values - y_pred + return self + def partial_fit( self, X: ArrayLike, y: ArrayLike, ensemble: bool = True ) -> MapieTimeSeriesRegressor: @@ -69,10 +108,324 @@ def partial_fit( The model itself. 
""" y_pred, _ = self.predict(X, alpha=0.5, ensemble=ensemble) - new_residuals = np.abs(y - y_pred) + new_residuals = y - y_pred - cut_index = min(len(new_residuals), len(self.residuals_)) + cut_index = min( + len(new_residuals[~np.isnan(new_residuals)]), len(self.residuals_) + ) self.residuals_ = np.concatenate( - [self.residuals_[cut_index:], new_residuals], axis=0 + [ + self.residuals_[cut_index:], + new_residuals[~np.isnan(new_residuals)], + ], + axis=0, ) return self + + def predict( + self, + X: ArrayLike, + ensemble: bool = False, + alpha: Optional[Union[float, Iterable[float]]] = None, + ) -> Union[ArrayLike, Tuple[ArrayLike, ArrayLike]]: + + # Checks + check_is_fitted(self, self.fit_attributes) + self._check_ensemble(ensemble) + alpha_ = check_alpha(alpha) + X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) + y_pred = self.single_estimator_.predict(X) + + if alpha is None: + return np.array(y_pred) + else: + alpha_ = cast(ArrayLike, alpha_) + check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) + + betas_0 = np.full_like(alpha_, np.nan, dtype=float) + + for ind, _alpha in enumerate(alpha_): + betas = np.linspace(0.0, _alpha, num=len(self.residuals_) + 2) + + one_alpha_beta = np.quantile( + self.residuals_, + 1 - _alpha + betas, + axis=0, + interpolation="higher", + ) + + beta = np.quantile( + self.residuals_, + betas, + axis=0, + interpolation="lower", + ) + betas_0[ind] = betas[np.argmin(one_alpha_beta - beta, axis=0)] + + lower_quantiles = np.quantile( + self.residuals_, + betas_0, + axis=0, + interpolation="lower", + ) + higher_quantiles = np.quantile( + self.residuals_, + 1 - _alpha + betas_0, + axis=0, + interpolation="higher", + ) + + if self.method in ["naive", "base"] or self.cv == "prefit": + y_pred_low = np.column_stack( + [ + y_pred[:, np.newaxis] + lower_quantiles[k] + for k in range(len(alpha_)) + ] + ) + y_pred_up = np.column_stack( + [ + y_pred[:, np.newaxis] + higher_quantiles[k] + for k in range(len(alpha_)) + ] 
+ ) + else: + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) + + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size thanks to + # the shape of k_. + + y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + + if self.method == "plus": + pred = aggregate_all(self.agg_function, y_pred_multi) + y_pred_low = np.column_stack( + [pred + lower_quantiles[k] for k in range(len(alpha_))] + ) + y_pred_up = np.column_stack( + [ + pred + higher_quantiles[k] + for k in range(len(alpha_)) + ] + ) + + if self.method == "minmax": + lower_bounds = np.min(y_pred_multi, axis=1, keepdims=True) + upper_bounds = np.max(y_pred_multi, axis=1, keepdims=True) + y_pred_low = np.column_stack( + [ + lower_bounds + lower_quantiles[k] + for k in range(len(alpha_)) + ] + ) + y_pred_up = np.column_stack( + [ + upper_bounds + higher_quantiles[k] + for k in range(len(alpha_)) + ] + ) + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) + return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) + + def predict2( + self, + X: ArrayLike, + ensemble: bool = False, + alpha: Optional[Union[float, Iterable[float]]] = None, + ) -> Union[ArrayLike, Tuple[ArrayLike, ArrayLike]]: + # Checks + check_is_fitted(self, self.fit_attributes) + self._check_ensemble(ensemble) + alpha_ = check_alpha(alpha) + X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) + y_pred = self.single_estimator_.predict(X) + + if alpha is None: + return np.array(y_pred) + else: + alpha_ = cast(ArrayLike, alpha_) + check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) + if self.method in ["naive", "base"] or self.cv == "prefit": + quantile = np.quantile( + self.residuals_, 1 - alpha_, interpolation="higher" + ) + y_pred_low = y_pred[:, np.newaxis] - quantile + y_pred_up = y_pred[:, np.newaxis] + quantile + else: + y_pred_multi = np.column_stack( + [e.predict(X) for 
e in self.estimators_] + ) + + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size thanks to + # the shape of k_. + + y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + + if self.method == "plus": + lower_bounds = y_pred_multi + self.residuals_ + upper_bounds = y_pred_multi + self.residuals_ + if self.method == "minmax": + lower_bounds = np.min(y_pred_multi, axis=1, keepdims=True) + upper_bounds = np.max(y_pred_multi, axis=1, keepdims=True) + lower_bounds = lower_bounds + self.residuals_ + upper_bounds = upper_bounds + self.residuals_ + + y_pred_low = np.column_stack( + [ + np.quantile( + ma.masked_invalid(lower_bounds), + _alpha, + axis=1, + interpolation="lower", + ) + for _alpha in alpha_ + ] + ) + + y_pred_up = np.column_stack( + [ + np.quantile( + ma.masked_invalid(upper_bounds), + 1 - _alpha, + axis=1, + interpolation="higher", + ) + for _alpha in alpha_ + ] + ) + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) + return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) + + def predict3( + self, + X: ArrayLike, + ensemble: bool = False, + alpha: Optional[Union[float, Iterable[float]]] = None, + ) -> Union[ArrayLike, Tuple[ArrayLike, ArrayLike]]: + # Checks + check_is_fitted(self, self.fit_attributes) + self._check_ensemble(ensemble) + alpha_ = check_alpha(alpha) + X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) + y_pred = self.single_estimator_.predict(X) + + if alpha is None: + return np.array(y_pred) + else: + alpha_ = cast(ArrayLike, alpha_) + check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) + + y_pred_low = [] + y_pred_up = [] + + for _alpha in alpha_: + betas = np.linspace(0.0, _alpha, num=len(self.residuals_) + 2) + + if self.method in ["naive", "base"] or self.cv == "prefit": + one_alpha_beta = np.quantile( + self.residuals_, + 1 - _alpha + betas, + axis=0, + interpolation="higher", + ) + 
beta = np.quantile( + self.residuals_, + betas, + axis=0, + interpolation="lower", + ) + + beta_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] + + lower_quantiles = np.quantile( + self.residuals_, + beta_0, + axis=0, + interpolation="lower", + ) + higher_quantiles = np.quantile( + self.residuals_, + 1 - _alpha + beta_0, + axis=0, + interpolation="higher", + ) + y_pred_low.append(y_pred[:, np.newaxis] + lower_quantiles) + y_pred_up.append(y_pred[:, np.newaxis] + higher_quantiles) + else: + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) + + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size thanks to + # the shape of k_. + + y_pred_multi = self.aggregate_with_mask( + y_pred_multi, self.k_ + ) + + if self.method == "plus": + lower_bounds = y_pred_multi + self.residuals_ + upper_bounds = y_pred_multi + self.residuals_ + + if self.method == "minmax": + lower_bounds = np.min( + y_pred_multi, axis=1, keepdims=True + ) + upper_bounds = np.max( + y_pred_multi, axis=1, keepdims=True + ) + lower_bounds = lower_bounds + self.residuals_ + upper_bounds = upper_bounds + self.residuals_ + + one_alpha_beta = np.quantile( + upper_bounds, + 1 - _alpha + betas, + axis=1, + interpolation="higher", + ) + + beta = np.quantile( + lower_bounds, + betas, + axis=1, + interpolation="lower", + ) + + betas_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] + + lower_quantiles = np.empty((len(betas_0),)) + upper_quantiles = np.empty((len(betas_0),)) + + for ind, beta_0 in enumerate(betas_0): + lower_quantiles[ind] = np.quantile( + lower_bounds[ind, :], + beta_0, + axis=0, + interpolation="lower", + ) + upper_quantiles[ind] = np.quantile( + upper_bounds[ind, :], + 1 - _alpha + beta_0, + axis=0, + interpolation="higher", + ) + y_pred_low.append(lower_quantiles) + y_pred_up.append(upper_quantiles) + + y_pred_low = np.column_stack(y_pred_low) + y_pred_up = 
np.column_stack(y_pred_up) + + print(y_pred_low.shape) + print(y_pred_up.shape) + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) + return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) From ac1a94b8c05585ae3519f4237d48df5525cd4db2 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 21 Mar 2022 11:19:58 +0100 Subject: [PATCH 07/32] [commit before update with master] --- examples/regression/plot_timeseries_enbpi.py | 4 +- mapie/regression.py | 6 +- mapie/subsample.py | 48 ++- mapie/tests/test_time_series_regression.py | 374 ++++++++++++++++++- mapie/time_series_regression.py | 212 +---------- 5 files changed, 418 insertions(+), 226 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 164374496..f5b3a1125 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -81,10 +81,10 @@ for step in range(5, len(X_test), 5): mapie.partial_fit( - X_test.iloc[(step - 5) : step, :], y_test.iloc[(step - 5) : step] + X_test.iloc[(step - 5): step, :], y_test.iloc[(step - 5):step] ) y_pred_step, y_pis_step = mapie.predict( - X_test.iloc[step : (step + 5), :], alpha=alpha + X_test.iloc[step: (step + 5), :], alpha=alpha ) y_pred_5_steps = np.concatenate((y_pred_5_steps, y_pred_step), axis=0) y_pis_5_steps = np.concatenate((y_pis_5_steps, y_pis_step), axis=0) diff --git a/mapie/regression.py b/mapie/regression.py index 7345b9b5a..77b9a3e3b 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -258,10 +258,10 @@ def _check_agg_function( ) if ((agg_function is None) and - (type(self.cv) in self.cv_need_agg_function)): + (type(self.cv) in self.cv_need_agg_function)): raise ValueError( - f"You need to specify an aggregation function when " - "cv's type is in {self.cv_need_agg_function}. " + "You need to specify an aggregation function when " + f"cv's type is in {self.cv_need_agg_function}." 
) if (agg_function is not None) or (self.cv == "prefit"): return agg_function diff --git a/mapie/subsample.py b/mapie/subsample.py index 1541f2854..de9b97933 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -113,7 +113,8 @@ class BlockBootstrap(BaseCrossValidator): # type: ignore n_resamplings : int Number of resamplings. By default ``30``. length: int - Length of the blocks. + Length of the blocks. By default ``None``, + the length of the training set divided by ``n_blocks``. overlapping: bool Whether the blocks can overlapp or not. By default ``False``. n_blocsk: int @@ -122,6 +123,10 @@ class BlockBootstrap(BaseCrossValidator): # type: ignore random_state: Optional int or RandomState instance. + Raises + ------ + ValueError + If both ``length`` and ``n_blocks`` are ``None``. Examples -------- @@ -138,11 +143,16 @@ class BlockBootstrap(BaseCrossValidator): # type: ignore def __init__( self, n_resamplings: int = 30, - length: int = 1, + length: Optional[int] = None, n_blocks: Optional[int] = None, overlapping: bool = False, random_state: Optional[Union[int, RandomState]] = None, ) -> None: + if length is None and n_blocks is None: + raise ValueError( + "At least one argument in ['length', 'n_blocks]" + "has to be not None." + ) self.n_resamplings = n_resamplings self.length = length self.n_blocks = n_blocks @@ -166,29 +176,43 @@ def split( The training set indices for that split. test : ArrayLike of shape (n_indices_test,) The testing set indices for that split. + Raises + ------ + ValueError + If ``length`` is greater than the train set size. """ + length = ( + self.length if self.length is not None else len(X) // self.n_blocks + ) + n_blocks = ( + self.n_blocks + if self.n_blocks is not None + else (len(X) // length) + 1 + ) indices = np.arange(len(X)) - if self.length > len(indices): + if length > len(indices): raise ValueError( "The length of blocks is greater than the lenght" "of training set." 
) if self.overlapping: - blocks = sliding_window_view(indices, window_shape=self.length) + blocks = sliding_window_view(indices, window_shape=length) else: - indices = indices[len(indices) % self.length :] - blocks_number = len(indices) // self.length + indices = indices[len(indices) % length:] + blocks_number = len(indices) // length blocks = np.array_split(indices, indices_or_sections=blocks_number) random_state = check_random_state(self.random_state) - n_blocks = ( - self.n_blocks - if self.n_blocks is not None - else (len(indices) // self.length) + 1 - ) + for k in range(self.n_resamplings): - block_indices = np.random.randint(len(blocks), size=n_blocks) + block_indices = resample( + range(len(blocks)), + replace=True, + n_samples=n_blocks, + random_state=random_state, + stratify=None, + ) train_index = np.concatenate( [blocks[k] for k in block_indices], axis=0 ) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 2d7a8ef28..65cd59bcb 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -1,9 +1,379 @@ +from __future__ import annotations + +from itertools import combinations +from typing import Any, List, Optional, Tuple, Union + import numpy as np +import pytest +from sklearn.datasets import make_regression +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import KFold, LeaveOneOut, train_test_split +from typing_extensions import TypedDict +from mapie._typing import ArrayLike +from mapie.aggregation_functions import aggregate_all +from mapie.metrics import regression_coverage_score from mapie.time_series_regression import MapieTimeSeriesRegressor +from mapie.subsample import BlockBootstrap + +X_toy = np.array(range(50)).reshape(-1, 1) +y_toy = (5.0 + 2.0*X_toy).flatten() +X, y = make_regression(n_samples=500, n_features=10, noise=1.0, random_state=1) +k = np.ones(shape=(5, X.shape[1])) +METHODS = ["naive", "base", "plus", 
"minmax"] + +Params = TypedDict( + "Params", + { + "method": str, + "agg_function": str, + "cv": Optional[Union[int, KFold, BlockBootstrap]], + }, +) +STRATEGIES = { + "naive": Params(method="naive", agg_function="median", cv=None), + "jackknife": Params(method="base", agg_function="mean", cv=-1), + "jackknife_plus": Params(method="plus", agg_function="mean", cv=-1), + "jackknife_minmax": Params(method="minmax", agg_function="mean", cv=-1), + "cv": Params( + method="base", + agg_function="mean", + cv=KFold(n_splits=3, shuffle=True, random_state=1), + ), + "cv_plus": Params( + method="plus", + agg_function="mean", + cv=KFold(n_splits=3, shuffle=True, random_state=1), + ), + "cv_minmax": Params( + method="minmax", + agg_function="mean", + cv=KFold(n_splits=3, shuffle=True, random_state=1), + ), + "jackknife_plus_ab": Params( + method="plus", + agg_function="mean", + cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), + ), + "jackknife_minmax_ab": Params( + method="minmax", + agg_function="mean", + cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), + ), + "jackknife_plus_median_ab": Params( + method="plus", + agg_function="median", + cv=BlockBootstrap( + n_resamplings=30, + n_blocks=5, + random_state=1, + ), + ), +} + +WIDTHS = { + "naive": 3.76, + "jackknife": 3.76, + "jackknife_plus": 3.76, + "jackknife_minmax": 3.82, + "cv": 3.76, + "cv_plus": 3.76, + "cv_minmax": 3.95, + "prefit": 4.81, + "cv_plus_median": 3.90, + "jackknife_plus_ab": 3.76, + "jackknife_minmax_ab": 3.96, + "jackknife_plus_median_ab": 3.76, +} + +COVERAGES = { + "naive": 0.952, + "jackknife": 0.952, + "jackknife_plus": 0.952, + "jackknife_minmax": 0.952, + "cv": 0.958, + "cv_plus": 0.956, + "cv_minmax": 0.966, + "prefit": 0.980, + "cv_plus_median": 0.954, + "jackknife_plus_ab": 0.952, + "jackknife_minmax_ab": 0.970, + "jackknife_plus_median_ab": 0.960, +} + + +@pytest.mark.parametrize("agg_function", ["dummy", 0, 1, 2.5, [1, 2]]) +def 
test_invalid_agg_function(agg_function: Any) -> None: + """Test that invalid agg_functions raise errors.""" + + mapie_ts_reg = MapieTimeSeriesRegressor(agg_function=None) + with pytest.raises(ValueError, match=r".*If ensemble is True*"): + mapie_ts_reg.fit(X_toy, y_toy) + mapie_ts_reg.predict(X_toy, ensemble=True) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +@pytest.mark.parametrize("dataset", [(X, y), (X_toy, y_toy)]) +@pytest.mark.parametrize("alpha", [0.2, [0.2, 0.4], (0.2, 0.4)]) +def test_predict_output_shape( + strategy: str, alpha: Any, dataset: Tuple[ArrayLike, ArrayLike] +) -> None: + """Test predict output shape.""" + mapie_ts_reg = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + (X, y) = dataset + mapie_ts_reg.fit(X, y) + y_pred, y_pis = mapie_ts_reg.predict(X, alpha=alpha) + n_alpha = len(alpha) if hasattr(alpha, "__len__") else 1 + assert y_pred.shape == (X.shape[0],) + assert y_pis.shape == (X.shape[0], 2, n_alpha) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +def test_results_for_same_alpha(strategy: str) -> None: + """ + Test that predictions and intervals + are similar with two equal values of alpha. 
+ """ + mapie_ts_reg = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie_ts_reg.fit(X, y) + _, y_pis = mapie_ts_reg.predict(X, alpha=[0.1, 0.1]) + np.testing.assert_allclose(y_pis[:, 0, 0], y_pis[:, 0, 1]) + np.testing.assert_allclose(y_pis[:, 1, 0], y_pis[:, 1, 1]) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +@pytest.mark.parametrize( + "alpha", [np.array([0.05, 0.1]), [0.05, 0.1], (0.05, 0.1)] +) +def test_results_for_alpha_as_float_and_arraylike( + strategy: str, alpha: Any +) -> None: + """Test that output values do not depend on type of alpha.""" + mapie_ts_reg = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie_ts_reg.fit(X, y) + y_pred_float1, y_pis_float1 = mapie_ts_reg.predict(X, alpha=alpha[0]) + y_pred_float2, y_pis_float2 = mapie_ts_reg.predict(X, alpha=alpha[1]) + y_pred_array, y_pis_array = mapie_ts_reg.predict(X, alpha=alpha) + np.testing.assert_allclose(y_pred_float1, y_pred_array) + np.testing.assert_allclose(y_pred_float2, y_pred_array) + np.testing.assert_allclose(y_pis_float1[:, :, 0], y_pis_array[:, :, 0]) + np.testing.assert_allclose(y_pis_float2[:, :, 0], y_pis_array[:, :, 1]) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +def test_results_for_ordered_alpha(strategy: str) -> None: + """ + Test that prediction intervals lower (upper) bounds give + consistent results for ordered alphas. + """ + mapie = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie.fit(X, y) + y_pred, y_pis = mapie.predict(X, alpha=[0.05, 0.1]) + assert (y_pis[:, 0, 0] <= y_pis[:, 0, 1]).all() + assert (y_pis[:, 1, 0] >= y_pis[:, 1, 1]).all() + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +def test_results_single_and_multi_jobs(strategy: str) -> None: + """ + Test that MapieTimeSeriesRegressor gives equal predictions + regardless of number of parallel jobs. 
+ """ + mapie_single = MapieTimeSeriesRegressor(n_jobs=1, **STRATEGIES[strategy]) + mapie_multi = MapieTimeSeriesRegressor(n_jobs=-1, **STRATEGIES[strategy]) + mapie_single.fit(X_toy, y_toy) + mapie_multi.fit(X_toy, y_toy) + y_pred_single, y_pis_single = mapie_single.predict(X_toy, alpha=0.2) + y_pred_multi, y_pis_multi = mapie_multi.predict(X_toy, alpha=0.2) + np.testing.assert_allclose(y_pred_single, y_pred_multi) + np.testing.assert_allclose(y_pis_single, y_pis_multi) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +def test_results_with_constant_sample_weights(strategy: str) -> None: + """ + Test predictions when sample weights are None + or constant with different values. + """ + n_samples = len(X) + mapie0 = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie1 = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie2 = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie0.fit(X, y, sample_weight=None) + mapie1.fit(X, y, sample_weight=np.ones(shape=n_samples)) + mapie2.fit(X, y, sample_weight=np.ones(shape=n_samples) * 5) + y_pred0, y_pis0 = mapie0.predict(X, alpha=0.05) + y_pred1, y_pis1 = mapie1.predict(X, alpha=0.05) + y_pred2, y_pis2 = mapie2.predict(X, alpha=0.05) + np.testing.assert_allclose(y_pred0, y_pred1) + np.testing.assert_allclose(y_pred1, y_pred2) + np.testing.assert_allclose(y_pis0, y_pis1) + np.testing.assert_allclose(y_pis1, y_pis2) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +def test_prediction_between_low_up(strategy: str) -> None: + """Test that prediction lies between low and up prediction intervals.""" + mapie = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie.fit(X, y) + y_pred, y_pis = mapie.predict(X, alpha=0.1) + assert (y_pred >= y_pis[:, 0, 0]).all() + assert (y_pred <= y_pis[:, 1, 0]).all() + -X_toy = np.array([[0], [1], [2], [3], [4], [5]]) -y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15]) +@pytest.mark.parametrize("method", ["plus", "minmax"]) +@pytest.mark.parametrize("cv", [-1, 2, 3, 5]) 
+@pytest.mark.parametrize("agg_function", ["mean", "median"]) +@pytest.mark.parametrize("alpha", [0.05, 0.1, 0.2]) +def test_prediction_agg_function( + method: str, cv: Union[LeaveOneOut, KFold], agg_function: str, alpha: int +) -> None: + """ + Test that predictions differ when ensemble is True/False, + but not prediction intervals. + """ + mapie = MapieTimeSeriesRegressor( + method=method, cv=cv, agg_function=agg_function + ) + mapie.fit(X, y) + y_pred_1, y_pis_1 = mapie.predict(X, ensemble=True, alpha=alpha) + y_pred_2, y_pis_2 = mapie.predict(X, ensemble=False, alpha=alpha) + np.testing.assert_allclose(y_pis_1[:, 0, 0], y_pis_2[:, 0, 0]) + np.testing.assert_allclose(y_pis_1[:, 1, 0], y_pis_2[:, 1, 0]) + with pytest.raises(AssertionError): + np.testing.assert_allclose(y_pred_1, y_pred_2) + + +@pytest.mark.parametrize("strategy", [*STRATEGIES]) +def test_linear_regression_results(strategy: str) -> None: + """ + Test expected prediction intervals for + a multivariate linear regression problem + with fixed random state. 
+ """ + mapie_ts = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) + mapie_ts.fit(X, y) + _, y_pis = mapie_ts.predict(X, alpha=0.05) + y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0] + width_mean = (y_pred_up - y_pred_low).mean() + coverage = regression_coverage_score(y, y_pred_low, y_pred_up) + np.testing.assert_allclose(width_mean, WIDTHS[strategy], rtol=1e-2) + np.testing.assert_allclose(coverage, COVERAGES[strategy], rtol=1e-2) + + +def test_results_prefit_ignore_method() -> None: + """Test that method is ignored when ``cv="prefit"``.""" + estimator = LinearRegression().fit(X, y) + all_y_pis: List[ArrayLike] = [] + for method in METHODS: + mapie_ts_reg = MapieTimeSeriesRegressor( + estimator=estimator, cv="prefit", method=method + ) + mapie_ts_reg.fit(X, y) + _, y_pis = mapie_ts_reg.predict(X, alpha=0.1) + all_y_pis.append(y_pis) + for y_pis1, y_pis2 in combinations(all_y_pis, 2): + np.testing.assert_allclose(y_pis1, y_pis2) + + +def test_results_prefit_naive() -> None: + """ + Test that prefit, fit and predict on the same dataset + is equivalent to the "naive" method. 
+ """ + estimator = LinearRegression().fit(X, y) + mapie_ts_reg = MapieTimeSeriesRegressor(estimator=estimator, cv="prefit") + mapie_ts_reg.fit(X, y) + _, y_pis = mapie_ts_reg.predict(X, alpha=0.05) + width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() + coverage = regression_coverage_score(y, y_pis[:, 0, 0], y_pis[:, 1, 0]) + np.testing.assert_allclose(width_mean, WIDTHS["naive"], rtol=1e-2) + np.testing.assert_allclose(coverage, COVERAGES["naive"], rtol=1e-2) + + +def test_results_prefit() -> None: + """Test prefit results on a standard train/validation/test split.""" + X_train_val, X_test, y_train_val, y_test = train_test_split( + X, y, test_size=1 / 10, random_state=1 + ) + X_train, X_val, y_train, y_val = train_test_split( + X_train_val, y_train_val, test_size=1 / 9, random_state=1 + ) + estimator = LinearRegression().fit(X_train, y_train) + mapie_ts_reg = MapieTimeSeriesRegressor(estimator=estimator, cv="prefit") + mapie_ts_reg.fit(X_val, y_val) + _, y_pis = mapie_ts_reg.predict(X_test, alpha=0.05) + width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() + coverage = regression_coverage_score( + y_test, y_pis[:, 0, 0], y_pis[:, 1, 0] + ) + np.testing.assert_allclose(width_mean, WIDTHS["prefit"], rtol=1e-2) + np.testing.assert_allclose(coverage, COVERAGES["prefit"], rtol=1e-2) + + +def test_not_enough_resamplings() -> None: + """Test that a warning is raised if at least one residual is nan.""" + with pytest.warns(UserWarning, match=r"WARNING: at least one point of*"): + mapie_ts_reg = MapieTimeSeriesRegressor( + cv=BlockBootstrap(n_resamplings=1, n_blocks=1), agg_function="mean" + ) + mapie_ts_reg.fit(X, y) + + +def test_no_agg_fx_specified_with_subsample() -> None: + """Test that a warning is raised if at least one residual is nan.""" + with pytest.raises( + ValueError, match=r"You need to specify an aggregation*" + ): + mapie_ts_reg = MapieTimeSeriesRegressor( + cv=BlockBootstrap(n_resamplings=1, n_blocks=1), + agg_function=None, + ) + mapie_ts_reg.fit(X, 
y) + + +def test_invalid_aggregate_all() -> None: + """ + Test that wrong aggregation in MAPIE raise errors. + """ + with pytest.raises( + ValueError, + match=r".*Aggregation function called but not defined.*", + ): + aggregate_all(None, X) + + +def test_aggregate_with_mask_with_prefit() -> None: + """ + Test ``aggregate_with_mask`` in case ``cv`` is ``"prefit"``. + """ + mapie_ts_reg = MapieTimeSeriesRegressor(cv="prefit") + with pytest.raises( + ValueError, + match=r".*There should not be aggregation of predictions if cv is*", + ): + mapie_ts_reg.aggregate_with_mask(k, k) + + mapie_ts_reg = MapieTimeSeriesRegressor(agg_function="nonsense") + with pytest.raises( + ValueError, + match=r".*The value of self.agg_function is not correct*", + ): + mapie_ts_reg.aggregate_with_mask(k, k) + + +def test_pred_loof_isnan() -> None: + """Test that if validation set is empty then prediction is empty.""" + mapie_ts_reg = MapieTimeSeriesRegressor() + _, y_pred, _, _ = mapie_ts_reg._fit_and_predict_oof_model( + estimator=mapie_ts_reg(), + X=X_toy, + y=y_toy, + train_index=[0, 1, 2, 3, 4], + val_index=[], + k=0, + ) + assert len(y_pred) == 0 def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index d4343c970..e6b42ab5a 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -1,10 +1,9 @@ from __future__ import annotations -from argparse import ArgumentDefaultsHelpFormatter +from configparser import Interpolation from typing import Iterable, Optional, Tuple, Union, cast import numpy as np -import numpy.ma as ma from sklearn.base import RegressorMixin from sklearn.model_selection import BaseCrossValidator from sklearn.utils import check_array @@ -41,7 +40,7 @@ class MapieTimeSeriesRegressor(MapieRegressor): """ cv_need_agg_function = [BlockBootstrap, Subsample] - valid_methods_ = ["plus"] + valid_methods_ = ["naive", "base", "plus", "minmax"] 
valid_agg_functions_ = [None, "median", "mean"] fit_attributes = [ "single_estimator_", @@ -77,7 +76,7 @@ def fit( """ self = super().fit(X=X, y=y, sample_weight=sample_weight) y_pred = super().predict(X=X) - self.residuals_ = y.values - y_pred + self.residuals_ = y - y_pred return self def partial_fit( @@ -141,11 +140,10 @@ def predict( else: alpha_ = cast(ArrayLike, alpha_) check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) - betas_0 = np.full_like(alpha_, np.nan, dtype=float) for ind, _alpha in enumerate(alpha_): - betas = np.linspace(0.0, _alpha, num=len(self.residuals_) + 2) + betas = np.linspace(0.0, _alpha, num=len(self.residuals_)+2) one_alpha_beta = np.quantile( self.residuals_, @@ -161,7 +159,6 @@ def predict( interpolation="lower", ) betas_0[ind] = betas[np.argmin(one_alpha_beta - beta, axis=0)] - lower_quantiles = np.quantile( self.residuals_, betas_0, @@ -170,7 +167,7 @@ def predict( ) higher_quantiles = np.quantile( self.residuals_, - 1 - _alpha + betas_0, + 1 - alpha_ + betas_0, axis=0, interpolation="higher", ) @@ -230,202 +227,3 @@ def predict( if ensemble: y_pred = aggregate_all(self.agg_function, y_pred_multi) return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) - - def predict2( - self, - X: ArrayLike, - ensemble: bool = False, - alpha: Optional[Union[float, Iterable[float]]] = None, - ) -> Union[ArrayLike, Tuple[ArrayLike, ArrayLike]]: - # Checks - check_is_fitted(self, self.fit_attributes) - self._check_ensemble(ensemble) - alpha_ = check_alpha(alpha) - X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) - y_pred = self.single_estimator_.predict(X) - - if alpha is None: - return np.array(y_pred) - else: - alpha_ = cast(ArrayLike, alpha_) - check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) - if self.method in ["naive", "base"] or self.cv == "prefit": - quantile = np.quantile( - self.residuals_, 1 - alpha_, interpolation="higher" - ) - y_pred_low = y_pred[:, np.newaxis] - quantile - y_pred_up = 
y_pred[:, np.newaxis] + quantile - else: - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size thanks to - # the shape of k_. - - y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) - - if self.method == "plus": - lower_bounds = y_pred_multi + self.residuals_ - upper_bounds = y_pred_multi + self.residuals_ - if self.method == "minmax": - lower_bounds = np.min(y_pred_multi, axis=1, keepdims=True) - upper_bounds = np.max(y_pred_multi, axis=1, keepdims=True) - lower_bounds = lower_bounds + self.residuals_ - upper_bounds = upper_bounds + self.residuals_ - - y_pred_low = np.column_stack( - [ - np.quantile( - ma.masked_invalid(lower_bounds), - _alpha, - axis=1, - interpolation="lower", - ) - for _alpha in alpha_ - ] - ) - - y_pred_up = np.column_stack( - [ - np.quantile( - ma.masked_invalid(upper_bounds), - 1 - _alpha, - axis=1, - interpolation="higher", - ) - for _alpha in alpha_ - ] - ) - if ensemble: - y_pred = aggregate_all(self.agg_function, y_pred_multi) - return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) - - def predict3( - self, - X: ArrayLike, - ensemble: bool = False, - alpha: Optional[Union[float, Iterable[float]]] = None, - ) -> Union[ArrayLike, Tuple[ArrayLike, ArrayLike]]: - # Checks - check_is_fitted(self, self.fit_attributes) - self._check_ensemble(ensemble) - alpha_ = check_alpha(alpha) - X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) - y_pred = self.single_estimator_.predict(X) - - if alpha is None: - return np.array(y_pred) - else: - alpha_ = cast(ArrayLike, alpha_) - check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) - - y_pred_low = [] - y_pred_up = [] - - for _alpha in alpha_: - betas = np.linspace(0.0, _alpha, num=len(self.residuals_) + 2) - - if self.method in ["naive", "base"] or self.cv == "prefit": - one_alpha_beta = 
np.quantile( - self.residuals_, - 1 - _alpha + betas, - axis=0, - interpolation="higher", - ) - beta = np.quantile( - self.residuals_, - betas, - axis=0, - interpolation="lower", - ) - - beta_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] - - lower_quantiles = np.quantile( - self.residuals_, - beta_0, - axis=0, - interpolation="lower", - ) - higher_quantiles = np.quantile( - self.residuals_, - 1 - _alpha + beta_0, - axis=0, - interpolation="higher", - ) - y_pred_low.append(y_pred[:, np.newaxis] + lower_quantiles) - y_pred_up.append(y_pred[:, np.newaxis] + higher_quantiles) - else: - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size thanks to - # the shape of k_. - - y_pred_multi = self.aggregate_with_mask( - y_pred_multi, self.k_ - ) - - if self.method == "plus": - lower_bounds = y_pred_multi + self.residuals_ - upper_bounds = y_pred_multi + self.residuals_ - - if self.method == "minmax": - lower_bounds = np.min( - y_pred_multi, axis=1, keepdims=True - ) - upper_bounds = np.max( - y_pred_multi, axis=1, keepdims=True - ) - lower_bounds = lower_bounds + self.residuals_ - upper_bounds = upper_bounds + self.residuals_ - - one_alpha_beta = np.quantile( - upper_bounds, - 1 - _alpha + betas, - axis=1, - interpolation="higher", - ) - - beta = np.quantile( - lower_bounds, - betas, - axis=1, - interpolation="lower", - ) - - betas_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] - - lower_quantiles = np.empty((len(betas_0),)) - upper_quantiles = np.empty((len(betas_0),)) - - for ind, beta_0 in enumerate(betas_0): - lower_quantiles[ind] = np.quantile( - lower_bounds[ind, :], - beta_0, - axis=0, - interpolation="lower", - ) - upper_quantiles[ind] = np.quantile( - upper_bounds[ind, :], - 1 - _alpha + beta_0, - axis=0, - interpolation="higher", - ) - y_pred_low.append(lower_quantiles) - 
y_pred_up.append(upper_quantiles) - - y_pred_low = np.column_stack(y_pred_low) - y_pred_up = np.column_stack(y_pred_up) - - print(y_pred_low.shape) - print(y_pred_up.shape) - if ensemble: - y_pred = aggregate_all(self.agg_function, y_pred_multi) - return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) From 8749fcb5927a2b7deaa7886c86250fbfdffe8262 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 21 Mar 2022 19:06:05 +0100 Subject: [PATCH 08/32] unit test OK, example not OK --- examples/regression/plot_timeseries_enbpi.py | 73 +++++++++++--------- mapie/regression.py | 6 +- mapie/subsample.py | 20 +++--- mapie/tests/test_subsample.py | 63 +++++++++++++++-- mapie/tests/test_time_series_regression.py | 41 ++++++----- mapie/time_series_regression.py | 13 +--- 6 files changed, 140 insertions(+), 76 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index f5b3a1125..1b9879c89 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -13,14 +13,14 @@ The electricity demand features daily and weekly seasonalities and is impacted by the temperature, considered here as a exogeneous variable. -The data is modelled by a Random Forest model with a -:class:`sklearn.model_selection.RandomizedSearchCV` using a sequential +A Random Forest model is fitted on data. The hyper-parameters are optimized +with a :class:`sklearn.model_selection.RandomizedSearchCV` using a sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation, in which the training set is prior to the validation set. The best model is then feeded into :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the associated prediction intervals. We compare two approaches: one with no -`partial_fit` call and one with `partial_fit` every 5 steps. +`partial_fit` call and one with `partial_fit` every step. 
""" import warnings @@ -30,7 +30,7 @@ from sklearn.ensemble import RandomForestRegressor from mapie.metrics import regression_coverage_score -from mapie.subsample import Subsample +from mapie.subsample import BlockBootstrap from mapie.time_series_regression import MapieTimeSeriesRegressor warnings.simplefilter("ignore") @@ -39,20 +39,21 @@ demand_df = pd.read_csv( "../data/demand_temperature.csv", parse_dates=True, index_col=0 ) + +print(demand_df.shape) demand_df["Date"] = pd.to_datetime(demand_df.index) demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") demand_df["Hour"] = demand_df.index.hour -for hour in range(1, 5): +for hour in range(1, 3): demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) # Train/validation/test split -num_test_steps = 24 * 7 * 2 +num_test_steps = 24 * 7 demand_train = demand_df.iloc[:-num_test_steps, :].copy() demand_test = demand_df.iloc[-num_test_steps:, :].copy() -features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [ - f"Lag_{hour}" for hour in range(1, 5) -] +features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + X_train = demand_train.loc[ ~np.any(demand_train[features].isnull(), axis=1), features ] @@ -60,14 +61,18 @@ X_test = demand_test.loc[:, features] y_test = demand_test["Demand"] -# Model -model = RandomForestRegressor(max_depth=15, n_estimators=673, random_state=59) +# Model: Random Forest previously optimized with a cross-validation +model = RandomForestRegressor(max_depth=10, n_estimators=50, random_state=59) # Estimate prediction intervals on test set with best estimator alpha = 0.1 -cv_Mapie = Subsample(30, random_state=59) +cv_MapieTimeSeries = BlockBootstrap(200, length=24, random_state=59) mapie = MapieTimeSeriesRegressor( - model, method="plus", cv=cv_Mapie, agg_function="median", n_jobs=-1 + model, + method="plus", + cv=cv_MapieTimeSeries, + agg_function="mean", + n_jobs=-1 ) 
mapie.fit(X_train, y_train) @@ -76,28 +81,30 @@ coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]) width = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() -# With partial_fit every five hours -y_pred_5_steps, y_pis_5_steps = mapie.predict(X_test.iloc[:5, :], alpha=alpha) +# With partial_fit every 2 hours +gap = 1 -for step in range(5, len(X_test), 5): +y_pred_steps, y_pis_steps = mapie.predict(X_test.iloc[:gap, :], alpha=alpha) + +for step in range(gap, len(X_test), gap): mapie.partial_fit( - X_test.iloc[(step - 5): step, :], y_test.iloc[(step - 5):step] + X_test.iloc[(step - gap):step, :], y_test.iloc[(step - gap):step] ) - y_pred_step, y_pis_step = mapie.predict( - X_test.iloc[step: (step + 5), :], alpha=alpha + y_pred_gap_step, y_pis_gap_step = mapie.predict( + X_test.iloc[step:(step+gap), :], + alpha=alpha, ) - y_pred_5_steps = np.concatenate((y_pred_5_steps, y_pred_step), axis=0) - y_pis_5_steps = np.concatenate((y_pis_5_steps, y_pis_step), axis=0) + y_pred_steps = np.concatenate((y_pred_steps, y_pred_gap_step), axis=0) + y_pis_steps = np.concatenate((y_pis_steps, y_pis_gap_step), axis=0) -coverage_5_step = regression_coverage_score( - y_test, y_pis_5_steps[:, 0, 0], y_pis_5_steps[:, 1, 0] +coverage_steps = regression_coverage_score( + y_test, y_pis_steps[:, 0, 0], y_pis_steps[:, 1, 0] ) -width_5_step = (y_pis_5_steps[:, 1, 0] - y_pis_5_steps[:, 0, 0]).mean() - +width_steps = (y_pis_steps[:, 1, 0] - y_pis_steps[:, 0, 0]).mean() # Print results print( - "Coverage and prediction interval width mean for MapieTimeSeriesRegressor:" + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " "\nWithout any partial_fit:" f"{coverage:.3f}, {width:.3f}" ) @@ -117,12 +124,13 @@ label="MapieTimeSeriesRegressor PIs", ) ax.legend() +plt.title("Without partial_fit") plt.show() print( - "Coverage and prediction interval width mean for MapieTimeSeriesRegressor:" - "\nWith partial_fit every 5 steps:" - f"{coverage_5_step:.3f}, 
{width_5_step:.3f}" + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor " + "\nWith partial_fit every step: " + f"{coverage_steps:.3f}, {width_steps:.3f}" ) # Plot estimated prediction intervals on test set @@ -130,14 +138,15 @@ ax = fig.add_subplot(1, 1, 1) ax.set_ylabel("Hourly demand (GW)") ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") -ax.plot(demand_test.index, y_pred_5_steps, lw=2, c="C2", label="Predictions") +ax.plot(demand_test.index, y_pred_steps, lw=2, c="C2", label="Predictions") ax.fill_between( demand_test.index, - y_pis_5_steps[:, 0, 0], - y_pis_5_steps[:, 1, 0], + y_pis_steps[:, 0, 0], + y_pis_steps[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) ax.legend() +plt.title("With partial_fit") plt.show() diff --git a/mapie/regression.py b/mapie/regression.py index 2bc0e7270..661c858f6 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -187,6 +187,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): # type: ignore >>> print(y_pred) [ 5.28571429 7.17142857 9.05714286 10.94285714 12.82857143 14.71428571] """ + cv_need_agg_function = [Subsample] valid_methods_ = ["naive", "base", "plus", "minmax"] valid_agg_functions_ = [None, "median", "mean"] @@ -262,8 +263,9 @@ def _check_agg_function( "Allowed values are None, 'mean', 'median'." ) - if ((agg_function is None) and - (type(self.cv) in self.cv_need_agg_function)): + if (agg_function is None) and ( + type(self.cv) in self.cv_need_agg_function + ): raise ValueError( "You need to specify an aggregation function when " f"cv's type is in {self.cv_need_agg_function}." diff --git a/mapie/subsample.py b/mapie/subsample.py index de9b97933..e52778769 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -136,8 +136,8 @@ class BlockBootstrap(BaseCrossValidator): # type: ignore >>> X = np.array([1,2,3,4,5,6,7,8,9,10]) >>> for train_index, test_index in cv.split(X): ... 
print(f"train index is {train_index}, test index is {test_index}") - train index is [5 0 3 3 7 9 3 5 2 4], test index is [8 1 6] - train index is [7 6 8 8 1 6 7 7 8 1], test index is [0 2 3 4 5 9] + train index is [1 2 3 4 5 6 1 2 3 4 5 6], test index is [8 9 7] + train index is [4 5 6 7 8 9 1 2 3 7 8 9], test index is [] """ def __init__( @@ -148,11 +148,6 @@ def __init__( overlapping: bool = False, random_state: Optional[Union[int, RandomState]] = None, ) -> None: - if length is None and n_blocks is None: - raise ValueError( - "At least one argument in ['length', 'n_blocks]" - "has to be not None." - ) self.n_resamplings = n_resamplings self.length = length self.n_blocks = n_blocks @@ -181,6 +176,12 @@ def split( ValueError If ``length`` is greater than the train set size. """ + if (self.length is None) and (self.n_blocks is None): + raise ValueError( + "At least one argument in ['length', 'n_blocks]" + "has to be not None." + ) + length = ( self.length if self.length is not None else len(X) // self.n_blocks ) @@ -190,9 +191,9 @@ def split( else (len(X) // length) + 1 ) indices = np.arange(len(X)) - if length > len(indices): + if (length <= 0) or (length > len(indices)): raise ValueError( - "The length of blocks is greater than the lenght" + "The length of blocks is <= 0 or greater than the lenght" "of training set." 
) @@ -202,7 +203,6 @@ def split( indices = indices[len(indices) % length:] blocks_number = len(indices) // length blocks = np.array_split(indices, indices_or_sections=blocks_number) - random_state = check_random_state(self.random_state) for k in range(self.n_resamplings): diff --git a/mapie/tests/test_subsample.py b/mapie/tests/test_subsample.py index 39dd2ab74..533a50b7f 100644 --- a/mapie/tests/test_subsample.py +++ b/mapie/tests/test_subsample.py @@ -1,11 +1,12 @@ from __future__ import annotations import numpy as np +import pytest -from mapie.subsample import Subsample +from mapie.subsample import BlockBootstrap, Subsample -def test_default_parameters() -> None: +def test_default_parameters_SubSample() -> None: """Test default values of Subsample.""" cv = Subsample() assert cv.n_resamplings == 30 @@ -13,13 +14,13 @@ def test_default_parameters() -> None: assert cv.random_state is None -def test_get_n_splits() -> None: +def test_get_n_splits_SubSample() -> None: """Test get_n_splits method of Subsample.""" cv = Subsample(n_resamplings=3) assert cv.get_n_splits() == 3 -def test_split() -> None: +def test_split_SubSample() -> None: """Test outputs of subsamplings.""" X = np.array([0, 1, 2, 3]) cv = Subsample(n_resamplings=2, random_state=1) @@ -29,3 +30,57 @@ def test_split() -> None: tests_expected = np.array([2, 0, 2]) np.testing.assert_equal(trains, trains_expected) np.testing.assert_equal(tests, tests_expected) + + +def test_default_parameters_BlockBootstrap() -> None: + """Test default values of Subsample.""" + cv = BlockBootstrap() + assert cv.n_resamplings == 30 + assert cv.length is None + assert cv.n_blocks is None + assert not cv.overlapping + assert cv.random_state is None + + +def test_get_n_splits_BlockBootstrap() -> None: + """Test get_n_splits method of Subsample.""" + cv = BlockBootstrap(n_resamplings=3) + assert cv.get_n_splits() == 3 + + +def test_split_BlockBootstrap() -> None: + """Test outputs of subsamplings.""" + X = np.array([0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10]) + cv = BlockBootstrap( + n_resamplings=1, length=2, overlapping=False, random_state=1 + ) + trains = np.concatenate([x[0] for x in cv.split(X)]) + tests = np.concatenate([x[1] for x in cv.split(X)]) + trains_expected = np.array([7, 8, 9, 10, 1, 2, 3, 4, 7, 8, 1, 2]) + tests_expected = np.array([5, 6]) + np.testing.assert_equal(trains, trains_expected) + np.testing.assert_equal(tests, tests_expected) + + X = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + cv = BlockBootstrap( + n_resamplings=1, length=2, overlapping=True, random_state=1 + ) + trains = np.concatenate([x[0] for x in cv.split(X)]) + tests = np.concatenate([x[1] for x in cv.split(X)]) + trains_expected = np.array([5, 6, 8, 9, 9, 10, 5, 6, 0, 1, 0, 1]) + tests_expected = np.array([2, 3, 4, 7]) + np.testing.assert_equal(trains, trains_expected) + np.testing.assert_equal(tests, tests_expected) + + +def test_split_BlockBootstrap_error() -> None: + """Test outputs of subsamplings.""" + X = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + cv = BlockBootstrap() + print(cv.length) + print(cv.n_blocks) + with pytest.raises(ValueError, match=r".*At least one argument*"): + next(cv.split(X)) + cv = BlockBootstrap(length=20) + with pytest.raises(ValueError, match=r".*The length of blocks is <= 0 *"): + next(cv.split(X)) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 65cd59bcb..8a6b17763 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -16,8 +16,8 @@ from mapie.time_series_regression import MapieTimeSeriesRegressor from mapie.subsample import BlockBootstrap -X_toy = np.array(range(50)).reshape(-1, 1) -y_toy = (5.0 + 2.0*X_toy).flatten() +X_toy = np.array(range(5)).reshape(-1, 1) +y_toy = (5.0 + 2.0 * X_toy ** 1.1).flatten() X, y = make_regression(n_samples=500, n_features=10, noise=1.0, random_state=1) k = np.ones(shape=(5, X.shape[1])) METHODS = ["naive", "base", "plus", "minmax"] 
@@ -79,7 +79,7 @@ "cv": 3.76, "cv_plus": 3.76, "cv_minmax": 3.95, - "prefit": 4.81, + "prefit": 3.89, "cv_plus_median": 3.90, "jackknife_plus_ab": 3.76, "jackknife_minmax_ab": 3.96, @@ -93,12 +93,12 @@ "jackknife_minmax": 0.952, "cv": 0.958, "cv_plus": 0.956, - "cv_minmax": 0.966, - "prefit": 0.980, + "cv_minmax": 0.956, + "prefit": 0.90, "cv_plus_median": 0.954, "jackknife_plus_ab": 0.952, - "jackknife_minmax_ab": 0.970, - "jackknife_plus_median_ab": 0.960, + "jackknife_minmax_ab": 0.960, + "jackknife_plus_median_ab": 0.946, } @@ -365,8 +365,8 @@ def test_aggregate_with_mask_with_prefit() -> None: def test_pred_loof_isnan() -> None: """Test that if validation set is empty then prediction is empty.""" mapie_ts_reg = MapieTimeSeriesRegressor() - _, y_pred, _, _ = mapie_ts_reg._fit_and_predict_oof_model( - estimator=mapie_ts_reg(), + _, y_pred, _ = mapie_ts_reg._fit_and_predict_oof_model( + estimator=mapie_ts_reg, X=X_toy, y=y_toy, train_index=[0, 1, 2, 3, 4], @@ -376,25 +376,32 @@ def test_pred_loof_isnan() -> None: assert len(y_pred) == 0 +def test_MapieTimeSeriesRegressor_alpha_is_None() -> None: + """Test ``predict`` when ``alpha`` is None.""" + mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) + pred = mapie_ts_reg.predict(X_toy, alpha=None) + assert pred.shape == (len(pred), ) + + def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: """Test ``partial_update`` when ``ensemble`` is True.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) - assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) + assert round(mapie_ts_reg.residuals_[-1], 2) == round( + np.abs(14.189 - 14.049), 2 + ) mapie_ts_reg = mapie_ts_reg.partial_fit( X=np.array([[6]]), y=np.array([17.5]), ensemble=True ) - assert round(mapie_ts_reg.residuals_[-1], 2) == round( - np.abs(17.5 - 16.56), 2 - ) + assert round(mapie_ts_reg.residuals_[-1], 2) == round(17.5 - 18.665, 2) def test_MapieTimeSeriesRegressor_partial_fit_ensemble_F() -> 
None: """Test ``partial_update`` when ``ensemble`` is False.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) - assert round(mapie_ts_reg.residuals_[-1], 2) == round(np.abs(15 - 14.4), 2) + assert round(mapie_ts_reg.residuals_[-1], 2) == round( + np.abs(14.189 - 14.049), 2 + ) mapie_ts_reg = mapie_ts_reg.partial_fit( X=np.array([[6]]), y=np.array([17.5]), ensemble=False ) - assert round(mapie_ts_reg.residuals_[-1], 2) == round( - np.abs(17.5 - 16.6), 2 - ) + assert round(mapie_ts_reg.residuals_[-1], 2) == round(17.5 - 18.66504, 2) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index e6b42ab5a..351a1282e 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -1,5 +1,4 @@ from __future__ import annotations -from configparser import Interpolation from typing import Iterable, Optional, Tuple, Union, cast @@ -42,14 +41,6 @@ class MapieTimeSeriesRegressor(MapieRegressor): cv_need_agg_function = [BlockBootstrap, Subsample] valid_methods_ = ["naive", "base", "plus", "minmax"] valid_agg_functions_ = [None, "median", "mean"] - fit_attributes = [ - "single_estimator_", - "estimators_", - "k_", - "residuals_", - "n_features_in_", - "n_samples_val_", - ] def __init__( self, @@ -75,7 +66,7 @@ def fit( The model itself. 
""" self = super().fit(X=X, y=y, sample_weight=sample_weight) - y_pred = super().predict(X=X) + y_pred = super().predict(X) self.residuals_ = y - y_pred return self @@ -143,7 +134,7 @@ def predict( betas_0 = np.full_like(alpha_, np.nan, dtype=float) for ind, _alpha in enumerate(alpha_): - betas = np.linspace(0.0, _alpha, num=len(self.residuals_)+2) + betas = np.linspace(0.0, _alpha, num=len(self.residuals_) + 2) one_alpha_beta = np.quantile( self.residuals_, From ff1246055c0967b07f6c4b1d2bda22c8630a4b41 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Tue, 22 Mar 2022 11:00:21 +0100 Subject: [PATCH 09/32] after tyoe checking --- examples/regression/plot_timeseries_enbpi.py | 19 ++++++------- mapie/subsample.py | 29 ++++++++++---------- mapie/time_series_regression.py | 7 ++--- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 1b9879c89..96acc2c51 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -40,7 +40,6 @@ "../data/demand_temperature.csv", parse_dates=True, index_col=0 ) -print(demand_df.shape) demand_df["Date"] = pd.to_datetime(demand_df.index) demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") @@ -52,7 +51,9 @@ num_test_steps = 24 * 7 demand_train = demand_df.iloc[:-num_test_steps, :].copy() demand_test = demand_df.iloc[-num_test_steps:, :].copy() -features = ["Weekofyear", "Weekday", "Hour", "Temperature"] +features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [ + f"Lag_{hour}" for hour in range(1, 2) +] X_train = demand_train.loc[ ~np.any(demand_train[features].isnull(), axis=1), features @@ -62,17 +63,13 @@ y_test = demand_test["Demand"] # Model: Random Forest previously optimized with a cross-validation -model = RandomForestRegressor(max_depth=10, n_estimators=50, 
random_state=59) +model = RandomForestRegressor(max_depth=17, n_estimators=150, random_state=59) # Estimate prediction intervals on test set with best estimator alpha = 0.1 -cv_MapieTimeSeries = BlockBootstrap(200, length=24, random_state=59) +cv_MapieTimeSeries = BlockBootstrap(100, length=48, random_state=59) mapie = MapieTimeSeriesRegressor( - model, - method="plus", - cv=cv_MapieTimeSeries, - agg_function="mean", - n_jobs=-1 + model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 ) mapie.fit(X_train, y_train) @@ -81,7 +78,7 @@ coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]) width = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() -# With partial_fit every 2 hours +# With partial_fit every hour gap = 1 y_pred_steps, y_pis_steps = mapie.predict(X_test.iloc[:gap, :], alpha=alpha) @@ -91,7 +88,7 @@ X_test.iloc[(step - gap):step, :], y_test.iloc[(step - gap):step] ) y_pred_gap_step, y_pis_gap_step = mapie.predict( - X_test.iloc[step:(step+gap), :], + X_test.iloc[step:(step + gap), :], alpha=alpha, ) y_pred_steps = np.concatenate((y_pred_steps, y_pred_gap_step), axis=0) diff --git a/mapie/subsample.py b/mapie/subsample.py index e52778769..f02827cd2 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -174,22 +174,23 @@ def split( Raises ------ ValueError - If ``length`` is greater than the train set size. + If ``length`` is not positive or greater than the train set size. """ - if (self.length is None) and (self.n_blocks is None): + if self.n_blocks is not None: + length = ( + self.length + if self.length is not None + else len(X) // self.n_blocks + ) + n_blocks = self.n_blocks + elif self.length is not None: + length = self.length + n_blocks = (len(X) // self.length) + 1 + else: raise ValueError( - "At least one argument in ['length', 'n_blocks]" - "has to be not None." 
+ "At least one argument between ``length`` and " + "``n_blocks`` has to be not None" ) - - length = ( - self.length if self.length is not None else len(X) // self.n_blocks - ) - n_blocks = ( - self.n_blocks - if self.n_blocks is not None - else (len(X) // length) + 1 - ) indices = np.arange(len(X)) if (length <= 0) or (length > len(indices)): raise ValueError( @@ -200,7 +201,7 @@ def split( if self.overlapping: blocks = sliding_window_view(indices, window_shape=length) else: - indices = indices[len(indices) % length:] + indices = indices[(len(indices) % length):] blocks_number = len(indices) // length blocks = np.array_split(indices, indices_or_sections=blocks_number) random_state = check_random_state(self.random_state) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 351a1282e..369adc081 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -10,7 +10,7 @@ from .aggregation_functions import aggregate_all from .regression import MapieRegressor -from .subsample import Subsample, BlockBootstrap +from .subsample import BlockBootstrap from ._typing import ArrayLike from .utils import ( check_alpha, @@ -38,10 +38,6 @@ class MapieTimeSeriesRegressor(MapieRegressor): "Conformal prediction for dynamic time-series." 
""" - cv_need_agg_function = [BlockBootstrap, Subsample] - valid_methods_ = ["naive", "base", "plus", "minmax"] - valid_agg_functions_ = [None, "median", "mean"] - def __init__( self, estimator: Optional[RegressorMixin] = None, @@ -52,6 +48,7 @@ def __init__( verbose: int = 0, ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) + self.cv_need_agg_function.append(BlockBootstrap) def fit( self, From 85c936f8fc5a97b04c353d8764e173e3c933e133 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Tue, 22 Mar 2022 11:12:42 +0100 Subject: [PATCH 10/32] correct typing --- mapie/regression.py | 5 ++--- mapie/time_series_regression.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mapie/regression.py b/mapie/regression.py index 661c858f6..871fe1de9 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -19,7 +19,6 @@ from ._typing import ArrayLike from .aggregation_functions import aggregate_all, phi2D -from .subsample import Subsample from .utils import ( check_cv, check_alpha, @@ -188,7 +187,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): # type: ignore [ 5.28571429 7.17142857 9.05714286 10.94285714 12.82857143 14.71428571] """ - cv_need_agg_function = [Subsample] + cv_need_agg_function = ["Subsample"] valid_methods_ = ["naive", "base", "plus", "minmax"] valid_agg_functions_ = [None, "median", "mean"] fit_attributes = [ @@ -264,7 +263,7 @@ def _check_agg_function( ) if (agg_function is None) and ( - type(self.cv) in self.cv_need_agg_function + type(self.cv).__name__ in self.cv_need_agg_function ): raise ValueError( "You need to specify an aggregation function when " diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 369adc081..d9c5becb7 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -10,7 +10,6 @@ from .aggregation_functions import aggregate_all from .regression import MapieRegressor -from .subsample import BlockBootstrap from 
._typing import ArrayLike from .utils import ( check_alpha, @@ -48,7 +47,7 @@ def __init__( verbose: int = 0, ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) - self.cv_need_agg_function.append(BlockBootstrap) + self.cv_need_agg_function.append("BlockBootstrap") def fit( self, From 07578cfaadd95ce1ce1a4e3b6c63ead56226b00a Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 28 Mar 2022 18:45:49 +0200 Subject: [PATCH 11/32] ALL TESTS OK look at docstrinf --- examples/regression/plot_timeseries_enbpi.py | 6 +- mapie/subsample.py | 26 +++--- mapie/tests/test_regression.py | 8 +- mapie/tests/test_time_series_regression.py | 44 ++++------ mapie/time_series_regression.py | 87 ++++++++++---------- mapie/utils.py | 9 +- 6 files changed, 85 insertions(+), 95 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 96acc2c51..f0e51b8be 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -67,7 +67,7 @@ # Estimate prediction intervals on test set with best estimator alpha = 0.1 -cv_MapieTimeSeries = BlockBootstrap(100, length=48, random_state=59) +cv_MapieTimeSeries = BlockBootstrap(50, length=48, random_state=59) mapie = MapieTimeSeriesRegressor( model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 ) @@ -85,10 +85,10 @@ for step in range(gap, len(X_test), gap): mapie.partial_fit( - X_test.iloc[(step - gap):step, :], y_test.iloc[(step - gap):step] + X_test.iloc[(step - gap) : step, :], y_test.iloc[(step - gap) : step] ) y_pred_gap_step, y_pis_gap_step = mapie.predict( - X_test.iloc[step:(step + gap), :], + X_test.iloc[step : (step + gap), :], alpha=alpha, ) y_pred_steps = np.concatenate((y_pred_steps, y_pred_gap_step), axis=0) diff --git a/mapie/subsample.py b/mapie/subsample.py index b53047403..52d5354d8 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -9,7 +9,7 @@ from 
sklearn.utils import check_random_state, resample from sklearn.utils.validation import _num_samples -from ._typing import ArrayLike, NDArray +from ._typing import NDArray class Subsample(BaseCrossValidator): @@ -56,15 +56,14 @@ def __init__( self.random_state = random_state def split( - self, - X: ArrayLike + self, X: NDArray ) -> Generator[Tuple[NDArray, NDArray], None, None]: """ Generate indices to split data into training and test sets. Parameters ---------- - X : ArrayLike of shape (n_samples, n_features) + X : NDArray of shape (n_samples, n_features) Training data. Yields @@ -157,21 +156,21 @@ def __init__( self.random_state = random_state def split( - self, X: ArrayLike - ) -> Generator[Tuple[Any, ArrayLike], None, None]: + self, X: NDArray + ) -> Generator[Tuple[NDArray, NDArray], None, None]: """ Generate indices to split data into training and test sets. Parameters ---------- - X : ArrayLike of shape (n_samples, n_features) + X : NDArray of shape (n_samples, n_features) Training data. Yields ------ - train : ArrayLike of shape (n_indices_training,) + train : NDArray of shape (n_indices_training,) The training set indices for that split. - test : ArrayLike of shape (n_indices_test,) + test : NDArray of shape (n_indices_test,) The testing set indices for that split. 
Raises ------ @@ -190,7 +189,7 @@ def split( n_blocks = (len(X) // self.length) + 1 else: raise ValueError( - "At least one argument between ``length`` and " + "At least one argument between ``length`` or " "``n_blocks`` has to be not None" ) indices = np.arange(len(X)) @@ -203,9 +202,12 @@ def split( if self.overlapping: blocks = sliding_window_view(indices, window_shape=length) else: - indices = indices[(len(indices) % length):] + indices = indices[(len(indices) % length) :] blocks_number = len(indices) // length - blocks = np.array_split(indices, indices_or_sections=blocks_number) + blocks = np.asarray( + np.array_split(indices, indices_or_sections=blocks_number) + ) + random_state = check_random_state(self.random_state) for k in range(self.n_resamplings): diff --git a/mapie/tests/test_regression.py b/mapie/tests/test_regression.py index b68c5cf43..e96198223 100644 --- a/mapie/tests/test_regression.py +++ b/mapie/tests/test_regression.py @@ -450,7 +450,7 @@ def test_pipeline_compatibility() -> None: { "x_cat": ["A", "A", "B", "A", "A", "B"], "x_num": [0, 1, 1, 4, np.nan, 5], - "y": [5, 7, 3, 9, 10, 8] + "y": [5, 7, 3, 9, 10, 8], } ) y = pd.Series([5, 7, 3, 9, 10, 8]) @@ -460,14 +460,12 @@ def test_pipeline_compatibility() -> None: ] ) categorical_preprocessor = Pipeline( - steps=[ - ("encoding", OneHotEncoder(handle_unknown="ignore")) - ] + steps=[("encoding", OneHotEncoder(handle_unknown="ignore"))] ) preprocessor = ColumnTransformer( [ ("cat", categorical_preprocessor, ["x_cat"]), - ("num", numeric_preprocessor, ["x_num"]) + ("num", numeric_preprocessor, ["x_num"]), ] ) pipe = make_pipeline(preprocessor, LinearRegression()) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 8a6b17763..c8c5c79b4 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -5,12 +5,13 @@ import numpy as np import pytest +from sklearn import ensemble from sklearn.datasets import 
make_regression from sklearn.linear_model import LinearRegression from sklearn.model_selection import KFold, LeaveOneOut, train_test_split from typing_extensions import TypedDict -from mapie._typing import ArrayLike +from mapie._typing import NDArray from mapie.aggregation_functions import aggregate_all from mapie.metrics import regression_coverage_score from mapie.time_series_regression import MapieTimeSeriesRegressor @@ -109,14 +110,13 @@ def test_invalid_agg_function(agg_function: Any) -> None: mapie_ts_reg = MapieTimeSeriesRegressor(agg_function=None) with pytest.raises(ValueError, match=r".*If ensemble is True*"): mapie_ts_reg.fit(X_toy, y_toy) - mapie_ts_reg.predict(X_toy, ensemble=True) @pytest.mark.parametrize("strategy", [*STRATEGIES]) @pytest.mark.parametrize("dataset", [(X, y), (X_toy, y_toy)]) @pytest.mark.parametrize("alpha", [0.2, [0.2, 0.4], (0.2, 0.4)]) def test_predict_output_shape( - strategy: str, alpha: Any, dataset: Tuple[ArrayLike, ArrayLike] + strategy: str, alpha: Any, dataset: Tuple[NDArray, NDArray] ) -> None: """Test predict output shape.""" mapie_ts_reg = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) @@ -169,8 +169,10 @@ def test_results_for_ordered_alpha(strategy: str) -> None: mapie = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) mapie.fit(X, y) y_pred, y_pis = mapie.predict(X, alpha=[0.05, 0.1]) - assert (y_pis[:, 0, 0] <= y_pis[:, 0, 1]).all() - assert (y_pis[:, 1, 0] >= y_pis[:, 1, 1]).all() + assert np.all( + np.abs(y_pis[:, 1, 0] - y_pis[:, 0, 0]) + >= np.abs(y_pis[:, 1, 1] - y_pis[:, 0, 1]) + ) @pytest.mark.parametrize("strategy", [*STRATEGIES]) @@ -264,7 +266,7 @@ def test_linear_regression_results(strategy: str) -> None: def test_results_prefit_ignore_method() -> None: """Test that method is ignored when ``cv="prefit"``.""" estimator = LinearRegression().fit(X, y) - all_y_pis: List[ArrayLike] = [] + all_y_pis: List[NDArray] = [] for method in METHODS: mapie_ts_reg = MapieTimeSeriesRegressor( estimator=estimator, 
cv="prefit", method=method @@ -371,7 +373,6 @@ def test_pred_loof_isnan() -> None: y=y_toy, train_index=[0, 1, 2, 3, 4], val_index=[], - k=0, ) assert len(y_pred) == 0 @@ -379,29 +380,20 @@ def test_pred_loof_isnan() -> None: def test_MapieTimeSeriesRegressor_alpha_is_None() -> None: """Test ``predict`` when ``alpha`` is None.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) - pred = mapie_ts_reg.predict(X_toy, alpha=None) - assert pred.shape == (len(pred), ) - -def test_MapieTimeSeriesRegressor_partial_fit_ensemble_T() -> None: - """Test ``partial_update`` when ``ensemble`` is True.""" - mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) - assert round(mapie_ts_reg.residuals_[-1], 2) == round( - np.abs(14.189 - 14.049), 2 - ) - mapie_ts_reg = mapie_ts_reg.partial_fit( - X=np.array([[6]]), y=np.array([17.5]), ensemble=True - ) - assert round(mapie_ts_reg.residuals_[-1], 2) == round(17.5 - 18.665, 2) + with pytest.raises(ValueError, match=r".*too many values to unpackt*"): + y_pred, y_pis = mapie_ts_reg.predict(X_toy, alpha=None) -def test_MapieTimeSeriesRegressor_partial_fit_ensemble_F() -> None: - """Test ``partial_update`` when ``ensemble`` is False.""" +def test_MapieTimeSeriesRegressor_partial_fit_ensemble() -> None: + """Test ``partial_fit``.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) - assert round(mapie_ts_reg.residuals_[-1], 2) == round( - np.abs(14.189 - 14.049), 2 + assert round(mapie_ts_reg.conformity_scores_[-1], 2) == round( + np.abs(14.189 - 14.038), 2 ) mapie_ts_reg = mapie_ts_reg.partial_fit( - X=np.array([[6]]), y=np.array([17.5]), ensemble=False + X=np.array([[6]]), y=np.array([17.5]) + ) + assert round(mapie_ts_reg.conformity_scores_[-1], 2) == round( + 17.5 - 18.665, 2 ) - assert round(mapie_ts_reg.residuals_[-1], 2) == round(17.5 - 18.66504, 2) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index d9c5becb7..f4c482546 100644 --- 
a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -3,6 +3,7 @@ from typing import Iterable, Optional, Tuple, Union, cast import numpy as np +import numpy.ma as ma from sklearn.base import RegressorMixin from sklearn.model_selection import BaseCrossValidator from sklearn.utils import check_array @@ -10,7 +11,7 @@ from .aggregation_functions import aggregate_all from .regression import MapieRegressor -from ._typing import ArrayLike +from ._typing import ArrayLike, NDArray from .utils import ( check_alpha, check_alpha_and_n_samples, @@ -62,16 +63,18 @@ def fit( The model itself. """ self = super().fit(X=X, y=y, sample_weight=sample_weight) - y_pred = super().predict(X) - self.residuals_ = y - y_pred + y_pred, _ = super().predict(X, alpha=0.5, ensemble=True) + self.conformity_scores_ = np.asarray(y) - y_pred return self def partial_fit( - self, X: ArrayLike, y: ArrayLike, ensemble: bool = True + self, + X: ArrayLike, + y: ArrayLike, ) -> MapieTimeSeriesRegressor: """ - Update the ``residuals_`` attribute when data with known labels are - available. + Update the ``conformity_scores_`` attribute when data with known labels + are available. Parameters ---------- @@ -81,28 +84,24 @@ def partial_fit( y : ArrayLike of shape (n_samples,) Input labels. - ensemble : bool - Boolean corresponding to the ``ensemble`` argument of ``predict`` - method, determining whether the predictions computed to determine - the new ``residuals_`` are ensembled or not. - If False, predictions are those of the model trained on the whole - training set. - Returns ------- MapieTimeSeriesRegressor The model itself. 
""" - y_pred, _ = self.predict(X, alpha=0.5, ensemble=ensemble) - new_residuals = y - y_pred + y_pred, _ = self.predict(X, alpha=0.5, ensemble=True) + new_conformity_scores_ = np.asarray(y) - np.asarray(y_pred) + new_conformity_scores_ = new_conformity_scores_[ + ~np.isnan(new_conformity_scores_) + ] cut_index = min( - len(new_residuals[~np.isnan(new_residuals)]), len(self.residuals_) + len(new_conformity_scores_), len(self.conformity_scores_) ) - self.residuals_ = np.concatenate( + self.conformity_scores_ = np.concatenate( [ - self.residuals_[cut_index:], - new_residuals[~np.isnan(new_residuals)], + self.conformity_scores_[cut_index:], + new_conformity_scores_, ], axis=0, ) @@ -113,63 +112,64 @@ def predict( X: ArrayLike, ensemble: bool = False, alpha: Optional[Union[float, Iterable[float]]] = None, - ) -> Union[ArrayLike, Tuple[ArrayLike, ArrayLike]]: + ) -> Union[NDArray, Tuple[NDArray, NDArray]]: # Checks check_is_fitted(self, self.fit_attributes) self._check_ensemble(ensemble) - alpha_ = check_alpha(alpha) + alpha = cast(Optional[NDArray], check_alpha(alpha)) X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) y_pred = self.single_estimator_.predict(X) + n = len(self.conformity_scores_) if alpha is None: return np.array(y_pred) else: - alpha_ = cast(ArrayLike, alpha_) - check_alpha_and_n_samples(alpha_, self.residuals_.shape[0]) - betas_0 = np.full_like(alpha_, np.nan, dtype=float) - - for ind, _alpha in enumerate(alpha_): - betas = np.linspace(0.0, _alpha, num=len(self.residuals_) + 2) + alpha_np = cast(NDArray, alpha) + check_alpha_and_n_samples(alpha_np, n) + betas_0 = np.full_like(alpha_np, np.nan, dtype=float) + for ind, _alpha in enumerate(alpha_np): + betas = np.linspace(0.0, _alpha, num=n + 1) one_alpha_beta = np.quantile( - self.residuals_, + ma.masked_invalid(self.conformity_scores_), 1 - _alpha + betas, axis=0, interpolation="higher", - ) + ) # type: ignore beta = np.quantile( - self.residuals_, + 
ma.masked_invalid(self.conformity_scores_), betas, axis=0, interpolation="lower", - ) + ) # type: ignore betas_0[ind] = betas[np.argmin(one_alpha_beta - beta, axis=0)] + lower_quantiles = np.quantile( - self.residuals_, + ma.masked_invalid(self.conformity_scores_), betas_0, axis=0, interpolation="lower", - ) + ) # type: ignore higher_quantiles = np.quantile( - self.residuals_, - 1 - alpha_ + betas_0, + ma.masked_invalid(self.conformity_scores_), + 1 - alpha_np + betas_0, axis=0, interpolation="higher", - ) + ) # type: ignore if self.method in ["naive", "base"] or self.cv == "prefit": y_pred_low = np.column_stack( [ y_pred[:, np.newaxis] + lower_quantiles[k] - for k in range(len(alpha_)) + for k in range(len(alpha_np)) ] ) y_pred_up = np.column_stack( [ y_pred[:, np.newaxis] + higher_quantiles[k] - for k in range(len(alpha_)) + for k in range(len(alpha_np)) ] ) else: @@ -187,12 +187,15 @@ def predict( if self.method == "plus": pred = aggregate_all(self.agg_function, y_pred_multi) y_pred_low = np.column_stack( - [pred + lower_quantiles[k] for k in range(len(alpha_))] + [ + pred + lower_quantiles[k] + for k in range(len(alpha_np)) + ] ) y_pred_up = np.column_stack( [ pred + higher_quantiles[k] - for k in range(len(alpha_)) + for k in range(len(alpha_np)) ] ) @@ -202,13 +205,13 @@ def predict( y_pred_low = np.column_stack( [ lower_bounds + lower_quantiles[k] - for k in range(len(alpha_)) + for k in range(len(alpha_np)) ] ) y_pred_up = np.column_stack( [ upper_bounds + higher_quantiles[k] - for k in range(len(alpha_)) + for k in range(len(alpha_np)) ] ) if ensemble: diff --git a/mapie/utils.py b/mapie/utils.py index 0f83f09e5..44fdc7e62 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -5,19 +5,14 @@ import numpy as np from sklearn.base import ClassifierMixin, RegressorMixin from sklearn.model_selection import BaseCrossValidator, KFold, LeaveOneOut -from sklearn.utils.validation import ( - _check_sample_weight, - _num_features -) +from sklearn.utils.validation import 
_check_sample_weight, _num_features from sklearn.utils import _safe_indexing from ._typing import ArrayLike, NDArray def check_null_weight( - sample_weight: Optional[ArrayLike], - X: ArrayLike, - y: ArrayLike + sample_weight: Optional[ArrayLike], X: ArrayLike, y: ArrayLike ) -> Tuple[Optional[NDArray], ArrayLike, ArrayLike]: """ Check sample weights and remove samples with null sample weights. From b5b46d6f2afeb5dcbfe693e0f74e612a9d2c0a23 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 28 Mar 2022 18:48:59 +0200 Subject: [PATCH 12/32] make lint PASS --- examples/regression/plot_timeseries_enbpi.py | 4 ++-- mapie/subsample.py | 2 +- mapie/tests/test_time_series_regression.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index f0e51b8be..391d697e9 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -85,10 +85,10 @@ for step in range(gap, len(X_test), gap): mapie.partial_fit( - X_test.iloc[(step - gap) : step, :], y_test.iloc[(step - gap) : step] + X_test.iloc[(step - gap):step, :], y_test.iloc[(step - gap):step] ) y_pred_gap_step, y_pis_gap_step = mapie.predict( - X_test.iloc[step : (step + gap), :], + X_test.iloc[step:(step + gap), :], alpha=alpha, ) y_pred_steps = np.concatenate((y_pred_steps, y_pred_gap_step), axis=0) diff --git a/mapie/subsample.py b/mapie/subsample.py index 52d5354d8..049ac50f6 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -202,7 +202,7 @@ def split( if self.overlapping: blocks = sliding_window_view(indices, window_shape=length) else: - indices = indices[(len(indices) % length) :] + indices = indices[(len(indices) % length):] blocks_number = len(indices) // length blocks = np.asarray( np.array_split(indices, indices_or_sections=blocks_number) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 
c8c5c79b4..80dcad92a 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -5,7 +5,6 @@ import numpy as np import pytest -from sklearn import ensemble from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression from sklearn.model_selection import KFold, LeaveOneOut, train_test_split From 8a807e9e9dcba84db5e436f5e79f8920f88daaca Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 30 Mar 2022 20:12:30 +0200 Subject: [PATCH 13/32] maked_quantile not perfect, yet --- README.rst | 5 + examples/regression/plot_timeseries_enbpi.py | 227 ++++++++++++---- .../regression/plot_timeseries_enbpi_train.py | 83 ++++++ mapie/regression.py | 10 +- mapie/time_series_regression.py | 245 ++++++++++++------ mapie/utils.py | 103 ++++++++ 6 files changed, 536 insertions(+), 137 deletions(-) create mode 100644 examples/regression/plot_timeseries_enbpi_train.py diff --git a/README.rst b/README.rst index cb21db27f..b40c0cadb 100644 --- a/README.rst +++ b/README.rst @@ -244,6 +244,11 @@ MAPIE methods belong to the field of conformal inference. "Uncertainty Sets for Image Classifiers using Conformal Prediction." International Conference on Learning Representations 2021. +[6] Chen Xu, Yao Xie. +"Conformal prediction for dynamic time-series" +https://arxiv.org/abs/2010.09107 + + 📝 License ========== diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 391d697e9..5dbb47f42 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -4,24 +4,24 @@ ================================================================== This example uses :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate -prediction intervals associated with time series forecast. 
The implementation -is still at its first step, based on Jackknife+-after-bootsrtap, to estimate -residuals and associated prediction intervals. +prediction intervals associated with time series forecast. It follows [6] and +an alternative expermimental implemetation inspired from [2] We use here the Victoria electricity demand dataset used in the book "Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos. The electricity demand features daily and weekly seasonalities and is impacted by the temperature, considered here as a exogeneous variable. -A Random Forest model is fitted on data. The hyper-parameters are optimized -with a :class:`sklearn.model_selection.RandomizedSearchCV` using a sequential -:class:`sklearn.model_selection.TimeSeriesSplit` cross validation, in which the -training set is prior to the validation set. +A Random Forest model is aloready fitted on data. The hyper-parameters are +optimized with a :class:`sklearn.model_selection.RandomizedSearchCV` using a +sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation, +in which the training set is prior to the validation set. The best model is then feeded into :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the associated prediction intervals. We compare two approaches: one with no `partial_fit` call and one with `partial_fit` every step. 
""" +import copy import warnings import numpy as np @@ -63,87 +63,202 @@ y_test = demand_test["Demand"] # Model: Random Forest previously optimized with a cross-validation -model = RandomForestRegressor(max_depth=17, n_estimators=150, random_state=59) +model = RandomForestRegressor(max_depth=15, n_estimators=1, random_state=59) # Estimate prediction intervals on test set with best estimator alpha = 0.1 -cv_MapieTimeSeries = BlockBootstrap(50, length=48, random_state=59) -mapie = MapieTimeSeriesRegressor( +cv_MapieTimeSeries = BlockBootstrap(20, length=48, random_state=59) + +mapie_model = MapieTimeSeriesRegressor( model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 ) -mapie.fit(X_train, y_train) +mapie_model = mapie_model.fit(X_train, y_train) +mapie_no_pfit = mapie_model.fit(X_train, y_train) +mapie_pfit_JAB_F = mapie_model.fit(X_train, y_train) +mapie_pfit_JAB_T = mapie_model.fit(X_train, y_train) + +gap_pfit = 1 + +# With no partial_fit, JAB_like is False +y_pred_npfit_JAB_F, y_pis_npfit_JAB_F = mapie_no_pfit.predict( + X_test, alpha=alpha, ensemble=True +) +coverage_npfit_JAB_F = regression_coverage_score( + y_test, y_pis_npfit_JAB_F[:, 0, 0], y_pis_npfit_JAB_F[:, 1, 0] +) +width_npfit_JAB_F = ( + y_pis_npfit_JAB_F[:, 1, 0] - y_pis_npfit_JAB_F[:, 0, 0] +).mean() + +# With partial_fit every hour, JAB_like is False + +y_pred_pfit_JAB_F, y_pis_pfit_JAB_F = mapie_pfit_JAB_F.predict( + X_test.iloc[:gap_pfit, :], alpha=alpha, ensemble=True +) + +for step in range(gap_pfit, len(X_test), gap_pfit): + mapie_pfit_JAB_F.partial_fit( + X_test.iloc[(step - gap_pfit) : step, :], + y_test.iloc[(step - gap_pfit) : step], + ) + y_pred_gap_step, y_pis_gap_step = mapie_pfit_JAB_F.predict( + X_test.iloc[step : (step + gap_pfit), :], alpha=alpha, ensemble=True + ) + y_pred_pfit_JAB_F = np.concatenate( + (y_pred_pfit_JAB_F, y_pred_gap_step), axis=0 + ) + y_pis_pfit_JAB_F = np.concatenate( + (y_pis_pfit_JAB_F, y_pis_gap_step), axis=0 + ) -# With no partial_fit 
-y_pred, y_pis = mapie.predict(X_test, alpha=alpha) -coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]) -width = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() +coverage_pfit_JAB_F = regression_coverage_score( + y_test, y_pis_pfit_JAB_F[:, 0, 0], y_pis_pfit_JAB_F[:, 1, 0] +) +width_pfit_JAB_F = ( + y_pis_pfit_JAB_F[:, 1, 0] - y_pis_pfit_JAB_F[:, 0, 0] +).mean() -# With partial_fit every hour -gap = 1 -y_pred_steps, y_pis_steps = mapie.predict(X_test.iloc[:gap, :], alpha=alpha) +# With no partial_fit, JAB_like is True +y_pred_npfit_JAB_T, y_pis_npfit_JAB_T = mapie_no_pfit.predict( + X_test, alpha=alpha, JAB_Like=True +) +coverage_npfit_JAB_T = regression_coverage_score( + y_test, y_pis_npfit_JAB_T[:, 0, 0], y_pis_npfit_JAB_T[:, 1, 0] +) +width_npfit_JAB_T = ( + y_pis_npfit_JAB_T[:, 1, 0] - y_pis_npfit_JAB_T[:, 0, 0] +).mean() -for step in range(gap, len(X_test), gap): - mapie.partial_fit( - X_test.iloc[(step - gap):step, :], y_test.iloc[(step - gap):step] +# With partial_fit every hour, JAB_like is True +y_pred_pfit_JAB_T, y_pis_pfit_JAB_T = mapie_no_pfit.predict( + X_test.iloc[:gap_pfit, :], alpha=alpha, JAB_Like=True +) +for step in range(gap_pfit, len(X_test), gap_pfit): + mapie_pfit_JAB_T.partial_fit( + X_test.iloc[(step - gap_pfit) : step, :], + y_test.iloc[(step - gap_pfit) : step], ) - y_pred_gap_step, y_pis_gap_step = mapie.predict( - X_test.iloc[step:(step + gap), :], + y_pred_gap_step, y_pis_gap_step = mapie_pfit_JAB_T.predict( + X_test.iloc[step : (step + gap_pfit), :], alpha=alpha, + JAB_Like=True, + ensemble=True, + ) + y_pred_pfit_JAB_T = np.concatenate( + (y_pred_pfit_JAB_T, y_pred_gap_step), axis=0 + ) + y_pis_pfit_JAB_T = np.concatenate( + (y_pis_pfit_JAB_T, y_pis_gap_step), axis=0 ) - y_pred_steps = np.concatenate((y_pred_steps, y_pred_gap_step), axis=0) - y_pis_steps = np.concatenate((y_pis_steps, y_pis_gap_step), axis=0) -coverage_steps = regression_coverage_score( - y_test, y_pis_steps[:, 0, 0], y_pis_steps[:, 1, 0] 
+coverage_pfit_JAB_T = regression_coverage_score( + y_test, y_pis_pfit_JAB_T[:, 0, 0], y_pis_pfit_JAB_T[:, 1, 0] ) -width_steps = (y_pis_steps[:, 1, 0] - y_pis_steps[:, 0, 0]).mean() +width_pfit_JAB_T = ( + y_pis_pfit_JAB_T[:, 1, 0] - y_pis_pfit_JAB_T[:, 0, 0] +).mean() # Print results print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nWithout any partial_fit:" - f"{coverage:.3f}, {width:.3f}" + "\nWithout any partial_fit. JAB_like is False:" + f"{coverage_npfit_JAB_F:.3f}, {width_npfit_JAB_F:.3f}" +) +print( + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " + "\nWithout any partial_fit. JAB_like is True:" + f"{coverage_npfit_JAB_T:.3f}, {width_npfit_JAB_T:.3f}" +) +print( + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " + "\nWith partial_fit. JAB_like is False:" + f"{coverage_pfit_JAB_F:.3f}, {width_pfit_JAB_F:.3f}" +) +print( + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " + "\nWith partial_fit. 
JAB_like is True:" + f"{coverage_pfit_JAB_T:.3f}, {width_pfit_JAB_T:.3f}" ) # Plot estimated prediction intervals on test set -fig = plt.figure(figsize=(15, 5)) -ax = fig.add_subplot(1, 1, 1) -ax.set_ylabel("Hourly demand (GW)") -ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") -ax.plot(demand_test.index, y_pred, lw=2, c="C2", label="Predictions") -ax.fill_between( +fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots( + nrows=2, ncols=2, figsize=(30, 10), sharey="row", sharex="col" +) + +ax1.set_ylabel("Hourly demand (GW)") +ax1.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +ax1.plot( + demand_test.index, y_pred_npfit_JAB_F, lw=2, c="C2", label="Predictions" +) +ax1.fill_between( demand_test.index, - y_pis[:, 0, 0], - y_pis[:, 1, 0], + y_pis_npfit_JAB_F[:, 0, 0], + y_pis_npfit_JAB_F[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax.legend() -plt.title("Without partial_fit") -plt.show() +ax1.legend() +ax1.set_title( + "Without partial_fit, JAB False." + f"Coverage:{coverage_npfit_JAB_F:.3f} Width:{width_npfit_JAB_F:.3f}" +) -print( - "Coverage / prediction interval width mean for MapieTimeSeriesRegressor " - "\nWith partial_fit every step: " - f"{coverage_steps:.3f}, {width_steps:.3f}" +ax2.set_ylabel("Hourly demand (GW)") +ax2.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +ax2.plot( + demand_test.index, y_pred_npfit_JAB_T, lw=2, c="C2", label="Predictions" +) +ax2.fill_between( + demand_test.index, + y_pis_npfit_JAB_T[:, 0, 0], + y_pis_npfit_JAB_T[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", +) +ax2.legend() +ax2.set_title( + "Without partial_fit, JAB True." 
+ f"Coverage:{coverage_npfit_JAB_T:.3f} Width:{width_npfit_JAB_T:.3f}" ) -# Plot estimated prediction intervals on test set -fig = plt.figure(figsize=(15, 5)) -ax = fig.add_subplot(1, 1, 1) -ax.set_ylabel("Hourly demand (GW)") -ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") -ax.plot(demand_test.index, y_pred_steps, lw=2, c="C2", label="Predictions") -ax.fill_between( +ax3.set_ylabel("Hourly demand (GW)") +ax3.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +ax3.plot( + demand_test.index, y_pred_npfit_JAB_F, lw=2, c="C2", label="Predictions" +) +ax3.fill_between( + demand_test.index, + y_pis_npfit_JAB_F[:, 0, 0], + y_pis_npfit_JAB_F[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", +) +ax3.legend() +ax3.set_title( + "With partial_fit, JAB False." + f"Coverage:{coverage_npfit_JAB_F:.3f} Width:{width_npfit_JAB_F:.3f}" +) + +ax4.set_ylabel("Hourly demand (GW)") +ax4.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +ax4.plot( + demand_test.index, y_pred_pfit_JAB_T, lw=2, c="C2", label="Predictions" +) +ax4.fill_between( demand_test.index, - y_pis_steps[:, 0, 0], - y_pis_steps[:, 1, 0], + y_pis_pfit_JAB_T[:, 0, 0], + y_pis_pfit_JAB_T[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax.legend() -plt.title("With partial_fit") +ax4.legend() +ax4.set_title( + "With partial_fit, JAB True." 
+ f"Coverage:{coverage_npfit_JAB_T:.3f} Width:{width_npfit_JAB_T:.3f}" +) plt.show() diff --git a/examples/regression/plot_timeseries_enbpi_train.py b/examples/regression/plot_timeseries_enbpi_train.py new file mode 100644 index 000000000..ec71f4472 --- /dev/null +++ b/examples/regression/plot_timeseries_enbpi_train.py @@ -0,0 +1,83 @@ +""" +================================================================== +Estimating prediction intervals of time series forecast with EnbPI +================================================================== +This example uses +:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate +prediction intervals associated with time series forecast. It follows [6] and +an alternative expermimental implemetation inspired from [2] + +We use here the Victoria electricity demand dataset used in the book +"Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos. +The electricity demand features daily and weekly seasonalities and is impacted +by the temperature, considered here as a exogeneous variable. + +A Random Forest model is aloready fitted on data. The hyper-parameters are +optimized with a :class:`sklearn.model_selection.RandomizedSearchCV` using a +sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation, +in which the training set is prior to the validation set. +The best model is then feeded into +:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the +associated prediction intervals. We compare two approaches: one with no +`partial_fit` call and one with `partial_fit` every step. 
+""" +import warnings + +import numpy as np +import pandas as pd +from scipy.stats import randint +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit + + +from mapie.metrics import regression_coverage_score +from mapie.time_series_regression import MapieTimeSeriesRegressor + +warnings.simplefilter("ignore") + +# Load input data and feature engineering +demand_df = pd.read_csv( + "../data/demand_temperature.csv", parse_dates=True, index_col=0 +) + +demand_df["Date"] = pd.to_datetime(demand_df.index) +demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") +demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") +demand_df["Hour"] = demand_df.index.hour +for hour in range(1, 3): + demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) + +# Train/validation/test split +num_test_steps = 24 * 7 +demand_train = demand_df.iloc[:-num_test_steps, :].copy() +demand_test = demand_df.iloc[-num_test_steps:, :].copy() +features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [ + f"Lag_{hour}" for hour in range(1, 2) +] + +X_train = demand_train.loc[ + ~np.any(demand_train[features].isnull(), axis=1), features +] +y_train = demand_train.loc[X_train.index, "Demand"] +X_test = demand_test.loc[:, features] +y_test = demand_test["Demand"] + +# CV parameter search +n_iter = 100 +n_splits = 5 +tscv = TimeSeriesSplit(n_splits=n_splits) +random_state = 59 +rf_model = RandomForestRegressor(random_state=random_state) +rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 100)} +cv_obj = RandomizedSearchCV( + rf_model, + param_distributions=rf_params, + n_iter=n_iter, + cv=tscv, + scoring="neg_root_mean_squared_error", + random_state=random_state, + verbose=0, + n_jobs=-1, +) +cv_obj.fit(X_train, y_train) +print(cv_obj.best_estimator_) \ No newline at end of file diff --git a/mapie/regression.py b/mapie/regression.py index 8117fd494..611225886 100644 
--- a/mapie/regression.py +++ b/mapie/regression.py @@ -1,4 +1,5 @@ from __future__ import annotations +from configparser import Interpolation from typing import Iterable, List, Optional, Tuple, Union, cast @@ -29,6 +30,7 @@ check_null_weight, check_verbose, fit_estimator, + masked_quantile, ) @@ -192,7 +194,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): "estimators_", "k_", "conformity_scores_", - "n_features_in_" + "n_features_in_", ] def __init__( @@ -609,7 +611,7 @@ def predict( alpha_np = cast(NDArray, alpha) check_alpha_and_n_samples(alpha_np, n) if self.method in ["naive", "base"] or self.cv == "prefit": - quantile = np.quantile( + quantile = masked_quantile( self.conformity_scores_, 1 - alpha_np, method="higher" ) y_pred_low = y_pred[:, np.newaxis] - quantile @@ -644,7 +646,7 @@ def predict( y_pred_low = np.column_stack( [ - np.quantile( + masked_quantile( ma.masked_invalid(lower_bounds), _alpha, axis=1, @@ -656,7 +658,7 @@ def predict( y_pred_up = np.column_stack( [ - np.quantile( + masked_quantile( ma.masked_invalid(upper_bounds), 1 - _alpha, axis=1, diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index f4c482546..ab2d5159a 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -15,27 +15,25 @@ from .utils import ( check_alpha, check_alpha_and_n_samples, + masked_quantile, ) class MapieTimeSeriesRegressor(MapieRegressor): """ - Prediction interval with out-of-fold residuals for time series. + Prediction interval with out-of-fold residuals for time series. This class implements the EnbPI strategy and some variations for estimating prediction intervals on single-output time series. - It is ``MapieRegressor`` with one more method ``partial_fit``. - Actually, EnbPI only corresponds to MapieRegressor if the ``cv`` argument - if of type ``Subsample`` (Jackknife+-after-Bootstrap method). 
Moreover, for - the moment we consider the absolute values of the residuals of the model, - and consequently the prediction intervals are symmetryc. Moreover we did - not implement the PI's optimization to the oracle interval yet. It is still - a first step before implementing the actual EnbPI. + + Actually, EnbPI only corresponds to ``MapieTimeSeriesRegressor`` if the + ``cv`` argument if of type ``BlockBootstrap``. References ---------- Chen Xu, and Yao Xie. - "Conformal prediction for dynamic time-series." + [6] "Conformal prediction for dynamic time-series." + https://arxiv.org/abs/2010.09107 """ def __init__( @@ -112,6 +110,7 @@ def predict( X: ArrayLike, ensemble: bool = False, alpha: Optional[Union[float, Iterable[float]]] = None, + JAB_Like=False, ) -> Union[NDArray, Tuple[NDArray, NDArray]]: # Checks @@ -127,93 +126,185 @@ def predict( else: alpha_np = cast(NDArray, alpha) check_alpha_and_n_samples(alpha_np, n) - betas_0 = np.full_like(alpha_np, np.nan, dtype=float) - for ind, _alpha in enumerate(alpha_np): - betas = np.linspace(0.0, _alpha, num=n + 1) - one_alpha_beta = np.quantile( + if ( + (not JAB_Like) + or (self.method in ["naive", "base"]) + or (self.cv == "prefit") + ): + # This version of predict is the implementation of the paper + # [6]. Its PIs are closed to the oracle's ones. 
+ betas_0 = np.full_like(alpha_np, np.nan, dtype=float) + for ind, _alpha in enumerate(alpha_np): + betas = np.linspace( + _alpha / (n + 1), _alpha, num=n + 1, endpoint=False + ) + + one_alpha_beta = masked_quantile( + ma.masked_invalid(self.conformity_scores_), + 1 - _alpha + betas, + axis=0, + method="higher", + ) # type: ignore + + beta = masked_quantile( + ma.masked_invalid(self.conformity_scores_), + betas, + axis=0, + method="lower", + ) # type: ignore + betas_0[ind] = betas[ + np.argmin(one_alpha_beta - beta, axis=0) + ] + + lower_quantiles = masked_quantile( ma.masked_invalid(self.conformity_scores_), - 1 - _alpha + betas, + betas_0, axis=0, - interpolation="higher", + method="lower", ) # type: ignore - - beta = np.quantile( + higher_quantiles = masked_quantile( ma.masked_invalid(self.conformity_scores_), - betas, + 1 - alpha_np + betas_0, axis=0, - interpolation="lower", + method="higher", ) # type: ignore - betas_0[ind] = betas[np.argmin(one_alpha_beta - beta, axis=0)] - - lower_quantiles = np.quantile( - ma.masked_invalid(self.conformity_scores_), - betas_0, - axis=0, - interpolation="lower", - ) # type: ignore - higher_quantiles = np.quantile( - ma.masked_invalid(self.conformity_scores_), - 1 - alpha_np + betas_0, - axis=0, - interpolation="higher", - ) # type: ignore - - if self.method in ["naive", "base"] or self.cv == "prefit": - y_pred_low = np.column_stack( - [ - y_pred[:, np.newaxis] + lower_quantiles[k] - for k in range(len(alpha_np)) - ] - ) - y_pred_up = np.column_stack( - [ - y_pred[:, np.newaxis] + higher_quantiles[k] - for k in range(len(alpha_np)) - ] - ) - else: - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size thanks to - # the shape of k_. 
- - y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) - - if self.method == "plus": - pred = aggregate_all(self.agg_function, y_pred_multi) + if (self.method in ["naive", "base"]) or (self.cv == "prefit"): y_pred_low = np.column_stack( [ - pred + lower_quantiles[k] + y_pred[:, np.newaxis] + lower_quantiles[k] for k in range(len(alpha_np)) ] ) y_pred_up = np.column_stack( [ - pred + higher_quantiles[k] + y_pred[:, np.newaxis] + higher_quantiles[k] for k in range(len(alpha_np)) ] ) + else: + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) - if self.method == "minmax": - lower_bounds = np.min(y_pred_multi, axis=1, keepdims=True) - upper_bounds = np.max(y_pred_multi, axis=1, keepdims=True) - y_pred_low = np.column_stack( - [ - lower_bounds + lower_quantiles[k] - for k in range(len(alpha_np)) - ] + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size thanks + # to the shape of k_. + + y_pred_multi = self.aggregate_with_mask( + y_pred_multi, self.k_ ) - y_pred_up = np.column_stack( - [ - upper_bounds + higher_quantiles[k] - for k in range(len(alpha_np)) - ] + + if self.method == "plus": + pred = aggregate_all(self.agg_function, y_pred_multi) + y_pred_low = np.column_stack( + [ + pred + lower_quantiles[k] + for k in range(len(alpha_np)) + ] + ) + y_pred_up = np.column_stack( + [ + pred + higher_quantiles[k] + for k in range(len(alpha_np)) + ] + ) + + if self.method == "minmax": + lower_bounds = np.min( + y_pred_multi, axis=1, keepdims=True + ) + upper_bounds = np.max( + y_pred_multi, axis=1, keepdims=True + ) + y_pred_low = np.column_stack( + [ + lower_bounds + lower_quantiles[k] + for k in range(len(alpha_np)) + ] + ) + y_pred_up = np.column_stack( + [ + upper_bounds + higher_quantiles[k] + for k in range(len(alpha_np)) + ] + ) + else: + # This version of predict is the implementation of the paper + # [2]. Its PIs are wider. 
It does not coorespond to [6]. It is + # a try. It is a bit slower because the betas + # (width optimization parameters of the PIs) are optimized for + # every points. + y_pred_low = np.empty((len(y_pred), len(alpha)), dtype=float) + y_pred_up = np.empty((len(y_pred), len(alpha)), dtype=float) + + for ind_alpha, _alpha in enumerate(alpha_np): + betas = np.linspace( + _alpha / (n + 1), _alpha, num=n + 1, endpoint=False + ) + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size + # thanks to the shape of k_. + + y_pred_multi = self.aggregate_with_mask( + y_pred_multi, self.k_ ) - if ensemble: - y_pred = aggregate_all(self.agg_function, y_pred_multi) + if self.method == "plus": + lower_bounds = y_pred_multi + self.conformity_scores_ + upper_bounds = y_pred_multi + self.conformity_scores_ + + if self.method == "minmax": + lower_bounds = np.min( + y_pred_multi, axis=1, keepdims=True + ) + upper_bounds = np.max( + y_pred_multi, axis=1, keepdims=True + ) + lower_bounds = lower_bounds + self.conformity_scores_ + upper_bounds = upper_bounds + self.conformity_scores_ + + one_alpha_beta = masked_quantile( + ma.masked_invalid(upper_bounds), + 1 - _alpha + betas, + axis=1, + method="higher", + ) # type: ignore + + beta = masked_quantile( + ma.masked_invalid(lower_bounds), + betas, + axis=1, + method="lower", + ) # type: ignore + + betas_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] + + lower_quantiles = np.empty((len(betas_0),)) + upper_quantiles = np.empty((len(betas_0),)) + + for ind, beta_0 in enumerate(betas_0): + lower_quantiles[ind] = masked_quantile( + ma.masked_invalid(lower_bounds[ind, :]), + beta_0, + axis=0, + method="lower", + ) # type: ignore + + upper_quantiles[ind] = masked_quantile( + ma.masked_invalid(upper_bounds[ind, :]), + 1 - _alpha + beta_0, + axis=0, + method="higher", + ) # type: 
ignore + y_pred_low[:, ind_alpha] = lower_quantiles + y_pred_up[:, ind_alpha] = upper_quantiles + + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) diff --git a/mapie/utils.py b/mapie/utils.py index 44fdc7e62..25ef72e3a 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -423,3 +423,106 @@ def check_input_is_image(X: ArrayLike) -> None: "When X is an image, the number of dimensions" "must be equal to 3 or 4." ) + + +def masked_quantile( + a: NDArray, + q: Union[float, NDArray], + axis: Optional[int] = None, + method: str = "linear", +) -> NDArray: + """ + Compute quantile for masked arrays + + Parameters + ---------- + a: NDArray + Array of data. + + q: Union[float, NDArray] + Quantiles. + + axis:int + ``axis`` is ``0`` or ``1`` + + method: str + "linear", "higher" or "lower" + """ + return_float = False + flatten = False + if isinstance(q, float): + flatten = True + + if (len(a.shape)) == 1 or (len(a) == 1): + return_float = True + + q = cast(NDArray, check_alpha(q)) + if (len(a.shape) == 1) or (a.shape[1] == 1): + if axis == 1: + raise ValueError( + "axis 1 is out of bounds for array of dimension 1" + ) + a = a.reshape(-1, 1) + flatten = True + + if axis is None: + a = a.flatten() + axis = 0 + + if axis == 0: + if hasattr(a, "mask"): + a_np = np.where(a.mask, np.inf, a.data) + masked_sum = np.sum(a.mask, axis=0).astype(int) + else: + a_np = a.copy() + masked_sum = np.zeros((a.shape[1],)) + a_np = np.sort(a_np, axis=0) + grid = np.indices(a_np.shape)[0] + a_tile = np.tile(a_np, (len(q), 1, 1)) + grid_tile = np.tile(grid, (len(q), 1, 1)) + + nb_valid_values = (len(a_np) - masked_sum).astype(int) + q_out = np.outer(nb_valid_values - 1, q) + q_out_inf = np.floor(q_out).astype(int) + q_out_sup = np.ceil(q_out) + q_out_sup = np.minimum( + q_out_sup, np.outer(nb_valid_values - 1, np.ones(len(q))) + ).astype(int) + + inf_bool = np.equal( + grid_tile, + np.tile(q_out_inf.T, (1, 1, 
a_np.shape[0])) + .reshape(len(q), *a_np.shape) + .astype(int), + ) + sup_bool = np.equal( + grid_tile, + np.tile(q_out_sup.T, (1, 1, a_np.shape[0])).reshape( + (len(q), *a_np.shape) + ), + ) + + lower_values = a_tile[inf_bool].reshape(len(q), -1) + upper_values = a_tile[sup_bool].reshape(len(q), -1) + + if method == "lower": + values = lower_values + elif method == "higher": + values = upper_values + elif method == "linear": + + interpolated_values = lower_values + (q_out - q_out_inf).T * ( + upper_values - lower_values + ) + values = interpolated_values + if flatten: + if len(q) == 1: + values = values.flatten() + + if return_float: + values = values[0] + return values + elif axis == 1: + return masked_quantile(a.T, q, axis=0, method=method) + else: + raise ValueError("axis should be None, 0 or 1") From 98faf3d28d7ea07d209f7b2042025d8456f9998e Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Tue, 5 Apr 2022 13:27:24 +0200 Subject: [PATCH 14/32] PR EnbPI completed --- examples/regression/plot_timeseries_enbpi.py | 19 ++--- .../regression/plot_timeseries_enbpi_train.py | 6 +- mapie/regression.py | 1 - mapie/tests/test_time_series_regression.py | 81 ++++++++++++++++--- mapie/tests/test_utils.py | 24 +++++- mapie/time_series_regression.py | 61 ++++++++------ mapie/utils.py | 30 ++++--- 7 files changed, 156 insertions(+), 66 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 5dbb47f42..96a725b1d 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -18,10 +18,11 @@ in which the training set is prior to the validation set. The best model is then feeded into :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the -associated prediction intervals. We compare two approaches: one with no -`partial_fit` call and one with `partial_fit` every step. +associated prediction intervals. 
We compare four approaches: with or without +``partial_fit`` called at every step, and following [6] or a approach inspired +from [2]. It appears that the approach inspired from [2] and ``partial_fit`` +offer higher coverage, but with higher width of PIs and are much slower. """ -import copy import warnings import numpy as np @@ -98,11 +99,11 @@ for step in range(gap_pfit, len(X_test), gap_pfit): mapie_pfit_JAB_F.partial_fit( - X_test.iloc[(step - gap_pfit) : step, :], - y_test.iloc[(step - gap_pfit) : step], + X_test.iloc[(step - gap_pfit):step, :], + y_test.iloc[(step - gap_pfit):step], ) y_pred_gap_step, y_pis_gap_step = mapie_pfit_JAB_F.predict( - X_test.iloc[step : (step + gap_pfit), :], alpha=alpha, ensemble=True + X_test.iloc[step:(step + gap_pfit), :], alpha=alpha, ensemble=True ) y_pred_pfit_JAB_F = np.concatenate( (y_pred_pfit_JAB_F, y_pred_gap_step), axis=0 @@ -136,11 +137,11 @@ ) for step in range(gap_pfit, len(X_test), gap_pfit): mapie_pfit_JAB_T.partial_fit( - X_test.iloc[(step - gap_pfit) : step, :], - y_test.iloc[(step - gap_pfit) : step], + X_test.iloc[(step - gap_pfit):step, :], + y_test.iloc[(step - gap_pfit):step], ) y_pred_gap_step, y_pis_gap_step = mapie_pfit_JAB_T.predict( - X_test.iloc[step : (step + gap_pfit), :], + X_test.iloc[step:(step + gap_pfit), :], alpha=alpha, JAB_Like=True, ensemble=True, diff --git a/examples/regression/plot_timeseries_enbpi_train.py b/examples/regression/plot_timeseries_enbpi_train.py index ec71f4472..e9ee8d450 100644 --- a/examples/regression/plot_timeseries_enbpi_train.py +++ b/examples/regression/plot_timeseries_enbpi_train.py @@ -29,10 +29,6 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit - -from mapie.metrics import regression_coverage_score -from mapie.time_series_regression import MapieTimeSeriesRegressor - warnings.simplefilter("ignore") # Load input data and feature engineering @@ -80,4 +76,4 @@ n_jobs=-1, ) cv_obj.fit(X_train, 
y_train) -print(cv_obj.best_estimator_) \ No newline at end of file +print(cv_obj.best_estimator_) diff --git a/mapie/regression.py b/mapie/regression.py index 611225886..1bceac984 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -1,5 +1,4 @@ from __future__ import annotations -from configparser import Interpolation from typing import Iterable, List, Optional, Tuple, Union, cast diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 80dcad92a..bf1f8df37 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -69,6 +69,25 @@ random_state=1, ), ), + "jackknife_plus_ab_JAB": Params( + method="plus", + agg_function="mean", + cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), + ), + "jackknife_minmax_ab_JAB": Params( + method="minmax", + agg_function="mean", + cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), + ), + "jackknife_plus_median_ab_JAB": Params( + method="plus", + agg_function="median", + cv=BlockBootstrap( + n_resamplings=30, + n_blocks=5, + random_state=1, + ), + ), } WIDTHS = { @@ -84,6 +103,9 @@ "jackknife_plus_ab": 3.76, "jackknife_minmax_ab": 3.96, "jackknife_plus_median_ab": 3.76, + "jackknife_plus_ab_JAB": 3.76, + "jackknife_minmax_ab_JAB": 3.96, + "jackknife_plus_median_ab_JAB": 3.76, } COVERAGES = { @@ -99,6 +121,9 @@ "jackknife_plus_ab": 0.952, "jackknife_minmax_ab": 0.960, "jackknife_plus_median_ab": 0.946, + "jackknife_plus_ab_JAB": 0.952, + "jackknife_minmax_ab_JAB": 0.960, + "jackknife_plus_median_ab_JAB": 0.965, } @@ -189,6 +214,15 @@ def test_results_single_and_multi_jobs(strategy: str) -> None: np.testing.assert_allclose(y_pred_single, y_pred_multi) np.testing.assert_allclose(y_pis_single, y_pis_multi) + y_pred_single_JAB, y_pis_single_JAB = mapie_single.predict( + X_toy, alpha=0.2, JAB_Like=True + ) + y_pred_multi_JAB, y_pis_multi_JAB = mapie_multi.predict( + X_toy, alpha=0.2, JAB_Like=True + ) + 
np.testing.assert_allclose(y_pred_single_JAB, y_pred_multi_JAB) + np.testing.assert_allclose(y_pis_single_JAB, y_pis_multi_JAB) + @pytest.mark.parametrize("strategy", [*STRATEGIES]) def test_results_with_constant_sample_weights(strategy: str) -> None: @@ -211,15 +245,19 @@ def test_results_with_constant_sample_weights(strategy: str) -> None: np.testing.assert_allclose(y_pis0, y_pis1) np.testing.assert_allclose(y_pis1, y_pis2) - -@pytest.mark.parametrize("strategy", [*STRATEGIES]) -def test_prediction_between_low_up(strategy: str) -> None: - """Test that prediction lies between low and up prediction intervals.""" - mapie = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) - mapie.fit(X, y) - y_pred, y_pis = mapie.predict(X, alpha=0.1) - assert (y_pred >= y_pis[:, 0, 0]).all() - assert (y_pred <= y_pis[:, 1, 0]).all() + y_pred0_JAB, y_pis0_JAB = mapie0.predict( + X[:200, :], alpha=0.05, JAB_Like=True + ) + y_pred1_JAB, y_pis1_JAB = mapie1.predict( + X[:200, :], alpha=0.05, JAB_Like=True + ) + y_pred2_JAB, y_pis2_JAB = mapie2.predict( + X[:200, :], alpha=0.05, JAB_Like=True + ) + np.testing.assert_allclose(y_pred0_JAB, y_pred1_JAB) + np.testing.assert_allclose(y_pred1_JAB, y_pred2_JAB) + np.testing.assert_allclose(y_pis0_JAB, y_pis1_JAB) + np.testing.assert_allclose(y_pis1_JAB, y_pis2_JAB) @pytest.mark.parametrize("method", ["plus", "minmax"]) @@ -237,13 +275,24 @@ def test_prediction_agg_function( method=method, cv=cv, agg_function=agg_function ) mapie.fit(X, y) - y_pred_1, y_pis_1 = mapie.predict(X, ensemble=True, alpha=alpha) - y_pred_2, y_pis_2 = mapie.predict(X, ensemble=False, alpha=alpha) + y_pred_1, y_pis_1 = mapie.predict(X[:200, :], ensemble=True, alpha=alpha) + y_pred_2, y_pis_2 = mapie.predict(X[:200, :], ensemble=False, alpha=alpha) np.testing.assert_allclose(y_pis_1[:, 0, 0], y_pis_2[:, 0, 0]) np.testing.assert_allclose(y_pis_1[:, 1, 0], y_pis_2[:, 1, 0]) with pytest.raises(AssertionError): np.testing.assert_allclose(y_pred_1, y_pred_2) + y_pred_1_JAB, 
y_pis_1_JAB = mapie.predict( + X[:200, :], ensemble=True, alpha=alpha, JAB_Like=True + ) + y_pred_2_JAB, y_pis_2_JAB = mapie.predict( + X[:200, :], ensemble=False, alpha=alpha, JAB_Like=True + ) + np.testing.assert_allclose(y_pis_1_JAB[:, 0, 0], y_pis_2_JAB[:, 0, 0]) + np.testing.assert_allclose(y_pis_1_JAB[:, 1, 0], y_pis_2_JAB[:, 1, 0]) + with pytest.raises(AssertionError): + np.testing.assert_allclose(y_pred_1_JAB, y_pred_2_JAB) + @pytest.mark.parametrize("strategy", [*STRATEGIES]) def test_linear_regression_results(strategy: str) -> None: @@ -254,10 +303,16 @@ def test_linear_regression_results(strategy: str) -> None: """ mapie_ts = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) mapie_ts.fit(X, y) - _, y_pis = mapie_ts.predict(X, alpha=0.05) + if "JAB" in strategy: + _, y_pis = mapie_ts.predict(X[:200, :], alpha=0.05, JAB_Like=True) + else: + _, y_pis = mapie_ts.predict(X, alpha=0.05, JAB_Like=False) y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0] width_mean = (y_pred_up - y_pred_low).mean() - coverage = regression_coverage_score(y, y_pred_low, y_pred_up) + if "JAB" in strategy: + coverage = regression_coverage_score(y[:200], y_pred_low, y_pred_up) + else: + coverage = regression_coverage_score(y, y_pred_low, y_pred_up) np.testing.assert_allclose(width_mean, WIDTHS[strategy], rtol=1e-2) np.testing.assert_allclose(coverage, COVERAGES[strategy], rtol=1e-2) diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index 2d1539e2a..ca31261ab 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -16,6 +16,7 @@ check_null_weight, check_verbose, fit_estimator, + masked_quantile, ) from mapie._typing import ArrayLike @@ -63,12 +64,10 @@ def test_check_null_weight_with_zeros() -> None: sw_out, X_out, y_out = check_null_weight(sample_weight, X_toy, y_toy) np.testing.assert_almost_equal(np.array(sw_out), np.array([1, 1, 1, 1, 1])) np.testing.assert_almost_equal( - np.array(X_out), - np.array([[1], [2], [3], [4], [5]]) + 
np.array(X_out), np.array([[1], [2], [3], [4], [5]]) ) np.testing.assert_almost_equal( - np.array(y_out), - np.array([7, 9, 11, 13, 15]) + np.array(y_out), np.array([7, 9, 11, 13, 15]) ) @@ -194,3 +193,20 @@ def test_invalid_verbose(verbose: Any) -> None: def test_valid_verbose(verbose: Any) -> None: """Test that valid verboses raise no errors.""" check_verbose(verbose) + + +def test_masked_quantile_invalid_minus_one(): + with pytest.raises(ValueError, match=r".*axis should be None, 0 or 1.*"): + masked_quantile(a=X_toy, q=0.1, axis=-1) + + +def test_masked_quantile_invalid_one(): + with pytest.raises(ValueError, match=r".*axis 1 is out of bounds.*"): + masked_quantile(a=X_toy.flatten(), q=0.1, axis=1) + + +def test_masked_quantile_linear_interpolation(): + quantiles = masked_quantile( + a=X_toy, q=[0.1, 0.2, 0.5], axis=0, method="linear" + ) + np.testing.assert_allclose(quantiles, np.array([[0.5, 1.0, 2.5]])) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index ab2d5159a..cee380170 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -112,6 +112,19 @@ def predict( alpha: Optional[Union[float, Iterable[float]]] = None, JAB_Like=False, ) -> Union[NDArray, Tuple[NDArray, NDArray]]: + """ + ``predict`` method correspond to the ``MapieRegressor``'s one with the + argument ``JAB_Like`` in more. In case ``JAB_Like`` is ``False``, + predictions correspond to [6]. In case it is True, is is another + implementation inspired of [2] with wider PIs. The second method is + much slower because of PIwise optimization. + + Parameters + ---------- + JAB_Like : boolean, default False + Whether to use the implementation of [6] or an implementation + closer to [2]. + """ # Checks check_is_fitted(self, self.fit_attributes) @@ -132,8 +145,8 @@ def predict( or (self.method in ["naive", "base"]) or (self.cv == "prefit") ): - # This version of predict is the implementation of the paper - # [6]. 
Its PIs are closed to the oracle's ones. + # This version of predict corresponds to [6]. + # Its PIs are closed to the oracle's ones. betas_0 = np.full_like(alpha_np, np.nan, dtype=float) for ind, _alpha in enumerate(alpha_np): betas = np.linspace( @@ -153,8 +166,9 @@ def predict( axis=0, method="lower", ) # type: ignore + betas_0[ind] = betas[ - np.argmin(one_alpha_beta - beta, axis=0) + np.argmin(one_alpha_beta - beta, axis=1) ] lower_quantiles = masked_quantile( @@ -162,13 +176,13 @@ def predict( betas_0, axis=0, method="lower", - ) # type: ignore + ).T # type: ignore higher_quantiles = masked_quantile( ma.masked_invalid(self.conformity_scores_), 1 - alpha_np + betas_0, axis=0, method="higher", - ) # type: ignore + ).T # type: ignore if (self.method in ["naive", "base"]) or (self.cv == "prefit"): y_pred_low = np.column_stack( @@ -232,11 +246,22 @@ def predict( ] ) else: - # This version of predict is the implementation of the paper - # [2]. Its PIs are wider. It does not coorespond to [6]. It is - # a try. It is a bit slower because the betas - # (width optimization parameters of the PIs) are optimized for + # This version of predict corresponds to [2]. + # Its PIs are wider. It does not coorespond to [6]. It is + # a try. It is slower because the betas + # (width optimization parameters of the PIs) are optimized at # every points. + + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size + # thanks to the shape of k_. 
+ + y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + y_pred_low = np.empty((len(y_pred), len(alpha)), dtype=float) y_pred_up = np.empty((len(y_pred), len(alpha)), dtype=float) @@ -244,17 +269,7 @@ def predict( betas = np.linspace( _alpha / (n + 1), _alpha, num=n + 1, endpoint=False ) - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size - # thanks to the shape of k_. - y_pred_multi = self.aggregate_with_mask( - y_pred_multi, self.k_ - ) if self.method == "plus": lower_bounds = y_pred_multi + self.conformity_scores_ upper_bounds = y_pred_multi + self.conformity_scores_ @@ -274,14 +289,14 @@ def predict( 1 - _alpha + betas, axis=1, method="higher", - ) # type: ignore + ).T # type: ignore beta = masked_quantile( ma.masked_invalid(lower_bounds), betas, axis=1, method="lower", - ) # type: ignore + ).T # type: ignore betas_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] @@ -294,14 +309,14 @@ def predict( beta_0, axis=0, method="lower", - ) # type: ignore + ).T # type: ignore upper_quantiles[ind] = masked_quantile( ma.masked_invalid(upper_bounds[ind, :]), 1 - _alpha + beta_0, axis=0, method="higher", - ) # type: ignore + ).T # type: ignore y_pred_low[:, ind_alpha] = lower_quantiles y_pred_up[:, ind_alpha] = upper_quantiles diff --git a/mapie/utils.py b/mapie/utils.py index 25ef72e3a..3d2cfdce6 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -3,6 +3,7 @@ from typing import Any, Iterable, Optional, Tuple, Union, cast import numpy as np +import numpy.ma as ma from sklearn.base import ClassifierMixin, RegressorMixin from sklearn.model_selection import BaseCrossValidator, KFold, LeaveOneOut from sklearn.utils.validation import _check_sample_weight, _num_features @@ -432,7 +433,8 @@ def masked_quantile( method: str = "linear", ) -> NDArray: """ - Compute quantile for masked arrays + 
Compute quantile for masked arrays. It avoids using ``np.nanquantile`` that + is quite slow because of an ``apply_along_axis`` loop. Parameters ---------- @@ -442,17 +444,18 @@ def masked_quantile( q: Union[float, NDArray] Quantiles. - axis:int - ``axis`` is ``0`` or ``1`` + axis:Optional[int] + ``axis`` is ``None``, ``0`` or ``1``, default ``None``. + If ``axis`` is ``None``, compute the quantiles of the flatten array. method: str "linear", "higher" or "lower" """ return_float = False flatten = False + if isinstance(q, float): flatten = True - if (len(a.shape)) == 1 or (len(a) == 1): return_float = True @@ -462,19 +465,22 @@ def masked_quantile( raise ValueError( "axis 1 is out of bounds for array of dimension 1" ) - a = a.reshape(-1, 1) - flatten = True + if len(q) == 1: + flatten = True if axis is None: a = a.flatten() axis = 0 if axis == 0: + if (len(a.shape) == 1) or (a.shape[1] == 1): + a = a.reshape(-1, 1) if hasattr(a, "mask"): - a_np = np.where(a.mask, np.inf, a.data) - masked_sum = np.sum(a.mask, axis=0).astype(int) + a_m = cast(ma.MaskedArray, a) + a_np = np.where(a_m.mask, np.inf, a_m.data) + masked_sum = np.sum(a_m.mask, axis=0).astype(int) else: - a_np = a.copy() + a_np = a masked_sum = np.zeros((a.shape[1],)) a_np = np.sort(a_np, axis=0) grid = np.indices(a_np.shape)[0] @@ -515,9 +521,11 @@ def masked_quantile( upper_values - lower_values ) values = interpolated_values + if flatten: - if len(q) == 1: - values = values.flatten() + values = values.flatten() + else: + values = values.T if return_float: values = values[0] From ed39ccaf3cd314165a011d1a16c4b49ec85719d1 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 6 Apr 2022 11:44:31 +0200 Subject: [PATCH 15/32] EnbPI ready!! 
--- mapie/tests/test_time_series_regression.py | 26 ++++++++++++---------- mapie/tests/test_utils.py | 12 ++++++++++ mapie/utils.py | 7 +++--- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index bf1f8df37..e66fe174b 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -19,6 +19,7 @@ X_toy = np.array(range(5)).reshape(-1, 1) y_toy = (5.0 + 2.0 * X_toy ** 1.1).flatten() X, y = make_regression(n_samples=500, n_features=10, noise=1.0, random_state=1) +X_short, y_short = X[:50, :], y[:50] k = np.ones(shape=(5, X.shape[1])) METHODS = ["naive", "base", "plus", "minmax"] @@ -121,9 +122,9 @@ "jackknife_plus_ab": 0.952, "jackknife_minmax_ab": 0.960, "jackknife_plus_median_ab": 0.946, - "jackknife_plus_ab_JAB": 0.952, - "jackknife_minmax_ab_JAB": 0.960, - "jackknife_plus_median_ab_JAB": 0.965, + "jackknife_plus_ab_JAB": 0.92, + "jackknife_minmax_ab_JAB": 0.940, + "jackknife_plus_median_ab_JAB": 0.94, } @@ -246,14 +247,15 @@ def test_results_with_constant_sample_weights(strategy: str) -> None: np.testing.assert_allclose(y_pis1, y_pis2) y_pred0_JAB, y_pis0_JAB = mapie0.predict( - X[:200, :], alpha=0.05, JAB_Like=True + X_short, alpha=0.05, JAB_Like=True ) y_pred1_JAB, y_pis1_JAB = mapie1.predict( - X[:200, :], alpha=0.05, JAB_Like=True + X_short, alpha=0.05, JAB_Like=True ) y_pred2_JAB, y_pis2_JAB = mapie2.predict( - X[:200, :], alpha=0.05, JAB_Like=True + X_short, alpha=0.05, JAB_Like=True ) + np.testing.assert_allclose(y_pred0_JAB, y_pred1_JAB) np.testing.assert_allclose(y_pred1_JAB, y_pred2_JAB) np.testing.assert_allclose(y_pis0_JAB, y_pis1_JAB) @@ -275,18 +277,18 @@ def test_prediction_agg_function( method=method, cv=cv, agg_function=agg_function ) mapie.fit(X, y) - y_pred_1, y_pis_1 = mapie.predict(X[:200, :], ensemble=True, alpha=alpha) - y_pred_2, y_pis_2 = mapie.predict(X[:200, :], ensemble=False, alpha=alpha) + 
y_pred_1, y_pis_1 = mapie.predict(X_short, ensemble=True, alpha=alpha) + y_pred_2, y_pis_2 = mapie.predict(X_short, ensemble=False, alpha=alpha) np.testing.assert_allclose(y_pis_1[:, 0, 0], y_pis_2[:, 0, 0]) np.testing.assert_allclose(y_pis_1[:, 1, 0], y_pis_2[:, 1, 0]) with pytest.raises(AssertionError): np.testing.assert_allclose(y_pred_1, y_pred_2) y_pred_1_JAB, y_pis_1_JAB = mapie.predict( - X[:200, :], ensemble=True, alpha=alpha, JAB_Like=True + X_short, ensemble=True, alpha=alpha, JAB_Like=True ) y_pred_2_JAB, y_pis_2_JAB = mapie.predict( - X[:200, :], ensemble=False, alpha=alpha, JAB_Like=True + X_short, ensemble=False, alpha=alpha, JAB_Like=True ) np.testing.assert_allclose(y_pis_1_JAB[:, 0, 0], y_pis_2_JAB[:, 0, 0]) np.testing.assert_allclose(y_pis_1_JAB[:, 1, 0], y_pis_2_JAB[:, 1, 0]) @@ -304,13 +306,13 @@ def test_linear_regression_results(strategy: str) -> None: mapie_ts = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) mapie_ts.fit(X, y) if "JAB" in strategy: - _, y_pis = mapie_ts.predict(X[:200, :], alpha=0.05, JAB_Like=True) + _, y_pis = mapie_ts.predict(X_short, alpha=0.05, JAB_Like=True) else: _, y_pis = mapie_ts.predict(X, alpha=0.05, JAB_Like=False) y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0] width_mean = (y_pred_up - y_pred_low).mean() if "JAB" in strategy: - coverage = regression_coverage_score(y[:200], y_pred_low, y_pred_up) + coverage = regression_coverage_score(y_short, y_pred_low, y_pred_up) else: coverage = regression_coverage_score(y, y_pred_low, y_pred_up) np.testing.assert_allclose(width_mean, WIDTHS[strategy], rtol=1e-2) diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index ca31261ab..6be7abbed 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -210,3 +210,15 @@ def test_masked_quantile_linear_interpolation(): a=X_toy, q=[0.1, 0.2, 0.5], axis=0, method="linear" ) np.testing.assert_allclose(quantiles, np.array([[0.5, 1.0, 2.5]])) + + +def 
test_masked_quantile_linear_interpolation_scalar(): + quantile = masked_quantile( + a=X_toy.flatten(), q=0.1, axis=0, method="linear" + ) + assert quantile == 0.5 + + +def test_masked_quantile_invalid_method(): + with pytest.raises(ValueError, match=r".*'method' has to be 'higher'.*"): + masked_quantile(a=X_toy.flatten(), q=0.1, axis=0, method="lineaar") diff --git a/mapie/utils.py b/mapie/utils.py index 3d2cfdce6..23158a9ac 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -516,12 +516,11 @@ def masked_quantile( elif method == "higher": values = upper_values elif method == "linear": - - interpolated_values = lower_values + (q_out - q_out_inf).T * ( + values = lower_values + (q_out - q_out_inf).T * ( upper_values - lower_values ) - values = interpolated_values - + else: + raise ValueError("'method' has to be 'higher','lower', or'linear'") if flatten: values = values.flatten() else: From 59b75053f9ebf0c39db5d9bebbd3cedc2fddd07a Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 13 Apr 2022 16:01:21 +0200 Subject: [PATCH 16/32] 3 methods for predict --- .../plot_MapieRegressor_benchmark.py | 164 +++++++ examples/regression/plot_timeseries_enbpi.py | 346 +++++++++----- .../regression/plot_timeseries_enbpi_train.py | 6 +- mapie/quantile_timeit.ipynb | 230 ++++++++++ mapie/regression.py | 15 +- mapie/tests/test_time_series_regression.py | 67 ++- mapie/tests/test_utils.py | 30 -- mapie/time_series_regression.py | 429 ++++++++++-------- mapie/utils.py | 109 ----- 9 files changed, 940 insertions(+), 456 deletions(-) create mode 100644 examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py create mode 100644 mapie/quantile_timeit.ipynb diff --git a/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py b/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py new file mode 100644 index 000000000..f899a9ea7 --- /dev/null +++ b/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py @@ -0,0 +1,164 
@@ +""" +================================================ +Estimating aleatoric and epistemic uncertainties +================================================ +This example uses :class:`mapie.regression.MapieRegressor` to estimate +prediction intervals capturing both aleatoric and epistemic uncertainties +on a one-dimensional dataset with homoscedastic noise and normal sampling. +""" +from ast import Sub +from typing import Any, Callable, Tuple, TypeVar, Union + +from typing_extensions import TypedDict +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import PolynomialFeatures +import matplotlib.pyplot as plt + +from mapie.regression import MapieRegressor +from mapie.subsample import Subsample +from mapie._typing import NDArray + +F = TypeVar("F", bound=Callable[..., Any]) + + +# Functions for generating our dataset +def x_sinx(x: NDArray) -> NDArray: + """One-dimensional x*sin(x) function.""" + return x * np.sin(x) + + +def get_1d_data_with_normal_distrib( + funct: F, mu: float, sigma: float, n_samples: int, noise: float +) -> Tuple[NDArray, NDArray, NDArray, NDArray, NDArray]: + """ + Generate noisy 1D data with normal distribution from given function + and noise standard deviation. + + Parameters + ---------- + funct : F + Base function used to generate the dataset. + mu : float + Mean of normal training distribution. + sigma : float + Standard deviation of normal training distribution. + n_samples : int + Number of training samples. + noise : float + Standard deviation of noise. + + Returns + ------- + Tuple[NDArray, AnNDArrayy, NDArray, NDArray, NDArray] + Generated training and test data. 
+ [0]: X_train + [1]: y_train + [2]: X_test + [3]: y_test + [4]: y_mesh + """ + np.random.seed(42) + X_train = np.random.normal(mu, sigma, n_samples) + X_test = np.arange(mu - 4 * sigma, mu + 4 * sigma, sigma / 20.0) + y_train, y_mesh, y_test = funct(X_train), funct(X_test), funct(X_test) + y_train += np.random.normal(0, noise, y_train.shape[0]) + y_test += np.random.normal(0, noise, y_test.shape[0]) + return ( + X_train.reshape(-1, 1), + y_train, + X_test.reshape(-1, 1), + y_test, + y_mesh, + ) + + +# Data generation +mu, sigma, n_samples, noise = 0, 2.5, 300, 0.5 +X_train, y_train, X_test, y_test, y_mesh = get_1d_data_with_normal_distrib( + x_sinx, mu, sigma, n_samples, noise +) + +# Definition of our base model +degree_polyn = 10 +polyn_model = Pipeline( + [ + ("poly", PolynomialFeatures(degree=degree_polyn)), + ("linear", LinearRegression()), + ] +) + +# Estimating prediction intervals +Params = TypedDict("Params", {"method": str, "cv": Union[int, Subsample]}) +STRATEGIES = { + "jackknife_plus": Params(method="plus", cv=-1), + "jackknife_minmax": Params(method="minmax", cv=-1), + "cv_plus": Params(method="plus", cv=10), + "cv_minmax": Params(method="minmax", cv=10), + "jackknide-plus-after-bootstrap": Params(method="plus", cv=10), + "jackknide-minmax-after-bootstrap": Params(method="minmax", cv=10), +} +y_pred, y_pis = {}, {} +for strategy, params in STRATEGIES.items(): + mapie = MapieRegressor(polyn_model, **params) + mapie.fit(X_train, y_train) + y_pred[strategy], y_pis[strategy] = mapie.predict(X_test, alpha=0.05) + + +# Visualization +def plot_1d_data( + X_train: NDArray, + y_train: NDArray, + X_test: NDArray, + y_test: NDArray, + y_sigma: float, + y_pred: NDArray, + y_pred_low: NDArray, + y_pred_up: NDArray, + ax: plt.Axes, + title: str, +) -> None: + ax.set_xlabel("x") + ax.set_ylabel("y") + ax.set_xlim([-10, 10]) + ax.set_ylim([np.min(y_test) * 1.3, np.max(y_test) * 1.3]) + ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3) + ax.scatter(X_train, 
y_train, color="red", alpha=0.3, label="Training data") + ax.plot(X_test, y_test, color="gray", label="True confidence intervals") + ax.plot(X_test, y_test - y_sigma, color="gray", ls="--") + ax.plot(X_test, y_test + y_sigma, color="gray", ls="--") + ax.plot(X_test, y_pred, color="b", alpha=0.5, label="Prediction intervals") + if title is not None: + ax.set_title(title) + ax.legend() + + +n_figs = len(STRATEGIES) +fig, axs = plt.subplots(2, 2, figsize=(13, 12)) +coords = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] +for strategy, coord in zip(STRATEGIES, coords): + plot_1d_data( + X_train.ravel(), + y_train.ravel(), + X_test.ravel(), + y_mesh.ravel(), + 1.96 * noise, + y_pred[strategy].ravel(), + y_pis[strategy][:, 0, 0].ravel(), + y_pis[strategy][:, 1, 0].ravel(), + ax=coord, + title=strategy, + ) + + +fig, ax = plt.subplots(1, 1, figsize=(7, 5)) +ax.set_xlim([-8, 8]) +ax.set_ylim([0, 4]) +for strategy in STRATEGIES: + ax.plot(X_test, y_pis[strategy][:, 1, 0] - y_pis[strategy][:, 0, 0]) +ax.axhline(1.96 * 2 * noise, ls="--", color="k") +ax.set_xlabel("x") +ax.set_ylabel("Prediction Interval Width") +ax.legend(list(STRATEGIES.keys()) + ["True width"], fontsize=8) +plt.show() diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 96a725b1d..fa167cad9 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -45,7 +45,8 @@ demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") demand_df["Hour"] = demand_df.index.hour -for hour in range(1, 3): +n_lags = 5 +for hour in range(1, n_lags): demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) # Train/validation/test split @@ -53,7 +54,7 @@ demand_train = demand_df.iloc[:-num_test_steps, :].copy() demand_test = demand_df.iloc[-num_test_steps:, :].copy() features = ["Weekofyear", "Weekday", "Hour", "Temperature"] 
+ [ - f"Lag_{hour}" for hour in range(1, 2) + f"Lag_{hour}" for hour in range(1, n_lags) ] X_train = demand_train.loc[ @@ -64,202 +65,337 @@ y_test = demand_test["Demand"] # Model: Random Forest previously optimized with a cross-validation -model = RandomForestRegressor(max_depth=15, n_estimators=1, random_state=59) +model = RandomForestRegressor(max_depth=10, n_estimators=50, random_state=59) # Estimate prediction intervals on test set with best estimator -alpha = 0.1 -cv_MapieTimeSeries = BlockBootstrap(20, length=48, random_state=59) +alpha = 0.05 +cv_MapieTimeSeries = BlockBootstrap( + n_resamplings=100, length=48, overlapping=True, random_state=59 +) -mapie_model = MapieTimeSeriesRegressor( +mapie_plus = MapieTimeSeriesRegressor( + model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 +) +mapie_enpbi = MapieTimeSeriesRegressor( model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 ) -mapie_model = mapie_model.fit(X_train, y_train) -mapie_no_pfit = mapie_model.fit(X_train, y_train) -mapie_pfit_JAB_F = mapie_model.fit(X_train, y_train) -mapie_pfit_JAB_T = mapie_model.fit(X_train, y_train) -gap_pfit = 1 +gap = 1 -# With no partial_fit, JAB_like is False -y_pred_npfit_JAB_F, y_pis_npfit_JAB_F = mapie_no_pfit.predict( - X_test, alpha=alpha, ensemble=True +print("EnbPI, with no partial_fit, width optimization") +mapie_enpbi = mapie_enpbi.fit(X_train, y_train) +y_pred_npfit_enbpi, y_pis_npfit_enbpi = mapie_enpbi.predict( + X_test, alpha=alpha, ensemble=True, beta_optimize=True ) -coverage_npfit_JAB_F = regression_coverage_score( - y_test, y_pis_npfit_JAB_F[:, 0, 0], y_pis_npfit_JAB_F[:, 1, 0] +coverage_npfit_enbpi = regression_coverage_score( + y_test, y_pis_npfit_enbpi[:, 0, 0], y_pis_npfit_enbpi[:, 1, 0] ) -width_npfit_JAB_F = ( - y_pis_npfit_JAB_F[:, 1, 0] - y_pis_npfit_JAB_F[:, 0, 0] +width_npfit_enbpi = ( + y_pis_npfit_enbpi[:, 1, 0] - y_pis_npfit_enbpi[:, 0, 0] ).mean() -# With partial_fit every hour, JAB_like is 
False +print("EnbPI with partial_fit, width optimization") +mapie_enpbi = mapie_enpbi.fit(X_train, y_train) +y_pred_pfit_enbpi = np.zeros(y_pred_npfit_enbpi.shape) +y_pis_pfit_enbpi = np.zeros(y_pis_npfit_enbpi.shape) -y_pred_pfit_JAB_F, y_pis_pfit_JAB_F = mapie_pfit_JAB_F.predict( - X_test.iloc[:gap_pfit, :], alpha=alpha, ensemble=True +y_pred_pfit_enbpi[:gap], y_pis_pfit_enbpi[:gap, :, :] = mapie_enpbi.predict( + X_test.iloc[:gap, :], alpha=alpha, ensemble=True, beta_optimize=True ) -for step in range(gap_pfit, len(X_test), gap_pfit): - mapie_pfit_JAB_F.partial_fit( - X_test.iloc[(step - gap_pfit):step, :], - y_test.iloc[(step - gap_pfit):step], +for step in range(gap, len(X_test), gap): + mapie_enpbi.partial_fit( + X_test.iloc[(step - gap) : step, :], + y_test.iloc[(step - gap) : step], ) - y_pred_gap_step, y_pis_gap_step = mapie_pfit_JAB_F.predict( - X_test.iloc[step:(step + gap_pfit), :], alpha=alpha, ensemble=True + ( + y_pred_pfit_enbpi[step : step + gap], + y_pis_pfit_enbpi[step : step + gap, :, :], + ) = mapie_enpbi.predict( + X_test.iloc[step : (step + gap), :], + alpha=alpha, + ensemble=True, + beta_optimize=True, ) - y_pred_pfit_JAB_F = np.concatenate( - (y_pred_pfit_JAB_F, y_pred_gap_step), axis=0 +coverage_pfit_enbpi = regression_coverage_score( + y_test, y_pis_pfit_enbpi[:, 0, 0], y_pis_pfit_enbpi[:, 1, 0] +) +width_pfit_enbpi = ( + y_pis_pfit_enbpi[:, 1, 0] - y_pis_pfit_enbpi[:, 0, 0] +).mean() + +print("EnbPI with partial_fit, NO width optimization") +mapie_enpbi = mapie_enpbi.fit(X_train, y_train) +y_pred_pfit_enbpi_no_opt = np.zeros(y_pred_npfit_enbpi.shape) +y_pis_pfit_enbpi_no_opt = np.zeros(y_pis_npfit_enbpi.shape) +( + y_pred_pfit_enbpi_no_opt[:gap], + y_pis_pfit_enbpi_no_opt[:gap, :, :], +) = mapie_enpbi.predict( + X_test.iloc[:gap, :], alpha=alpha, ensemble=True, beta_optimize=False +) + +for step in range(gap, len(X_test), gap): + mapie_enpbi.partial_fit( + X_test.iloc[(step - gap) : step, :], + y_test.iloc[(step - gap) : step], ) - 
y_pis_pfit_JAB_F = np.concatenate( - (y_pis_pfit_JAB_F, y_pis_gap_step), axis=0 + ( + y_pred_pfit_enbpi_no_opt[step : step + gap], + y_pis_pfit_enbpi_no_opt[step : step + gap, :, :], + ) = mapie_enpbi.predict( + X_test.iloc[step : (step + gap), :], + alpha=alpha, + ensemble=True, + beta_optimize=False, ) - -coverage_pfit_JAB_F = regression_coverage_score( - y_test, y_pis_pfit_JAB_F[:, 0, 0], y_pis_pfit_JAB_F[:, 1, 0] +coverage_pfit_enbpi_no_opt = regression_coverage_score( + y_test, y_pis_pfit_enbpi_no_opt[:, 0, 0], y_pis_pfit_enbpi_no_opt[:, 1, 0] ) -width_pfit_JAB_F = ( - y_pis_pfit_JAB_F[:, 1, 0] - y_pis_pfit_JAB_F[:, 0, 0] +width_pfit_enbpi_no_opt = ( + y_pis_pfit_enbpi_no_opt[:, 1, 0] - y_pis_pfit_enbpi_no_opt[:, 0, 0] ).mean() -# With no partial_fit, JAB_like is True -y_pred_npfit_JAB_T, y_pis_npfit_JAB_T = mapie_no_pfit.predict( - X_test, alpha=alpha, JAB_Like=True +print("Plus, with partial_fit, width optimization") +mapie_plus = mapie_plus.fit(X_train, y_train) +y_pred_pfit_plus = np.zeros(y_pred_npfit_enbpi.shape) +y_pis_pfit_plus = np.zeros(y_pis_npfit_enbpi.shape) +(y_pred_pfit_plus[:gap], y_pis_pfit_plus[:gap, :, :],) = mapie_plus.predict( + X_test.iloc[:gap, :], + alpha=alpha, + beta_optimize=True, ) -coverage_npfit_JAB_T = regression_coverage_score( - y_test, y_pis_npfit_JAB_T[:, 0, 0], y_pis_npfit_JAB_T[:, 1, 0] +for step in range(gap, len(X_test), gap): + mapie_plus.partial_fit( + X_test.iloc[(step - gap) : step, :], + y_test.iloc[(step - gap) : step], + ) + ( + y_pred_pfit_plus[step : step + gap], + y_pis_pfit_plus[step : step + gap, :, :], + ) = mapie_plus.predict( + X_test.iloc[step : (step + gap), :], + alpha=alpha, + ensemble=True, + beta_optimize=True, + ) + +coverage_pfit_plus = regression_coverage_score( + y_test, y_pis_pfit_plus[:, 0, 0], y_pis_pfit_plus[:, 1, 0] ) -width_npfit_JAB_T = ( - y_pis_npfit_JAB_T[:, 1, 0] - y_pis_npfit_JAB_T[:, 0, 0] -).mean() +width_pfit_plus = (y_pis_pfit_plus[:, 1, 0] - y_pis_pfit_plus[:, 0, 0]).mean() -# 
With partial_fit every hour, JAB_like is True -y_pred_pfit_JAB_T, y_pis_pfit_JAB_T = mapie_no_pfit.predict( - X_test.iloc[:gap_pfit, :], alpha=alpha, JAB_Like=True +print("Plus, with partial_fit, NO width optimization") +mapie_plus = mapie_plus.fit(X_train, y_train) +y_pred_pfit_plus_no_opt = np.zeros(y_pred_npfit_enbpi.shape) +y_pis_pfit_plus_no_opt = np.zeros(y_pis_npfit_enbpi.shape) +( + y_pred_pfit_plus_no_opt[:gap], + y_pis_pfit_plus_no_opt[:gap, :, :], +) = mapie_plus.predict( + X_test.iloc[:gap, :], + alpha=alpha, + beta_optimize=False, ) -for step in range(gap_pfit, len(X_test), gap_pfit): - mapie_pfit_JAB_T.partial_fit( - X_test.iloc[(step - gap_pfit):step, :], - y_test.iloc[(step - gap_pfit):step], +for step in range(gap, len(X_test), gap): + mapie_plus.partial_fit( + X_test.iloc[(step - gap) : step, :], + y_test.iloc[(step - gap) : step], ) - y_pred_gap_step, y_pis_gap_step = mapie_pfit_JAB_T.predict( - X_test.iloc[step:(step + gap_pfit), :], + ( + y_pred_pfit_plus_no_opt[step : step + gap], + y_pis_pfit_plus_no_opt[step : step + gap, :, :], + ) = mapie_plus.predict( + X_test.iloc[step : (step + gap), :], alpha=alpha, - JAB_Like=True, ensemble=True, + beta_optimize=False, ) - y_pred_pfit_JAB_T = np.concatenate( - (y_pred_pfit_JAB_T, y_pred_gap_step), axis=0 + +coverage_pfit_plus_no_opt = regression_coverage_score( + y_test, y_pis_pfit_plus_no_opt[:, 0, 0], y_pis_pfit_plus_no_opt[:, 1, 0] +) +width_pfit_plus_no_opt = ( + y_pis_pfit_plus_no_opt[:, 1, 0] - y_pis_pfit_plus_no_opt[:, 0, 0] +).mean() + +print("Plus, with partial_fit, MapieRegressor_Like") +mapie_plus = mapie_plus.fit(X_train, y_train) +y_pred_pfit_MR = np.zeros(y_pred_npfit_enbpi.shape) +y_pis_pfit_MR = np.zeros(y_pis_npfit_enbpi.shape) +y_pred_pfit_MR[:gap], y_pis_pfit_MR[:gap, :, :] = mapie_plus.root_predict( + X_test.iloc[:gap, :], alpha=alpha +) +for step in range(gap, len(X_test), gap): + mapie_plus.partial_fit( + X_test.iloc[(step - gap) : step, :], + y_test.iloc[(step - gap) : step], ) 
- y_pis_pfit_JAB_T = np.concatenate( - (y_pis_pfit_JAB_T, y_pis_gap_step), axis=0 + ( + y_pred_pfit_MR[step : step + gap], + y_pis_pfit_MR[step : step + gap, :, :], + ) = mapie_plus.root_predict( + X_test.iloc[step : (step + gap), :], + alpha=alpha, + ensemble=True, ) -coverage_pfit_JAB_T = regression_coverage_score( - y_test, y_pis_pfit_JAB_T[:, 0, 0], y_pis_pfit_JAB_T[:, 1, 0] +coverage_pfit_MR = regression_coverage_score( + y_test, y_pis_pfit_MR[:, 0, 0], y_pis_pfit_MR[:, 1, 0] ) -width_pfit_JAB_T = ( - y_pis_pfit_JAB_T[:, 1, 0] - y_pis_pfit_JAB_T[:, 0, 0] -).mean() +width_pfit_MR = (y_pis_pfit_MR[:, 1, 0] - y_pis_pfit_MR[:, 0, 0]).mean() # Print results print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nWithout any partial_fit. JAB_like is False:" - f"{coverage_npfit_JAB_F:.3f}, {width_npfit_JAB_F:.3f}" + "\nEnbPI without any partial_fit:" + f"{coverage_npfit_enbpi :.3f}, {width_npfit_enbpi:.3f}" +) +print( + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " + "\nEnbPI with partial_fit:" + f"{coverage_pfit_enbpi:.3f}, {width_pfit_enbpi:.3f}" ) print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nWithout any partial_fit. JAB_like is True:" - f"{coverage_npfit_JAB_T:.3f}, {width_npfit_JAB_T:.3f}" + "\nEnbPI with partial_fit, no with optimization:" + f"{coverage_pfit_enbpi_no_opt:.3f}, {width_pfit_enbpi_no_opt:.3f}" ) print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nWith partial_fit. JAB_like is False:" - f"{coverage_pfit_JAB_F:.3f}, {width_pfit_JAB_F:.3f}" + "\nPlus, with partial_fit:" + f"{coverage_pfit_plus:.3f}, {width_pfit_plus:.3f}" ) print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nWith partial_fit. JAB_like is True:" - f"{coverage_pfit_JAB_T:.3f}, {width_pfit_JAB_T:.3f}" + "\nPlus, with partial_fit. 
no width optimization:" + f"{coverage_pfit_plus_no_opt:.3f}, {width_pfit_plus_no_opt:.3f}" +) +print( + "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " + "\nMR_Like, with partial_fit:" + f"{coverage_pfit_MR:.3f}, {width_pfit_MR:.3f}" ) # Plot estimated prediction intervals on test set -fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots( - nrows=2, ncols=2, figsize=(30, 10), sharey="row", sharex="col" +fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots( + nrows=2, ncols=3, figsize=(30, 25), sharey="row", sharex="col" ) -ax1.set_ylabel("Hourly demand (GW)") -ax1.plot(demand_test.Demand, lw=2, label="Test data", c="C1") +for ax in [ax1, ax2, ax3, ax4, ax5, ax6]: + ax.set_ylabel("Hourly demand (GW)") + ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") + + ax1.plot( - demand_test.index, y_pred_npfit_JAB_F, lw=2, c="C2", label="Predictions" + demand_test.index, y_pred_npfit_enbpi, lw=2, c="C2", label="Predictions" ) ax1.fill_between( demand_test.index, - y_pis_npfit_JAB_F[:, 0, 0], - y_pis_npfit_JAB_F[:, 1, 0], + y_pis_npfit_enbpi[:, 0, 0], + y_pis_npfit_enbpi[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax1.legend() ax1.set_title( - "Without partial_fit, JAB False." - f"Coverage:{coverage_npfit_JAB_F:.3f} Width:{width_npfit_JAB_F:.3f}" + "EnbPI, without partial_fit.\n" + f"Coverage:{coverage_npfit_enbpi:.3f} Width:{width_npfit_enbpi:.3f}" ) -ax2.set_ylabel("Hourly demand (GW)") -ax2.plot(demand_test.Demand, lw=2, label="Test data", c="C1") ax2.plot( - demand_test.index, y_pred_npfit_JAB_T, lw=2, c="C2", label="Predictions" + demand_test.index, y_pred_pfit_enbpi, lw=2, c="C2", label="Predictions" ) ax2.fill_between( demand_test.index, - y_pis_npfit_JAB_T[:, 0, 0], - y_pis_npfit_JAB_T[:, 1, 0], + y_pis_pfit_enbpi[:, 0, 0], + y_pis_pfit_enbpi[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax2.legend() ax2.set_title( - "Without partial_fit, JAB True." 
- f"Coverage:{coverage_npfit_JAB_T:.3f} Width:{width_npfit_JAB_T:.3f}" + "EnbPI with partial_fit.\n" + f"Coverage:{coverage_pfit_enbpi:.3f} Width:{width_pfit_enbpi:.3f}" ) -ax3.set_ylabel("Hourly demand (GW)") -ax3.plot(demand_test.Demand, lw=2, label="Test data", c="C1") ax3.plot( - demand_test.index, y_pred_npfit_JAB_F, lw=2, c="C2", label="Predictions" + demand_test.index, + y_pred_pfit_enbpi_no_opt, + lw=2, + c="C2", + label="Predictions", ) ax3.fill_between( demand_test.index, - y_pis_npfit_JAB_F[:, 0, 0], - y_pis_npfit_JAB_F[:, 1, 0], + y_pis_pfit_enbpi_no_opt[:, 0, 0], + y_pis_pfit_enbpi_no_opt[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax3.legend() ax3.set_title( - "With partial_fit, JAB False." - f"Coverage:{coverage_npfit_JAB_F:.3f} Width:{width_npfit_JAB_F:.3f}" + "EnbPI with partial_fit. No width optimization\n" + f"Coverage:{coverage_pfit_enbpi_no_opt:.3f}" + f"Width:{width_pfit_enbpi_no_opt:.3f}" ) -ax4.set_ylabel("Hourly demand (GW)") -ax4.plot(demand_test.Demand, lw=2, label="Test data", c="C1") ax4.plot( - demand_test.index, y_pred_pfit_JAB_T, lw=2, c="C2", label="Predictions" + demand_test.index, + y_pred_pfit_plus, + lw=2, + c="C2", + label="Predictions", ) ax4.fill_between( demand_test.index, - y_pis_pfit_JAB_T[:, 0, 0], - y_pis_pfit_JAB_T[:, 1, 0], + y_pis_pfit_plus[:, 0, 0], + y_pis_pfit_plus[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax4.legend() ax4.set_title( - "With partial_fit, JAB True." 
- f"Coverage:{coverage_npfit_JAB_T:.3f} Width:{width_npfit_JAB_T:.3f}" + "Plus, with partial_fit.\n" + f"Coverage:{coverage_pfit_plus:.3f}" + f"Width:{width_pfit_plus:.3f}" +) + +ax5.plot( + demand_test.index, + y_pred_pfit_plus_no_opt, + lw=2, + c="C2", + label="Predictions", +) +ax5.fill_between( + demand_test.index, + y_pis_pfit_plus_no_opt[:, 0, 0], + y_pis_pfit_plus_no_opt[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", +) +ax5.set_title( + "Plus, with partial_fit no width optimization\n" + f"Coverage:{coverage_pfit_plus_no_opt:.3f}" + f"Width:{width_pfit_plus_no_opt:.3f}" +) + + +ax6.plot(demand_test.index, y_pred_pfit_MR, lw=2, c="C2", label="Predictions") +ax6.fill_between( + demand_test.index, + y_pis_pfit_MR[:, 0, 0], + y_pis_pfit_MR[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", +) +ax6.set_title( + "MapieRegressor Like, with partial_fit\n" + f"Coverage:{coverage_pfit_MR:.3f} Width:{width_pfit_MR:.3f}" ) +ax1.legend() plt.show() diff --git a/examples/regression/plot_timeseries_enbpi_train.py b/examples/regression/plot_timeseries_enbpi_train.py index e9ee8d450..2200d0635 100644 --- a/examples/regression/plot_timeseries_enbpi_train.py +++ b/examples/regression/plot_timeseries_enbpi_train.py @@ -40,7 +40,9 @@ demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") demand_df["Hour"] = demand_df.index.hour -for hour in range(1, 3): + +n_lags = 5 +for hour in range(1, n_lags): demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) # Train/validation/test split @@ -48,7 +50,7 @@ demand_train = demand_df.iloc[:-num_test_steps, :].copy() demand_test = demand_df.iloc[-num_test_steps:, :].copy() features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [ - f"Lag_{hour}" for hour in range(1, 2) + f"Lag_{hour}" for hour in range(1, n_lags) ] X_train = demand_train.loc[ diff --git 
a/mapie/quantile_timeit.ipynb b/mapie/quantile_timeit.ipynb new file mode 100644 index 000000000..d9770966d --- /dev/null +++ b/mapie/quantile_timeit.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d4595769-7300-44d1-8850-c269ebf72fde", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"/Users/tmorzadec/Missions/MAPIE\")\n", + "import numpy as np\n", + "from mapie.utils import masked_quantile\n", + "from mapie.regression import MapieRegressor\n", + "from sklearn.datasets import make_regression\n", + "import numpy.ma as ma\n", + "from pycallgraph2 import PyCallGraph, Config\n", + "from pycallgraph2.output import GraphvizOutput\n", + "from functools import lru_cache\n", + "import callgraph.decorator as callgraph\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4e824af9-8e84-4b89-b6de-e983a168c02f", + "metadata": {}, + "outputs": [], + "source": [ + "X = np.random.uniform(low=-100, high=100, size = int(1e8)).reshape(int(1e4), -1)\n", + "\n", + "# indices1 = np.random.choice(X.shape[0], size=1, replace=True)\n", + "# indices2 = np.random.choice(X.shape[1], size=1, replace=True)\n", + "# indices = zip(indices1, indices2)\n", + "\n", + "# for (x, y) in indices:\n", + "# X[x,y] = np.nan\n", + "# " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ea9ba9ba-3dcf-43f0-9425-91437fd6f000", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1000, 10000)\n" + ] + } + ], + "source": [ + "q = list(np.linspace(0.1, 0.9, 1000))\n", + "print(X.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f90f220e-cc20-40cc-a33c-566470b61856", + "metadata": {}, + "outputs": [], + "source": [ + "@callgraph()\n", + "@lru_cache()\n", + "def mask():\n", + " masked_quantile(ma.masked_invalid(X), q, axis=0, method=\"higher\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "29592f5d", + "metadata": {}, + "outputs": [], + "source": [ + "%timeit -n 1 -r 1 masked_quantile(ma.masked_invalid(X), q, axis=0, method=\"higher\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "843ab182-fd23-40f9-a569-647b1a2ad165", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.48 s ± 401 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit np.nanquantile(X, q, axis=0, interpolation=\"higher\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "260ba3a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.35 s ± 502 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit np.quantile(X, q, axis=0, interpolation=\"higher\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5349359a", + "metadata": {}, + "outputs": [], + "source": [ + "config = Config(max_depth=10)\n", + "with PyCallGraph(output=GraphvizOutput(), config=config):\n", + " masked_quantile(ma.masked_invalid(X), q, axis=0, method=\"higher\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2acbef74-3f03-457a-b31e-a5dad75d70c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n", + "500\n", + "(1, 500)\n", + "2\n", + "500\n", + "(1, 500)\n", + "2\n", + "500\n", + "(1, 500)\n", + "2\n", + "500\n", + "(1, 500)\n", + "[-118.90258408 209.02916237 -22.18296168 234.40885886 -159.61745268\n", + " -184.84111207 26.85546511 -19.12965127 -162.97950999 -53.36413493]\n", + "(1, 2, 1000)\n" + ] + } + ], + "source": [ + "mapie_reg = MapieRegressor(method=\"minmax\", agg_function=\"mean\", cv=-1)\n", + "alpha = [0.2, 0.8]\n", + "mapie_reg.fit(X, y)\n", + "#y_pred_float1, y_pis_float1 = mapie_reg.predict(X, alpha=alpha[0])\n", + 
"#y_pred_float2, y_pis_float2 = mapie_reg.predict(X, alpha=alpha[1])\n", + "y_pred_array, y_pis_array = mapie_reg.predict(X, alpha=alpha)\n", + "#print(y_pis_float1[0,1,:10])\n", + "#print(y_pis_float2[0,1,:10])\n", + "print(y_pis_array[0,1,:10])\n", + "print(y_pis_array.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f82a3d9-cacc-42c6-9350-0709811a0af9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.44901419, -0.31701145, 0.03759213, -1.03072401, 0.32107287,\n", + " 0.73329445, -0.16373546, -0.58167561, 0.24257418, -0.40065236])" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "X[0,:].flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d03bf675-6fd1-4407-ae2d-0937c2e46df9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "e1d6b69c58a8ab3fab9d4bd10bf376ef86c3438c956e9d4e062e4cc32a9f8bce" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/mapie/regression.py b/mapie/regression.py index 1bceac984..3465fe9a7 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -421,13 +421,14 @@ def aggregate_with_mask(self, x: NDArray, k: NDArray) -> NDArray: ArrayLike of shape (n_samples_test,) Array of aggregated predictions for each testing sample. 
""" - if self.agg_function == "median": - return phi2D(A=x, B=k, fun=lambda x: np.nanmedian(x, axis=1)) if self.cv == "prefit": raise ValueError( "There should not be aggregation of predictions if cv is " "'prefit'" ) + if self.agg_function == "median": + return phi2D(A=x, B=k, fun=lambda x: np.nanmedian(x, axis=1)) + # To aggregate with mean() the aggregation coud be done # with phi2D(A=x, B=k, fun=lambda x: np.nanmean(x, axis=1). # However, phi2D contains a np.apply_along_axis loop which @@ -610,7 +611,7 @@ def predict( alpha_np = cast(NDArray, alpha) check_alpha_and_n_samples(alpha_np, n) if self.method in ["naive", "base"] or self.cv == "prefit": - quantile = masked_quantile( + quantile = np.nanquantile( self.conformity_scores_, 1 - alpha_np, method="higher" ) y_pred_low = y_pred[:, np.newaxis] - quantile @@ -645,8 +646,8 @@ def predict( y_pred_low = np.column_stack( [ - masked_quantile( - ma.masked_invalid(lower_bounds), + np.nanquantile( + lower_bounds, _alpha, axis=1, method="lower", @@ -657,8 +658,8 @@ def predict( y_pred_up = np.column_stack( [ - masked_quantile( - ma.masked_invalid(upper_bounds), + np.nanquantile( + upper_bounds, 1 - _alpha, axis=1, method="higher", diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index e66fe174b..bcf566afc 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -51,18 +51,37 @@ agg_function="mean", cv=KFold(n_splits=3, shuffle=True, random_state=1), ), - "jackknife_plus_ab": Params( - method="plus", + "jackknife_plus_ab_enbpi": Params( + method="enbpi", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_minmax_ab": Params( + "jackknife_minmax_ab_enbpi": Params( method="minmax", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_plus_median_ab": Params( - method="plus", + "jackknife_plus_median_ab_enbpi": Params( + 
method="enbpi", + agg_function="median", + cv=BlockBootstrap( + n_resamplings=30, + n_blocks=5, + random_state=1, + ), + ), + "jackknife_plus_ab_enbpi_no_opt": Params( + method="enbpi", + agg_function="mean", + cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), + ), + "jackknife_minmax_ab_enbpi_no_opt": Params( + method="minmax", + agg_function="mean", + cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), + ), + "jackknife_plus_median_ab_enbpi_no_opt": Params( + method="enbpi", agg_function="median", cv=BlockBootstrap( n_resamplings=30, @@ -70,7 +89,7 @@ random_state=1, ), ), - "jackknife_plus_ab_JAB": Params( + "jackknife_plus_ab_plus": Params( method="plus", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), @@ -101,12 +120,18 @@ "cv_minmax": 3.95, "prefit": 3.89, "cv_plus_median": 3.90, - "jackknife_plus_ab": 3.76, - "jackknife_minmax_ab": 3.96, - "jackknife_plus_median_ab": 3.76, - "jackknife_plus_ab_JAB": 3.76, - "jackknife_minmax_ab_JAB": 3.96, - "jackknife_plus_median_ab_JAB": 3.76, + "jackknife_plus_ab_enbpi": 3.76, + "jackknife_minmax_ab_enbpi": 3.96, + "jackknife_plus_median_ab_enbpi": 3.76, + "jackknife_plus_ab_enbpi_no_opt": 3.76, + "jackknife_minmax_ab_enbpi_no_opt": 3.96, + "jackknife_plus_median_ab_enbpi_no_opt": 3.76, + "jackknife_plus_ab_plus": 3.76, + "jackknife_minmax_ab_plus": 3.96, + "jackknife_plus_median_ab_plus": 3.76, + "jackknife_plus_ab_MR": 3.76, + "jackknife_minmax_ab_MR": 3.96, + "jackknife_plus_median_MR": 3.76, } COVERAGES = { @@ -119,12 +144,18 @@ "cv_minmax": 0.956, "prefit": 0.90, "cv_plus_median": 0.954, - "jackknife_plus_ab": 0.952, - "jackknife_minmax_ab": 0.960, - "jackknife_plus_median_ab": 0.946, - "jackknife_plus_ab_JAB": 0.92, - "jackknife_minmax_ab_JAB": 0.940, - "jackknife_plus_median_ab_JAB": 0.94, + "jackknife_plus_ab_enbpi": 0.952, + "jackknife_minmax_ab_enbpi": 0.960, + "jackknife_plus_median_ab_enbpi": 0.946, + "jackknife_plus_ab_enbpi_no_opt": 0.952, + 
"jackknife_minmax_ab_enbpi_no_opt": 0.960, + "jackknife_plus_median_ab_enbpi_no_opt": 0.946, + "jackknife_plus_ab_plus": 0.92, + "jackknife_minmax_ab_plus": 0.940, + "jackknife_plus_median_ab_plus": 0.94, + "jackknife_plus_ab_MR": 0.92, + "jackknife_minmax_ab_MR": 0.940, + "jackknife_plus_median_MR": 0.94, } diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index 6be7abbed..5382e51f5 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -16,7 +16,6 @@ check_null_weight, check_verbose, fit_estimator, - masked_quantile, ) from mapie._typing import ArrayLike @@ -193,32 +192,3 @@ def test_invalid_verbose(verbose: Any) -> None: def test_valid_verbose(verbose: Any) -> None: """Test that valid verboses raise no errors.""" check_verbose(verbose) - - -def test_masked_quantile_invalid_minus_one(): - with pytest.raises(ValueError, match=r".*axis should be None, 0 or 1.*"): - masked_quantile(a=X_toy, q=0.1, axis=-1) - - -def test_masked_quantile_invalid_one(): - with pytest.raises(ValueError, match=r".*axis 1 is out of bounds.*"): - masked_quantile(a=X_toy.flatten(), q=0.1, axis=1) - - -def test_masked_quantile_linear_interpolation(): - quantiles = masked_quantile( - a=X_toy, q=[0.1, 0.2, 0.5], axis=0, method="linear" - ) - np.testing.assert_allclose(quantiles, np.array([[0.5, 1.0, 2.5]])) - - -def test_masked_quantile_linear_interpolation_scalar(): - quantile = masked_quantile( - a=X_toy.flatten(), q=0.1, axis=0, method="linear" - ) - assert quantile == 0.5 - - -def test_masked_quantile_invalid_method(): - with pytest.raises(ValueError, match=r".*'method' has to be 'higher'.*"): - masked_quantile(a=X_toy.flatten(), q=0.1, axis=0, method="lineaar") diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index cee380170..2cab837c5 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -3,7 +3,6 @@ from typing import Iterable, Optional, Tuple, Union, cast import numpy as np -import 
numpy.ma as ma from sklearn.base import RegressorMixin from sklearn.model_selection import BaseCrossValidator from sklearn.utils import check_array @@ -15,31 +14,30 @@ from .utils import ( check_alpha, check_alpha_and_n_samples, - masked_quantile, ) class MapieTimeSeriesRegressor(MapieRegressor): """ - Prediction interval with out-of-fold residuals for time series. + Prediction intervals with out-of-fold residuals for time series. - This class implements the EnbPI strategy and some variations - for estimating prediction intervals on single-output time series. + This class implements the EnbPI strategy and some variants for estimating + prediction intervals on single-output time series. Actually, EnbPI only corresponds to ``MapieTimeSeriesRegressor`` if the - ``cv`` argument if of type ``BlockBootstrap``. + ``cv`` argument is of type ``BlockBootstrap`` and ``method`` is "enbpi". References ---------- Chen Xu, and Yao Xie. - [6] "Conformal prediction for dynamic time-series." + "Conformal prediction for dynamic time-series." https://arxiv.org/abs/2010.09107 """ def __init__( self, estimator: Optional[RegressorMixin] = None, - method: str = "plus", + method: str = "enbpi", cv: Optional[Union[int, str, BaseCrossValidator]] = None, n_jobs: Optional[int] = None, agg_function: Optional[str] = "mean", @@ -47,6 +45,30 @@ def __init__( ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) self.cv_need_agg_function.append("BlockBootstrap") + self.valid_methods_.append("enbpi") + + def _relative_conformity_scores( + self, + X: ArrayLike, + y: ArrayLike, + ) -> ArrayLike: + """ + Compute the conformity scores on a data set. + + Parameters + ---------- + X : ArrayLike of shape (n_samples, n_features) + Input data. + + y : ArrayLike of shape (n_samples,) + Input labels. + + Returns + ------- + The conformity scores corresponding to the input data set. 
+ """ + y_pred, _ = super().predict(X, alpha=0.5, ensemble=True) + return np.asarray(y) - np.asarray(y_pred) def fit( self, @@ -55,14 +77,17 @@ def fit( sample_weight: Optional[ArrayLike] = None, ) -> MapieTimeSeriesRegressor: """ + Compare to the method ``fit`` of ``MapieRegressor``, the ``fit`` method + of ``MapieTimeSeriesRegressor`` computes the ``conformity_scores_`` + with relative values. + Returns ------- MapieTimeSeriesRegressor The model itself. """ self = super().fit(X=X, y=y, sample_weight=sample_weight) - y_pred, _ = super().predict(X, alpha=0.5, ensemble=True) - self.conformity_scores_ = np.asarray(y) - y_pred + self.conformity_scores_ = self._relative_conformity_scores(X, y) return self def partial_fit( @@ -71,15 +96,16 @@ def partial_fit( y: ArrayLike, ) -> MapieTimeSeriesRegressor: """ - Update the ``conformity_scores_`` attribute when data with known labels - are available. + Update the ``conformity_scores_`` and ``k_`` attributes when new data + with known labels are available. + Note: Don't use ``partial_fit`` with samples of the training set. Parameters ---------- - X : ArrayLike of shape (n_samples, n_features) + X : ArrayLike of shape (n_samples_test, n_features) Input data. - y : ArrayLike of shape (n_samples,) + y : ArrayLike of shape (n_samples_test,) Input labels. Returns @@ -87,43 +113,135 @@ def partial_fit( MapieTimeSeriesRegressor The model itself. 
""" - y_pred, _ = self.predict(X, alpha=0.5, ensemble=True) - new_conformity_scores_ = np.asarray(y) - np.asarray(y_pred) - new_conformity_scores_ = new_conformity_scores_[ - ~np.isnan(new_conformity_scores_) - ] - - cut_index = min( - len(new_conformity_scores_), len(self.conformity_scores_) - ) + if len(X) > len(self.conformity_scores_): + raise ValueError("You try to update more residuals than tere are!") + new_conformity_scores_ = self._relative_conformity_scores(X, y) self.conformity_scores_ = np.concatenate( [ - self.conformity_scores_[cut_index:], + self.conformity_scores_[-len(new_conformity_scores_) :], new_conformity_scores_, - ], - axis=0, + ] ) + self.k_[:, -len(new_conformity_scores_)] = 1.0 return self + def _beta_optimize( + self, + alpha: NDArray, + upper_bounds: NDArray, + lower_bounds: NDArray, + beta_optimize: bool = True, + ) -> NDArray: + """ + ``_beta_optimize`` offers to minimize the width of the PIs, for a given + difference of quantiles. + + Parameters + ---------- + alpha: Optional[NDArray] + The quantiles to compute. + upper_bounds: NDArray + The array of upper values. + lower_bounds: NDArray + The array of lower values. + optimize: bool + Whether to optimize or not. If ``False``, betas are the half of + alphas. + + Returns + ------- + NDArray + Array of betas minimizing the differences + ``(1-alpa+beta)-quantile - beta-quantile``. + """ + if lower_bounds.shape != upper_bounds.shape: + raise ValueError( + "Lower and upper bounds arrays should have the same shape." 
+ ) + betas_0 = np.full( + shape=(len(alpha), len(lower_bounds)), + fill_value=np.nan, + dtype=float, + ) + if not beta_optimize: + for ind_alpha, _alpha in enumerate(alpha): + betas_0[ind_alpha, :] = _alpha / 2.0 + return betas_0 + + for ind_alpha, _alpha in enumerate(alpha): + betas = np.linspace( + _alpha / (len(lower_bounds) + 1), + _alpha, + num=len(lower_bounds), + endpoint=False, + ) + one_alpha_beta = np.nanquantile( + upper_bounds, + 1 - _alpha + betas, + axis=1, + method="higher", + ) # type: ignore + beta = np.nanquantile( + lower_bounds, + betas, + axis=1, + method="lower", + ) # type: ignore + if len(betas_0.shape) == 2: + betas_0[ind_alpha, :] = betas[ + np.argmin(one_alpha_beta - beta, axis=0) + ] + else: + betas_0[ind_alpha] = betas[ + np.argmin(one_alpha_beta - beta, axis=0)[0] + ] + return betas_0 + + def _pred_multi(self, X: NDArray) -> NDArray: + """ + Return a prediction per train sample for each test sample, by + aggregation with matrix ``k_``. + + Parameters + ---------- + X: NDArray of shape (n_samples_test, n_features) + Input data + + Returns + ------- + NDArray of shape (n_samples_test, n_samples_train) + """ + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size + # thanks to the shape of k_. + + y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + return y_pred_multi + def predict( self, X: ArrayLike, ensemble: bool = False, alpha: Optional[Union[float, Iterable[float]]] = None, - JAB_Like=False, + beta_optimize: bool = True, ) -> Union[NDArray, Tuple[NDArray, NDArray]]: """ - ``predict`` method correspond to the ``MapieRegressor``'s one with the - argument ``JAB_Like`` in more. In case ``JAB_Like`` is ``False``, - predictions correspond to [6]. In case it is True, is is another - implementation inspired of [2] with wider PIs. 
The second method is - much slower because of PIwise optimization. + Correspond to the ``MapieRegressor``'s one with the + method ``'plus'``. In case ``method`` is ``'enbpi'``, predictions + correspond to 'Conformal prediction for dynamic time-series'. The + method ``'plus'`` is slower because of PI-wise optimization. However, + you can choose not to optimize the width of the PIs by setting + ``beta_optimize`` to ``False``. Parameters ---------- - JAB_Like : boolean, default False - Whether to use the implementation of [6] or an implementation - closer to [2]. + beta_optimize: bool + Whether to optimize the PIs' width or not. + """ # Checks @@ -132,194 +250,135 @@ def predict( alpha = cast(Optional[NDArray], check_alpha(alpha)) X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) y_pred = self.single_estimator_.predict(X) - n = len(self.conformity_scores_) if alpha is None: return np.array(y_pred) else: alpha_np = cast(NDArray, alpha) - check_alpha_and_n_samples(alpha_np, n) + check_alpha_and_n_samples(alpha_np, len(self.conformity_scores_)) - if ( - (not JAB_Like) - or (self.method in ["naive", "base"]) - or (self.cv == "prefit") + if (self.method in ["enbpi", "naive", "base"]) or ( + self.cv == "prefit" ): - # This version of predict corresponds to [6]. - # Its PIs are closed to the oracle's ones. 
- betas_0 = np.full_like(alpha_np, np.nan, dtype=float) - for ind, _alpha in enumerate(alpha_np): - betas = np.linspace( - _alpha / (n + 1), _alpha, num=n + 1, endpoint=False - ) - - one_alpha_beta = masked_quantile( - ma.masked_invalid(self.conformity_scores_), - 1 - _alpha + betas, - axis=0, - method="higher", - ) # type: ignore - - beta = masked_quantile( - ma.masked_invalid(self.conformity_scores_), - betas, - axis=0, - method="lower", - ) # type: ignore - - betas_0[ind] = betas[ - np.argmin(one_alpha_beta - beta, axis=1) - ] - - lower_quantiles = masked_quantile( - ma.masked_invalid(self.conformity_scores_), - betas_0, + betas_0 = self._beta_optimize( + alpha=alpha_np, + lower_bounds=self.conformity_scores_, + upper_bounds=self.conformity_scores_, + beta_optimize=beta_optimize, + ) + lower_quantiles = np.nanquantile( + self.conformity_scores_, + betas_0[:, 0], axis=0, method="lower", ).T # type: ignore - higher_quantiles = masked_quantile( - ma.masked_invalid(self.conformity_scores_), - 1 - alpha_np + betas_0, + higher_quantiles = np.nanquantile( + self.conformity_scores_, + 1 - alpha_np + betas_0[:, 0], axis=0, method="higher", ).T # type: ignore if (self.method in ["naive", "base"]) or (self.cv == "prefit"): + y_pred_low = y_pred[:, np.newaxis] + lower_quantiles + y_pred_up = y_pred[:, np.newaxis] + higher_quantiles + else: # method == "enbpi" + # Correspond to "Conformal prediction for dynamic time + # series". + # Its PIs are closed to the oracle's ones if beta_optimized + # is True. 
+ y_pred_multi = self._pred_multi(X) + pred = aggregate_all(self.agg_function, y_pred_multi) y_pred_low = np.column_stack( [ - y_pred[:, np.newaxis] + lower_quantiles[k] - for k in range(len(alpha_np)) + pred + lower_quantiles[k] + for k, _ in enumerate(alpha_np) ] ) y_pred_up = np.column_stack( [ - y_pred[:, np.newaxis] + higher_quantiles[k] - for k in range(len(alpha_np)) + pred + higher_quantiles[k] + for k, _ in enumerate(alpha_np) ] ) - else: - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size thanks - # to the shape of k_. - y_pred_multi = self.aggregate_with_mask( - y_pred_multi, self.k_ + if self.method == "minmax": + lower_bounds = np.min(y_pred_multi, axis=1, keepdims=True) + upper_bounds = np.max(y_pred_multi, axis=1, keepdims=True) + y_pred_low = np.column_stack( + [ + lower_bounds + lower_quantiles[k] + for k, _ in enumerate(alpha_np) + ] ) - - if self.method == "plus": - pred = aggregate_all(self.agg_function, y_pred_multi) - y_pred_low = np.column_stack( - [ - pred + lower_quantiles[k] - for k in range(len(alpha_np)) - ] - ) - y_pred_up = np.column_stack( - [ - pred + higher_quantiles[k] - for k in range(len(alpha_np)) - ] - ) - - if self.method == "minmax": - lower_bounds = np.min( - y_pred_multi, axis=1, keepdims=True - ) - upper_bounds = np.max( - y_pred_multi, axis=1, keepdims=True - ) - y_pred_low = np.column_stack( - [ - lower_bounds + lower_quantiles[k] - for k in range(len(alpha_np)) - ] - ) - y_pred_up = np.column_stack( - [ - upper_bounds + higher_quantiles[k] - for k in range(len(alpha_np)) - ] - ) - else: - # This version of predict corresponds to [2]. - # Its PIs are wider. It does not coorespond to [6]. It is - # a try. It is slower because the betas - # (width optimization parameters of the PIs) are optimized at - # every points. 
- - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size - # thanks to the shape of k_. - - y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) - + y_pred_up = np.column_stack( + [ + upper_bounds + higher_quantiles[k] + for k, _ in enumerate(alpha_np) + ] + ) + elif self.method == "plus": + # This version of predict corresponds to "Predictive Inference + # Is Free with the Jackknife+-after-Bootstrap.". + # Its PIs are wider. It does not coorespond to "Conformal + # prediction for dynamic time series". It is a try. It is + # slower because the betas (width optimization parameters of + # the PIs) are optimized for every point. + + y_pred_multi = self._pred_multi(X) y_pred_low = np.empty((len(y_pred), len(alpha)), dtype=float) y_pred_up = np.empty((len(y_pred), len(alpha)), dtype=float) + lower_bounds = y_pred_multi + self.conformity_scores_ + upper_bounds = y_pred_multi + self.conformity_scores_ + + betas_0 = self._beta_optimize( + alpha=alpha_np, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + beta_optimize=beta_optimize, + ) + for ind_alpha, _alpha in enumerate(alpha_np): - betas = np.linspace( - _alpha / (n + 1), _alpha, num=n + 1, endpoint=False - ) + lower_quantiles = np.empty((betas_0.shape[1],)) + upper_quantiles = np.empty((betas_0.shape[1],)) - if self.method == "plus": - lower_bounds = y_pred_multi + self.conformity_scores_ - upper_bounds = y_pred_multi + self.conformity_scores_ - - if self.method == "minmax": - lower_bounds = np.min( - y_pred_multi, axis=1, keepdims=True - ) - upper_bounds = np.max( - y_pred_multi, axis=1, keepdims=True - ) - lower_bounds = lower_bounds + self.conformity_scores_ - upper_bounds = upper_bounds + self.conformity_scores_ - - one_alpha_beta = masked_quantile( - ma.masked_invalid(upper_bounds), - 1 - _alpha + betas, - axis=1, - 
method="higher", - ).T # type: ignore - - beta = masked_quantile( - ma.masked_invalid(lower_bounds), - betas, - axis=1, - method="lower", - ).T # type: ignore - - betas_0 = betas[np.argmin(one_alpha_beta - beta, axis=0)] - - lower_quantiles = np.empty((len(betas_0),)) - upper_quantiles = np.empty((len(betas_0),)) - - for ind, beta_0 in enumerate(betas_0): - lower_quantiles[ind] = masked_quantile( - ma.masked_invalid(lower_bounds[ind, :]), + for ind_beta_0, beta_0 in enumerate(betas_0[ind_alpha, :]): + lower_quantiles[ind_beta_0] = np.nanquantile( + lower_bounds[ind_beta_0, :], beta_0, axis=0, method="lower", - ).T # type: ignore + ) # type: ignore - upper_quantiles[ind] = masked_quantile( - ma.masked_invalid(upper_bounds[ind, :]), + upper_quantiles[ind_beta_0] = np.nanquantile( + upper_bounds[ind_beta_0, :], 1 - _alpha + beta_0, axis=0, method="higher", - ).T # type: ignore - y_pred_low[:, ind_alpha] = lower_quantiles - y_pred_up[:, ind_alpha] = upper_quantiles + ) # type: ignore + y_pred_low[:, ind_alpha] = lower_quantiles + y_pred_up[:, ind_alpha] = upper_quantiles if ensemble: y_pred = aggregate_all(self.agg_function, y_pred_multi) return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) + + def root_predict( + self, + X: ArrayLike, + ensemble: bool = False, + alpha: Optional[Union[float, Iterable[float]]] = None, + ) -> Union[NDArray, Tuple[NDArray, NDArray]]: + """ + ``root_predict`` method correspond to the one of ``MapieRegressor``'s. 
+ """ + conformity_scores_save = self.conformity_scores_.copy() + self.conformity_scores_ = np.abs(self.conformity_scores_) + if alpha is None: + y_pred = super().predict(X=X, ensemble=ensemble, alpha=alpha) + self.conformity_scores_ = conformity_scores_save + return y_pred + y_pred, y_pis = super().predict(X=X, ensemble=ensemble, alpha=alpha) + self.conformity_scores_ = conformity_scores_save + return y_pred, y_pis diff --git a/mapie/utils.py b/mapie/utils.py index 23158a9ac..e7be5d6c8 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -424,112 +424,3 @@ def check_input_is_image(X: ArrayLike) -> None: "When X is an image, the number of dimensions" "must be equal to 3 or 4." ) - - -def masked_quantile( - a: NDArray, - q: Union[float, NDArray], - axis: Optional[int] = None, - method: str = "linear", -) -> NDArray: - """ - Compute quantile for masked arrays. It avoids using ``np.nanquantile`` that - is quite slow because of an ``apply_along_axis`` loop. - - Parameters - ---------- - a: NDArray - Array of data. - - q: Union[float, NDArray] - Quantiles. - - axis:Optional[int] - ``axis`` is ``None``, ``0`` or ``1``, default ``None``. - If ``axis`` is ``None``, compute the quantiles of the flatten array. 
- - method: str - "linear", "higher" or "lower" - """ - return_float = False - flatten = False - - if isinstance(q, float): - flatten = True - if (len(a.shape)) == 1 or (len(a) == 1): - return_float = True - - q = cast(NDArray, check_alpha(q)) - if (len(a.shape) == 1) or (a.shape[1] == 1): - if axis == 1: - raise ValueError( - "axis 1 is out of bounds for array of dimension 1" - ) - if len(q) == 1: - flatten = True - - if axis is None: - a = a.flatten() - axis = 0 - - if axis == 0: - if (len(a.shape) == 1) or (a.shape[1] == 1): - a = a.reshape(-1, 1) - if hasattr(a, "mask"): - a_m = cast(ma.MaskedArray, a) - a_np = np.where(a_m.mask, np.inf, a_m.data) - masked_sum = np.sum(a_m.mask, axis=0).astype(int) - else: - a_np = a - masked_sum = np.zeros((a.shape[1],)) - a_np = np.sort(a_np, axis=0) - grid = np.indices(a_np.shape)[0] - a_tile = np.tile(a_np, (len(q), 1, 1)) - grid_tile = np.tile(grid, (len(q), 1, 1)) - - nb_valid_values = (len(a_np) - masked_sum).astype(int) - q_out = np.outer(nb_valid_values - 1, q) - q_out_inf = np.floor(q_out).astype(int) - q_out_sup = np.ceil(q_out) - q_out_sup = np.minimum( - q_out_sup, np.outer(nb_valid_values - 1, np.ones(len(q))) - ).astype(int) - - inf_bool = np.equal( - grid_tile, - np.tile(q_out_inf.T, (1, 1, a_np.shape[0])) - .reshape(len(q), *a_np.shape) - .astype(int), - ) - sup_bool = np.equal( - grid_tile, - np.tile(q_out_sup.T, (1, 1, a_np.shape[0])).reshape( - (len(q), *a_np.shape) - ), - ) - - lower_values = a_tile[inf_bool].reshape(len(q), -1) - upper_values = a_tile[sup_bool].reshape(len(q), -1) - - if method == "lower": - values = lower_values - elif method == "higher": - values = upper_values - elif method == "linear": - values = lower_values + (q_out - q_out_inf).T * ( - upper_values - lower_values - ) - else: - raise ValueError("'method' has to be 'higher','lower', or'linear'") - if flatten: - values = values.flatten() - else: - values = values.T - - if return_float: - values = values[0] - return values - elif axis 
== 1: - return masked_quantile(a.T, q, axis=0, method=method) - else: - raise ValueError("axis should be None, 0 or 1") From 81c46793e712a53360f129152a813cdb8c5fa7ee Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Thu, 14 Apr 2022 11:53:42 +0200 Subject: [PATCH 17/32] [REFACTO] new version of enbpi refactorized --- .../plot_MapieRegressor_benchmark.py | 1 - examples/regression/plot_timeseries_enbpi.py | 50 +++--- mapie/regression.py | 6 +- mapie/tests/test_time_series_regression.py | 156 ++++++++---------- mapie/time_series_regression.py | 100 ++++++----- mapie/utils.py | 1 - 6 files changed, 143 insertions(+), 171 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py b/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py index f899a9ea7..35bd02e2a 100644 --- a/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py +++ b/examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py @@ -6,7 +6,6 @@ prediction intervals capturing both aleatoric and epistemic uncertainties on a one-dimensional dataset with homoscedastic noise and normal sampling. 
""" -from ast import Sub from typing import Any, Callable, Tuple, TypeVar, Union from typing_extensions import TypedDict diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index fa167cad9..05ca7350b 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -105,14 +105,14 @@ for step in range(gap, len(X_test), gap): mapie_enpbi.partial_fit( - X_test.iloc[(step - gap) : step, :], - y_test.iloc[(step - gap) : step], + X_test.iloc[(step - gap):step, :], + y_test.iloc[(step - gap):step], ) ( - y_pred_pfit_enbpi[step : step + gap], - y_pis_pfit_enbpi[step : step + gap, :, :], + y_pred_pfit_enbpi[step:step + gap], + y_pis_pfit_enbpi[step:step + gap, :, :], ) = mapie_enpbi.predict( - X_test.iloc[step : (step + gap), :], + X_test.iloc[step:(step + gap), :], alpha=alpha, ensemble=True, beta_optimize=True, @@ -137,14 +137,14 @@ for step in range(gap, len(X_test), gap): mapie_enpbi.partial_fit( - X_test.iloc[(step - gap) : step, :], - y_test.iloc[(step - gap) : step], + X_test.iloc[(step - gap):step, :], + y_test.iloc[(step - gap):step], ) ( - y_pred_pfit_enbpi_no_opt[step : step + gap], - y_pis_pfit_enbpi_no_opt[step : step + gap, :, :], + y_pred_pfit_enbpi_no_opt[step:step + gap], + y_pis_pfit_enbpi_no_opt[step:step + gap, :, :], ) = mapie_enpbi.predict( - X_test.iloc[step : (step + gap), :], + X_test.iloc[step:step + gap, :], alpha=alpha, ensemble=True, beta_optimize=False, @@ -168,14 +168,14 @@ ) for step in range(gap, len(X_test), gap): mapie_plus.partial_fit( - X_test.iloc[(step - gap) : step, :], - y_test.iloc[(step - gap) : step], + X_test.iloc[step - gap:step, :], + y_test.iloc[step - gap:step], ) ( - y_pred_pfit_plus[step : step + gap], - y_pis_pfit_plus[step : step + gap, :, :], + y_pred_pfit_plus[step:step + gap], + y_pis_pfit_plus[step:step + gap, :, :], ) = mapie_plus.predict( - X_test.iloc[step : (step + gap), :], + X_test.iloc[step:step + gap, :], 
alpha=alpha, ensemble=True, beta_optimize=True, @@ -200,14 +200,14 @@ ) for step in range(gap, len(X_test), gap): mapie_plus.partial_fit( - X_test.iloc[(step - gap) : step, :], - y_test.iloc[(step - gap) : step], + X_test.iloc[step - gap:step, :], + y_test.iloc[step - gap: step], ) ( - y_pred_pfit_plus_no_opt[step : step + gap], - y_pis_pfit_plus_no_opt[step : step + gap, :, :], + y_pred_pfit_plus_no_opt[step:step + gap], + y_pis_pfit_plus_no_opt[step:step + gap, :, :], ) = mapie_plus.predict( - X_test.iloc[step : (step + gap), :], + X_test.iloc[step:step + gap, :], alpha=alpha, ensemble=True, beta_optimize=False, @@ -229,14 +229,14 @@ ) for step in range(gap, len(X_test), gap): mapie_plus.partial_fit( - X_test.iloc[(step - gap) : step, :], - y_test.iloc[(step - gap) : step], + X_test.iloc[step - gap:step, :], + y_test.iloc[step - gap:step], ) ( - y_pred_pfit_MR[step : step + gap], - y_pis_pfit_MR[step : step + gap, :, :], + y_pred_pfit_MR[step:step + gap], + y_pis_pfit_MR[step:step + gap, :, :], ) = mapie_plus.root_predict( - X_test.iloc[step : (step + gap), :], + X_test.iloc[step:step + gap, :], alpha=alpha, ensemble=True, ) diff --git a/mapie/regression.py b/mapie/regression.py index bf8b02109..ae7a48adc 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -4,7 +4,6 @@ from joblib import Parallel, delayed import numpy as np -import numpy.ma as ma from sklearn.base import BaseEstimator, RegressorMixin, clone from sklearn.linear_model import LinearRegression from sklearn.model_selection import BaseCrossValidator @@ -29,9 +28,7 @@ check_null_weight, check_verbose, fit_estimator, - masked_quantile, ) -from ._compatibility import np_quantile class MapieRegressor(BaseEstimator, RegressorMixin): @@ -188,6 +185,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): cv_need_agg_function = ["Subsample"] valid_methods_ = ["naive", "base", "plus", "minmax"] + plus_like_method = ["plus"] valid_agg_functions_ = [None, "median", "mean"] fit_attributes = [ 
"single_estimator_", @@ -635,7 +633,7 @@ def predict( y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) - if self.method == "plus": + if self.method in self.plus_like_method: lower_bounds = y_pred_multi - self.conformity_scores_ upper_bounds = y_pred_multi + self.conformity_scores_ diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index bcf566afc..1016f8d3f 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -19,9 +19,8 @@ X_toy = np.array(range(5)).reshape(-1, 1) y_toy = (5.0 + 2.0 * X_toy ** 1.1).flatten() X, y = make_regression(n_samples=500, n_features=10, noise=1.0, random_state=1) -X_short, y_short = X[:50, :], y[:50] k = np.ones(shape=(5, X.shape[1])) -METHODS = ["naive", "base", "plus", "minmax"] +METHODS = ["base", "enbpi", "minmax", "naive", "plus"] Params = TypedDict( "Params", @@ -51,17 +50,17 @@ agg_function="mean", cv=KFold(n_splits=3, shuffle=True, random_state=1), ), - "jackknife_plus_ab_enbpi": Params( + "jackknife_enbpi_ab_wopt": Params( method="enbpi", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_minmax_ab_enbpi": Params( + "jackknife_minmax_ab": Params( method="minmax", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_plus_median_ab_enbpi": Params( + "jackknife_enbpi_median_ab_wopt": Params( method="enbpi", agg_function="median", cv=BlockBootstrap( @@ -70,17 +69,12 @@ random_state=1, ), ), - "jackknife_plus_ab_enbpi_no_opt": Params( + "jackknife_enbpi_ab": Params( method="enbpi", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_minmax_ab_enbpi_no_opt": Params( - method="minmax", - agg_function="mean", - cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), - ), - "jackknife_plus_median_ab_enbpi_no_opt": Params( + "jackknife_enbpi_median_ab": Params( 
method="enbpi", agg_function="median", cv=BlockBootstrap( @@ -89,17 +83,17 @@ random_state=1, ), ), - "jackknife_plus_ab_plus": Params( + "jackknife_plus_ab_MR": Params( method="plus", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_minmax_ab_JAB": Params( + "jackknife_minmax_ab_MR": Params( method="minmax", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_plus_median_ab_JAB": Params( + "jackknife_plus_ab_median_MR": Params( method="plus", agg_function="median", cv=BlockBootstrap( @@ -111,51 +105,50 @@ } WIDTHS = { - "naive": 3.76, - "jackknife": 3.76, - "jackknife_plus": 3.76, - "jackknife_minmax": 3.82, - "cv": 3.76, - "cv_plus": 3.76, - "cv_minmax": 3.95, - "prefit": 3.89, + "naive": 3.83, + "jackknife": 3.83, + "jackknife_plus": 3.82, + "jackknife_minmax": 3.90, + "cv": 3.83, + "cv_plus": 3.87, + "cv_minmax": 4.03, + "prefit": 4.79, "cv_plus_median": 3.90, - "jackknife_plus_ab_enbpi": 3.76, - "jackknife_minmax_ab_enbpi": 3.96, - "jackknife_plus_median_ab_enbpi": 3.76, - "jackknife_plus_ab_enbpi_no_opt": 3.76, - "jackknife_minmax_ab_enbpi_no_opt": 3.96, - "jackknife_plus_median_ab_enbpi_no_opt": 3.76, - "jackknife_plus_ab_plus": 3.76, - "jackknife_minmax_ab_plus": 3.96, - "jackknife_plus_median_ab_plus": 3.76, - "jackknife_plus_ab_MR": 3.76, + "jackknife_enbpi_ab_wopt": 3.76, + "jackknife_minmax_ab": 3.96, + "jackknife_enbpi_median_ab_wopt": 3.76, + "jackknife_enbpi_ab": 3.76, + "jackknife_minmax_ab": 3.96, + "jackknife_enbpi_median_ab": 3.76, + "jackknife_plus_ab": 3.76, + "jackknife_plus_median_ab": 3.83, + "jackknife_plus_ab_MR": 3.82, "jackknife_minmax_ab_MR": 3.96, - "jackknife_plus_median_MR": 3.76, + "jackknife_plus_ab_median_MR": 3.83, } COVERAGES = { "naive": 0.952, "jackknife": 0.952, - "jackknife_plus": 0.952, + "jackknife_plus": 0.94, "jackknife_minmax": 0.952, "cv": 0.958, "cv_plus": 0.956, - "cv_minmax": 0.956, - "prefit": 0.90, + "cv_minmax": 
0.966, + "prefit": 0.98, "cv_plus_median": 0.954, - "jackknife_plus_ab_enbpi": 0.952, - "jackknife_minmax_ab_enbpi": 0.960, - "jackknife_plus_median_ab_enbpi": 0.946, - "jackknife_plus_ab_enbpi_no_opt": 0.952, - "jackknife_minmax_ab_enbpi_no_opt": 0.960, - "jackknife_plus_median_ab_enbpi_no_opt": 0.946, + "jackknife_enbpi_ab_wopt": 0.952, + "jackknife_minmax_ab": 0.960, + "jackknife_enbpi_median_ab_wopt": 0.946, + "jackknife_enbpi_ab": 0.952, + "jackknife_minmax_ab": 0.960, + "jackknife_enbpi_median_ab": 0.946, "jackknife_plus_ab_plus": 0.92, "jackknife_minmax_ab_plus": 0.940, "jackknife_plus_median_ab_plus": 0.94, - "jackknife_plus_ab_MR": 0.92, - "jackknife_minmax_ab_MR": 0.940, - "jackknife_plus_median_MR": 0.94, + "jackknife_plus_ab_MR": 0.954, + "jackknife_minmax_ab_MR": 0.958, + "jackknife_plus_ab_median_MR": 0.954, } @@ -246,15 +239,6 @@ def test_results_single_and_multi_jobs(strategy: str) -> None: np.testing.assert_allclose(y_pred_single, y_pred_multi) np.testing.assert_allclose(y_pis_single, y_pis_multi) - y_pred_single_JAB, y_pis_single_JAB = mapie_single.predict( - X_toy, alpha=0.2, JAB_Like=True - ) - y_pred_multi_JAB, y_pis_multi_JAB = mapie_multi.predict( - X_toy, alpha=0.2, JAB_Like=True - ) - np.testing.assert_allclose(y_pred_single_JAB, y_pred_multi_JAB) - np.testing.assert_allclose(y_pis_single_JAB, y_pis_multi_JAB) - @pytest.mark.parametrize("strategy", [*STRATEGIES]) def test_results_with_constant_sample_weights(strategy: str) -> None: @@ -277,23 +261,8 @@ def test_results_with_constant_sample_weights(strategy: str) -> None: np.testing.assert_allclose(y_pis0, y_pis1) np.testing.assert_allclose(y_pis1, y_pis2) - y_pred0_JAB, y_pis0_JAB = mapie0.predict( - X_short, alpha=0.05, JAB_Like=True - ) - y_pred1_JAB, y_pis1_JAB = mapie1.predict( - X_short, alpha=0.05, JAB_Like=True - ) - y_pred2_JAB, y_pis2_JAB = mapie2.predict( - X_short, alpha=0.05, JAB_Like=True - ) - np.testing.assert_allclose(y_pred0_JAB, y_pred1_JAB) - 
np.testing.assert_allclose(y_pred1_JAB, y_pred2_JAB) - np.testing.assert_allclose(y_pis0_JAB, y_pis1_JAB) - np.testing.assert_allclose(y_pis1_JAB, y_pis2_JAB) - - -@pytest.mark.parametrize("method", ["plus", "minmax"]) +@pytest.mark.parametrize("method", ["plus", "minmax", "enbpi"]) @pytest.mark.parametrize("cv", [-1, 2, 3, 5]) @pytest.mark.parametrize("agg_function", ["mean", "median"]) @pytest.mark.parametrize("alpha", [0.05, 0.1, 0.2]) @@ -308,24 +277,13 @@ def test_prediction_agg_function( method=method, cv=cv, agg_function=agg_function ) mapie.fit(X, y) - y_pred_1, y_pis_1 = mapie.predict(X_short, ensemble=True, alpha=alpha) - y_pred_2, y_pis_2 = mapie.predict(X_short, ensemble=False, alpha=alpha) + y_pred_1, y_pis_1 = mapie.predict(X, ensemble=True, alpha=alpha) + y_pred_2, y_pis_2 = mapie.predict(X, ensemble=False, alpha=alpha) np.testing.assert_allclose(y_pis_1[:, 0, 0], y_pis_2[:, 0, 0]) np.testing.assert_allclose(y_pis_1[:, 1, 0], y_pis_2[:, 1, 0]) with pytest.raises(AssertionError): np.testing.assert_allclose(y_pred_1, y_pred_2) - y_pred_1_JAB, y_pis_1_JAB = mapie.predict( - X_short, ensemble=True, alpha=alpha, JAB_Like=True - ) - y_pred_2_JAB, y_pis_2_JAB = mapie.predict( - X_short, ensemble=False, alpha=alpha, JAB_Like=True - ) - np.testing.assert_allclose(y_pis_1_JAB[:, 0, 0], y_pis_2_JAB[:, 0, 0]) - np.testing.assert_allclose(y_pis_1_JAB[:, 1, 0], y_pis_2_JAB[:, 1, 0]) - with pytest.raises(AssertionError): - np.testing.assert_allclose(y_pred_1_JAB, y_pred_2_JAB) - @pytest.mark.parametrize("strategy", [*STRATEGIES]) def test_linear_regression_results(strategy: str) -> None: @@ -336,14 +294,16 @@ def test_linear_regression_results(strategy: str) -> None: """ mapie_ts = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) mapie_ts.fit(X, y) - if "JAB" in strategy: - _, y_pis = mapie_ts.predict(X_short, alpha=0.05, JAB_Like=True) + if "opt" in strategy: + beta_optimize = True else: - _, y_pis = mapie_ts.predict(X, alpha=0.05, JAB_Like=False) + beta_optimize 
= False + _, y_pis = mapie_ts.predict(X, alpha=0.05, beta_optimize=beta_optimize) y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0] width_mean = (y_pred_up - y_pred_low).mean() - if "JAB" in strategy: - coverage = regression_coverage_score(y_short, y_pred_low, y_pred_up) + + if mapie_ts.method == "plus": + coverage = regression_coverage_score(y, y_pred_low, y_pred_up) else: coverage = regression_coverage_score(y, y_pred_low, y_pred_up) np.testing.assert_allclose(width_mean, WIDTHS[strategy], rtol=1e-2) @@ -470,6 +430,8 @@ def test_MapieTimeSeriesRegressor_alpha_is_None() -> None: with pytest.raises(ValueError, match=r".*too many values to unpackt*"): y_pred, y_pis = mapie_ts_reg.predict(X_toy, alpha=None) + with pytest.raises(ValueError, match=r".*too many values to unpackt*"): + y_pred, y_pis = mapie_ts_reg.root_predict(X_toy, alpha=None) def test_MapieTimeSeriesRegressor_partial_fit_ensemble() -> None: @@ -484,3 +446,19 @@ def test_MapieTimeSeriesRegressor_partial_fit_ensemble() -> None: assert round(mapie_ts_reg.conformity_scores_[-1], 2) == round( 17.5 - 18.665, 2 ) + + +def test_MapieTimeSeriesRegressor_partial_fit_two_big() -> None: + """Test ``partial_fit`` raised error.""" + mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) + with pytest.raises(ValueError, match=r".*You try to update more*"): + mapie_ts_reg = mapie_ts_reg.partial_fit(X=X, y=y) + + +def test_MapieTimeSeriesRegressor_beta_optimize_eeror() -> None: + """Test ``partial_fit`` raised error.""" + mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1) + with pytest.raises(ValueError, match=r".*Lower and upper bounds arrays*"): + mapie_ts_reg._beta_optimize( + alpha=0.1, upper_bounds=X, lower_bounds=X_toy, beta_optimize=True + ) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 2cab837c5..27f3b18b1 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -46,12 +46,13 @@ def __init__( super().__init__(estimator, method, cv, 
n_jobs, agg_function, verbose) self.cv_need_agg_function.append("BlockBootstrap") self.valid_methods_.append("enbpi") + self.plus_like_method.append("enbpi") def _relative_conformity_scores( self, - X: ArrayLike, - y: ArrayLike, - ) -> ArrayLike: + X: NDArray, + y: NDArray, + ) -> NDArray: """ Compute the conformity scores on a data set. @@ -67,7 +68,7 @@ def _relative_conformity_scores( ------- The conformity scores corresponding to the input data set. """ - y_pred, _ = super().predict(X, alpha=0.5, ensemble=True) + y_pred, _ = self.root_predict(X, alpha=0.5, ensemble=True) return np.asarray(y) - np.asarray(y_pred) def fit( @@ -113,12 +114,14 @@ def partial_fit( MapieTimeSeriesRegressor The model itself. """ + X = cast(NDArray, X) + y = cast(NDArray, y) if len(X) > len(self.conformity_scores_): raise ValueError("You try to update more residuals than tere are!") new_conformity_scores_ = self._relative_conformity_scores(X, y) self.conformity_scores_ = np.concatenate( [ - self.conformity_scores_[-len(new_conformity_scores_) :], + self.conformity_scores_[-len(new_conformity_scores_):], new_conformity_scores_, ] ) @@ -127,10 +130,10 @@ def partial_fit( def _beta_optimize( self, - alpha: NDArray, + alpha: Union[float, NDArray], upper_bounds: NDArray, lower_bounds: NDArray, - beta_optimize: bool = True, + beta_optimize: bool = False, ) -> NDArray: """ ``_beta_optimize`` offers to minimize the width of the PIs, for a given @@ -158,6 +161,7 @@ def _beta_optimize( raise ValueError( "Lower and upper bounds arrays should have the same shape." 
) + alpha = cast(NDArray, alpha) betas_0 = np.full( shape=(len(alpha), len(lower_bounds)), fill_value=np.nan, @@ -173,7 +177,7 @@ def _beta_optimize( _alpha / (len(lower_bounds) + 1), _alpha, num=len(lower_bounds), - endpoint=False, + endpoint=True, ) one_alpha_beta = np.nanquantile( upper_bounds, @@ -187,17 +191,13 @@ def _beta_optimize( axis=1, method="lower", ) # type: ignore - if len(betas_0.shape) == 2: - betas_0[ind_alpha, :] = betas[ - np.argmin(one_alpha_beta - beta, axis=0) - ] - else: - betas_0[ind_alpha] = betas[ - np.argmin(one_alpha_beta - beta, axis=0)[0] - ] + betas_0[ind_alpha, :] = betas[ + np.argmin(one_alpha_beta - beta, axis=0) + ] + return betas_0 - def _pred_multi(self, X: NDArray) -> NDArray: + def _pred_multi(self, X: ArrayLike) -> NDArray: """ Return a prediction per train sample for each test sample, by aggregation with matrix ``k_``. @@ -257,13 +257,13 @@ def predict( alpha_np = cast(NDArray, alpha) check_alpha_and_n_samples(alpha_np, len(self.conformity_scores_)) - if (self.method in ["enbpi", "naive", "base"]) or ( + if (self.method in ["base", "enbpi", "minmax", "naive"]) or ( self.cv == "prefit" ): betas_0 = self._beta_optimize( alpha=alpha_np, - lower_bounds=self.conformity_scores_, - upper_bounds=self.conformity_scores_, + lower_bounds=self.conformity_scores_.reshape(1, -1), + upper_bounds=self.conformity_scores_.reshape(1, -1), beta_optimize=beta_optimize, ) lower_quantiles = np.nanquantile( @@ -282,29 +282,23 @@ def predict( if (self.method in ["naive", "base"]) or (self.cv == "prefit"): y_pred_low = y_pred[:, np.newaxis] + lower_quantiles y_pred_up = y_pred[:, np.newaxis] + higher_quantiles - else: # method == "enbpi" - # Correspond to "Conformal prediction for dynamic time - # series". - # Its PIs are closed to the oracle's ones if beta_optimized - # is True. 
+ else: y_pred_multi = self._pred_multi(X) - pred = aggregate_all(self.agg_function, y_pred_multi) - y_pred_low = np.column_stack( - [ - pred + lower_quantiles[k] - for k, _ in enumerate(alpha_np) - ] - ) - y_pred_up = np.column_stack( - [ - pred + higher_quantiles[k] - for k, _ in enumerate(alpha_np) - ] - ) - if self.method == "minmax": - lower_bounds = np.min(y_pred_multi, axis=1, keepdims=True) - upper_bounds = np.max(y_pred_multi, axis=1, keepdims=True) + if self.method == "enbpi": + # Correspond to "Conformal prediction for dynamic time + # series". Its PIs are closed to the oracle's ones if + # beta_optimized is True. + pred = aggregate_all(self.agg_function, y_pred_multi) + lower_bounds, upper_bounds = pred, pred + else: # self.method == "minmax": + lower_bounds = np.min( + y_pred_multi, axis=1, keepdims=True + ) + upper_bounds = np.max( + y_pred_multi, axis=1, keepdims=True + ) + y_pred_low = np.column_stack( [ lower_bounds + lower_quantiles[k] @@ -317,14 +311,17 @@ def predict( for k, _ in enumerate(alpha_np) ] ) - elif self.method == "plus": - # This version of predict corresponds to "Predictive Inference - # Is Free with the Jackknife+-after-Bootstrap.". + + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) + + else: # self.method == "plus": + # This version of predict corresponds to "Predictive + # Inference is Free with the Jackknife+-after-Bootstrap.". # Its PIs are wider. It does not coorespond to "Conformal # prediction for dynamic time series". It is a try. It is - # slower because the betas (width optimization parameters of - # the PIs) are optimized for every point. 
- + # slower because the betas (width optimization parameters + # of the PIs) are optimized for every point y_pred_multi = self._pred_multi(X) y_pred_low = np.empty((len(y_pred), len(alpha)), dtype=float) y_pred_up = np.empty((len(y_pred), len(alpha)), dtype=float) @@ -357,11 +354,12 @@ def predict( axis=0, method="higher", ) # type: ignore - y_pred_low[:, ind_alpha] = lower_quantiles - y_pred_up[:, ind_alpha] = upper_quantiles + y_pred_low[:, ind_alpha] = lower_quantiles + y_pred_up[:, ind_alpha] = upper_quantiles + + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) - if ensemble: - y_pred = aggregate_all(self.agg_function, y_pred_multi) return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) def root_predict( diff --git a/mapie/utils.py b/mapie/utils.py index d39710def..39d41a29f 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -3,7 +3,6 @@ from typing import Any, Iterable, Optional, Tuple, Union, cast import numpy as np -import numpy.ma as ma from sklearn.base import ClassifierMixin, RegressorMixin from sklearn.model_selection import BaseCrossValidator, KFold, LeaveOneOut from sklearn.utils.validation import _check_sample_weight, _num_features From 1c1e8b06679d5109c64297181cc8055a0c2409d6 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Thu, 14 Apr 2022 12:14:36 +0200 Subject: [PATCH 18/32] [CORRECT] Correct compatibility with numpy versions --- mapie/_compatibility.py | 23 +++++++++++++++++++++++ mapie/regression.py | 7 ++++--- mapie/time_series_regression.py | 15 ++++++++------- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/mapie/_compatibility.py b/mapie/_compatibility.py index 204831ace..380ae34b8 100644 --- a/mapie/_compatibility.py +++ b/mapie/_compatibility.py @@ -26,8 +26,31 @@ def np_quantile_version_above_122( return np.quantile(a, q, method=method, **kwargs) # type: ignore +def np_nanquantile_version_below_122( + a: ArrayLike, + q: ArrayLike, + method: str = "linear", + **kwargs: Any +) -> NDArray: + 
"""Wrapper of np.quantile function for numpy version < 1.22.""" + return np.nanquantile(a, q, interpolation=method, **kwargs) # type: ignore + + +def np_nanquantile_version_above_122( + a: ArrayLike, + q: ArrayLike, + method: str = "linear", + **kwargs: Any +) -> NDArray: + """Wrapper of np.quantile function for numpy version >= 1.22.""" + return np.nanquantile(a, q, method=method, **kwargs) # type: ignore + + numpy_version = parse_version(np.__version__) if numpy_version < parse_version("1.22"): np_quantile = np_quantile_version_below_122 + np_nanquantile = np_nanquantile_version_below_122 + else: np_quantile = np_quantile_version_above_122 + np_nanquantile = np_nanquantile_version_above_122 diff --git a/mapie/regression.py b/mapie/regression.py index ae7a48adc..566dd919b 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -17,6 +17,7 @@ ) from ._typing import ArrayLike, NDArray +from ._compatibility import np_nanquantile from .aggregation_functions import aggregate_all, phi2D from .utils import ( check_cv, @@ -610,7 +611,7 @@ def predict( alpha_np = cast(NDArray, alpha) check_alpha_and_n_samples(alpha_np, n) if self.method in ["naive", "base"] or self.cv == "prefit": - quantile = np.nanquantile( + quantile = np_nanquantile( self.conformity_scores_, 1 - alpha_np, method="higher" ) y_pred_low = y_pred[:, np.newaxis] - quantile @@ -645,7 +646,7 @@ def predict( y_pred_low = np.column_stack( [ - np.nanquantile( + np_nanquantile( lower_bounds, _alpha, axis=1, @@ -657,7 +658,7 @@ def predict( y_pred_up = np.column_stack( [ - np.nanquantile( + np_nanquantile( upper_bounds, 1 - _alpha, axis=1, diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 27f3b18b1..81509a7c1 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -8,9 +8,10 @@ from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted +from ._compatibility import np_nanquantile +from ._typing import ArrayLike, 
NDArray from .aggregation_functions import aggregate_all from .regression import MapieRegressor -from ._typing import ArrayLike, NDArray from .utils import ( check_alpha, check_alpha_and_n_samples, @@ -179,13 +180,13 @@ def _beta_optimize( num=len(lower_bounds), endpoint=True, ) - one_alpha_beta = np.nanquantile( + one_alpha_beta = np_nanquantile( upper_bounds, 1 - _alpha + betas, axis=1, method="higher", ) # type: ignore - beta = np.nanquantile( + beta = np_nanquantile( lower_bounds, betas, axis=1, @@ -266,13 +267,13 @@ def predict( upper_bounds=self.conformity_scores_.reshape(1, -1), beta_optimize=beta_optimize, ) - lower_quantiles = np.nanquantile( + lower_quantiles = np_nanquantile( self.conformity_scores_, betas_0[:, 0], axis=0, method="lower", ).T # type: ignore - higher_quantiles = np.nanquantile( + higher_quantiles = np_nanquantile( self.conformity_scores_, 1 - alpha_np + betas_0[:, 0], axis=0, @@ -341,14 +342,14 @@ def predict( upper_quantiles = np.empty((betas_0.shape[1],)) for ind_beta_0, beta_0 in enumerate(betas_0[ind_alpha, :]): - lower_quantiles[ind_beta_0] = np.nanquantile( + lower_quantiles[ind_beta_0] = np_nanquantile( lower_bounds[ind_beta_0, :], beta_0, axis=0, method="lower", ) # type: ignore - upper_quantiles[ind_beta_0] = np.nanquantile( + upper_quantiles[ind_beta_0] = np_nanquantile( upper_bounds[ind_beta_0, :], 1 - _alpha + beta_0, axis=0, From cf230349a54c68fa2e81973b60faf7dcbf395cec Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Thu, 14 Apr 2022 14:45:22 +0200 Subject: [PATCH 19/32] [CORRECT] Documentation --- examples/regression/plot_timeseries_enbpi.py | 160 ++----------------- mapie/time_series_regression.py | 10 +- 2 files changed, 17 insertions(+), 153 deletions(-) diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/plot_timeseries_enbpi.py index 05ca7350b..194b5c341 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/plot_timeseries_enbpi.py @@ -124,39 +124,6 @@ 
y_pis_pfit_enbpi[:, 1, 0] - y_pis_pfit_enbpi[:, 0, 0] ).mean() -print("EnbPI with partial_fit, NO width optimization") -mapie_enpbi = mapie_enpbi.fit(X_train, y_train) -y_pred_pfit_enbpi_no_opt = np.zeros(y_pred_npfit_enbpi.shape) -y_pis_pfit_enbpi_no_opt = np.zeros(y_pis_npfit_enbpi.shape) -( - y_pred_pfit_enbpi_no_opt[:gap], - y_pis_pfit_enbpi_no_opt[:gap, :, :], -) = mapie_enpbi.predict( - X_test.iloc[:gap, :], alpha=alpha, ensemble=True, beta_optimize=False -) - -for step in range(gap, len(X_test), gap): - mapie_enpbi.partial_fit( - X_test.iloc[(step - gap):step, :], - y_test.iloc[(step - gap):step], - ) - ( - y_pred_pfit_enbpi_no_opt[step:step + gap], - y_pis_pfit_enbpi_no_opt[step:step + gap, :, :], - ) = mapie_enpbi.predict( - X_test.iloc[step:step + gap, :], - alpha=alpha, - ensemble=True, - beta_optimize=False, - ) -coverage_pfit_enbpi_no_opt = regression_coverage_score( - y_test, y_pis_pfit_enbpi_no_opt[:, 0, 0], y_pis_pfit_enbpi_no_opt[:, 1, 0] -) -width_pfit_enbpi_no_opt = ( - y_pis_pfit_enbpi_no_opt[:, 1, 0] - y_pis_pfit_enbpi_no_opt[:, 0, 0] -).mean() - - print("Plus, with partial_fit, width optimization") mapie_plus = mapie_plus.fit(X_train, y_train) y_pred_pfit_plus = np.zeros(y_pred_npfit_enbpi.shape) @@ -186,61 +153,11 @@ ) width_pfit_plus = (y_pis_pfit_plus[:, 1, 0] - y_pis_pfit_plus[:, 0, 0]).mean() -print("Plus, with partial_fit, NO width optimization") -mapie_plus = mapie_plus.fit(X_train, y_train) -y_pred_pfit_plus_no_opt = np.zeros(y_pred_npfit_enbpi.shape) -y_pis_pfit_plus_no_opt = np.zeros(y_pis_npfit_enbpi.shape) -( - y_pred_pfit_plus_no_opt[:gap], - y_pis_pfit_plus_no_opt[:gap, :, :], -) = mapie_plus.predict( - X_test.iloc[:gap, :], - alpha=alpha, - beta_optimize=False, -) -for step in range(gap, len(X_test), gap): - mapie_plus.partial_fit( - X_test.iloc[step - gap:step, :], - y_test.iloc[step - gap: step], - ) - ( - y_pred_pfit_plus_no_opt[step:step + gap], - y_pis_pfit_plus_no_opt[step:step + gap, :, :], - ) = mapie_plus.predict( - 
X_test.iloc[step:step + gap, :], - alpha=alpha, - ensemble=True, - beta_optimize=False, - ) - -coverage_pfit_plus_no_opt = regression_coverage_score( - y_test, y_pis_pfit_plus_no_opt[:, 0, 0], y_pis_pfit_plus_no_opt[:, 1, 0] -) -width_pfit_plus_no_opt = ( - y_pis_pfit_plus_no_opt[:, 1, 0] - y_pis_pfit_plus_no_opt[:, 0, 0] -).mean() - -print("Plus, with partial_fit, MapieRegressor_Like") +print("Plus, with NO partial_fit, MapieRegressor_Like no") mapie_plus = mapie_plus.fit(X_train, y_train) -y_pred_pfit_MR = np.zeros(y_pred_npfit_enbpi.shape) -y_pis_pfit_MR = np.zeros(y_pis_npfit_enbpi.shape) -y_pred_pfit_MR[:gap], y_pis_pfit_MR[:gap, :, :] = mapie_plus.root_predict( - X_test.iloc[:gap, :], alpha=alpha +y_pred_pfit_MR, y_pis_pfit_MR = mapie_enpbi.predict( + X_test, alpha=alpha, ensemble=True ) -for step in range(gap, len(X_test), gap): - mapie_plus.partial_fit( - X_test.iloc[step - gap:step, :], - y_test.iloc[step - gap:step], - ) - ( - y_pred_pfit_MR[step:step + gap], - y_pis_pfit_MR[step:step + gap, :, :], - ) = mapie_plus.root_predict( - X_test.iloc[step:step + gap, :], - alpha=alpha, - ensemble=True, - ) - coverage_pfit_MR = regression_coverage_score( y_test, y_pis_pfit_MR[:, 0, 0], y_pis_pfit_MR[:, 1, 0] ) @@ -257,21 +174,11 @@ "\nEnbPI with partial_fit:" f"{coverage_pfit_enbpi:.3f}, {width_pfit_enbpi:.3f}" ) -print( - "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nEnbPI with partial_fit, no with optimization:" - f"{coverage_pfit_enbpi_no_opt:.3f}, {width_pfit_enbpi_no_opt:.3f}" -) print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " "\nPlus, with partial_fit:" f"{coverage_pfit_plus:.3f}, {width_pfit_plus:.3f}" ) -print( - "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nPlus, with partial_fit. 
no width optimization:" - f"{coverage_pfit_plus_no_opt:.3f}, {width_pfit_plus_no_opt:.3f}" -) print( "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " "\nMR_Like, with partial_fit:" @@ -279,11 +186,11 @@ ) # Plot estimated prediction intervals on test set -fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots( - nrows=2, ncols=3, figsize=(30, 25), sharey="row", sharex="col" +fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots( + nrows=2, ncols=2, figsize=(30, 25), sharey="row", sharex="col" ) -for ax in [ax1, ax2, ax3, ax4, ax5, ax6]: +for ax in [ax1, ax2, ax3, ax4]: ax.set_ylabel("Hourly demand (GW)") ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") @@ -320,35 +227,15 @@ f"Coverage:{coverage_pfit_enbpi:.3f} Width:{width_pfit_enbpi:.3f}" ) -ax3.plot( - demand_test.index, - y_pred_pfit_enbpi_no_opt, - lw=2, - c="C2", - label="Predictions", -) -ax3.fill_between( - demand_test.index, - y_pis_pfit_enbpi_no_opt[:, 0, 0], - y_pis_pfit_enbpi_no_opt[:, 1, 0], - color="C2", - alpha=0.2, - label="MapieTimeSeriesRegressor PIs", -) -ax3.set_title( - "EnbPI with partial_fit. 
No width optimization\n" - f"Coverage:{coverage_pfit_enbpi_no_opt:.3f}" - f"Width:{width_pfit_enbpi_no_opt:.3f}" -) -ax4.plot( +ax3.plot( demand_test.index, y_pred_pfit_plus, lw=2, c="C2", label="Predictions", ) -ax4.fill_between( +ax3.fill_between( demand_test.index, y_pis_pfit_plus[:, 0, 0], y_pis_pfit_plus[:, 1, 0], @@ -356,36 +243,13 @@ alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax4.set_title( +ax3.set_title( "Plus, with partial_fit.\n" f"Coverage:{coverage_pfit_plus:.3f}" f"Width:{width_pfit_plus:.3f}" ) - -ax5.plot( - demand_test.index, - y_pred_pfit_plus_no_opt, - lw=2, - c="C2", - label="Predictions", -) -ax5.fill_between( - demand_test.index, - y_pis_pfit_plus_no_opt[:, 0, 0], - y_pis_pfit_plus_no_opt[:, 1, 0], - color="C2", - alpha=0.2, - label="MapieTimeSeriesRegressor PIs", -) -ax5.set_title( - "Plus, with partial_fit no width optimization\n" - f"Coverage:{coverage_pfit_plus_no_opt:.3f}" - f"Width:{width_pfit_plus_no_opt:.3f}" -) - - -ax6.plot(demand_test.index, y_pred_pfit_MR, lw=2, c="C2", label="Predictions") -ax6.fill_between( +ax4.plot(demand_test.index, y_pred_pfit_MR, lw=2, c="C2", label="Predictions") +ax4.fill_between( demand_test.index, y_pis_pfit_MR[:, 0, 0], y_pis_pfit_MR[:, 1, 0], @@ -393,7 +257,7 @@ alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) -ax6.set_title( +ax4.set_title( "MapieRegressor Like, with partial_fit\n" f"Coverage:{coverage_pfit_MR:.3f} Width:{width_pfit_MR:.3f}" ) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 81509a7c1..f272b7ec0 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -120,11 +120,11 @@ def partial_fit( if len(X) > len(self.conformity_scores_): raise ValueError("You try to update more residuals than tere are!") new_conformity_scores_ = self._relative_conformity_scores(X, y) - self.conformity_scores_ = np.concatenate( - [ - self.conformity_scores_[-len(new_conformity_scores_):], - new_conformity_scores_, - ] + 
self.conformity_scores_ = np.roll( + self.conformity_scores_, -len(new_conformity_scores_) + ) + self.conformity_scores_[-len(new_conformity_scores_):] = ( + new_conformity_scores_ ) self.k_[:, -len(new_conformity_scores_)] = 1.0 return self From 76eb321622707c1060a23ea8fc1b0814c4c1b60f Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Fri, 6 May 2022 12:29:53 +0200 Subject: [PATCH 20/32] prune enbpi commit --- .../plot_timeseries_enbpi.py | 32 ++- .../regression/plot_timeseries_enbpi_train.py | 81 ------- mapie/regression.py | 2 +- mapie/subsample.py | 4 +- mapie/tests/test_time_series_regression.py | 117 ++-------- mapie/time_series_regression.py | 203 ++++++------------ 6 files changed, 110 insertions(+), 329 deletions(-) rename examples/regression/{ => 2-advanced-analysis}/plot_timeseries_enbpi.py (89%) delete mode 100644 examples/regression/plot_timeseries_enbpi_train.py diff --git a/examples/regression/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py similarity index 89% rename from examples/regression/plot_timeseries_enbpi.py rename to examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 194b5c341..0cb4da004 100644 --- a/examples/regression/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -28,7 +28,9 @@ import numpy as np import pandas as pd from matplotlib import pylab as plt +from scipy.stats import randint from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit from mapie.metrics import regression_coverage_score from mapie.subsample import BlockBootstrap @@ -64,8 +66,31 @@ X_test = demand_test.loc[:, features] y_test = demand_test["Demand"] -# Model: Random Forest previously optimized with a cross-validation -model = RandomForestRegressor(max_depth=10, n_estimators=50, random_state=59) +model_params_fit_not_done = False +if model_params_fit_not_done: + # CV parameter search + 
n_iter = 100 + n_splits = 5 + tscv = TimeSeriesSplit(n_splits=n_splits) + random_state = 59 + rf_model = RandomForestRegressor(random_state=random_state) + rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 100)} + cv_obj = RandomizedSearchCV( + rf_model, + param_distributions=rf_params, + n_iter=n_iter, + cv=tscv, + scoring="neg_root_mean_squared_error", + random_state=random_state, + verbose=0, + n_jobs=-1, + ) + cv_obj.fit(X_train, y_train) + model = cv_obj.best_estimator_ +else: + # Model: Random Forest previously optimized with a cross-validation + model = RandomForestRegressor( + max_depth=10, n_estimators=50, random_state=59) # Estimate prediction intervals on test set with best estimator alpha = 0.05 @@ -99,6 +124,7 @@ y_pred_pfit_enbpi = np.zeros(y_pred_npfit_enbpi.shape) y_pis_pfit_enbpi = np.zeros(y_pis_npfit_enbpi.shape) + y_pred_pfit_enbpi[:gap], y_pis_pfit_enbpi[:gap, :, :] = mapie_enpbi.predict( X_test.iloc[:gap, :], alpha=alpha, ensemble=True, beta_optimize=True ) @@ -155,7 +181,7 @@ print("Plus, with NO partial_fit, MapieRegressor_Like no") mapie_plus = mapie_plus.fit(X_train, y_train) -y_pred_pfit_MR, y_pis_pfit_MR = mapie_enpbi.predict( +y_pred_pfit_MR, y_pis_pfit_MR = mapie_plus.predict( X_test, alpha=alpha, ensemble=True ) coverage_pfit_MR = regression_coverage_score( diff --git a/examples/regression/plot_timeseries_enbpi_train.py b/examples/regression/plot_timeseries_enbpi_train.py deleted file mode 100644 index 2200d0635..000000000 --- a/examples/regression/plot_timeseries_enbpi_train.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -================================================================== -Estimating prediction intervals of time series forecast with EnbPI -================================================================== -This example uses -:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate -prediction intervals associated with time series forecast. 
It follows [6] and -an alternative expermimental implemetation inspired from [2] - -We use here the Victoria electricity demand dataset used in the book -"Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos. -The electricity demand features daily and weekly seasonalities and is impacted -by the temperature, considered here as a exogeneous variable. - -A Random Forest model is aloready fitted on data. The hyper-parameters are -optimized with a :class:`sklearn.model_selection.RandomizedSearchCV` using a -sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation, -in which the training set is prior to the validation set. -The best model is then feeded into -:class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the -associated prediction intervals. We compare two approaches: one with no -`partial_fit` call and one with `partial_fit` every step. -""" -import warnings - -import numpy as np -import pandas as pd -from scipy.stats import randint -from sklearn.ensemble import RandomForestRegressor -from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit - -warnings.simplefilter("ignore") - -# Load input data and feature engineering -demand_df = pd.read_csv( - "../data/demand_temperature.csv", parse_dates=True, index_col=0 -) - -demand_df["Date"] = pd.to_datetime(demand_df.index) -demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") -demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") -demand_df["Hour"] = demand_df.index.hour - -n_lags = 5 -for hour in range(1, n_lags): - demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) - -# Train/validation/test split -num_test_steps = 24 * 7 -demand_train = demand_df.iloc[:-num_test_steps, :].copy() -demand_test = demand_df.iloc[-num_test_steps:, :].copy() -features = ["Weekofyear", "Weekday", "Hour", "Temperature"] + [ - f"Lag_{hour}" for hour in range(1, n_lags) -] - -X_train = demand_train.loc[ - 
~np.any(demand_train[features].isnull(), axis=1), features -] -y_train = demand_train.loc[X_train.index, "Demand"] -X_test = demand_test.loc[:, features] -y_test = demand_test["Demand"] - -# CV parameter search -n_iter = 100 -n_splits = 5 -tscv = TimeSeriesSplit(n_splits=n_splits) -random_state = 59 -rf_model = RandomForestRegressor(random_state=random_state) -rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 100)} -cv_obj = RandomizedSearchCV( - rf_model, - param_distributions=rf_params, - n_iter=n_iter, - cv=tscv, - scoring="neg_root_mean_squared_error", - random_state=random_state, - verbose=0, - n_jobs=-1, -) -cv_obj.fit(X_train, y_train) -print(cv_obj.best_estimator_) diff --git a/mapie/regression.py b/mapie/regression.py index 566dd919b..7670c5ebe 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -224,7 +224,7 @@ def _check_parameters(self) -> None: if self.method not in self.valid_methods_: raise ValueError( "Invalid method. " - "Allowed values are 'naive', 'base', 'plus' and 'minmax'." + f"Allowed values are {self.valid_methods_}." ) check_n_jobs(self.n_jobs) diff --git a/mapie/subsample.py b/mapie/subsample.py index 049ac50f6..43f86035f 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -26,9 +26,9 @@ class Subsample(BaseCrossValidator): Number of samples in each resampling. By default ``None``, the size of the training set. replace: bool - Whether to replace samples in resamplings or not. + Whether to replace samples in resamplings or not. By default ``True``. random_state: Optional - int or RandomState instance. + int or RandomState instance. . 
By default ``None`` Examples diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 1016f8d3f..78636df39 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -20,7 +20,7 @@ y_toy = (5.0 + 2.0 * X_toy ** 1.1).flatten() X, y = make_regression(n_samples=500, n_features=10, noise=1.0, random_state=1) k = np.ones(shape=(5, X.shape[1])) -METHODS = ["base", "enbpi", "minmax", "naive", "plus"] +METHODS = ["enbpi"] Params = TypedDict( "Params", @@ -31,35 +31,11 @@ }, ) STRATEGIES = { - "naive": Params(method="naive", agg_function="median", cv=None), - "jackknife": Params(method="base", agg_function="mean", cv=-1), - "jackknife_plus": Params(method="plus", agg_function="mean", cv=-1), - "jackknife_minmax": Params(method="minmax", agg_function="mean", cv=-1), - "cv": Params( - method="base", - agg_function="mean", - cv=KFold(n_splits=3, shuffle=True, random_state=1), - ), - "cv_plus": Params( - method="plus", - agg_function="mean", - cv=KFold(n_splits=3, shuffle=True, random_state=1), - ), - "cv_minmax": Params( - method="minmax", - agg_function="mean", - cv=KFold(n_splits=3, shuffle=True, random_state=1), - ), - "jackknife_enbpi_ab_wopt": Params( + "jackknife_enbpi_mean_ab_wopt": Params( method="enbpi", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), ), - "jackknife_minmax_ab": Params( - method="minmax", - agg_function="mean", - cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), - ), "jackknife_enbpi_median_ab_wopt": Params( method="enbpi", agg_function="median", @@ -69,7 +45,7 @@ random_state=1, ), ), - "jackknife_enbpi_ab": Params( + "jackknife_enbpi_mean_ab": Params( method="enbpi", agg_function="mean", cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), @@ -83,72 +59,20 @@ random_state=1, ), ), - "jackknife_plus_ab_MR": Params( - method="plus", - agg_function="mean", - 
cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), - ), - "jackknife_minmax_ab_MR": Params( - method="minmax", - agg_function="mean", - cv=BlockBootstrap(n_resamplings=30, n_blocks=5, random_state=1), - ), - "jackknife_plus_ab_median_MR": Params( - method="plus", - agg_function="median", - cv=BlockBootstrap( - n_resamplings=30, - n_blocks=5, - random_state=1, - ), - ), } WIDTHS = { - "naive": 3.83, - "jackknife": 3.83, - "jackknife_plus": 3.82, - "jackknife_minmax": 3.90, - "cv": 3.83, - "cv_plus": 3.87, - "cv_minmax": 4.03, - "prefit": 4.79, - "cv_plus_median": 3.90, - "jackknife_enbpi_ab_wopt": 3.76, - "jackknife_minmax_ab": 3.96, + "jackknife_enbpi_mean_ab_wopt": 3.76, "jackknife_enbpi_median_ab_wopt": 3.76, - "jackknife_enbpi_ab": 3.76, - "jackknife_minmax_ab": 3.96, + "jackknife_enbpi_mean_ab": 3.76, "jackknife_enbpi_median_ab": 3.76, - "jackknife_plus_ab": 3.76, - "jackknife_plus_median_ab": 3.83, - "jackknife_plus_ab_MR": 3.82, - "jackknife_minmax_ab_MR": 3.96, - "jackknife_plus_ab_median_MR": 3.83, } COVERAGES = { - "naive": 0.952, - "jackknife": 0.952, - "jackknife_plus": 0.94, - "jackknife_minmax": 0.952, - "cv": 0.958, - "cv_plus": 0.956, - "cv_minmax": 0.966, - "prefit": 0.98, - "cv_plus_median": 0.954, - "jackknife_enbpi_ab_wopt": 0.952, - "jackknife_minmax_ab": 0.960, + "jackknife_enbpi_mean_ab_wopt": 0.952, "jackknife_enbpi_median_ab_wopt": 0.946, - "jackknife_enbpi_ab": 0.952, - "jackknife_minmax_ab": 0.960, + "jackknife_enbpi_mean_ab": 0.952, "jackknife_enbpi_median_ab": 0.946, - "jackknife_plus_ab_plus": 0.92, - "jackknife_minmax_ab_plus": 0.940, - "jackknife_plus_median_ab_plus": 0.94, - "jackknife_plus_ab_MR": 0.954, - "jackknife_minmax_ab_MR": 0.958, - "jackknife_plus_ab_median_MR": 0.954, } @@ -262,7 +186,7 @@ def test_results_with_constant_sample_weights(strategy: str) -> None: np.testing.assert_allclose(y_pis1, y_pis2) -@pytest.mark.parametrize("method", ["plus", "minmax", "enbpi"]) +@pytest.mark.parametrize("method", 
["enbpi"]) @pytest.mark.parametrize("cv", [-1, 2, 3, 5]) @pytest.mark.parametrize("agg_function", ["mean", "median"]) @pytest.mark.parametrize("alpha", [0.05, 0.1, 0.2]) @@ -292,6 +216,7 @@ def test_linear_regression_results(strategy: str) -> None: a multivariate linear regression problem with fixed random state. """ + mapie_ts = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) mapie_ts.fit(X, y) if "opt" in strategy: @@ -302,29 +227,11 @@ def test_linear_regression_results(strategy: str) -> None: y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0] width_mean = (y_pred_up - y_pred_low).mean() - if mapie_ts.method == "plus": - coverage = regression_coverage_score(y, y_pred_low, y_pred_up) - else: - coverage = regression_coverage_score(y, y_pred_low, y_pred_up) + coverage = regression_coverage_score(y, y_pred_low, y_pred_up) np.testing.assert_allclose(width_mean, WIDTHS[strategy], rtol=1e-2) np.testing.assert_allclose(coverage, COVERAGES[strategy], rtol=1e-2) -def test_results_prefit_ignore_method() -> None: - """Test that method is ignored when ``cv="prefit"``.""" - estimator = LinearRegression().fit(X, y) - all_y_pis: List[NDArray] = [] - for method in METHODS: - mapie_ts_reg = MapieTimeSeriesRegressor( - estimator=estimator, cv="prefit", method=method - ) - mapie_ts_reg.fit(X, y) - _, y_pis = mapie_ts_reg.predict(X, alpha=0.1) - all_y_pis.append(y_pis) - for y_pis1, y_pis2 in combinations(all_y_pis, 2): - np.testing.assert_allclose(y_pis1, y_pis2) - - def test_results_prefit_naive() -> None: """ Test that prefit, fit and predict on the same dataset @@ -456,9 +363,9 @@ def test_MapieTimeSeriesRegressor_partial_fit_two_big() -> None: def test_MapieTimeSeriesRegressor_beta_optimize_eeror() -> None: - """Test ``partial_fit`` raised error.""" + """Test ``beta_optimize`` raised error.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1) with pytest.raises(ValueError, match=r".*Lower and upper bounds arrays*"): mapie_ts_reg._beta_optimize( - alpha=0.1, upper_bounds=X, 
lower_bounds=X_toy, beta_optimize=True + alpha=0.1, upper_bounds=X, lower_bounds=X_toy ) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index f272b7ec0..4ffc596fc 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -22,11 +22,12 @@ class MapieTimeSeriesRegressor(MapieRegressor): """ Prediction intervals with out-of-fold residuals for time series. - This class implements the EnbPI strategy and some variants for estimating - prediction intervals on single-output time series. + This class implements the EnbPI strategy for estimating + prediction intervals on single-output time series. The only valid + ``method`` is 'enbpi'. Actually, EnbPI only corresponds to ``MapieTimeSeriesRegressor`` if the - ``cv`` argument is of type ``BlockBootstrap`` and ``method`` is "enbpi". + ``cv`` argument is of type ``BlockBootstrap``. References ---------- @@ -46,7 +47,7 @@ def __init__( ) -> None: super().__init__(estimator, method, cv, n_jobs, agg_function, verbose) self.cv_need_agg_function.append("BlockBootstrap") - self.valid_methods_.append("enbpi") + self.valid_methods_ = ["enbpi"] self.plus_like_method.append("enbpi") def _relative_conformity_scores( @@ -69,7 +70,7 @@ def _relative_conformity_scores( ------- The conformity scores corresponding to the input data set. """ - y_pred, _ = self.root_predict(X, alpha=0.5, ensemble=True) + y_pred, _ = super().predict(X, alpha=0.5, ensemble=True) return np.asarray(y) - np.asarray(y_pred) def fit( @@ -114,11 +115,18 @@ def partial_fit( ------- MapieTimeSeriesRegressor The model itself. + + Raises + ------ + ValueError + If the lenght of y is greater than the lenght of the training set. """ X = cast(NDArray, X) y = cast(NDArray, y) if len(X) > len(self.conformity_scores_): - raise ValueError("You try to update more residuals than tere are!") + raise ValueError( + "You try to update more residuals than there are!" 
+ ) new_conformity_scores_ = self._relative_conformity_scores(X, y) self.conformity_scores_ = np.roll( self.conformity_scores_, -len(new_conformity_scores_) @@ -126,7 +134,6 @@ def partial_fit( self.conformity_scores_[-len(new_conformity_scores_):] = ( new_conformity_scores_ ) - self.k_[:, -len(new_conformity_scores_)] = 1.0 return self def _beta_optimize( @@ -134,7 +141,6 @@ def _beta_optimize( alpha: Union[float, NDArray], upper_bounds: NDArray, lower_bounds: NDArray, - beta_optimize: bool = False, ) -> NDArray: """ ``_beta_optimize`` offers to minimize the width of the PIs, for a given @@ -142,21 +148,23 @@ def _beta_optimize( Parameters ---------- - alpha: Optional[NDArray] - The quantiles to compute. - upper_bounds: NDArray - The array of upper values. - lower_bounds: NDArray - The array of lower values. - optimize: bool - Whether to optimize or not. If ``False``, betas are the half of - alphas. + alpha: Union[float, NDArray] + The quantiles to compute. + upper_bounds: NDArray + The array of upper values. + lower_bounds: NDArray + The array of lower values. Returns ------- NDArray Array of betas minimizing the differences ``(1-alpa+beta)-quantile - beta-quantile``. + + Raises + ------ + ValueError + If lower and upper bounds arrays don't have the same shape. """ if lower_bounds.shape != upper_bounds.shape: raise ValueError( @@ -168,10 +176,6 @@ def _beta_optimize( fill_value=np.nan, dtype=float, ) - if not beta_optimize: - for ind_alpha, _alpha in enumerate(alpha): - betas_0[ind_alpha, :] = _alpha / 2.0 - return betas_0 for ind_alpha, _alpha in enumerate(alpha): betas = np.linspace( @@ -231,12 +235,7 @@ def predict( beta_optimize: bool = True, ) -> Union[NDArray, Tuple[NDArray, NDArray]]: """ - Correspond to the ``MapieRegressor``'s one with the - method ``'plus'``. In case ``method`` is ``'enbpi'``, predictions - correspond to 'Conformal prediction for dynamic time-series'. The - method ``'plus'`` is slower because of PI-wise optimization. 
However, - you can choose not to optimize the width of the PIs by setting - ``beta_optimize`` to ``False``. + Correspond to 'Conformal prediction for dynamic time-series'. Parameters ---------- @@ -258,126 +257,56 @@ def predict( alpha_np = cast(NDArray, alpha) check_alpha_and_n_samples(alpha_np, len(self.conformity_scores_)) - if (self.method in ["base", "enbpi", "minmax", "naive"]) or ( - self.cv == "prefit" - ): + if beta_optimize: betas_0 = self._beta_optimize( alpha=alpha_np, lower_bounds=self.conformity_scores_.reshape(1, -1), upper_bounds=self.conformity_scores_.reshape(1, -1), - beta_optimize=beta_optimize, ) - lower_quantiles = np_nanquantile( - self.conformity_scores_, - betas_0[:, 0], - axis=0, - method="lower", - ).T # type: ignore - higher_quantiles = np_nanquantile( - self.conformity_scores_, - 1 - alpha_np + betas_0[:, 0], - axis=0, - method="higher", - ).T # type: ignore - - if (self.method in ["naive", "base"]) or (self.cv == "prefit"): - y_pred_low = y_pred[:, np.newaxis] + lower_quantiles - y_pred_up = y_pred[:, np.newaxis] + higher_quantiles - else: - y_pred_multi = self._pred_multi(X) - - if self.method == "enbpi": - # Correspond to "Conformal prediction for dynamic time - # series". Its PIs are closed to the oracle's ones if - # beta_optimized is True. 
- pred = aggregate_all(self.agg_function, y_pred_multi) - lower_bounds, upper_bounds = pred, pred - else: # self.method == "minmax": - lower_bounds = np.min( - y_pred_multi, axis=1, keepdims=True - ) - upper_bounds = np.max( - y_pred_multi, axis=1, keepdims=True - ) - - y_pred_low = np.column_stack( - [ - lower_bounds + lower_quantiles[k] - for k, _ in enumerate(alpha_np) - ] - ) - y_pred_up = np.column_stack( - [ - upper_bounds + higher_quantiles[k] - for k, _ in enumerate(alpha_np) - ] - ) - - if ensemble: - y_pred = aggregate_all(self.agg_function, y_pred_multi) - - else: # self.method == "plus": - # This version of predict corresponds to "Predictive - # Inference is Free with the Jackknife+-after-Bootstrap.". - # Its PIs are wider. It does not coorespond to "Conformal - # prediction for dynamic time series". It is a try. It is - # slower because the betas (width optimization parameters - # of the PIs) are optimized for every point - y_pred_multi = self._pred_multi(X) - y_pred_low = np.empty((len(y_pred), len(alpha)), dtype=float) - y_pred_up = np.empty((len(y_pred), len(alpha)), dtype=float) + else: + betas_0 = np.full( + shape=(len(alpha), len(self.conformity_scores_)), + fill_value=np.nan, + dtype=float, + ) + for ind_alpha, _alpha in enumerate(alpha): + betas_0[ind_alpha, :] = _alpha / 2.0 - lower_bounds = y_pred_multi + self.conformity_scores_ - upper_bounds = y_pred_multi + self.conformity_scores_ + lower_quantiles = np_nanquantile( + self.conformity_scores_, + betas_0[:, 0], + axis=0, + method="lower", + ).T # type: ignore + higher_quantiles = np_nanquantile( + self.conformity_scores_, + 1 - alpha_np + betas_0[:, 0], + axis=0, + method="higher", + ).T # type: ignore - betas_0 = self._beta_optimize( - alpha=alpha_np, - lower_bounds=lower_bounds, - upper_bounds=upper_bounds, - beta_optimize=beta_optimize, + if self.cv == "prefit": + y_pred_low = y_pred[:, np.newaxis] + lower_quantiles + y_pred_up = y_pred[:, np.newaxis] + higher_quantiles + else: + 
y_pred_multi = self._pred_multi(X) + pred = aggregate_all(self.agg_function, y_pred_multi) + lower_bounds, upper_bounds = pred, pred + + y_pred_low = np.column_stack( + [ + lower_bounds + lower_quantiles[k] + for k, _ in enumerate(alpha_np) + ] + ) + y_pred_up = np.column_stack( + [ + upper_bounds + higher_quantiles[k] + for k, _ in enumerate(alpha_np) + ] ) - - for ind_alpha, _alpha in enumerate(alpha_np): - lower_quantiles = np.empty((betas_0.shape[1],)) - upper_quantiles = np.empty((betas_0.shape[1],)) - - for ind_beta_0, beta_0 in enumerate(betas_0[ind_alpha, :]): - lower_quantiles[ind_beta_0] = np_nanquantile( - lower_bounds[ind_beta_0, :], - beta_0, - axis=0, - method="lower", - ) # type: ignore - - upper_quantiles[ind_beta_0] = np_nanquantile( - upper_bounds[ind_beta_0, :], - 1 - _alpha + beta_0, - axis=0, - method="higher", - ) # type: ignore - y_pred_low[:, ind_alpha] = lower_quantiles - y_pred_up[:, ind_alpha] = upper_quantiles if ensemble: y_pred = aggregate_all(self.agg_function, y_pred_multi) return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) - - def root_predict( - self, - X: ArrayLike, - ensemble: bool = False, - alpha: Optional[Union[float, Iterable[float]]] = None, - ) -> Union[NDArray, Tuple[NDArray, NDArray]]: - """ - ``root_predict`` method correspond to the one of ``MapieRegressor``'s. 
- """ - conformity_scores_save = self.conformity_scores_.copy() - self.conformity_scores_ = np.abs(self.conformity_scores_) - if alpha is None: - y_pred = super().predict(X=X, ensemble=ensemble, alpha=alpha) - self.conformity_scores_ = conformity_scores_save - return y_pred - y_pred, y_pis = super().predict(X=X, ensemble=ensemble, alpha=alpha) - self.conformity_scores_ = conformity_scores_save - return y_pred, y_pis From 0017fd36dc9ded35313d3c73e8732b4e98dc015b Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Fri, 6 May 2022 12:48:45 +0200 Subject: [PATCH 21/32] all test pass after enbpi pruning --- mapie/tests/test_time_series_regression.py | 24 +++++----------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 78636df39..1f5648632 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -1,7 +1,6 @@ from __future__ import annotations -from itertools import combinations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Optional, Tuple, Union import numpy as np import pytest @@ -66,6 +65,8 @@ "jackknife_enbpi_median_ab_wopt": 3.76, "jackknife_enbpi_mean_ab": 3.76, "jackknife_enbpi_median_ab": 3.76, + "prefit": 4.79, + } COVERAGES = { @@ -73,6 +74,8 @@ "jackknife_enbpi_median_ab_wopt": 0.946, "jackknife_enbpi_mean_ab": 0.952, "jackknife_enbpi_median_ab": 0.946, + "prefit": 0.98, + } @@ -232,21 +235,6 @@ def test_linear_regression_results(strategy: str) -> None: np.testing.assert_allclose(coverage, COVERAGES[strategy], rtol=1e-2) -def test_results_prefit_naive() -> None: - """ - Test that prefit, fit and predict on the same dataset - is equivalent to the "naive" method. 
- """ - estimator = LinearRegression().fit(X, y) - mapie_ts_reg = MapieTimeSeriesRegressor(estimator=estimator, cv="prefit") - mapie_ts_reg.fit(X, y) - _, y_pis = mapie_ts_reg.predict(X, alpha=0.05) - width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean() - coverage = regression_coverage_score(y, y_pis[:, 0, 0], y_pis[:, 1, 0]) - np.testing.assert_allclose(width_mean, WIDTHS["naive"], rtol=1e-2) - np.testing.assert_allclose(coverage, COVERAGES["naive"], rtol=1e-2) - - def test_results_prefit() -> None: """Test prefit results on a standard train/validation/test split.""" X_train_val, X_test, y_train_val, y_test = train_test_split( @@ -337,8 +325,6 @@ def test_MapieTimeSeriesRegressor_alpha_is_None() -> None: with pytest.raises(ValueError, match=r".*too many values to unpackt*"): y_pred, y_pis = mapie_ts_reg.predict(X_toy, alpha=None) - with pytest.raises(ValueError, match=r".*too many values to unpackt*"): - y_pred, y_pis = mapie_ts_reg.root_predict(X_toy, alpha=None) def test_MapieTimeSeriesRegressor_partial_fit_ensemble() -> None: From 57a07c2a56555f527cb6c94e57097dc9571d5a7c Mon Sep 17 00:00:00 2001 From: Vianney Taquet Date: Mon, 30 May 2022 14:17:25 +0200 Subject: [PATCH 22/32] Add notebook with TS changepoint --- mapie/time_series_regression.py | 2 + notebooks/regression/ts-changepoint.ipynb | 734 ++++++++++++++++++++++ notebooks/regression/ts-changepoint.md | 372 +++++++++++ 3 files changed, 1108 insertions(+) create mode 100644 notebooks/regression/ts-changepoint.ipynb create mode 100644 notebooks/regression/ts-changepoint.md diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 4ffc596fc..ae7955f42 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -284,6 +284,8 @@ def predict( axis=0, method="higher", ).T # type: ignore + self.lower_quantiles_ = lower_quantiles + self.higher_quantiles_ = higher_quantiles if self.cv == "prefit": y_pred_low = y_pred[:, np.newaxis] + lower_quantiles diff --git 
a/notebooks/regression/ts-changepoint.ipynb b/notebooks/regression/ts-changepoint.ipynb new file mode 100644 index 000000000..103b4aafd --- /dev/null +++ b/notebooks/regression/ts-changepoint.ipynb @@ -0,0 +1,734 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Estimating prediction intervals of time series forecast with EnbPI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/scikit-learn-contrib/MAPIE/blob/add-ts-notebooks/notebooks/regression/ts-changepoint.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example uses `mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate\n", + "prediction intervals associated with time series forecast. It follows Xu \\& Xie (2021).\n", + "We use here the Victoria electricity demand dataset used in the book\n", + "\"Forecasting: Principles and Practice\" by R. J. Hyndman and G. Athanasopoulos.\n", + "The electricity demand features daily and weekly seasonalities and is impacted\n", + "by the temperature, considered here as an exogenous variable.\n", + "A Random Forest model is already fitted on data. The hyper-parameters are\n", + "optimized with a `sklearn.model_selection.RandomizedSearchCV` using a\n", + "sequential `sklearn.model_selection.TimeSeriesSplit` cross validation,\n", + "in which the training set is prior to the validation set.\n", + "The best model is then fed into\n", + "`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the\n", + "associated prediction intervals. We compare two approaches: with or without\n", + "``partial_fit`` called at every step. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "install_mapie = False\n", + "if install_mapie:\n", + " !pip install \"git+https://github.com/scikit-learn-contrib/MAPIE.git@add-ts-notebooks\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from matplotlib import pylab as plt\n", + "from scipy.stats import randint\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit\n", + "\n", + "from mapie.metrics import regression_coverage_score, regression_mean_width_score\n", + "from mapie.subsample import BlockBootstrap\n", + "from mapie.time_series_regression import MapieTimeSeriesRegressor\n", + "\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load input data and feature engineering" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "url_file = \"https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/master/examples/data/demand_temperature.csv\"\n", + "demand_df = pd.read_csv(\n", + " url_file, parse_dates=True, index_col=0\n", + ")\n", + "\n", + "demand_df[\"Date\"] = pd.to_datetime(demand_df.index)\n", + "demand_df[\"Weekofyear\"] = demand_df.Date.dt.isocalendar().week.astype(\"int64\")\n", + "demand_df[\"Weekday\"] = demand_df.Date.dt.isocalendar().day.astype(\"int64\")\n", + "demand_df[\"Hour\"] = demand_df.index.hour\n", + "n_lags = 5\n", + "for hour in range(1, n_lags):\n", + " demand_df[f\"Lag_{hour}\"] = demand_df[\"Demand\"].shift(hour)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Train/validation/test split" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "num_test_steps = 24 * 7\n", + "demand_train = demand_df.iloc[:-num_test_steps, :].copy()\n", + "demand_test = demand_df.iloc[-num_test_steps:, :].copy()\n", + "features = [\"Weekofyear\", \"Weekday\", \"Hour\", \"Temperature\"] \n", + "features += [f\"Lag_{hour}\" for hour in range(1, n_lags)]\n", + "\n", + "X_train = demand_train.loc[\n", + " ~np.any(demand_train[features].isnull(), axis=1), features\n", + "]\n", + "y_train = demand_train.loc[X_train.index, \"Demand\"]\n", + "X_test = demand_test.loc[:, features]\n", + "y_test = demand_test[\"Demand\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Hourly demand (GW)')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(16, 5))\n", + "plt.plot(y_train)\n", + "plt.plot(y_test)\n", + "plt.ylabel(\"Hourly demand (GW)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Optimize the base estimator" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "model_params_fit_not_done = False\n", + "if model_params_fit_not_done:\n", + " # CV parameter search\n", + " n_iter = 100\n", + " n_splits = 5\n", + " tscv = TimeSeriesSplit(n_splits=n_splits)\n", + " random_state = 59\n", + " rf_model = RandomForestRegressor(random_state=random_state)\n", + " rf_params = {\"max_depth\": randint(2, 30), \"n_estimators\": randint(10, 100)}\n", + " cv_obj = RandomizedSearchCV(\n", + " rf_model,\n", + " param_distributions=rf_params,\n", + " n_iter=n_iter,\n", + " cv=tscv,\n", + " scoring=\"neg_root_mean_squared_error\",\n", + " random_state=random_state,\n", + " verbose=0,\n", + " n_jobs=-1,\n", + " )\n", + " cv_obj.fit(X_train, y_train)\n", + " model = cv_obj.best_estimator_\n", + "else:\n", + " # Model: Random Forest previously optimized with a cross-validation\n", + " model = RandomForestRegressor(\n", + " max_depth=10, n_estimators=50, random_state=59)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Estimate prediction intervals on the test set" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "alpha = 0.05\n", + "gap = 1\n", + "cv_mapiets = BlockBootstrap(\n", + " n_resamplings=100, length=48, overlapping=True, random_state=59\n", + ")\n", + "mapie_enbpi = MapieTimeSeriesRegressor(\n", + " model, method=\"enbpi\", cv=cv_mapiets, agg_function=\"mean\", n_jobs=-1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Without partial fit" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EnbPI, with no partial_fit, width optimization\n" + ] + } + ], + "source": [ + "print(\"EnbPI, with no partial_fit, width optimization\")\n", + "mapie_enbpi = mapie_enbpi.fit(X_train, y_train)\n", + "y_pred_npfit, y_pis_npfit = mapie_enbpi.predict(\n", + " X_test, alpha=alpha, ensemble=True, beta_optimize=True\n", + ")\n", + "coverage_npfit = regression_coverage_score(\n", + " y_test, y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0]\n", + ")\n", + "width_npfit = regression_mean_width_score(\n", + " y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With partial fit" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EnbPI with partial_fit, width optimization\n" + ] + } + ], + "source": [ + "print(\"EnbPI with partial_fit, width optimization\")\n", + "mapie_enbpi = mapie_enbpi.fit(X_train, y_train)\n", + "\n", + "y_pred_pfit = np.zeros(y_pred_npfit.shape)\n", + "y_pis_pfit = np.zeros(y_pis_npfit.shape)\n", + "y_pred_pfit[:gap], y_pis_pfit[:gap, :, :] = mapie_enbpi.predict(\n", + " X_test.iloc[:gap, :], alpha=alpha, ensemble=True\n", + ")\n", + "for step in range(gap, len(X_test), gap):\n", + " 
mapie_enbpi.partial_fit(\n", + " X_test.iloc[(step - gap):step, :],\n", + " y_test.iloc[(step - gap):step],\n", + " )\n", + " (\n", + " y_pred_pfit[step:step + gap],\n", + " y_pis_pfit[step:step + gap, :, :],\n", + " ) = mapie_enbpi.predict(\n", + " X_test.iloc[step:(step + gap), :],\n", + " alpha=alpha,\n", + " ensemble=True\n", + " )\n", + "coverage_pfit = regression_coverage_score(\n", + " y_test, y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0]\n", + ")\n", + "width_pfit = regression_mean_width_score(\n", + " y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## V. Plot estimated prediction intervals on test set" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "y_preds = [y_pred_npfit, y_pred_pfit]\n", + "y_pis = [y_pis_npfit, y_pis_pfit]\n", + "coverages = [coverage_npfit, coverage_pfit]\n", + "widths = [width_npfit, width_pfit]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_forecast(y_train, y_test, y_preds, y_pis, coverages, widths, plot_coverage=True):\n", + " fig, axs = plt.subplots(\n", + " nrows=2, ncols=1, figsize=(14, 8), sharey=\"row\", sharex=\"col\"\n", + " )\n", + " for i, (ax, w) in enumerate(zip(axs, [\"without\", \"with\"])):\n", + " ax.set_ylabel(\"Hourly demand (GW)\")\n", + " ax.plot(y_train[int(-len(y_test)/2):], lw=2, label=\"Training data\", c=\"C0\")\n", + " ax.plot(y_test, lw=2, label=\"Test data\", c=\"C1\")\n", + "\n", + " ax.plot(\n", + " y_test.index, y_preds[i], lw=2, c=\"C2\", label=\"Predictions\"\n", + " )\n", + " ax.fill_between(\n", + " y_test.index,\n", + " y_pis[i][:, 0, 0],\n", + " y_pis[i][:, 1, 0],\n", + " color=\"C2\",\n", + " alpha=0.2,\n", + " label=\"Prediction intervals\",\n", + " )\n", + " title = f\"EnbPI, {w} update of residuals. 
\"\n", + " if plot_coverage:\n", + " title += f\"Coverage:{coverages[i]:.3f} and Width:{widths[i]:.3f}\"\n", + " ax.set_title(title)\n", + " ax.legend()\n", + " fig.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_forecast(y_train, y_test, y_preds, y_pis, coverages, widths)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## VI. Forecast on test dataset with change point" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will now see how MAPIE adapts its prediction intervals when an abrupt change point occurs in the test set. To simulate this, we will artificially decrease the electricity demand by 2 GW in the test set, aiming at simulating an effect, such as a blackout or a lockdown due to a pandemic, that was not taken into account by the model during its training. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Corrupt the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "demand_df_corrupted = demand_df.copy()\n", + "demand_df_corrupted.Demand.iloc[-int(num_test_steps/2):] -= 2" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "n_lags = 5\n", + "for hour in range(1, n_lags):\n", + " demand_df[f\"Lag_{hour}\"] = demand_df[\"Demand\"].shift(hour)\n", + "demand_train_corrupted = demand_df_corrupted.iloc[:-num_test_steps, :].copy()\n", + "demand_test_corrupted = demand_df_corrupted.iloc[-num_test_steps:, :].copy()\n", + "\n", + "X_train = demand_train_corrupted.loc[\n", + " ~np.any(demand_train_corrupted[features].isnull(), axis=1), features\n", + "]\n", + "y_train = demand_train_corrupted.loc[X_train.index, \"Demand\"]\n", + "X_test = demand_test_corrupted.loc[:, features]\n", + "y_test = demand_test_corrupted[\"Demand\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" 
+ }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(16, 5))\n", + "plt.ylabel(\"Hourly demand (GW)\")\n", + "plt.plot(y_train)\n", + "plt.plot(y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prediction intervals without partial fit" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EnbPI, with no partial_fit, width optimization\n" + ] + } + ], + "source": [ + "print(\"EnbPI, with no partial_fit, width optimization\")\n", + "mapie_enbpi = mapie_enbpi.fit(X_train, y_train)\n", + "y_pred_npfit, y_pis_npfit = mapie_enbpi.predict(\n", + " X_test, alpha=alpha, ensemble=True, beta_optimize=True\n", + ")\n", + "coverage_npfit = regression_coverage_score(\n", + " y_test, y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0]\n", + ")\n", + "width_npfit = regression_mean_width_score(\n", + " y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prediction intervals with partial fit" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EnbPI with partial_fit, width optimization\n" + ] + } + ], + "source": [ + "print(\"EnbPI with partial_fit, width optimization\")\n", + "mapie_enbpi = mapie_enbpi.fit(X_train, y_train)\n", + "\n", + "y_pred_pfit = np.zeros(y_pred_npfit.shape)\n", + "y_pis_pfit = np.zeros(y_pis_npfit.shape)\n", + "conformity_scores_pfit, lower_quantiles_pfit, higher_quantiles_pfit = [], [], []\n", + "y_pred_pfit[:gap], y_pis_pfit[:gap, :, :] = mapie_enbpi.predict(\n", + " X_test.iloc[:gap, :], alpha=alpha, ensemble=True\n", + ")\n", + "for step in range(gap, len(X_test), gap):\n", + " mapie_enbpi.partial_fit(\n", + " X_test.iloc[(step - gap):step, :],\n", + " 
y_test.iloc[(step - gap):step],\n", + " )\n", + " (\n", + " y_pred_pfit[step:step + gap],\n", + " y_pis_pfit[step:step + gap, :, :],\n", + " ) = mapie_enbpi.predict(\n", + " X_test.iloc[step:(step + gap), :],\n", + " alpha=alpha,\n", + " ensemble=True\n", + " )\n", + " conformity_scores_pfit.append(mapie_enbpi.conformity_scores_)\n", + " lower_quantiles_pfit.append(mapie_enbpi.lower_quantiles_)\n", + " higher_quantiles_pfit.append(mapie_enbpi.higher_quantiles_)\n", + "coverage_pfit = regression_coverage_score(\n", + " y_test, y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0]\n", + ")\n", + "width_pfit = regression_mean_width_score(\n", + " y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot estimated prediction intervals on test set" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "y_preds = [y_pred_npfit, y_pred_pfit]\n", + "y_pis = [y_pis_npfit, y_pis_pfit]\n", + "coverages = [coverage_npfit, coverage_pfit]\n", + "widths = [width_npfit, width_pfit]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_forecast(y_train, y_test, y_preds, y_pis, coverages, widths, plot_coverage=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "window = 24\n", + "rolling_coverage_pfit, rolling_coverage_npfit = [], []\n", + "for i in range(window, len(y_test), 1):\n", + " rolling_coverage_pfit.append(\n", + " regression_coverage_score(\n", + " y_test[i-window:i], y_pis_pfit[i-window:i, 0, 0], y_pis_pfit[i-window:i, 1, 0]\n", + " )\n", + " )\n", + " rolling_coverage_npfit.append(\n", + " regression_coverage_score(\n", + " y_test[i-window:i], y_pis_npfit[i-window:i, 0, 0], y_pis_npfit[i-window:i, 1, 0]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Marginal coverage on a 24-hour rolling window of prediction intervals" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "plt.ylabel(f\"Rolling coverage [{window} hours]\")\n", + "plt.plot(y_test[window:].index, rolling_coverage_npfit, label=\"Without update of residuals\")\n", + "plt.plot(y_test[window:].index, rolling_coverage_pfit, label=\"With update of residuals\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Temporal evolution of the distribution of residuals used for estimating prediction intervals" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl0AAAEvCAYAAAB/gHR8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAA04UlEQVR4nO3de1hVVf4/8PeHi6CCBoIXVESRiyBeEmnKykpyIEUrx3T8lpeaLG007ZeV9eRkUzONZb9uo5POz242aWM6mbe+YlmalgNKKIIhSmKggQiCIgpn/f44Bx6Oghw5uNeG8349z372OfvsvXnvNoOfWWudtUUpBSIiIiK6ttx0ByAiIiJyBSy6iIiIiAzAoouIiIjIACy6iIiIiAzAoouIiIjIACy6iIiIiAzgoTsAAAQEBKiQkBDdMcynKNu6DgjTm4PIRR0pPAsA6BPYXnMSq4qCLABA226RmpMYIzU1tUgpFag7B1FzMUXRFRISgpSUFN0xzCf5Bes6/gWdKYhc1t+2WIucpxPMUeTsXjYLAHDj9Lc1JzGGiPysOwNRcxIzTI4aGxurWHQREVFdIpKqlIrVnYOouXBMFxEREZEBWHSZ2er7rQsRafHoR6l49KNU3TFq7X11NPa+Olp3DCJqIlOM6aIGnDutOwGRSzt97oLuCHa8LpbojkBETmBLFxEREZEBWHQRERERGYBFFxEREZEBOKbLzPoM152AyKUN6xugO4KdM92G6Y5ARE7gPF1ERGRKnKeLWht2LxIREREZgEWXma0cZ12ISIspK/Zgyoo9umPUSn8lHumvxOuOQURNxDFdZnbxvO4ERC7t/MXqBj9LPnjyqs4VH9XF2Thwt/BvAlFLxpYuIiIiIgOw6CIiIiIyAIsuIiIiIgNwTJeZhf9WdwIilzaiX2fdEeyUBY/QHYGInMCiy8yGzdadgMilTb81VHcEO7+5f6HuCETkBHYvEhERERmARZeZvTfKuhCRFhPe3Y0J7+7WHaNWxl9uRsZfbtYdg4iaiEUXERERkQFYdBEREREZgEUXERERkQFYdBEREREZgFNGmFn03boTELm00QO66Y5gpzw0SXcEInICiy4zi3tYdwIil/bAjSG6I9i5YcLTuiMQkRPYvWhmF85ZFyLSouJCNSouVOuOUavibBkq
zpbpjkFETcSiy8w+Hm9diEiLqe/twdT39uiOUevIm4k48mai7hhE1EQsuoiIiIgMwKKLiIiIyAAsuoiIiIgMwKKLiIiIyAAOTRkhIrkAygBUA6hSSsWKiD+A1QBCAOQCuE8pddq2/3wAD9n2n62U+rLZk7uCQZN0JyByab8b0kN3BDtn+03QHYGInHA183TdrpQqqvP+GQDblFKviMgztvdPi0gUgIkAogEEAUgWkXCllHm+d91SDP4f3QmIXNr42J66I9iJu2eW7ghE5ARnuhfHAvjA9voDAHfX2b5KKVWplDoK4DCAOCd+jus6e8q6EJEWxWcvoPjsBd0xap0uLMDpwgLdMYioiRwtuhSA/xWRVBGZbtvWRSlVAAC2dWfb9u4A8uoce9y2ja7Wp5OtCxFpMWNlKmasTNUdo1b+8vHIX865+4haKke7F4cppfJFpDOArSKSdYV9pZ5t6rKdrMXbdAAIDg52MAYRERFRy+RQS5dSKt+2/hXAOli7C0+KSDcAsK1/te1+HEDdgRA9AOTXc85lSqlYpVRsYGBg06+AiIiIqAVotOgSkfYi4lvzGsBIAAcArAcwxbbbFACf216vBzBRRLxEpDeAMADmeY4GERERkQaOdC92AbBORGr2/5dSaouI/BfApyLyEIBjAMYDgFIqQ0Q+BXAQQBWAx/jNRSIiInJ1jRZdSqkjAAbWs/0UgBENHPMygJedTufqhj6oOwGRS7v/N70a/Cwg/6urO1nU751MA5wfONXpcxCRPlczTxcZrf843QmIXFrSwCDdEewMGfUH3RGIyAl8DJCZlR63LkSkRX5JBfJLKnTHqHUi7zBO5B3WHYOImogtXWa29hHretpGvTmIXNTc1WkAgNWP3Kg3iM2pj6YCALo+u1NvECJqErZ0ERERERmARRcRERGRAVh0ERERERmARRcRERGRATiQ3sxu+qPuBEQu7eFb+uiOYOdi3GO6IxCRE1h0mVlEou4ERC4tPqqL7gh2BsU7P8EqEenD7kUzK8q2LkSkRU5hOXIKy3XHqHXspzQc+ylNdwwiaiK2dJnZF3Osa87TRaTFs2v3A2ieebqSD568qv3ra2UrW2MbcsB5uohaJLZ0ERERERmARRcRERGRAVh0ERERERmARRcRERGRATiQ3sxufVJ3AiKXNuuOMN0R7Fhunqc7AhE5gUWXmYXerjsBkUu7OSxAdwQ7MbeO1R2BiJzA7kUzK0i3LkSkRUZ+KTLyS3XHqJWTvgs56bt0xyCiJmJLl5ltmW9dc54uIi1e/OIggOaZp6s5nN/wlPXFAM7TRdQSsaWLiIiIyAAsuoiIiIgMwO5FIiIDBOR/dXUHRPHh1kStDVu6iIiIiAzAli4zG7FAdwIil/ZUQoTuCHbc4/+kOwIROYFFl5kF36A7AZFLG9LLX3cEO5Fxd+qOQEROYPeimR37wboQkRapPxcj9edi3TFqZe3Ziqw9W3XHIKImYkuXmW170brmPF1EWizacgiAeebpqk5eaH3BFi+iFoktXUREREQGYNFFREREZAAWXUREREQGYNFFREREZAAOpDezhL/qTkDk0hYkRemOYMd79CLdEYjICSy6zKzbAN0JiFxadFBH3RHshA64SXcEInICuxfNLOdr60JEWuzMLsLO7CLdMWrt//Zz7P/2c90xiKiJ2NJlZt++Zl2H3q43B5GLevurbADAzWEBmpNYue181fri1rF6gxBRk7Cli4iIiMgALLqIiIiIDMCii4iIiMgADhddIuIuIvtEZIPtvb+IbBWRbNvar86+80XksIgcEpHfXovgRERERC3J1QykfxxAJoAOtvfPANimlHpFRJ6xvX9aRKIATAQQDSAIQLKIhCulqpsxt2tIekN3AiKX9pd7Y3RHsOP7u3d0RyAiJzjU0iUiPQCMAvDPOpvHAvjA9voDAHfX2b5KKVWplDoK4DCAuGZJ62oCwqwLEWkRGuiD0EAf3TFqBYcPQnD4IN0xiKiJHO1e
fAPAUwAsdbZ1UUoVAIBt3dm2vTuAvDr7Hbdto6t1aLN1ISItkg+eRPLBk7pj1EpL/gRpyZ/ojkFETdRo96KIjAbwq1IqVURuc+CcUs82Vc95pwOYDgDBwcEOnNYF7bJ1JUQk6s1B5KKW7zgCAIiP6qI5iZXnnr9bX8T/Xm8QImoSR1q6hgEYIyK5AFYBuENEVgI4KSLdAMC2/tW2/3EAPesc3wNA/qUnVUotU0rFKqViAwMDnbgEIiIiIvNrtOhSSs1XSvVQSoXAOkD+K6XU/QDWA5hi220KgJpnU6wHMFFEvESkN4AwAHuaPTkRERFRC+LMY4BeAfCpiDwE4BiA8QCglMoQkU8BHARQBeAxfnORiIiIXN1VFV1Kqe0AtttenwIwooH9XgbwspPZiIiIiFoNPvDazO59V3cCIpf2fycM0h3BTqcH3tcdgYicwKLLzDr20J2AyKUFXddWdwQ7XXv21R2BiJzAZy+a2YHPrAsRafHFj/n44sfLvnytTerGfyJ14z8b35GITIktXWb23xXWdf9xenMQuaiV3/8MAEgaGKQ5iZX3j+9bX4z6g9YcRNQ0bOkiIiIiMgCLLiIiIiIDsOgiIiIiMgCLLiIiIiIDcCC9md33oe4ERC5t6f1DdEewE/Twv3VHICInsOgys/addCcgcmn+7dvojmDHL7Cb7ghE5AR2L5rZvo+tCxFp8e+UPPw7JU93jFp71r2NPeve1h2DiJqIRZeZpf3LuhCRFmtSj2NN6nHdMWq1z1yN9pmrdccgoiZi0UVERERkABZdRERERAZg0UVERERkABZdRERERAbglBFm9j+ck4dIp/enxemOYKfP45t1RyAiJ7DoMrM27XQnIHJpbdu4645gp217X90RiMgJ7F40sz3LrQsRafHR7lx8tDtXd4xaP6z+G35Y/TfdMYioiVh0mVnGf6wLEWmxIb0AG9ILdMeo5ZPzBXxyvtAdg4iaiEUXERERkQFYdBEREREZgEUXERERkQFYdBEREREZgFNGmNm0jboTELm01Y/cqDuCnehnd+qOQEROYEsXERERkQFYdJnZd29ZFyLSYtm3OVj2bY7uGLW+X/knfL/yT7pjEFETsegys5++tC5EpMW2zF+xLfNX3TFq+R7bBt9j23THIKImYtFFREREZAAWXUREREQGYNFFREREZABOGWFmnt66ExC5NG9Pd90R7FS78W8CUUvGosvM7v9MdwIil/bBg3G6I9gZ8Eyy7ghE5AR2LxIREREZgEWXmX2zyLoQkRZvbcvGW9uydceotfu9p7H7vad1xyCiJmLRZWZHvrEuRKTFd4eL8N3hIt0xanUo+A4dCr7THYOImohFFxEREZEBWHQRERERGaDRoktEvEVkj4j8KCIZIrLQtt1fRLaKSLZt7VfnmPkiclhEDonIb6/lBRARERG1BI60dFUCuEMpNRDAIAAJIvIbAM8A2KaUCgOwzfYeIhIFYCKAaAAJAJaIiLkmu2kp2vlZFyLSwq9dG/i1a6M7Rq1Kz+tQ6Xmd7hhE1ESNztOllFIAym1vPW2LAjAWwG227R8A2A7gadv2VUqpSgBHReQwgDgAu5szuEuYsFJ3AiKX9o8HhuiOYOf6eRt0RyAiJzg0pktE3EUkDcCvALYqpX4A0EUpVQAAtnVn2+7dAeTVOfy4bRsRERGRy3Ko6FJKVSulBgHoASBORPpfYXep7xSX7SQyXURSRCSlsLDQobAuJ/kF60JEWvxtSxb+tiVLd4xau5fNwu5ls3THIKImuqrHACmlSkRkO6xjtU6KSDelVIGIdIO1FQywtmz1rHNYDwD59ZxrGYBlABAbG3tZUUYA8v6rOwGRS9v782ndEex0KNqnOwIROcGRby8Gish1ttdtAcQDyAKwHsAU225TAHxue70ewEQR8RKR3gDCAOxp5txERERELYojLV3dAHxg+waiG4BPlVIbRGQ3gE9F5CEAxwCMBwClVIaIfArgIIAqAI8ppaqvTXwi
IiKilsGRby+mAxhcz/ZTAEY0cMzLAF52Oh0RERFRK3FVY7rIYB2CdCcgcmndOnrrjmCnwruL7ghE5AQWXWY2brnuBEQu7Y2JlzXyaxX7xGe6IxCRE/jsRSIiIiIDsOgys83PWBci0mLhFxlY+EWG7hi1vl/yML5f8rDuGETUROxeNLMT+3UnIHJpB/PP6I5gx7ckU3cEInICW7qIiIiIDMCii4iIiMgALLqIiIiIDMAxXWbWKVR3AiKX1iewve4Ids76hOiOQEROYNFlZmPe0p2AyKX99d4BuiPYiZu9UncEInICuxeJiIiIDMCiy8zWz7YuRKTF/LXpmL82XXeMWnveuh973rpfdwwiaiJ2L5rZqRzdCYhc2pHCs7oj2Glfnqs7AhE5gS1dRERERAZg0UVERERkABZdRERERAbgmC4z6xqjOwGRS4sK6qA7gp2y6/rpjkBETmDRZWaJr+hOQOTS/pQUrTuCnd/MXK47AhE5gd2LRERERAZg0WVmnz1sXYhIizmr9mHOqn26Y9RKeX0cUl4fpzsGETURuxfN7Ey+7gRELq2g9LzuCHbanj+pOwIROYEtXUREREQGYNFFREREZAAWXUREREQG4JguM+s5VHcCIpd2fS8/3RHsnAkYrDsCETmBRZeZxb+gOwGRS3s6IVJ3BDs3Tn9bdwQicgK7F4mIiIgMwKLLzFbfb12ISItHP0rFox+l6o5Ra++ro7H31dG6YxBRE7F70czOndadgMilnT53QXcEO14XS3RHICInsKWLiIiIyAAsuoiIiIgMwKKLiIiIyAAc02VmfYbrTkDk0ob1DdAdwc6ZbsN0RyAiJ7DoMrPhT+lOQOTSZo8I0x3Bzo3T/qY7AhE5gd2LRERERAZg0WVmK8dZFyLSYsqKPZiyYo/uGLXSX4lH+ivxumMQUROxe9HMLp7XnYDIpZ2/WK07gh13C/8mELVkbOkiIiIiMkCjRZeI9BSRr0UkU0QyRORx23Z/EdkqItm2tV+dY+aLyGEROSQiv72WF0BERETUEjjS0lUF4P8opfoB+A2Ax0QkCsAzALYppcIAbLO9h+2ziQCiASQAWCIi7tciPBEREVFL0eiYLqVUAYAC2+syEckE0B3AWAC32Xb7AMB2AE/btq9SSlUCOCoihwHEAdjd3OFbvXA2EhLpNKJfZ90R7JQFj9AdgYiccFUD6UUkBMBgAD8A6GIryKCUKhCRmr9O3QF8X+ew47ZtdLWGzdadgMilTb81VHcEO7+5f6HuCETkBIcH0ouID4DPAMxRSp250q71bFP1nG+6iKSISEphYaGjMYiIiIhaJIeKLhHxhLXg+lgptda2+aSIdLN93g3Ar7btxwH0rHN4DwD5l55TKbVMKRWrlIoNDAxsav7W7b1R1oWItJjw7m5MeNc8IyMy/nIzMv5ys+4YRNREjnx7UQD8PwCZSqnX63y0HsAU2+spAD6vs32iiHiJSG8AYQDMM7sgERERkQaOjOkaBuABAPtFJM227VkArwD4VEQeAnAMwHgAUEpliMinAA7C+s3Hx5RS5pphkIiIiMhgjnx7cSfqH6cFAPV+lUYp9TKAl53IRURERNSqcEZ6IiIiIgPw2YtmFn237gRELm30gG66I9gpD03SHYGInMCiy8ziHtadgMilPXBjiO4Idm6Y8LTuCETkBHYvmtmFc9aFiLSouFCNigvm+R5QxdkyVJwt0x2DiJqIRZeZfTzeuhCRFlPf24Op75lnxpsjbybiyJuJumMQUROx6CIiIiIyAIsuIiIiIgOw6CIiIiIyAIsuIiIiIgNwyggzGzRJdwIil/a7IT10R7Bztt8E3RGIyAksusxs8P/oTkDk0sbH9tQdwU7cPbN0RyAiJ7B70czOnrIuRKRF8dkLKD57QXeMWqcLC3C6sEB3DCJqIrZ0mdmnk63raRv15iByUTNWpgIAVj9yo+YkVvnLrfP2+T27U3MSImoKtnQRERERGYBFFxEREZEBWHQRERER
GYBFFxEREZEBOJDezIY+qDsBkUu7/ze9dEewc37gVN0RiMgJLLrMrP843QmIXFrSwCDdEewMGfUH3RGIyAnsXjSz0uPWhYi0yC+pQH5Jhe4YtU7kHcaJvMO6YxBRE7Gly8zWPmJdc54uIi3mrk4DYJ55uk59NBUA0JXzdBG1SGzpIiIiIjIAiy4iIiIiA7DoIiIiIjIAiy4iIiIiA3AgvZnd9EfdCYhc2sO39NEdwc7FuMd0RyAiJ7DoMrOIRN0JiFxafFQX3RHsDIr/ve4IROQEdi+aWVG2dSEiLXIKy5FTWK47Rq1jP6Xh2E9pumMQUROxpcvMvphjXXOeLiItnl27H4B55ukqW2MbcsB5uohaJLZ0ERERERmARRcRERGRAVh0ERERERmARRcRERGRATiQ3sxufVJ3AiKXNuuOMN0R7Fhunqc7AhE5gUWXmYXerjsBkUu7OSxAdwQ7MbeO1R2BiJzA7kUzK0i3LkSkRUZ+KTLyS3XHqJWTvgs56bt0xyCiJmJLl5ltmW9dc54uIi1e/OIgAPPM03V+w1PWFwM4TxdRS8SWLiIiIiIDNFp0icgKEflVRA7U2eYvIltFJNu29qvz2XwROSwih0Tkt9cqOBEREVFL4khL1/sAEi7Z9gyAbUqpMADbbO8hIlEAJgKIth2zRETcmy0tERERUQvVaNGllPoWQPElm8cC+MD2+gMAd9fZvkopVamUOgrgMIC45olKRERE1HI1dSB9F6VUAQAopQpEpLNte3cA39fZ77htGzXFiAW6ExC5tKcSInRHsOMe/yfdEYjICc397UWpZ5uqd0eR6QCmA0BwcHAzx2glgm/QnYDIpQ3p5a87gp3IuDt1RyAiJzT124snRaQbANjWv9q2HwfQs85+PQDk13cCpdQypVSsUio2MDCwiTFauWM/WBci0iL152Kk/nzp6Ap9svZsRdaerbpjEFETNbXoWg9giu31FACf19k+UUS8RKQ3gDAAe5yL6MK2vWhdiEiLRVsOYdGWQ7pj1KpOXojq5IW6YxBREzXavSginwC4DUCAiBwH8CcArwD4VEQeAnAMwHgAUEpliMinAA4CqALwmFKq+hplJyIiImoxGi26lFK/b+CjEQ3s/zKAl50JRURERNTacEZ6IiIiIgOw6CIiIiIyAB94bWYJf9WdgMilLUiK0h3BjvfoRbojEJETWHSZWbcBuhMQubTooI66I9gJHXCT7ghE5AR2L5pZztfWhYi02JldhJ3ZRbpj1Nr/7efY/+3nje9IRKbEli4z+/Y16zr0dr05iFzU219lAwBuDgvQnMTKbeer1he3jtUbhIiahC1dRERERAZg0UVERERkAHYvEhGZ0aHNl2+rrmr4s4jEa5uHiJzGoouIiEwpMDDQ48cff1wPoD/YM0PmZwFwoKqq6g9Dhgz5tb4dWHSZWdIbuhMQubS/3BujO4Id32F/0B3BUC+//HJI165d2wcGBp52c3NTuvMQXYnFYpHCwsKoEydO/BPAmPr2YdFlZgFhuhMQubTQQB/dEewEd++uO4KhQkJC2gYGBp5hwUUtgZubmwoMDCw9ceJE/4b2YdFlZjXjNjhWg0iL5IMnAQDxUV0M/9lpeSWXbTtx9AAAoGvvy/+mD4q41omMJyJgwUUtie33tcGucPaRm9mud6wLEV1zyQdPXrYs+jILi77MqvczHYLykxGUn6zlZ7uqY8eOeYwePbpPz549+4eGhkYPHz68b3p6uldTzrVlyxafvn37RkdGRkaVl5dLc+ZctGhR4DvvvNMJAN56661Oubm5ns15fiMkJCT0OXjwYJuGPjfiutasWdMhJCSkf3BwcP9nn322a8326dOn91i/fr2vs+dnSxcREbUIn6f90qzPZRo7qHvplT63WCwYM2ZM30mTJp3asGHDEQDYtWtX2/z8fM8BAwZUXu3P+/DDD/1nzZp14vHHHz/lyP5VVVXw8HDsn+mnnnqqsOb1ypUrAwYNGlQREhJy8Woz
NgeLxQKlFNzd3R0+JiUlxbu6ulqioqIuNLTPtb6uqqoqzJ07N/jLL7/8qU+fPhcHDhzYb9y4cSVDhgw5/+STT/46bdq0XmPGjClz5mewpYuIiKgeGzZs8PXw8FB1C5qbbrqpIiEhodxiseCRRx7pERYWFh0eHh61fPlyv5pj4uLiIhISEvr07t07esyYMb0tFgtef/31gI0bN/ovWrQoqGZbQ8ffcMMN4UlJSb0jIiKiN2zY4Dt06NCIu+66q09ISEj/mTNndl+6dKl/TExMv/Dw8KiMjAwvAHjiiSeCFixY0OW9997zO3DgQLvJkyf3iYyMjFq1alXHO++8M7Qm/7p16zqMHDky9NJrnTlzZvfQ0NDo8PDwqOnTp/cAgLy8PI8777wzNCIiIioiIiJq69at7QHghRde6BIWFhYdFhYW/eKLL3YGgEOHDrXp06dP9P333x8cHR0dlZOT0+b555/v0r9//37h4eFRc+fODQKAM2fOuN122219IyIiosLCwqJrrvv999/vlJSUVAJYi59x48aF1Py3WbhwYedLr6u8vFx27NjRbujQoRHR0dH9br755rCff/7ZEwDi4uIiHnzwwZ6DBw+ODAsLi/7666/bOXK/t2/f3r5Xr16VUVFRF7y9vdW9995bvGbNmusAIDw8/EJJSYnHsWPHnGqsYksXERGAgPyvLtvmWdmuwc+o9UtPT287cODAc/V99uGHH163f//+tpmZmRkFBQUecXFx/UaOHFkOAJmZmW3T0tKOhISEXBwyZEjk1q1bfZ544omi7777zmf06NGl06ZNO/3+++83eHx6enr7ffv2ZURGRl7YsGGDb1ZWVts1a9Yc6dy5c1WvXr1ivLy8ivbv35/55z//ufPixYs7r1ixIq8m17Rp004vXbq082uvvZZ36623nrNYLJg/f36P/Px8j6CgoKoVK1Z0mjp1qt0DRU+ePOm+adMmvyNHjhxwc3NDUVGROwA8+uijwbfcckvZggULcqqqqlBaWuq+Y8eOdv/61786paamZiqlMGTIkH4jRowoCwgIqM7NzfVevnx57sqVK4+tXbu2w+HDh73T09MzlVKIj4/vu3nzZp+TJ096dO3a9eL27dsPA8CpU6fcAeCHH37wmTx5cjEA7N69u11BQYFndnZ2BgAUFRW5BwQEVNe9rsrKSpk9e3bwxo0bDwcFBVUtX77c78knn+z+73//OxcAzp0757Zv376szZs3+0yfPr13dnZ2xhdffOE7b968npfey7Zt21r27duXlZeX16Z79+61LW09evS48MMPP9R+myYmJubcV1995TN16tSSpvw+ASy6iIiIrtqOHTt877vvvmIPDw/07Nmz6oYbbijfuXNnu44dO1piYmLOhoaGXgSA6Ojoczk5OZeNU7rS8QMGDDgbGRlZ+49/TEzM2V69el0EgODg4MrExMRSABg4cGDFN998c8VxRm5ubrjvvvtOLV++3P+xxx47tXfvXp+1a9cerbuPv79/tZeXl2XixIm9Ro0aVTphwoRSANi1a5fvmjVrjgKAh4cHOnXqVL19+3afu+66q6RDhw4WABg1atTpr7/+2nf8+PEl3bp1uzBixIizALBly5YO3377bYeoqKgowFoEZWVleY8YMaLsueee6zljxozuY8eOLU1ISCgHgMLCQs+uXbteBIDIyMjKvLw8rylTpvRMSkoqveeee85cel3p6ele2dnZbe+4445wwNqlGRgYWNvtOGnSpGIASExMLC8vL3crKipyT0pKKktKSjrY0H8rpS7/zoaI1G4MDAys+uWXXxocc+YIFl1mdu+7uhMQubQ5/St0R7BTEjMFANBBcw5XERMTU/Gf//zHr77P6vsHuoaXl1fth+7u7qiqqrps0PyVjm/Xrp2lofO5ubnB29tb1byurq5udED+jBkzTo0aNaqvt7e3SkpKOu3paT8W3dPTE2lpaZnr16/vsGrVKr+lS5d2/v7773+q71yO5lZKYc6c
OQXz5s0runS/vXv3Hvzss886Pvfcc92Tk5PPvPbaawVeXl6WiooKNwAIDAysPnDgwMF169Z1WLJkSefVq1f717Rg1Tm/9O3btyItLS2rviwictn7xlq6goODL9Qtqo4fP94mKCiotpA7f/68tG3b1nLp8VeDY7rMrGMP60JEWgR6KwR6m2fGgg4d/NChQ701AF0DSUlJZRcuXJDFixcH1Gz75ptv2m3cuNFn+PDhZWvWrPGvqqpCfn6+x549e3xuueWWs46e29njr8THx6e6tLS0dhR7SEjIxS5dulxcvHhxt4cffviyIqi0tNStuLjYfcKECaX/+Mc/8jIzM9sBwLBhw8peffXVQMA6zqq4uNjtjjvuKN+0adN1ZWVlbmfOnHHbtGmT3+23337Z4PLExMQzH330UUBpaakbABw9etTzl19+8cjNzfX09fW1zJw5s3jOnDkn09LS2gFAWFjY+czMTC8AKCgo8KiursbUqVNLXnrppV/279/f7tLrGjBgwPni4mKP5OTk9gBQWVkpKSkp3jU//5NPPvEDgC+//NLH19e3ulOnTtVJSUllWVlZBy9d9u3bl2W7J2dzc3O9s7Ky2pw/f17Wrl3rP27cuJKac+bk5HgPHDjQqf8nxpYuMzvwmXXdf5zeHEQuaucJ65/Im7tWaU5idSJ7LwCga9j1mpO4Bjc3N6xfvz5n5syZPd94442uXl5eqkePHpVvv/12XmJiYvmuXbt8+vXrFy0iauHChceDg4Or0tPTHTr3Aw88UOLM8VcyefLkolmzZvWaN2+eJSUlJdPHx0dNnDjx1N///nePIUOGnL90/5KSEvfRo0f3raysFAB46aWX8gBg6dKlx6ZOndorPDw8wM3NDe+8887P8fHxZydNmnTq+uuv72e7jsJhw4ZVHDp0yK7b7d577z2TkZHhPXTo0EjA2gr28ccfH83KyvKaP39+Dzc3N3h4eKglS5b8DACJiYklX331le/dd99dlpub6/nQQw+FWCwWAYAXX3zxeH3XtWrVqpzZs2cHl5WVuVdXV8uMGTNOxsbGngcAPz+/6sGDB0eWl5e7L1u2zK47tSGenp5YvHjxsYSEhPDq6mpMmjSpqOZ8lZWVkpub63Xrrbc6VRjLlZoKjRIbG6tSUlJ0xzCf90ZZ19M26s1B5ALSkj+5bNtzKdaB9C/H1juW2nBu370BALAMm3PZZ4Pif29sGAMkJydfiI+P3687R2swefLk4MGDB5+bO3fuZS1dZlBeXi7Dhg2LSE1NzXJ0moyGxMXFRdQMuG+mePjwww+vS01Nbffmm2/mN7bvjz/+GDBw4MCQ+j5jS5dZ1Mw+X1fFqYY/4yz1RETkgOjo6H5t27a1vPvuu3mN762Hj4+PWrBgQf7Ro0fbhIWFNThXly5VVVXy/PPPOz0rMosuIiKiViwjIyNTdwZHjBs37rJvKTbFnj17DjXHeep68MEHTzfHeTiQnoiIiMgALLqIiIiIDMDuRTO7ea7uBEQu7ekB5pqnq3zQwwAAh55pQkSmw6LLzLw4BSKRTh3a6P92d13t2rfXHYGInMDuRTM7st26EJEW2/I9sS3fs/EdDXLi0A84cegH3TFcyrFjxzxGjx7dp2fPnv1DQ0Ojhw8f3jc9Pd2rKefasmWLT9++faNrHtjcnDkXLVoU+M4773QCgLfeeqtTbm6ueX5xHZSQkNDn4MGDDT5mx+jr2rdvn/egQYMi27Rpc/2CBQu61Gw/f/68xMbGRly8ePFKh9eLLV1mdvQb67rPbVpjELmqr2wF14igq//jei0EFe0GAFgibtCcRJP9/+7YrOeLGV96pY8tFgvGjBnTd9KkSac2bNhwBAB27drVNj8/33PAgAGVV/vjPvzwQ/9Zs2adePzxx085sn9VVRUcnbPqqaeeKqx5vXLlyoBBgwZVhISEaPnFtVgsUErB3d298Z1tUlJSvKurqyUqKqrB6SKMvq7OnTtXvfnmm8fWrFlj9xgIb29v
NXz48DP//Oc//WfMmFF8NedkSxcREVE9NmzY4Ovh4aHqFjQ33XRTRUJCQrnFYsEjjzzSIywsLDo8PDxq+fLlfjXHxMXFRSQkJPTp3bt39JgxY3pbLBa8/vrrARs3bvRftGhRUM22ho6/4YYbwpOSknpHREREb9iwwXfo0KERd911V5+QkJD+M2fO7L506VL/mJiYfuHh4VEZGRleAPDEE08ELViwoMt7773nd+DAgXaTJ0/uExkZGbVq1aqOd955Z2hN/nXr1nUYOXJk6KXXOnPmzO6hoaHR4eHhUdOnT+8BAHl5eR533nlnaERERFRERETU1q1b2wPACy+80CUsLCw6LCws+sUXX+wMAIcOHWrTp0+f6Pvvvz84Ojo6Kicnp83zzz/fpX///v3Cw8Oj5s6dGwQAZ86ccbvtttv6RkRERIWFhUXXXPf777/fKSkpqQSwFpvjxo0Lqflvs3Dhws6XXld5ebns2LGj3dChQyOio6P73XzzzWE///yzJ2CdHPXBBx/sOXjw4MiwsLDor7/+uknDILt37141fPjwc56enpeNM/jd735XsmrVKv+rPSdbuoiIiOqRnp7eduDAgfXOav7hhx9et3///raZmZkZBQUFHnFxcf1GjhxZDgCZmZlt09LSjoSEhFwcMmRI5NatW32eeOKJou+++85n9OjRpdOmTTv9/vvvN3h8enp6+3379mVERkZe2LBhg29WVlbbNWvWHOncuXNVr169Yry8vIr279+f+ec//7nz4sWLO69YsaJ20tNp06adXrp0aeeaGdktFgvmz5/fIz8/3yMoKKhqxYoVnaZOnWo3K/3JkyfdN23a5HfkyJEDbm5uKCoqcgeARx99NPiWW24pW7BgQU5VVRVKS0vdd+zY0e5f//pXp9TU1EylFIYMGdJvxIgRZQEBAdW5ubney5cvz125cuWxtWvXdjh8+LB3enp6plIK8fHxfTdv3uxz8uRJj65du17cvn37YQA4deqUOwD88MMPPpMnTy4GgN27d7crKCjwzM7OzgCAoqIi94CAgOq611VZWSmzZ88O3rhx4+GgoKCq5cuX+z355JPdax6Mfe7cObd9+/Zlbd682Wf69Om9s7OzMxp74PXV/G4MHTq0Ij09/aoHWbLoaqnqm6X+SjiDPRFRs9mxY4fvfffdV+zh4YGePXtW3XDDDeU7d+5s17FjR0tMTMzZ0NDQiwAQHR19Licn57JxSlc6fsCAAWcjIyNru9liYmLO9urV6yIABAcHVyYmJpYCwMCBAyu++eYb3yvldHNzw3333Xdq+fLl/o899tipvXv3+qxdu9buWYT+/v7VXl5elokTJ/YaNWpU6YQJE0oBYNeuXb5r1qw5CgAeHh7o1KlT9fbt233uuuuukg4dOlgAYNSoUae//vpr3/Hjx5d069btwogRI84CwJYtWzp8++23HaKioqIAaxGUlZXlPWLEiLLnnnuu54wZM7qPHTu2NCEhoRwACgsLPbt27XoRACIjIyvz8vK8pkyZ0jMpKan0nnvuuWzS1PT0dK/s7Oy2d9xxRzhg7dIMDAys7XacNGlSMQAkJiaWl5eXuxUVFbknJSWVJSUlHbzynXWMh4cHPD091enTp938/PwsDh/XHD+ciIiotYmJian4z3/+41ffZ1d6brGXl1fth+7u7qiqqrps0PyVjm/Xrp3dP+J1z+fm5gZvb29V87q6urrRAfkzZsw4NWrUqL7e3t4qKSnptKen/Vh0T09PpKWlZa5fv77DqlWr/JYuXdr5+++//6m+czmaWymFOXPmFMybN++yZz3u3bv34Geffdbxueee656cnHzmtddeK/Dy8rJUVFS4AUBgYGD1gQMHDq5bt67DkiVLOq9evdq/pgWrzvmlb9++FWlpafW2UInIZe8ba+n661//GvjBBx8EAsCWLVuyGxs7dvHiRWnXrt1VfcWZY7rMbPgz1oWItFgw+BwWDDbHw64BoHLoTFQOnak7hstISkoq
u3DhgixevDigZts333zTbuPGjT7Dhw8vW7NmjX9VVRXy8/M99uzZ43PLLbecdfTczh5/JT4+PtWlpaW1o9hDQkIudunS5eLixYu7Pfzww5cVQaWlpW7FxcXuEyZMKP3HP/6Rl5mZ2Q4Ahg0bVvbqq68GAtZxVsXFxW533HFH+aZNm64rKytzO3PmjNumTZv8br/99rJLz5mYmHjmo48+CigtLXUDgKNHj3r+8ssvHrm5uZ6+vr6WmTNnFs+ZM+dkWlpaOwAICws7n5mZ6QUABQUFHtXV1Zg6dWrJSy+99Mv+/fvbXXpdAwYMOF9cXOyRnJzcHgAqKyslJSXFu+bnf/LJJ34A8OWXX/r4+vpWd+rUqTopKaksKyvr4KVLTdfi/PnzC2u2NVZwnThxwt3Pz6+qbkHsCLZ0mZlHk76VTEQAkg9e3bNpA+rZ5uX4l68M4dmmwW/T0zXg5uaG9evX58ycObPnG2+80dXLy0v16NGj8u23385LTEws37Vrl0+/fv2iRUQtXLjweHBwcFV6erpD537ggQdKnDn+SiZPnlw0a9asXvPmzbOkpKRk+vj4qIkTJ576+9//7jFkyJDzl+5fUlLiPnr06L6VlZUCAC+99FIeACxduvTY1KlTe4WHhwe4ubnhnXfe+Tk+Pv7spEmTTl1//fX9bNdROGzYsIpDhw7Z/XLee++9ZzIyMryHDh0aCVhbwT7++OOjWVlZXvPnz+/h5uYGDw8PtWTJkp8BIDExseSrr77yvfvuu8tyc3M9H3rooRCLxSIA8OKLLx6v77pWrVqVM3v27OCysjL36upqmTFjxsnY2NjzAODn51c9ePDgyPLycvdly5bZdac66tixYx5Dhw6NOnv2rLuIqHfffbdLZmbmAX9/f8vmzZs7jBgx4orffq2PXKmp0CixsbEqJSVFdwy96hujlf2/1nXYSOfPzzFd5GKuuujK/+qybZvyrN0wd/U0x5QRJw7uAAB0jbrlss8G9bzO8RO1kL8HycnJF+Lj4/frztEaTJ48OXjw4MHn5s6de1lLlxmUl5fLsGHDIlJTU7McnSajIXFxcRE1A+6bKd5lRo4cGfrqq68eHzhw4GVTh/z4448BAwcODKnvOLZ0XUtXO9j9Usesc/I0S9FF5GLqK6Ku1ncnzVV0BZ1OBQBYcHnRlZZX4vB5BkU0VyJqCaKjo/u1bdvW8u677+Y1vrcePj4+asGCBflHjx5tExYW1uBcXWZw/vx5GTNmTEl9BVdjrlnRJSIJAN4E4A7gn0qpV67VzyIiIqL6ZWRkZOrO4Ihx48Zd9i3FptizZ8+h5jhPQ7y9vdUf//hHhya4vdQ1KbpExB3A3wHcCeA4gP+KyHqlVLN8VZOagFNMEBERaXWtWrriABxWSh0BABFZBWAsABZdRNRkacmf6I5ABlJKwWKxiJubm/7Bx0QOsA3+b3DermtVdHUHULfv+DiAlv+wMGfHaBERkcNyc3MrCgsLOwYGBpay8CKzs1gsUlhY2BHAgYb2uVZFV32Ttdn9D0ZEpgOYbntbLiLN0QcbAMCU38xwzl31bWyl11ovV7pWwLWut0Vc66bmOU0zXuskzcc3qlmu1d/fPyQuLu7giRMn+oPzSpL5WQAcqKqq+kNDO1yrous4gLqzvvYAkF93B6XUMgDLmvOHikiKUiq2Oc9pVrzW1suVrpfX2jq50rUSXY1r9f8c/gsgTER6i0gbABMBrL9GP4uIiIjI9K5JS5dSqkpE/gjgS1injFihlMq4Fj+LiIiIqCW4ZvN0KaU2odmGQjisWbsrTY7X2nq50vXyWlsnV7pWIoeZ4jFARERERK0dvw1CREREZIAWXXSJyKsikiUi6SKyTkSua2C/XBHZLyJpItIin6x9FdeaICKHROSwiDxjcMxmISLjRSRDRCwi0uA3oFrDfQWu6npbw731F5GtIpJtW/s1sF+LvbeN3Sexesv2ebqIXK8jZ3Nw4FpvE5FS231ME5EFOnISmUWL
LroAbAXQXyk1AMBPAOZfYd/blVKDWvDXmBu91jqPX0oEEAXg9yISZWjK5nEAwL0AvnVg35Z+XwEHrrcV3dtnAGxTSoUB2GZ735AWd28dvE+JAMJsy3QASw0N2Uyu4ndyh+0+DlJKvWhoSCKTadFFl1Lqf5VSVba338M6H1ir5OC11j5+SSl1AUDN45daFKVUplLqmj6w1EwcvN5WcW9hzfyB7fUHAO7WF+WacOQ+jQXwobL6HsB1ItLN6KDNoLX8ThIZpkUXXZd4EEBDz+lRAP5XRFJtM+G3dA1da32PX+puSCI9Wtt9vZLWcm+7KKUKAMC27tzAfi313jpyn1rLvXT0Om4UkR9FZLOIRBsTjcicrtmUEc1FRJIBdK3no+eUUp/b9nkOQBWAjxs4zTClVL6IdAawVUSylFKOdF0ZqhmutdHHL5mFI9fqgBZxX4Fmud5WcW+v4jQt5t5ewpH71GLuZSMcuY69AHoppcpF5C4A/4G1W5XIJZm+6FJKxV/pcxGZAmA0gBGqgfkvlFL5tvWvIrIO1mZx0/0Bb4ZrbfTxS2bR2LU6eI4WcV+BZrneVnFvReSkiHRTShXYutR+beAcLebeXsKR+9Ri7mUjHHnc25k6rzeJyBIRCVBKmf55m0TXQovuXhSRBABPAxijlDrXwD7tRcS35jWAkbjCE8DNypFrhQs9fqm13Ner0Fru7XoAU2yvpwC4rJWvhd9bR+7TegCTbd9i/A2A0pou1xam0WsVka4iIrbXcbD+m3PK8KREJtGiiy4A7wDwhbX7IU1E/gEAIhIkIjWz4XcBsFNEfgSwB8BGpdQWPXGd0ui12gba1zx+KRPApy3x8Usico+IHAdwI4CNIvKlbXtrvK8OXW9rubcAXgFwp4hkA7jT9r7V3NuG7pOIPCoij9p22wTgCIDDAJYDmKklrJMcvNbfAThgu5dvAZjYUI8EkSvgjPREREREBmjpLV1ERERELQKLLiIiIiIDsOgiIiIiMgCLLiIiIiIDsOgiIiIiMgCLLiIiIiIDsOgiIiIiMgCLLiIiIiID/H/E328au+/P6QAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(7, 5))\n", + "for i, j in enumerate([0, -1]):\n", + " plt.hist(conformity_scores_pfit[j], range=[-2.5, 0.5], bins=30, color=f\"C{i}\", alpha=0.3, label=f\"Conformity scores(step={j})\")\n", + " plt.axvline(lower_quantiles_pfit[j], ls=\"--\", color=f\"C{i}\")\n", + " plt.axvline(higher_quantiles_pfit[j], ls=\"--\", color=f\"C{i}\")\n", + "plt.legend(loc=[1, 0])" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "b42a3cd8426fe817ae9ba121b635fcbd42db4879a102a9a759ab9c3dd732f904" + }, + "jupytext": { + "formats": "ipynb,md" + }, + "kernelspec": { + "display_name": "mapie-notebooks", + "language": "python", + "name": "mapie-notebooks" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/regression/ts-changepoint.md b/notebooks/regression/ts-changepoint.md new file mode 100644 index 000000000..c8e511520 --- /dev/null +++ b/notebooks/regression/ts-changepoint.md @@ -0,0 +1,372 @@ +--- +jupyter: + jupytext: + formats: ipynb,md + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.13.6 + kernelspec: + display_name: mapie-notebooks + language: python + name: mapie-notebooks +--- + +# Estimating prediction intervals of time series forecast with EnbPI + + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/scikit-learn-contrib/MAPIE/blob/add-ts-notebooks/notebooks/regression/ts-changepoint.ipynb) + + +This example uses `mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate +prediction intervals associated with time series 
forecast. It follows Xu \& Xie (2021). +We use here the Victoria electricity demand dataset used in the book +"Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos. +The electricity demand features daily and weekly seasonalities and is impacted +by the temperature, considered here as an exogenous variable. +A Random Forest model is already fitted on data. The hyper-parameters are +optimized with a `sklearn.model_selection.RandomizedSearchCV` using a +sequential `sklearn.model_selection.TimeSeriesSplit` cross validation, +in which the training set is prior to the validation set. +The best model is then fed into +`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the +associated prediction intervals. We compare two approaches: with or without +``partial_fit`` called at every step. + +```python +install_mapie = False +if install_mapie: + !pip install "git+https://github.com/scikit-learn-contrib/MAPIE.git@add-ts-notebooks" +``` + +```python +import warnings + +import numpy as np +import pandas as pd +from matplotlib import pylab as plt +from scipy.stats import randint +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit + +from mapie.metrics import regression_coverage_score, regression_mean_width_score +from mapie.subsample import BlockBootstrap +from mapie.time_series_regression import MapieTimeSeriesRegressor + +%reload_ext autoreload +%autoreload 2 +warnings.simplefilter("ignore") +``` + +## 1.
Load input data and feature engineering + +```python +url_file = "https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/master/examples/data/demand_temperature.csv" +demand_df = pd.read_csv( + url_file, parse_dates=True, index_col=0 +) + +demand_df["Date"] = pd.to_datetime(demand_df.index) +demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") +demand_df["Weekday"] = demand_df.Date.dt.isocalendar().day.astype("int64") +demand_df["Hour"] = demand_df.index.hour +n_lags = 5 +for hour in range(1, n_lags): + demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) + +``` + +## 2. Train/validation/test split + +```python +num_test_steps = 24 * 7 +demand_train = demand_df.iloc[:-num_test_steps, :].copy() +demand_test = demand_df.iloc[-num_test_steps:, :].copy() +features = ["Weekofyear", "Weekday", "Hour", "Temperature"] +features += [f"Lag_{hour}" for hour in range(1, n_lags)] + +X_train = demand_train.loc[ + ~np.any(demand_train[features].isnull(), axis=1), features +] +y_train = demand_train.loc[X_train.index, "Demand"] +X_test = demand_test.loc[:, features] +y_test = demand_test["Demand"] +``` + +```python +plt.figure(figsize=(16, 5)) +plt.plot(y_train) +plt.plot(y_test) +plt.ylabel("Hourly demand (GW)") +``` + +## 3. 
Optimize the base estimator + +```python +model_params_fit_not_done = False +if model_params_fit_not_done: + # CV parameter search + n_iter = 100 + n_splits = 5 + tscv = TimeSeriesSplit(n_splits=n_splits) + random_state = 59 + rf_model = RandomForestRegressor(random_state=random_state) + rf_params = {"max_depth": randint(2, 30), "n_estimators": randint(10, 100)} + cv_obj = RandomizedSearchCV( + rf_model, + param_distributions=rf_params, + n_iter=n_iter, + cv=tscv, + scoring="neg_root_mean_squared_error", + random_state=random_state, + verbose=0, + n_jobs=-1, + ) + cv_obj.fit(X_train, y_train) + model = cv_obj.best_estimator_ +else: + # Model: Random Forest previously optimized with a cross-validation + model = RandomForestRegressor( + max_depth=10, n_estimators=50, random_state=59) +``` + +## 4. Estimate prediction intervals on the test set + +```python +alpha = 0.05 +gap = 1 +cv_mapiets = BlockBootstrap( + n_resamplings=100, length=48, overlapping=True, random_state=59 +) +mapie_enbpi = MapieTimeSeriesRegressor( + model, method="enbpi", cv=cv_mapiets, agg_function="mean", n_jobs=-1 +) +``` + +### Without partial fit + +```python +print("EnbPI, with no partial_fit, width optimization") +mapie_enbpi = mapie_enbpi.fit(X_train, y_train) +y_pred_npfit, y_pis_npfit = mapie_enbpi.predict( + X_test, alpha=alpha, ensemble=True, beta_optimize=True +) +coverage_npfit = regression_coverage_score( + y_test, y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0] +) +width_npfit = regression_mean_width_score( + y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0] +) +``` + +### With partial fit + +```python +print("EnbPI with partial_fit, width optimization") +mapie_enbpi = mapie_enbpi.fit(X_train, y_train) + +y_pred_pfit = np.zeros(y_pred_npfit.shape) +y_pis_pfit = np.zeros(y_pis_npfit.shape) +y_pred_pfit[:gap], y_pis_pfit[:gap, :, :] = mapie_enbpi.predict( + X_test.iloc[:gap, :], alpha=alpha, ensemble=True +) +for step in range(gap, len(X_test), gap): + mapie_enbpi.partial_fit( + X_test.iloc[(step 
- gap):step, :], + y_test.iloc[(step - gap):step], + ) + ( + y_pred_pfit[step:step + gap], + y_pis_pfit[step:step + gap, :, :], + ) = mapie_enbpi.predict( + X_test.iloc[step:(step + gap), :], + alpha=alpha, + ensemble=True + ) +coverage_pfit = regression_coverage_score( + y_test, y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0] +) +width_pfit = regression_mean_width_score( + y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0] +) +``` + +## 5. Plot estimated prediction intervals on test set + +```python +y_preds = [y_pred_npfit, y_pred_pfit] +y_pis = [y_pis_npfit, y_pis_pfit] +coverages = [coverage_npfit, coverage_pfit] +widths = [width_npfit, width_pfit] +``` + +```python +def plot_forecast(y_train, y_test, y_preds, y_pis, coverages, widths, plot_coverage=True): + fig, axs = plt.subplots( + nrows=2, ncols=1, figsize=(14, 8), sharey="row", sharex="col" + ) + for i, (ax, w) in enumerate(zip(axs, ["without", "with"])): + ax.set_ylabel("Hourly demand (GW)") + ax.plot(y_train[int(-len(y_test)/2):], lw=2, label="Training data", c="C0") + ax.plot(y_test, lw=2, label="Test data", c="C1") + + ax.plot( + y_test.index, y_preds[i], lw=2, c="C2", label="Predictions" + ) + ax.fill_between( + y_test.index, + y_pis[i][:, 0, 0], + y_pis[i][:, 1, 0], + color="C2", + alpha=0.2, + label="Prediction intervals", + ) + title = f"EnbPI, {w} update of residuals. " + if plot_coverage: + title += f"Coverage:{coverages[i]:.3f} and Width:{widths[i]:.3f}" + ax.set_title(title) + ax.legend() + fig.tight_layout() + plt.show() +``` + +```python +plot_forecast(y_train, y_test, y_preds, y_pis, coverages, widths) +``` + +## 6. Forecast on test dataset with change point + + +We will now see how MAPIE adapts its prediction intervals when an abrupt change point arises in the test set. To simulate this, we will artificially decrease the electricity demand by 2 GW in the test set, aiming at simulating an effect, such as a blackout or a lockdown due to a pandemic, that was not taken into account by the model during its training.
+ + +### Corrupt the dataset + +```python +demand_df_corrupted = demand_df.copy() +demand_df_corrupted.Demand.iloc[-int(num_test_steps/2):] -= 2 +``` + +```python +n_lags = 5 +for hour in range(1, n_lags): + demand_df[f"Lag_{hour}"] = demand_df["Demand"].shift(hour) +demand_train_corrupted = demand_df_corrupted.iloc[:-num_test_steps, :].copy() +demand_test_corrupted = demand_df_corrupted.iloc[-num_test_steps:, :].copy() + +X_train = demand_train_corrupted.loc[ + ~np.any(demand_train_corrupted[features].isnull(), axis=1), features +] +y_train = demand_train_corrupted.loc[X_train.index, "Demand"] +X_test = demand_test_corrupted.loc[:, features] +y_test = demand_test_corrupted["Demand"] +``` + +```python +plt.figure(figsize=(16, 5)) +plt.ylabel("Hourly demand (GW)") +plt.plot(y_train) +plt.plot(y_test) +``` + +### Prediction intervals without partial fit + +```python +print("EnbPI, with no partial_fit, width optimization") +mapie_enbpi = mapie_enbpi.fit(X_train, y_train) +y_pred_npfit, y_pis_npfit = mapie_enbpi.predict( + X_test, alpha=alpha, ensemble=True, beta_optimize=True +) +coverage_npfit = regression_coverage_score( + y_test, y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0] +) +width_npfit = regression_mean_width_score( + y_pis_npfit[:, 0, 0], y_pis_npfit[:, 1, 0] +) +``` + +### Prediction intervals with partial fit + +```python +print("EnbPI with partial_fit, width optimization") +mapie_enbpi = mapie_enbpi.fit(X_train, y_train) + +y_pred_pfit = np.zeros(y_pred_npfit.shape) +y_pis_pfit = np.zeros(y_pis_npfit.shape) +conformity_scores_pfit, lower_quantiles_pfit, higher_quantiles_pfit = [], [], [] +y_pred_pfit[:gap], y_pis_pfit[:gap, :, :] = mapie_enbpi.predict( + X_test.iloc[:gap, :], alpha=alpha, ensemble=True +) +for step in range(gap, len(X_test), gap): + mapie_enbpi.partial_fit( + X_test.iloc[(step - gap):step, :], + y_test.iloc[(step - gap):step], + ) + ( + y_pred_pfit[step:step + gap], + y_pis_pfit[step:step + gap, :, :], + ) = mapie_enbpi.predict( + 
X_test.iloc[step:(step + gap), :], + alpha=alpha, + ensemble=True + ) + conformity_scores_pfit.append(mapie_enbpi.conformity_scores_) + lower_quantiles_pfit.append(mapie_enbpi.lower_quantiles_) + higher_quantiles_pfit.append(mapie_enbpi.higher_quantiles_) +coverage_pfit = regression_coverage_score( + y_test, y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0] +) +width_pfit = regression_mean_width_score( + y_pis_pfit[:, 0, 0], y_pis_pfit[:, 1, 0] +) +``` + +### Plot estimated prediction intervals on test set + +```python +y_preds = [y_pred_npfit, y_pred_pfit] +y_pis = [y_pis_npfit, y_pis_pfit] +coverages = [coverage_npfit, coverage_pfit] +widths = [width_npfit, width_pfit] +``` + +```python +plot_forecast(y_train, y_test, y_preds, y_pis, coverages, widths, plot_coverage=False) +``` + +```python +window = 24 +rolling_coverage_pfit, rolling_coverage_npfit = [], [] +for i in range(window, len(y_test), 1): + rolling_coverage_pfit.append( + regression_coverage_score( + y_test[i-window:i], y_pis_pfit[i-window:i, 0, 0], y_pis_pfit[i-window:i, 1, 0] + ) + ) + rolling_coverage_npfit.append( + regression_coverage_score( + y_test[i-window:i], y_pis_npfit[i-window:i, 0, 0], y_pis_npfit[i-window:i, 1, 0] + ) + ) +``` + +### Marginal coverage on a 24-hour rolling window of prediction intervals + +```python +plt.figure(figsize=(10, 5)) +plt.ylabel(f"Rolling coverage [{window} hours]") +plt.plot(y_test[window:].index, rolling_coverage_npfit, label="Without update of residuals") +plt.plot(y_test[window:].index, rolling_coverage_pfit, label="With update of residuals") +``` + +### Temporal evolution of the distribution of residuals used for estimating prediction intervals + +```python +plt.figure(figsize=(7, 5)) +for i, j in enumerate([0, -1]): + plt.hist(conformity_scores_pfit[j], range=[-2.5, 0.5], bins=30, color=f"C{i}", alpha=0.3, label=f"Conformity scores(step={j})") + plt.axvline(lower_quantiles_pfit[j], ls="--", color=f"C{i}") + plt.axvline(higher_quantiles_pfit[j], ls="--", 
color=f"C{i}") +plt.legend(loc=[1, 0]) +``` From c06b0e609b6750511baaef4721739c63a9880922 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 1 Jun 2022 12:26:44 +0200 Subject: [PATCH 23/32] Take VTA remarks into account --- .gitignore | 1 + HISTORY.rst | 2 +- doc/images/quickstart_1.png | Bin 82894 -> 82894 bytes .../plot_MapieRegressor_benchmark.py | 163 ------------- .../plot_timeseries_enbpi.py | 166 ++++--------- mapie/quantile_timeit.ipynb | 230 ------------------ mapie/regression.py | 43 ++-- mapie/subsample.py | 46 ++-- mapie/tests/test_time_series_regression.py | 6 +- mapie/time_series_regression.py | 189 ++++++-------- 10 files changed, 173 insertions(+), 673 deletions(-) delete mode 100644 examples/regression/2-advanced-analysis/plot_MapieRegressor_benchmark.py delete mode 100644 mapie/quantile_timeit.ipynb diff --git a/.gitignore b/.gitignore index cad52327c..4591e989d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *$py.class .DS_Store +.mypy* # C extensions *.so diff --git a/HISTORY.rst b/HISTORY.rst index 872fe634a..4660c8661 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,6 +5,7 @@ History 0.3.3 (2022-XX-XX) ------------------ * Relax and fix typing +* Add EnbPI method for Time Series Regression 0.3.2 (2022-03-11) ------------------ @@ -17,7 +18,6 @@ History * Uniformize the use of matrix k_ and add an argument "ensemble" to method "predict" in regression.py * Add replication of the Chen Xu's tutorial testing Jackknife+aB vs Jackknife+ * Add Jackknife+-after-Bootstrap documentation -* Add EnbPI method for Time Series Regression * Improve scikit-learn pipelines compatibility 0.3.1 (2021-11-19) diff --git a/doc/images/quickstart_1.png b/doc/images/quickstart_1.png index 969d76c373a61a661cad58d6a425832d8522b8e7..3322f6981ea6a768e1951c2ea27841fed0626ba1 100644 GIT binary patch delta 46 zcmX@t&U&t$b%KYSp^idENl8JmmA-y%Vo5 NDArray: - """One-dimensional x*sin(x) function.""" - return x * np.sin(x) - - -def 
get_1d_data_with_normal_distrib( - funct: F, mu: float, sigma: float, n_samples: int, noise: float -) -> Tuple[NDArray, NDArray, NDArray, NDArray, NDArray]: - """ - Generate noisy 1D data with normal distribution from given function - and noise standard deviation. - - Parameters - ---------- - funct : F - Base function used to generate the dataset. - mu : float - Mean of normal training distribution. - sigma : float - Standard deviation of normal training distribution. - n_samples : int - Number of training samples. - noise : float - Standard deviation of noise. - - Returns - ------- - Tuple[NDArray, AnNDArrayy, NDArray, NDArray, NDArray] - Generated training and test data. - [0]: X_train - [1]: y_train - [2]: X_test - [3]: y_test - [4]: y_mesh - """ - np.random.seed(42) - X_train = np.random.normal(mu, sigma, n_samples) - X_test = np.arange(mu - 4 * sigma, mu + 4 * sigma, sigma / 20.0) - y_train, y_mesh, y_test = funct(X_train), funct(X_test), funct(X_test) - y_train += np.random.normal(0, noise, y_train.shape[0]) - y_test += np.random.normal(0, noise, y_test.shape[0]) - return ( - X_train.reshape(-1, 1), - y_train, - X_test.reshape(-1, 1), - y_test, - y_mesh, - ) - - -# Data generation -mu, sigma, n_samples, noise = 0, 2.5, 300, 0.5 -X_train, y_train, X_test, y_test, y_mesh = get_1d_data_with_normal_distrib( - x_sinx, mu, sigma, n_samples, noise -) - -# Definition of our base model -degree_polyn = 10 -polyn_model = Pipeline( - [ - ("poly", PolynomialFeatures(degree=degree_polyn)), - ("linear", LinearRegression()), - ] -) - -# Estimating prediction intervals -Params = TypedDict("Params", {"method": str, "cv": Union[int, Subsample]}) -STRATEGIES = { - "jackknife_plus": Params(method="plus", cv=-1), - "jackknife_minmax": Params(method="minmax", cv=-1), - "cv_plus": Params(method="plus", cv=10), - "cv_minmax": Params(method="minmax", cv=10), - "jackknide-plus-after-bootstrap": Params(method="plus", cv=10), - "jackknide-minmax-after-bootstrap": Params(method="minmax", 
cv=10), -} -y_pred, y_pis = {}, {} -for strategy, params in STRATEGIES.items(): - mapie = MapieRegressor(polyn_model, **params) - mapie.fit(X_train, y_train) - y_pred[strategy], y_pis[strategy] = mapie.predict(X_test, alpha=0.05) - - -# Visualization -def plot_1d_data( - X_train: NDArray, - y_train: NDArray, - X_test: NDArray, - y_test: NDArray, - y_sigma: float, - y_pred: NDArray, - y_pred_low: NDArray, - y_pred_up: NDArray, - ax: plt.Axes, - title: str, -) -> None: - ax.set_xlabel("x") - ax.set_ylabel("y") - ax.set_xlim([-10, 10]) - ax.set_ylim([np.min(y_test) * 1.3, np.max(y_test) * 1.3]) - ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3) - ax.scatter(X_train, y_train, color="red", alpha=0.3, label="Training data") - ax.plot(X_test, y_test, color="gray", label="True confidence intervals") - ax.plot(X_test, y_test - y_sigma, color="gray", ls="--") - ax.plot(X_test, y_test + y_sigma, color="gray", ls="--") - ax.plot(X_test, y_pred, color="b", alpha=0.5, label="Prediction intervals") - if title is not None: - ax.set_title(title) - ax.legend() - - -n_figs = len(STRATEGIES) -fig, axs = plt.subplots(2, 2, figsize=(13, 12)) -coords = [axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]] -for strategy, coord in zip(STRATEGIES, coords): - plot_1d_data( - X_train.ravel(), - y_train.ravel(), - X_test.ravel(), - y_mesh.ravel(), - 1.96 * noise, - y_pred[strategy].ravel(), - y_pis[strategy][:, 0, 0].ravel(), - y_pis[strategy][:, 1, 0].ravel(), - ax=coord, - title=strategy, - ) - - -fig, ax = plt.subplots(1, 1, figsize=(7, 5)) -ax.set_xlim([-8, 8]) -ax.set_ylim([0, 4]) -for strategy in STRATEGIES: - ax.plot(X_test, y_pis[strategy][:, 1, 0] - y_pis[strategy][:, 0, 0]) -ax.axhline(1.96 * 2 * noise, ls="--", color="k") -ax.set_xlabel("x") -ax.set_ylabel("Prediction Interval Width") -ax.legend(list(STRATEGIES.keys()) + ["True width"], fontsize=8) -plt.show() diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py 
b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 0cb4da004..d86ecd1f4 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -2,28 +2,27 @@ ================================================================== Estimating prediction intervals of time series forecast with EnbPI ================================================================== + This example uses :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate -prediction intervals associated with time series forecast. It follows [6] and -an alternative expermimental implemetation inspired from [2] +prediction intervals associated with time series forecast. It follows [6]. We use here the Victoria electricity demand dataset used in the book "Forecasting: Principles and Practice" by R. J. Hyndman and G. Athanasopoulos. The electricity demand features daily and weekly seasonalities and is impacted by the temperature, considered here as a exogeneous variable. -A Random Forest model is aloready fitted on data. The hyper-parameters are +A Random Forest model is already fitted on data. The hyper-parameters are optimized with a :class:`sklearn.model_selection.RandomizedSearchCV` using a sequential :class:`sklearn.model_selection.TimeSeriesSplit` cross validation, in which the training set is prior to the validation set. The best model is then feeded into :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the associated prediction intervals. We compare four approaches: with or without -``partial_fit`` called at every step, and following [6] or a approach inspired -from [2]. It appears that the approach inspired from [2] and ``partial_fit`` -offer higher coverage, but with higher width of PIs and are much slower. +``partial_fit`` called at every step, and following [6]. 
It appears that +``partial_fit`` offer higher coverage, but with higher width of PIs and is much +slower. """ -import warnings import numpy as np import pandas as pd @@ -32,12 +31,13 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit -from mapie.metrics import regression_coverage_score +from mapie.metrics import ( + regression_coverage_score, + regression_mean_width_score, +) from mapie.subsample import BlockBootstrap from mapie.time_series_regression import MapieTimeSeriesRegressor -warnings.simplefilter("ignore") - # Load input data and feature engineering demand_df = pd.read_csv( "../data/demand_temperature.csv", parse_dates=True, index_col=0 @@ -66,8 +66,8 @@ X_test = demand_test.loc[:, features] y_test = demand_test["Demand"] -model_params_fit_not_done = False -if model_params_fit_not_done: +perform_hyperparameters_search = False +if perform_hyperparameters_search: # CV parameter search n_iter = 100 n_splits = 5 @@ -90,104 +90,69 @@ else: # Model: Random Forest previously optimized with a cross-validation model = RandomForestRegressor( - max_depth=10, n_estimators=50, random_state=59) + max_depth=10, n_estimators=50, random_state=59 + ) # Estimate prediction intervals on test set with best estimator alpha = 0.05 -cv_MapieTimeSeries = BlockBootstrap( +cv_mapietimeseries = BlockBootstrap( n_resamplings=100, length=48, overlapping=True, random_state=59 ) -mapie_plus = MapieTimeSeriesRegressor( - model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 -) mapie_enpbi = MapieTimeSeriesRegressor( - model, method="plus", cv=cv_MapieTimeSeries, agg_function="mean", n_jobs=-1 + model, + method="enbpi", + cv=cv_mapietimeseries, + agg_function="mean", + n_jobs=-1, ) -gap = 1 - print("EnbPI, with no partial_fit, width optimization") mapie_enpbi = mapie_enpbi.fit(X_train, y_train) y_pred_npfit_enbpi, y_pis_npfit_enbpi = mapie_enpbi.predict( - X_test, alpha=alpha, ensemble=True, 
beta_optimize=True + X_test, alpha=alpha, ensemble=True, optimize_beta=True ) coverage_npfit_enbpi = regression_coverage_score( y_test, y_pis_npfit_enbpi[:, 0, 0], y_pis_npfit_enbpi[:, 1, 0] ) -width_npfit_enbpi = ( - y_pis_npfit_enbpi[:, 1, 0] - y_pis_npfit_enbpi[:, 0, 0] -).mean() + +width_npfit_enbpi = regression_mean_width_score( + y_pis_npfit_enbpi[:, 1, 0], y_pis_npfit_enbpi[:, 0, 0] +) print("EnbPI with partial_fit, width optimization") mapie_enpbi = mapie_enpbi.fit(X_train, y_train) y_pred_pfit_enbpi = np.zeros(y_pred_npfit_enbpi.shape) y_pis_pfit_enbpi = np.zeros(y_pis_npfit_enbpi.shape) - -y_pred_pfit_enbpi[:gap], y_pis_pfit_enbpi[:gap, :, :] = mapie_enpbi.predict( - X_test.iloc[:gap, :], alpha=alpha, ensemble=True, beta_optimize=True +step_size = 1 +( + y_pred_pfit_enbpi[:step_size], + y_pis_pfit_enbpi[:step_size, :, :], +) = mapie_enpbi.predict( + X_test.iloc[:step_size, :], alpha=alpha, ensemble=True, optimize_beta=True ) -for step in range(gap, len(X_test), gap): +for step in range(step_size, len(X_test), step_size): mapie_enpbi.partial_fit( - X_test.iloc[(step - gap):step, :], - y_test.iloc[(step - gap):step], + X_test.iloc[(step - step_size) : step, :], + y_test.iloc[(step - step_size) : step], ) ( - y_pred_pfit_enbpi[step:step + gap], - y_pis_pfit_enbpi[step:step + gap, :, :], + y_pred_pfit_enbpi[step : step + step_size], + y_pis_pfit_enbpi[step : step + step_size, :, :], ) = mapie_enpbi.predict( - X_test.iloc[step:(step + gap), :], + X_test.iloc[step : (step + step_size), :], alpha=alpha, ensemble=True, - beta_optimize=True, + optimize_beta=True, ) coverage_pfit_enbpi = regression_coverage_score( y_test, y_pis_pfit_enbpi[:, 0, 0], y_pis_pfit_enbpi[:, 1, 0] ) -width_pfit_enbpi = ( - y_pis_pfit_enbpi[:, 1, 0] - y_pis_pfit_enbpi[:, 0, 0] -).mean() - -print("Plus, with partial_fit, width optimization") -mapie_plus = mapie_plus.fit(X_train, y_train) -y_pred_pfit_plus = np.zeros(y_pred_npfit_enbpi.shape) -y_pis_pfit_plus = 
np.zeros(y_pis_npfit_enbpi.shape) -(y_pred_pfit_plus[:gap], y_pis_pfit_plus[:gap, :, :],) = mapie_plus.predict( - X_test.iloc[:gap, :], - alpha=alpha, - beta_optimize=True, +width_pfit_enbpi = regression_mean_width_score( + y_pis_pfit_enbpi[:, 1, 0], y_pis_pfit_enbpi[:, 0, 0] ) -for step in range(gap, len(X_test), gap): - mapie_plus.partial_fit( - X_test.iloc[step - gap:step, :], - y_test.iloc[step - gap:step], - ) - ( - y_pred_pfit_plus[step:step + gap], - y_pis_pfit_plus[step:step + gap, :, :], - ) = mapie_plus.predict( - X_test.iloc[step:step + gap, :], - alpha=alpha, - ensemble=True, - beta_optimize=True, - ) - -coverage_pfit_plus = regression_coverage_score( - y_test, y_pis_pfit_plus[:, 0, 0], y_pis_pfit_plus[:, 1, 0] -) -width_pfit_plus = (y_pis_pfit_plus[:, 1, 0] - y_pis_pfit_plus[:, 0, 0]).mean() - -print("Plus, with NO partial_fit, MapieRegressor_Like no") -mapie_plus = mapie_plus.fit(X_train, y_train) -y_pred_pfit_MR, y_pis_pfit_MR = mapie_plus.predict( - X_test, alpha=alpha, ensemble=True -) -coverage_pfit_MR = regression_coverage_score( - y_test, y_pis_pfit_MR[:, 0, 0], y_pis_pfit_MR[:, 1, 0] -) -width_pfit_MR = (y_pis_pfit_MR[:, 1, 0] - y_pis_pfit_MR[:, 0, 0]).mean() # Print results print( @@ -200,27 +165,16 @@ "\nEnbPI with partial_fit:" f"{coverage_pfit_enbpi:.3f}, {width_pfit_enbpi:.3f}" ) -print( - "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nPlus, with partial_fit:" - f"{coverage_pfit_plus:.3f}, {width_pfit_plus:.3f}" -) -print( - "Coverage / prediction interval width mean for MapieTimeSeriesRegressor: " - "\nMR_Like, with partial_fit:" - f"{coverage_pfit_MR:.3f}, {width_pfit_MR:.3f}" -) # Plot estimated prediction intervals on test set -fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots( - nrows=2, ncols=2, figsize=(30, 25), sharey="row", sharex="col" +fig, (ax1, ax2) = plt.subplots( + nrows=2, ncols=1, figsize=(30, 25), sharey="row", sharex="col" ) -for ax in [ax1, ax2, ax3, ax4]: +for ax in [ax1, ax2]: 
ax.set_ylabel("Hourly demand (GW)") ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") - ax1.plot( demand_test.index, y_pred_npfit_enbpi, lw=2, c="C2", label="Predictions" ) @@ -253,39 +207,5 @@ f"Coverage:{coverage_pfit_enbpi:.3f} Width:{width_pfit_enbpi:.3f}" ) - -ax3.plot( - demand_test.index, - y_pred_pfit_plus, - lw=2, - c="C2", - label="Predictions", -) -ax3.fill_between( - demand_test.index, - y_pis_pfit_plus[:, 0, 0], - y_pis_pfit_plus[:, 1, 0], - color="C2", - alpha=0.2, - label="MapieTimeSeriesRegressor PIs", -) -ax3.set_title( - "Plus, with partial_fit.\n" - f"Coverage:{coverage_pfit_plus:.3f}" - f"Width:{width_pfit_plus:.3f}" -) -ax4.plot(demand_test.index, y_pred_pfit_MR, lw=2, c="C2", label="Predictions") -ax4.fill_between( - demand_test.index, - y_pis_pfit_MR[:, 0, 0], - y_pis_pfit_MR[:, 1, 0], - color="C2", - alpha=0.2, - label="MapieTimeSeriesRegressor PIs", -) -ax4.set_title( - "MapieRegressor Like, with partial_fit\n" - f"Coverage:{coverage_pfit_MR:.3f} Width:{width_pfit_MR:.3f}" -) ax1.legend() plt.show() diff --git a/mapie/quantile_timeit.ipynb b/mapie/quantile_timeit.ipynb deleted file mode 100644 index d9770966d..000000000 --- a/mapie/quantile_timeit.ipynb +++ /dev/null @@ -1,230 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "d4595769-7300-44d1-8850-c269ebf72fde", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append(\"/Users/tmorzadec/Missions/MAPIE\")\n", - "import numpy as np\n", - "from mapie.utils import masked_quantile\n", - "from mapie.regression import MapieRegressor\n", - "from sklearn.datasets import make_regression\n", - "import numpy.ma as ma\n", - "from pycallgraph2 import PyCallGraph, Config\n", - "from pycallgraph2.output import GraphvizOutput\n", - "from functools import lru_cache\n", - "import callgraph.decorator as callgraph\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": 
"4e824af9-8e84-4b89-b6de-e983a168c02f", - "metadata": {}, - "outputs": [], - "source": [ - "X = np.random.uniform(low=-100, high=100, size = int(1e8)).reshape(int(1e4), -1)\n", - "\n", - "# indices1 = np.random.choice(X.shape[0], size=1, replace=True)\n", - "# indices2 = np.random.choice(X.shape[1], size=1, replace=True)\n", - "# indices = zip(indices1, indices2)\n", - "\n", - "# for (x, y) in indices:\n", - "# X[x,y] = np.nan\n", - "# " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ea9ba9ba-3dcf-43f0-9425-91437fd6f000", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1000, 10000)\n" - ] - } - ], - "source": [ - "q = list(np.linspace(0.1, 0.9, 1000))\n", - "print(X.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f90f220e-cc20-40cc-a33c-566470b61856", - "metadata": {}, - "outputs": [], - "source": [ - "@callgraph()\n", - "@lru_cache()\n", - "def mask():\n", - " masked_quantile(ma.masked_invalid(X), q, axis=0, method=\"higher\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29592f5d", - "metadata": {}, - "outputs": [], - "source": [ - "%timeit -n 1 -r 1 masked_quantile(ma.masked_invalid(X), q, axis=0, method=\"higher\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "843ab182-fd23-40f9-a569-647b1a2ad165", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.48 s ± 401 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit np.nanquantile(X, q, axis=0, interpolation=\"higher\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "260ba3a6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.35 s ± 502 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit np.quantile(X, q, axis=0, interpolation=\"higher\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5349359a", - "metadata": {}, - "outputs": [], - "source": [ - "config = Config(max_depth=10)\n", - "with PyCallGraph(output=GraphvizOutput(), config=config):\n", - " masked_quantile(ma.masked_invalid(X), q, axis=0, method=\"higher\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2acbef74-3f03-457a-b31e-a5dad75d70c4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2\n", - "500\n", - "(1, 500)\n", - "2\n", - "500\n", - "(1, 500)\n", - "2\n", - "500\n", - "(1, 500)\n", - "2\n", - "500\n", - "(1, 500)\n", - "[-118.90258408 209.02916237 -22.18296168 234.40885886 -159.61745268\n", - " -184.84111207 26.85546511 -19.12965127 -162.97950999 -53.36413493]\n", - "(1, 2, 1000)\n" - ] - } - ], - "source": [ - "mapie_reg = MapieRegressor(method=\"minmax\", agg_function=\"mean\", cv=-1)\n", - "alpha = [0.2, 0.8]\n", - "mapie_reg.fit(X, y)\n", - "#y_pred_float1, y_pis_float1 = mapie_reg.predict(X, alpha=alpha[0])\n", - "#y_pred_float2, y_pis_float2 = mapie_reg.predict(X, alpha=alpha[1])\n", - "y_pred_array, y_pis_array = mapie_reg.predict(X, alpha=alpha)\n", - "#print(y_pis_float1[0,1,:10])\n", - "#print(y_pis_float2[0,1,:10])\n", - "print(y_pis_array[0,1,:10])\n", - "print(y_pis_array.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f82a3d9-cacc-42c6-9350-0709811a0af9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([-0.44901419, -0.31701145, 0.03759213, -1.03072401, 0.32107287,\n", - " 0.73329445, -0.16373546, -0.58167561, 0.24257418, -0.40065236])" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "X[0,:].flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"d03bf675-6fd1-4407-ae2d-0937c2e46df9", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "interpreter": { - "hash": "e1d6b69c58a8ab3fab9d4bd10bf376ef86c3438c956e9d4e062e4cc32a9f8bce" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/mapie/regression.py b/mapie/regression.py index 7670c5ebe..6150b389c 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -439,6 +439,32 @@ def aggregate_with_mask(self, x: NDArray, k: NDArray) -> NDArray: return np.matmul(x, (K / (K.sum(axis=1, keepdims=True))).T) raise ValueError("The value of self.agg_function is not correct") + def _pred_multi(self, X: ArrayLike) -> NDArray: + """ + Return a prediction per train sample for each test sample, by + aggregation with matrix ``k_``. + + Parameters + ---------- + X: NDArray of shape (n_samples_test, n_features) + Input data + + Returns + ------- + NDArray of shape (n_samples_test, n_samples_train) + """ + y_pred_multi = np.column_stack( + [e.predict(X) for e in self.estimators_] + ) + # At this point, y_pred_multi is of shape + # (n_samples_test, n_estimators_). The method + # ``aggregate_with_mask`` fits it to the right size + # thanks to the shape of k_. + + y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + return y_pred_multi + + def fit( self, X: ArrayLike, @@ -617,22 +643,7 @@ def predict( y_pred_low = y_pred[:, np.newaxis] - quantile y_pred_up = y_pred[:, np.newaxis] + quantile else: - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] - ) - - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). 
- # If ``method`` is "plus": - # - if ``cv`` is not a ``Subsample``, - # we enforce y_pred_multi to be of shape - # (n_samples_test, n_samples_train), - # thanks to the folds identifier. - # - if ``cv``is a ``Subsample``, the methode - # ``aggregate_with_mask`` fits it to the right size - # thanks to the shape of k_. - - y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + y_pred_multi = self._pred_multi(X) if self.method in self.plus_like_method: lower_bounds = y_pred_multi - self.conformity_scores_ diff --git a/mapie/subsample.py b/mapie/subsample.py index 43f86035f..899dbeacc 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -27,8 +27,8 @@ class Subsample(BaseCrossValidator): the size of the training set. replace: bool Whether to replace samples in resamplings or not. By default ``True``. - random_state: Optional - int or RandomState instance. . By default ``None`` + random_state: Optional[Union[int, RandomState]] + int or RandomState instance. By default ``None`` Examples @@ -106,8 +106,8 @@ def get_n_splits(self, *args: Any, **kargs: Any) -> int: class BlockBootstrap(BaseCrossValidator): # type: ignore """ Generate a sampling method, that block bootstraps the training set. - It can replace KFold, LeaveOneOut or SubSample as cv argument in the MAPIE - class. + It can replace KFold, LeaveOneOut or SubSample as cv argument in the + MapieRegressor class. Parameters ---------- @@ -117,8 +117,8 @@ class BlockBootstrap(BaseCrossValidator): # type: ignore Length of the blocks. By default ``None``, the length of the training set divided by ``n_blocks``. overlapping: bool - Whether the blocks can overlapp or not. By default ``False``. - n_blocsk: int + Whether the blocks can overlap or not. By default ``False``. + n_blocks: int Number of blocks in each resampling. By default ``None``, the size of the training set divided by ``length``. 
random_state: Optional @@ -133,8 +133,8 @@ class BlockBootstrap(BaseCrossValidator): # type: ignore -------- >>> import numpy as np >>> from mapie.subsample import BlockBootstrap - >>> cv = BlockBootstrap(n_resamplings=2, length = 3, random_state=0) - >>> X = np.array([1,2,3,4,5,6,7,8,9,10]) + >>> cv = BlockBootstrap(n_resamplings=2, length=3, random_state=0) + >>> X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) >>> for train_index, test_index in cv.split(X): ... print(f"train index is {train_index}, test index is {test_index}") train index is [1 2 3 4 5 6 1 2 3 4 5 6], test index is [8 9 7] @@ -177,33 +177,35 @@ def split( ValueError If ``length`` is not positive or greater than the train set size. """ + if (self.n_blocks is not None) + (self.length is not None) != 1: + raise ValueError( + "Exactly one argument between ``length`` or " + "``n_blocks`` has to be not None" + ) + + n = len(X) + if self.n_blocks is not None: length = ( - self.length - if self.length is not None - else len(X) // self.n_blocks + self.length if self.length is not None else n // self.n_blocks ) n_blocks = self.n_blocks elif self.length is not None: length = self.length - n_blocks = (len(X) // self.length) + 1 - else: - raise ValueError( - "At least one argument between ``length`` or " - "``n_blocks`` has to be not None" - ) - indices = np.arange(len(X)) - if (length <= 0) or (length > len(indices)): + n_blocks = (n // self.length) + 1 + + indices = np.arange(n) + if (length <= 0) or (length > n): raise ValueError( - "The length of blocks is <= 0 or greater than the lenght" + "The length of blocks is <= 0 or greater than the length" "of training set." 
) if self.overlapping: blocks = sliding_window_view(indices, window_shape=length) else: - indices = indices[(len(indices) % length):] - blocks_number = len(indices) // length + indices = indices[(n % length) :] + blocks_number = n // length blocks = np.asarray( np.array_split(indices, indices_or_sections=blocks_number) ) diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 1f5648632..8caf3d0fd 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -223,10 +223,10 @@ def test_linear_regression_results(strategy: str) -> None: mapie_ts = MapieTimeSeriesRegressor(**STRATEGIES[strategy]) mapie_ts.fit(X, y) if "opt" in strategy: - beta_optimize = True + optimize_beta = True else: - beta_optimize = False - _, y_pis = mapie_ts.predict(X, alpha=0.05, beta_optimize=beta_optimize) + optimize_beta = False + _, y_pis = mapie_ts.predict(X, alpha=0.05, optimize_beta=optimize_beta) y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0] width_mean = (y_pred_up - y_pred_low).mean() diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 4ffc596fc..61f989120 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -73,69 +73,6 @@ def _relative_conformity_scores( y_pred, _ = super().predict(X, alpha=0.5, ensemble=True) return np.asarray(y) - np.asarray(y_pred) - def fit( - self, - X: ArrayLike, - y: ArrayLike, - sample_weight: Optional[ArrayLike] = None, - ) -> MapieTimeSeriesRegressor: - """ - Compare to the method ``fit`` of ``MapieRegressor``, the ``fit`` method - of ``MapieTimeSeriesRegressor`` computes the ``conformity_scores_`` - with relative values. - - Returns - ------- - MapieTimeSeriesRegressor - The model itself. 
- """ - self = super().fit(X=X, y=y, sample_weight=sample_weight) - self.conformity_scores_ = self._relative_conformity_scores(X, y) - return self - - def partial_fit( - self, - X: ArrayLike, - y: ArrayLike, - ) -> MapieTimeSeriesRegressor: - """ - Update the ``conformity_scores_`` and ``k_`` attributes when new data - with known labels are available. - Note: Don't use ``partial_fit`` with samples of the training set. - - Parameters - ---------- - X : ArrayLike of shape (n_samples_test, n_features) - Input data. - - y : ArrayLike of shape (n_samples_test,) - Input labels. - - Returns - ------- - MapieTimeSeriesRegressor - The model itself. - - Raises - ------ - ValueError - If the lenght of y is greater than the lenght of the training set. - """ - X = cast(NDArray, X) - y = cast(NDArray, y) - if len(X) > len(self.conformity_scores_): - raise ValueError( - "You try to update more residuals than there are!" - ) - new_conformity_scores_ = self._relative_conformity_scores(X, y) - self.conformity_scores_ = np.roll( - self.conformity_scores_, -len(new_conformity_scores_) - ) - self.conformity_scores_[-len(new_conformity_scores_):] = ( - new_conformity_scores_ - ) - return self - def _beta_optimize( self, alpha: Union[float, NDArray], @@ -143,8 +80,7 @@ def _beta_optimize( lower_bounds: NDArray, ) -> NDArray: """ - ``_beta_optimize`` offers to minimize the width of the PIs, for a given - difference of quantiles. + Minimize the width of the PIs, for a given difference of quantiles. 
Parameters ---------- @@ -172,7 +108,7 @@ def _beta_optimize( ) alpha = cast(NDArray, alpha) betas_0 = np.full( - shape=(len(alpha), len(lower_bounds)), + shape=(len(lower_bounds), len(alpha)), fill_value=np.nan, dtype=float, ) @@ -189,57 +125,96 @@ def _beta_optimize( 1 - _alpha + betas, axis=1, method="higher", - ) # type: ignore + ) beta = np_nanquantile( lower_bounds, betas, axis=1, method="lower", - ) # type: ignore - betas_0[ind_alpha, :] = betas[ + ) + betas_0[:, ind_alpha] = betas[ np.argmin(one_alpha_beta - beta, axis=0) ] return betas_0 - def _pred_multi(self, X: ArrayLike) -> NDArray: + def fit( + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + ) -> MapieTimeSeriesRegressor: + """ + Compared to the method ``fit`` of ``MapieRegressor``, the ``fit`` + method of ``MapieTimeSeriesRegressor`` computes the + ``conformity_scores_`` with relative values. + + Returns + ------- + MapieTimeSeriesRegressor + The model itself. """ - Return a prediction per train sample for each test sample, by - aggregation with matrix ``k_``. + self = super().fit(X=X, y=y, sample_weight=sample_weight) + self.conformity_scores_ = self._relative_conformity_scores(X, y) + return self + + def partial_fit( + self, + X: ArrayLike, + y: ArrayLike, + ) -> MapieTimeSeriesRegressor: + """ + Update the ``conformity_scores_`` attribute when new data with known + labels are available. + Note: Don't use ``partial_fit`` with samples of the training set. Parameters ---------- - X: NDArray of shape (n_samples_test, n_features) - Input data + X : ArrayLike of shape (n_samples_test, n_features) + Input data. + + y : ArrayLike of shape (n_samples_test,) + Input labels. Returns ------- - NDArray of shape (n_samples_test, n_samples_train) + MapieTimeSeriesRegressor + The model itself. + + Raises + ------ + ValueError + If the length of y is greater than the length of the training set. 
""" - y_pred_multi = np.column_stack( - [e.predict(X) for e in self.estimators_] + X = cast(NDArray, X) + y = cast(NDArray, y) + n = len(self.conformity_scores_) + if len(X) > n: + raise ValueError( + "You try to update more residuals than there are!" + ) + new_conformity_scores_ = self._relative_conformity_scores(X, y) + self.conformity_scores_ = np.roll( + self.conformity_scores_, -len(new_conformity_scores_) ) - # At this point, y_pred_multi is of shape - # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size - # thanks to the shape of k_. - - y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) - return y_pred_multi + self.conformity_scores_[ + -len(new_conformity_scores_) : + ] = new_conformity_scores_ + return self def predict( self, X: ArrayLike, ensemble: bool = False, alpha: Optional[Union[float, Iterable[float]]] = None, - beta_optimize: bool = True, + optimize_beta: bool = True, ) -> Union[NDArray, Tuple[NDArray, NDArray]]: """ Correspond to 'Conformal prediction for dynamic time-series'. Parameters ---------- - beta_optimize: bool + optimize_beta: bool Whether to optimize the PIs' width or not. 
""" @@ -248,42 +223,36 @@ def predict( check_is_fitted(self, self.fit_attributes) self._check_ensemble(ensemble) alpha = cast(Optional[NDArray], check_alpha(alpha)) - X = check_array(X, force_all_finite=False, dtype=["float64", "object"]) y_pred = self.single_estimator_.predict(X) + n = len(self.conformity_scores_) if alpha is None: return np.array(y_pred) else: alpha_np = cast(NDArray, alpha) - check_alpha_and_n_samples(alpha_np, len(self.conformity_scores_)) + check_alpha_and_n_samples(alpha_np, n) - if beta_optimize: + if optimize_beta: betas_0 = self._beta_optimize( - alpha=alpha_np, - lower_bounds=self.conformity_scores_.reshape(1, -1), - upper_bounds=self.conformity_scores_.reshape(1, -1), + alpha_np, + self.conformity_scores_.reshape(1, -1), + self.conformity_scores_.reshape(1, -1), ) else: - betas_0 = np.full( - shape=(len(alpha), len(self.conformity_scores_)), - fill_value=np.nan, - dtype=float, - ) - for ind_alpha, _alpha in enumerate(alpha): - betas_0[ind_alpha, :] = _alpha / 2.0 + betas_0 = np.repeat(alpha[:, np.newaxis] / 2, n, axis=0) lower_quantiles = np_nanquantile( self.conformity_scores_, - betas_0[:, 0], + betas_0[0, :], axis=0, method="lower", - ).T # type: ignore + ).T higher_quantiles = np_nanquantile( self.conformity_scores_, - 1 - alpha_np + betas_0[:, 0], + 1 - alpha_np + betas_0[0, :], axis=0, method="higher", - ).T # type: ignore + ).T if self.cv == "prefit": y_pred_low = y_pred[:, np.newaxis] + lower_quantiles @@ -293,18 +262,8 @@ def predict( pred = aggregate_all(self.agg_function, y_pred_multi) lower_bounds, upper_bounds = pred, pred - y_pred_low = np.column_stack( - [ - lower_bounds + lower_quantiles[k] - for k, _ in enumerate(alpha_np) - ] - ) - y_pred_up = np.column_stack( - [ - upper_bounds + higher_quantiles[k] - for k, _ in enumerate(alpha_np) - ] - ) + y_pred_low = lower_bounds + lower_quantiles.reshape(-1, 1) + y_pred_up = upper_bounds + higher_quantiles.reshape(-1, 1) if ensemble: y_pred = aggregate_all(self.agg_function, 
y_pred_multi) From 98a87f61f3596efb768193695e84ebeca3e9865f Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 1 Jun 2022 14:24:55 +0200 Subject: [PATCH 24/32] all test pass after remarks of VTA integration --- .../2-advanced-analysis/plot_timeseries_enbpi.py | 10 +++++----- mapie/regression.py | 1 - mapie/subsample.py | 2 +- mapie/tests/test_subsample.py | 2 +- mapie/time_series_regression.py | 7 +++---- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index d86ecd1f4..7b5d57c11 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -135,14 +135,14 @@ for step in range(step_size, len(X_test), step_size): mapie_enpbi.partial_fit( - X_test.iloc[(step - step_size) : step, :], - y_test.iloc[(step - step_size) : step], + X_test.iloc[(step - step_size):step, :], + y_test.iloc[(step - step_size):step], ) ( - y_pred_pfit_enbpi[step : step + step_size], - y_pis_pfit_enbpi[step : step + step_size, :, :], + y_pred_pfit_enbpi[step:step + step_size], + y_pis_pfit_enbpi[step:step + step_size, :, :], ) = mapie_enpbi.predict( - X_test.iloc[step : (step + step_size), :], + X_test.iloc[step:(step + step_size), :], alpha=alpha, ensemble=True, optimize_beta=True, diff --git a/mapie/regression.py b/mapie/regression.py index 6150b389c..9d52193de 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -464,7 +464,6 @@ def _pred_multi(self, X: ArrayLike) -> NDArray: y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) return y_pred_multi - def fit( self, X: ArrayLike, diff --git a/mapie/subsample.py b/mapie/subsample.py index 899dbeacc..4f7671669 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -204,7 +204,7 @@ def split( if self.overlapping: blocks = sliding_window_view(indices, window_shape=length) else: - 
indices = indices[(n % length) :] + indices = indices[(n % length):] blocks_number = n // length blocks = np.asarray( np.array_split(indices, indices_or_sections=blocks_number) diff --git a/mapie/tests/test_subsample.py b/mapie/tests/test_subsample.py index 533a50b7f..22320ffff 100644 --- a/mapie/tests/test_subsample.py +++ b/mapie/tests/test_subsample.py @@ -79,7 +79,7 @@ def test_split_BlockBootstrap_error() -> None: cv = BlockBootstrap() print(cv.length) print(cv.n_blocks) - with pytest.raises(ValueError, match=r".*At least one argument*"): + with pytest.raises(ValueError, match=r".*Exactly one argument*"): next(cv.split(X)) cv = BlockBootstrap(length=20) with pytest.raises(ValueError, match=r".*The length of blocks is <= 0 *"): diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index f34cf0aa9..2e9f6492d 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -5,7 +5,6 @@ import numpy as np from sklearn.base import RegressorMixin from sklearn.model_selection import BaseCrossValidator -from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted from ._compatibility import np_nanquantile @@ -198,7 +197,7 @@ def partial_fit( self.conformity_scores_, -len(new_conformity_scores_) ) self.conformity_scores_[ - -len(new_conformity_scores_) : + -len(new_conformity_scores_): ] = new_conformity_scores_ return self @@ -264,8 +263,8 @@ def predict( pred = aggregate_all(self.agg_function, y_pred_multi) lower_bounds, upper_bounds = pred, pred - y_pred_low = lower_bounds + lower_quantiles.reshape(-1, 1) - y_pred_up = upper_bounds + higher_quantiles.reshape(-1, 1) + y_pred_low = lower_bounds.reshape(-1, 1) + lower_quantiles + y_pred_up = upper_bounds.reshape(-1, 1) + higher_quantiles if ensemble: y_pred = aggregate_all(self.agg_function, y_pred_multi) From 8590a9cb5c998e1927615244183343f42752db82 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 1 Jun 2022 14:40:14 +0200 Subject: 
[PATCH 25/32] the maximun test coverage is reached after remarks integration --- mapie/subsample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapie/subsample.py b/mapie/subsample.py index 4f7671669..fd0f5a347 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -190,7 +190,7 @@ def split( self.length if self.length is not None else n // self.n_blocks ) n_blocks = self.n_blocks - elif self.length is not None: + else: length = self.length n_blocks = (n // self.length) + 1 From 9cff9bfba8c78a9625eb31abf4fb03bbdea50e86 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 1 Jun 2022 15:06:59 +0200 Subject: [PATCH 26/32] a bug to be fixed --- mapie/subsample.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mapie/subsample.py b/mapie/subsample.py index fd0f5a347..91843dae0 100644 --- a/mapie/subsample.py +++ b/mapie/subsample.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Generator, Optional, Tuple, Union +from typing import Any, Generator, Optional, Tuple, Union, cast import numpy as np from numpy.lib.stride_tricks import sliding_window_view @@ -191,8 +191,8 @@ def split( ) n_blocks = self.n_blocks else: - length = self.length - n_blocks = (n // self.length) + 1 + length = cast(int, self.length) + n_blocks = (n // length) + 1 indices = np.arange(n) if (length <= 0) or (length > n): From a7c5c57bb77e6ecc749c1bc06c0ddf566e804102 Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Tue, 7 Jun 2022 09:57:14 +0200 Subject: [PATCH 27/32] an import error add to be corrected in the documentation --- .../2-advanced-analysis/plot_timeseries_enbpi.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 7b5d57c11..9c6464cae 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ 
b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -39,9 +39,10 @@ from mapie.time_series_regression import MapieTimeSeriesRegressor # Load input data and feature engineering -demand_df = pd.read_csv( - "../data/demand_temperature.csv", parse_dates=True, index_col=0 -) +url_file = ("https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/" + + "master/examples/data/demand_temperature.csv" + ) +demand_df = pd.read_csv(url_file, parse_dates=True, index_col=0) demand_df["Date"] = pd.to_datetime(demand_df.index) demand_df["Weekofyear"] = demand_df.Date.dt.isocalendar().week.astype("int64") From 247e26dfc99183e97b564a01b05d8e1305cbd24f Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Wed, 8 Jun 2022 19:06:02 +0200 Subject: [PATCH 28/32] VTA remarks taken into account --- .../plot_timeseries_enbpi.py | 76 ++++++++--------- mapie/regression.py | 6 +- mapie/tests/test_regression.py | 6 +- mapie/tests/test_time_series_regression.py | 32 ++------ mapie/time_series_regression.py | 81 ++++++++++--------- 5 files changed, 90 insertions(+), 111 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 9c6464cae..0254d5ec2 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -18,15 +18,16 @@ in which the training set is prior to the validation set. The best model is then feeded into :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the -associated prediction intervals. We compare four approaches: with or without -``partial_fit`` called at every step, and following [6]. It appears that +associated prediction intervals. We compare two approaches: with or without +``partial_fit`` called at every step following [6]. It appears that ``partial_fit`` offer higher coverage, but with higher width of PIs and is much slower. 
""" +import matplotlib +from matplotlib import pylab as plt import numpy as np import pandas as pd -from matplotlib import pylab as plt from scipy.stats import randint from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit @@ -39,9 +40,10 @@ from mapie.time_series_regression import MapieTimeSeriesRegressor # Load input data and feature engineering -url_file = ("https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/" + - "master/examples/data/demand_temperature.csv" - ) +url_file = ( + "https://raw.githubusercontent.com/scikit-learn-contrib/MAPIE/" + + "master/examples/data/demand_temperature.csv" +) demand_df = pd.read_csv(url_file, parse_dates=True, index_col=0) demand_df["Date"] = pd.to_datetime(demand_df.index) @@ -168,45 +170,37 @@ ) # Plot estimated prediction intervals on test set -fig, (ax1, ax2) = plt.subplots( +fig, axs = plt.subplots( nrows=2, ncols=1, figsize=(30, 25), sharey="row", sharex="col" ) +font = {"family": "normal", "weight": "bold", "size": 22} +matplotlib.rc("font", **font) + -for ax in [ax1, ax2]: +for i, (ax, w) in enumerate( + zip(axs, ["EnbPI, without partial_fit", "EnbPI with partial_fit"]) +): ax.set_ylabel("Hourly demand (GW)") ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") -ax1.plot( - demand_test.index, y_pred_npfit_enbpi, lw=2, c="C2", label="Predictions" -) -ax1.fill_between( - demand_test.index, - y_pis_npfit_enbpi[:, 0, 0], - y_pis_npfit_enbpi[:, 1, 0], - color="C2", - alpha=0.2, - label="MapieTimeSeriesRegressor PIs", -) -ax1.set_title( - "EnbPI, without partial_fit.\n" - f"Coverage:{coverage_npfit_enbpi:.3f} Width:{width_npfit_enbpi:.3f}" -) - -ax2.plot( - demand_test.index, y_pred_pfit_enbpi, lw=2, c="C2", label="Predictions" -) -ax2.fill_between( - demand_test.index, - y_pis_pfit_enbpi[:, 0, 0], - y_pis_pfit_enbpi[:, 1, 0], - color="C2", - alpha=0.2, - label="MapieTimeSeriesRegressor PIs", -) -ax2.set_title( - "EnbPI with partial_fit.\n" 
- f"Coverage:{coverage_pfit_enbpi:.3f} Width:{width_pfit_enbpi:.3f}" -) - -ax1.legend() + ax.plot( + demand_test.index, + y_pred_npfit_enbpi, + lw=2, + c="C2", + label="Predictions", + ) + ax.fill_between( + demand_test.index, + y_pis_npfit_enbpi[:, 0, 0], + y_pis_npfit_enbpi[:, 1, 0], + color="C2", + alpha=0.2, + label="MapieTimeSeriesRegressor PIs", + ) + ax.set_title( + w + "\n" + f"Coverage:{coverage_npfit_enbpi:.3f} Width:{width_npfit_enbpi:.3f}" + ) +axs[0].legend() plt.show() diff --git a/mapie/regression.py b/mapie/regression.py index 9d52193de..f57e8dadf 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -397,7 +397,7 @@ def _fit_and_predict_oof_model( y_pred = np.array([]) return estimator, y_pred, val_index - def aggregate_with_mask(self, x: NDArray, k: NDArray) -> NDArray: + def _aggregate_with_mask(self, x: NDArray, k: NDArray) -> NDArray: """ Take the array of predictions, made by the refitted estimators, on the testing set, and the 1-nan array indicating for each training @@ -458,10 +458,10 @@ def _pred_multi(self, X: ArrayLike) -> NDArray: ) # At this point, y_pred_multi is of shape # (n_samples_test, n_estimators_). The method - # ``aggregate_with_mask`` fits it to the right size + # ``_aggregate_with_mask`` fits it to the right size # thanks to the shape of k_. - y_pred_multi = self.aggregate_with_mask(y_pred_multi, self.k_) + y_pred_multi = self._aggregate_with_mask(y_pred_multi, self.k_) return y_pred_multi def fit( diff --git a/mapie/tests/test_regression.py b/mapie/tests/test_regression.py index e96198223..251c4c9d5 100644 --- a/mapie/tests/test_regression.py +++ b/mapie/tests/test_regression.py @@ -413,21 +413,21 @@ def test_invalid_aggregate_all() -> None: def test_aggregate_with_mask_with_prefit() -> None: """ - Test ``aggregate_with_mask`` in case ``cv`` is ``"prefit"``. + Test ``_aggregate_with_mask`` in case ``cv`` is ``"prefit"``. 
""" mapie_reg = MapieRegressor(cv="prefit") with pytest.raises( ValueError, match=r".*There should not be aggregation of predictions if cv is*", ): - mapie_reg.aggregate_with_mask(k, k) + mapie_reg._aggregate_with_mask(k, k) mapie_reg = MapieRegressor(agg_function="nonsense") with pytest.raises( ValueError, match=r".*The value of self.agg_function is not correct*", ): - mapie_reg.aggregate_with_mask(k, k) + mapie_reg._aggregate_with_mask(k, k) def test_pred_loof_isnan() -> None: diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 8caf3d0fd..eb0ae1889 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -197,8 +197,8 @@ def test_prediction_agg_function( method: str, cv: Union[LeaveOneOut, KFold], agg_function: str, alpha: int ) -> None: """ - Test that predictions differ when ensemble is True/False, - but not prediction intervals. + Test that PIs are the same but predictions differ when ensemble is + True or False. """ mapie = MapieTimeSeriesRegressor( method=method, cv=cv, agg_function=agg_function @@ -265,7 +265,10 @@ def test_not_enough_resamplings() -> None: def test_no_agg_fx_specified_with_subsample() -> None: - """Test that a warning is raised if at least one residual is nan.""" + """ + Test that an error is raised if ``cv`` is ``BlockBootstrap`` but + ``agg_function`` is ``None``. + """ with pytest.raises( ValueError, match=r"You need to specify an aggregation*" ): @@ -287,25 +290,6 @@ def test_invalid_aggregate_all() -> None: aggregate_all(None, X) -def test_aggregate_with_mask_with_prefit() -> None: - """ - Test ``aggregate_with_mask`` in case ``cv`` is ``"prefit"``. 
- """ - mapie_ts_reg = MapieTimeSeriesRegressor(cv="prefit") - with pytest.raises( - ValueError, - match=r".*There should not be aggregation of predictions if cv is*", - ): - mapie_ts_reg.aggregate_with_mask(k, k) - - mapie_ts_reg = MapieTimeSeriesRegressor(agg_function="nonsense") - with pytest.raises( - ValueError, - match=r".*The value of self.agg_function is not correct*", - ): - mapie_ts_reg.aggregate_with_mask(k, k) - - def test_pred_loof_isnan() -> None: """Test that if validation set is empty then prediction is empty.""" mapie_ts_reg = MapieTimeSeriesRegressor() @@ -341,10 +325,10 @@ def test_MapieTimeSeriesRegressor_partial_fit_ensemble() -> None: ) -def test_MapieTimeSeriesRegressor_partial_fit_two_big() -> None: +def test_MapieTimeSeriesRegressor_partial_fit_too_big() -> None: """Test ``partial_fit`` raised error.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) - with pytest.raises(ValueError, match=r".*You try to update more*"): + with pytest.raises(ValueError, match=r".*The number of observations*"): mapie_ts_reg = mapie_ts_reg.partial_fit(X=X, y=y) diff --git a/mapie/time_series_regression.py b/mapie/time_series_regression.py index 2e9f6492d..3fc2b321e 100644 --- a/mapie/time_series_regression.py +++ b/mapie/time_series_regression.py @@ -190,7 +190,8 @@ def partial_fit( n = len(self.conformity_scores_) if len(X) > n: raise ValueError( - "You try to update more residuals than there are!" + "The number of observations to update is higher than the" + "number of training instances." 
) new_conformity_scores_ = self._relative_conformity_scores(X, y) self.conformity_scores_ = np.roll( @@ -227,46 +228,46 @@ def predict( if alpha is None: return np.array(y_pred) - else: - alpha_np = cast(NDArray, alpha) - check_alpha_and_n_samples(alpha_np, n) - - if optimize_beta: - betas_0 = self._beta_optimize( - alpha_np, - self.conformity_scores_.reshape(1, -1), - self.conformity_scores_.reshape(1, -1), - ) - else: - betas_0 = np.repeat(alpha[:, np.newaxis] / 2, n, axis=0) - - lower_quantiles = np_nanquantile( - self.conformity_scores_, - betas_0[0, :], - axis=0, - method="lower", - ).T - higher_quantiles = np_nanquantile( - self.conformity_scores_, - 1 - alpha_np + betas_0[0, :], - axis=0, - method="higher", - ).T - self.lower_quantiles_ = lower_quantiles - self.higher_quantiles_ = higher_quantiles - if self.cv == "prefit": - y_pred_low = y_pred[:, np.newaxis] + lower_quantiles - y_pred_up = y_pred[:, np.newaxis] + higher_quantiles - else: - y_pred_multi = self._pred_multi(X) - pred = aggregate_all(self.agg_function, y_pred_multi) - lower_bounds, upper_bounds = pred, pred + alpha_np = cast(NDArray, alpha) + check_alpha_and_n_samples(alpha_np, n) + + if optimize_beta: + betas_0 = self._beta_optimize( + alpha_np, + self.conformity_scores_.reshape(1, -1), + self.conformity_scores_.reshape(1, -1), + ) + else: + betas_0 = np.repeat(alpha[:, np.newaxis] / 2, n, axis=0) + + lower_quantiles = np_nanquantile( + self.conformity_scores_, + betas_0[0, :], + axis=0, + method="lower", + ).T + higher_quantiles = np_nanquantile( + self.conformity_scores_, + 1 - alpha_np + betas_0[0, :], + axis=0, + method="higher", + ).T + self.lower_quantiles_ = lower_quantiles + self.higher_quantiles_ = higher_quantiles + + if self.cv == "prefit": + y_pred_low = y_pred[:, np.newaxis] + lower_quantiles + y_pred_up = y_pred[:, np.newaxis] + higher_quantiles + else: + y_pred_multi = self._pred_multi(X) + pred = aggregate_all(self.agg_function, y_pred_multi) + lower_bounds, upper_bounds = 
pred, pred - y_pred_low = lower_bounds.reshape(-1, 1) + lower_quantiles - y_pred_up = upper_bounds.reshape(-1, 1) + higher_quantiles + y_pred_low = lower_bounds.reshape(-1, 1) + lower_quantiles + y_pred_up = upper_bounds.reshape(-1, 1) + higher_quantiles - if ensemble: - y_pred = aggregate_all(self.agg_function, y_pred_multi) + if ensemble: + y_pred = aggregate_all(self.agg_function, y_pred_multi) - return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) + return y_pred, np.stack([y_pred_low, y_pred_up], axis=1) From e1edc5fe5829304038153f9f42bbb9b059b454ca Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Thu, 9 Jun 2022 09:37:06 +0200 Subject: [PATCH 29/32] a bug fixed in enbpi example --- .../plot_timeseries_enbpi.py | 35 +++++++++++++++---- mapie/tests/test_time_series_regression.py | 2 +- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 0254d5ec2..84789c84b 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -24,6 +24,8 @@ slower. 
""" +from typing import cast + import matplotlib from matplotlib import pylab as plt import numpy as np @@ -36,6 +38,7 @@ regression_coverage_score, regression_mean_width_score, ) +from mapie._typing import NDArray from mapie.subsample import BlockBootstrap from mapie.time_series_regression import MapieTimeSeriesRegressor @@ -169,6 +172,22 @@ f"{coverage_pfit_enbpi:.3f}, {width_pfit_enbpi:.3f}" ) +enbpi_no_pfit = { + "y_pred": y_pred_npfit_enbpi, + "y_pis": y_pis_npfit_enbpi, + "coverage": coverage_npfit_enbpi, + "width": width_npfit_enbpi, +} + +enbpi_pfit = { + "y_pred": y_pred_pfit_enbpi, + "y_pis": y_pis_pfit_enbpi, + "coverage": coverage_pfit_enbpi, + "width": width_pfit_enbpi, +} + +results = [enbpi_no_pfit, enbpi_pfit] + # Plot estimated prediction intervals on test set fig, axs = plt.subplots( nrows=2, ncols=1, figsize=(30, 25), sharey="row", sharex="col" @@ -177,30 +196,34 @@ matplotlib.rc("font", **font) -for i, (ax, w) in enumerate( - zip(axs, ["EnbPI, without partial_fit", "EnbPI with partial_fit"]) +for i, (ax, w, result) in enumerate( + zip(axs, ["EnbPI, without partial_fit", "EnbPI with partial_fit"], results) ): ax.set_ylabel("Hourly demand (GW)") ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") ax.plot( demand_test.index, - y_pred_npfit_enbpi, + result["y_pred"], lw=2, c="C2", label="Predictions", ) + + y_pis = cast(NDArray, result["y_pis"]) + ax.fill_between( demand_test.index, - y_pis_npfit_enbpi[:, 0, 0], - y_pis_npfit_enbpi[:, 1, 0], + y_pis[:, 0, 0], + y_pis[:, 1, 0], color="C2", alpha=0.2, label="MapieTimeSeriesRegressor PIs", ) + ax.set_title( w + "\n" - f"Coverage:{coverage_npfit_enbpi:.3f} Width:{width_npfit_enbpi:.3f}" + f"Coverage:{result['coverage']:.3f} Width:{result['width']:.3f}" ) axs[0].legend() plt.show() diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index eb0ae1889..02ea8da0c 100644 --- a/mapie/tests/test_time_series_regression.py +++ 
b/mapie/tests/test_time_series_regression.py @@ -303,7 +303,7 @@ def test_pred_loof_isnan() -> None: assert len(y_pred) == 0 -def test_MapieTimeSeriesRegressor_alpha_is_None() -> None: +def test_MapieTimeSeriesRegressor_if_alpha_is_None() -> None: """Test ``predict`` when ``alpha`` is None.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) From 2af9f36c866ecb636955ec9d484c2739d204e02c Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 13 Jun 2022 14:26:11 +0200 Subject: [PATCH 30/32] Last remarks of VTA taken into account --- .../2-advanced-analysis/plot_timeseries_enbpi.py | 6 +++--- mapie/regression.py | 2 +- mapie/tests/test_time_series_regression.py | 8 +++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 84789c84b..6b1dac7ef 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -20,8 +20,8 @@ :class:`mapie.time_series_regression.MapieTimeSeriesRegressor` to estimate the associated prediction intervals. We compare two approaches: with or without ``partial_fit`` called at every step following [6]. It appears that -``partial_fit`` offer higher coverage, but with higher width of PIs and is much -slower. +``partial_fit`` offer a coverage closer to the targeted coverage, and with +narrower PIs. 
""" from typing import cast @@ -190,7 +190,7 @@ # Plot estimated prediction intervals on test set fig, axs = plt.subplots( - nrows=2, ncols=1, figsize=(30, 25), sharey="row", sharex="col" + nrows=2, ncols=1, figsize=(30, 25), sharex="col" ) font = {"family": "normal", "weight": "bold", "size": 22} matplotlib.rc("font", **font) diff --git a/mapie/regression.py b/mapie/regression.py index f57e8dadf..e4fc66efb 100644 --- a/mapie/regression.py +++ b/mapie/regression.py @@ -400,7 +400,7 @@ def _fit_and_predict_oof_model( def _aggregate_with_mask(self, x: NDArray, k: NDArray) -> NDArray: """ Take the array of predictions, made by the refitted estimators, - on the testing set, and the 1-nan array indicating for each training + on the testing set, and the 1-or-nan array indicating for each training sample which one to integrate, and aggregate to produce phi-{t}(x_t) for each training sample x_t. diff --git a/mapie/tests/test_time_series_regression.py b/mapie/tests/test_time_series_regression.py index 02ea8da0c..abc5f9ad3 100644 --- a/mapie/tests/test_time_series_regression.py +++ b/mapie/tests/test_time_series_regression.py @@ -20,6 +20,8 @@ X, y = make_regression(n_samples=500, n_features=10, noise=1.0, random_state=1) k = np.ones(shape=(5, X.shape[1])) METHODS = ["enbpi"] +UPDATE_DATA = ([6], 17.5) +CONFORMITY_SCORES = [14.189 - 14.038, 17.5 - 18.665] Params = TypedDict( "Params", @@ -315,13 +317,13 @@ def test_MapieTimeSeriesRegressor_partial_fit_ensemble() -> None: """Test ``partial_fit``.""" mapie_ts_reg = MapieTimeSeriesRegressor(cv=-1).fit(X_toy, y_toy) assert round(mapie_ts_reg.conformity_scores_[-1], 2) == round( - np.abs(14.189 - 14.038), 2 + np.abs(CONFORMITY_SCORES[0]), 2 ) mapie_ts_reg = mapie_ts_reg.partial_fit( - X=np.array([[6]]), y=np.array([17.5]) + X=np.array([UPDATE_DATA[0]]), y=np.array([UPDATE_DATA[1]]) ) assert round(mapie_ts_reg.conformity_scores_[-1], 2) == round( - 17.5 - 18.665, 2 + CONFORMITY_SCORES[1], 2 ) From 
c3234fa99a46774cd514d6628a9fce19afbd3e7f Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 13 Jun 2022 14:56:55 +0200 Subject: [PATCH 31/32] merge with master before push for release --- .../regression/2-advanced-analysis/plot_timeseries_enbpi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index 6b1dac7ef..c980e83aa 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -192,7 +192,7 @@ fig, axs = plt.subplots( nrows=2, ncols=1, figsize=(30, 25), sharex="col" ) -font = {"family": "normal", "weight": "bold", "size": 22} +font = {"weight": "bold", "size": 22} matplotlib.rc("font", **font) From 3f779ecfd89e4104eddefc79f82d9f4945e6318e Mon Sep 17 00:00:00 2001 From: Thomas Morzadec Date: Mon, 13 Jun 2022 18:54:04 +0200 Subject: [PATCH 32/32] plot_enbpi so nice --- .../plot_timeseries_enbpi.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py index c980e83aa..80212f847 100644 --- a/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py +++ b/examples/regression/2-advanced-analysis/plot_timeseries_enbpi.py @@ -26,7 +26,6 @@ from typing import cast -import matplotlib from matplotlib import pylab as plt import numpy as np import pandas as pd @@ -190,16 +189,13 @@ # Plot estimated prediction intervals on test set fig, axs = plt.subplots( - nrows=2, ncols=1, figsize=(30, 25), sharex="col" + nrows=2, ncols=1, figsize=(15, 12), sharex="col" ) -font = {"weight": "bold", "size": 22} -matplotlib.rc("font", **font) - for i, (ax, w, result) in enumerate( zip(axs, ["EnbPI, without partial_fit", "EnbPI with partial_fit"], results) ): - ax.set_ylabel("Hourly 
demand (GW)") + ax.set_ylabel("Hourly demand (GW)", fontsize=20) ax.plot(demand_test.Demand, lw=2, label="Test data", c="C1") ax.plot( @@ -223,7 +219,12 @@ ax.set_title( w + "\n" - f"Coverage:{result['coverage']:.3f} Width:{result['width']:.3f}" + f"Coverage:{result['coverage']:.3f} Width:{result['width']:.3f}", + fontweight="bold", + size=20 ) -axs[0].legend() + plt.xticks(size=15, rotation=45) + plt.yticks(size=15) + +axs[0].legend(prop={'size': 22}) plt.show()