diff --git a/HISTORY.rst b/HISTORY.rst
index 5b9f4ef34..ad4a310a0 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -8,8 +8,8 @@ History
 * Add "naive" and "top-k" methods in MapieClassifier
 * Include J+aB method in regression tutorial
 * Add MNIST example for classification
+* Add replication of Chen Xu's tutorial comparing Jackknife+aB with Jackknife+
 * Add Jackknife+-after-Bootstrap documentation
-* Refactorize regression code
 
 0.3.1 (2021-11-19)
 ------------------
diff --git a/examples/regression/plot_kim2020_simulations.py b/examples/regression/plot_kim2020_simulations.py
new file mode 100644
index 000000000..9cef7cad7
--- /dev/null
+++ b/examples/regression/plot_kim2020_simulations.py
@@ -0,0 +1,499 @@
+"""
+==================================================
+Reproducing the simulations from Kim et al. (2020)
+==================================================
+
+:class:`mapie.regression.MapieRegressor` is used to reproduce the simulations
+from Kim et al. (2020) [1], the article that introduces the
+jackknife+-after-bootstrap method.
+
+For a given model, the simulation fits MAPIE regressors with the jackknife+
+and jackknife+-after-bootstrap methods, on different resamplings of a data
+set loaded from 'https://archive.ics.uci.edu/', and compares the coverage
+levels and the mean widths of the prediction intervals (PIs).
+
+In order to reproduce results from the tutorial notebook of [1], we
+implemented their regression model ``Ridge2``, a variant of
+``sklearn.linear_model.Ridge`` with an adaptive regularization parameter
+(other models can also be tested).
+
+We compare jackknife+ and jackknife+-after-bootstrap, with fixed and random
+numbers of bootstrap resamplings, for a given training set of size ``n`` and
+different resampling sets of size ``m``, following the discussion in [1].
+
+This simulation is carried out to assert that the jackknife+ and
+jackknife+-after-bootstrap methods implemented in MAPIE give the same
+results as [1], and that the target coverage level is reached.
+
+[1] Byol Kim, Chen Xu, and Rina Foygel Barber.
+"Predictive Inference Is Free with the Jackknife+-after-Bootstrap."
+34th Conference on Neural Information Processing Systems (NeurIPS 2020).
+"""
+from __future__ import annotations
+
+from io import BytesIO
+from typing import Any, Optional, Tuple
+from urllib.request import urlopen
+from zipfile import ZipFile
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import train_test_split
+
+from mapie._typing import ArrayLike
+from mapie.metrics import (
+    regression_mean_width_score,
+    regression_coverage_score,
+)
+from mapie.regression import MapieRegressor
+from mapie.subsample import Subsample
+
+
+def get_X_y() -> Tuple[ArrayLike, ArrayLike]:
+    """
+    Downloads the ``blog`` dataset from a zip file on the UCI Machine
+    Learning website, and returns X and y, which are respectively the
+    explanatory variables and the labels.
+
+    Returns
+    -------
+    Tuple[ArrayLike, ArrayLike] of shapes
+    (n_samples, n_features) and (n_samples,)
+        Explanatory variables and labels.
+    """
+    website = "https://archive.ics.uci.edu/"
+    page = "ml/machine-learning-databases/"
+    folder = "00304/"
+    zip_folder = "BlogFeedback.zip"
+    csv_file = "blogData_train.csv"
+    url = website + page + folder + zip_folder
+    resp = urlopen(url)
+    zipfile = ZipFile(BytesIO(resp.read()))
+    df = pd.read_csv(zipfile.open(csv_file)).to_numpy()
+    X = df[:, :-1]
+    y = np.log(1 + df[:, -1])
+    return (X, y)
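+
+
+# Usage sketch (kept as a comment, since the call downloads a few megabytes
+# from the UCI repository):
+#     X, y = get_X_y()
+# The target transform above is the classical log1p; the quick check below
+# illustrates the equivalence on a few illustrative comment counts.
+demo_counts = np.array([0.0, 1.0, 10.0])
+assert np.allclose(np.log(1 + demo_counts), np.log1p(demo_counts))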
+
+
+class Ridge2(RegressorMixin, BaseEstimator):  # type: ignore
+    """
+    Slight variant of ``Ridge`` proposed in [1], whose regularization
+    parameter is scaled to the largest singular value of the training set.
+
+    Parameters
+    ----------
+    ridge_mult : float
+        Multiplicative factor such that the alpha parameter of the ``Ridge``
+        model fitted by ``Ridge2`` is the squared maximum singular value of
+        the training set times ``ridge_mult``.
+    """
+
+    def __init__(self, ridge_mult: float = 0.001) -> None:
+        self.ridge_mult = ridge_mult
+        self.__name__ = "Ridge2"
+
+    def fit(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> Ridge2:
+        """
+        Fit Ridge2.
+
+        Parameters
+        ----------
+        X : ArrayLike of shape (n_samples, n_features)
+            Training data.
+
+        y : ArrayLike of shape (n_samples,)
+            Training labels.
+
+        Returns
+        -------
+        Ridge2
+            The model itself.
+        """
+        alpha = self.ridge_mult * np.linalg.svd(X, compute_uv=False).max() ** 2
+        self.ridge2 = Ridge(alpha=alpha).fit(X=X, y=y)
+        return self
+
+    def predict(self, X: ArrayLike) -> ArrayLike:
+        """
+        Predict target on new samples.
+
+        Parameters
+        ----------
+        X : ArrayLike of shape (n_samples, n_features)
+            Test data.
+
+        Returns
+        -------
+        np.ndarray of shape (n_samples,)
+            Predictions on test data.
+        """
+        return self.ridge2.predict(X)
+
+
+def compute_PIs(
+    estimator: BaseEstimator,
+    X_train: ArrayLike,
+    y_train: ArrayLike,
+    X_test: ArrayLike,
+    method: str,
+    cv: Any,
+    alpha: float,
+    agg_function: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Train and test a model with a MAPIE method, and return a DataFrame of
+    the lower and upper bounds of the prediction intervals on the test set.
+
+    Parameters
+    ----------
+    estimator : BaseEstimator
+        Base model to fit.
+    X_train : ArrayLike
+        Features of the training set.
+    y_train : ArrayLike
+        Target of the training set.
+    X_test : ArrayLike
+        Features of the test set.
+    method : str
+        Method for estimating prediction intervals.
+    cv : Any
+        Strategy for computing residuals.
+    alpha : float
+        1 - (target coverage level).
+    agg_function : str
+        'mean' or 'median'.
+        Function to aggregate the predictions of the B estimators.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame of lower and upper predictions.
+    """
+    mapie_estimator = MapieRegressor(
+        estimator=estimator,
+        method=method,
+        cv=cv,
+        n_jobs=-1,
+        agg_function=agg_function,
+    )
+
+    mapie_estimator = mapie_estimator.fit(X=X_train, y=y_train)
+    _, y_pis = mapie_estimator.predict(X=X_test, alpha=alpha)
+    PI = np.c_[y_pis[:, 0, 0], y_pis[:, 1, 0]]
+    return pd.DataFrame(PI, columns=["lower", "upper"])
+
+
+def get_coverage_width(PIs: pd.DataFrame, y: ArrayLike) -> Tuple[float, float]:
+    """
+    Computes the mean coverage and width of the prediction intervals of a
+    DataFrame returned by the ``compute_PIs`` function.
+
+    Parameters
+    ----------
+    PIs : pd.DataFrame
+        DataFrame returned by ``compute_PIs``, with lower and upper bounds
+        of the PIs.
+
+    y : ArrayLike
+        Targets that the PIs are expected to cover.
+
+    Returns
+    -------
+    (coverage, width) : Tuple[float, float]
+        The mean coverage and width of the PIs.
+    """
+    coverage = regression_coverage_score(
+        y_true=y, y_pred_low=PIs["lower"], y_pred_up=PIs["upper"]
+    )
+    width = regression_mean_width_score(
+        y_pred_low=PIs["lower"], y_pred_up=PIs["upper"]
+    )
+    return (coverage, width)
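+
+
+# Minimal end-to-end sketch on synthetic data (not the BLOG data set),
+# illustrating the two strategies compared in this simulation: jackknife+
+# (``cv=-1``) and jackknife+-after-bootstrap (``cv=Subsample(...)``).
+# Sizes are kept deliberately small; the printed numbers are illustrative.
+rng_demo = np.random.RandomState(0)
+X_demo = rng_demo.normal(size=(60, 3))
+y_demo = X_demo @ np.array([1.0, 2.0, -1.0]) + rng_demo.normal(size=60)
+X_tr, X_te, y_tr, y_te = train_test_split(
+    X_demo, y_demo, train_size=40, random_state=0
+)
+PIs_jplus = compute_PIs(
+    Ridge2(), X_tr, y_tr, X_te, method="plus", cv=-1,
+    alpha=0.1, agg_function="mean",
+)
+subsample_demo = Subsample(
+    n_resamplings=20, n_samples=20, replace=True, random_state=0
+)
+PIs_jab = compute_PIs(
+    Ridge2(), X_tr, y_tr, X_te, method="plus", cv=subsample_demo,
+    alpha=0.1, agg_function="mean",
+)
+print("J+   (coverage, width):", get_coverage_width(PIs_jplus, y_te))
+print("J+aB (coverage, width):", get_coverage_width(PIs_jab, y_te))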
+
+
+def B_random_from_B_fixed(
+    B: int, train_size: int, m: int, itrial: int = 0, random_state: int = 98765
+) -> int:
+    """
+    Generates the random number of bootstrap resamplings for J+aB,
+    drawn from a binomial distribution as in [1].
+
+    Parameters
+    ----------
+    B : int
+        Fixed B, parameter of the binomial distribution.
+    train_size : int
+        Training set size.
+    m : int
+        Resampling set size.
+    itrial : int
+        Number of the trial.
+    random_state : int
+        Base random state (fixed according to [1]).
+
+    Returns
+    -------
+    int
+        Integer drawn according to
+        Binomial(int(B/(1-1/(train_size+1))^m), (1-1/(train_size+1))^m),
+        where B, train_size and m are parameters.
+    """
+    np.random.seed(random_state + itrial)
+    return int(
+        np.random.binomial(
+            int(B / (1 - 1.0 / (1 + train_size)) ** m),
+            (1 - 1.0 / (1 + train_size)) ** m,
+            size=1,
+        )
+    )
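+
+
+# Sanity check (sketch): the expectation of the draw above is close to B,
+# since E[Binomial(N, p)] = N * p with N = int(B / p) and
+# p = (1 - 1/(train_size+1))^m, so the fixed budget B is recovered on
+# average. The parameter values below are illustrative.
+B_draws = [
+    B_random_from_B_fixed(B=50, train_size=200, m=100, itrial=i)
+    for i in range(100)
+]
+print("Mean of random B draws:", np.mean(B_draws))  # close to 50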
+
+
+def comparison_JAB(
+    model: BaseEstimator = Ridge2(),
+    agg_function: str = "mean",
+    alpha: float = 0.1,
+    trials: int = 10,
+    train_size: int = 200,
+    bootstrap_size: int = 10,
+    B_fixed: int = 50,
+    random_state: int = 98765,
+) -> pd.DataFrame:
+    """
+    Launches trials of jackknife+ and jackknife+-after-bootstrap, with
+    fixed and random B, for several resampling set sizes and a given
+    number of trials, and returns the results as a DataFrame.
+
+    Parameters
+    ----------
+    model : BaseEstimator
+        Base model. By default, Ridge2.
+    agg_function : str
+        Aggregation function to test.
+    alpha : float
+        1 - (target coverage level).
+    trials : int
+        Number of trials launched for a given bootstrap set size.
+    train_size : int
+        Size of the training set.
+    bootstrap_size : int
+        Number of bootstrap set sizes to test, uniformly distributed
+        between 10% and 100% of the training set size.
+    B_fixed : int
+        Fixed number of bootstrap resamplings in J+aB. The random number
+        of resamplings is drawn as
+        B ~ Binomial(int(B_fixed/(1-1/(n+1))^m), (1-1/(n+1))^m),
+        where n is the training set size, and m the resampling set size.
+    random_state : int
+        Random state. By default, 98765 (from [1]).
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with columns:
+        - itrial : the number of the trial
+        - estimator : the estimator's name
+        - method : jackknife+ or jackknife+-after-bootstrap
+        - coverage : the PIs' coverage
+        - width : the PIs' mean width
+        - m : the resampling set size
+        - agg_function : the aggregation method
+        - alpha : 1 - (target coverage level)
+    """
+
+    results = pd.DataFrame(
+        columns=["itrial", "estimator", "method", "coverage", "width", "m"],
+        index=np.arange(trials * (2 * bootstrap_size + 1)),
+    )
+
+    (X, y) = get_X_y()
+    m_vals = np.round(
+        train_size * np.linspace(0.1, 1, num=bootstrap_size)
+    ).astype(int)
+
+    result_index = 0
+    for itrial in range(trials):
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, train_size=train_size, random_state=random_state + itrial
+        )
+        PIs = compute_PIs(
+            estimator=model,
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            method="plus",
+            cv=-1,
+            alpha=alpha,
+            agg_function=agg_function,
+        )
+        (coverage, width) = get_coverage_width(PIs, y_test)
+        results.iloc[result_index, :] = [
+            itrial,
+            type(model).__name__,
+            "J+",
+            coverage,
+            width,
+            0,
+        ]
+        result_index += 1
+
+        for i_m, m in enumerate(m_vals):
+            # J+aB, random B
+            B_random = B_random_from_B_fixed(
+                B_fixed, train_size, m, itrial=i_m
+            )
+            subsample_B_random = Subsample(
+                n_resamplings=B_random,
+                n_samples=m,
+                replace=True,
+                random_state=random_state,
+            )
+            PIs = compute_PIs(
+                estimator=model,
+                X_train=X_train,
+                y_train=y_train,
+                X_test=X_test,
+                method="plus",
+                cv=subsample_B_random,
+                alpha=alpha,
+                agg_function=agg_function,
+            )
+            (coverage, width) = get_coverage_width(PIs, y_test)
+            results.iloc[result_index, :] = [
+                itrial,
+                type(model).__name__,
+                "J+aB Random B",
+                coverage,
+                width,
+                m,
+            ]
+            result_index += 1
+
+            # J+aB, fixed B
+            subsample_B_fixed = Subsample(
+                n_resamplings=B_fixed,
+                n_samples=m,
+                replace=True,
+                random_state=random_state,
+            )
+            PIs = compute_PIs(
+                estimator=model,
+                X_train=X_train,
+                y_train=y_train,
+                X_test=X_test,
+                method="plus",
+                cv=subsample_B_fixed,
+                alpha=alpha,
+                agg_function=agg_function,
+            )
+            (coverage, width) = get_coverage_width(PIs, y_test)
+            results.iloc[result_index, :] = [
+                itrial,
+                type(model).__name__,
+                "J+aB Fixed B",
+                coverage,
+                width,
+                m,
+            ]
+            result_index += 1
+
+    results["agg_function"] = agg_function
+    results["alpha"] = alpha
+    results = results.astype(
+        {
+            "itrial": int,
+            "estimator": str,
+            "method": str,
+            "coverage": float,
+            "width": float,
+            "m": int,
+            "agg_function": str,
+        }
+    )
+    return results
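+
+
+# Post-processing sketch (kept as comments, since ``comparison_JAB``
+# downloads the BLOG data set): once run, the per-trial results can be
+# summarized by method, for instance:
+#     results = comparison_JAB(trials=2, train_size=40, bootstrap_size=5)
+#     print(results.groupby("method")[["coverage", "width"]].mean())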
+
+
+def plot_results(results: pd.DataFrame, score: str) -> None:
+    """
+    Compares the desired score (i.e. coverage or width) between the
+    jackknife+ and jackknife+-after-bootstrap methods, and between the
+    fixed and random B parameters, as in the simulations of [1].
+
+    Parameters
+    ----------
+    results : pd.DataFrame
+        DataFrame returned by ``comparison_JAB``.
+    score : str
+        Either 'coverage' or 'width'.
+    """
+
+    res = results.copy()
+    res["fixed_random"] = res["method"].str.split().str.get(1)
+    res["method"] = res["method"].str.split().str.get(0)
+    m_vals = res["m"].values
+    res["ratio"] = m_vals / m_vals.max()
+    data_J = res.loc[res.method == "J+", ["ratio", score]]
+    data_JaB = res.loc[res.method == "J+aB", ["ratio", score]]
+    data_fix = res.loc[res.fixed_random == "Fixed", ["ratio", score]]
+    data_random = res.loc[res.fixed_random == "Random", ["ratio", score]]
+
+    alpha = pd.unique(results["alpha"])[0]
+
+    # Plot the comparison between J+ and J+aB.
+    fig, axes = plt.subplots(1, 2, figsize=(8, 6), sharey=True)
+    data_J.boxplot(by="ratio", ax=axes[0])
+    data_JaB.boxplot(by="ratio", ax=axes[1])
+
+    if score == "coverage":
+        axes[0].axhline(y=1 - alpha, color="red")
+        axes[1].axhline(y=1 - alpha, color="red")
+        axes[0].yaxis.set_major_formatter(
+            mtick.PercentFormatter(1, decimals=0)
+        )
+    axes[0].set_title("J+")
+    axes[1].set_title("J+aB")
+    axes[0].set_xticks([])
+    axes[0].set_xlabel("")
+    axes[1].set_xlabel("m/n", fontsize=14)
+    axes[0].set_ylabel(score)
+    fig.suptitle(
+        f"\n Agg. function = {res['agg_function'].unique()[0]}"
+        + f"\nModel = {res.estimator.unique()[0]}"
+        + "\n Data=BLOG"
+    )
+    plt.show()
+
+    # Plot the comparison between fixed and random B.
+    fig, axes = plt.subplots(1, 2, figsize=(8, 6), sharey=True)
+    data_fix.boxplot(by="ratio", ax=axes[0])
+    data_random.boxplot(by="ratio", ax=axes[1])
+    axes[0].set_title("Fixed B", fontsize=14)
+    axes[1].set_title("Random B", fontsize=14)
+    if score == "coverage":
+        axes[0].axhline(y=1 - alpha, color="red")
+        axes[1].axhline(y=1 - alpha, color="red")
+        axes[0].yaxis.set_major_formatter(
+            mtick.PercentFormatter(1, decimals=0)
+        )
+    axes[0].set_ylabel(score)
+    axes[1].set_ylabel("")
+    for ax in axes:
+        ax.set_xlabel("m/n", fontsize=14)
+    fig.suptitle(
+        f"\n Agg. function = {res['agg_function'].unique()[0]}"
+        + f"\nModel = {res.estimator.unique()[0]}"
+        + "\n Data=BLOG"
+    )
+    plt.show()
+
+
+if __name__ == "__main__":
+
+    results_coverages_widths = comparison_JAB(
+        model=Ridge2(),
+        alpha=0.1,
+        trials=2,
+        train_size=40,
+        bootstrap_size=5,
+        B_fixed=20,
+    )
+    results_coverages_widths.info()
+    plot_results(results_coverages_widths, "coverage")
+    plot_results(results_coverages_widths, "width")
diff --git a/mapie/metrics.py b/mapie/metrics.py
index 4533ae342..3c5024e4b 100644
--- a/mapie/metrics.py
+++ b/mapie/metrics.py
@@ -87,3 +87,37 @@ def classification_coverage_score(
         y_pred_set, y_true.reshape(-1, 1), axis=1
     ).mean()
     return float(coverage)
+
+
+def regression_mean_width_score(
+    y_pred_low: ArrayLike,
+    y_pred_up: ArrayLike,
+) -> float:
+    """
+    Effective mean width score of the prediction intervals.
+
+    Parameters
+    ----------
+    y_pred_low : ArrayLike of shape (n_samples,)
+        Lower bound of prediction intervals.
+    y_pred_up : ArrayLike of shape (n_samples,)
+        Upper bound of prediction intervals.
+
+    Returns
+    -------
+    float
+        Effective mean width of the prediction intervals.
+
+    Examples
+    --------
+    >>> from mapie.metrics import regression_mean_width_score
+    >>> import numpy as np
+    >>> y_pred_low = np.array([4, 6, 9, 8.5, 10.5])
+    >>> y_pred_up = np.array([6, 9, 10, 12.5, 12])
+    >>> print(regression_mean_width_score(y_pred_low, y_pred_up))
+    2.3
+    """
+    y_pred_low = column_or_1d(y_pred_low)
+    y_pred_up = column_or_1d(y_pred_up)
+    mean_width = np.abs(y_pred_up - y_pred_low).mean()
+    return float(mean_width)
diff --git a/mapie/subsample.py b/mapie/subsample.py
index ea15b6c7b..abb7b31c8 100644
--- a/mapie/subsample.py
+++ b/mapie/subsample.py
@@ -95,7 +95,7 @@ def get_n_splits(self, *args: Any, **kargs: Any) -> int:
 
         Returns
         -------
-        n_splits : int
+        int
             Returns the number of splitting iterations in the cross-validator.
         """
         return self.n_resamplings
diff --git a/mapie/tests/test_metrics.py b/mapie/tests/test_metrics.py
index 2aa361434..1f415d666 100644
--- a/mapie/tests/test_metrics.py
+++ b/mapie/tests/test_metrics.py
@@ -4,8 +4,9 @@
 import pytest
 import numpy as np
 from mapie.metrics import (
-    regression_coverage_score,
     classification_coverage_score,
+    regression_coverage_score,
+    regression_mean_width_score,
 )
 
 
@@ -36,39 +37,45 @@ def test_regression_ypredlow_shape() -> None:
     "Test shape of y_pred_low."
     with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
         regression_coverage_score(y_toy, y_preds[:, :2], y_preds[:, 2])
+    with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
+        regression_mean_width_score(y_preds[:, :2], y_preds[:, 2])
 
 
 def test_regression_ypredup_shape() -> None:
     "Test shape of y_pred_up."
     with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
         regression_coverage_score(y_toy, y_preds[:, 1], y_preds[:, 1:])
+    with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
+        regression_mean_width_score(y_preds[:, 1], y_preds[:, 1:])
 
 
 def test_regression_same_length() -> None:
     "Test when y_true and y_preds have different lengths."
     with pytest.raises(ValueError, match=r".*could not be broadcast*"):
         regression_coverage_score(y_toy, y_preds[:-1, 1], y_preds[:-1, 2])
+    with pytest.raises(ValueError, match=r".*could not be broadcast*"):
+        regression_mean_width_score(y_preds[:-1, 1], y_preds[:, 2])
 
 
-def test_regression_toydata() -> None:
+def test_regression_toydata_coverage_score() -> None:
     "Test coverage_score for toy data."
     scr = regression_coverage_score(y_toy, y_preds[:, 1], y_preds[:, 2])
     assert scr == 0.8
 
 
-def test_regression_ytrue_type() -> None:
+def test_regression_ytrue_type_coverage_score() -> None:
     "Test that list(y_true) gives right coverage."
     scr = regression_coverage_score(list(y_toy), y_preds[:, 1], y_preds[:, 2])
     assert scr == 0.8
 
 
-def test_regression_ypredlow_type() -> None:
+def test_regression_ypredlow_type_coverage_score() -> None:
     "Test that list(y_pred_low) gives right coverage."
     scr = regression_coverage_score(y_toy, list(y_preds[:, 1]), y_preds[:, 2])
     assert scr == 0.8
 
 
-def test_regression_ypredup_type() -> None:
+def test_regression_ypredup_type_coverage_score() -> None:
     "Test that list(y_pred_up) gives right coverage."
     scr = regression_coverage_score(y_toy, y_preds[:, 1], list(y_preds[:, 2]))
     assert scr == 0.8
@@ -109,3 +116,21 @@ def test_classification_y_pred_set_type() -> None:
     "Test that list(y_pred_set) gives right coverage."
     scr = classification_coverage_score(y_true_class, list(y_pred_set))
     assert scr == 0.8
+
+
+def test_regression_toydata_mean_width_score() -> None:
+    "Test mean_width_score for toy data."
+    scr = regression_mean_width_score(y_preds[:, 1], y_preds[:, 2])
+    assert scr == 2.3
+
+
+def test_regression_ypredlow_type_mean_width_score() -> None:
+    "Test that list(y_pred_low) gives right width."
+    scr = regression_mean_width_score(list(y_preds[:, 1]), y_preds[:, 2])
+    assert scr == 2.3
+
+
+def test_regression_ypredup_type_mean_width_score() -> None:
+    "Test that list(y_pred_up) gives right width."
+    scr = regression_mean_width_score(y_preds[:, 1], list(y_preds[:, 2]))
+    assert scr == 2.3
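
For reference, a short standalone snippet illustrating the two regression
metrics exercised by these tests (a sketch: the interval bounds reuse the
doctest values above, while ``y_true`` is an arbitrary illustrative target):

import numpy as np

from mapie.metrics import (
    regression_coverage_score,
    regression_mean_width_score,
)

y_true = np.array([5, 7.5, 9.5, 10.5, 12.5])  # illustrative targets
y_low = np.array([4, 6, 9, 8.5, 10.5])
y_up = np.array([6, 9, 10, 12.5, 12])
print(regression_coverage_score(y_true, y_low, y_up))  # 0.8: 4 of 5 covered
print(regression_mean_width_score(y_low, y_up))  # 2.3: mean of |up - low|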