class JackknifeAB restricted to a smaller class Resample and argument… #96

Merged: 8 commits, Oct 12, 2021
Changes from 7 commits
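The user-facing change visible in the example diffs below is that `MapieRegressor` no longer takes a boolean `ensemble` argument; aggregation of out-of-fold predictions is instead controlled by an explicit `agg_function` argument. A minimal sketch on toy data (constructor and `predict` arguments are taken from the updated examples; the data are illustrative):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from mapie.regression import MapieRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
y = X @ np.array([1.0, -2.0]) + rng.normal(scale=0.1, size=100)

# Before this PR the examples passed ensemble=True to MapieRegressor;
# after it they pass an explicit aggregation function instead.
mapie = MapieRegressor(
    LinearRegression(), method="plus", cv=5, agg_function="median", n_jobs=-1
)
mapie.fit(X, y)
y_pred, y_pis = mapie.predict(X, alpha=0.05)
# y_pis has shape (n_samples, 2, n_alpha): lower/upper bound per alpha.
```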
6 changes: 6 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.DS_Store

# C extensions
*.so
@@ -13,7 +14,12 @@ doc/examples_regression/
doc/auto_examples/
doc/modules/generated/
doc/datasets/generated/
doc/examples_regression/
doc/examples_classification/
doc/generated/

# Distribution / packaging

Binary file modified doc/images/quickstart_1.png
72 changes: 33 additions & 39 deletions examples/regression/plot_barber2020_simulations.py
@@ -28,21 +28,20 @@
"Predictive inference with the jackknife+."
Ann. Statist., 49(1):486–507, February 2021.
"""
from typing import List, Dict, Any
from typing import Any, Dict, List

import numpy as np
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt

from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score
from mapie.regression import MapieRegressor
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression


def PIs_vs_dimensions(
strategies: Dict[str, Any],
alpha: float,
n_trial: int,
dimensions: List[int]
dimensions: List[int],
) -> Dict[str, Dict[int, Dict[str, np.ndarray]]]:
"""
Compute the prediction intervals for a linear regression problem.
@@ -89,15 +88,17 @@ def PIs_vs_dimensions(
strategy: {
dimension: {
"coverage": np.empty(n_trial),
"width_mean": np.empty(n_trial)
} for dimension in dimensions
} for strategy in strategies
"width_mean": np.empty(n_trial),
}
for dimension in dimensions
}
for strategy in strategies
}
for dimension in dimensions:
for trial in range(n_trial):
beta = np.random.normal(size=dimension)
beta_norm = np.sqrt((beta**2).sum())
beta = beta/beta_norm*np.sqrt(SNR)
beta_norm = np.sqrt((beta ** 2).sum())
beta = beta / beta_norm * np.sqrt(SNR)
X_train = np.random.normal(size=(n_train, dimension))
noise_train = np.random.normal(size=n_train)
noise_test = np.random.normal(size=n_test)
@@ -108,26 +109,23 @@
for strategy, params in strategies.items():
mapie = MapieRegressor(
LinearRegression(),
ensemble=True,
agg_function="median",
n_jobs=-1,
**params
)
mapie.fit(X_train, y_train)
y_pred, y_pis = mapie.predict(X_test, alpha=alpha)
results[strategy][dimension]["coverage"][trial] = (
regression_coverage_score(
y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]
)
coverage = regression_coverage_score(
y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]
)
results[strategy][dimension]["width_mean"][trial] = (
y_pis[:, 1, 0] - y_pis[:, 0, 0]
).mean()
results[strategy][dimension]["coverage"][trial] = coverage
width_mean = (y_pis[:, 1, 0] - y_pis[:, 0, 0]).mean()
results[strategy][dimension]["width_mean"][trial] = width_mean
return results


def plot_simulation_results(
results: Dict[str, Dict[int, Dict[str, np.ndarray]]],
title: str
results: Dict[str, Dict[int, Dict[str, np.ndarray]]], title: str
) -> None:
"""
Show the prediction interval coverages and widths as a function
@@ -148,35 +146,31 @@ def plot_simulation_results(
for strategy in results:
dimensions = list(results[strategy].keys())
n_dim = len(dimensions)
coverage_mean, coverage_SE, width_mean, width_SE = (
np.zeros(n_dim), np.zeros(n_dim), np.zeros(n_dim), np.zeros(n_dim)
)
coverage_mean = np.zeros(n_dim)
coverage_SE = np.zeros(n_dim)
width_mean = np.zeros(n_dim)
width_SE = np.zeros(n_dim)

for idim, dim in enumerate(dimensions):
coverage_mean[idim] = (
results[strategy][dim]["coverage"].mean()
)
coverage_SE[idim] = (
results[strategy][dim]["coverage"].std()/np.sqrt(ntrial)
)
width_mean[idim] = (
results[strategy][dim]["width_mean"].mean()
)
width_SE[idim] = (
results[strategy][dim]["width_mean"].std()/np.sqrt(ntrial)
)
coverage = results[strategy][dim]["coverage"]
coverage_mean[idim] = coverage.mean()
coverage_SE[idim] = coverage.std() / np.sqrt(ntrial)
width = results[strategy][dim]["width_mean"]
width_mean[idim] = width.mean()
width_SE[idim] = width.std() / np.sqrt(ntrial)
ax1.plot(dimensions, coverage_mean, label=strategy)
ax1.fill_between(
dimensions,
coverage_mean - coverage_SE,
coverage_mean + coverage_SE,
alpha=0.25
alpha=0.25,
)
ax2.plot(dimensions, width_mean, label=strategy)
ax2.fill_between(
dimensions,
width_mean - width_SE,
width_mean + width_SE,
alpha=0.25
alpha=0.25,
)
ax1.axhline(1 - alpha, linestyle="dashed", c="k")
ax1.set_ylim(0.0, 1.0)
@@ -192,7 +186,7 @@
STRATEGIES = {
"naive": dict(method="naive"),
"cv": dict(method="base", cv=5),
"cv_plus": dict(method="plus", cv=5)
"cv_plus": dict(method="plus", cv=5),
}
alpha = 0.1
ntrial = 3
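For readers skimming the example above: `regression_coverage_score` measures the fraction of test targets that fall inside their prediction interval. A minimal re-implementation of the idea (not MAPIE's code, but the same call signature used in the diff):

```python
import numpy as np

def coverage(y_true: np.ndarray, y_low: np.ndarray, y_up: np.ndarray) -> float:
    """Empirical coverage: fraction of targets inside [y_low, y_up]."""
    return float(np.mean((y_low <= y_true) & (y_true <= y_up)))

y_true = np.array([1.0, 2.0, 3.0])
y_low = np.array([0.5, 2.5, 2.0])
y_up = np.array([1.5, 3.5, 4.0])
print(coverage(y_true, y_low, y_up))  # 2 of 3 targets covered -> 0.666...
```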
42 changes: 22 additions & 20 deletions examples/regression/plot_both_uncertainties.py
@@ -6,29 +6,27 @@
prediction intervals capturing both aleatoric and epistemic uncertainties
on a one-dimensional dataset with homoscedastic noise and normal sampling.
"""
from typing import Tuple, Any, TypeVar, Callable
from typing_extensions import TypedDict
import numpy as np
from typing import Any, Callable, Tuple, TypeVar

import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import numpy as np
from mapie.regression import MapieRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from mapie.regression import MapieRegressor
from typing_extensions import TypedDict

F = TypeVar("F", bound=Callable[..., Any])


# Functions for generating our dataset
def x_sinx(x: np.ndarray) -> Any:
"""One-dimensional x*sin(x) function."""
return x*np.sin(x)
return x * np.sin(x)


def get_1d_data_with_normal_distrib(
funct: F,
mu: float,
sigma: float,
n_samples: int,
noise: float
funct: F, mu: float, sigma: float, n_samples: int, noise: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""
Generate noisy 1D data with normal distribution from given function
@@ -59,12 +57,16 @@ def get_1d_data_with_normal_distrib(
"""
np.random.seed(42)
X_train = np.random.normal(mu, sigma, n_samples)
X_test = np.arange(mu - 4*sigma, mu + 4*sigma, sigma/20.)
X_test = np.arange(mu - 4 * sigma, mu + 4 * sigma, sigma / 20.0)
y_train, y_mesh, y_test = funct(X_train), funct(X_test), funct(X_test)
y_train += np.random.normal(0, noise, y_train.shape[0])
y_test += np.random.normal(0, noise, y_test.shape[0])
return (
X_train.reshape(-1, 1), y_train, X_test.reshape(-1, 1), y_test, y_mesh
X_train.reshape(-1, 1),
y_train,
X_test.reshape(-1, 1),
y_test,
y_mesh,
)


@@ -79,7 +81,7 @@ def get_1d_data_with_normal_distrib(
polyn_model = Pipeline(
[
("poly", PolynomialFeatures(degree=degree_polyn)),
("linear", LinearRegression())
("linear", LinearRegression()),
]
)

@@ -93,7 +95,7 @@ def get_1d_data_with_normal_distrib(
}
y_pred, y_pis = {}, {}
for strategy, params in STRATEGIES.items():
mapie = MapieRegressor(polyn_model, ensemble=False, **params)
mapie = MapieRegressor(polyn_model, **params)
mapie.fit(X_train, y_train)
y_pred[strategy], y_pis[strategy] = mapie.predict(X_test, alpha=0.05)

@@ -109,12 +111,12 @@ def plot_1d_data(
y_pred_low: np.ndarray,
y_pred_up: np.ndarray,
ax: plt.Axes,
title: str
title: str,
) -> None:
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_xlim([-10, 10])
ax.set_ylim([np.min(y_test)*1.3, np.max(y_test)*1.3])
ax.set_ylim([np.min(y_test) * 1.3, np.max(y_test) * 1.3])
ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3)
ax.scatter(X_train, y_train, color="red", alpha=0.3, label="Training data")
ax.plot(X_test, y_test, color="gray", label="True confidence intervals")
@@ -135,12 +137,12 @@ def plot_1d_data(
y_train.ravel(),
X_test.ravel(),
y_mesh.ravel(),
1.96*noise,
1.96 * noise,
y_pred[strategy].ravel(),
y_pis[strategy][:, 0, 0].ravel(),
y_pis[strategy][:, 1, 0].ravel(),
ax=coord,
title=strategy
title=strategy,
)


@@ -149,7 +151,7 @@ def plot_1d_data(
ax.set_ylim([0, 4])
for strategy in STRATEGIES:
ax.plot(X_test, y_pis[strategy][:, 1, 0] - y_pis[strategy][:, 0, 0])
ax.axhline(1.96*2*noise, ls="--", color="k")
ax.axhline(1.96 * 2 * noise, ls="--", color="k")
ax.set_xlabel("x")
ax.set_ylabel("Prediction Interval Width")
ax.legend(list(STRATEGIES.keys()) + ["True width"], fontsize=8)
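A note on the dashed reference line added by `ax.axhline(1.96 * 2 * noise, ...)` in the width plot above: with homoscedastic Gaussian noise of standard deviation `noise`, the narrowest 95% interval around the true mean f(x) is f(x) ± 1.96·noise, so the "True width" is 2·1.96·noise. A quick numeric check (the `noise` value below is illustrative; the example sets its own):

```python
from scipy.stats import norm

noise = 0.5                    # illustrative noise standard deviation
z = norm.ppf(0.975)            # ~1.959964, the 97.5% normal quantile
oracle_width = 2 * z * noise   # width of the ideal 95% interval
print(round(z, 3), round(oracle_width, 3))
```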
47 changes: 24 additions & 23 deletions examples/regression/plot_homoscedastic_1d_data.py
@@ -8,27 +8,24 @@
different strategies.
"""
from typing import Tuple
from typing_extensions import TypedDict

import numpy as np
import scipy
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt

from mapie.regression import MapieRegressor
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from typing_extensions import TypedDict


def f(x: np.ndarray) -> np.ndarray:
"""Polynomial function used to generate one-dimensional data"""
return np.array(5*x + 5*x**4 - 9*x**2)
return np.array(5 * x + 5 * x ** 4 - 9 * x ** 2)


def get_homoscedastic_data(
n_train: int = 200,
n_true: int = 200,
sigma: float = 0.1
n_train: int = 200, n_true: int = 200, sigma: float = 0.1
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, float]:
"""
Generate one-dimensional data from a given function,
@@ -61,7 +58,7 @@ def get_homoscedastic_data(
X_true = np.linspace(0, 1, n_true)
y_train = f(X_train) + np.random.normal(0, sigma, n_train)
y_true = f(X_true)
y_true_sigma = q95*sigma
y_true_sigma = q95 * sigma
return X_train, y_train, X_true, y_true, y_true_sigma


@@ -75,7 +72,7 @@ def plot_1d_data(
y_pred_low: np.ndarray,
y_pred_up: np.ndarray,
ax: plt.Axes,
title: str
title: str,
) -> None:
"""
Generate a figure showing the training data and estimated
@@ -120,10 +117,12 @@ def plot_1d_data(

X_train, y_train, X_test, y_test, y_test_sigma = get_homoscedastic_data()

polyn_model = Pipeline([
("poly", PolynomialFeatures(degree=4)),
("linear", LinearRegression(fit_intercept=False))
])
polyn_model = Pipeline(
[
("poly", PolynomialFeatures(degree=4)),
("linear", LinearRegression(fit_intercept=False)),
]
)

Params = TypedDict("Params", {"method": str, "cv": int})
STRATEGIES = {
@@ -134,17 +133,19 @@
"cv_plus": Params(method="plus", cv=10),
"cv_minmax": Params(method="minmax", cv=10),
}
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(3*6, 12))
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(
2, 3, figsize=(3 * 6, 12)
)
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
for i, (strategy, params) in enumerate(STRATEGIES.items()):
mapie = MapieRegressor(
polyn_model,
ensemble=True,
n_jobs=-1,
**params
polyn_model, agg_function="median", n_jobs=-1, **params
)
mapie.fit(X_train.reshape(-1, 1), y_train)
y_pred, y_pis = mapie.predict(X_test.reshape(-1, 1), alpha=0.05,)
y_pred, y_pis = mapie.predict(
X_test.reshape(-1, 1),
alpha=0.05,
)
plot_1d_data(
X_train,
y_train,
@@ -155,6 +156,6 @@
y_pis[:, 0, 0],
y_pis[:, 1, 0],
axs[i],
strategy
strategy,
)
plt.show()
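A note on the pipeline in this last example: `PolynomialFeatures` includes a bias column by default, which is why the pipeline's `LinearRegression` sets `fit_intercept=False` rather than fitting a second, redundant intercept. A quick illustration:

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2.0]])
print(PolynomialFeatures(degree=4).fit_transform(X))
# [[ 1.  2.  4.  8. 16.]] -- the leading 1 is the built-in bias term,
# so the downstream LinearRegression needs no intercept of its own.
```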