scikit-adaptation · tgnassou · Feb 7, 2024 · Feb 1, 2024 · Feb 1, 2024 · Feb 1, 2024
diff --git a/examples/datasets/plot_shifted_dataset_regression.py b/examples/datasets/plot_shifted_dataset_regression.py
@@ -0,0 +1,89 @@
+"""
+Plot dataset source domain and shifted target domain
+====================================================
+
+This illustrates the :func:`~skada.datasets.make_shifted_datasets`
+dataset generator with regression labels. Each method consists of
+generating source data and shifted target data. We illustrate here:
+covariate shift, target shift, and concept drift.
+See detailed description of each shift in [1]_.
+
+.. [1] Moreno-Torres, J. G., Raeder, T., Alaiz-Rodriguez,
+       R., Chawla, N. V., and Herrera, F. (2012).
+       A unifying view on dataset shift in classification.
+       Pattern recognition, 45(1):521-530.
+"""
+# %% Imports
+
+import matplotlib.pyplot as plt
+
+from skada.datasets import make_shifted_datasets
+from skada import source_target_split
+
+
+# %% Helper function
+
+
+
+def plot_shifted_dataset(shift, random_state=42):
+    """Plot source and shifted target data for a given type of shift.
+
+    The possible shifts are 'covariate_shift', 'target_shift' or
+    'concept_drift'.
+
+    We use here the same random seed for multiple calls to
+    ensure same distributions.
+
+    Parameters
+    ----------
+    shift : str
+        Type of shift passed to ``make_shifted_datasets``.
+    random_state : int, default=42
+        Seed passed to ``make_shifted_datasets``.
+    """
+    X, y, sample_domain = make_shifted_datasets(
+        n_samples_source=20,
+        n_samples_target=20,
+        shift=shift,
+        noise=0.3,
+        label="regression",
+        random_state=random_state,
+    )
+    X_source, X_target, y_source, y_target = source_target_split(
+        X, y, sample_domain=sample_domain
+    )
+
+    fig, axes = plt.subplots(1, 2, sharex="row", sharey="row", figsize=(8, 4))
+    fig.suptitle(shift.replace("_", " ").title(), fontsize=14)
+    plt.subplots_adjust(bottom=0.15)
+    panels = (
+        ("Source data", X_source, y_source),
+        ("Target data", X_target, y_target),
+    )
+    for ax, (title, X_dom, y_dom) in zip(axes, panels):
+        s = ax.scatter(
+            X_dom[:, 0],
+            X_dom[:, 1],
+            c=y_dom * 10,
+            vmax=1,
+            alpha=0.5,
+        )
+        # Attach each colorbar explicitly to the axes it describes.
+        cb = fig.colorbar(s, ax=ax)
+        cb.set_label("y-value*10")
+        ax.set_title(title)
+        ax.set_xlabel("Feature 1")
+        ax.set_ylabel("Feature 2")
+
+    plt.show()
+
+
+# %% Visualize shifted datasets
+
+# One figure per shift type illustrated in this example.
+for shift in (
+    "covariate_shift",
+    "target_shift",
+    "concept_drift",
+):
+    plot_shifted_dataset(shift)
diff --git a/skada/datasets/_samples_generator.py b/skada/datasets/_samples_generator.py
@@ -10,6 +10,7 @@
 
 from scipy import signal
 from scipy.fftpack import rfft, irfft
+from scipy.stats import multivariate_normal
 
 from sklearn.datasets import make_blobs
 
@@ -37,6 +38,7 @@ def _generate_data_2d_classif(n_samples, rng, label='binary'):
     label : tuple, default='binary'
         If 'binary, return binary class
         If 'multiclass', return multiclass
+        if 'regression', return regression's y-values
     """
     n2 = n_samples
     n1 = n2 * 4
@@ -75,14 +77,20 @@ def _generate_data_2d_classif(n_samples, rng, label='binary'):
     # make labels
     if label == 'binary':
         y = np.concatenate((np.zeros(n1), np.ones(4 * n2)), 0)
+        y = y.astype(int)
     elif label == 'multiclass':
         y = np.zeros(n1)
         for i in range(4):
             y = np.concatenate((y, (i + 1) * np.ones(n2)), 0)
+            y = y.astype(int)
+    elif label == 'regression':
+        # create label y with gaussian distribution
+        normal_rv = multivariate_normal(mu1, Sigma1)
+        y = normal_rv.pdf(x)
     else:
         raise ValueError(f"Invalid label value: {label}. The label should either be "
-                         "'binary' or 'multiclass'")
-    return x, y.astype(int)
+                         "'binary', 'multiclass' or 'regression'")
+    return x, y
 
 
 def _generate_data_2d_classif_subspace(n_samples, rng, label='binary'):
@@ -98,6 +106,7 @@ def _generate_data_2d_classif_subspace(n_samples, rng, label='binary'):
     label : tuple, default='binary'
         If 'binary, return binary class
         If 'multiclass', return multiclass
+        if 'regression', return regression's y-values
     """
     n2 = n_samples
     n1 = n2 * 2
@@ -124,15 +133,21 @@ def _generate_data_2d_classif_subspace(n_samples, rng, label='binary'):
     # make labels
     if label == 'binary':
         y = np.concatenate((np.zeros(n1), np.ones(2 * n2)), 0)
+        y = y.astype(int)
     elif label == 'multiclass':
         y = np.zeros(n1)
         for i in range(4):
             y = np.concatenate((y, (i + 1) * np.ones(n2)), 0)
+            y = y.astype(int)
+    elif label == 'regression':
+        # create label y with gaussian distribution
+        normal_rv = multivariate_normal(mu1, Sigma1)
+        y = normal_rv.pdf(x)
     else:
         raise ValueError(f"Invalid label value: {label}. The label should either be "
-                         "'binary' or 'multiclass'")
+                         "'binary', 'multiclass' or 'regression'")
 
-    return x, y.astype(int)
+    return x, y
 
 
 def _generate_data_from_moons(n_samples, index, rng):

diff --git a/skada/datasets/tests/test_samples_generator.py b/skada/datasets/tests/test_samples_generator.py
@@ -150,6 +150,35 @@ def test_make_multi_source_shifted_datasets(shift):
     assert np.unique(y_target).shape[0] <= 5, "Unexpected number of cluster"
 
 
+@pytest.mark.parametrize(
+    "shift",
+    ["covariate_shift", "target_shift", "concept_drift"],
+)
+def test_make_shifted_datasets_regression(shift):
+    """Check shapes and value range of regression-labelled shifted datasets.
+
+    The y-values of both domains are expected to lie in [0, 1].
+    """
+    X, y, sample_domain = make_shifted_datasets(
+        n_samples_source=10,
+        n_samples_target=10,
+        shift=shift,
+        noise=None,
+        label="regression",
+    )
+    X_source, y_source, X_target, y_target = check_X_y_domain(
+        X, y=y, sample_domain=sample_domain, return_joint=False
+    )
+    domains = {"source": (X_source, y_source), "target": (X_target, y_target)}
+    for name, (X_dom, y_dom) in domains.items():
+        assert X_dom.shape == (10 * 8, 2), f"X {name} shape mismatch"
+        assert y_dom.shape == (10 * 8,), f"y {name} shape mismatch"
+        assert np.all((y_dom >= 0) & (y_dom <= 1)), (
+            f"Wrong y-values in {name} domain, "
+            "probabilities should be between 0 and 1"
+        )
+
+
 def test_make_subspace_datasets():
     X, y, sample_domain = make_shifted_datasets(
         n_samples_source=10,