Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Regression label for 2d classification data generation #69

Merged
merged 20 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions examples/datasets/plot_shifted_dataset_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""
Plot dataset source domain and shifted target domain
====================================================

This illustrates the :func:`~skada.datasets.make_shifted_datasets`
dataset generator with regression labels. Each method consists of
generating source data and shifted target data. We illustrate here:
covariate shift, target shift, and concept drift.
See detailed description of each shift in [1]_.

.. [1] Moreno-Torres, J. G., Raeder, T., Alaiz-Rodriguez,
R., Chawla, N. V., and Herrera, F. (2012).
A unifying view on dataset shift in classification.
Pattern recognition, 45(1):521-530.
"""
# %% Imports

import matplotlib.pyplot as plt

from skada.datasets import make_shifted_datasets
from skada import source_target_split


# %% Helper function



def _scatter_domain(fig, ax, X, y, title):
    """Scatter one domain on ``ax``, coloured by its regression target."""
    points = ax.scatter(
        X[:, 0],
        X[:, 1],
        c=y * 10,
        vmax=1,  # upper bound of the colour scale
        alpha=0.5,
    )
    # Name the axes explicitly so the colorbar is always attached to the
    # panel it describes instead of relying on implicit axes resolution.
    cb = fig.colorbar(points, ax=ax)
    cb.set_label("y-value*10")
    ax.set_title(title)
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")


def plot_shifted_dataset(shift, random_state=42):
    """Plot source and shifted target data for a given type of shift.

    The possible shifts are 'covariate_shift', 'target_shift' or
    'concept_drift'.

    We use here the same random seed for multiple calls to
    ensure same distributions.

    Parameters
    ----------
    shift : str
        Name of the shift, forwarded to ``make_shifted_datasets`` and
        used (with underscores replaced by spaces) as the figure title.
    random_state : int, default=42
        Seed forwarded to ``make_shifted_datasets``.
    """
    X, y, sample_domain = make_shifted_datasets(
        n_samples_source=20,
        n_samples_target=20,
        shift=shift,
        noise=0.3,
        label="regression",
        random_state=random_state,
    )
    X_source, X_target, y_source, y_target = source_target_split(
        X, y, sample_domain=sample_domain
    )

    fig, (ax1, ax2) = plt.subplots(
        1, 2, sharex="row", sharey="row", figsize=(8, 4)
    )
    fig.suptitle(shift.replace("_", " ").title(), fontsize=14)
    plt.subplots_adjust(bottom=0.15)

    _scatter_domain(fig, ax1, X_source, y_source, "Source data")
    _scatter_domain(fig, ax2, X_target, y_target, "Target data")

    plt.show()


# %% Visualize shifted datasets

# Draw one source/target figure per supported kind of shift.
for shift in ("covariate_shift", "target_shift", "concept_drift"):
    plot_shifted_dataset(shift)

23 changes: 19 additions & 4 deletions skada/datasets/_samples_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from scipy import signal
from scipy.fftpack import rfft, irfft
from scipy.stats import multivariate_normal

from sklearn.datasets import make_blobs

Expand Down Expand Up @@ -37,6 +38,7 @@ def _generate_data_2d_classif(n_samples, rng, label='binary'):
label : str, default='binary'
If 'binary', return binary class
If 'multiclass', return multiclass
If 'regression', return regression's y-values
"""
n2 = n_samples
n1 = n2 * 4
Expand Down Expand Up @@ -75,14 +77,20 @@ def _generate_data_2d_classif(n_samples, rng, label='binary'):
# make labels
if label == 'binary':
y = np.concatenate((np.zeros(n1), np.ones(4 * n2)), 0)
y = y.astype(int)
elif label == 'multiclass':
y = np.zeros(n1)
for i in range(4):
y = np.concatenate((y, (i + 1) * np.ones(n2)), 0)
y = y.astype(int)
elif label == 'regression':
# create label y with gaussian distribution
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it could be nice to have the possibility to modify the mu and the Sigma1 as we want. So just put it in the parameters of the function with default values.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And change the name to mu_regression, sigma_regression

normal_rv = multivariate_normal(mu1, Sigma1)
y = normal_rv.pdf(x)
else:
raise ValueError(f"Invalid label value: {label}. The label should either be "
"'binary' or 'multiclass'")
return x, y.astype(int)
"'binary', 'multiclass' or 'regression'")
return x, y


def _generate_data_2d_classif_subspace(n_samples, rng, label='binary'):
Expand All @@ -98,6 +106,7 @@ def _generate_data_2d_classif_subspace(n_samples, rng, label='binary'):
label : str, default='binary'
If 'binary', return binary class
If 'multiclass', return multiclass
If 'regression', return regression's y-values
"""
n2 = n_samples
n1 = n2 * 2
Expand All @@ -124,15 +133,21 @@ def _generate_data_2d_classif_subspace(n_samples, rng, label='binary'):
# make labels
if label == 'binary':
y = np.concatenate((np.zeros(n1), np.ones(2 * n2)), 0)
y = y.astype(int)
elif label == 'multiclass':
y = np.zeros(n1)
for i in range(4):
y = np.concatenate((y, (i + 1) * np.ones(n2)), 0)
y = y.astype(int)
elif label == 'regression':
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

# create label y with gaussian distribution
normal_rv = multivariate_normal(mu1, Sigma1)
y = normal_rv.pdf(x)
else:
raise ValueError(f"Invalid label value: {label}. The label should either be "
"'binary' or 'multiclass'")
"'binary', 'multiclass' or 'regression'")

return x, y.astype(int)
return x, y


def _generate_data_from_moons(n_samples, index, rng):
Expand Down
29 changes: 29 additions & 0 deletions skada/datasets/tests/test_samples_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,35 @@ def test_make_multi_source_shifted_datasets(shift):
assert np.unique(y_target).shape[0] <= 5, "Unexpected number of cluster"


@pytest.mark.parametrize(
    "shift",
    ["covariate_shift", "target_shift", "concept_drift"],
)
def test_make_shifted_datasets_regression(shift):
    """Check shapes and value range of regression-labelled shifted data.

    For every shift type, both domains are expected to hold ``8 * n``
    two-dimensional samples, with y-values lying in ``[0, 1]``.
    """
    n_per_call = 10
    expected_X_shape = (n_per_call * 8, 2)
    expected_y_shape = (n_per_call * 8,)

    X, y, sample_domain = make_shifted_datasets(
        n_samples_source=n_per_call,
        n_samples_target=n_per_call,
        shift=shift,
        noise=None,
        label="regression",
    )
    X_source, y_source, X_target, y_target = check_X_y_domain(
        X,
        y=y,
        sample_domain=sample_domain,
        return_joint=False,
    )

    # Source domain
    assert X_source.shape == expected_X_shape, "X source shape mismatch"
    assert y_source.shape == expected_y_shape, "y source shape mismatch"
    assert max(y_source) <= 1 and min(y_source) >= 0, (
        "Wrong y-values in source domain, "
        "probabilities should be between 0 and 1"
    )

    # Target domain
    assert X_target.shape == expected_X_shape, "X target shape mismatch"
    assert y_target.shape == expected_y_shape, "y target shape mismatch"
    assert max(y_target) <= 1 and min(y_target) >= 0, (
        "Wrong y-values in target domain, "
        "probabilities should be between 0 and 1"
    )




def test_make_subspace_datasets():
X, y, sample_domain = make_shifted_datasets(
n_samples_source=10,
Expand Down
Loading