From 9a976ce8f64b132ed0792e0784cf4aa7058bcf78 Mon Sep 17 00:00:00 2001 From: Yanis Lalou <57602694+YanisLalou@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:39:32 +0200 Subject: [PATCH] [MRG] Add `DomainAndLabelStratifiedSubsampleTransformer` + Fix `DomainStratifiedSubsampleTransformer` (#268) * Add DomainAndLabelStratifiedSubsampleTransformer + fix DomainStratifiedSubsampleTransformer * Add test to check stratification proportions * rename subsamplers --------- Co-authored-by: Antoine Collas --- skada/__init__.py | 2 +- skada/tests/test_transformers.py | 78 +++++++++++++++++++++++++++----- skada/transformers.py | 77 +++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 16 deletions(-) diff --git a/skada/__init__.py b/skada/__init__.py index 7d7b73d8..927a8b09 100644 --- a/skada/__init__.py +++ b/skada/__init__.py @@ -61,7 +61,7 @@ OTLabelProp, JCPOTLabelPropAdapter, JCPOTLabelProp) -from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer +from .transformers import Subsampler, DomainSubsampler, StratifiedDomainSubsampler from ._self_labeling import DASVMClassifier from ._pipeline import make_da_pipeline from .utils import source_target_split, per_domain_split diff --git a/skada/tests/test_transformers.py b/skada/tests/test_transformers.py index 7f2892dd..bb7f3ca5 100644 --- a/skada/tests/test_transformers.py +++ b/skada/tests/test_transformers.py @@ -2,24 +2,27 @@ # # License: BSD 3-Clause +from collections import Counter + import numpy as np from sklearn.preprocessing import StandardScaler from skada import CORAL, make_da_pipeline from skada.transformers import ( - DomainStratifiedSubsampleTransformer, - SubsampleTransformer, + DomainSubsampler, + StratifiedDomainSubsampler, + Subsampler, ) -def test_SubsampleTransformer(da_dataset): +def test_Subsampler(da_dataset): X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"]) sample_weight = np.ones_like(y) train_size = 10 # test size of output on fit_transform - transformer = SubsampleTransformer(train_size=train_size, random_state=42) + transformer = Subsampler(train_size=train_size, random_state=42) X_subsampled, y_subsampled, params = transformer.fit_transform( X, y, sample_domain=sample_domain, sample_weight=sample_weight @@ -40,26 +43,26 @@ def test_SubsampleTransformer(da_dataset): assert X_target_subsampled.shape[0] == X_target.shape[0] # now with a pipeline with end task - transformer = SubsampleTransformer(train_size=train_size) + transformer = Subsampler(train_size=train_size) pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL()) pipeline.fit(X, y, sample_domain=sample_domain) ypred = pipeline.predict(X_target, sample_domain=sample_domain_target) assert ypred.shape[0] == X_target.shape[0] - assert ypred.shape[0] == X_target.shape[0] + + ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True) + assert ypred.shape[0] == X.shape[0] -def test_DomainStratifiedSubsampleTransformer(da_dataset): +def test_DomainSubsampler(da_dataset): X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"]) sample_weight = np.ones_like(y) train_size = 10 # test size of output on fit_transform - transformer = DomainStratifiedSubsampleTransformer( - train_size=train_size, random_state=42 - ) + transformer = DomainSubsampler(train_size=train_size, random_state=42) X_subsampled, y_subsampled, params = transformer.fit_transform( X, y, sample_domain=sample_domain, sample_weight=sample_weight @@ -82,11 +85,64 @@ def test_DomainStratifiedSubsampleTransformer(da_dataset): assert X_target_subsampled.shape[0] == X_target.shape[0] # now with a pipeline with end task - transformer = DomainStratifiedSubsampleTransformer(train_size=train_size) + transformer = DomainSubsampler(train_size=train_size) pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL()) pipeline.fit(X, y, sample_domain=sample_domain) ypred = pipeline.predict(X_target, sample_domain=sample_domain_target) assert ypred.shape[0] == X_target.shape[0] + + ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True) + assert ypred.shape[0] == X.shape[0] + + +def test_StratifiedDomainSubsampler(da_dataset): + X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"]) + sample_weight = np.ones_like(y) + + train_size = 10 + + # test size of output on fit_transform + transformer = StratifiedDomainSubsampler(train_size=train_size, random_state=42) + + X_subsampled, y_subsampled, params = transformer.fit_transform( + X, y, sample_domain=sample_domain, sample_weight=sample_weight + ) + + assert X_subsampled.shape == (train_size, X.shape[1]) + assert y_subsampled.shape[0] == train_size + assert params["sample_domain"].shape[0] == train_size + assert params["sample_weight"].shape[0] == train_size + + # Check stratification proportions + original_freq = Counter(zip(sample_domain, y)) + subsampled_freq = Counter(zip(params["sample_domain"], y_subsampled)) + + for key in original_freq: + original_ratio = original_freq[key] / len(y) + subsampled_ratio = subsampled_freq[key] / train_size + assert np.isclose( + original_ratio, subsampled_ratio, atol=0.1 + ), f"Stratification not preserved for {key}" + + # test size of output on transform + X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"]) + + X_target_subsampled = transformer.transform( + X_target, y_target, sample_domain=sample_domain_target + ) + + assert X_target_subsampled.shape[0] == X_target.shape[0] + + # now with a pipeline with end task + transformer = StratifiedDomainSubsampler(train_size=train_size) + pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL()) + + pipeline.fit(X, y, sample_domain=sample_domain) + + ypred = pipeline.predict(X_target, sample_domain=sample_domain_target) assert ypred.shape[0] == X_target.shape[0] + + ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True) + assert ypred.shape[0] == X.shape[0] diff --git a/skada/transformers.py b/skada/transformers.py index d5501d93..f7be92fd 100644 --- a/skada/transformers.py +++ b/skada/transformers.py @@ -6,10 +6,11 @@ from sklearn.utils import check_random_state from .base import BaseAdapter +from .model_selection import StratifiedDomainShuffleSplit from .utils import check_X_y_domain -class SubsampleTransformer(BaseAdapter): +class Subsampler(BaseAdapter): """Transformer that subsamples the data. This transformer is useful to speed up computations when the data is too @@ -67,12 +68,14 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None): ) return X_subsampled, y_subsampled, params - def transform(self, X, y=None, *, sample_domain=None, sample_weight=None): + def transform( + self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None + ): """Transform the data.""" return X -class DomainStratifiedSubsampleTransformer(BaseAdapter): +class DomainSubsampler(BaseAdapter): """Transformer that subsamples the data in a domain stratified way. This transformer is useful to speed up computations when the data is too @@ -129,6 +132,72 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None): ) return X_subsampled, y_subsampled, params - def transform(self, X, y=None, *, sample_domain=None, sample_weight=None): + def transform( + self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None + ): + """Transform the data.""" + return X + + +class StratifiedDomainSubsampler(BaseAdapter): + """Transformer that subsamples the data in a domain and label stratified way. + This transformer is useful to speed up computations when the data is too + large. It randomly selects a subset of the data to work with during training + but does not change the data during testing. + + .. note:: + This transformer should not be used as the last step of a pipeline + because it returns non standard output. + + Parameters + ---------- + train_size : int, float + Number of samples to keep (keep all if data smaller) if integer, or + proportion of train sample if float 0<= train_size <= 1. + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the data. + """ + + def __init__(self, train_size, random_state=None): + self.train_size = train_size + self.random_state = random_state + + def _pack_params(self, idx, **params): + return { + k: (v[idx] if idx is not None else v) + for k, v in params.items() + if v is not None + } + + def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None): + """Fit and transform the data.""" + X, y, sample_domain = check_X_y_domain(X, y, sample_domain) + + self.rng_ = check_random_state(self.random_state) + + if self.train_size >= X.shape[0]: + return ( + X, + y, + self._pack_params( + None, sample_domain=sample_domain, sample_weight=sample_weight + ), + ) + + splitter = StratifiedDomainShuffleSplit( + n_splits=1, train_size=self.train_size, random_state=self.rng_ + ) + + train_idx, _ = next(splitter.split(X, y, sample_domain)) + X_subsampled = X[train_idx] + y_subsampled = y[train_idx] if y is not None else None + params = self._pack_params( + train_idx, sample_domain=sample_domain, sample_weight=sample_weight + ) + return X_subsampled, y_subsampled, params + + def transform( + self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None + ): """Transform the data.""" return X