[MRG] Add DomainAndLabelStratifiedSubsampleTransformer + Fix `DomainStratifiedSubsampleTransformer` (#268)

* Add DomainAndLabelStratifiedSubsampleTransformer + fix DomainStratifiedSubsampleTransformer

* Add test to check stratification proportions

* Rename subsamplers

---------

Co-authored-by: Antoine Collas <[email protected]>
YanisLalou and antoinecollas authored Oct 25, 2024
1 parent 65f1659 commit 9a976ce
Showing 3 changed files with 141 additions and 16 deletions.
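For orientation, here is a minimal usage sketch of the renamed subsamplers in a skada pipeline. It mirrors the updated tests below; the toy arrays X, y and sample_domain are hypothetical stand-ins for the da_dataset fixture (positive domain ids mark sources, negative ids mark targets, and target labels are masked with -1, following the convention used in the tests).

import numpy as np
from sklearn.preprocessing import StandardScaler

from skada import CORAL, make_da_pipeline
from skada.transformers import StratifiedDomainSubsampler

# Hypothetical toy data: 40 source samples (domain +1) and 40 target samples
# (domain -2); target labels are masked with -1 as in the packed train sets.
rng = np.random.default_rng(0)
X = rng.normal(size=(80, 2))
y = np.concatenate([rng.integers(0, 2, size=40), np.full(40, -1)])
sample_domain = np.concatenate([np.full(40, 1), np.full(40, -2)])

# Keep 20 samples during fit, stratified jointly by (domain, label);
# at predict time the subsampler passes the data through unchanged.
subsampler = StratifiedDomainSubsampler(train_size=20, random_state=42)
pipeline = make_da_pipeline(StandardScaler(), subsampler, CORAL())
pipeline.fit(X, y, sample_domain=sample_domain)

# Predicting on the target portion works exactly as without the subsampler.
target_mask = sample_domain < 0
y_pred = pipeline.predict(X[target_mask], sample_domain=sample_domain[target_mask])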
2 changes: 1 addition & 1 deletion skada/__init__.py
@@ -61,7 +61,7 @@
    OTLabelProp,
    JCPOTLabelPropAdapter,
    JCPOTLabelProp)
from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer
from .transformers import Subsampler, DomainSubsampler, StratifiedDomainSubsampler
from ._self_labeling import DASVMClassifier
from ._pipeline import make_da_pipeline
from .utils import source_target_split, per_domain_split
78 changes: 67 additions & 11 deletions skada/tests/test_transformers.py
@@ -2,24 +2,27 @@
#
# License: BSD 3-Clause

from collections import Counter

import numpy as np
from sklearn.preprocessing import StandardScaler

from skada import CORAL, make_da_pipeline
from skada.transformers import (
    DomainStratifiedSubsampleTransformer,
    SubsampleTransformer,
    DomainSubsampler,
    StratifiedDomainSubsampler,
    Subsampler,
)


def test_SubsampleTransformer(da_dataset):
def test_Subsampler(da_dataset):
    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
    sample_weight = np.ones_like(y)

    train_size = 10

    # test size of output on fit_transform
    transformer = SubsampleTransformer(train_size=train_size, random_state=42)
    transformer = Subsampler(train_size=train_size, random_state=42)

    X_subsampled, y_subsampled, params = transformer.fit_transform(
        X, y, sample_domain=sample_domain, sample_weight=sample_weight
@@ -40,26 +43,26 @@ def test_SubsampleTransformer(da_dataset):
    assert X_target_subsampled.shape[0] == X_target.shape[0]

    # now with a pipeline with end task
    transformer = SubsampleTransformer(train_size=train_size)
    transformer = Subsampler(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

    pipeline.fit(X, y, sample_domain=sample_domain)

    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
    assert ypred.shape[0] == X_target.shape[0]
    assert ypred.shape[0] == X_target.shape[0]

    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
    assert ypred.shape[0] == X.shape[0]


def test_DomainStratifiedSubsampleTransformer(da_dataset):
def test_DomainSubsampler(da_dataset):
    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
    sample_weight = np.ones_like(y)

    train_size = 10

    # test size of output on fit_transform
    transformer = DomainStratifiedSubsampleTransformer(
        train_size=train_size, random_state=42
    )
    transformer = DomainSubsampler(train_size=train_size, random_state=42)

    X_subsampled, y_subsampled, params = transformer.fit_transform(
        X, y, sample_domain=sample_domain, sample_weight=sample_weight
@@ -82,11 +85,64 @@ def test_DomainStratifiedSubsampleTransformer(da_dataset):
    assert X_target_subsampled.shape[0] == X_target.shape[0]

    # now with a pipeline with end task
    transformer = DomainStratifiedSubsampleTransformer(train_size=train_size)
    transformer = DomainSubsampler(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

    pipeline.fit(X, y, sample_domain=sample_domain)

    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
    assert ypred.shape[0] == X_target.shape[0]

    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
    assert ypred.shape[0] == X.shape[0]


def test_StratifiedDomainSubsampler(da_dataset):
    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
    sample_weight = np.ones_like(y)

    train_size = 10

    # test size of output on fit_transform
    transformer = StratifiedDomainSubsampler(train_size=train_size, random_state=42)

    X_subsampled, y_subsampled, params = transformer.fit_transform(
        X, y, sample_domain=sample_domain, sample_weight=sample_weight
    )

    assert X_subsampled.shape == (train_size, X.shape[1])
    assert y_subsampled.shape[0] == train_size
    assert params["sample_domain"].shape[0] == train_size
    assert params["sample_weight"].shape[0] == train_size

    # Check stratification proportions
    original_freq = Counter(zip(sample_domain, y))
    subsampled_freq = Counter(zip(params["sample_domain"], y_subsampled))

    for key in original_freq:
        original_ratio = original_freq[key] / len(y)
        subsampled_ratio = subsampled_freq[key] / train_size
        assert np.isclose(
            original_ratio, subsampled_ratio, atol=0.1
        ), f"Stratification not preserved for {key}"

    # test size of output on transform
    X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])

    X_target_subsampled = transformer.transform(
        X_target, y_target, sample_domain=sample_domain_target
    )

    assert X_target_subsampled.shape[0] == X_target.shape[0]

    # now with a pipeline with end task
    transformer = StratifiedDomainSubsampler(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

    pipeline.fit(X, y, sample_domain=sample_domain)

    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
    assert ypred.shape[0] == X_target.shape[0]

    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
    assert ypred.shape[0] == X.shape[0]
77 changes: 73 additions & 4 deletions skada/transformers.py
@@ -6,10 +6,11 @@
from sklearn.utils import check_random_state

from .base import BaseAdapter
from .model_selection import StratifiedDomainShuffleSplit
from .utils import check_X_y_domain


class SubsampleTransformer(BaseAdapter):
class Subsampler(BaseAdapter):
"""Transformer that subsamples the data.
This transformer is useful to speed up computations when the data is too
@@ -67,12 +68,14 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        )
        return X_subsampled, y_subsampled, params

    def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
    def transform(
        self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
    ):
        """Transform the data."""
        return X


class DomainStratifiedSubsampleTransformer(BaseAdapter):
class DomainSubsampler(BaseAdapter):
    """Transformer that subsamples the data in a domain stratified way.

    This transformer is useful to speed up computations when the data is too
@@ -129,6 +132,72 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        )
        return X_subsampled, y_subsampled, params

    def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
    def transform(
        self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
    ):
        """Transform the data."""
        return X


class StratifiedDomainSubsampler(BaseAdapter):
    """Transformer that subsamples the data in a domain and label stratified way.

    This transformer is useful to speed up computations when the data is too
    large. It randomly selects a subset of the data to work with during training
    but does not change the data during testing.

    .. note::
        This transformer should not be used as the last step of a pipeline
        because it returns non standard output.

    Parameters
    ----------
    train_size : int, float
        Number of samples to keep (keep all if data smaller) if integer, or
        proportion of train sample if float 0 <= train_size <= 1.
    random_state : int, RandomState instance or None, default=None
        Controls the random resampling of the data.
    """

    def __init__(self, train_size, random_state=None):
        self.train_size = train_size
        self.random_state = random_state

    def _pack_params(self, idx, **params):
        return {
            k: (v[idx] if idx is not None else v)
            for k, v in params.items()
            if v is not None
        }

    def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        """Fit and transform the data."""
        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)

        self.rng_ = check_random_state(self.random_state)

        if self.train_size >= X.shape[0]:
            return (
                X,
                y,
                self._pack_params(
                    None, sample_domain=sample_domain, sample_weight=sample_weight
                ),
            )

        splitter = StratifiedDomainShuffleSplit(
            n_splits=1, train_size=self.train_size, random_state=self.rng_
        )

        train_idx, _ = next(splitter.split(X, y, sample_domain))
        X_subsampled = X[train_idx]
        y_subsampled = y[train_idx] if y is not None else None
        params = self._pack_params(
            train_idx, sample_domain=sample_domain, sample_weight=sample_weight
        )
        return X_subsampled, y_subsampled, params

    def transform(
        self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
    ):
        """Transform the data."""
        return X
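For illustration, the following standalone sketch (hypothetical toy arrays, same label-masking convention as the tests) shows what the new StratifiedDomainSubsampler does: fit_transform returns a subsample whose (domain, label) frequencies roughly match the full data, while transform is a pass-through at prediction time.

from collections import Counter

import numpy as np

from skada.transformers import StratifiedDomainSubsampler

# Hypothetical imbalanced toy data: 90 source samples (domain +1, labels 0/1
# in a 2:1 ratio) and 30 target samples (domain -2, labels masked as -1).
rng = np.random.default_rng(0)
X = rng.normal(size=(120, 3))
y = np.concatenate([np.zeros(60, dtype=int), np.ones(30, dtype=int), np.full(30, -1)])
sample_domain = np.concatenate([np.full(90, 1), np.full(30, -2)])

subsampler = StratifiedDomainSubsampler(train_size=30, random_state=0)
X_sub, y_sub, params = subsampler.fit_transform(X, y, sample_domain=sample_domain)

# (domain, label) frequencies of the subsample stay close to the full-data
# frequencies (here 60/30/30 -> roughly 15/8/7), which is what the new test
# asserts via np.isclose(..., atol=0.1).
print(Counter(zip(sample_domain, y)))
print(Counter(zip(params["sample_domain"], y_sub)))

# At prediction time the transformer does not subsample: transform returns X unchanged.
assert subsampler.transform(X).shape == X.shape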
