From 57b057289ee5c862422df7d85e4fe72047567b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Severin=20Paul=20H=C3=B6fer?= <84280965+zzril@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:58:36 +0200 Subject: [PATCH] feat: Add `StandardScaler` transformer (#316) Closes #142. ### Summary of Changes * Added new class `StandardScaler` in `tabular/transformation`. * Added tests. * Added test helper function `assert_that_tables_are_close`. Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> --------- Co-authored-by: Simon Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com> Co-authored-by: Lars Reimann --- .../data/tabular/transformation/__init__.py | 2 + .../transformation/_standard_scaler.py | 180 ++++++++++++++ tests/helpers/__init__.py | 3 +- tests/helpers/_assertions.py | 24 ++ .../transformation/test_standard_scaler.py | 231 ++++++++++++++++++ 5 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 src/safeds/data/tabular/transformation/_standard_scaler.py create mode 100644 tests/helpers/_assertions.py create mode 100644 tests/safeds/data/tabular/transformation/test_standard_scaler.py diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index e8af96cd7..edf45242b 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -4,6 +4,7 @@ from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder from ._range_scaler import RangeScaler +from ._standard_scaler import StandardScaler from ._table_transformer import InvertibleTableTransformer, TableTransformer __all__ = [ @@ -13,4 +14,5 @@ "InvertibleTableTransformer", "TableTransformer", "RangeScaler", + "StandardScaler", ] diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py 
new file mode 100644 index 000000000..8e2466d5f --- /dev/null +++ b/src/safeds/data/tabular/transformation/_standard_scaler.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +from sklearn.preprocessing import StandardScaler as sk_StandardScaler + +from safeds.data.tabular.containers import Table +from safeds.data.tabular.transformation._table_transformer import InvertibleTableTransformer +from safeds.exceptions import TransformerNotFittedError, UnknownColumnNameError + + +class StandardScaler(InvertibleTableTransformer): + """The StandardScaler transforms column values by removing the mean and scaling to unit variance.""" + + def __init__(self) -> None: + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_StandardScaler | None = None + + def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table : Table + The table used to fit the transformer. + column_names : Optional[list[str]] + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer : TableTransformer + The fitted transformer. + """ + if column_names is None: + column_names = table.column_names + else: + missing_columns = set(column_names) - set(table.column_names) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) + + wrapped_transformer = sk_StandardScaler() + wrapped_transformer.fit(table._data[column_names]) + + result = StandardScaler() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: Table) -> Table: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table : Table + The table to which the learned transformation is applied. 
+ + Returns + ------- + transformed_table : Table + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = set(self._column_names) - set(table.column_names) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) + + data = table._data.copy() + data.columns = table.column_names + data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + def inverse_transform(self, transformed_table: Table) -> Table: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table : Table + The table to be transformed back to the original version. + + Returns + ------- + table : Table + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + data = transformed_table._data.copy() + data.columns = transformed_table.column_names + data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + def is_fitted(self) -> bool: + """ + Check if the transformer is fitted. + + Returns + ------- + is_fitted : bool + Whether the transformer is fitted. + """ + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the StandardScaler. 
+ + Returns + ------- + added_columns : list[str] + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted(): + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the StandardScaler. + + Returns + ------- + changed_columns : list[str] + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the StandardScaler. + + Returns + ------- + removed_columns : list[str] + A list of names of the removed columns, ordered as they appear in the table the StandardScaler was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if not self.is_fitted(): + raise TransformerNotFittedError + return [] diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 019d5cbe7..f7b114a0b 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -1,3 +1,4 @@ +from ._assertions import assert_that_tables_are_close from ._resources import resolve_resource_path -__all__ = ["resolve_resource_path"] +__all__ = ["assert_that_tables_are_close", "resolve_resource_path"] diff --git a/tests/helpers/_assertions.py b/tests/helpers/_assertions.py new file mode 100644 index 000000000..ecd93c1b4 --- /dev/null +++ b/tests/helpers/_assertions.py @@ -0,0 +1,24 @@ +import pytest +from safeds.data.tabular.containers import Table + + +def assert_that_tables_are_close(table1: Table, table2: Table) -> None: + """ + Assert that two tables have equal schemas, only numeric columns, and approximately equal values. + + Parameters + ---------- + table1: Table + The first table. + table2: Table + The table to compare the first table to. + """ + assert table1.schema == table2.schema + for column_name in table1.column_names: + assert table1.get_column(column_name).type == table2.get_column(column_name).type + assert table1.get_column(column_name).type.is_numeric() + assert table2.get_column(column_name).type.is_numeric() + for i in range(table1.number_of_rows): + entry_1 = table1.get_column(column_name).get_value(i) + entry_2 = table2.get_column(column_name).get_value(i) + assert entry_1 == pytest.approx(entry_2) diff --git a/tests/safeds/data/tabular/transformation/test_standard_scaler.py b/tests/safeds/data/tabular/transformation/test_standard_scaler.py new file mode 100644 index 000000000..58e4d773f --- /dev/null +++ b/tests/safeds/data/tabular/transformation/test_standard_scaler.py @@ -0,0 +1,231 @@ +import pytest +from safeds.data.tabular.containers import Table +from safeds.data.tabular.transformation import StandardScaler +from safeds.exceptions import TransformerNotFittedError, UnknownColumnNameError + +from tests.helpers import 
assert_that_tables_are_close + + +class TestFit: + def test_should_raise_if_column_not_found(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + with pytest.raises(UnknownColumnNameError): + StandardScaler().fit(table, ["col2"]) + + def test_should_not_change_original_transformer(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = StandardScaler() + transformer.fit(table, None) + + assert transformer._wrapped_transformer is None + assert transformer._column_names is None + + +class TestTransform: + def test_should_raise_if_column_not_found(self) -> None: + table_to_fit = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = StandardScaler().fit(table_to_fit, None) + + table_to_transform = Table( + { + "col2": ["a", "b", "c"], + }, + ) + + with pytest.raises(UnknownColumnNameError): + transformer.transform(table_to_transform) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = StandardScaler() + + with pytest.raises(TransformerNotFittedError): + transformer.transform(table) + + +class TestIsFitted: + def test_should_return_false_before_fitting(self) -> None: + transformer = StandardScaler() + assert not transformer.is_fitted() + + def test_should_return_true_after_fitting(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = StandardScaler() + fitted_transformer = transformer.fit(table, None) + assert fitted_transformer.is_fitted() + + +class TestFitAndTransformOnMultipleTables: + @pytest.mark.parametrize( + ("fit_and_transform_table", "only_transform_table", "column_names", "expected_1", "expected_2"), + [ + ( + Table( + { + "col1": [0.0, 0.0, 1.0, 1.0], + "col2": [0.0, 0.0, 1.0, 1.0], + }, + ), + Table( + { + "col1": [2], + "col2": [2], + }, + ), + None, + Table( + { + "col1": [-1.0, -1.0, 1.0, 1.0], + "col2": [-1.0, -1.0, 1.0, 1.0], + }, + ), + Table( + { + "col1": [3.0], + 
"col2": [3.0], + }, + ), + ), + ], + ) + def test_should_return_transformed_tables( + self, + fit_and_transform_table: Table, + only_transform_table: Table, + column_names: list[str] | None, + expected_1: Table, + expected_2: Table, + ) -> None: + s = StandardScaler().fit(fit_and_transform_table, column_names) + assert s.fit_and_transform(fit_and_transform_table, column_names) == expected_1 + assert s.transform(only_transform_table) == expected_2 + + +class TestFitAndTransform: + def test_should_not_change_original_table(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + StandardScaler().fit_and_transform(table) + + expected = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + assert table == expected + + def test_get_names_of_added_columns(self) -> None: + transformer = StandardScaler() + with pytest.raises(TransformerNotFittedError): + transformer.get_names_of_added_columns() + + table = Table( + { + "a": [0.0], + }, + ) + transformer = transformer.fit(table, None) + assert transformer.get_names_of_added_columns() == [] + + def test_get_names_of_changed_columns(self) -> None: + transformer = StandardScaler() + with pytest.raises(TransformerNotFittedError): + transformer.get_names_of_changed_columns() + table = Table( + { + "a": [0.0], + }, + ) + transformer = transformer.fit(table, None) + assert transformer.get_names_of_changed_columns() == ["a"] + + def test_get_names_of_removed_columns(self) -> None: + transformer = StandardScaler() + with pytest.raises(TransformerNotFittedError): + transformer.get_names_of_removed_columns() + + table = Table( + { + "a": [0.0], + }, + ) + transformer = transformer.fit(table, None) + assert transformer.get_names_of_removed_columns() == [] + + +class TestInverseTransform: + @pytest.mark.parametrize( + "table", + [ + Table( + { + "col1": [0.0, 5.0, 5.0, 10.0], + }, + ), + ], + ) + def test_should_return_original_table(self, table: Table) -> None: + transformer = StandardScaler().fit(table, None) + + 
assert transformer.inverse_transform(transformer.transform(table)) == table + + def test_should_not_change_transformed_table(self) -> None: + table = Table( + { + "col1": [0.0, 0.5, 1.0], + }, + ) + + transformer = StandardScaler().fit(table, None) + transformed_table = transformer.transform(table) + transformed_table = transformer.inverse_transform(transformed_table) + + expected = Table( + { + "col1": [0.0, 0.5, 1.0], + }, + ) + + assert_that_tables_are_close(transformed_table, expected) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 5.0, 10.0], + }, + ) + + transformer = StandardScaler() + + with pytest.raises(TransformerNotFittedError): + transformer.inverse_transform(table)