Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: string operations on cells #791

Merged
merged 49 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
49b4401
feat: namespace for string operations on cells
lars-reimann May 18, 2024
1049033
feat: `starts_with` and `ends_with`
lars-reimann May 18, 2024
773789c
feat: `length`
lars-reimann May 18, 2024
7242f06
feat: `to_lowercase` and `to_uppercase`
lars-reimann May 18, 2024
1423d14
feat: `trim`, `trim_start`, `trim_end`
lars-reimann May 18, 2024
9591561
docs: minor improvements
lars-reimann May 18, 2024
6ed16c0
refactor: make metrics classes abstract
lars-reimann May 19, 2024
6b7fac6
docs: normalize result names of plot methods
lars-reimann May 19, 2024
b96d9e6
docs: rename variable in example
lars-reimann May 19, 2024
ea95caa
docs: note that regressors must be fitted before using metrics methods
lars-reimann May 19, 2024
6dbdce2
feat: property access to parameters of label and one-hot encoder
lars-reimann May 19, 2024
541b1db
docs: document the class, not the `__init__`
lars-reimann May 19, 2024
cdfdbbb
test: refactor helper for cell tests
lars-reimann May 19, 2024
1e9036b
test: `Cell.abs`
lars-reimann May 19, 2024
bd0d172
test: `Cell.neg`
lars-reimann May 19, 2024
b8f9c95
test: `+Cell`
lars-reimann May 19, 2024
fc391da
test: `Cell.floor`
lars-reimann May 19, 2024
cdc11d7
test: `Cell.ceil`
lars-reimann May 19, 2024
fb76608
test: `Cell.not_`
lars-reimann May 19, 2024
03e36c1
test: `Cell.and_`
lars-reimann May 19, 2024
a1dba01
test: `Cell.or_`
lars-reimann May 19, 2024
3e6a0d5
test: `Cell.xor`
lars-reimann May 19, 2024
33eb5f2
test: `Cell.add`
lars-reimann May 19, 2024
74ff41b
test: `Cell.sub`
lars-reimann May 19, 2024
c0d7875
test: `Cell.mul`
lars-reimann May 19, 2024
62fe85f
test: `Cell.div`
lars-reimann May 19, 2024
eaea032
test: `Cell.__floordiv__`
lars-reimann May 19, 2024
4b8f287
test: `Cell.eq`
lars-reimann May 19, 2024
06a9714
test: `Cell.ne`
lars-reimann May 19, 2024
6baedcf
test: `Cell.ge`
lars-reimann May 19, 2024
7ac41ef
test: `Cell.lt`
lars-reimann May 19, 2024
267e2af
test: `Cell.le`
lars-reimann May 19, 2024
59018cd
test: `Cell.mod`
lars-reimann May 19, 2024
bbd86e6
test: `Cell.pow`
lars-reimann May 19, 2024
38d3a3d
test: better way to test inverted dunder methods
lars-reimann May 19, 2024
8a5a25f
test: `Cell.__sizeof__`
lars-reimann May 19, 2024
ccc5463
test: `Cell._equals`
lars-reimann May 19, 2024
011736d
test: `Cell._hash`
lars-reimann May 19, 2024
9a2a4a5
test: simplify test of `Column.__hash__`
lars-reimann May 19, 2024
53aa165
feat: `StringCell.index_of`
lars-reimann May 19, 2024
3e38a9e
feat: `StringCell.to_float` and `StringCell.to_int`
lars-reimann May 19, 2024
f64d120
feat: `StringCell.to_date` and `StringCell.to_datetime`
lars-reimann May 19, 2024
4102a54
test: `StringCell._equals`, `StringCell.__hash__`, `StringCell.__sizeof__`
lars-reimann May 19, 2024
d73aef2
feat: `StringCell.replace`, `StringCell.substring`
lars-reimann May 19, 2024
6c4a360
feat: rename `string` to `str`
lars-reimann May 19, 2024
e35c755
docs: fix example output
lars-reimann May 19, 2024
2cce126
style: fix ruff error
lars-reimann May 19, 2024
3ec0096
style: fix mypy errors
lars-reimann May 19, 2024
c5479e9
style: apply automated linter fixes
megalinter-bot May 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/resources/from_json_file.json
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
{ "a": { "0": 1, "1": 2, "2": 3 }, "b": { "0": 4, "1": 5, "2": 6 } }
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
6 changes: 0 additions & 6 deletions src/resources/from_json_file_2.json

This file was deleted.

7 changes: 6 additions & 1 deletion src/resources/to_json_file.json
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
{ "a": { "0": 1, "1": 2, "2": 3 }, "b": { "0": 4, "1": 5, "2": 6 } }
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
6 changes: 0 additions & 6 deletions src/resources/to_json_file_2.json

This file was deleted.

3 changes: 3 additions & 0 deletions src/safeds/data/tabular/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._cell import Cell
from ._column import Column
from ._row import Row
from ._string_cell import StringCell
from ._table import Table

apipkg.initpkg(
Expand All @@ -16,6 +17,7 @@
"Cell": "._cell:Cell",
"Column": "._column:Column",
"Row": "._row:Row",
"StringCell": "._string_cell:StringCell",
"Table": "._table:Table",
},
)
Expand All @@ -24,5 +26,6 @@
"Cell",
"Column",
"Row",
"StringCell",
"Table",
]
79 changes: 45 additions & 34 deletions src/safeds/data/tabular/containers/_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
if TYPE_CHECKING:
import polars as pl

from ._string_cell import StringCell

T_co = TypeVar("T_co", covariant=True)
P = TypeVar("P")
P_contra = TypeVar("P_contra", contravariant=True)
R_co = TypeVar("R_co", covariant=True)


Expand Down Expand Up @@ -109,10 +111,10 @@ def __mul__(self, other: Any) -> Cell[R_co]: ...
def __rmul__(self, other: Any) -> Cell[R_co]: ...

@abstractmethod
def __pow__(self, other: float | Cell[P]) -> Cell[R_co]: ...
def __pow__(self, other: float | Cell[P_contra]) -> Cell[R_co]: ...

@abstractmethod
def __rpow__(self, other: float | Cell[P]) -> Cell[R_co]: ...
def __rpow__(self, other: float | Cell[P_contra]) -> Cell[R_co]: ...

@abstractmethod
def __sub__(self, other: Any) -> Cell[R_co]: ...
Expand All @@ -134,6 +136,15 @@ def __hash__(self) -> int: ...
@abstractmethod
def __sizeof__(self) -> int: ...

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
@abstractmethod
def str(self) -> StringCell:
    """Access the string operations of this cell, grouped in a `StringCell` namespace."""

# ------------------------------------------------------------------------------------------------------------------
# Boolean operations
# ------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -372,6 +383,36 @@ def add(self, other: Any) -> Cell[R_co]:
"""
return self.__add__(other)

def div(self, other: Any) -> Cell[R_co]:
    """
    Divide this cell by a value. Calling this method is equivalent to using the `/` operator.

    Parameters
    ----------
    other:
        The divisor.

    Returns
    -------
    quotient:
        Whatever `__truediv__` produces for `other`.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Column
    >>> column = Column("example", [6, 8])
    >>> column.transform(lambda cell: cell.div(2))
    +---------+
    | example |
    | ---     |
    | f64     |
    +=========+
    | 3.00000 |
    | 4.00000 |
    +---------+

    >>> column.transform(lambda cell: cell / 2)
    +---------+
    | example |
    | ---     |
    | f64     |
    +=========+
    | 3.00000 |
    | 4.00000 |
    +---------+
    """
    # Named counterpart of the `/` operator: delegate straight to the dunder method
    return self.__truediv__(other)

def mod(self, other: Any) -> Cell[R_co]:
"""
Perform a modulo operation. This is equivalent to the `%` operator.
Expand Down Expand Up @@ -432,7 +473,7 @@ def mul(self, other: Any) -> Cell[R_co]:
"""
return self.__mul__(other)

def pow(self, other: float | Cell[P]) -> Cell[R_co]:
def pow(self, other: float | Cell[P_contra]) -> Cell[R_co]:
"""
Raise to a power. This is equivalent to the `**` operator.

Expand Down Expand Up @@ -492,36 +533,6 @@ def sub(self, other: Any) -> Cell[R_co]:
"""
return self.__sub__(other)

def div(self, other: Any) -> Cell[R_co]:
    """
    Divide this cell by a value. Calling this method is equivalent to using the `/` operator.

    Parameters
    ----------
    other:
        The divisor.

    Returns
    -------
    quotient:
        Whatever `__truediv__` produces for `other`.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Column
    >>> column = Column("example", [6, 8])
    >>> column.transform(lambda cell: cell.div(2))
    +---------+
    | example |
    | ---     |
    | f64     |
    +=========+
    | 3.00000 |
    | 4.00000 |
    +---------+

    >>> column.transform(lambda cell: cell / 2)
    +---------+
    | example |
    | ---     |
    | f64     |
    +=========+
    | 3.00000 |
    | 4.00000 |
    +---------+
    """
    # Named counterpart of the `/` operator: delegate straight to the dunder method
    return self.__truediv__(other)

# ------------------------------------------------------------------------------------------------------------------
# Comparison operations
# ------------------------------------------------------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,8 +756,8 @@ def correlation_with(self, other: Column) -> float:
>>> column1.correlation_with(column2)
1.0

>>> column4 = Column("test", [3, 2, 1])
>>> column1.correlation_with(column4)
>>> column3 = Column("test", [3, 2, 1])
>>> column1.correlation_with(column3)
-1.0
"""
import polars as pl
Expand Down
26 changes: 23 additions & 3 deletions src/safeds/data/tabular/containers/_lazy_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
if TYPE_CHECKING:
import polars as pl

from ._string_cell import StringCell

T = TypeVar("T")
P = TypeVar("P")
R = TypeVar("R")
Expand All @@ -31,7 +33,9 @@ def __init__(self, expression: pl.Expr) -> None:
# "Boolean" operators (actually bitwise) -----------------------------------

def __invert__(self) -> Cell[bool]:
return _wrap(self._expression.__invert__())
import polars as pl

return _wrap(self._expression.cast(pl.Boolean).__invert__())

def __and__(self, other: bool | Cell[bool]) -> Cell[bool]:
return _wrap(self._expression.__and__(other))
Expand Down Expand Up @@ -83,10 +87,16 @@ def __abs__(self) -> Cell[R]:
return _wrap(self._expression.__abs__())

def __ceil__(self) -> Cell[R]:
return _wrap(self._expression.ceil())
import polars as pl

# polars does not yet implement ceil for integers
return _wrap(self._expression.cast(pl.Float64).ceil())

def __floor__(self) -> Cell[R]:
return _wrap(self._expression.floor())
import polars as pl

# polars does not yet implement floor for integers
return _wrap(self._expression.cast(pl.Float64).floor())

def __neg__(self) -> Cell[R]:
return _wrap(self._expression.__neg__())
Expand Down Expand Up @@ -166,6 +176,16 @@ def __hash__(self) -> int:
def __sizeof__(self) -> int:
return self._expression.__sizeof__()

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
def str(self) -> StringCell:
    # Imported inside the property rather than at module level to avoid a circular import
    from ._lazy_string_cell import _LazyStringCell

    # The string namespace is built from the expression this cell wraps
    expression = self._expression
    return _LazyStringCell(expression)

# ------------------------------------------------------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------------------------------------------------------
Expand Down
101 changes: 101 additions & 0 deletions src/safeds/data/tabular/containers/_lazy_string_cell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds._validation import _check_bounds, _ClosedBound

from ._lazy_cell import _LazyCell
from ._string_cell import StringCell

if TYPE_CHECKING:
import datetime

import polars as pl

from ._cell import Cell


class _LazyStringCell(StringCell):
    """
    String operations backed by a polars expression.

    Each operation derives a new polars expression from the wrapped one (mostly through the expression's `str`
    namespace) and returns it wrapped in a `_LazyCell`.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __init__(self, expression: pl.Expr) -> None:
        # The polars expression all string operations are derived from
        self._expression: pl.Expr = expression

    def __hash__(self) -> int:
        # Hash the serialized form of the expression
        serialized = self._expression.meta.serialize()
        return _structural_hash(serialized)

    def __sizeof__(self) -> int:
        # Size is reported by the wrapped expression
        return self._expression.__sizeof__()

    # ------------------------------------------------------------------------------------------------------------------
    # String operations
    # ------------------------------------------------------------------------------------------------------------------

    def contains(self, substring: str) -> Cell[bool]:
        # literal=True: `substring` is matched verbatim rather than interpreted as a pattern
        matches = self._expression.str.contains(substring, literal=True)
        return _LazyCell(matches)

    def length(self, optimize_for_ascii: bool = False) -> Cell[int]:
        strings = self._expression.str
        # Length in bytes when optimizing for ASCII, length in characters otherwise
        lengths = strings.len_bytes() if optimize_for_ascii else strings.len_chars()
        return _LazyCell(lengths)

    def ends_with(self, suffix: str) -> Cell[bool]:
        has_suffix = self._expression.str.ends_with(suffix)
        return _LazyCell(has_suffix)

    def index_of(self, substring: str) -> Cell[int | None]:
        # literal=True: `substring` is searched for verbatim rather than interpreted as a pattern
        position = self._expression.str.find(substring, literal=True)
        return _LazyCell(position)

    def replace(self, old: str, new: str) -> Cell[str]:
        # Every occurrence of `old` is replaced; `old` is taken verbatim (literal=True)
        replaced = self._expression.str.replace_all(old, new, literal=True)
        return _LazyCell(replaced)

    def starts_with(self, prefix: str) -> Cell[bool]:
        has_prefix = self._expression.str.starts_with(prefix)
        return _LazyCell(has_prefix)

    def substring(self, start: int = 0, length: int | None = None) -> Cell[str]:
        # Validate before building the expression: `length` is checked against a closed lower bound of 0
        _check_bounds("length", length, lower_bound=_ClosedBound(0))

        sliced = self._expression.str.slice(start, length)
        return _LazyCell(sliced)

    def to_date(self) -> Cell[datetime.date | None]:
        # Non-strict parsing with the "%F" format specifier
        dates = self._expression.str.to_date(format="%F", strict=False)
        return _LazyCell(dates)

    def to_datetime(self) -> Cell[datetime.datetime | None]:
        # Non-strict parsing with the "%+" format specifier
        datetimes = self._expression.str.to_datetime(format="%+", strict=False)
        return _LazyCell(datetimes)

    def to_int(self, *, base: int = 10) -> Cell[int | None]:
        # Non-strict conversion in the requested base
        integers = self._expression.str.to_integer(base=base, strict=False)
        return _LazyCell(integers)

    def to_float(self) -> Cell[float | None]:
        import polars as pl

        # Unlike the other conversions, this is a non-strict cast of the expression itself
        floats = self._expression.cast(pl.Float64, strict=False)
        return _LazyCell(floats)

    def to_lowercase(self) -> Cell[str]:
        lowered = self._expression.str.to_lowercase()
        return _LazyCell(lowered)

    def to_uppercase(self) -> Cell[str]:
        uppered = self._expression.str.to_uppercase()
        return _LazyCell(uppered)

    def trim(self) -> Cell[str]:
        # Strips both ends
        stripped = self._expression.str.strip_chars()
        return _LazyCell(stripped)

    def trim_end(self) -> Cell[str]:
        # Strips the end only
        stripped = self._expression.str.strip_chars_end()
        return _LazyCell(stripped)

    def trim_start(self) -> Cell[str]:
        # Strips the start only
        stripped = self._expression.str.strip_chars_start()
        return _LazyCell(stripped)

    # ------------------------------------------------------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------------------------------------------------------

    def _equals(self, other: object) -> bool:
        if isinstance(other, _LazyStringCell):
            # Identity short-circuits; otherwise compare the wrapped expressions through their meta namespace
            return self is other or self._expression.meta.eq(other._expression.meta)
        # Objects of any other type are not comparable here
        return NotImplemented
Loading