feat: ✨ add conversion from resource properties to Pandera schema (#1051)

## Description

This PR adds the ability to convert a set of resource properties into a Pandera schema. The resulting schema can then be used to validate a Polars dataframe.

This PR needs an in-depth review.

## Checklist

- [x] Added or updated tests
- [x] Updated documentation
- [x] Ran `just run-all`

---------

Co-authored-by: Luke W. Johnston <[email protected]>
Co-authored-by: Luke W. Johnston <[email protected]>
seedcase-project · Feb 24, 2025 · 798bab0 · 798bab0
1 parent 3c014dc
commit 798bab0
Show file tree

Hide file tree

Showing 3 changed files with 209 additions and 0 deletions.
diff --git a/src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py b/src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py
@@ -0,0 +1,52 @@
+import polars as pl
+
+from seedcase_sprout.core.properties import FieldType
+
+
+def get_polars_data_type(field_type: FieldType | None) -> pl.DataType:
+    """Maps Frictionless field types to Polars data types.
+
+    If the Frictionless field type has formatting constraints that are not included
+    in any specialised Polars data type, the mapping is to string. The formatting
+    constraints are then checked without Polars.
+
+    Args:
+        field_type: The Frictionless field type to map.
+
+    Returns:
+        The Polars data type the field is mapped to.
+
+    Raises:
+        NotImplementedError: If Sprout doesn't yet support the Frictionless field type.
+    """
+    match field_type:
+        case "geojson":
+            raise NotImplementedError()
+        # While Polars does have most of these data types, there isn't a
+        # perfect overlap between them and what Frictionless has, even
+        # if they have similar/same names for the types. For example,
+        # checks against date/datetimes/times types are different between
+        # Polars and Frictionless. Or the way booleans get treated. Polars
+        # may cast `123` to True, but Frictionless will indicate it is not
+        # a boolean. We'll slowly improve on this as we use Sprout.
+        case (
+            "string"
+            | "boolean"
+            | "datetime"
+            | "date"
+            | "time"
+            | "year"
+            | "yearmonth"
+            | "duration"
+            | "list"
+            | "array"
+            | "object"
+            | "geopoint"
+        ):
+            return pl.String
+        case "number":
+            return pl.Float64
+        case "integer":
+            return pl.Int64
+        case _:
+            return pl.String
diff --git a/src/seedcase_sprout/core/sprout_checks/resource_properties_to_pandera_schema.py b/src/seedcase_sprout/core/sprout_checks/resource_properties_to_pandera_schema.py
@@ -0,0 +1,40 @@
+import pandera.polars as pa
+
+from seedcase_sprout.core.get_nested_attr import get_nested_attr
+from seedcase_sprout.core.properties import FieldProperties, ResourceProperties
+from seedcase_sprout.core.sprout_checks.get_pandera_checks import (
+    get_pandera_checks,
+)
+from seedcase_sprout.core.sprout_checks.get_polars_data_type import (
+    get_polars_data_type,
+)
+
+
+def resource_properties_to_pandera_schema(
+    resource_properties: ResourceProperties,
+) -> pa.DataFrameSchema:
+    """Converts a set of resource properties to a Pandera schema.
+
+    Args:
+        resource_properties: The resource properties to convert.
+
+    Returns:
+        The resulting Pandera schema.
+    """
+    fields: list[FieldProperties] = get_nested_attr(
+        resource_properties,
+        "schema.fields",
+        default=[],
+    )
+
+    columns = {
+        field.name: pa.Column(
+            dtype=get_polars_data_type(field.type),
+            checks=get_pandera_checks(field),
+            nullable=not get_nested_attr(field, "constraints.required", default=False),
+            coerce=True,
+        )
+        for field in fields
+    }
+
+    return pa.DataFrameSchema(columns, strict=True)
diff --git a/tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py b/tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py
@@ -0,0 +1,117 @@
+import polars as pl
+from pytest import mark
+
+from seedcase_sprout.core.properties import (
+    ConstraintsProperties,
+    FieldProperties,
+    ResourceProperties,
+    TableSchemaProperties,
+)
+from seedcase_sprout.core.sprout_checks.resource_properties_to_pandera_schema import (
+    resource_properties_to_pandera_schema,
+)
+
+
+@mark.parametrize(
+    "resource_properties",
+    [
+        ResourceProperties(),
+        ResourceProperties(
+            schema=TableSchemaProperties(),
+        ),
+        ResourceProperties(
+            schema=TableSchemaProperties(fields=None),
+        ),
+        ResourceProperties(
+            schema=TableSchemaProperties(fields=[]),
+        ),
+    ],
+)
+def test_converts_properties_without_fields(resource_properties):
+    """When the properties have no fields, the Pandera schema should have no columns."""
+    schema = resource_properties_to_pandera_schema(resource_properties)
+
+    assert schema.columns == {}
+    assert schema.strict
+
+
+@mark.parametrize(
+    "field_type,data_type,num_checks",
+    [
+        ("number", pl.Float64, 0),
+        ("integer", pl.Int64, 0),
+        ("string", pl.String, 0),
+        ("boolean", pl.String, 1),
+        ("object", pl.String, 1),
+        ("array", pl.String, 1),
+        ("list", pl.String, 0),
+        ("datetime", pl.String, 1),
+        ("date", pl.String, 1),
+        ("time", pl.String, 1),
+        ("year", pl.String, 1),
+        ("yearmonth", pl.String, 1),
+        ("duration", pl.String, 1),
+        ("geopoint", pl.String, 1),
+        ("any", pl.String, 0),
+        (None, pl.String, 0),
+    ],
+)
+def test_converts_individual_fields_correctly(field_type, data_type, num_checks):
+    """Should convert each type of field to a Pandera column correctly."""
+    resource_properties = ResourceProperties(
+        schema=TableSchemaProperties(
+            fields=[FieldProperties(name="my_field", type=field_type)]
+        )
+    )
+
+    schema = resource_properties_to_pandera_schema(resource_properties)
+
+    assert schema.strict
+    assert len(schema.columns) == 1
+    column = list(schema.columns.values())[0]
+    assert column.name == "my_field"
+    assert column.dtype.type == data_type
+    assert len(column.checks) == num_checks
+    assert column.coerce
+    assert column.nullable
+    assert column.required
+
+
+def test_converts_multiple_fields():
+    """Should convert multiple fields to multiple Pandera columns correctly."""
+    resource_properties = ResourceProperties(
+        schema=TableSchemaProperties(
+            fields=[
+                FieldProperties(name="my_date", type="date"),
+                FieldProperties(name="my_boolean", type="boolean"),
+            ]
+        )
+    )
+
+    schema = resource_properties_to_pandera_schema(resource_properties)
+
+    assert [(column.name, column.dtype.type) for column in schema.columns.values()] == [
+        ("my_date", pl.String),
+        ("my_boolean", pl.String),
+    ]
+
+
+@mark.parametrize("required,expected", [(True, False), (False, True), (None, True)])
+def test_converts_required_constraint(required, expected):
+    """Should convert the required constraint to Pandera's nullable correctly."""
+    resource_properties = ResourceProperties(
+        schema=TableSchemaProperties(
+            fields=[
+                FieldProperties(
+                    name="my_date",
+                    type="date",
+                    constraints=ConstraintsProperties(required=required),
+                )
+            ]
+        )
+    )
+
+    schema = resource_properties_to_pandera_schema(resource_properties)
+
+    column = list(schema.columns.values())[0]
+    assert column.nullable is expected