Skip to content

Commit

Permalink
feat: ✨ add conversion from resource properties to Pandera schema (#1051)
Browse files Browse the repository at this point in the history

## Description

This PR adds the ability to convert a set of resource properties into a
Pandera schema. The resulting schema can then be used to validate a Polars
DataFrame.

<!-- Select quick/in-depth as necessary -->
This PR needs an in-depth review.

## Checklist

- [x] Added or updated tests
- [x] Updated documentation
- [x] Ran `just run-all`

---------

Co-authored-by: Luke W. Johnston <[email protected]>
Co-authored-by: Luke W. Johnston <[email protected]>
  • Loading branch information
3 people authored Feb 24, 2025
1 parent 3c014dc commit 798bab0
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 0 deletions.
52 changes: 52 additions & 0 deletions src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import polars as pl

from seedcase_sprout.core.properties import FieldType


def get_polars_data_type(field_type: FieldType | None) -> pl.DataType:
"""Maps Frictionless field types to Polars data types.
If the Frictionless field type has formatting constraints that are not included
in any specialised Polars data type, the mapping is to string. The formatting
constraints are then checked without Polars.
Args:
field_type: The Frictionless field type to map.
Returns:
The Polars data type the field is mapped to.
Raises:
NotImplementedError: If Sprout doesn't yet support the Frictionless field type.
"""
match field_type:
case "geojson":
raise NotImplementedError()
# While Polars does have most of these data types, there isn't a
# perfect overlap between them and what Frictionless has, even
# if they have similar/same names for the types. For example,
# checks against date/datetimes/times types are different between
# Polars and Frictionless. Or the way booleans get treated. Polars
# may cast `123` to True, but Frictionless will indicate it is not
# a boolean. We'll slowly improve on this as we use Sprout.
case (
"string"
| "boolean"
| "datetime"
| "date"
| "time"
| "year"
| "yearmonth"
| "duration"
| "list"
| "array"
| "object"
| "geopoint"
):
return pl.String
case "number":
return pl.Float64
case "integer":
return pl.Int64
case _:
return pl.String
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pandera.polars as pa

from seedcase_sprout.core.get_nested_attr import get_nested_attr
from seedcase_sprout.core.properties import FieldProperties, ResourceProperties
from seedcase_sprout.core.sprout_checks.get_pandera_checks import (
get_pandera_checks,
)
from seedcase_sprout.core.sprout_checks.get_polars_data_type import (
get_polars_data_type,
)


def resource_properties_to_pandera_schema(
    resource_properties: ResourceProperties,
) -> pa.DataFrameSchema:
    """Builds a Pandera schema from a set of resource properties.

    Each field found under `schema.fields` becomes one Pandera column, keyed by
    the field's name. A column is nullable unless the field's `required`
    constraint is truthy, and every column is created with `coerce=True`.

    Args:
        resource_properties: The resource properties to convert.

    Returns:
        The resulting Pandera schema, created with `strict=True`.
    """
    fields: list[FieldProperties] = get_nested_attr(
        resource_properties, "schema.fields", default=[]
    )

    columns = {}
    for field in fields:
        data_type = get_polars_data_type(field.type)
        checks = get_pandera_checks(field)
        # A missing `required` constraint is treated as "not required".
        is_required = get_nested_attr(field, "constraints.required", default=False)
        columns[field.name] = pa.Column(
            dtype=data_type,
            checks=checks,
            nullable=not is_required,
            coerce=True,
        )

    return pa.DataFrameSchema(columns, strict=True)
117 changes: 117 additions & 0 deletions tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import polars as pl
from pytest import mark

from seedcase_sprout.core.properties import (
ConstraintsProperties,
FieldProperties,
ResourceProperties,
TableSchemaProperties,
)
from seedcase_sprout.core.sprout_checks.resource_properties_to_pandera_schema import (
resource_properties_to_pandera_schema,
)


@mark.parametrize(
    "resource_properties",
    [
        ResourceProperties(),
        ResourceProperties(schema=TableSchemaProperties()),
        ResourceProperties(schema=TableSchemaProperties(fields=None)),
        ResourceProperties(schema=TableSchemaProperties(fields=[])),
    ],
)
def test_converts_properties_without_fields(resource_properties):
    """Properties without any fields should give a strict schema with no columns."""
    pandera_schema = resource_properties_to_pandera_schema(resource_properties)

    assert pandera_schema.columns == {}
    assert pandera_schema.strict


@mark.parametrize(
    "field_type,data_type,num_checks",
    [
        ("number", pl.Float64, 0),
        ("integer", pl.Int64, 0),
        ("string", pl.String, 0),
        ("boolean", pl.String, 1),
        ("object", pl.String, 1),
        ("array", pl.String, 1),
        ("list", pl.String, 0),
        ("datetime", pl.String, 1),
        ("date", pl.String, 1),
        ("time", pl.String, 1),
        ("year", pl.String, 1),
        ("yearmonth", pl.String, 1),
        ("duration", pl.String, 1),
        ("geopoint", pl.String, 1),
        ("any", pl.String, 0),
        (None, pl.String, 0),
    ],
)
def test_converts_individual_fields_correctly(field_type, data_type, num_checks):
    """Each field type should map to the expected Pandera column."""
    single_field = FieldProperties(name="my_field", type=field_type)
    resource_properties = ResourceProperties(
        schema=TableSchemaProperties(fields=[single_field])
    )

    schema = resource_properties_to_pandera_schema(resource_properties)

    # Schema-level expectations.
    assert schema.strict
    assert len(schema.columns) == 1

    # Column-level expectations for the only column in the schema.
    column = next(iter(schema.columns.values()))
    assert column.name == "my_field"
    assert column.dtype.type == data_type
    assert len(column.checks) == num_checks
    assert column.coerce
    assert column.nullable
    assert column.required


def test_converts_multiple_fields():
    """Several fields should become several Pandera columns, in the same order."""
    date_field = FieldProperties(name="my_date", type="date")
    boolean_field = FieldProperties(name="my_boolean", type="boolean")
    resource_properties = ResourceProperties(
        schema=TableSchemaProperties(fields=[date_field, boolean_field])
    )

    schema = resource_properties_to_pandera_schema(resource_properties)

    actual = [(column.name, column.dtype.type) for column in schema.columns.values()]
    expected = [
        ("my_date", pl.String),
        ("my_boolean", pl.String),
    ]
    assert actual == expected


@mark.parametrize("required,expected", [(True, False), (False, True), (None, True)])
def test_converts_required_constraint(required, expected):
    """The `required` constraint should be inverted into Pandera's `nullable`."""
    date_field = FieldProperties(
        name="my_date",
        type="date",
        constraints=ConstraintsProperties(required=required),
    )
    resource_properties = ResourceProperties(
        schema=TableSchemaProperties(fields=[date_field])
    )

    schema = resource_properties_to_pandera_schema(resource_properties)

    column = next(iter(schema.columns.values()))
    assert column.nullable is expected

0 comments on commit 798bab0

Please sign in to comment.