-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: ✨ add conversion from resource properties to Pandera schema (#1051)

## Description

This PR adds the ability to convert a set of resource properties into a Pandera schema. A Polars dataframe can then be validated against this schema.

<!-- Select quick/in-depth as necessary -->
This PR needs an in-depth review.

## Checklist

- [x] Added or updated tests
- [x] Updated documentation
- [x] Ran `just run-all`

---------

Co-authored-by: Luke W. Johnston <[email protected]>
Co-authored-by: Luke W. Johnston <[email protected]>
- Loading branch information
1 parent
3c014dc
commit 798bab0
Showing
3 changed files
with
209 additions
and
0 deletions.
There are no files selected for viewing
52 changes: 52 additions & 0 deletions
52
src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import polars as pl | ||
|
||
from seedcase_sprout.core.properties import FieldType | ||
|
||
|
||
def get_polars_data_type(field_type: FieldType | None) -> pl.DataType:
    """Return the Polars data type used for a Frictionless field type.

    Only `number` and `integer` are mapped to numeric Polars types. Every
    other supported field type (and any unrecognised value, including `None`)
    is mapped to string, because its formatting constraints are not captured
    by a specialised Polars data type and are checked without Polars instead.

    Args:
        field_type: The Frictionless field type to map.

    Returns:
        The Polars data type the field is mapped to.

    Raises:
        NotImplementedError: If Sprout doesn't yet support the Frictionless
            field type (currently `geojson`).
    """
    # Not supported by Sprout yet.
    if field_type == "geojson":
        raise NotImplementedError()

    if field_type == "number":
        return pl.Float64

    if field_type == "integer":
        return pl.Int64

    # Deliberately mapped to string: "string", "boolean", "datetime", "date",
    # "time", "year", "yearmonth", "duration", "list", "array", "object" and
    # "geopoint", as well as anything not recognised above.
    #
    # Polars has data types with similar or identical names for most of
    # these, but they do not overlap perfectly with the Frictionless ones.
    # For example, date/datetime/time values are checked differently, and
    # Polars may cast `123` to True where Frictionless would say it is not a
    # boolean. This mapping is expected to be refined as Sprout gets used.
    return pl.String
40 changes: 40 additions & 0 deletions
40
src/seedcase_sprout/core/sprout_checks/resource_properties_to_pandera_schema.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import pandera.polars as pa | ||
|
||
from seedcase_sprout.core.get_nested_attr import get_nested_attr | ||
from seedcase_sprout.core.properties import FieldProperties, ResourceProperties | ||
from seedcase_sprout.core.sprout_checks.get_pandera_checks import ( | ||
get_pandera_checks, | ||
) | ||
from seedcase_sprout.core.sprout_checks.get_polars_data_type import ( | ||
get_polars_data_type, | ||
) | ||
|
||
|
||
def resource_properties_to_pandera_schema(
    resource_properties: ResourceProperties,
) -> pa.DataFrameSchema:
    """Build a Pandera schema from a set of resource properties.

    Each field found under `schema.fields` becomes one Pandera column keyed
    by the field's name. When no fields can be found, the schema has no
    columns.

    Args:
        resource_properties: The resource properties to convert.

    Returns:
        The resulting Pandera schema, created with `strict=True` and with
        `coerce=True` on every column.
    """
    fields: list[FieldProperties] = get_nested_attr(
        resource_properties, "schema.fields", default=[]
    )

    columns = {}
    for field in fields:
        column_name = field.name
        data_type = get_polars_data_type(field.type)
        checks = get_pandera_checks(field)
        # A field without a `required` constraint is treated as not required.
        is_required = get_nested_attr(field, "constraints.required", default=False)
        columns[column_name] = pa.Column(
            dtype=data_type,
            checks=checks,
            nullable=not is_required,
            coerce=True,
        )

    return pa.DataFrameSchema(columns, strict=True)
117 changes: 117 additions & 0 deletions
117
tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import polars as pl | ||
from pytest import mark | ||
|
||
from seedcase_sprout.core.properties import ( | ||
ConstraintsProperties, | ||
FieldProperties, | ||
ResourceProperties, | ||
TableSchemaProperties, | ||
) | ||
from seedcase_sprout.core.sprout_checks.resource_properties_to_pandera_schema import ( | ||
resource_properties_to_pandera_schema, | ||
) | ||
|
||
|
||
@mark.parametrize(
    "resource_properties",
    [
        ResourceProperties(),
        ResourceProperties(schema=TableSchemaProperties()),
        ResourceProperties(schema=TableSchemaProperties(fields=None)),
        ResourceProperties(schema=TableSchemaProperties(fields=[])),
    ],
)
def test_converts_properties_without_fields(resource_properties):
    """Properties without any fields should give a strict schema with no columns."""
    pandera_schema = resource_properties_to_pandera_schema(resource_properties)

    assert pandera_schema.columns == {}
    assert pandera_schema.strict
|
||
|
||
@mark.parametrize(
    "field_type,data_type,num_checks",
    [
        ("number", pl.Float64, 0),
        ("integer", pl.Int64, 0),
        ("string", pl.String, 0),
        ("boolean", pl.String, 1),
        ("object", pl.String, 1),
        ("array", pl.String, 1),
        ("list", pl.String, 0),
        ("datetime", pl.String, 1),
        ("date", pl.String, 1),
        ("time", pl.String, 1),
        ("year", pl.String, 1),
        ("yearmonth", pl.String, 1),
        ("duration", pl.String, 1),
        ("geopoint", pl.String, 1),
        ("any", pl.String, 0),
        (None, pl.String, 0),
    ],
)
def test_converts_individual_fields_correctly(field_type, data_type, num_checks):
    """Each field type should become a Pandera column with the expected settings."""
    field = FieldProperties(name="my_field", type=field_type)
    table_schema = TableSchemaProperties(fields=[field])
    resource_properties = ResourceProperties(schema=table_schema)

    schema = resource_properties_to_pandera_schema(resource_properties)

    # Schema-level expectations.
    assert schema.strict
    assert len(schema.columns) == 1

    # Column-level expectations for the single converted field.
    converted_columns = list(schema.columns.values())
    column = converted_columns[0]
    assert column.name == "my_field"
    assert column.dtype.type == data_type
    assert len(column.checks) == num_checks
    assert column.coerce
    assert column.nullable
    assert column.required
|
||
|
||
def test_converts_multiple_fields():
    """Several fields should become several Pandera columns, in the same order."""
    date_field = FieldProperties(name="my_date", type="date")
    boolean_field = FieldProperties(name="my_boolean", type="boolean")
    resource_properties = ResourceProperties(
        schema=TableSchemaProperties(fields=[date_field, boolean_field])
    )

    schema = resource_properties_to_pandera_schema(resource_properties)

    actual = [(column.name, column.dtype.type) for column in schema.columns.values()]
    expected = [
        ("my_date", pl.String),
        ("my_boolean", pl.String),
    ]
    assert actual == expected
|
||
|
||
@mark.parametrize("required,expected", [(True, False), (False, True), (None, True)])
def test_converts_required_constraint(required, expected):
    """The `required` constraint should be inverted into Pandera's `nullable`."""
    constraints = ConstraintsProperties(required=required)
    field = FieldProperties(name="my_date", type="date", constraints=constraints)
    resource_properties = ResourceProperties(
        schema=TableSchemaProperties(fields=[field])
    )

    schema = resource_properties_to_pandera_schema(resource_properties)

    converted_columns = list(schema.columns.values())
    column = converted_columns[0]
    assert column.nullable is expected