diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 13c6600..f107ade 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
-# Contributing to sdf-action
+# Contributing to sdf-tests
 
-Thank you for your interest in contributing to `sdf-action`! We welcome contributions from the community and are excited to see what you can bring to the project. Before you get started, please review the guidelines below to ensure a smooth and efficient contribution process.
+Thank you for your interest in contributing to `sdf-tests`! We welcome contributions from the community and are excited to see what you can bring to the project. Before you get started, please review the guidelines below to ensure a smooth and efficient contribution process.
 
 ## PR Labeling Guidelines
 
diff --git a/README.md b/README.md
index 7c9f267..b2d7779 100644
--- a/README.md
+++ b/README.md
@@ -19,25 +19,27 @@ For an in-depth guide on how to use SDF tests, please see the Tests section of [
 
 ## SDF Standard Library Tests
 
-| Test Name | Type |
-| ------------------------------ | --------- |
-| [`not_null()`](#not-null) | Scalar |
-| [`valid_scalar(condition)`](#valid-scalar) | Scalar |
-| [`valid_aggregate(condition)`](#valid-aggregate) | Aggregate |
-| [`unique()`](#unique) | Aggregate |
-| [`in_accepted_values([values])`](#in-accepted-values) | Aggregate |
-| [`minimum(value)`](#minimum) | Aggregate |
-| [`maxiumum(value)`](#maximum) | Aggregate |
-| [`exclusive_minimum(value)`](#exclusive-minimum) | Aggregate |
-| [`exclusive_maximum(value)`](#exclusive-maximum) | Aggregate |
-| [`between(lower, upper)`](#between) | Aggregate |
-| [`max_length(value)`](#max-length) | Aggregate |
-| [`min_length(value)`](#min-length) | Aggregate |
-| [`like(string)`](#like) | Aggregate |
-| [`try_cast(type)`](#try-cast) | Aggregate |
-| [`primary_key(column)`](#primary-key) | Aggregate |
-| [`unique_columns([c1, c2])`](#unique-columns)| Table |
-
+| Test Name | Type |
+| -------------------------------------------------------------------- | --------- |
+| [`not_null()`](#not-null) | Scalar |
+| [`valid_scalar(condition)`](#valid-scalar) | Scalar |
+| [`valid_aggregate(condition)`](#valid-aggregate) | Aggregate |
+| [`unique()`](#unique) | Aggregate |
+| [`in_accepted_values([values])`](#in-accepted-values) | Aggregate |
+| [`minimum(value)`](#minimum) | Aggregate |
+| [`maximum(value)`](#maximum) | Aggregate |
+| [`exclusive_minimum(value)`](#exclusive-minimum) | Aggregate |
+| [`exclusive_maximum(value)`](#exclusive-maximum) | Aggregate |
+| [`between(lower, upper)`](#between) | Aggregate |
+| [`max_length(value)`](#max-length) | Aggregate |
+| [`min_length(value)`](#min-length) | Aggregate |
+| [`like(string)`](#like) | Aggregate |
+| [`try_cast(type)`](#try-cast) | Aggregate |
+| [`primary_key(column)`](#primary-key) | Aggregate |
+| [`unique_columns([c1, c2])`](#unique-columns) | Table |
+| [`fresh(reference_value, value, date_part)`](#fresh) | Aggregate |
+| [`maximum_count(value, [c1, c2])`](#maximum-count-by-partition) | Table |
+| [`minimum_count(value, [c1, c2])`](#minimum-count-by-partition) | Table |
 
 #### Not Null
 
@@ -232,3 +234,44 @@ table:
     - expect: unique_columns(['a', 'b'])
 ```
 
+#### Fresh
+
+Asserts that the most recent value in a column is no older than `value` intervals of `date_part` (default `day`) relative to a reference value.
+The column and the reference value must be of the same data type.
+
+**Example:**
+```yaml
+columns:
+  - name: a
+    tests:
+      - expect: fresh('CURRENT_DATE', 1)
+  - name: b
+    tests:
+      - expect: fresh('CURRENT_TIMESTAMP', 180, 'minute')
+```
+
+#### Maximum Count by Partition
+
+Asserts that every group of rows, when the table is grouped by a list of columns, contains no more rows than a threshold value.
+When no columns are given, the threshold applies to the row count of the whole table.
+
+**Example:**
+```yaml
+table:
+  tests:
+    - expect: maximum_count(100, ['a', 'b'])
+    - expect: maximum_count(1000)
+```
+
+#### Minimum Count by Partition
+
+Asserts that every group of rows, when the table is grouped by a list of columns, contains at least a threshold number of rows.
+When no columns are given, the threshold applies to the row count of the whole table.
+
+**Example:**
+```yaml
+table:
+  tests:
+    - expect: minimum_count(1, ['a', 'b'])
+    - expect: minimum_count(10)
+```
diff --git a/macros/test.jinja b/macros/test.jinja
index 10f5055..0e4f3e3 100644
--- a/macros/test.jinja
+++ b/macros/test.jinja
@@ -26,7 +26,6 @@
   '{{severity}}: column {{ column_name }} has unexpected values in {{condition_str}}'
 {%- endmacro %}
 
-
 {# should mention column_name in condition#}
 {% macro valid_scalar(severity, column_name, condition) -%}
   COUNT(CASE WHEN NOT({{ condition }}) THEN 1 ELSE NULL END) > 0
@@ -43,7 +42,7 @@
 
 
 {# ---------------------------------------------------------------------------------------------- #}
-{# number column checks: via aggregate #}
+{# number column checks: via aggregate #}
 {% macro minimum(severity, column_name, value) -%}
   NOT(MIN({{column_name}}) >= {{value}})
   ==>
@@ -68,23 +67,32 @@
   '{{severity}}: column {{ column_name }} has values greater than or equal to {{ value }}'
 {%- endmacro %}
 
-
 {% macro between(severity, column_name, min_value, max_value) -%}
   NOT(MIN({{column_name}}) >= {{min_value}}) or MAX({{column_name}}) > {{max_value}}
   ==>
   '{{severity}}: column {{ column_name }} has values outside of {{min_value | safe_str()}}..{{max_value| safe_str()}}'
 {%- endmacro %}
 
+{# freshness check: via aggregate; fails when the newest value of column_name is more than #}
+{# `value` date_part units (default: day) older than reference_value #}
+{# NOTE(review): DATE_DIFF(unit, start, end) is the Trino form; confirm for other dialects #}
+{% macro fresh(severity, column_name, reference_value, value, date_part='day') -%}
+  DATE_DIFF('{{ date_part }}', MAX({{ column_name }}), {{ reference_value }}) > {{ value }}
+  ==>
+  '{{severity}}: column {{ column_name }} has no values fresher than interval {{ value | safe_str }} {{ date_part }}'
+{%- endmacro %}
+
+
 
 {# ---------------------------------------------------------------------------------------------- #}
-{# string column checks: via aggregate #}
+{# string column checks: via aggregate #}
 {% macro max_length(severity, column_name, value) -%}
   MAX(LENGTH({{column_name}})) > {{value}}
   ==>
   '{{severity}}: column {{ column_name }} has string lengths greater than {{ value | safe_str }}'
 {%- endmacro %}
 
-{# string column checks: via aggregate #}
+{# string column checks: via aggregate #}
 {% macro min_length(severity, column_name, value) -%}
   MIN(LENGTH({{column_name}})) < {{value}}
   ==>
@@ -92,14 +100,14 @@
 {%- endmacro %}
 
 
-{# string column checks: via aggregate #}
+{# string column checks: via aggregate #}
 {% macro like(severity, column_name, value) -%}
   NOT(COUNT(CASE WHEN {{ column_name }} LIKE {{value}} THEN 1 ELSE NULL END)>0)
   ==>
   '{{severity}}: column {{ column_name }} has strings that are NOT like {{ value| safe_str }}'
 {%- endmacro %}
 
-{# string column checks: via aggregate #}
+{# string column checks: via aggregate #}
 {% macro try_cast(severity, column_name, type_name) -%}
   NOT(COUNT(CASE WHEN TRY_CAST({{column_name}} AS {{type_name}}) IS NOT NULL THEN 1 ELSE NULL END)>0)
   ==>
@@ -116,9 +124,8 @@
 {%- endmacro %}
 
 
-
 {# ---------------------------------------------------------------------------------------------- #}
-{# dbt generic tests #}
+{# generic tests #}
 
 
 
@@ -183,6 +190,56 @@
   SELECT reason FROM {{verdict}}
 {% endmacro %}
 
+{# table row-count check: fails when any group of rows (grouped by column_list) has more #}
+{# than max_count rows; without column_list the whole table is treated as a single group #}
+{% macro maximum_count(severity, table_name, max_count, column_list=none) %}
+
+{%- if not column_list -%}
+{%- set verdict = (table_name ~ '_maximum_count_' ~ max_count) | safe_id -%}
+{%- set group_by_clause = '' -%}
+{%- set subject = 'table ' ~ table_name -%}
+{%- else -%}
+{%- set verdict = (table_name ~ '_maximum_count_' ~ max_count ~ '_by_' ~ (column_list | join('_'))) | safe_id -%}
+{%- set group_by_clause = 'GROUP BY ' ~ (column_list | join(', ')) -%}
+{%- set subject = 'columns ' ~ (column_list | join(', ')) -%}
+{%- endif -%}
+
+{{verdict}} AS (
+  SELECT
+    '{{severity}}: {{ subject | safe_str }} has row count above maximum threshold' AS reason
+  FROM {{table_name}}
+  {{ group_by_clause }}
+  HAVING COUNT(*) > {{ max_count }}
+)
+==>
+  SELECT reason FROM {{verdict}}
+{% endmacro %}
+
+{# table row-count check: fails when any group of rows (grouped by column_list) has fewer #}
+{# than row_count rows; without column_list the whole table is treated as a single group #}
+{% macro minimum_count(severity, table_name, row_count, column_list=none) %}
+
+{%- if not column_list -%}
+{%- set verdict = (table_name ~ '_minimum_count_' ~ row_count) | safe_id -%}
+{%- set group_by_clause = '' -%}
+{%- set subject = 'table ' ~ table_name -%}
+{%- else -%}
+{%- set verdict = (table_name ~ '_minimum_count_' ~ row_count ~ '_by_' ~ (column_list | join('_'))) | safe_id -%}
+{%- set group_by_clause = 'GROUP BY ' ~ (column_list | join(', ')) -%}
+{%- set subject = 'columns ' ~ (column_list | join(', ')) -%}
+{%- endif -%}
+
+{{verdict}} AS (
+  SELECT
+    '{{severity}}: {{ subject | safe_str }} has row count below minimum threshold' AS reason
+  FROM {{table_name}}
+  {{ group_by_clause }}
+  HAVING COUNT(*) < {{ row_count }}
+)
+==>
+  SELECT reason FROM {{verdict}}
+{% endmacro %}
+
 {# ---------------------------------------------------------------------------------------------- #}
 {# generate column constraints #}
 