Validate: improve performance and simplify error report format #172

Merged · 2 commits · Feb 22, 2022
97 changes: 64 additions & 33 deletions src/cmd/validate.rs
@@ -5,6 +5,7 @@ use crate::CliResult;
use anyhow::{anyhow, Result};
use csv::ByteRecord;
use indicatif::{ProgressBar, ProgressDrawTarget};
use jsonschema::paths::PathChunk;
use jsonschema::{output::BasicOutput, JSONSchema};
#[allow(unused_imports)]
use log::{debug, info};
@@ -29,7 +30,7 @@ Example output files from `mydata.csv`. If piped from stdin, then filename is `s

* mydata.csv.valid
* mydata.csv.invalid
* mydata.csv.validation-errors.jsonl
* mydata.csv.validation-errors.tsv

JSON Schema can be a local file or a URL.

@@ -180,6 +181,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let batch_size = batch.len();

// do actual validation via Rayon parallel iterator
// validation_results vector should have the same row count and order as the input CSV
let validation_results: Vec<Option<String>> = batch
.par_iter()
.map(|record| {
@@ -328,10 +330,12 @@ fn write_error_report(input_path: &str, validation_error_messages: Vec<String>)
.unwrap_or_else(|_| DEFAULT_WTR_BUFFER_CAPACITY.to_string());
let wtr_buffer_size: usize = wtr_capacitys.parse().unwrap_or(DEFAULT_WTR_BUFFER_CAPACITY);

let output_file = File::create(input_path.to_owned() + ".validation-errors.jsonl")?;
let output_file = File::create(input_path.to_owned() + ".validation-errors.tsv")?;

let mut output_writer = BufWriter::with_capacity(wtr_buffer_size, output_file);

output_writer.write_all("row_number\tfield\terror\n".as_bytes())?;

// write out error report
for error_msg in validation_error_messages {
output_writer.write_all(error_msg.as_bytes())?;
@@ -368,26 +372,20 @@ fn do_json_validation(
// debug!("instance[{row_number}]: {instance:?}");

match validate_json_instance(&instance, schema_compiled) {
Ok(mut validation_result) => {
let is_valid = validation_result.get("valid").unwrap().as_bool().unwrap();

if is_valid {
Ok(None)
} else {
// debug!("row[{row_number}] is invalid");

// insert row index and return validation error
validation_result.as_object_mut().unwrap().insert(
"row_number".to_string(),
Value::Number(Number::from(row_number)),
);

Ok(Some(validation_result.to_string()))
}
}
Err(e) => {
return fail!(format!("Unable to validate. row: {row_number}, error: {e}"));
Some(validation_errors) => {
use itertools::Itertools;
// squash multiple errors into one long String with linebreaks
let combined_errors: String = validation_errors
.iter()
.map(|tuple| {
// validation error file format: row_number, field, error
format!("{}\t{}\t{}", row_number_string, tuple.0, tuple.1)
})
.join("\n");

Ok(Some(combined_errors))
}
None => Ok(None),
}
}

@@ -608,14 +606,39 @@ mod tests_for_csv_to_json_conversion {
}

/// Validate JSON instance against compiled JSON schema
fn validate_json_instance(instance: &Value, schema_compiled: &JSONSchema) -> Result<Value> {
let output: BasicOutput = schema_compiled.apply(instance).basic();

match serde_json::to_value(output) {
Ok(json) => Ok(json),
Err(e) => Err(anyhow!(
"Cannot convert schema validation output to json: {e}"
)),
/// If invalid, returns Some(Vec<(String,String)>) holding the error messages
fn validate_json_instance(
instance: &Value,
schema_compiled: &JSONSchema,
) -> Option<Vec<(String, String)>> {
let validation_output = schema_compiled.apply(instance);

// If validation output is Invalid, then grab field names and errors
if !validation_output.flag() {
// collect validation errors as (field, error) tuples
let validation_errors: Vec<(String, String)> = match validation_output.basic() {
BasicOutput::Invalid(errors) => errors
.iter()
.map(|e| {
if let Some(PathChunk::Property(box_str)) = e.instance_location().last() {
(box_str.to_string(), e.error_description().to_string())
} else {
(
e.instance_location().to_string(),
e.error_description().to_string(),
)
}
})
.collect(),
BasicOutput::Valid(_annotations) => {
// shouldn't happen
panic!("Unexpected error.");
}
};

Some(validation_errors)
} else {
None
}
}

@@ -668,9 +691,9 @@ mod tests_for_schema_validation {

let instance = to_json_instance(&headers, &record, &schema_json()).unwrap();

let result = validate_json_instance(&instance, &compiled_schema()).unwrap();
let result = validate_json_instance(&instance, &compiled_schema());

assert_eq!(true, result["valid"].as_bool().unwrap());
assert!(result.is_none());
}

#[test]
@@ -685,9 +708,17 @@

let instance = to_json_instance(&headers, &record, &schema_json()).unwrap();

let result = validate_json_instance(&instance, &compiled_schema()).unwrap();
let result = validate_json_instance(&instance, &compiled_schema());

assert!(result.is_some());

assert_eq!(false, result["valid"].as_bool().unwrap());
assert_eq!(
vec![(
"name".to_string(),
"\"X\" is shorter than 2 characters".to_string()
)],
result.unwrap()
);
}
}

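As a reading aid (not part of the PR), here is a minimal, self-contained sketch of the extraction the new `validate_json_instance` performs, reusing only the `jsonschema` calls that appear in the diff above (`apply`, `flag`, `basic`, `BasicOutput::Invalid`, `instance_location`, `error_description`, `PathChunk::Property`). The schema, instance, and row number are made-up example values.

```rust
// Illustrative sketch only, not part of the PR: maps a failing instance to
// (field, error) pairs and prints them as rows of the .validation-errors.tsv
// report described above.
use jsonschema::{output::BasicOutput, paths::PathChunk, JSONSchema};
use serde_json::json;

fn main() {
    // Example schema and instance, invented for this sketch.
    let schema = json!({
        "type": "object",
        "properties": { "name": { "type": "string", "minLength": 2 } }
    });
    let compiled = JSONSchema::compile(&schema).expect("schema should compile");
    let instance = json!({ "name": "X" });

    let output = compiled.apply(&instance);
    // flag() is the cheap valid/invalid check; basic() is only materialized
    // when there are errors to report, which is where the speed-up comes from.
    if !output.flag() {
        if let BasicOutput::Invalid(errors) = output.basic() {
            for e in errors.iter() {
                // Prefer the property name from the instance location;
                // fall back to the full JSON pointer.
                let field = match e.instance_location().last() {
                    Some(PathChunk::Property(name)) => name.to_string(),
                    _ => e.instance_location().to_string(),
                };
                // One TSV row per error: row_number, field, error
                // ("1" stands in for the real row number).
                println!("1\t{}\t{}", field, e.error_description());
            }
        }
    }
}
```
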
7 changes: 4 additions & 3 deletions tests/test_schema.rs
@@ -44,12 +44,13 @@ fn generate_schema_with_value_constraints_then_feed_into_validate() {
wrk.output(&mut cmd2);

// validation report
let validation_errors_expected = r#"{"valid":false,"errors":[{"keywordLocation":"/properties/ExtractDate/type","instanceLocation":"/ExtractDate","error":"null is not of type \"string\""},{"keywordLocation":"/properties/ExtractDate/enum","instanceLocation":"/ExtractDate","error":"null is not one of [\"07/07/2014 00:00\",\"2014-07-07 00:00\"]"}],"row_number":1}
"#;
let validation_errors_expected = "row_number\tfield\terror\n\
1\tExtractDate\tnull is not of type \"string\"\n\
1\tExtractDate\tnull is not one of [\"07/07/2014 00:00\",\"2014-07-07 00:00\"]\n";

// check validation error output
let validation_error_output: String =
wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.jsonl"));
wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.tsv"));

assert!(validation_error_output.len() > 0);

30 changes: 12 additions & 18 deletions tests/test_validate.rs
@@ -36,6 +36,14 @@ fn validate_bad_csv() {
wrk.assert_err(&mut cmd);
}

fn adur_errors() -> &'static str {
"row_number\tfield\terror\n\
1\tExtractDate\tnull is not of type \"string\"\n\
1\tOrganisationLabel\tnull is not of type \"string\"\n\
3\tCoordinateReferenceSystem\t\"OSGB3\" does not match \"(WGS84|OSGB36)\"\n\
3\tCategory\t\"Mens\" does not match \"(Female|Male|Female and Male|Unisex|Male urinal|Children only|None)\"\n"
}

#[test]
fn validate_adur_public_toilets_dataset_with_json_schema() {
let wrk = Workdir::new("validate").flexible(true);
@@ -69,15 +77,8 @@ fn validate_adur_public_toilets_dataset_with_json_schema() {

// check validation error output

let validation_errors_expected = r#"{"valid":false,"errors":[{"keywordLocation":"/properties/ExtractDate/type","instanceLocation":"/ExtractDate","absoluteKeywordLocation":"https://example.com/properties/ExtractDate/type","error":"null is not of type \"string\""},{"keywordLocation":"/properties/OrganisationLabel/type","instanceLocation":"/OrganisationLabel","absoluteKeywordLocation":"https://example.com/properties/OrganisationLabel/type","error":"null is not of type \"string\""}],"row_number":1}
{"valid":false,"errors":[{"keywordLocation":"/properties/CoordinateReferenceSystem/pattern","instanceLocation":"/CoordinateReferenceSystem","absoluteKeywordLocation":"https://example.com/properties/CoordinateReferenceSystem/pattern","error":"\"OSGB3\" does not match \"(WGS84|OSGB36)\""},{"keywordLocation":"/properties/Category/pattern","instanceLocation":"/Category","absoluteKeywordLocation":"https://example.com/properties/Category/pattern","error":"\"Mens\" does not match \"(Female|Male|Female and Male|Unisex|Male urinal|Children only|None)\""}],"row_number":3}
"#;
let validation_error_output: String =
wrk.from_str(&wrk.path("data.csv.validation-errors.jsonl"));
assert_eq!(
validation_errors_expected.to_string(),
validation_error_output
);
let validation_error_output: String = wrk.from_str(&wrk.path("data.csv.validation-errors.tsv"));
assert_eq!(adur_errors(), validation_error_output);
}

#[test]
@@ -103,13 +104,6 @@ fn validate_adur_public_toilets_dataset_with_json_schema_url() {

// check validation error output

let validation_errors_expected = r#"{"valid":false,"errors":[{"keywordLocation":"/properties/ExtractDate/type","instanceLocation":"/ExtractDate","absoluteKeywordLocation":"https://example.com/properties/ExtractDate/type","error":"null is not of type \"string\""},{"keywordLocation":"/properties/OrganisationLabel/type","instanceLocation":"/OrganisationLabel","absoluteKeywordLocation":"https://example.com/properties/OrganisationLabel/type","error":"null is not of type \"string\""}],"row_number":1}
{"valid":false,"errors":[{"keywordLocation":"/properties/CoordinateReferenceSystem/pattern","instanceLocation":"/CoordinateReferenceSystem","absoluteKeywordLocation":"https://example.com/properties/CoordinateReferenceSystem/pattern","error":"\"OSGB3\" does not match \"(WGS84|OSGB36)\""},{"keywordLocation":"/properties/Category/pattern","instanceLocation":"/Category","absoluteKeywordLocation":"https://example.com/properties/Category/pattern","error":"\"Mens\" does not match \"(Female|Male|Female and Male|Unisex|Male urinal|Children only|None)\""}],"row_number":3}
"#;
let validation_error_output: String =
wrk.from_str(&wrk.path("data.csv.validation-errors.jsonl"));
assert_eq!(
validation_errors_expected.to_string(),
validation_error_output
);
let validation_error_output: String = wrk.from_str(&wrk.path("data.csv.validation-errors.tsv"));
assert_eq!(adur_errors(), validation_error_output);
}
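
Not part of this PR, but as a usage sketch of the new format: because the report is now tab-separated with a `row_number\tfield\terror` header, it can be read back with the `csv` crate the project already depends on. The file name below follows the `<input>.validation-errors.tsv` naming from the diff; everything else is illustrative.

```rust
// Illustrative only: reading the new TSV error report back with the csv crate.
use csv::ReaderBuilder;
use std::error::Error;

fn main() -> Result<(), Box<dyn Error>> {
    // The report is written as plain, unquoted TSV with a header row
    // (row_number, field, error), so use a tab delimiter and disable
    // quote handling (error messages may start with a double quote).
    let mut rdr = ReaderBuilder::new()
        .delimiter(b'\t')
        .quoting(false)
        .has_headers(true)
        .from_path("mydata.csv.validation-errors.tsv")?;

    for result in rdr.records() {
        let record = result?;
        // Columns match the header written by write_error_report.
        eprintln!("row {}: {} -> {}", &record[0], &record[1], &record[2]);
    }
    Ok(())
}
```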