From 937a2dd78b64af22a5e22b0d0b12045b251affff Mon Sep 17 00:00:00 2001 From: Michael Huang Date: Tue, 22 Feb 2022 15:29:24 +0800 Subject: [PATCH 1/2] avoid json serde for valid rec; simplify err output --- src/cmd/validate.rs | 90 +++++++++++++++++++++++++++--------------- tests/test_schema.rs | 7 ++-- tests/test_validate.rs | 22 ++++++----- 3 files changed, 74 insertions(+), 45 deletions(-) diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index dbb9148e6..d980b6351 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -5,6 +5,7 @@ use crate::CliResult; use anyhow::{anyhow, Result}; use csv::ByteRecord; use indicatif::{ProgressBar, ProgressDrawTarget}; +use jsonschema::paths::PathChunk; use jsonschema::{output::BasicOutput, JSONSchema}; #[allow(unused_imports)] use log::{debug, info}; @@ -29,7 +30,7 @@ Example output files from `mydata.csv`. If piped from stdin, then filename is `s * mydata.csv.valid * mydata.csv.invalid -* mydata.csv.validation-errors.jsonl +* mydata.csv.validation-errors.tsv JSON Schema can be a local file or a URL. @@ -180,6 +181,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let batch_size = batch.len(); // do actual validation via Rayon parallel iterator + // validation_results vector should have same row count and in same order as input CSV let validation_results: Vec> = batch .par_iter() .map(|record| { @@ -328,10 +330,12 @@ fn write_error_report(input_path: &str, validation_error_messages: Vec) .unwrap_or_else(|_| DEFAULT_WTR_BUFFER_CAPACITY.to_string()); let wtr_buffer_size: usize = wtr_capacitys.parse().unwrap_or(DEFAULT_WTR_BUFFER_CAPACITY); - let output_file = File::create(input_path.to_owned() + ".validation-errors.jsonl")?; + let output_file = File::create(input_path.to_owned() + ".validation-errors.tsv")?; let mut output_writer = BufWriter::with_capacity(wtr_buffer_size, output_file); + output_writer.write_all("row_number\tfield\terror\n".as_bytes())?; + // write out error report for error_msg in validation_error_messages { output_writer.write_all(error_msg.as_bytes())?; @@ -368,25 +372,23 @@ fn do_json_validation( // debug!("instance[{row_number}]: {instance:?}"); match validate_json_instance(&instance, schema_compiled) { - Ok(mut validation_result) => { - let is_valid = validation_result.get("valid").unwrap().as_bool().unwrap(); - - if is_valid { - Ok(None) - } else { - // debug!("row[{row_number}] is invalid"); - - // insert row index and return validation error - validation_result.as_object_mut().unwrap().insert( - "row_number".to_string(), - Value::Number(Number::from(row_number)), - ); - - Ok(Some(validation_result.to_string())) - } + Some(validation_errors) => { + + use itertools::Itertools; + // squash multiple errors into one long String with linebreaks + let combined_errors: String = validation_errors + .iter() + .map(|tuple| { + // validation error file format: row_number, field, error + format!("{}\t{}\t{}", row_number_string, tuple.0, tuple.1) + }) + .join("\n"); + + Ok(Some(combined_errors)) + } - Err(e) => { - return fail!(format!("Unable to validate. row: {row_number}, error: {e}")); + None => { + Ok(None) } } } @@ -608,15 +610,37 @@ mod tests_for_csv_to_json_conversion { } /// Validate JSON instance against compiled JSON schema -fn validate_json_instance(instance: &Value, schema_compiled: &JSONSchema) -> Result { - let output: BasicOutput = schema_compiled.apply(instance).basic(); - - match serde_json::to_value(output) { - Ok(json) => Ok(json), - Err(e) => Err(anyhow!( - "Cannot convert schema validation output to json: {e}" - )), +/// If invalid, returns Some(Vec<(String,String)>) holding the error messages +fn validate_json_instance(instance: &Value, schema_compiled: &JSONSchema) -> Option> { + let output = schema_compiled.apply(instance); + + if output.flag() != true { + // get validation errors as String + let validation_errors: Vec<(String,String)> = match output.basic() { + BasicOutput::Invalid(errors) => { + + errors + .iter() + .map(move |e| { + if let Some(PathChunk::Property(box_str)) = e.instance_location().last() { + (box_str.to_string(), e.error_description().to_string()) + } else { + (e.instance_location().to_string(), e.error_description().to_string()) + } + }) + .collect() + }, + BasicOutput::Valid(_annotations) => { + // shouln't happen + panic!("Unexpected error."); + } + }; + + Some(validation_errors) + } else { + None } + } #[cfg(test)] @@ -668,9 +692,9 @@ mod tests_for_schema_validation { let instance = to_json_instance(&headers, &record, &schema_json()).unwrap(); - let result = validate_json_instance(&instance, &compiled_schema()).unwrap(); + let result = validate_json_instance(&instance, &compiled_schema()); - assert_eq!(true, result["valid"].as_bool().unwrap()); + assert!(result.is_none()); } #[test] @@ -685,9 +709,11 @@ mod tests_for_schema_validation { let instance = to_json_instance(&headers, &record, &schema_json()).unwrap(); - let result = validate_json_instance(&instance, &compiled_schema()).unwrap(); + let result = validate_json_instance(&instance, &compiled_schema()); + + assert!(result.is_some()); - assert_eq!(false, result["valid"].as_bool().unwrap()); + assert_eq!(vec![("name".to_string(), "\"X\" is shorter than 2 characters".to_string())], result.unwrap()); } } diff --git a/tests/test_schema.rs b/tests/test_schema.rs index 7f1be4ba0..44016b0ee 100644 --- a/tests/test_schema.rs +++ b/tests/test_schema.rs @@ -44,12 +44,13 @@ fn generate_schema_with_value_constraints_then_feed_into_validate() { wrk.output(&mut cmd2); // validation report - let validation_errors_expected = r#"{"valid":false,"errors":[{"keywordLocation":"/properties/ExtractDate/type","instanceLocation":"/ExtractDate","error":"null is not of type \"string\""},{"keywordLocation":"/properties/ExtractDate/enum","instanceLocation":"/ExtractDate","error":"null is not one of [\"07/07/2014 00:00\",\"2014-07-07 00:00\"]"}],"row_number":1} -"#; + let validation_errors_expected = "row_number\tfield\terror\n\ + 1\tExtractDate\tnull is not of type \"string\"\n\ + 1\tExtractDate\tnull is not one of [\"07/07/2014 00:00\",\"2014-07-07 00:00\"]\n"; // check validation error output let validation_error_output: String = - wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.jsonl")); + wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.tsv")); assert!(validation_error_output.len() > 0); diff --git a/tests/test_validate.rs b/tests/test_validate.rs index 923ba9664..5a305e9aa 100644 --- a/tests/test_validate.rs +++ b/tests/test_validate.rs @@ -36,6 +36,14 @@ fn validate_bad_csv() { wrk.assert_err(&mut cmd); } +fn adur_errors() -> &'static str { + "row_number\tfield\terror\n\ + 1\tExtractDate\tnull is not of type \"string\"\n\ + 1\tOrganisationLabel\tnull is not of type \"string\"\n\ + 3\tCoordinateReferenceSystem\t\"OSGB3\" does not match \"(WGS84|OSGB36)\"\n\ + 3\tCategory\t\"Mens\" does not match \"(Female|Male|Female and Male|Unisex|Male urinal|Children only|None)\"\n" +} + #[test] fn validate_adur_public_toilets_dataset_with_json_schema() { let wrk = Workdir::new("validate").flexible(true); @@ -69,13 +77,10 @@ fn validate_adur_public_toilets_dataset_with_json_schema() { // check validation error output - let validation_errors_expected = r#"{"valid":false,"errors":[{"keywordLocation":"/properties/ExtractDate/type","instanceLocation":"/ExtractDate","absoluteKeywordLocation":"https://example.com/properties/ExtractDate/type","error":"null is not of type \"string\""},{"keywordLocation":"/properties/OrganisationLabel/type","instanceLocation":"/OrganisationLabel","absoluteKeywordLocation":"https://example.com/properties/OrganisationLabel/type","error":"null is not of type \"string\""}],"row_number":1} -{"valid":false,"errors":[{"keywordLocation":"/properties/CoordinateReferenceSystem/pattern","instanceLocation":"/CoordinateReferenceSystem","absoluteKeywordLocation":"https://example.com/properties/CoordinateReferenceSystem/pattern","error":"\"OSGB3\" does not match \"(WGS84|OSGB36)\""},{"keywordLocation":"/properties/Category/pattern","instanceLocation":"/Category","absoluteKeywordLocation":"https://example.com/properties/Category/pattern","error":"\"Mens\" does not match \"(Female|Male|Female and Male|Unisex|Male urinal|Children only|None)\""}],"row_number":3} -"#; let validation_error_output: String = - wrk.from_str(&wrk.path("data.csv.validation-errors.jsonl")); + wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); assert_eq!( - validation_errors_expected.to_string(), + adur_errors(), validation_error_output ); } @@ -103,13 +108,10 @@ fn validate_adur_public_toilets_dataset_with_json_schema_url() { // check validation error output - let validation_errors_expected = r#"{"valid":false,"errors":[{"keywordLocation":"/properties/ExtractDate/type","instanceLocation":"/ExtractDate","absoluteKeywordLocation":"https://example.com/properties/ExtractDate/type","error":"null is not of type \"string\""},{"keywordLocation":"/properties/OrganisationLabel/type","instanceLocation":"/OrganisationLabel","absoluteKeywordLocation":"https://example.com/properties/OrganisationLabel/type","error":"null is not of type \"string\""}],"row_number":1} -{"valid":false,"errors":[{"keywordLocation":"/properties/CoordinateReferenceSystem/pattern","instanceLocation":"/CoordinateReferenceSystem","absoluteKeywordLocation":"https://example.com/properties/CoordinateReferenceSystem/pattern","error":"\"OSGB3\" does not match \"(WGS84|OSGB36)\""},{"keywordLocation":"/properties/Category/pattern","instanceLocation":"/Category","absoluteKeywordLocation":"https://example.com/properties/Category/pattern","error":"\"Mens\" does not match \"(Female|Male|Female and Male|Unisex|Male urinal|Children only|None)\""}],"row_number":3} -"#; let validation_error_output: String = - wrk.from_str(&wrk.path("data.csv.validation-errors.jsonl")); + wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); assert_eq!( - validation_errors_expected.to_string(), + adur_errors(), validation_error_output ); } From 38d8bcbb0181461b8f7d7c07cbd6eb5ee0e56a4b Mon Sep 17 00:00:00 2001 From: Michael Huang Date: Tue, 22 Feb 2022 16:04:01 +0800 Subject: [PATCH 2/2] clippy and fmt --- src/cmd/validate.rs | 57 +++++++++++++++++++++++------------------- tests/test_validate.rs | 16 +++--------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index d980b6351..b6e0af68e 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -373,7 +373,6 @@ fn do_json_validation( match validate_json_instance(&instance, schema_compiled) { Some(validation_errors) => { - use itertools::Itertools; // squash multiple errors into one long String with linebreaks let combined_errors: String = validation_errors @@ -381,15 +380,12 @@ fn do_json_validation( .map(|tuple| { // validation error file format: row_number, field, error format!("{}\t{}\t{}", row_number_string, tuple.0, tuple.1) - }) + }) .join("\n"); - + Ok(Some(combined_errors)) - - } - None => { - Ok(None) } + None => Ok(None), } } @@ -611,25 +607,29 @@ mod tests_for_csv_to_json_conversion { /// Validate JSON instance against compiled JSON schema /// If invalid, returns Some(Vec<(String,String)>) holding the error messages -fn validate_json_instance(instance: &Value, schema_compiled: &JSONSchema) -> Option> { - let output = schema_compiled.apply(instance); +fn validate_json_instance( + instance: &Value, + schema_compiled: &JSONSchema, +) -> Option> { + let validation_output = schema_compiled.apply(instance); - if output.flag() != true { + // If validation output is Invalid, then grab field names and errors + if !validation_output.flag() { // get validation errors as String - let validation_errors: Vec<(String,String)> = match output.basic() { - BasicOutput::Invalid(errors) => { - - errors - .iter() - .map(move |e| { - if let Some(PathChunk::Property(box_str)) = e.instance_location().last() { - (box_str.to_string(), e.error_description().to_string()) - } else { - (e.instance_location().to_string(), e.error_description().to_string()) - } - }) - .collect() - }, + let validation_errors: Vec<(String, String)> = match validation_output.basic() { + BasicOutput::Invalid(errors) => errors + .iter() + .map(|e| { + if let Some(PathChunk::Property(box_str)) = e.instance_location().last() { + (box_str.to_string(), e.error_description().to_string()) + } else { + ( + e.instance_location().to_string(), + e.error_description().to_string(), + ) + } + }) + .collect(), BasicOutput::Valid(_annotations) => { // shouln't happen panic!("Unexpected error."); @@ -640,7 +640,6 @@ fn validate_json_instance(instance: &Value, schema_compiled: &JSONSchema) -> Opt } else { None } - } #[cfg(test)] @@ -713,7 +712,13 @@ mod tests_for_schema_validation { assert!(result.is_some()); - assert_eq!(vec![("name".to_string(), "\"X\" is shorter than 2 characters".to_string())], result.unwrap()); + assert_eq!( + vec![( + "name".to_string(), + "\"X\" is shorter than 2 characters".to_string() + )], + result.unwrap() + ); } } diff --git a/tests/test_validate.rs b/tests/test_validate.rs index 5a305e9aa..5e246dddd 100644 --- a/tests/test_validate.rs +++ b/tests/test_validate.rs @@ -77,12 +77,8 @@ fn validate_adur_public_toilets_dataset_with_json_schema() { // check validation error output - let validation_error_output: String = - wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); - assert_eq!( - adur_errors(), - validation_error_output - ); + let validation_error_output: String = wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); + assert_eq!(adur_errors(), validation_error_output); } #[test] @@ -108,10 +104,6 @@ fn validate_adur_public_toilets_dataset_with_json_schema_url() { // check validation error output - let validation_error_output: String = - wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); - assert_eq!( - adur_errors(), - validation_error_output - ); + let validation_error_output: String = wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); + assert_eq!(adur_errors(), validation_error_output); }