From ea2e1016a1592b3ee7e92e983097785cc85726a8 Mon Sep 17 00:00:00 2001 From: Rick Moynihan Date: Mon, 20 Nov 2023 14:38:20 +0000 Subject: [PATCH] Add --validate-data flag to run in validation mode In this mode we validate the data against the table schema and report any errors. No RDFization is done, and a return code is propagated to the shell upon failure. --- src/csv2rdf/csvw.clj | 47 +++++++++++++++++++++ src/csv2rdf/main.clj | 25 +++++++---- test/csv2rdf/main_test.clj | 41 ++++++++++++++++++ test/examples/validation/fail-1.csv | 6 +++ test/examples/validation/fail-2.csv | 5 +++ test/examples/validation/named-numbers.json | 18 ++++++++ test/examples/validation/success.csv | 6 +++ 7 files changed, 140 insertions(+), 8 deletions(-) create mode 100644 test/csv2rdf/main_test.clj create mode 100644 test/examples/validation/fail-1.csv create mode 100644 test/examples/validation/fail-2.csv create mode 100644 test/examples/validation/named-numbers.json create mode 100644 test/examples/validation/success.csv diff --git a/src/csv2rdf/csvw.clj b/src/csv2rdf/csvw.clj index dd81de1e..b6ab8009 100644 --- a/src/csv2rdf/csvw.clj +++ b/src/csv2rdf/csvw.clj @@ -18,6 +18,53 @@ annotated-rows (csv/annotated-rows url table dialect)] (table-statements context table annotated-rows))) +(defn annotate-tables [tabular-source metadata-source] + (processing/get-metadata tabular-source metadata-source)) + +(defn- validate-rows + "Validates the CSVW schema for the given tabular file and metadata. + + `tabular-source` and `metadata-source` can be any of the following + types: + + - java.io.File + - java.lang.String + - java.net.URI + - java.nio.file.Path (including nio Paths that are inside zip filesystems) + + If metadata-source is non-nil then processing will start from the + associated metadata document, otherwise it will start from + tabular-source." 
+ [tabular-source metadata-source] + (let [{:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source) + table-group-dialect (:dialect metadata) + output-tables (remove properties/suppress-output? tables) + ;;ctx (table-group-context mode metadata) ;; TODO this might be useful later when iterating over tables + ] + + (util/liberal-mapcat (fn [{:keys [url dialect] :as table}] + ;;(validated-rows ctx table table-group-dialect) + (let [dialect (or dialect table-group-dialect)] + (csv/annotated-rows url table dialect))) + + output-tables))) + +(defn only-validate-schema + "Only validate the data against the schemas in the metadata file, and + report errors. Does not convert into RDF. + + Returns a map with the key `:data-validation-errors?` set to a + boolean indicating whether any schema errors occurred." + [{:keys [tabular-source metadata-source]}] + (let [errors? (atom false)] + (doseq [{:keys [cells] row-number :source-number :as row} (validate-rows tabular-source metadata-source) + {:keys [errors column-number column] :as cell} cells + :when (seq errors)] + (reset! errors? true) + (doseq [error errors] + (println (format "Row #%d col #%d (column '%s') has error: " row-number column-number (:name column)) error))) + {:data-validation-errors? @errors?})) + (defn csv->rdf "Runs the CSVW process for the given tabular or metadata data sources and options. diff --git a/src/csv2rdf/main.clj b/src/csv2rdf/main.clj index 01dab8ca..13d5f9bb 100644 --- a/src/csv2rdf/main.clj +++ b/src/csv2rdf/main.clj @@ -12,6 +12,8 @@ (def options-spec [["-t" "--tabular TABULAR" "Location of the tabular file"] ["-u" "--user-metadata METADATA" "Location of the metadata file"] + ["-s" "--validate-schema" "Validate the schema only"] + ["-d" "--validate-data" "Validate the data against the schema only (no RDFization)"] ["-o" "--output-file OUTPUT" "Output file to write to"] ["-m" "--mode MODE" "CSVW mode to run" :validate [#(contains? 
#{:minimal :standard :annotated} %)] @@ -67,23 +69,29 @@ (println "Usage:") (println summary))) -(defn- inner-main [args] + + +(defn inner-main [args] (let [options (parse-cli-options args) - {:keys [mode tabular user-metadata output-file]} options + {:keys [mode tabular user-metadata output-file validate-data annotate-tables]} options opts {:tabular-source (some-> tabular parse-source) :metadata-source (some-> user-metadata parse-source) :rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE) :mode mode} output-file (some-> output-file io/file)] - (if output-file - (with-open [w (io/writer output-file)] - (write-output w opts)) - (write-output (io/writer *out*) opts)))) + + (cond validate-data (csvw/only-validate-schema opts) + + :else (if output-file + (with-open [w (io/writer output-file)] + (write-output w opts)) + (write-output (io/writer *out*) opts))))) (defn- -main [& args] (try - (inner-main args) - (System/exit 0) + (if (:data-validation-errors? (inner-main args)) + (System/exit 2) + (System/exit 0)) (catch Throwable ex (display-error ex) (System/exit 1)))) @@ -91,6 +99,7 @@ (comment + (inner-main ["-s" "-t" "/Users/rick/repos/dclg-epcs/resources/public/csvw/basic/certificates.csv" "-u" "/Users/rick/repos/dclg-epcs/resources/public/csvw/basic/epc_domestic.json"]) (time (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"])) (require '[clj-async-profiler.core :as prof]) diff --git a/test/csv2rdf/main_test.clj b/test/csv2rdf/main_test.clj new file mode 100644 index 00000000..61fda0ff --- /dev/null +++ b/test/csv2rdf/main_test.clj @@ -0,0 +1,41 @@ +(ns csv2rdf.main-test + (:require [csv2rdf.main :as sut] + [clojure.test :as t])) + +;; Tests for the --validate-data command line flag: +;; checks the value returned by inner-main and the errors printed to stdout + +(defmacro capture + "Capture return value of body and stdout, and return a hashmap + of :return-value and :stdout." 
+ [body] + `(let [s# (new java.io.StringWriter)] + (binding [*out* s#] + (let [ret# ~body] + {:return-value ret# + :stdout (str s#)})))) + +(t/deftest inner-main-test-validate-data + (t/testing "--validate-data") + (let [{:keys [return-value stdout]} + (capture (sut/inner-main ["-t" "./test/examples/validation/success.csv" + "-u" "./test/examples/validation/named-numbers.json" + "--validate-data"]))] + (t/is (= {:data-validation-errors? false} return-value)) + (t/is (= "" stdout))) + + (let [{:keys [return-value stdout]} + (capture (sut/inner-main ["-t" "./test/examples/validation/fail-1.csv" + "-u" "./test/examples/validation/named-numbers.json" + "--validate-data"]))] + (t/is (= {:data-validation-errors? true} return-value)) + (t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'two' as type 'int': For input string: \"two\"\n" + stdout))) + + (let [{:keys [return-value stdout]} + (capture (sut/inner-main ["-t" "./test/examples/validation/fail-2.csv" + "-u" "./test/examples/validation/named-numbers.json" + "--validate-data"]))] + (t/is (= {:data-validation-errors? 
true} return-value)) + (t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'three' as type 'int': For input string: \"three\"\n" + stdout)))) diff --git a/test/examples/validation/fail-1.csv b/test/examples/validation/fail-1.csv new file mode 100644 index 00000000..a9227701 --- /dev/null +++ b/test/examples/validation/fail-1.csv @@ -0,0 +1,6 @@ +name,number +one,1 +two,two +three,3 +four,4 +five,5 diff --git a/test/examples/validation/fail-2.csv b/test/examples/validation/fail-2.csv new file mode 100644 index 00000000..14f37e58 --- /dev/null +++ b/test/examples/validation/fail-2.csv @@ -0,0 +1,5 @@ +name,number +one,1 +3,three +four,4 +five,5 diff --git a/test/examples/validation/named-numbers.json b/test/examples/validation/named-numbers.json new file mode 100644 index 00000000..e4083bbd --- /dev/null +++ b/test/examples/validation/named-numbers.json @@ -0,0 +1,18 @@ +{ + "@context": "http://www.w3.org/ns/csvw", + "url": "fail-2.csv", + "tableSchema": { + "columns": [ + { + "name": "name", + "datatype": "string", + "required": true + }, + { + "name": "number", + "required": true, + "datatype": "int" + } + ] + } + } diff --git a/test/examples/validation/success.csv b/test/examples/validation/success.csv new file mode 100644 index 00000000..fb71b17f --- /dev/null +++ b/test/examples/validation/success.csv @@ -0,0 +1,6 @@ +name,number +one,1 +two,2 +three,3 +four,4 +five,5