Skip to content

Commit

Permalink
Add --validate-data flag to run in validation mode
Browse files Browse the repository at this point in the history
In this mode we validate the data against the table schema and report
any errors.  No RDFization is done, and a return code is propagated
to the shell upon failure.
  • Loading branch information
RickMoynihan committed Nov 20, 2023
1 parent 9fcfa8c commit ea2e101
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 8 deletions.
47 changes: 47 additions & 0 deletions src/csv2rdf/csvw.clj
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,53 @@
annotated-rows (csv/annotated-rows url table dialect)]
(table-statements context table annotated-rows)))

(defn annotate-tables
  "Resolves the metadata for the given sources by delegating to
  `processing/get-metadata`, returning whatever that call returns."
  [tabular-source metadata-source]
  (processing/get-metadata tabular-source metadata-source))

(defn- validate-rows
  "Returns the annotated rows of every table, in the metadata resolved
  from the given sources, whose output is not suppressed.

  `tabular-source` and `metadata-source` can each be any of the
  following types:

  - java.io.File
  - java.lang.String
  - java.net.URI
  - java.nio.file.Path (including nio Paths that are inside zip filesystems)

  If metadata-source is non-nil then processing will start from the
  associated metadata document, otherwise it will start from
  tabular-source."
  [tabular-source metadata-source]
  (let [metadata (processing/get-metadata tabular-source metadata-source)
        group-dialect (:dialect metadata)
        ;; A table without a dialect of its own falls back to the table group's.
        rows-for-table (fn [table]
                         (csv/annotated-rows (:url table)
                                             table
                                             (or (:dialect table) group-dialect)))]
    ;; TODO: a table-group context (see `table-group-context`) might be
    ;; useful later when iterating over tables.
    (->> (:tables metadata)
         (remove properties/suppress-output?)
         (util/liberal-mapcat rows-for-table))))

(defn only-validate-schema
  "Checks the data against the schemas in the metadata file and prints
  one line to `*out*` for every cell error found. No RDF is produced.

  Takes a map with `:tabular-source` and `:metadata-source` keys.

  Returns a map whose `:data-validation-errors?` key is `true` when at
  least one cell carried errors, and `false` otherwise."
  [{:keys [tabular-source metadata-source]}]
  (letfn [(report-cell [row-number {:keys [errors column-number column]}]
            ;; Prints every error on the cell and says whether there were any.
            (doseq [error errors]
              (println (format "Row #%d col #%d (column '%s') has error: " row-number column-number (:name column)) error))
            (boolean (seq errors)))
          (report-row [failed? {:keys [cells] row-number :source-number}]
            (reduce (fn [failed-so-far? cell]
                      ;; Report first so later errors are still printed
                      ;; once an earlier cell has already failed.
                      (let [cell-failed? (report-cell row-number cell)]
                        (or failed-so-far? cell-failed?)))
                    failed?
                    cells))]
    {:data-validation-errors? (reduce report-row
                                      false
                                      (validate-rows tabular-source metadata-source))}))

(defn csv->rdf
"Runs the CSVW process for the given tabular or metadata data sources
and options.
Expand Down
25 changes: 17 additions & 8 deletions src/csv2rdf/main.clj
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
(def options-spec
[["-t" "--tabular TABULAR" "Location of the tabular file"]
["-u" "--user-metadata METADATA" "Location of the metadata file"]
["-s" "--validate-schema" "Validate the schema only"]
["-d" "--validate-data" "Validate the data against the schema only (no RDFization)"]
["-o" "--output-file OUTPUT" "Output file to write to"]
["-m" "--mode MODE" "CSVW mode to run"
:validate [#(contains? #{:minimal :standard :annotated} %)]
Expand Down Expand Up @@ -67,30 +69,37 @@
(println "Usage:")
(println summary)))

;; NOTE(review): this span looks like a rendered diff with removed and added
;; lines interleaved (two `inner-main` headers, two `:keys` destructurings and
;; two output branches) -- confirm against the real file before editing.
;; Visible behaviour: parses the CLI args, builds an `opts` map and then either
;; runs data validation or writes output.
(defn- inner-main [args]


(defn inner-main [args]
(let [options (parse-cli-options args)
{:keys [mode tabular user-metadata output-file]} options
;; NOTE(review): `annotate-tables` is destructured here but is not used in
;; the visible body.
{:keys [mode tabular user-metadata output-file validate-data annotate-tables]} options
opts {:tabular-source (some-> tabular parse-source)
:metadata-source (some-> user-metadata parse-source)
;; Turtle is used when the first operand of `or` yields nil/false.
:rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE)
:mode mode}
output-file (some-> output-file io/file)]
(if output-file
(with-open [w (io/writer output-file)]
(write-output w opts))
(write-output (io/writer *out*) opts))))

;; With validate-data set, the result of csvw/only-validate-schema is returned
;; and write-output is not called.
(cond validate-data (csvw/only-validate-schema opts)

;; Otherwise write to the output file when one was given, else to *out*.
:else (if output-file
(with-open [w (io/writer output-file)]
(write-output w opts))
(write-output (io/writer *out*) opts)))))

;; NOTE(review): this span looks like a rendered diff -- `(inner-main args)`
;; followed by `(System/exit 0)` appear to be the removed lines and the `if`
;; form their replacement; confirm against the real file.
;; The `if` form exits with status 2 when the result of inner-main has a truthy
;; :data-validation-errors? value and with status 0 otherwise. Any Throwable is
;; passed to display-error, after which the process exits with status 1.
(defn- -main [& args]
(try
(inner-main args)
(System/exit 0)
(if (:data-validation-errors? (inner-main args))
(System/exit 2)
(System/exit 0))
(catch Throwable ex
(display-error ex)
(System/exit 1))))


(comment

(inner-main ["-s" "-t" "/Users/rick/repos/dclg-epcs/resources/public/csvw/basic/certificates.csv" "-u" "/Users/rick/repos/dclg-epcs/resources/public/csvw/basic/epc_domestic.json"])
(time (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"]))

(require '[clj-async-profiler.core :as prof])
Expand Down
41 changes: 41 additions & 0 deletions test/csv2rdf/main_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
(ns csv2rdf.main-test
(:require [csv2rdf.main :as sut]
[clojure.test :as t]))

;; See issue 47
;; Resolving template property URIs with values containing spaces should work

(defmacro capture
  "Evaluates `body` with `*out*` rebound to a fresh StringWriter and
  returns a map of `:return-value` (the value of `body`) and `:stdout`
  (everything written to `*out*` during evaluation, as a string)."
  [body]
  `(let [out# (java.io.StringWriter.)
         result# (binding [*out* out#] ~body)]
     {:stdout (str out#)
      :return-value result#}))

;; Exercises `inner-main` in --validate-data mode against the fixtures in
;; test/examples/validation, checking both the returned map and what is
;; printed to stdout.
;;
;; All assertions live inside the `t/testing` form so that its context
;; string is included in failure reports; a `testing` form that is closed
;; before the assertions contributes nothing.
(t/deftest inner-main-test-validate-data
  (t/testing "--validate-data"
    (t/testing "with valid data reports no errors and prints nothing"
      (let [{:keys [return-value stdout]}
            (capture (sut/inner-main ["-t" "./test/examples/validation/success.csv"
                                      "-u" "./test/examples/validation/named-numbers.json"
                                      "--validate-data"]))]
        (t/is (= {:data-validation-errors? false} return-value))
        (t/is (= "" stdout))))

    (t/testing "with fail-1.csv reports the unparseable cell"
      (let [{:keys [return-value stdout]}
            (capture (sut/inner-main ["-t" "./test/examples/validation/fail-1.csv"
                                      "-u" "./test/examples/validation/named-numbers.json"
                                      "--validate-data"]))]
        (t/is (= {:data-validation-errors? true} return-value))
        (t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'two' as type 'int': For input string: \"two\"\n"
                 stdout))))

    (t/testing "with fail-2.csv reports the unparseable cell"
      (let [{:keys [return-value stdout]}
            (capture (sut/inner-main ["-t" "./test/examples/validation/fail-2.csv"
                                      "-u" "./test/examples/validation/named-numbers.json"
                                      "--validate-data"]))]
        (t/is (= {:data-validation-errors? true} return-value))
        (t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'three' as type 'int': For input string: \"three\"\n"
                 stdout))))))
6 changes: 6 additions & 0 deletions test/examples/validation/fail-1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name,number
one,1
two,two
three,3
four,4
five,5
5 changes: 5 additions & 0 deletions test/examples/validation/fail-2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name,number
one,1
3,three
four,4
five,5
18 changes: 18 additions & 0 deletions test/examples/validation/named-numbers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"@context": "http://www.w3.org/ns/csvw",
"url": "fail-2.csv",
"tableSchema": {
"columns": [
{
"name": "name",
"datatype": "string",
"required": true
},
{
"name": "number",
"required": true,
"datatype": "int"
}
]
}
}
6 changes: 6 additions & 0 deletions test/examples/validation/success.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name,number
one,1
two,2
three,3
four,4
five,5

0 comments on commit ea2e101

Please sign in to comment.