From 52bcb37d6f7fde905db630ea31f065e8b6086c11 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 3 Jun 2024 14:15:48 -0400 Subject: [PATCH] stop repeating field over and over #1 --- README.md | 1 - .../export/croissant/CroissantExporter.java | 6 +- .../cars/expected/cars-croissant.json | 77 +++---------------- 3 files changed, 15 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index bf81503..aa92dfb 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,6 @@ Same as above but use a JVM option in domain.xml such as the example below. ### Differences from Kaggle - I see an `encodingFormat` of `text/comma-separated-values`. Kind of curious about that since I think `text/csv` is more the MIME type that's on https://www.iana.org/assignments/media-types/media-types.xhtml and https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types . See https://github.com/IQSS/dataverse/issues/4943#issuecomment-2145333830 -- One big difference I see is that you have many `recordSets` (and each one containing a single `field`) despite there being only 1 CSV. My understanding was that a `recordSet` maps roughly to a table and a `field` maps roughly to a column. So you'll see that our implementation has only 1 `recordSet` with many `field`s. This might be a good thing to get clarification on. - Another thing that sticks out is that I see all of the `field`s have a `dataType` of `sc:Integer`. But nearly all of the columns (excluding `quality` and `Id`) are `sc:Float`. On the Kaggle side, we have a column type of "Id" and so if that's set on a column, we set the `dataType` to `sc:Text` since Ids can often be non-numerical. Just a minor difference there, though, so nothing alarming to me personally. ### Differences from pyDataverse diff --git a/src/main/java/io/gdcc/spi/export/croissant/CroissantExporter.java b/src/main/java/io/gdcc/spi/export/croissant/CroissantExporter.java index 88ca2ac..d38cb31 100644 --- a/src/main/java/io/gdcc/spi/export/croissant/CroissantExporter.java +++ b/src/main/java/io/gdcc/spi/export/croissant/CroissantExporter.java @@ -193,6 +193,8 @@ public void exportDataset(ExportDataProvider dataProvider, OutputStream outputSt int fileCounter = 0; for (JsonValue jsonValue : datasetFileDetails) { + JsonObjectBuilder recordSetContent = Json.createObjectBuilder(); + recordSetContent.add("@type", "cr:RecordSet"); JsonObject fileDetails = jsonValue.asJsonObject(); /** * When there is an originalFileName, it means that the file has gone through ingest @@ -306,9 +308,9 @@ public void exportDataset(ExportDataProvider dataProvider, OutputStream outputSt "fileObject", Json.createObjectBuilder() .add("@id", fileId)))); - fieldSetObject.add("field", fieldSetArray); - recordSet.add(fieldSetObject); } + recordSetContent.add("field", fieldSetArray); + recordSet.add(recordSetContent); fileIndex++; } fileCounter++; diff --git a/src/test/resources/cars/expected/cars-croissant.json b/src/test/resources/cars/expected/cars-croissant.json index 6ddfc94..fdcb906 100644 --- a/src/test/resources/cars/expected/cars-croissant.json +++ b/src/test/resources/cars/expected/cars-croissant.json @@ -126,12 +126,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "price", @@ -143,12 +138,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "mpg", @@ -160,12 +150,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "rep78", @@ -177,12 +162,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "headroom", @@ -194,12 +174,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "trunk", @@ -211,12 +186,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "weight", @@ -228,12 +198,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "length", @@ -245,12 +210,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "turn", @@ -262,12 +222,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "displacement", @@ -279,12 +234,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "gear_ratio", @@ -296,12 +246,7 @@ "@id": "data/stata13-auto.dta" } } - } - ] - }, - { - "@type": "cr:RecordSet", - "field": [ + }, { "@type": "cr:Field", "name": "foreign",