From 58ec9e2004bfbdb3427a5d30f2ae92cb3c87b88c Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 8 Sep 2024 01:21:57 +0200 Subject: [PATCH] Feather: add read support for StringView and BinaryView (but not in OGR generic Arrow code) --- autotest/generate_parquet_test_file.py | 89 +++++++++++++++ autotest/ogr/data/arrow/binaryview.feather | Bin 0 -> 618 bytes autotest/ogr/data/arrow/largelistview.feather | Bin 0 -> 762 bytes autotest/ogr/data/arrow/listview.feather | Bin 0 -> 746 bytes autotest/ogr/data/arrow/stringview.feather | Bin 0 -> 2618 bytes autotest/ogr/ogr_arrow.py | 55 +++++++++ autotest/ogr/ogr_parquet.py | 39 +++++++ .../arrow_common/ograrrowlayer.hpp | 108 ++++++++++++++++-- .../parquet/ogrparquetwriterlayer.cpp | 22 ++++ 9 files changed, 306 insertions(+), 7 deletions(-) create mode 100644 autotest/ogr/data/arrow/binaryview.feather create mode 100644 autotest/ogr/data/arrow/largelistview.feather create mode 100644 autotest/ogr/data/arrow/listview.feather create mode 100644 autotest/ogr/data/arrow/stringview.feather diff --git a/autotest/generate_parquet_test_file.py b/autotest/generate_parquet_test_file.py index 71e226d4ed9e..fd233ca21e55 100644 --- a/autotest/generate_parquet_test_file.py +++ b/autotest/generate_parquet_test_file.py @@ -1245,6 +1245,91 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): ) +def generate_arrow_stringview(): + import pathlib + + import pyarrow as pa + import pyarrow.feather as feather + + stringview = pa.array(["foo", "bar", "looooooooooong string"], pa.string_view()) + list_stringview = pa.array( + [None, [None], ["foo", "bar", "looooooooooong string"]], + pa.list_(pa.string_view()), + ) + list_of_list_stringview = pa.array( + [None, [None], [["foo", "bar", "looooooooooong string"]]], + pa.list_(pa.list_(pa.string_view())), + ) + map_stringview = pa.array( + [None, [], [("x", "x_val"), ("y", None)]], + type=pa.map_(pa.string_view(), pa.string_view()), + ) + + names = [ + "stringview", + 
"list_stringview", + "list_of_list_stringview", + "map_stringview", + ] + + locals_ = locals() + table = pa.table([locals_[x] for x in names], names=names) + + HERE = pathlib.Path(__file__).parent + feather.write_feather(table, HERE / "ogr/data/arrow/stringview.feather") + + +def generate_arrow_binaryview(): + import pathlib + + import pyarrow as pa + import pyarrow.feather as feather + + binaryview = pa.array([b"foo", b"bar", b"looooooooooong binary"], pa.binary_view()) + + names = ["binaryview"] + + locals_ = locals() + table = pa.table([locals_[x] for x in names], names=names) + + HERE = pathlib.Path(__file__).parent + feather.write_feather(table, HERE / "ogr/data/arrow/binaryview.feather") + + +def generate_arrow_listview(): + import pathlib + + import pyarrow as pa + import pyarrow.feather as feather + + listview = pa.array([[1]], pa.list_view(pa.int32())) + + names = ["listview"] + + locals_ = locals() + table = pa.table([locals_[x] for x in names], names=names) + + HERE = pathlib.Path(__file__).parent + feather.write_feather(table, HERE / "ogr/data/arrow/listview.feather") + + +def generate_arrow_largelistview(): + import pathlib + + import pyarrow as pa + import pyarrow.feather as feather + + largelistview = pa.array([[1]], pa.large_list_view(pa.int32())) + + names = ["largelistview"] + + locals_ = locals() + table = pa.table([locals_[x] for x in names], names=names) + + HERE = pathlib.Path(__file__).parent + feather.write_feather(table, HERE / "ogr/data/arrow/largelistview.feather") + + if __name__ == "__main__": generate_test_parquet() generate_all_geoms_parquet() @@ -1252,3 +1337,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): generate_nested_types() generate_extension_custom() generate_extension_json() + generate_arrow_stringview() + generate_arrow_binaryview() + generate_arrow_listview() + generate_arrow_largelistview() diff --git a/autotest/ogr/data/arrow/binaryview.feather b/autotest/ogr/data/arrow/binaryview.feather new file mode 
100644 index 0000000000000000000000000000000000000000..9f62bd82944c4fd7c8b8c7a4c5a2407336256330 GIT binary patch literal 618 zcmcIiJ!=9%5PcUnddQ($D2kO>n3M^mNEP%4h+t!@f{DW6fki&rkRoM%fNT->1|u_xBnhQd-(1tlq?oG3@&033jlo4|X+ ztQ3$XTE?20eQ$p2(axJ@v77)$${&D|3iJ_>BSXOxB}s`vRM4hwNgD!otR)&Sbv z8&g&B)_!b0+w~WZtk$PF6=OLbt#j#flb+_+mPg~h-K^hrE|^=eLOm%-O{077=`~vt zFz}lP>x_pK&*WDMx&$(%{l_`yo4j(=L-&pn@0vxT!#_D!yN`J0IJ$^qTzrrEf1bE& ztX)acm)F5+_DlFV Jw=uUjeggK;GpGOn literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/arrow/stringview.feather b/autotest/ogr/data/arrow/stringview.feather new file mode 100644 index 0000000000000000000000000000000000000000..43ab1534e0f82dc7d119ceff6de9d322653a54b5 GIT binary patch literal 2618 zcmeHJyKWOv5S>jN$3jlBC?=Rku{;zEq6iX5RA|#tfUqJYD3FYi*0B}GMzIqTDTRgj z07?qV6mIA#T~MUp15)q-l=%Y8Id^BG^~M%7NE8|C?%a9adF@@VuB| zg>X*tl9d_BVF&OspaoP8&x}5~6Z!6myuvzk5|QV>hC>*fFMz(OJc#8g);!i`^=+lo z-DaiAHM?o-H=;c`f=$82;^is7t2&IIxm?6*Hagu(yH;^HkqGU| z30vk%|FpHU8r^6Uq=eAhk9AMwKpWGmL+2pjlQEc2`d`3G$MG5ZM}Oh_8+79OWK;F= zTugzU1>|NRkt$-~F9x_74$v&-NXW}&ot(U@x=d$WPY>D96PO3D03SK%_--zKpiu1h zV=VTSJuzTkSx3U3Fm)_G&c-Boq=HcKvf{d8v+XI(+7tRQa&yP+K~m!*U=Pf+_gY!{*I#=&Ys zxy7q`wCA-dAs+zc-^TU|l(#j9C7$pADfn{ijmzL)#-FVHnE0KT-xq?$53=WIJZuyk z{h#%TqrHSWmnE4bn1PG-B%ezlYs`I@9rR_er*cDM>f%^|Drj96L4M(VxLd)O&K;}@ zig+J$%X!M= 16.0.0") ogr.Open("vsi://data/arrow/test.feather") + + +############################################################################### + + +@gdaltest.enable_exceptions() +def test_ogr_arrow_string_view(): + + version = int( + ogr.GetDriverByName("ARROW").GetMetadataItem("ARROW_VERSION").split(".")[0] + ) + if version < 15: + pytest.skip("requires Arrow >= 15") + + with ogr.Open("data/arrow/stringview.feather") as ds: + lyr = ds.GetLayer(0) + f = lyr.GetNextFeature() + assert f["stringview"] == "foo" + assert f["list_stringview"] is None + assert f["list_of_list_stringview"] is None + assert f["map_stringview"] is None + + f = lyr.GetNextFeature() + assert f["stringview"] == "bar" 
+ assert f["list_stringview"] == [""] + assert f["list_of_list_stringview"] == "[null]" + assert f["map_stringview"] == "{}" + + f = lyr.GetNextFeature() + assert f["stringview"] == "looooooooooong string" + assert f["list_stringview"] == ["foo", "bar", "looooooooooong string"] + assert f["list_of_list_stringview"] == '[["foo","bar","looooooooooong string"]]' + assert f["map_stringview"] == '{"x":"x_val","y":null}' + + +############################################################################### + + +@gdaltest.enable_exceptions() +def test_ogr_arrow_binary_view(): + + version = int( + ogr.GetDriverByName("ARROW").GetMetadataItem("ARROW_VERSION").split(".")[0] + ) + if version < 15: + pytest.skip("requires Arrow >= 15") + + with ogr.Open("data/arrow/binaryview.feather") as ds: + lyr = ds.GetLayer(0) + f = lyr.GetNextFeature() + assert f.GetFieldAsBinary("binaryview") == b"foo" + f = lyr.GetNextFeature() + assert f.GetFieldAsBinary("binaryview") == b"bar" + f = lyr.GetNextFeature() + assert f.GetFieldAsBinary("binaryview") == b"looooooooooong binary" diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index 2563010535ac..d0baf5726ad7 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -4125,3 +4125,42 @@ def test_ogr_parquet_vsi_arrow_file_system(): ds = ogr.Open("PARQUET:vsi://data/parquet/test.parquet") lyr = ds.GetLayer(0) assert lyr.GetFeatureCount() > 0 + + +############################################################################### + + +@gdaltest.enable_exceptions() +@pytest.mark.require_driver("ARROW") +@pytest.mark.parametrize( + "src_filename,expected_error_msg", + [ + ("data/arrow/stringview.feather", "StringView not supported"), + ("data/arrow/binaryview.feather", "BinaryView not supported"), + ], +) +def test_ogr_parquet_IsArrowSchemaSupported_arrow_15_types( + src_filename, expected_error_msg, tmp_vsimem +): + + version = int( + ogr.GetDriverByName("ARROW").GetMetadataItem("ARROW_VERSION").split(".")[0] 
+ ) + if version < 15: + pytest.skip("requires Arrow >= 15.0.0") + + src_ds = ogr.Open(src_filename) + src_lyr = src_ds.GetLayer(0) + + outfilename = str(tmp_vsimem / "test.parquet") + with ogr.GetDriverByName("Parquet").CreateDataSource(outfilename) as dst_ds: + dst_lyr = dst_ds.CreateLayer( + "test", srs=src_lyr.GetSpatialRef(), geom_type=ogr.wkbPoint, options=[] + ) + + stream = src_lyr.GetArrowStream() + schema = stream.GetSchema() + + success, error_msg = dst_lyr.IsArrowSchemaSupported(schema) + assert not success + assert error_msg == expected_error_msg diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index 084a7273acea..341ca12803fa 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -248,6 +248,9 @@ inline bool OGRArrowLayer::IsHandledListOrMapType( itemTypeId == arrow::Type::DECIMAL256 || itemTypeId == arrow::Type::STRING || itemTypeId == arrow::Type::LARGE_STRING || +#if ARROW_VERSION_MAJOR >= 15 + itemTypeId == arrow::Type::STRING_VIEW || +#endif itemTypeId == arrow::Type::STRUCT || (itemTypeId == arrow::Type::MAP && IsHandledMapType( @@ -276,7 +279,12 @@ inline bool OGRArrowLayer::IsHandledListType( inline bool OGRArrowLayer::IsHandledMapType(const std::shared_ptr &mapType) { - return mapType->key_type()->id() == arrow::Type::STRING && + const auto typeId = mapType->key_type()->id(); + return (typeId == arrow::Type::STRING +#if ARROW_VERSION_MAJOR >= 15 + || typeId == arrow::Type::STRING_VIEW +#endif + ) && IsHandledListOrMapType(mapType->item_type()); } @@ -369,6 +377,9 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( break; case arrow::Type::STRING: case arrow::Type::LARGE_STRING: +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::STRING_VIEW: +#endif bTypeOK = true; eType = OFTString; if (osExtensionName == EXTENSION_NAME_ARROW_JSON) @@ -376,6 +387,9 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( break; case 
arrow::Type::BINARY: case arrow::Type::LARGE_BINARY: +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::BINARY_VIEW: +#endif bTypeOK = true; eType = OFTBinary; break; @@ -476,6 +490,9 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( break; case arrow::Type::STRING: case arrow::Type::LARGE_STRING: +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::STRING_VIEW: +#endif eType = OFTStringList; break; default: @@ -538,8 +555,6 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( case arrow::Type::RUN_END_ENCODED: #endif #if ARROW_VERSION_MAJOR >= 15 - case arrow::Type::STRING_VIEW: - case arrow::Type::BINARY_VIEW: case arrow::Type::LIST_VIEW: case arrow::Type::LARGE_LIST_VIEW: #endif @@ -1321,6 +1336,15 @@ static void AddToArray(CPLJSONArray &oArray, const arrow::Array *array, nIdx)); break; } +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::STRING_VIEW: + { + oArray.Add( + static_cast(array)->GetString( + nIdx)); + break; + } +#endif case arrow::Type::LIST: case arrow::Type::LARGE_LIST: case arrow::Type::FIXED_SIZE_LIST: @@ -1491,6 +1515,14 @@ static void AddToDict(CPLJSONObject &oDict, const std::string &osKey, ->GetString(nIdx)); break; } +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::STRING_VIEW: + { + oDict.Add(osKey, static_cast(array) + ->GetString(nIdx)); + break; + } +#endif case arrow::Type::LIST: case arrow::Type::LARGE_LIST: case arrow::Type::FIXED_SIZE_LIST: @@ -1514,12 +1546,12 @@ static void AddToDict(CPLJSONObject &oDict, const std::string &osKey, /* GetMapAsJSON() */ /************************************************************************/ +template static CPLJSONObject GetMapAsJSON(const arrow::Array *array, const size_t nIdxInArray) { const auto mapArray = static_cast(array); - const auto keys = - std::static_pointer_cast(mapArray->keys()); + const auto keys = std::static_pointer_cast(mapArray->keys()); const auto values = mapArray->items(); const auto nIdxStart = mapArray->value_offset(nIdxInArray); const int nCount = 
mapArray->value_length(nIdxInArray); @@ -1538,6 +1570,24 @@ static CPLJSONObject GetMapAsJSON(const arrow::Array *array, return oRoot; } +static CPLJSONObject GetMapAsJSON(const arrow::Array *array, + const size_t nIdxInArray) +{ + const auto mapArray = static_cast(array); + const auto eKeyType = mapArray->keys()->type()->id(); + if (eKeyType == arrow::Type::STRING) + return GetMapAsJSON(array, nIdxInArray); +#if ARROW_VERSION_MAJOR >= 15 + else if (eKeyType == arrow::Type::STRING_VIEW) + return GetMapAsJSON(array, nIdxInArray); +#endif + else + { + CPLAssert(false); + return CPLJSONObject(); + } +} + /************************************************************************/ /* GetStructureAsJSON() */ /************************************************************************/ @@ -1802,6 +1852,27 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, poFeature->SetField(i, aosList.List()); break; } +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::STRING_VIEW: + { + const auto values = + std::static_pointer_cast( + array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + CPLStringList aosList; + for (int k = 0; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + aosList.AddString( + ""); // we cannot have null strings in a list + else + aosList.AddString(values->GetString(nIdxStart + k).c_str()); + } + poFeature->SetField(i, aosList.List()); + break; + } +#endif case arrow::Type::LIST: case arrow::Type::LARGE_LIST: case arrow::Type::FIXED_SIZE_LIST: @@ -2244,6 +2315,31 @@ inline OGRFeature *OGRArrowLayer::ReadFeature( poFeature->SetField(i, out_length, data); break; } +#if ARROW_VERSION_MAJOR >= 15 + case arrow::Type::BINARY_VIEW: + { + const auto castArray = + static_cast(array); + const auto view = castArray->GetView(nIdxInBatch); + poFeature->SetField(i, static_cast(view.size()), + view.data()); + break; + } +#endif +#if ARROW_VERSION_MAJOR >= 15 + case 
arrow::Type::STRING_VIEW: + { + const auto castArray = + static_cast<const arrow::StringViewArray *>(array); + const auto strView = castArray->GetView(nIdxInBatch); + char *pszString = + static_cast<char *>(CPLMalloc(strView.length() + 1)); + memcpy(pszString, strView.data(), strView.length()); + pszString[strView.length()] = 0; + poFeature->SetFieldSameTypeUnsafe(i, pszString); + break; + } +#endif case arrow::Type::FIXED_SIZE_BINARY: { const auto castArray = @@ -2424,8 +2520,6 @@ inline OGRFeature *OGRArrowLayer::ReadFeature( case arrow::Type::RUN_END_ENCODED: #endif #if ARROW_VERSION_MAJOR >= 15 - case arrow::Type::STRING_VIEW: - case arrow::Type::BINARY_VIEW: case arrow::Type::LIST_VIEW: case arrow::Type::LARGE_LIST_VIEW: #endif diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp index 8e369b13a595..f72610e96dc8 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp @@ -1281,6 +1281,28 @@ bool OGRParquetWriterLayer::IsArrowSchemaSupported( osErrorMsg = "float16 not supported"; return false; } + if (schema->format[0] == 'v' && schema->format[1] == 'u') + { + osErrorMsg = "StringView not supported"; + return false; + } + if (schema->format[0] == 'v' && schema->format[1] == 'z') + { + osErrorMsg = "BinaryView not supported"; + return false; + } + if (schema->format[0] == '+' && schema->format[1] == 'v' && + schema->format[2] == 'l') /* Arrow C format "+vl": 3rd char selects ListView */ + { + osErrorMsg = "ListView not supported"; + return false; + } + if (schema->format[0] == '+' && schema->format[1] == 'v' && + schema->format[2] == 'L') /* Arrow C format "+vL": 3rd char selects LargeListView */ + { + osErrorMsg = "LargeListView not supported"; + return false; + } for (int64_t i = 0; i < schema->n_children; ++i) { if (!IsArrowSchemaSupported(schema->children[i], papszOptions,