Skip to content

Commit

Permalink
Merge pull request #10749 from rouault/arrow_read_stringview
Browse files: browse the repository at this point in the history
Feather: add read support for StringView and BinaryView…
  • Loading branch information
rouault committed Sep 18, 2024
2 parents 42e4237 + 58ec9e2 commit 7ca9ed0
Show file tree
Hide file tree
Showing 9 changed files with 306 additions and 7 deletions.
89 changes: 89 additions & 0 deletions autotest/generate_parquet_test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,10 +1245,99 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized):
)


def generate_arrow_stringview():
    """Write ``ogr/data/arrow/stringview.feather`` next to this script.

    The table holds four columns built on the Arrow ``string_view`` type:
    a plain column, a list, a list of lists, and a map whose keys and
    values are both string views.
    """
    import pathlib

    import pyarrow as pa
    import pyarrow.feather as feather

    # Insertion order of this mapping defines the column order of the table.
    columns = {
        "stringview": pa.array(
            ["foo", "bar", "looooooooooong string"], pa.string_view()
        ),
        "list_stringview": pa.array(
            [None, [None], ["foo", "bar", "looooooooooong string"]],
            pa.list_(pa.string_view()),
        ),
        "list_of_list_stringview": pa.array(
            [None, [None], [["foo", "bar", "looooooooooong string"]]],
            pa.list_(pa.list_(pa.string_view())),
        ),
        "map_stringview": pa.array(
            [None, [], [("x", "x_val"), ("y", None)]],
            type=pa.map_(pa.string_view(), pa.string_view()),
        ),
    }

    table = pa.table(list(columns.values()), names=list(columns))

    script_dir = pathlib.Path(__file__).parent
    feather.write_feather(table, script_dir / "ogr/data/arrow/stringview.feather")


def generate_arrow_binaryview():
    """Write ``ogr/data/arrow/binaryview.feather`` next to this script.

    The table has a single ``binaryview`` column of Arrow ``binary_view``
    type with three values.
    """
    import pathlib

    import pyarrow as pa
    import pyarrow.feather as feather

    column_name = "binaryview"
    column = pa.array(
        [b"foo", b"bar", b"looooooooooong binary"], pa.binary_view()
    )
    table = pa.table([column], names=[column_name])

    script_dir = pathlib.Path(__file__).parent
    feather.write_feather(table, script_dir / "ogr/data/arrow/binaryview.feather")


def generate_arrow_listview():
    """Write ``ogr/data/arrow/listview.feather`` next to this script.

    The table has a single ``listview`` column of Arrow ``list_view`` type
    over int32, containing one row ``[1]``.
    """
    import pathlib

    import pyarrow as pa
    import pyarrow.feather as feather

    column_name = "listview"
    column = pa.array([[1]], pa.list_view(pa.int32()))
    table = pa.table([column], names=[column_name])

    script_dir = pathlib.Path(__file__).parent
    feather.write_feather(table, script_dir / "ogr/data/arrow/listview.feather")


def generate_arrow_largelistview():
    """Write ``ogr/data/arrow/largelistview.feather`` next to this script.

    The table has a single ``largelistview`` column of Arrow
    ``large_list_view`` type over int32, containing one row ``[1]``.
    """
    import pathlib

    import pyarrow as pa
    import pyarrow.feather as feather

    column_name = "largelistview"
    column = pa.array([[1]], pa.large_list_view(pa.int32()))
    table = pa.table([column], names=[column_name])

    script_dir = pathlib.Path(__file__).parent
    feather.write_feather(
        table, script_dir / "ogr/data/arrow/largelistview.feather"
    )


if __name__ == "__main__":
    # Run every generator in sequence when executed as a script.
    for generate in (
        generate_test_parquet,
        generate_all_geoms_parquet,
        generate_parquet_wkt_with_dict,
        generate_nested_types,
        generate_extension_custom,
        generate_extension_json,
        generate_arrow_stringview,
        generate_arrow_binaryview,
        generate_arrow_listview,
        generate_arrow_largelistview,
    ):
        generate()
Binary file added autotest/ogr/data/arrow/binaryview.feather
Binary file not shown.
Binary file added autotest/ogr/data/arrow/largelistview.feather
Binary file not shown.
Binary file added autotest/ogr/data/arrow/listview.feather
Binary file not shown.
Binary file added autotest/ogr/data/arrow/stringview.feather
Binary file not shown.
55 changes: 55 additions & 0 deletions autotest/ogr/ogr_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,3 +794,58 @@ def test_ogr_arrow_vsi_arrow_file_system():
pytest.skip("requires Arrow >= 16.0.0")

ogr.Open("vsi://data/arrow/test.feather")


###############################################################################


@gdaltest.enable_exceptions()
def test_ogr_arrow_string_view():
    """Read ``data/arrow/stringview.feather`` and check every field value.

    Skipped when the ARROW driver reports an Arrow major version below 15.
    """

    arrow_version = ogr.GetDriverByName("ARROW").GetMetadataItem("ARROW_VERSION")
    major = int(arrow_version.split(".")[0])
    if major < 15:
        pytest.skip("requires Arrow >= 15")

    field_names = (
        "stringview",
        "list_stringview",
        "list_of_list_stringview",
        "map_stringview",
    )
    # One tuple per feature, in reading order; None means the field is unset.
    expected_rows = (
        ("foo", None, None, None),
        ("bar", [""], "[null]", "{}"),
        (
            "looooooooooong string",
            ["foo", "bar", "looooooooooong string"],
            '[["foo","bar","looooooooooong string"]]',
            '{"x":"x_val","y":null}',
        ),
    )

    with ogr.Open("data/arrow/stringview.feather") as ds:
        lyr = ds.GetLayer(0)
        for expected_values in expected_rows:
            feature = lyr.GetNextFeature()
            for field_name, expected in zip(field_names, expected_values):
                actual = feature[field_name]
                if expected is None:
                    assert actual is None
                else:
                    assert actual == expected


###############################################################################


@gdaltest.enable_exceptions()
def test_ogr_arrow_binary_view():
    """Read ``data/arrow/binaryview.feather`` and check the binary values.

    Skipped when the ARROW driver reports an Arrow major version below 15.
    """

    arrow_version = ogr.GetDriverByName("ARROW").GetMetadataItem("ARROW_VERSION")
    major = int(arrow_version.split(".")[0])
    if major < 15:
        pytest.skip("requires Arrow >= 15")

    # Expected content of the "binaryview" field, one entry per feature.
    expected_blobs = (b"foo", b"bar", b"looooooooooong binary")

    with ogr.Open("data/arrow/binaryview.feather") as ds:
        lyr = ds.GetLayer(0)
        for expected in expected_blobs:
            feature = lyr.GetNextFeature()
            assert feature.GetFieldAsBinary("binaryview") == expected
39 changes: 39 additions & 0 deletions autotest/ogr/ogr_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4125,3 +4125,42 @@ def test_ogr_parquet_vsi_arrow_file_system():
ds = ogr.Open("PARQUET:vsi://data/parquet/test.parquet")
lyr = ds.GetLayer(0)
assert lyr.GetFeatureCount() > 0


###############################################################################


@gdaltest.enable_exceptions()
@pytest.mark.require_driver("ARROW")
@pytest.mark.parametrize(
    "src_filename,expected_error_msg",
    [
        ("data/arrow/stringview.feather", "StringView not supported"),
        ("data/arrow/binaryview.feather", "BinaryView not supported"),
    ],
)
def test_ogr_parquet_IsArrowSchemaSupported_arrow_15_types(
    src_filename, expected_error_msg, tmp_vsimem
):
    """Check that a Parquet layer rejects StringView / BinaryView schemas.

    The Arrow schema of each source Feather file is passed to
    ``IsArrowSchemaSupported()`` of a freshly created Parquet layer; the call
    must report failure with the expected error message.
    Skipped when the ARROW driver reports an Arrow major version below 15.
    """

    arrow_version = ogr.GetDriverByName("ARROW").GetMetadataItem("ARROW_VERSION")
    major = int(arrow_version.split(".")[0])
    if major < 15:
        pytest.skip("requires Arrow >= 15.0.0")

    src_ds = ogr.Open(src_filename)
    src_lyr = src_ds.GetLayer(0)

    parquet_driver = ogr.GetDriverByName("Parquet")
    outfilename = str(tmp_vsimem / "test.parquet")
    with parquet_driver.CreateDataSource(outfilename) as dst_ds:
        dst_lyr = dst_ds.CreateLayer(
            "test",
            srs=src_lyr.GetSpatialRef(),
            geom_type=ogr.wkbPoint,
            options=[],
        )

        # Keep the stream referenced by a local while its schema is in use.
        stream = src_lyr.GetArrowStream()
        schema = stream.GetSchema()

        result = dst_lyr.IsArrowSchemaSupported(schema)
        success, error_msg = result
        assert not success
        assert error_msg == expected_error_msg
Loading

0 comments on commit 7ca9ed0

Please sign in to comment.