From 7212ecd7791d92d83b7a5d48ababae6b67e4670e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Jan 2024 20:23:06 +0100 Subject: [PATCH] ENH: support the Arrow PyCapsule Interface on pandas.DataFrame (export) (#56587) * ENH: support the Arrow PyCapsule Interface on pandas.DataFrame * expand documentation on how index is handled --- pandas/compat/_optional.py | 5 ++- pandas/core/frame.py | 27 +++++++++++++ pandas/tests/frame/test_arrow_interface.py | 45 ++++++++++++++++++++++ pandas/tests/test_optional_dependency.py | 14 +++++++ 4 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/frame/test_arrow_interface.py diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 3cdeae52a25ba..064ae473652d2 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -147,9 +147,8 @@ def import_optional_dependency( The imported module, when found and the version is correct. None is returned when the package is not found and `errors` is False, or when the package's version is too old and `errors` - is ``'warn'``. + is ``'warn'`` or ``'ignore'``. """ - assert errors in {"warn", "raise", "ignore"} package_name = INSTALL_MAPPING.get(name) @@ -190,5 +189,7 @@ def import_optional_dependency( return None elif errors == "raise": raise ImportError(msg) + else: + return None return module diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e6d86cba0a4c3..e093d551f3ead 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -988,6 +988,33 @@ def __dataframe_consortium_standard__( ) return convert_to_standard_compliant_dataframe(self, api_version=api_version) + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + # ---------------------------------------------------------------------- @property diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py new file mode 100644 index 0000000000000..ac7b51cbdfa92 --- /dev/null +++ b/pandas/tests/frame/test_arrow_interface.py @@ -0,0 +1,45 @@ +import ctypes + +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd + +pa = pytest.importorskip("pyarrow") + + +@td.skip_if_no("pyarrow", min_version="14.0") +def test_dataframe_arrow_interface(): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + capsule = df.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + table = pa.table(df) + expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + assert table.equals(expected) + + schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) + table = pa.table(df, schema=schema) + expected = expected.cast(schema) + assert table.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="15.0") +def test_dataframe_to_arrow(): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + table = pa.RecordBatchReader.from_stream(df) + expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + assert table.equals(expected) + + schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) + table = pa.RecordBatchReader.from_stream(df, schema=schema) + expected = expected.cast(schema) + assert table.equals(expected) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index c1d1948d6c31a..52b5f636b1254 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -50,6 +50,20 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule") assert result is module + with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): + import_optional_dependency("fakemodule", min_version="1.1.0") + + with tm.assert_produces_warning(UserWarning): + result = import_optional_dependency( + "fakemodule", errors="warn", min_version="1.1.0" + ) + assert result is None + + result = import_optional_dependency( + "fakemodule", errors="ignore", min_version="1.1.0" + ) + assert result is None + def test_submodule(monkeypatch): # Create a fake module with a submodule