From c2f207db11cfb7ca34c7f1e6e2b6c6e91a41b50f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 12 Jun 2024 15:57:26 +0000 Subject: [PATCH] Add basic tests of dataframe scan Also assert that unsupported file scan operations raise. --- .../cudf_polars/tests/test_dataframescan.py | 43 +++++++++++++++++++ python/cudf_polars/tests/test_scan.py | 10 +++++ 2 files changed, 53 insertions(+) create mode 100644 python/cudf_polars/tests/test_dataframescan.py diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py new file mode 100644 index 00000000000..1ffe06ac562 --- /dev/null +++ b/python/cudf_polars/tests/test_dataframescan.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "subset", + [ + None, + ["a", "c"], + ["b", "c", "d"], + ["b", "d"], + ["b", "c"], + ["c", "e"], + ["d", "e"], + pl.selectors.string(), + pl.selectors.integer(), + ], +) +@pytest.mark.parametrize("predicate_pushdown", [False, True]) +def test_scan_drop_nulls(subset, predicate_pushdown): + df = pl.LazyFrame( + { + "a": [1, 2, 3, 4], + "b": [None, 4, 5, None], + "c": [6, 7, None, None], + "d": [8, None, 9, 10], + "e": [None, None, "A", None], + } + ) + # Drop nulls are pushed into filters + q = df.drop_nulls(subset) + + assert_gpu_result_equal( + q, collect_kwargs={"predicate_pushdown": predicate_pushdown} + ) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index b2443e357e2..9491ddc2e94 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -6,6 +6,7 @@ import polars as pl +from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -86,3 +87,12 @@ def test_scan(df, columns, mask): if columns is not None: q = df.select(*columns) assert_gpu_result_equal(q) + + +def test_scan_unsupported_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_ndjson(tmp_path / "df.json") + q = pl.scan_ndjson(tmp_path / "df.json") + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit())