Skip to content

Commit

Permalink
Start migrating I/O writers to pylibcudf (starting with JSON)
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Jun 6, 2024
1 parent 3b734ec commit 591cdd2
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 76 deletions.
101 changes: 32 additions & 69 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.map cimport map
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_struct_field_names,
)
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_reader_options,
json_recovery_mode_t,
json_writer_options,
read_json as libcudf_read_json,
schema_element,
write_json as libcudf_write_json,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_name_info,
compression_type,
sink_info,
table_metadata,
table_with_metadata,
)
from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport data_from_unique_ptr

import cudf._lib.pylibcudf as plc


cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
Expand Down Expand Up @@ -175,45 +164,27 @@ def write_json(
--------
cudf.to_json
"""
cdef table_view input_table_view = table_view_from_table(
table, ignore_index=True
)
cdef list colnames = []

cdef unique_ptr[data_sink] data_sink_c
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
cdef string na_c = na_rep.encode()
cdef bool include_nulls_c = include_nulls
cdef bool lines_c = lines
cdef int rows_per_chunk_c = rows_per_chunk
cdef string true_value_c = 'true'.encode()
cdef string false_value_c = 'false'.encode()
cdef table_metadata tbl_meta

num_index_cols_meta = 0
cdef column_name_info child_info
for i, name in enumerate(table._column_names, num_index_cols_meta):
child_info.name = name.encode()
tbl_meta.schema_info.push_back(child_info)
_set_col_children_metadata(
table[name]._column,
tbl_meta.schema_info[i]
)

cdef json_writer_options options = move(
json_writer_options.builder(sink_info_c, input_table_view)
.metadata(tbl_meta)
.na_rep(na_c)
.include_nulls(include_nulls_c)
.lines(lines_c)
.rows_per_chunk(rows_per_chunk_c)
.true_value(true_value_c)
.false_value(false_value_c)
.build()
)
for name in table._column_names:
colnames.append((name, _dtype_to_names_list(table[name]._column)))

try:
with nogil:
libcudf_write_json(options)
plc.io.json.write_json(
plc.io.SinkInfo([path_or_buf]),
plc.io.TableWithMetadata(
plc.Table([
c.to_pylibcudf(mode="read") for c in table._data.columns
]),
colnames
),
na_rep,
include_nulls,
lines,
rows_per_chunk,
true_value="true",
false_value="false"
)
except OverflowError:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
Expand Down Expand Up @@ -254,23 +225,15 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
)
return dtype_to_data_type(dtype)

cdef _set_col_children_metadata(Column col,
column_name_info& col_meta):
cdef column_name_info child_info

def _dtype_to_names_list(col):
cdef list child_names = []
if isinstance(col.dtype, cudf.StructDtype):
for i, (child_col, name) in enumerate(
zip(col.children, list(col.dtype.fields))
):
child_info.name = name.encode()
col_meta.children.push_back(child_info)
_set_col_children_metadata(
child_col, col_meta.children[i]
)
for child_col, name in zip(col.children, list(col.dtype.fields)):
child_names.append((name, _dtype_to_names_list(child_col)))
elif isinstance(col.dtype, cudf.ListDtype):
for i, child_col in enumerate(col.children):
col_meta.children.push_back(child_info)
_set_col_children_metadata(
child_col, col_meta.children[i]
)
else:
return
for child_col in col.children:
list_child_names = _dtype_to_names_list(child_col)
child_names.append(("", list_child_names))

return child_names
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources avro.pyx types.pyx)
set(cython_sources avro.pyx json.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand All @@ -21,5 +21,5 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_json pylibcudf_io_types)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, types
from .types cimport SourceInfo, TableWithMetadata
from . cimport avro, json, types
from .types cimport SinkInfo, SourceInfo, TableWithMetadata
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, types
from .types import SourceInfo, TableWithMetadata
from . import avro, json, types
from .types import SinkInfo, SourceInfo, TableWithMetadata
19 changes: 19 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.string cimport string

from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type


cpdef void write_json(
SinkInfo sink_info,
TableWithMetadata tbl,
str na_rep = *,
bool include_nulls = *,
bool lines = *,
int rows_per_chunk = *,
str true_value = *,
str false_value = *
)
47 changes: 47 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.limits cimport numeric_limits
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_writer_options,
write_json as cpp_write_json,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata
from cudf._lib.pylibcudf.types cimport size_type


cpdef void write_json(
SinkInfo sink_info,
TableWithMetadata table_w_meta,
str na_rep = "",
bool include_nulls = False,
bool lines = False,
int rows_per_chunk = numeric_limits[size_type].max(),
str true_value = "true",
str false_value = "false"
):
"""
"""
cdef table_metadata tbl_meta = table_w_meta.metadata
cdef string na_rep_c = na_rep.encode()
cdef string true_value_c = true_value.encode()
cdef string false_value_c = false_value.encode()

cdef json_writer_options options = move(
json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
.metadata(tbl_meta)
.na_rep(na_rep_c)
.include_nulls(include_nulls)
.lines(lines)
.rows_per_chunk(rows_per_chunk)
.true_value(true_value_c)
.false_value(false_value_c)
.build()
)

with nogil:
cpp_write_json(options)
11 changes: 11 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_encoding,
column_in_metadata,
Expand All @@ -22,8 +26,15 @@ cdef class TableWithMetadata:
cdef public Table tbl
cdef table_metadata metadata

cdef vector[column_name_info] _make_column_info(self, list column_names)

@staticmethod
cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)

cdef class SourceInfo:
cdef source_info c_obj

cdef class SinkInfo:
# This vector just exists to keep the unique_ptrs to the sinks alive
cdef vector[unique_ptr[data_sink]] sink_storage
cdef sink_info c_obj
Loading

0 comments on commit 591cdd2

Please sign in to comment.