Skip to content

Commit

Permalink
Start migrating I/O writers to pylibcudf (starting with JSON) (rapids…
Browse files Browse the repository at this point in the history
…ai#15952)

Switches the JSON writer to use pylibcudf.
xref rapidsai#15162

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#15952
  • Loading branch information
lithomas1 authored Jul 2, 2024
1 parent a1447c7 commit 1a4c2aa
Show file tree
Hide file tree
Showing 17 changed files with 768 additions and 177 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ I/O Functions
:maxdepth: 1

avro
json
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
JSON
====

.. automodule:: cudf._lib.pylibcudf.io.json
:members:
98 changes: 29 additions & 69 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.map cimport map
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_struct_field_names,
)
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_reader_options,
json_recovery_mode_t,
json_writer_options,
read_json as libcudf_read_json,
schema_element,
write_json as libcudf_write_json,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_name_info,
compression_type,
sink_info,
table_metadata,
table_with_metadata,
)
from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport data_from_unique_ptr

import cudf._lib.pylibcudf as plc


cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
Expand Down Expand Up @@ -175,45 +164,27 @@ def write_json(
--------
cudf.to_json
"""
cdef table_view input_table_view = table_view_from_table(
table, ignore_index=True
)

cdef unique_ptr[data_sink] data_sink_c
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
cdef string na_c = na_rep.encode()
cdef bool include_nulls_c = include_nulls
cdef bool lines_c = lines
cdef int rows_per_chunk_c = rows_per_chunk
cdef string true_value_c = 'true'.encode()
cdef string false_value_c = 'false'.encode()
cdef table_metadata tbl_meta

num_index_cols_meta = 0
cdef column_name_info child_info
for i, name in enumerate(table._column_names, num_index_cols_meta):
child_info.name = name.encode()
tbl_meta.schema_info.push_back(child_info)
_set_col_children_metadata(
table[name]._column,
tbl_meta.schema_info[i]
)
cdef list colnames = []

cdef json_writer_options options = move(
json_writer_options.builder(sink_info_c, input_table_view)
.metadata(tbl_meta)
.na_rep(na_c)
.include_nulls(include_nulls_c)
.lines(lines_c)
.rows_per_chunk(rows_per_chunk_c)
.true_value(true_value_c)
.false_value(false_value_c)
.build()
)
for name in table._column_names:
colnames.append((name, _dtype_to_names_list(table[name]._column)))

try:
with nogil:
libcudf_write_json(options)
plc.io.json.write_json(
plc.io.SinkInfo([path_or_buf]),
plc.io.TableWithMetadata(
plc.Table([
c.to_pylibcudf(mode="read") for c in table._columns
]),
colnames
),
na_rep,
include_nulls,
lines,
rows_per_chunk,
true_value="true",
false_value="false"
)
except OverflowError:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
Expand Down Expand Up @@ -254,23 +225,12 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
)
return dtype_to_data_type(dtype)

cdef _set_col_children_metadata(Column col,
column_name_info& col_meta):
cdef column_name_info child_info

def _dtype_to_names_list(col):
if isinstance(col.dtype, cudf.StructDtype):
for i, (child_col, name) in enumerate(
zip(col.children, list(col.dtype.fields))
):
child_info.name = name.encode()
col_meta.children.push_back(child_info)
_set_col_children_metadata(
child_col, col_meta.children[i]
)
return [(name, _dtype_to_names_list(child))
for name, child in zip(col.dtype.fields, col.children)]
elif isinstance(col.dtype, cudf.ListDtype):
for i, child_col in enumerate(col.children):
col_meta.children.push_back(child_info)
_set_col_children_metadata(
child_col, col_meta.children[i]
)
else:
return
return [("", _dtype_to_names_list(child))
for child in col.children]
return []
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources avro.pyx datasource.pyx types.pyx)
set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand All @@ -21,5 +21,7 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json
pylibcudf_io_types
)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, datasource, types
from . cimport avro, datasource, json, types
from .types cimport SourceInfo, TableWithMetadata
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, datasource, types
from .types import SourceInfo, TableWithMetadata
from . import avro, datasource, json, types
from .types import SinkInfo, SourceInfo, TableWithMetadata
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ cpdef TableWithMetadata read_avro(
size_type num_rows = -1
):
"""
Reads an Avro dataset into a set of columns.
Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
Parameters
----------
Expand All @@ -36,7 +36,7 @@ cpdef TableWithMetadata read_avro(
Returns
-------
TableWithMetadata
The Table and its corresponding metadata that was read in.
The Table and its corresponding metadata (column names) that were read in.
"""
cdef vector[string] c_columns
if columns is not None and len(columns) > 0:
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool

from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.types cimport size_type


# Declaration of the JSON writer implemented in json.pyx. The ``= *``
# placeholders mark optional arguments; their actual default values are
# supplied by the implementation's signature.
# NOTE(review): the second argument is named ``tbl`` here but
# ``table_w_meta`` in json.pyx -- confirm whether the names should agree.
cpdef void write_json(
    SinkInfo sink_info,
    TableWithMetadata tbl,
    str na_rep = *,
    bool include_nulls = *,
    bool lines = *,
    size_type rows_per_chunk = *,
    str true_value = *,
    str false_value = *
)
68 changes: 68 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.limits cimport numeric_limits
from libcpp.string cimport string

from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_writer_options,
write_json as cpp_write_json,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata
from cudf._lib.pylibcudf.types cimport size_type


cpdef void write_json(
    SinkInfo sink_info,
    TableWithMetadata table_w_meta,
    str na_rep = "",
    bool include_nulls = False,
    bool lines = False,
    size_type rows_per_chunk = numeric_limits[size_type].max(),
    str true_value = "true",
    str false_value = "false"
):
    """
    Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.

    Parameters
    ----------
    sink_info: SinkInfo
        The SinkInfo object to write the JSON to.
    table_w_meta: TableWithMetadata
        The TableWithMetadata object containing the Table to write.
        Its metadata is forwarded to the writer options as well.
    na_rep: str, default ""
        The string representation for null values.
    include_nulls: bool, default False
        Enables/Disables output of nulls as 'null'.
    lines: bool, default False
        If `True`, write output in the JSON lines format.
    rows_per_chunk: size_type, default ``numeric_limits[size_type].max()``
        The maximum number of rows to write at a time. The default value
        acts as a "not specified" sentinel: when it is left unchanged the
        option is not set on the writer options at all.
    true_value: str, default "true"
        The string representation for values != 0 in INT8 types.
        Only set on the writer options when it differs from "true".
    false_value: str, default "false"
        The string representation for values == 0 in INT8 types.
        Only set on the writer options when it differs from "false".
    """
    # Copy the metadata and encode na_rep into C++ objects up front so the
    # builder chain below only deals with C++ values.
    cdef table_metadata tbl_meta = table_w_meta.metadata
    cdef string na_rep_c = na_rep.encode()

    cdef json_writer_options options = (
        json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
        .metadata(tbl_meta)
        .na_rep(na_rep_c)
        .include_nulls(include_nulls)
        .lines(lines)
        .build()
    )

    # The remaining options are applied only when the caller moved them away
    # from this function's defaults; otherwise the values already held by
    # the built options object are left untouched.
    if rows_per_chunk != numeric_limits[size_type].max():
        options.set_rows_per_chunk(rows_per_chunk)
    if true_value != "true":
        options.set_true_value(<string>true_value.encode())
    if false_value != "false":
        options.set_false_value(<string>false_value.encode())

    # Only C++ objects are touched from here on, so the write runs inside
    # a nogil block.
    with nogil:
        cpp_write_json(options)
11 changes: 11 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_encoding,
column_in_metadata,
Expand All @@ -22,8 +26,15 @@ cdef class TableWithMetadata:
cdef public Table tbl
cdef table_metadata metadata

cdef vector[column_name_info] _make_column_info(self, list column_names)

@staticmethod
cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)

cdef class SourceInfo:
cdef source_info c_obj

cdef class SinkInfo:
# This vector just exists to keep the unique_ptrs to the sinks alive
cdef vector[unique_ptr[data_sink]] sink_storage
cdef sink_info c_obj
Loading

0 comments on commit 1a4c2aa

Please sign in to comment.