Skip to content

Commit

Permalink
Add in JNI for parsing JSON data and getting the metadata back too. (N…
Browse files Browse the repository at this point in the history
…VIDIA#11431)

Adds a new Java binding that reads a JSON buffer and, when the schema is inferred, returns the table's metadata along with the table.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Jim Brennan (https://github.com/jbrennan333)
  - Nghia Truong (https://github.com/ttnghia)

URL: rapidsai/cudf#11431
  • Loading branch information
revans2 authored Aug 3, 2022
1 parent 039622f commit 276b996
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 0 deletions.
23 changes: 23 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,9 @@ private static native long[] readJSON(String[] columnNames,
String filePath, long address, long length,
boolean dayFirst, boolean lines) throws CudfException;

/**
 * Native entry point that parses JSON from a raw memory range and lets the reader infer
 * the schema.
 * @param address address of the first byte to parse. The native side rejects a 0 address.
 * @param length number of bytes to parse. The native side raises an
 *               IllegalArgumentException when this is not positive.
 * @param dayFirst forwarded to the native JSON reader's dayfirst option.
 * @param lines forwarded to the native JSON reader's lines option.
 * @return an opaque native handle to the parsed table and its metadata. Within this class it
 *         is wrapped in a TableWithMeta, which is what frees it.
 */
private static native long readAndInferJSON(long address, long length,
boolean dayFirst, boolean lines) throws CudfException;

/**
* Read in Parquet formatted data.
* @param filterColumnNames name of the columns to read, or an empty array if we want to read
Expand Down Expand Up @@ -918,6 +921,26 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon
}
}

/**
 * Parse JSON formatted data held in host memory, inferring the column names and schema.
 * @param opts various JSON parsing options.
 * @param buffer raw UTF8 formatted bytes.
 * @param offset the starting offset into buffer.
 * @param len the number of bytes to parse. A value that is not positive means everything
 *            from offset to the end of buffer.
 * @return the data parsed as a table on the GPU and the metadata for the table returned.
 */
public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
    long offset, long len) {
  // Fall back to "the rest of the buffer" when the caller did not give a usable length.
  final long bytesToParse = (len <= 0) ? buffer.length - offset : len;
  assert bytesToParse > 0;
  assert bytesToParse <= buffer.length - offset;
  assert offset >= 0 && offset < buffer.length;

  final long startAddress = buffer.getAddress() + offset;
  final long metaHandle =
      readAndInferJSON(startAddress, bytesToParse, opts.isDayFirst(), opts.isLines());
  return new TableWithMeta(metaHandle);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
Expand Down
67 changes: 67 additions & 0 deletions java/src/main/java/ai/rapids/cudf/TableWithMeta.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/


package ai.rapids.cudf;

/**
 * A table along with some metadata about the table. This is typically returned when
 * reading data from an input file where the metadata can be important.
 * <p>
 * Instances wrap a native handle and must be closed to free it.
 */
public class TableWithMeta implements AutoCloseable {
  /** Native handle for the table and its metadata; 0 once this object has been closed. */
  private long handle;

  /**
   * Wrap a native handle.
   * @param handle the native handle this object takes responsibility for closing.
   */
  TableWithMeta(long handle) {
    this.handle = handle;
  }

  /**
   * Get the table out of this metadata. This is intended to be called only once.
   * @return the table, or null when the native layer has no table to hand back.
   */
  public Table releaseTable() {
    long[] ptr = releaseTable(handle);
    if (ptr == null) {
      return null;
    } else {
      return new Table(ptr);
    }
  }

  /**
   * Get the names of the top level columns. In the future new APIs can be added to get
   * names of child columns.
   * @return the top level column names as reported by the native metadata.
   */
  public String[] getColumnNames() {
    return getColumnNames(handle);
  }

  /**
   * Free the native handle. Calling this more than once is harmless because the handle is
   * zeroed after the first call.
   * <p>
   * No checked exception is declared: nothing in the body can throw one, and declaring
   * {@code throws Exception} would force every try-with-resources user to catch or
   * propagate it.
   */
  @Override
  public void close() {
    if (handle != 0) {
      close(handle);
      handle = 0;
    }
  }

  private static native void close(long handle);

  private static native long[] releaseTable(long handle);

  private static native String[] getColumnNames(long handle);
}
71 changes: 71 additions & 0 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,77 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
CATCH_STD(env, NULL);
}

// Parses JSON from a raw host buffer, inferring the schema, and hands back an owning
// pointer to a heap-allocated cudf::io::table_with_metadata encoded as a jlong. The
// pointer is freed by Java_ai_rapids_cudf_TableWithMeta_close.
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
    JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {

  JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
  if (buffer_length <= 0) {
    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0);
  }

  try {
    cudf::jni::auto_set_device(env);

    // Describe the caller's memory range as the reader's input.
    auto const data = reinterpret_cast<char *>(buffer);
    auto const size = static_cast<std::size_t>(buffer_length);
    cudf::io::source_info source{data, size};

    auto builder = cudf::io::json_reader_options::builder(source);
    builder.dayfirst(static_cast<bool>(day_first));
    builder.lines(static_cast<bool>(lines));

    // Move the parsed result onto the heap so it can outlive this call.
    auto parsed =
        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(builder.build()));
    auto *raw = parsed.release();
    return reinterpret_cast<jlong>(raw);
  }
  CATCH_STD(env, 0);
}

// Destroys the cudf::io::table_with_metadata that the given handle points to.
JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv *env, jclass, jlong handle) {
  JNI_NULL_CHECK(env, handle, "handle is null", );

  try {
    cudf::jni::auto_set_device(env);
    auto *owned = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
    delete owned;
  }
  CATCH_STD(env, );
}

// Builds a Java String[] holding the top level column names stored in the metadata of the
// table_with_metadata behind the given handle. Returns nullptr with a Java exception
// pending if any JNI allocation fails.
JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_TableWithMeta_getColumnNames(JNIEnv *env, jclass,
                                                                                jlong handle) {
  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);

  try {
    cudf::jni::auto_set_device(env);
    auto const *ptr = reinterpret_cast<cudf::io::table_with_metadata const *>(handle);
    auto const &names = ptr->metadata.column_names;
    // JNI array sizes and indices are jsize, so convert once and explicitly.
    auto const length = static_cast<jsize>(names.size());

    jclass string_class = env->FindClass("java/lang/String");
    if (string_class == nullptr) {
      // FindClass failed and left an exception pending; no further JNI calls are safe.
      return nullptr;
    }

    jobjectArray ret = env->NewObjectArray(length, string_class, nullptr);
    env->DeleteLocalRef(string_class);
    if (ret == nullptr) {
      return nullptr;
    }

    for (jsize i = 0; i < length; i++) {
      jstring name = env->NewStringUTF(names[i].c_str());
      if (name == nullptr) {
        // Allocation failed and an exception is pending.
        return nullptr;
      }
      env->SetObjectArrayElement(ret, i, name);
      // Release each string right away so a wide table cannot exhaust the local
      // reference table of this native frame.
      env->DeleteLocalRef(name);
    }

    return ret;
  }
  CATCH_STD(env, nullptr);
}

// Hands the columns of the table held by the given table_with_metadata over to Java as an
// array of native column handles. Returns nullptr when there is no table to release, which
// includes every call after the first successful one.
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv *env, jclass,
                                                                            jlong handle) {
  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);

  try {
    cudf::jni::auto_set_device(env);
    auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
    if (ptr->tbl) {
      auto ret = convert_table_for_return(env, ptr->tbl);
      // NOTE(review): convert_table_for_return is defined elsewhere; it presumably moves the
      // columns out but leaves the unique_ptr itself non-null. Clearing it here makes the
      // "later calls return null" contract hold regardless, and is a no-op if it was
      // already cleared.
      ptr->tbl.reset();
      return ret;
    }
    return nullptr;
  }
  CATCH_STD(env, nullptr);
}

JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
Expand Down

0 comments on commit 276b996

Please sign in to comment.