New implementation of getJsonObject (#1893)

* get-json-object: Add JSON parser and parser utility (#1836) * Add Json Parser; Add Json Parser utility; Define internal interfaces; Copy get-json-obj CUDA code from cuDF; Signed-off-by: Chong Gao <res_life@163.com> * Code format --------- Signed-off-by: Chong Gao <res_life@163.com> Co-authored-by: Chong Gao <res_life@163.com> * get-json-object: match current field name (#1857) Signed-off-by: Chong Gao <res_life@163.com> Co-authored-by: Chong Gao <res_life@163.com> * get-json-object: add utility write_escaped_text for JSON generator (#1863) Signed-off-by: Chong Gao <res_life@163.com> Co-authored-by: Chong Gao <res_life@163.com> * Add JNI for GetJsonObject (#1862) * Add JNI for GetJsonObject Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * clean up Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Parse json path in plugin Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * Use table_view Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Update java Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * clean up Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * use matched enum for type Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * clean up Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * upmerge Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * format Signed-off-by: Haoyang Li <haoyangl@nvidia.com> --------- Signed-off-by: Haoyang Li <haoyangl@nvidia.com> Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * get-json-object: main flow (#1868) Signed-off-by: Chong Gao <res_life@163.com> Co-authored-by: Chong Gao <res_life@163.com> * Optimize memory usage in match_current_field_name (#1889) * Optimize match_current_field_name using less memory Signed-off-by: Chong Gao <res_life@163.com> * Convert a function to device code * Add a JNI 
test case * Add JNI test case * Change nesting depth to 4 * Change nesting depth to 8 to fix test Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * remove clang format change Signed-off-by: Haoyang Li <haoyangl@nvidia.com> --------- Signed-off-by: Chong Gao <res_life@163.com> Signed-off-by: Haoyang Li <haoyangl@nvidia.com> Co-authored-by: Chong Gao <res_life@163.com> * get-json-object: Recursive to iterative (#1890) * Change recursive to iterative Signed-off-by: Chong Gao <res_life@163.com> --------- Signed-off-by: Chong Gao <res_life@163.com> Co-authored-by: Chong Gao <res_life@163.com> * Fix bug * Format * Use uppercase for path_instruction_type Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Add test cases from Baidu * Fix escape char error; add test case * getJsonObject number normalization (#1897) * Support number normalization Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * delete cpp test and add a java test case Signed-off-by: Haoyang Li <haoyangl@nvidia.com> --------- Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Add test case * Fix a escape/unescape size bug Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Fix bug: handle leading zeros for number; Refactor * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * Address comments Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * fix java test Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Add test cases; Fix a bug * follow up escape/unescape bug fix Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Minor refactor * Add a case; Fix bug --------- Signed-off-by: Chong Gao <res_life@163.com> Signed-off-by: Haoyang Li <haoyangl@nvidia.com> Co-authored-by: Chong Gao <res_life@163.com> Co-authored-by: Haoyang Li <haoyangl@nvidia.com> Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
NVIDIA · Mar 27, 2024 · 87216f2 · 87216f2
1 parent f94b894
commit 87216f2
Show file tree

Hide file tree

Showing 8 changed files with 4,161 additions and 19 deletions.
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
@@ -156,6 +156,7 @@ add_library(
   src/GpuTimeZoneDBJni.cpp
   src/HashJni.cpp
   src/HistogramJni.cpp
+  src/JSONUtilsJni.cpp
   src/MapUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp
@@ -170,6 +171,7 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
+  src/get_json_object.cu
   src/histogram.cu
   src/map_utils.cu
   src/murmur_hash.cu

diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cudf_jni_apis.hpp"
+#include "get_json_object.hpp"
+
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <vector>
+
+using path_instruction_type = spark_rapids_jni::path_instruction_type;
+
+extern "C" {
+/**
+ * JNI entry point backing `JSONUtils.getJsonObject`.
+ *
+ * Translates the Java array of path-instruction objects into a vector of
+ * (type, name, index) tuples and forwards it, together with the input strings
+ * column, to `spark_rapids_jni::get_json_object`.
+ *
+ * @param env JNI environment.
+ * @param input_column Native handle (a `cudf::column_view const*` stored in a jlong).
+ * @param path_instructions Array of Java objects exposing an int field `type`,
+ *        a String field `name` and a long field `index`.
+ * @return The result of `cudf::jni::release_as_jlong` on the produced column, or 0
+ *         (the fallback value handed to the error macros) when a check fails.
+ */
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObject(
+  JNIEnv* env, jclass, jlong input_column, jobjectArray path_instructions)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+  JNI_NULL_CHECK(env, path_instructions, "path_instructions is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const n_column_view      = reinterpret_cast<cudf::column_view const*>(input_column);
+    auto const n_strings_col_view = cudf::strings_column_view{*n_column_view};
+
+    std::vector<std::tuple<path_instruction_type, std::string, int64_t>> instructions;
+    jsize const size = env->GetArrayLength(path_instructions);
+    instructions.reserve(static_cast<std::size_t>(size));
+    for (jsize i = 0; i < size; i++) {
+      jobject instruction = env->GetObjectArrayElement(path_instructions, i);
+      JNI_NULL_CHECK(env, instruction, "path_instruction is null", 0);
+      jclass instruction_class = env->GetObjectClass(instruction);
+      JNI_NULL_CHECK(env, instruction_class, "instruction_class is null", 0);
+
+      jfieldID field_id = env->GetFieldID(instruction_class, "type", "I");
+      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
+      jint const type             = env->GetIntField(instruction, field_id);
+      auto const instruction_type = static_cast<path_instruction_type>(type);
+
+      field_id = env->GetFieldID(instruction_class, "name", "Ljava/lang/String;");
+      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
+      auto const name = static_cast<jstring>(env->GetObjectField(instruction, field_id));
+      JNI_NULL_CHECK(env, name, "name is null", 0);
+
+      field_id = env->GetFieldID(instruction_class, "index", "J");
+      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
+      jlong const index = env->GetLongField(instruction, field_id);
+
+      // The second argument is an optional `jboolean* isCopy` out-parameter, not a flag.
+      const char* name_str = env->GetStringUTFChars(name, nullptr);
+      // A null result means the JVM could not provide the characters and has already
+      // raised an exception; building a std::string from nullptr would be undefined.
+      if (name_str == nullptr) { return 0; }
+
+      instructions.emplace_back(instruction_type, name_str, index);
+
+      env->ReleaseStringUTFChars(name, name_str);
+
+      // Drop the per-iteration local references so that long paths cannot exhaust the
+      // JVM's local reference table before this native call returns.
+      env->DeleteLocalRef(name);
+      env->DeleteLocalRef(instruction_class);
+      env->DeleteLocalRef(instruction);
+    }
+
+    return cudf::jni::release_as_jlong(
+      spark_rapids_jni::get_json_object(n_strings_col_view, instructions));
+  }
+  CATCH_STD(env, 0);
+}
+}
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
@@ -800,15 +800,15 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
   if (sign) { result[index++] = '-'; }
 
   uint64_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
-  int32_t exp             = v.exponent + static_cast<int32_t>(olength) - 1;
+  int32_t const olength   = decimal_length(output);
+  int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
   // Values in the interval [1E-3, 1E7) are special.
   if (scientificNotation) {
     // Print in the format x.xxxxxE-yy.
-    for (uint32_t i = 0; i < olength - 1; ++i) {
-      uint32_t const c = output % 10;
+    for (int i = 0; i < olength - 1; ++i) {
+      int const c = output % 10;
       output /= 10;
       result[index + olength - i] = (char)('0' + c);
     }
@@ -845,7 +845,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
         output /= 10;
         index++;
       }
-    } else if (exp + 1 >= olength) {
+    } else if (exp + 1 >= static_cast<int32_t>(olength)) {
       // Decimal dot is after any of the digits.
       for (int i = 0; i < olength; i++) {
         result[index + olength - i - 1] = (char)('0' + output % 10);
@@ -880,7 +880,7 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign)
   if (sign) { index++; }
 
   uint64_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
+  int32_t const olength   = decimal_length(output);
   int32_t exp             = v.exponent + static_cast<int32_t>(olength) - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
@@ -920,7 +920,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha
   if (sign) { result[index++] = '-'; }
 
   uint32_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
+  int32_t const olength   = decimal_length(output);
   int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
@@ -995,7 +995,7 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign)
   if (sign) { index++; }
 
   uint32_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
+  int32_t const olength   = decimal_length(output);
   int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
@@ -1149,6 +1149,57 @@ __device__ inline int compute_f2s_size(float value)
   return f2s_size(v, sign);
 }
 
+//===== special inf handling for json =====
+
+/**
+ * Writes the JSON text of a special double (infinity or zero) into `result`.
+ *
+ * A set `exponent` flag selects the quoted infinity literal, otherwise a zero
+ * literal is produced; `sign` selects the negative variant. No terminating NUL
+ * is written.
+ *
+ * @param result   destination buffer, must have room for the returned byte count
+ * @param sign     true to emit the negative form
+ * @param exponent true to emit infinity, false to emit zero
+ * @param mantissa not consulted (no NaN in json)
+ * @return number of characters copied into `result`
+ */
+__device__ inline int copy_special_str_json(char* const result,
+                                            bool const sign,
+                                            bool const exponent,
+                                            bool const mantissa)
+{
+  // Choose the literal and its length up front, then do a single copy.
+  char const* text = nullptr;
+  int length       = 0;
+  if (exponent && sign) {
+    text   = "\"-Infinity\"";
+    length = 11;
+  } else if (exponent) {
+    text   = "\"Infinity\"";
+    length = 10;
+  } else if (sign) {
+    text   = "-0.0";
+    length = 4;
+  } else {
+    text   = "0.0";
+    length = 3;
+  }
+  memcpy(result, text, length);
+  return length;
+}
+
+/**
+ * Returns the number of characters of the JSON text for a special double.
+ *
+ * @param sign     true when the value is negative (adds one character)
+ * @param exponent true for infinity (10 characters unsigned), false for zero (3)
+ * @param mantissa not consulted (no NaN in json)
+ * @return length in characters, without any terminating NUL
+ */
+__device__ inline int special_str_size_json(bool const sign,
+                                            bool const exponent,
+                                            bool const mantissa)
+{
+  int const unsigned_length = exponent ? 10 : 3;
+  int const sign_length     = sign ? 1 : 0;
+  return unsigned_length + sign_length;
+}
+
+/**
+ * Writes the JSON text form of a double into `result`.
+ *
+ * The value is decomposed with `d2d`; when that call flags it as special the
+ * text comes from `copy_special_str_json`, otherwise from `to_chars`.
+ *
+ * @param f      value to convert
+ * @param result destination buffer
+ * @return the count returned by whichever writer was used
+ */
+__device__ inline int d2s_buffered_n_json(double f, char* result)
+{
+  bool is_negative = false;
+  bool is_special  = false;
+  floating_decimal_64 const decimal = d2d(f, is_negative, is_special);
+  return is_special ? copy_special_str_json(result, is_negative, decimal.exponent, decimal.mantissa)
+                    : to_chars(decimal, is_negative, result);
+}
+
+/**
+ * Computes the length of the JSON text form of a double without writing it.
+ *
+ * The value is decomposed with `d2d`; when that call flags it as special the
+ * length comes from `special_str_size_json`, otherwise from `d2s_size`.
+ *
+ * @param value value whose text length is wanted
+ * @return the length reported by the selected size helper
+ */
+__device__ inline int compute_d2s_size_json(double value)
+{
+  bool is_negative = false;
+  bool is_special  = false;
+  floating_decimal_64 const decimal = d2d(value, is_negative, is_special);
+  return is_special ? special_str_size_json(is_negative, decimal.exponent, decimal.mantissa)
+                    : d2s_size(decimal, is_negative);
+}
+
 }  // namespace
 
 //===== APIs =====
@@ -1223,9 +1274,9 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const
   using U   = std::conditional_t<std::is_same_v<T, floating_decimal_32>, uint32_t, uint64_t>;
   int index = 0;
   if (sign) { result[index++] = '-'; }
-  U output               = v.mantissa;
-  uint32_t const olength = decimal_length(output);
-  int32_t exp            = v.exponent + static_cast<int32_t>(olength) - 1;
+  U output              = v.mantissa;
+  int32_t const olength = decimal_length(output);
+  int32_t exp           = v.exponent + static_cast<int32_t>(olength) - 1;
   if (exp < 0) {
     // Decimal dot is before any of the digits.
     int index_for_carrier = index;
@@ -1291,7 +1342,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const
     }
   } else {
     // 0 <= exp < olength - 1
-    uint32_t temp_d = digits, tailing_zero = 0;
+    int32_t temp_d = digits, tailing_zero = 0;
     if (exp + digits + 1 > olength) {
       temp_d       = olength - exp - 1;
       tailing_zero = digits - temp_d;
@@ -1301,10 +1352,10 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const
     U integer        = rounded_output / pow10;
     U decimal        = rounded_output % pow10;
     // calculate integer length after format to cover carry case
-    uint32_t integer_len          = decimal_length(integer);
-    uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
-    uint32_t sep_cnt              = 0;
-    int rev_index                 = 0;
+    int32_t integer_len          = decimal_length(integer);
+    int32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
+    int32_t sep_cnt              = 0;
+    int rev_index                = 0;
     for (int i = 0; i < integer_len; i++) {
       if (sep_cnt == 3) {
         result[formated_integer_len - (rev_index++) - 1] = ',';
@@ -1338,9 +1389,9 @@ __device__ inline int format_size(T const v, bool const sign, int digits)
   using U   = std::conditional_t<std::is_same_v<T, floating_decimal_32>, uint32_t, uint64_t>;
   int index = 0;
   if (sign) { index++; }
-  U output               = v.mantissa;
-  uint32_t const olength = decimal_length(output);
-  int32_t exp            = v.exponent + static_cast<int32_t>(olength) - 1;
+  U output              = v.mantissa;
+  int32_t const olength = decimal_length(output);
+  int32_t exp           = v.exponent + static_cast<int32_t>(olength) - 1;
   if (exp < 0) {
     index += 2 + digits;
   } else if (exp + 1 >= olength) {
@@ -1424,4 +1475,15 @@ __device__ inline int format_float(double value, int digits, bool is_float, char
   }
 }
 
+//===== json_parser utility =====
+
+/**
+ * Normalizes a double to its JSON text form, or only measures that text.
+ *
+ * @param value  number to normalize
+ * @param output destination buffer, or nullptr to request the size only
+ * @return result of `d2s_buffered_n_json` when `output` is non-null, otherwise
+ *         the result of `compute_d2s_size_json`
+ */
+__device__ inline int double_normalization(double value, char* output)
+{
+  // A buffer was supplied: produce the characters.
+  if (output != nullptr) { return d2s_buffered_n_json(value, output); }
+  // No buffer: report only how many characters would be produced.
+  return compute_d2s_size_json(value);
+}
+
 }  // namespace spark_rapids_jni::ftos_converter