
[FEA] support json to struct function #8174

Merged · 10 commits · May 1, 2023
4 changes: 2 additions & 2 deletions docs/supported_ops.md
@@ -7890,8 +7890,8 @@ are limited.
<td> </td>
<td> </td>
<td><b>NS</b></td>
<td><em>PS<br/>unsupported child types BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, DATE, TIMESTAMP, DECIMAL, NULL, BINARY, CALENDAR, ARRAY, MAP, STRUCT, UDT</em></td>
<td><b>NS</b></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP</em></td>
<td> </td>
</tr>
<tr>
13 changes: 13 additions & 0 deletions integration_tests/src/main/python/json_test.py
@@ -368,3 +368,16 @@ def test_from_json_map():
lambda spark : unary_op_df(spark, json_string_gen) \
.select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
conf={"spark.rapids.sql.expression.JsonToStructs": "true"})

@pytest.mark.parametrize('data_gen', [StringGen(r'{"a": "[0-9]{0,5}", "b": "[A-Z]{0,5}", "c": 1234}')])
@pytest.mark.parametrize('schema', [StructType([StructField("a", StringType())]),
StructType([StructField("d", StringType())]),
StructType([StructField("a", StringType()), StructField("b", StringType())]),
StructType([StructField("c", LongType()), StructField("a", StringType())]),
StructType([StructField("a", StringType()), StructField("a", StringType())])
Collaborator:
I only see tests for String, Long, and Struct. If we say that we support the other types, we really should have tests for them. This needs to include things like STRUCTs of STRUCTs and STRUCTs of LISTS (a sketch of such a test follows the test function below).

Collaborator:
If we say that we support all of the types in our meta object, then we need tests for all of the data types that JSON supports in Spark

https://github.com/apache/spark/blob/4a238cd9d8e80eed06732fc52b1456cb5ece6652/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala#L193-L385

I personally would rather see us start with a few simple types and add more as we add tests for them. So if we have tests for String, Int, Array, and Struct, then we should only say that we support those types. We can add support for Boolean, Byte, Short, Long, Decimal (which needs to include multiple precision and scale combinations), Float, Double, Timestamp, TimestampNTZ, Date, Binary, CalendarInterval, YearMonthInterval, DayTimeInterval, UDT, and NullType when a customer/management asks for them, or when we have tests that show that they are working correctly.

])
def test_from_json_struct(data_gen, schema):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen) \
.select(f.from_json(f.col('a'), schema)),
conf={"spark.rapids.sql.expression.JsonToStructs": "true"})
sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -3373,14 +3373,15 @@ object GpuOverrides extends Logging {
expr[JsonToStructs](
"Returns a struct value with the given `jsonStr` and `schema`",
ExprChecks.projectOnly(
TypeSig.MAP.nested(TypeSig.STRING),
(TypeSig.STRUCT + TypeSig.MAP).nested(TypeSig.all),
Collaborator:
We need a way to say that the MAP type is only supported when it is a MAP<STRING,STRING> and only when it is at the top level. Some of this can be done with a change to this line, but we need more than that, and ideally some tests to verify that we fall back properly.
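One possible shape for the TypeSig part of that restriction, reusing the combinators already on this line (a sketch only; the top-level-only check would still need to live somewhere like tagExprForGpu):

TypeSig.STRUCT.nested(TypeSig.all) + TypeSig.MAP.nested(TypeSig.STRING)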

(TypeSig.STRUCT + TypeSig.MAP + TypeSig.ARRAY).nested(TypeSig.all),
Seq(ParamCheck("jsonStr", TypeSig.STRING, TypeSig.STRING))),
(a, conf, p, r) => new UnaryExprMeta[JsonToStructs](a, conf, p, r) {
override def tagExprForGpu(): Unit =
GpuJsonScan.tagJsonToStructsSupport(a.options, this)

override def convertToGpu(child: Expression): GpuExpression =
// GPU implementation currently does not support duplicated json key names in input
GpuJsonToStructs(a.schema, a.options, child, a.timeZoneId)
}).disabledByDefault("parsing JSON from a column has a large number of issues and " +
"should be considered beta quality right now."),
sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
@@ -16,12 +16,16 @@

package org.apache.spark.sql.rapids

import scala.collection.mutable.Set

import ai.rapids.cudf
import com.nvidia.spark.rapids.{GpuColumnVector, GpuUnaryExpression}
import com.nvidia.spark.rapids.{GpuColumnVector, GpuScalar, GpuUnaryExpression}
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingSeq
import com.nvidia.spark.rapids.jni.MapUtils

import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, TimeZoneAwareExpression}
import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType, StringType, StructType}

case class GpuJsonToStructs(
schema: DataType,
@@ -30,8 +34,118 @@
timeZoneId: Option[String] = None)
extends GpuUnaryExpression with TimeZoneAwareExpression with ExpectsInputTypes
with NullIntolerant {

private def cleanAndConcat(input: cudf.ColumnVector): (cudf.ColumnVector, cudf.ColumnVector) = {
withResource(cudf.Scalar.fromString("{}")) { emptyRow =>
val stripped = withResource(cudf.Scalar.fromString(" ")) { space =>
input.strip(space)
}
withResource(stripped) { stripped =>
val isNullOrEmptyInput = withResource(input.isNull) { isNull =>
val isEmpty = withResource(stripped.getCharLengths) { lengths =>
withResource(cudf.Scalar.fromInt(0)) { zero =>
lengths.lessOrEqualTo(zero)
}
}
withResource(isEmpty) { isEmpty =>
isNull.binaryOp(cudf.BinaryOp.NULL_LOGICAL_OR, isEmpty, cudf.DType.BOOL8)
}
}
closeOnExcept(isNullOrEmptyInput) { _ =>
withResource(isNullOrEmptyInput.ifElse(emptyRow, stripped)) { cleaned =>
withResource(cudf.Scalar.fromString("\n")) { lineSep =>
withResource(cleaned.stringContains(lineSep)) { inputHas =>
withResource(inputHas.any()) { anyLineSep =>
if (anyLineSep.isValid && anyLineSep.getBoolean) {
throw new IllegalArgumentException("We cannot currently support parsing " +
"JSON that contains a line separator in it")
}
}
}
(isNullOrEmptyInput, cleaned.joinStrings(lineSep, emptyRow))
}
}
}
}
}
}
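// A hypothetical illustration (not from this PR) of what cleanAndConcat returns:
//   input rows:         ["{\"a\": 1}", null, "", "  {\"b\": 2}  "]
//   isNullOrEmptyInput: [false, true, true, false]
//   joined buffer:      {"a": 1}\n{}\n{}\n{"b": 2}
// Every input row maps to exactly one JSON line; nulls and empty strings become
// "{}" so the row count is preserved for the check in Step 5 below.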

private def processFieldNames(names: Seq[(String, DataType)]): Seq[(String, DataType)] = {
val existingNames = Set[String]()
// for duplicated field names, only keep the one with the largest index
names.foldRight(Seq[(String, DataType)]())((elem, acc) => {
val (name, dtype) = elem
if (existingNames(name)) (null, dtype)+:acc else {existingNames += name; (name, dtype)+:acc}})
Collaborator:
nit: could we make the formatting less dense so it is simpler to read?

names.foldRight(Seq.empty[(String, DataType)]) { (elem, acc) =>
  val (name, dtype) = elem
  if (existingNames(name)) {
    (null, dtype) +: acc
  } else {
    existingNames += name
    (name, dtype) +: acc
  }
}

}
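// A hypothetical illustration (not from this PR) of the rule above:
//   processFieldNames(Seq(("a", StringType), ("b", LongType), ("a", LongType)))
//     == Seq((null, StringType), ("b", LongType), ("a", LongType))
// The null-named entry matches no parsed column name, so Step 6 below fills it
// with an all-null column of the requested type.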

override protected def doColumnar(input: GpuColumnVector): cudf.ColumnVector = {
MapUtils.extractRawMapFromJsonString(input.getBase)
schema match {
case _: MapType =>
MapUtils.extractRawMapFromJsonString(input.getBase)
case struct: StructType => {
// We cannot handle all corner cases with this right now. The parser just isn't
// good enough, but we will try to handle a few common ones.
val numRows = input.getRowCount.toInt

// Step 1: verify and preprocess the data to clean it up and normalize a few things
// Step 2: Concat the data into a single buffer
val (isNullOrEmpty, combined) = cleanAndConcat(input.getBase)
withResource(isNullOrEmpty) { isNullOrEmpty =>
// Step 3: copy the data back to the host so we can parse it.
val combinedHost = withResource(combined) { combined =>
combined.copyToHost()
}
// Step 4: Have cudf parse the JSON data
val (names, rawTable) = withResource(combinedHost) { combinedHost =>
val data = combinedHost.getData
val start = combinedHost.getStartListOffset(0)
val end = combinedHost.getEndListOffset(0)
val length = end - start

withResource(cudf.Table.readJSON(cudf.JSONOptions.DEFAULT, data, start,
Collaborator:
This is having CUDF do name and type inference. Is that really what we want? Should we do this like we do for regular JSON parsing? (Never mind, it turns out we do the same thing in the JSON reader??? Why are we doing that? It is a huge waste of memory.) Can we please file a follow-on issue to fix it both here and in the JSON reader? Bonus points if we can combine the reader code. One possible shape is sketched below.
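A sketch of that follow-on, assuming the Schema-taking readJSON overload that the plugin already uses for file-based JSON reads (the hard-coded column is purely illustrative; the list would be derived from the Spark schema):

val cudfSchema = cudf.Schema.builder()
  .column(cudf.DType.STRING, "a") // one entry per field of the requested Spark schema
  .build()
cudf.Table.readJSON(cudfSchema, cudf.JSONOptions.DEFAULT, data, start, length)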

length)) { tableWithMeta =>
val names = tableWithMeta.getColumnNames
(names, tableWithMeta.releaseTable())
}
}

// process duplicated field names in input struct schema
val fieldNames = processFieldNames(struct.fields.map { field =>
(field.name, field.dataType)})

withResource(rawTable) { rawTable =>
// Step 5: verify that the data looks correct
if (rawTable.getRowCount != numRows) {
throw new IllegalStateException("The input data didn't parse correctly and we read " +
s"a different number of rows than was expected. Expected $numRows, " +
s"but got ${rawTable.getRowCount}")
}

// Step 6: get the data based on input struct schema
val columns = fieldNames.safeMap { case (name, dtype) =>
val i = names.indexOf(name)
if (i == -1) {
GpuColumnVector.columnVectorFromNull(numRows, dtype)
} else {
rawTable.getColumn(i).incRefCount
}
}

// Step 7: turn the data into a Struct
withResource(columns) { columns =>
withResource(cudf.ColumnVector.makeStruct(columns: _*)) { structData =>
// Step 8: put nulls back in for nulls and empty strings
withResource(GpuScalar.from(null, struct)) { nullVal =>
isNullOrEmpty.ifElse(nullVal, structData)
}
}
}
}
}
}
case _ => throw new IllegalArgumentException(
s"GpuJsonToStructs currently does not support schema of type ${schema}.")
}
}

override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
2 changes: 1 addition & 1 deletion tools/generated_files/supportedExprs.csv
@@ -265,7 +265,7 @@ IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,N
IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS
IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
JsonToStructs,NS,`from_json`,This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
JsonToStructs,NS,`from_json`,This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,NS,NA
JsonToStructs,NS,`from_json`,This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA
JsonTuple,S,`json_tuple`,None,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA
JsonTuple,S,`json_tuple`,None,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA
JsonTuple,S,`json_tuple`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA