Skip to content

Commit

Permalink
Revert "Added in very specific support for from_json to a Map<String,String> (NVIDIA#6211)"

Browse files Browse the repository at this point in the history

This reverts commit 8b497c5.
  • Loading branch information
revans2 committed Sep 6, 2022
1 parent 1b4a488 commit 76d78fa
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 291 deletions.
1 change: 0 additions & 1 deletion docs/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,6 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.IsNaN"></a>spark.rapids.sql.expression.IsNaN|`isnan`|Checks if a value is NaN|true|None|
<a name="sql.expression.IsNotNull"></a>spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None|
<a name="sql.expression.IsNull"></a>spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None|
<a name="sql.expression.JsonToStructs"></a>spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.|
<a name="sql.expression.KnownFloatingPointNormalized"></a>spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None|
<a name="sql.expression.KnownNotNull"></a>spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None|
<a name="sql.expression.Lag"></a>spark.rapids.sql.expression.Lag|`lag`|Window function that returns N entries behind this one|true|None|
Expand Down
47 changes: 0 additions & 47 deletions docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -7684,53 +7684,6 @@ are limited.
<td> </td>
</tr>
<tr>
<td rowSpan="2">JsonToStructs</td>
<td rowSpan="2">`from_json`</td>
<td rowSpan="2">Returns a struct value with the given `jsonStr` and `schema`</td>
<td rowSpan="2">This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.</td>
<td rowSpan="2">project</td>
<td>jsonStr</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td>S</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>result</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td><b>NS</b></td>
<td><em>PS<br/>unsupported child types BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, DATE, TIMESTAMP, DECIMAL, NULL, BINARY, CALENDAR, ARRAY, MAP, STRUCT, UDT</em></td>
<td><b>NS</b></td>
<td> </td>
</tr>
<tr>
<td rowSpan="2">KnownFloatingPointNormalized</td>
<td rowSpan="2"> </td>
<td rowSpan="2">Tag to prevent redundant normalization</td>
Expand Down
11 changes: 0 additions & 11 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,3 @@ def test_json_read_count(spark_tmp_path, v1_enabled_list):
assert_gpu_and_cpu_row_counts_equal(
lambda spark : spark.read.schema(schema).json(data_path),
conf=updated_conf)

def test_from_json_map():
    # Verifies that from_json parsing a JSON string column into MAP<STRING,STRING>
    # produces the same results on GPU and CPU.
    # The test here is working around some inconsistencies in how the keys are parsed for maps
    # on the GPU the keys are dense, but on the CPU they are sparse
    # (hence we compare individual key lookups `parsed["a"]`/`parsed["b"]` rather
    # than the raw map value, sidestepping key-layout differences).
    # Input generator: JSON objects with a mandatory numeric-string "a" and an
    # optional uppercase-string "b" — e.g. {"a": "123"} or {"a": "123", "b": "XY"}.
    json_string_gen = StringGen("{\"a\": \"[0-9]{0,5}\"(, \"b\": \"[A-Z]{0,5}\")?}")
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : unary_op_df(spark, json_string_gen)\
            .selectExpr("from_json(a, \"MAP<STRING,STRING>\") as parsed")\
            .selectExpr("parsed[\"a\"] as pa", "parsed[\"b\"] as pb"),
        # JsonToStructs is disabled by default (beta quality); enable it explicitly
        # so the expression actually runs on the GPU.
        conf={"spark.rapids.sql.expression.JsonToStructs": "true"})

Original file line number Diff line number Diff line change
Expand Up @@ -3532,20 +3532,6 @@ object GpuOverrides extends Logging {
GpuGetJsonObject(lhs, rhs)
}
),
expr[JsonToStructs](
"Returns a struct value with the given `jsonStr` and `schema`",
ExprChecks.projectOnly(
TypeSig.MAP.nested(TypeSig.STRING),
(TypeSig.STRUCT + TypeSig.MAP + TypeSig.ARRAY).nested(TypeSig.all),
Seq(ParamCheck("jsonStr", TypeSig.STRING, TypeSig.STRING))),
(a, conf, p, r) => new UnaryExprMeta[JsonToStructs](a, conf, p, r) {
override def tagExprForGpu(): Unit =
GpuJsonScan.tagJsonToStructsSupport(a.options, this)

override def convertToGpu(child: Expression): GpuExpression =
GpuJsonToStructs(a.schema, a.options, child, a.timeZoneId)
}).disabledByDefault("parsing JSON from a column has a large number of issues and " +
"should be considered beta quality right now."),
expr[org.apache.spark.sql.execution.ScalarSubquery](
"Subquery that will return only one row and one column",
ExprChecks.projectOnly(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,89 +56,71 @@ object GpuJsonScan {
scanMeta)
}

def tagSupportOptions(
options: JSONOptionsInRead,
def tagSupport(
sparkSession: SparkSession,
dataSchema: StructType,
readSchema: StructType,
options: Map[String, String],
meta: RapidsMeta[_, _, _]): Unit = {

if (options.multiLine) {
val parsedOptions = new JSONOptionsInRead(
options,
sparkSession.sessionState.conf.sessionLocalTimeZone,
sparkSession.sessionState.conf.columnNameOfCorruptRecord)

if (!meta.conf.isJsonEnabled) {
meta.willNotWorkOnGpu("JSON input and output has been disabled. To enable set " +
s"${RapidsConf.ENABLE_JSON} to true")
}

if (!meta.conf.isJsonReadEnabled) {
meta.willNotWorkOnGpu("JSON input has been disabled. To enable set " +
s"${RapidsConf.ENABLE_JSON_READ} to true. Please note that, currently json reader does " +
s"not support column prune, so user must specify the full schema or just let spark to " +
s"infer the schema")
}

if (parsedOptions.multiLine) {
meta.willNotWorkOnGpu("GpuJsonScan does not support multiLine")
}

// {"name": /* hello */ "Reynold Xin"} is not supported by CUDF
if (options.allowComments) {
if (parsedOptions.allowComments) {
meta.willNotWorkOnGpu("GpuJsonScan does not support allowComments")
}

// {name: 'Reynold Xin'} is not supported by CUDF
if (options.allowUnquotedFieldNames) {
if (parsedOptions.allowUnquotedFieldNames) {
meta.willNotWorkOnGpu("GpuJsonScan does not support allowUnquotedFieldNames")
}

// {'name': 'Reynold Xin'} is not supported by CUDF
// This is different because the default for this is true, but we don't support it so we lie...
if (options.parameters.get("allowSingleQuotes").map(_.toBoolean).getOrElse(false)) {
if (options.get("allowSingleQuotes").map(_.toBoolean).getOrElse(false)) {
meta.willNotWorkOnGpu("GpuJsonScan does not support allowSingleQuotes")
}

// {"name": "Cazen Lee", "price": "\$10"} is not supported by CUDF
if (options.allowBackslashEscapingAnyCharacter) {
if (parsedOptions.allowBackslashEscapingAnyCharacter) {
meta.willNotWorkOnGpu("GpuJsonScan does not support allowBackslashEscapingAnyCharacter")
}

// {"a":null, "b":1, "c":3.0}, Spark will drop column `a` if dropFieldIfAllNull is enabled.
if (options.dropFieldIfAllNull) {
if (parsedOptions.dropFieldIfAllNull) {
meta.willNotWorkOnGpu("GpuJsonScan does not support dropFieldIfAllNull")
}

if (options.parseMode != PermissiveMode) {
if (parsedOptions.parseMode != PermissiveMode) {
meta.willNotWorkOnGpu("GpuJsonScan only supports Permissive JSON parsing")
}

if (options.lineSeparator.getOrElse("\n") != "\n") {
if (parsedOptions.lineSeparator.getOrElse("\n") != "\n") {
meta.willNotWorkOnGpu("GpuJsonScan only supports \"\\n\" as a line separator")
}

options.encoding.foreach(enc =>
parsedOptions.encoding.foreach(enc =>
if (enc != StandardCharsets.UTF_8.name() && enc != StandardCharsets.US_ASCII.name()) {
meta.willNotWorkOnGpu("GpuJsonScan only supports UTF8 or US-ASCII encoded data")
})
}

def tagJsonToStructsSupport(options:Map[String, String],
meta: RapidsMeta[_, _, _]): Unit = {
val parsedOptions = new JSONOptionsInRead(
options,
SQLConf.get.sessionLocalTimeZone,
SQLConf.get.columnNameOfCorruptRecord)

tagSupportOptions(parsedOptions, meta)
}

def tagSupport(
sparkSession: SparkSession,
dataSchema: StructType,
readSchema: StructType,
options: Map[String, String],
meta: RapidsMeta[_, _, _]): Unit = {

val parsedOptions = new JSONOptionsInRead(
options,
sparkSession.sessionState.conf.sessionLocalTimeZone,
sparkSession.sessionState.conf.columnNameOfCorruptRecord)

if (!meta.conf.isJsonEnabled) {
meta.willNotWorkOnGpu("JSON input and output has been disabled. To enable set " +
s"${RapidsConf.ENABLE_JSON} to true")
}

if (!meta.conf.isJsonReadEnabled) {
meta.willNotWorkOnGpu("JSON input has been disabled. To enable set " +
s"${RapidsConf.ENABLE_JSON_READ} to true. Please note that, currently json reader does " +
s"not support column prune, so user must specify the full schema or just let spark to " +
s"infer the schema")
}

tagSupportOptions(parsedOptions, meta)
meta.willNotWorkOnGpu("GpuJsonScan only supports UTF8 or US-ASCII encoded data")
})

val types = readSchema.map(_.dataType)
if (types.contains(DateType)) {
Expand All @@ -154,17 +136,17 @@ object GpuJsonScan {

if (!meta.conf.isJsonFloatReadEnabled && types.contains(FloatType)) {
meta.willNotWorkOnGpu("JSON reading is not 100% compatible when reading floats. " +
s"To enable it please set ${RapidsConf.ENABLE_READ_JSON_FLOATS} to true.")
s"To enable it please set ${RapidsConf.ENABLE_READ_JSON_FLOATS} to true.")
}

if (!meta.conf.isJsonDoubleReadEnabled && types.contains(DoubleType)) {
meta.willNotWorkOnGpu("JSON reading is not 100% compatible when reading doubles. " +
s"To enable it please set ${RapidsConf.ENABLE_READ_JSON_DOUBLES} to true.")
s"To enable it please set ${RapidsConf.ENABLE_READ_JSON_DOUBLES} to true.")
}

if (!meta.conf.isJsonDecimalReadEnabled && types.exists(_.isInstanceOf[DecimalType])) {
meta.willNotWorkOnGpu("JSON reading is not 100% compatible when reading decimals. " +
s"To enable it please set ${RapidsConf.ENABLE_READ_JSON_DECIMALS} to true.")
s"To enable it please set ${RapidsConf.ENABLE_READ_JSON_DECIMALS} to true.")
}

dataSchema.getFieldIndex(parsedOptions.columnNameOfCorruptRecord).foreach { corruptFieldIndex =>
Expand Down
Loading

0 comments on commit 76d78fa

Please sign in to comment.