Remove the mixedType config for JSON as it has no downsides any longer #10716

Merged
merged 2 commits on Apr 24, 2024
1 change: 0 additions & 1 deletion docs/additional-functionality/advanced_configs.md
@@ -129,7 +129,6 @@ Name | Description | Default Value | Applicable at
<a name="sql.json.read.decimal.enabled"></a>spark.rapids.sql.json.read.decimal.enabled|When reading a quoted string as a decimal Spark supports reading non-ascii unicode digits, and the RAPIDS Accelerator does not.|true|Runtime
<a name="sql.json.read.double.enabled"></a>spark.rapids.sql.json.read.double.enabled|JSON reading is not 100% compatible when reading doubles.|true|Runtime
<a name="sql.json.read.float.enabled"></a>spark.rapids.sql.json.read.float.enabled|JSON reading is not 100% compatible when reading floats.|true|Runtime
<a name="sql.json.read.mixedTypesAsString.enabled"></a>spark.rapids.sql.json.read.mixedTypesAsString.enabled|JSON reading is not 100% compatible when reading mixed types as string.|false|Runtime
<a name="sql.mode"></a>spark.rapids.sql.mode|Set the mode for the Rapids Accelerator. The supported modes are explainOnly and executeOnGPU. This config can not be changed at runtime, you must restart the application for it to take affect. The default mode is executeOnGPU, which means the RAPIDS Accelerator plugin convert the Spark operations and execute them on the GPU when possible. The explainOnly mode allows running queries on the CPU and the RAPIDS Accelerator will evaluate the queries as if it was going to run on the GPU. The explanations of what would have run on the GPU and why are output in log messages. When using explainOnly mode, the default explain output is ALL, this can be changed by setting spark.rapids.sql.explain. See that config for more details.|executeongpu|Startup
<a name="sql.optimizer.joinReorder.enabled"></a>spark.rapids.sql.optimizer.joinReorder.enabled|When enabled, joins may be reordered for improved query performance|true|Runtime
<a name="sql.python.gpu.enabled"></a>spark.rapids.sql.python.gpu.enabled|This is an experimental feature and is likely to change in the future. Enable (true) or disable (false) support for scheduling Python Pandas UDFs with GPU resources. When enabled, pandas UDFs are assumed to share the same GPU that the RAPIDs accelerator uses and will honor the python GPU configs|false|Runtime
6 changes: 2 additions & 4 deletions docs/compatibility.md
@@ -368,10 +368,8 @@ In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON
no matter what version of Spark is used. If the nesting level is over this the JSON is considered
invalid and all values will be returned as nulls.

-Only structs are supported for nested types. There are also some issues with arrays of structs. If
-your data includes this, even if you are not reading it, you might get an exception. You can
-try to set `spark.rapids.sql.json.read.mixedTypesAsString.enabled` to true to work around this,
-but it also has some issues with it.
+Mixed types can have some problems. If a field being read is an array on some lines and a
+struct/dictionary on others, an error may be thrown.

Dates and Timestamps have some issues and may return values for technically invalid inputs.

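To illustrate the mixed-types note above, a minimal sketch, assuming an active SparkSession named spark; the file name, data, and schema are hypothetical:

from pyspark.sql import functions as f

# mixed.jsonl (hypothetical) -- the same field is an array on one line and a struct on another:
#   {"a": [1, 2, 3]}
#   {"a": {"b": "xy"}}
df = spark.read.text("mixed.jsonl")
# Parsing "a" as a struct may throw an error on the array lines, per the note above.
parsed = df.select(f.from_json("value", "struct<a:struct<b:string>>").alias("parsed"))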
6 changes: 2 additions & 4 deletions integration_tests/src/main/python/json_matrix_test.py
@@ -52,17 +52,15 @@ def read_json_as_text(spark, data_path, column_name):
'spark.rapids.sql.format.json.read.enabled': 'true',
'spark.rapids.sql.json.read.float.enabled': 'true',
'spark.rapids.sql.json.read.double.enabled': 'true',
-'spark.rapids.sql.json.read.decimal.enabled': 'true',
-'spark.rapids.sql.json.read.mixedTypesAsString.enabled': 'true'
+'spark.rapids.sql.json.read.decimal.enabled': 'true'
}

_enable_json_to_structs_conf = {
'spark.rapids.sql.expression.JsonToStructs': 'true',
'spark.rapids.sql.json.read.float.enabled': 'true',
'spark.rapids.sql.json.read.double.enabled': 'true',
'spark.rapids.sql.json.read.decimal.enabled': 'true',
-'spark.rapids.sql.json.read.decimal.enabled': 'true',
-'spark.rapids.sql.json.read.mixedTypesAsString.enabled': 'true'
+'spark.rapids.sql.json.read.decimal.enabled': 'true'
}

_enable_get_json_object_conf = {
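These conf dictionaries are passed to the integration-test helpers through their conf argument; a minimal, hypothetical sketch of the pattern (the test name and generator are assumptions, mirroring tests later in this PR):

# Hypothetical test using the conf dict defined above.
def test_from_json_simple_struct():
    json_string_gen = StringGen(r'{"a": "[a-z]{2}"}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, json_string_gen)
            .select('a', f.from_json('a', 'struct<a:string>')),
        conf=_enable_json_to_structs_conf)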
6 changes: 2 additions & 4 deletions integration_tests/src/main/python/json_test.py
@@ -382,8 +382,7 @@ def test_read_invalid_json(spark_tmp_table_factory, std_input_path, read_func, f
@pytest.mark.parametrize('v1_enabled_list', ["", "json"])
def test_read_valid_json(spark_tmp_table_factory, std_input_path, read_func, filename, schema, v1_enabled_list):
conf = copy_and_update(_enable_all_types_conf,
-{'spark.sql.sources.useV1SourceList': v1_enabled_list,
-'spark.rapids.sql.json.read.mixedTypesAsString.enabled': True})
+{'spark.sql.sources.useV1SourceList': v1_enabled_list})
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
@@ -898,11 +897,10 @@ def test_from_json_struct_of_list(schema):
@pytest.mark.xfail(reason = 'https://github.com/NVIDIA/spark-rapids/issues/10351')
def test_from_json_mixed_types_list_struct(schema):
json_string_gen = StringGen(r'{"a": (\[1,2,3\]|{"b":"[a-z]{2}"}) }')
-conf = copy_and_update(_enable_all_types_conf, {'spark.rapids.sql.json.read.mixedTypesAsString.enabled': 'true'})
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, json_string_gen) \
.select('a', f.from_json('a', schema)),
-conf=conf)
+conf=_enable_all_types_conf)

@pytest.mark.parametrize('schema', ['struct<a:string>', 'struct<a:string,b:int>'])
@allow_non_gpu(*non_utc_allow)
6 changes: 2 additions & 4 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -3689,8 +3689,7 @@ object GpuOverrides extends Logging {

override def convertToGpu(child: Expression): GpuExpression =
// GPU implementation currently does not support duplicated json key names in input
-GpuJsonToStructs(a.schema, a.options, child, conf.isJsonMixedTypesAsStringEnabled,
-a.timeZoneId)
+GpuJsonToStructs(a.schema, a.options, child, a.timeZoneId)
}).disabledByDefault("it is currently in beta and undergoes continuous enhancements."+
" Please consult the "+
"[compatibility documentation](../compatibility.md#json-supporting-types)"+
@@ -3883,8 +3882,7 @@ object GpuOverrides extends Logging {
a.dataFilters,
conf.maxReadBatchSizeRows,
conf.maxReadBatchSizeBytes,
-conf.maxGpuColumnSizeBytes,
-conf.isJsonMixedTypesAsStringEnabled)
+conf.maxGpuColumnSizeBytes)
})).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap

val scans: Map[Class[_ <: Scan], ScanRule[_ <: Scan]] =
8 changes: 0 additions & 8 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -1239,12 +1239,6 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
.booleanConf
.createWithDefault(true)

-val ENABLE_READ_JSON_MIXED_TYPES_AS_STRING =
-conf("spark.rapids.sql.json.read.mixedTypesAsString.enabled")
-.doc("JSON reading is not 100% compatible when reading mixed types as string.")
-.booleanConf
-.createWithDefault(false)
-
val ENABLE_AVRO = conf("spark.rapids.sql.format.avro.enabled")
.doc("When set to true enables all avro input and output acceleration. " +
"(only input is currently supported anyways)")
@@ -2686,8 +2680,6 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

lazy val isJsonDecimalReadEnabled: Boolean = get(ENABLE_READ_JSON_DECIMALS)

-lazy val isJsonMixedTypesAsStringEnabled: Boolean = get(ENABLE_READ_JSON_MIXED_TYPES_AS_STRING)
-
lazy val isAvroEnabled: Boolean = get(ENABLE_AVRO)

lazy val isAvroReadEnabled: Boolean = get(ENABLE_AVRO_READ)
16 changes: 6 additions & 10 deletions GpuJsonScan.scala
@@ -246,8 +246,7 @@ case class GpuJsonScan(
dataFilters: Seq[Expression],
maxReaderBatchSizeRows: Integer,
maxReaderBatchSizeBytes: Long,
-maxGpuColumnSizeBytes: Long,
-mixedTypesAsStringEnabled: Boolean)
+maxGpuColumnSizeBytes: Long)
extends TextBasedFileScan(sparkSession, options) with GpuScan {

private lazy val parsedOptions: JSONOptions = new JSONOptions(
@@ -270,8 +269,7 @@

GpuJsonPartitionReaderFactory(sparkSession.sessionState.conf, broadcastedConf,
dataSchema, readDataSchema, readPartitionSchema, parsedOptions, maxReaderBatchSizeRows,
-maxReaderBatchSizeBytes, maxGpuColumnSizeBytes, metrics, options.asScala.toMap,
-mixedTypesAsStringEnabled)
+maxReaderBatchSizeBytes, maxGpuColumnSizeBytes, metrics, options.asScala.toMap)
}

override def withInputFile(): GpuScan = this
@@ -289,8 +287,7 @@ case class GpuJsonPartitionReaderFactory(
maxReaderBatchSizeBytes: Long,
maxGpuColumnSizeBytes: Long,
metrics: Map[String, GpuMetric],
-@transient params: Map[String, String],
-mixedTypesAsStringEnabled: Boolean) extends ShimFilePartitionReaderFactory(params) {
+@transient params: Map[String, String]) extends ShimFilePartitionReaderFactory(params) {

override def buildReader(partitionedFile: PartitionedFile): PartitionReader[InternalRow] = {
throw new IllegalStateException("ROW BASED PARSING IS NOT SUPPORTED ON THE GPU...")
@@ -300,7 +297,7 @@
val conf = broadcastedConf.value.value
val reader = new PartitionReaderWithBytesRead(new JsonPartitionReader(conf, partFile,
dataSchema, readDataSchema, parsedOptions, maxReaderBatchSizeRows, maxReaderBatchSizeBytes,
-metrics, mixedTypesAsStringEnabled))
+metrics))
ColumnarPartitionReaderWithPartitionValues.newReader(partFile, reader, partitionSchema,
maxGpuColumnSizeBytes)
}
@@ -346,14 +343,13 @@ class JsonPartitionReader(
parsedOptions: JSONOptions,
maxRowsPerChunk: Integer,
maxBytesPerChunk: Long,
-execMetrics: Map[String, GpuMetric],
-enableMixedTypesAsString: Boolean)
+execMetrics: Map[String, GpuMetric])
extends GpuTextBasedPartitionReader[HostLineBufferer, HostLineBuffererFactory.type](conf,
partFile, dataSchema, readDataSchema, parsedOptions.lineSeparatorInRead, maxRowsPerChunk,
maxBytesPerChunk, execMetrics, HostLineBuffererFactory) {

def buildJsonOptions(parsedOptions: JSONOptions): cudf.JSONOptions =
-GpuJsonReadCommon.cudfJsonOptions(parsedOptions, enableMixedTypesAsString)
+GpuJsonReadCommon.cudfJsonOptions(parsedOptions)

/**
* Read the host buffer to GPU table
3 changes: 1 addition & 2 deletions GpuReadJsonFileFormat.scala
@@ -68,8 +68,7 @@ class GpuReadJsonFileFormat extends JsonFileFormat with GpuReadFileFormatWithMet
rapidsConf.maxReadBatchSizeBytes,
rapidsConf.maxGpuColumnSizeBytes,
metrics,
-options,
-rapidsConf.isJsonMixedTypesAsStringEnabled)
+options)
PartitionReaderIterator.buildReader(factory)
}

5 changes: 2 additions & 3 deletions GpuJsonReadCommon.scala
@@ -362,11 +362,10 @@ object GpuJsonReadCommon {
}
}

-def cudfJsonOptions(options: JSONOptions,
-enableMixedTypes: Boolean): ai.rapids.cudf.JSONOptions = {
+def cudfJsonOptions(options: JSONOptions): ai.rapids.cudf.JSONOptions = {
ai.rapids.cudf.JSONOptions.builder()
.withRecoverWithNull(true)
-.withMixedTypesAsStrings(enableMixedTypes)
+.withMixedTypesAsStrings(true)
.withNormalizeWhitespace(true)
.withKeepQuotes(true)
.withNormalizeSingleQuotes(options.allowSingleQuotes)
3 changes: 1 addition & 2 deletions GpuJsonToStructs.scala
@@ -69,7 +69,6 @@ case class GpuJsonToStructs(
schema: DataType,
options: Map[String, String],
child: Expression,
-enableMixedTypesAsString: Boolean,
timeZoneId: Option[String] = None)
extends GpuUnaryExpression with TimeZoneAwareExpression with ExpectsInputTypes
with NullIntolerant {
@@ -155,7 +154,7 @@ case class GpuJsonToStructs(
SQLConf.get.columnNameOfCorruptRecord)

private lazy val jsonOptions =
-GpuJsonReadCommon.cudfJsonOptions(parsedOptions, enableMixedTypesAsString)
+GpuJsonReadCommon.cudfJsonOptions(parsedOptions)

override protected def doColumnar(input: GpuColumnVector): cudf.ColumnVector = {
schema match {