NVIDIA · parthosa · Mar 27, 2024 · Mar 25, 2024 · Mar 27, 2024 · Mar 27, 2024
diff --git a/...cala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala b/...cala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,8 +20,6 @@
 {"spark": "341"}
 {"spark": "341db"}
 {"spark": "342"}
-{"spark": "350"}
-{"spark": "351"}
 spark-rapids-shim-json-lines ***/
 package org.apache.spark.sql.rapids.shims
 

diff --git a/...cala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala b/...cala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*** spark-rapids-shim-json-lines
+{"spark": "350"}
+{"spark": "351"}
+spark-rapids-shim-json-lines ***/
+package org.apache.spark.sql.rapids.shims
+
+import java.net.URI
+
+import com.nvidia.spark.rapids.GpuDataWritingCommand
+import com.nvidia.spark.rapids.shims.SparkShimImpl
+
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.catalog._
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.util.removeInternalMetadata
+import org.apache.spark.sql.execution.command.{CommandUtils, LeafRunnableCommand}
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.rapids._
+import org.apache.spark.sql.sources.BaseRelation
+
+/**
+ * GPU variant of the create-table-as-select command for data source tables: writes the result
+ * of `query` through [[GpuDataSource]] and, for a new table, registers it in the catalog.
+ */
+case class GpuCreateDataSourceTableAsSelectCommand(
+    table: CatalogTable,
+    mode: SaveMode,
+    query: LogicalPlan,
+    outputColumnNames: Seq[String],
+    origProvider: Class[_])
+  extends LeafRunnableCommand {
+  assert(query.resolved)
+  override def innerChildren: Seq[LogicalPlan] = query :: Nil
+
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    assert(table.tableType != CatalogTableType.VIEW)
+    assert(table.provider.isDefined)
+
+    val catalog = sparkSession.sessionState.catalog
+    val qualifiedIdent = table.identifier.copy(
+      database = Some(table.identifier.database.getOrElse(catalog.getCurrentDatabase)))
+    val tableName = qualifiedIdent.unquotedString
+    if (catalog.tableExists(qualifiedIdent)) {
+      assert(mode != SaveMode.Overwrite,
+        s"Expect the table $tableName has been dropped when the save mode is Overwrite")
+      mode match {
+        case SaveMode.ErrorIfExists => throw new AnalysisException(
+          s"Table $tableName already exists. You need to drop it first.")
+        // Since the table already exists and the save mode is Ignore, we will just return.
+        case SaveMode.Ignore => return Seq.empty
+        case _ => saveDataIntoTable(
+          sparkSession, table, table.storage.locationUri, SaveMode.Append, tableExists = true)
+      }
+    } else {
+      createTableFromQuery(sparkSession)
+    }
+    CommandUtils.updateTableStats(sparkSession, table)
+    Seq.empty[Row]
+  }
+
+  /** Writes `query` out, then creates the catalog entry from the written relation's schema. */
+  private def createTableFromQuery(sparkSession: SparkSession): Unit = {
+    val sessionState = sparkSession.sessionState
+    val catalog = sessionState.catalog
+    table.storage.locationUri.foreach { p =>
+      GpuDataWritingCommand.assertEmptyRootPath(p, mode, sessionState.newHadoopConf)
+    }
+    assert(table.schema.isEmpty)
+    catalog.validateTableLocation(table)
+    val tableLocation = table.tableType match {
+      case CatalogTableType.MANAGED => Some(catalog.defaultTablePath(table.identifier))
+      case _ => table.storage.locationUri
+    }
+    val result = saveDataIntoTable(
+      sparkSession, table, tableLocation, SaveMode.Overwrite, tableExists = false)
+    // Use the relation's schema, not the query's: the provider may change nullability.
+    val newTable = table.copy(
+      storage = table.storage.copy(locationUri = tableLocation),
+      schema = removeInternalMetadata(result.schema))
+    // Table location is already validated. No need to check it again during table creation.
+    catalog.createTable(newTable, ignoreIfExists = false, validateLocation = false)
+    result match {
+      case _: HadoopFsRelation if table.partitionColumnNames.nonEmpty &&
+        sparkSession.sqlContext.conf.manageFilesourcePartitions =>
+        // Need to recover partitions into the metastore so our saved data is visible.
+        sessionState.executePlan(SparkShimImpl.v1RepairTableCommand(table.identifier)).toRdd
+      case _ =>
+    }
+  }
+
+  /** Writes `query` through [[GpuDataSource]] with `mode`; returns the `writeAndRead` result. */
+  private def saveDataIntoTable(
+      session: SparkSession,
+      table: CatalogTable,
+      tableLocation: Option[URI],
+      mode: SaveMode,
+      tableExists: Boolean): BaseRelation = {
+    val props = table.storage.properties
+    val dataSource = GpuDataSource(
+      session,
+      className = table.provider.get,
+      partitionColumns = table.partitionColumnNames,
+      bucketSpec = table.bucketSpec,
+      options = tableLocation.fold(props)(u => props + ("path" -> CatalogUtils.URIToString(u))),
+      catalogTable = if (tableExists) Some(table) else None,
+      origProvider = origProvider)
+    try {
+      dataSource.writeAndRead(mode, query, outputColumnNames)
+    } catch {
+      case ex: AnalysisException =>
+        logError(s"Failed to write to table ${table.identifier.unquotedString}", ex)
+        throw ex
+    }
+  }
+}
diff --git a/...k350/scala/org/apache/spark/sql/rapids/GpuCreateDataSourceTableAsSelectCommandSuite.scala b/...k350/scala/org/apache/spark/sql/rapids/GpuCreateDataSourceTableAsSelectCommandSuite.scala
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*** spark-rapids-shim-json-lines
+{"spark": "350"}
+{"spark": "351"}
+spark-rapids-shim-json-lines ***/
+package org.apache.spark.sql.rapids
+
+import com.nvidia.spark.rapids.FunSuiteWithTempDir
+import com.nvidia.spark.rapids.SparkQueryCompareTestSuite
+
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.catalyst.expressions.FileSourceMetadataAttribute.FILE_SOURCE_METADATA_COL_ATTR_KEY
+import org.apache.spark.sql.connector.catalog.{Column, Identifier}
+import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
+
+/**
+ * Checks that a CTAS run through the GPU session does not store internal column metadata
+ * (keyed by FILE_SOURCE_METADATA_COL_ATTR_KEY) in the catalog's view of the new table.
+ */
+class GpuCreateDataSourceTableAsSelectCommandSuite
+  extends SparkQueryCompareTestSuite
+  with FunSuiteWithTempDir {
+
+  test("SPARK-43123: Metadata column related field metadata should not be leaked to catalogs") {
+    val inputDf = "inputDf"
+    val targetTable = "targetTable"
+    val columnName = "dataColumn"
+    // Create a metadata object having an internal metadata field as its only key
+    val newMetadata = Metadata.fromJson(s"""{"$FILE_SOURCE_METADATA_COL_ATTR_KEY": "dummy"}""")
+    withGpuSparkSession { spark =>
+      withTable(spark, targetTable) {
+        // Expose an empty DataFrame whose single column carries the metadata above
+        val schema = StructType(Array(
+          StructField(columnName, StringType, nullable = true, newMetadata)))
+        val emptyRDD = spark.sparkContext.emptyRDD[Row]
+        spark.createDataFrame(emptyRDD, schema).createOrReplaceTempView(inputDf)
+        try {
+          // Create the target table from the DataFrame (CTAS)
+          spark.sql(s"""
+            |CREATE TABLE $targetTable USING PARQUET
+            |AS SELECT $columnName FROM $inputDf
+            |""".stripMargin)
+
+          // Fetch the created table's columns to verify metadata leakage
+          val tableColumns = getColumns(spark, targetTable)
+          assert(tableColumns.length == 1, "Table should only contain one column.")
+          val firstColumn = tableColumns.head
+          assert(firstColumn.name == columnName, s"Column name should be '$columnName'.")
+          assert(firstColumn.dataType == StringType, "Column type should be StringType.")
+          assert(firstColumn.metadataInJSON() == null, "Column metadata should be empty.")
+        } finally {
+          // The session may be reused by later tests, so do not leave the temp view behind.
+          spark.catalog.dropTempView(inputDf)
+        }
+      }
+    }
+  }
+
+  /** Runs `f` via Utils.tryWithSafeFinally, dropping every table in `tableNames` afterwards. */
+  private def withTable(spark: SparkSession, tableNames: String*)(f: => Unit): Unit = {
+    Utils.tryWithSafeFinally(f) {
+      tableNames.foreach(name => spark.sql(s"DROP TABLE IF EXISTS $name"))
+    }
+  }
+
+  /** Returns the columns of `tableName`, loaded from the session's current catalog. */
+  private def getColumns(spark: SparkSession, tableName: String): Array[Column] = {
+    val catalogManager = spark.sessionState.catalogManager
+    val identifier = Identifier.of(catalogManager.currentNamespace, tableName)
+    catalogManager.currentCatalog.asTableCatalog.loadTable(identifier).columns()
+  }
+}