diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index 920b3392854c9..54f9fd4395489 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -597,6 +597,7 @@ Below is a list of all the keywords in Spark SQL.
 |NANOSECONDS|non-reserved|non-reserved|non-reserved|
 |NATURAL|reserved|strict-non-reserved|reserved|
 |NO|non-reserved|non-reserved|reserved|
+|NONE|non-reserved|non-reserved|reserved|
 |NOT|reserved|non-reserved|reserved|
 |NULL|reserved|non-reserved|reserved|
 |NULLS|non-reserved|non-reserved|non-reserved|
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
index 85a4633e80502..bde298c23e786 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
@@ -316,6 +316,7 @@ NANOSECOND: 'NANOSECOND';
 NANOSECONDS: 'NANOSECONDS';
 NATURAL: 'NATURAL';
 NO: 'NO';
+NONE: 'NONE';
 NOT: 'NOT';
 NULL: 'NULL';
 NULLS: 'NULLS';
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 54eff14b6d4df..2f5bf8bbfec14 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -174,6 +174,8 @@ statement
     | ALTER TABLE identifierReference
         (partitionSpec)? SET locationSpec                              #setTableLocation
     | ALTER TABLE identifierReference RECOVER PARTITIONS               #recoverPartitions
+    | ALTER TABLE identifierReference
+        (clusterBySpec | CLUSTER BY NONE)                              #alterClusterBy
     | DROP TABLE (IF EXISTS)? identifierReference PURGE?               #dropTable
     | DROP VIEW (IF EXISTS)? identifierReference                       #dropView
     | CREATE (OR REPLACE)? (GLOBAL? TEMPORARY)?
@@ -1572,6 +1574,7 @@ ansiNonReserved
     | NANOSECOND
     | NANOSECONDS
     | NO
+    | NONE
     | NULLS
     | NUMERIC
     | OF
@@ -1920,6 +1923,7 @@ nonReserved
     | NANOSECOND
     | NANOSECONDS
     | NO
+    | NONE
     | NOT
     | NULL
     | NULLS
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableChange.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableChange.java
index ebecb6f507e6a..117f1748e209b 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableChange.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableChange.java
@@ -22,6 +22,7 @@
 import javax.annotation.Nullable;
 
 import org.apache.spark.annotation.Evolving;
+import org.apache.spark.sql.connector.expressions.NamedReference;
 import org.apache.spark.sql.types.DataType;
 
 /**
@@ -248,6 +249,17 @@ static TableChange deleteColumn(String[] fieldNames, Boolean ifExists) {
     return new DeleteColumn(fieldNames, ifExists);
   }
 
+  /**
+   * Create a TableChange for changing clustering columns for a table.
+   *
+   * @param clusteringColumns clustering columns to change to. Each clustering column represents
+   *                          field names.
+   * @return a TableChange for this assignment
+   */
+  static TableChange clusterBy(NamedReference[] clusteringColumns) {
+    return new ClusterBy(clusteringColumns);
+  }
+
   /**
    * A TableChange to set a table property.
    *
@@ -752,4 +764,27 @@ public int hashCode() {
     }
   }
 
+  /** A TableChange to alter clustering columns for a table. */
+  final class ClusterBy implements TableChange {
+    private final NamedReference[] clusteringColumns;
+
+    private ClusterBy(NamedReference[] clusteringColumns) {
+      this.clusteringColumns = clusteringColumns;
+    }
+
+    public NamedReference[] clusteringColumns() { return clusteringColumns; }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+      ClusterBy that = (ClusterBy) o;
+      return Arrays.equals(clusteringColumns, that.clusteringColumns());
+    }
+
+    // equals without a matching hashCode breaks the Object contract; keep them paired.
+    @Override
+    public int hashCode() {
+      return Arrays.hashCode(clusteringColumns);
+    }
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index d55b9c972697e..c281b0df8a6da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -231,6 +231,14 @@ object ClusterBySpec {
       case ClusterByTransform(columnNames) => ClusterBySpec(columnNames)
     }
   }
+
+  def extractClusterByTransform(
+      schema: StructType,
+      clusterBySpec: ClusterBySpec,
+      resolver: Resolver): ClusterByTransform = {
+    val normalizedClusterBySpec = normalizeClusterBySpec(schema, clusterBySpec, resolver)
+    ClusterByTransform(normalizedClusterBySpec.columnNames)
+  }
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index dc43bd1636594..7f93e993c6fa8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -4594,6 +4594,25 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
       ifExists)
   }
 
+  /**
+   * Parse an [[AlterTableClusterBy]] command.
+   *
+   * For example:
+   * {{{
+   *   ALTER TABLE table1 CLUSTER BY (a.b.c)
+   *   ALTER TABLE table1 CLUSTER BY NONE
+   * }}}
+   */
+  override def visitAlterClusterBy(ctx: AlterClusterByContext): LogicalPlan = withOrigin(ctx) {
+    val table = createUnresolvedTable(ctx.identifierReference, "ALTER TABLE ... CLUSTER BY")
+    if (ctx.NONE() != null) {
+      AlterTableClusterBy(table, None)
+    } else {
+      assert(ctx.clusterBySpec() != null)
+      AlterTableClusterBy(table, Some(visitClusterBySpec(ctx.clusterBySpec())))
+    }
+  }
+
   /**
    * Parse [[SetViewProperties]] or [[SetTableProperties]] commands.
    *
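Taken together, the grammar and API changes above mean a DataSource V2 connector sees `ALTER TABLE ... CLUSTER BY` as just another `TableChange`. A minimal sketch of how a custom catalog might consume it (hypothetical connector code, not part of this patch; `setClustering`/`dropClustering` are stand-ins for whatever the connector's metadata layer provides):

```scala
import org.apache.spark.sql.connector.catalog.TableChange
import org.apache.spark.sql.connector.catalog.TableChange.ClusterBy

object ClusterByHandler {
  // Stand-in hooks; a real connector would persist these to its metadata store.
  def setClustering(columns: Seq[String]): Unit = ()
  def dropClustering(): Unit = ()

  def apply(changes: Seq[TableChange]): Unit = changes.foreach {
    case c: ClusterBy if c.clusteringColumns().isEmpty =>
      // ALTER TABLE ... CLUSTER BY NONE arrives as an empty column array.
      dropClustering()
    case c: ClusterBy =>
      // Each NamedReference may be nested, e.g. FieldReference(Seq("a", "b")).
      setClustering(c.clusteringColumns().map(_.fieldNames().mkString(".")).toSeq)
    case _ => // other table changes are handled elsewhere
  }
}
```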
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala
index 9c66e68d686d5..2f5d4b9c86e25 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 
 import org.apache.spark.sql.catalyst.analysis.{FieldName, FieldPosition, ResolvedFieldName}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
+import org.apache.spark.sql.catalyst.catalog.ClusterBySpec
 import org.apache.spark.sql.catalyst.util.{ResolveDefaultColumns, TypeUtils}
 import org.apache.spark.sql.connector.catalog.{TableCatalog, TableChange}
 import org.apache.spark.sql.errors.QueryCompilationErrors
@@ -244,3 +245,19 @@ case class AlterColumn(
   override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan =
     copy(table = newChild)
 }
+
+/**
+ * The logical plan of the following commands:
+ * - ALTER TABLE ... CLUSTER BY (col1, col2, ...)
+ * - ALTER TABLE ... CLUSTER BY NONE
+ */
+case class AlterTableClusterBy(
+    table: LogicalPlan, clusterBySpec: Option[ClusterBySpec]) extends AlterTableCommand {
+  override def changes: Seq[TableChange] = {
+    Seq(TableChange.clusterBy(clusterBySpec
+      .map(_.columnNames.toArray) // CLUSTER BY (col1, col2, ...)
+      .getOrElse(Array.empty))) // CLUSTER BY NONE
+  }
+
+  protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = copy(table = newChild)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala
index f36310e8ad899..c5888d72c2b23 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala
@@ -26,13 +26,14 @@ import org.apache.spark.SparkIllegalArgumentException
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.CurrentUserContext
 import org.apache.spark.sql.catalyst.analysis.{AsOfTimestamp, AsOfVersion, NamedRelation, NoSuchDatabaseException, NoSuchFunctionException, NoSuchNamespaceException, NoSuchTableException, TimeTravelSpec}
+import org.apache.spark.sql.catalyst.catalog.ClusterBySpec
 import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.catalyst.plans.logical.{SerdeInfo, TableSpec}
 import org.apache.spark.sql.catalyst.util.GeneratedColumn
 import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns._
 import org.apache.spark.sql.connector.catalog.TableChange._
 import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
-import org.apache.spark.sql.connector.expressions.LiteralValue
+import org.apache.spark.sql.connector.expressions.{ClusterByTransform, LiteralValue, Transform}
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
 import org.apache.spark.sql.types.{ArrayType, MapType, Metadata, MetadataBuilder, StructField, StructType}
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -134,6 +135,61 @@ private[sql] object CatalogV2Util {
     Collections.unmodifiableMap(newProperties)
   }
 
+  /**
+   * Apply ClusterBy changes to a map and return the result.
+   */
+  def applyClusterByChanges(
+      properties: Map[String, String],
+      schema: StructType,
+      changes: Seq[TableChange]): Map[String, String] = {
+    applyClusterByChanges(properties.asJava, schema, changes).asScala.toMap
+  }
+
+  /**
+   * Apply ClusterBy changes to a Java map and return the result.
+   */
+  def applyClusterByChanges(
+      properties: util.Map[String, String],
+      schema: StructType,
+      changes: Seq[TableChange]): util.Map[String, String] = {
+    val newProperties = new util.HashMap[String, String](properties)
+
+    changes.foreach {
+      case clusterBy: ClusterBy =>
+        val clusterByProp =
+          ClusterBySpec.toProperty(
+            schema,
+            ClusterBySpec(clusterBy.clusteringColumns.toIndexedSeq),
+            conf.resolver)
+        newProperties.put(clusterByProp._1, clusterByProp._2)
+
+      case _ =>
+        // ignore non-clustering changes
+    }
+
+    Collections.unmodifiableMap(newProperties)
+  }
+
+  /**
+   * Apply ClusterBy changes to the partitioning transforms and return the result.
+   */
+  def applyClusterByChanges(
+      partitioning: Array[Transform],
+      schema: StructType,
+      changes: Seq[TableChange]): Array[Transform] = {
+
+    val newPartitioning = partitioning.filterNot(_.isInstanceOf[ClusterByTransform]).toBuffer
+    changes.foreach {
+      case clusterBy: ClusterBy =>
+        newPartitioning += ClusterBySpec.extractClusterByTransform(
+          schema, ClusterBySpec(clusterBy.clusteringColumns.toIndexedSeq), conf.resolver)
+
+      case _ =>
+        // ignore non-clustering changes
+    }
+    newPartitioning.toArray
+  }
+
   /**
    * Apply schema changes to a schema and return the result.
    */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
index 8612a6e9c50ff..16bc751aab88a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
@@ -21,6 +21,7 @@ import java.util.Locale
 
 import org.apache.spark.SparkThrowable
 import org.apache.spark.sql.catalyst.analysis._
+import org.apache.spark.sql.catalyst.catalog.ClusterBySpec
 import org.apache.spark.sql.catalyst.expressions.{EqualTo, Hex, Literal}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition.{after, first}
@@ -235,6 +236,23 @@ class DDLParserSuite extends AnalysisTest {
     }
   }
 
+  test("alter table cluster by") {
+    comparePlans(
+      parsePlan("ALTER TABLE table_name CLUSTER BY (`a.b`, c.d, none)"),
+      AlterTableClusterBy(
+        UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"),
+        Some(ClusterBySpec(Seq(
+          FieldReference(Seq("a.b")),
+          FieldReference(Seq("c", "d")),
+          FieldReference(Seq("none")))))))
+
+    comparePlans(
+      parsePlan("ALTER TABLE table_name CLUSTER BY NONE"),
+      AlterTableClusterBy(
+        UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"),
+        None))
+  }
+
   test("create/replace table - with comment") {
     val createSql = "CREATE TABLE my_tab(a INT, b STRING) USING parquet COMMENT 'abc'"
     val replaceSql = "REPLACE TABLE my_tab(a INT, b STRING) USING parquet COMMENT 'abc'"
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableCatalog.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableCatalog.scala
index d511477ef5d33..654fa0719cf82 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableCatalog.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableCatalog.scala
@@ -133,13 +133,14 @@ class BasicInMemoryTableCatalog extends TableCatalog {
     val table = loadTable(ident).asInstanceOf[InMemoryTable]
     val properties = CatalogV2Util.applyPropertiesChanges(table.properties, changes)
     val schema = CatalogV2Util.applySchemaChanges(table.schema, changes, None, "ALTER TABLE")
+    val finalPartitioning = CatalogV2Util.applyClusterByChanges(table.partitioning, schema, changes)
 
     // fail if the last column in the schema was dropped
     if (schema.fields.isEmpty) {
       throw new IllegalArgumentException(s"Cannot drop all fields")
     }
 
-    val newTable = new InMemoryTable(table.name, schema, table.partitioning, properties)
+    val newTable = new InMemoryTable(table.name, schema, finalPartitioning, properties)
       .withData(table.data)
 
     tables.put(ident, newTable)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
index 169aad2f234d6..d8fa48a72cf81 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
@@ -103,6 +103,15 @@
           builder.build())
       AlterTableChangeColumnCommand(table.catalogTable.identifier, colName, newColumn)
 
+    case AlterTableClusterBy(ResolvedTable(catalog, ident, table: V1Table, _), clusterBySpecOpt)
+        if isSessionCatalog(catalog) =>
+      val prop = clusterBySpecOpt.map { clusterBySpec =>
+        Map(ClusterBySpec.toProperty(table.schema, clusterBySpec, conf.resolver))
+      }.getOrElse {
+        Map(ClusterBySpec.toProperty(table.schema, ClusterBySpec(Nil), conf.resolver))
+      }
+      AlterTableSetPropertiesCommand(table.catalogTable.identifier, prop, isView = false)
+
     case RenameColumn(ResolvedV1TableIdentifier(ident), _, _) =>
       throw QueryCompilationErrors.unsupportedTableOperationError(ident, "RENAME COLUMN")
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala
index 3d6de985a62f5..e619c59a7540c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala
@@ -284,10 +284,11 @@ class V2SessionCatalog(catalog: SessionCatalog)
       catalogTable.storage
     }
 
+    val finalProperties = CatalogV2Util.applyClusterByChanges(properties, schema, changes)
     try {
       catalog.alterTable(
         catalogTable.copy(
-          properties = properties, schema = schema, owner = owner, comment = comment,
+          properties = finalProperties, schema = schema, owner = owner, comment = comment,
           storage = storage))
     } catch {
       case _: NoSuchTableException =>
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
index cabbfa520d77a..e03e0f0e3d638 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
@@ -203,6 +203,7 @@ NANOSECOND false
 NANOSECONDS false
 NATURAL true
 NO false
+NONE false
 NOT true
 NULL true
 NULLS false
diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
index e304509aa6d75..e5a371925b1dc 100644
--- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
@@ -203,6 +203,7 @@ NANOSECOND false
 NANOSECONDS false
 NATURAL false
 NO false
+NONE false
 NOT false
 NULL false
 NULLS false
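The unified suite added below pins down the command's end-to-end semantics. Roughly (sketch assuming an active `spark` session and a Parquet table):

```scala
spark.sql("CREATE TABLE t (id INT, data STRING) USING parquet CLUSTER BY (id, data)")
spark.sql("ALTER TABLE t CLUSTER BY (data, id)") // replaces the clustering columns
spark.sql("ALTER TABLE t CLUSTER BY NONE")       // removes clustering entirely
spark.sql("ALTER TABLE t CLUSTER BY (id)")       // clustering can be re-added later
```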
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala
new file mode 100644
index 0000000000000..8961019f3f8d1
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.command
+
+import org.apache.spark.sql.{AnalysisException, QueryTest}
+
+/**
+ * This base suite contains unified tests for the `ALTER TABLE ... CLUSTER BY` command that
+ * check V1 and V2 table catalogs. The tests that cannot run for all supported catalogs are
+ * located in more specific test suites:
+ *
+ *   - V2 table catalog tests: `org.apache.spark.sql.execution.command.v2.AlterTableClusterBySuite`
+ *   - V1 table catalog tests:
+ *     `org.apache.spark.sql.execution.command.v1.AlterTableClusterBySuiteBase`
+ *     - V1 In-Memory catalog: `org.apache.spark.sql.execution.command.v1.AlterTableClusterBySuite`
+ *     - V1 Hive External catalog:
+ *       `org.apache.spark.sql.hive.execution.command.AlterTableClusterBySuite`
+ */
+trait AlterTableClusterBySuiteBase extends QueryTest with DDLCommandTestUtils {
+  override val command = "ALTER TABLE CLUSTER BY"
+
+  protected val nestedColumnSchema: String =
+    "col1 INT, col2 STRUCT<col3 INT, `col4 1` INT>, col3 STRUCT<`col4.1` INT>"
+  protected val nestedClusteringColumns: Seq[String] =
+    Seq("col2.col3", "col2.`col4 1`", "col3.`col4.1`")
+  protected val nestedClusteringColumnsNew: Seq[String] =
+    Seq("col3.`col4.1`", "col2.`col4 1`", "col2.col3")
+
+  def validateClusterBy(tableName: String, clusteringColumns: Seq[String]): Unit
+
+  test("test basic ALTER TABLE with clustering columns") {
+    withNamespaceAndTable("ns", "table") { tbl =>
+      spark.sql(s"CREATE TABLE $tbl (id INT, data STRING) $defaultUsing CLUSTER BY (id, data)")
+      validateClusterBy(tbl, Seq("id", "data"))
+      spark.sql(s"ALTER TABLE $tbl CLUSTER BY (data, id)")
+      validateClusterBy(tbl, Seq("data", "id"))
+      spark.sql(s"ALTER TABLE $tbl CLUSTER BY NONE")
+      validateClusterBy(tbl, Seq.empty)
+      spark.sql(s"ALTER TABLE $tbl CLUSTER BY (id)")
+      validateClusterBy(tbl, Seq("id"))
+    }
+  }
+
+  test("test clustering columns with comma") {
+    withNamespaceAndTable("ns", "table") { tbl =>
+      spark.sql(s"CREATE TABLE $tbl (`i,d` INT, data STRING) $defaultUsing " +
+        "CLUSTER BY (`i,d`, data)")
+      validateClusterBy(tbl, Seq("`i,d`", "data"))
+      spark.sql(s"ALTER TABLE $tbl CLUSTER BY (data, `i,d`)")
+      validateClusterBy(tbl, Seq("data", "`i,d`"))
+    }
+  }
+
+  test("test nested clustering columns") {
+    withNamespaceAndTable("ns", "table") { tbl =>
+      spark.sql(s"CREATE TABLE $tbl " +
+        s"($nestedColumnSchema) " +
+        s"$defaultUsing CLUSTER BY (${nestedClusteringColumns.mkString(",")})")
+      validateClusterBy(tbl, nestedClusteringColumns)
+      spark.sql(s"ALTER TABLE $tbl CLUSTER BY (${nestedClusteringColumnsNew.mkString(",")})")
+      validateClusterBy(tbl, nestedClusteringColumnsNew)
+    }
+  }
+
+  test("clustering columns not defined in schema") {
+    withNamespaceAndTable("ns", "table") { tbl =>
+      sql(s"CREATE TABLE $tbl (id bigint, data string) $defaultUsing CLUSTER BY (id)")
+      val err = intercept[AnalysisException] {
+        sql(s"ALTER TABLE $tbl CLUSTER BY (unknown)")
+      }
+      assert(err.message.contains("Couldn't find column unknown in:"))
+    }
+  }
+
+  // Splits a three-part table name (catalog.namespace.table) into its parts.
+  protected def parseTableName(threePartTableName: String): (String, String, String) = {
+    val tablePath = threePartTableName.split('.')
+    assert(tablePath.length === 3)
+    (tablePath(0), tablePath(1), tablePath(2))
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala
index c80cedca29f7c..2588aa4313fa7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DescribeTableSuiteBase.scala
@@ -199,4 +199,46 @@ trait DescribeTableSuiteBase extends QueryTest with DDLCommandTestUtils {
           Row("col2.x", "int", null)))
     }
   }
+
+  test("describe a clustered table - alter table cluster by") {
+    withNamespaceAndTable("ns", "tbl") { tbl =>
+      sql(s"CREATE TABLE $tbl (col1 STRING COMMENT 'this is comment', col2 struct<x int, y int>) " +
+        s"$defaultUsing CLUSTER BY (col1, col2.x)")
+      sql(s"ALTER TABLE $tbl CLUSTER BY (col2.y, col1)")
+      val descriptionDf = sql(s"DESC $tbl")
+      assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === Seq(
+        ("col_name", StringType),
+        ("data_type", StringType),
+        ("comment", StringType)))
+      QueryTest.checkAnswer(
+        descriptionDf,
+        Seq(
+          Row("col1", "string", "this is comment"),
+          Row("col2", "struct<x:int,y:int>", null),
+          Row("# Clustering Information", "", ""),
+          Row("# col_name", "data_type", "comment"),
+          Row("col2.y", "int", null),
+          Row("col1", "string", "this is comment")))
+    }
+  }
+
+  test("describe a clustered table - alter table cluster by none") {
+    withNamespaceAndTable("ns", "tbl") { tbl =>
+      sql(s"CREATE TABLE $tbl (col1 STRING COMMENT 'this is comment', col2 struct<x int, y int>) " +
+        s"$defaultUsing CLUSTER BY (col1, col2.x)")
+      sql(s"ALTER TABLE $tbl CLUSTER BY NONE")
+      val descriptionDf = sql(s"DESC $tbl")
+      assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === Seq(
+        ("col_name", StringType),
+        ("data_type", StringType),
+        ("comment", StringType)))
+      QueryTest.checkAnswer(
+        descriptionDf,
+        Seq(
+          Row("col1", "string", "this is comment"),
+          Row("col2", "struct<x:int,y:int>", null),
+          Row("# Clustering Information", "", ""),
+          Row("# col_name", "data_type", "comment")))
+    }
+  }
 }
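As the DESC tests show, the clustering section lists columns in the new `CLUSTER BY` order rather than schema order; `CLUSTER BY NONE` leaves the section header with no columns under it. A sketch of the rendered output (abbreviated, formatting approximate):

```scala
spark.sql("ALTER TABLE ns.tbl CLUSTER BY (col2.y, col1)")
spark.sql("DESC ns.tbl").show(truncate = false)
// ...
// # Clustering Information
// # col_name  data_type  comment
// col2.y      int        null
// col1        string     this is comment
```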
+ */ + +package org.apache.spark.sql.execution.command.v1 + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.ClusterBySpec +import org.apache.spark.sql.connector.expressions.FieldReference +import org.apache.spark.sql.execution.command + +/** + * This base suite contains unified tests for the `ALTER TABLE ... CLUSTER BY` command that + * checks V1 table catalogs. The tests that cannot run for all V1 catalogs are located in more + * specific test suites: + * + * - V1 In-Memory catalog: `org.apache.spark.sql.execution.command.v1.AlterTableClusterBySuite` + * - V1 Hive External catalog: + * `org.apache.spark.sql.hive.execution.command.AlterTableClusterBySuite` + */ +trait AlterTableClusterBySuiteBase extends command.AlterTableClusterBySuiteBase + with command.TestsV1AndV2Commands { + override def validateClusterBy(tableName: String, clusteringColumns: Seq[String]): Unit = { + val catalog = spark.sessionState.catalog + val (_, db, t) = parseTableName(tableName) + val table = catalog.getTableMetadata(TableIdentifier.apply(t, Some(db))) + assert(table.clusterBySpec === Some(ClusterBySpec(clusteringColumns.map(FieldReference(_))))) + } +} + +/** + * The class contains tests for the `ALTER TABLE ... CLUSTER BY` command to check V1 In-Memory + * table catalog. + */ +class AlterTableClusterBySuite extends AlterTableClusterBySuiteBase + with CommandSuiteBase { + override def commandVersion: String = super[AlterTableClusterBySuiteBase].commandVersion +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala new file mode 100644 index 0000000000000..bbbe6cd758756 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command.v2 + +import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryTable} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper +import org.apache.spark.sql.connector.expressions.{ClusterByTransform, FieldReference} +import org.apache.spark.sql.execution.command + +/** + * The class contains tests for the `ALTER TABLE ... CLUSTER BY` command to check V2 table + * catalogs. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala
new file mode 100644
index 0000000000000..bbbe6cd758756
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.command.v2
+
+import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryTable}
+import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper
+import org.apache.spark.sql.connector.expressions.{ClusterByTransform, FieldReference}
+import org.apache.spark.sql.execution.command
+
+/**
+ * The class contains tests for the `ALTER TABLE ... CLUSTER BY` command to check V2 table
+ * catalogs.
+ */
+class AlterTableClusterBySuite extends command.AlterTableClusterBySuiteBase
+  with CommandSuiteBase {
+  override def validateClusterBy(tableName: String, clusteringColumns: Seq[String]): Unit = {
+    val (catalog, namespace, table) = parseTableName(tableName)
+    val catalogPlugin = spark.sessionState.catalogManager.catalog(catalog)
+    val partTable = catalogPlugin.asTableCatalog
+      .loadTable(Identifier.of(Array(namespace), table))
+      .asInstanceOf[InMemoryTable]
+    assert(partTable.partitioning ===
+      Array(ClusterByTransform(clusteringColumns.map(FieldReference(_)))))
+  }
+
+  test("test REPLACE TABLE with clustering columns") {
+    withNamespaceAndTable("ns", "table") { tbl =>
+      spark.sql(s"CREATE TABLE $tbl (id INT) $defaultUsing CLUSTER BY (id)")
+      validateClusterBy(tbl, Seq("id"))
+
+      spark.sql(s"REPLACE TABLE $tbl (id INT, id2 INT) $defaultUsing CLUSTER BY (id2)")
+      validateClusterBy(tbl, Seq("id2"))
+
+      spark.sql(s"ALTER TABLE $tbl CLUSTER BY (id)")
+      validateClusterBy(tbl, Seq("id"))
+    }
+  }
+}
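In the V2 path, clustering rides along in `Table.partitioning()` as a `ClusterByTransform`; `applyClusterByChanges` drops any previous transform and appends the new one, so `CLUSTER BY NONE` leaves a transform with an empty column list. Sketch of what the in-memory table ends up holding:

```scala
import org.apache.spark.sql.connector.expressions.{ClusterByTransform, FieldReference}

val afterAlter = ClusterByTransform(Seq(FieldReference("id"))) // ALTER TABLE ... CLUSTER BY (id)
val afterNone  = ClusterByTransform(Seq.empty)                 // ALTER TABLE ... CLUSTER BY NONE
```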
"ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONSTRAINT,CONTAINS,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DOUBLE,DROP,ELSE,END,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXPLAIN,EXPORT,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GLOBAL,GRANT,GROUP,GROUPING,HAVING,HOUR,HOURS,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,JOIN,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEFT,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,MACRO,MAP,MATCHED,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,REDUCE,REFERENCES,REFRESH,RENAME,REPAIR,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,START,STATISTICS,STORED,STRATIFY,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UPDATE,USE,USER,USING,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WINDOW,WITH,WITHIN,X,YEAR,YEARS,ZONE") + assert(infoValue.getStringValue == 
"ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONSTRAINT,CONTAINS,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DOUBLE,DROP,ELSE,END,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXPLAIN,EXPORT,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GLOBAL,GRANT,GROUP,GROUPING,HAVING,HOUR,HOURS,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,JOIN,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEFT,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,MACRO,MAP,MATCHED,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NONE,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,REDUCE,REFERENCES,REFRESH,RENAME,REPAIR,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,START,STATISTICS,STORED,STRATIFY,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UPDATE,USE,USER,USING,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WINDOW,WITH,WITHIN,X,YEAR,YEARS,ZONE") // scalastyle:on line.size.limit } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableClusterBySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableClusterBySuite.scala new file mode 100644 index 0000000000000..0a8fca29150ec --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableClusterBySuite.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableClusterBySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableClusterBySuite.scala
new file mode 100644
index 0000000000000..0a8fca29150ec
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableClusterBySuite.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution.command
+
+import org.apache.spark.sql.execution.command.v1
+
+/**
+ * The class contains tests for the `ALTER TABLE ... CLUSTER BY` command to check V1 Hive external
+ * table catalog.
+ */
+class AlterTableClusterBySuite extends v1.AlterTableClusterBySuiteBase with CommandSuiteBase {
+  // Hive doesn't support nested column names with space and dot.
+  override protected val nestedColumnSchema: String =
+    "col1 INT, col2 STRUCT<col3 INT, col4 INT>"
+  override protected val nestedClusteringColumns: Seq[String] =
+    Seq("col2.col3")
+  override protected val nestedClusteringColumnsNew: Seq[String] =
+    Seq("col2.col4")
+
+  // Hive catalog doesn't support column names with commas.
+  override def excluded: Seq[String] = Seq(
+    s"$command using Hive V1 catalog V1 command: test clustering columns with comma",
+    s"$command using Hive V1 catalog V2 command: test clustering columns with comma")
+
+  override def commandVersion: String = super[AlterTableClusterBySuiteBase].commandVersion
+}
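The Hive overrides above exist because the Hive metastore cannot represent some column names the base suite exercises. Sketch of statements that pass against the in-memory catalogs but would fail under the Hive catalog (hence the overridden schema and excluded tests):

```scala
// Comma in a column name: not representable in the Hive metastore.
spark.sql("CREATE TABLE t1 (`i,d` INT) USING parquet CLUSTER BY (`i,d`)")
// Space or dot in a nested field name: likewise unsupported by Hive.
spark.sql("CREATE TABLE t2 (c STRUCT<`x y` INT>) USING parquet CLUSTER BY (c.`x y`)")
```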