[SPARK-44886][SQL] Introduce CLUSTER BY clause for CREATE/REPLACE TABLE #42577

Closed · wants to merge 19 commits
Changes from 11 commits
12 changes: 12 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
@@ -2908,6 +2908,18 @@
],
"sqlState" : "42601"
},
"SPECIFY_CLUSTER_BY_WITH_BUCKETING_IS_NOT_ALLOWED" : {
"message" : [
"Cannot specify both CLUSTER BY and CLUSTERED BY INTO BUCKETS."
],
"sqlState" : "42908"
},
"SPECIFY_CLUSTER_BY_WITH_PARTITIONED_BY_IS_NOT_ALLOWED" : {
"message" : [
"Cannot specify both CLUSTER BY and PARTITIONED BY."
],
"sqlState" : "42908"
},
"SPECIFY_PARTITION_IS_NOT_ALLOWED" : {
"message" : [
"A CREATE TABLE without explicit column list cannot specify PARTITIONED BY.",
12 changes: 12 additions & 0 deletions docs/sql-error-conditions.md
@@ -1817,6 +1817,18 @@ A CREATE TABLE without explicit column list cannot specify bucketing information
Please use the form with explicit column list and specify bucketing information.
Alternatively, allow bucketing information to be inferred by omitting the clause.

### SPECIFY_CLUSTER_BY_WITH_BUCKETING_IS_NOT_ALLOWED

[SQLSTATE: 42908](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)

Cannot specify both CLUSTER BY and CLUSTERED BY INTO BUCKETS.

### SPECIFY_CLUSTER_BY_WITH_PARTITIONED_BY_IS_NOT_ALLOWED

[SQLSTATE: 42908](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)

Cannot specify both CLUSTER BY and PARTITIONED BY.
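
For illustration, statements of the following shape trigger the two conditions above at parse time (a sketch, assuming an active SparkSession named spark; the table and column names are made up):

// Raises SPECIFY_CLUSTER_BY_WITH_PARTITIONED_BY_IS_NOT_ALLOWED.
spark.sql("CREATE TABLE t (a INT, b STRING) USING parquet CLUSTER BY (a) PARTITIONED BY (b)")

// Raises SPECIFY_CLUSTER_BY_WITH_BUCKETING_IS_NOT_ALLOWED.
spark.sql("CREATE TABLE t (a INT, b STRING) USING parquet CLUSTER BY (a) CLUSTERED BY (b) INTO 4 BUCKETS")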

### SPECIFY_PARTITION_IS_NOT_ALLOWED

[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
@@ -298,6 +298,10 @@ replaceTableHeader
: (CREATE OR)? REPLACE TABLE identifierReference
;

clusterBySpec
: CLUSTER BY LEFT_PAREN multipartIdentifierList RIGHT_PAREN
;

bucketSpec
: CLUSTERED BY identifierList
(SORTED BY orderedIdentifierList)?
@@ -383,6 +387,7 @@ createTableClauses
:((OPTIONS options=expressionPropertyList) |
(PARTITIONED BY partitioning=partitionFieldList) |
skewSpec |
clusterBySpec |
bucketSpec |
rowFormat |
createFileFormat |
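As a rough usage sketch of the new clusterBySpec rule (assuming an active SparkSession named spark; whether a given catalog or provider ultimately accepts the clustering transform is up to that implementation):

spark.sql("""
  CREATE TABLE events (ts TIMESTAMP, user_id BIGINT, payload STRING)
  USING parquet
  CLUSTER BY (user_id, ts)
""")

spark.sql("""
  CREATE OR REPLACE TABLE events (ts TIMESTAMP, user_id BIGINT, payload STRING)
  USING parquet
  CLUSTER BY (user_id)
""")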
@@ -679,4 +679,12 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase {
ctx
)
}

def clusterByWithPartitionedBy(ctx: ParserRuleContext): Throwable = {
new ParseException(errorClass = "SPECIFY_CLUSTER_BY_WITH_PARTITIONED_BY_IS_NOT_ALLOWED", ctx)
}

def clusterByWithBucketing(ctx: ParserRuleContext): Throwable = {
new ParseException(errorClass = "SPECIFY_CLUSTER_BY_WITH_BUCKETING_IS_NOT_ALLOWED", ctx)
}
}
@@ -23,7 +23,7 @@ import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.util.Shell

import org.apache.spark.sql.catalyst.analysis.Resolver
import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BasePredicate, BoundReference, Expression, Predicate}
import org.apache.spark.sql.catalyst.util.CharVarcharUtils
@@ -235,6 +235,18 @@ object CatalogUtils {
BucketSpec(numBuckets, normalizedBucketCols, normalizedSortCols)
}

def normalizeClusterBySpec(
tableName: String,
tableCols: Seq[String],
clusterBySpec: ClusterBySpec,
resolver: Resolver): ClusterBySpec = {
val normalizedClusterByCols = clusterBySpec.columnNames.map { colName =>
normalizeColumnName(tableName, tableCols, colName.name, "cluster by", resolver)
}

ClusterBySpec(normalizedClusterByCols.map(UnresolvedAttribute.quotedString(_)))
}
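
A minimal illustration of what this helper is meant to do, assuming the usual contract of normalizeColumnName (resolve the requested name against the table's columns with the given resolver and return the table's own spelling); the table and column names are made up:

import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.catalog.{CatalogUtils, ClusterBySpec}

val spec = ClusterBySpec(Seq(UnresolvedAttribute.quotedString("ID")))
val normalized = CatalogUtils.normalizeClusterBySpec(
  tableName = "t",
  tableCols = Seq("id", "name"),
  clusterBySpec = spec,
  resolver = caseInsensitiveResolution)
// normalized.columnNames.map(_.name) is expected to be Seq("id")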

/**
* Convert URI to String.
* Since URI.toString does not decode the uri, e.g. change '%25' to '%'.
@@ -31,7 +31,7 @@ import org.json4s.jackson.JsonMethods._
import org.apache.spark.internal.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.{CurrentUserContext, FunctionIdentifier, InternalRow, SQLConfHelper, TableIdentifier}
import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedLeafNode}
import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedAttribute, UnresolvedLeafNode}
import org.apache.spark.sql.catalyst.catalog.CatalogTable.VIEW_STORING_ANALYZED_PLAN
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Cast, ExprId, Literal}
import org.apache.spark.sql.catalyst.plans.logical._
@@ -170,6 +170,23 @@ case class CatalogTablePartition(
}
}

/**
* A container for clustering information.
*
* @param columnNames the names of the columns used for clustering.
*/
case class ClusterBySpec(columnNames: Seq[UnresolvedAttribute]) {
Contributor: It's weird to use UnresolvedAttribute here, as we are not going to resolve it in the analyzer. How about just Seq[Seq[String]] or Seq[FieldReference]?

Author: I went with Seq[NamedReference] to be consistent with the *Transform classes (including ClusterByTransform). Also, some helper functions such as FieldReference.unapply return NamedReference, so it's easier to work with NamedReference.

override def toString: String = columnNames.map(_.name).mkString(",")

lazy val toDDL: String = if (columnNames.nonEmpty) s"CLUSTER BY ($toString)" else ""
}

object ClusterBySpec {
def apply(columns: String): ClusterBySpec = columns match {
Contributor: This looks weird; where do we use it?

Author: This is used for parsing the property value back into a ClusterBySpec. I renamed this to fromProperty.

case "" => ClusterBySpec(Seq.empty[UnresolvedAttribute])
case _ => ClusterBySpec(columns.split(",").map(_.trim).map(UnresolvedAttribute.quotedString))
Contributor: Hmm, what if the column name contains a comma? Shall we use JSON format?

Author: Done.

}
}
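
Following the JSON suggestion above, the round trip through a table property might look roughly like this (a sketch only; the names toProperty/fromProperty and the exact encoding are assumptions, not necessarily the code this PR ends up with):

import org.json4s.{Formats, NoTypeHints}
import org.json4s.jackson.Serialization
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.catalog.ClusterBySpec

object ClusterBySpecPropertySketch {
  private implicit val formats: Formats = Serialization.formats(NoTypeHints)

  // Encode each clustering column as its name parts, so a "," or "." inside a
  // column name cannot break parsing.
  def toProperty(spec: ClusterBySpec): String =
    Serialization.write(spec.columnNames.map(_.nameParts))

  def fromProperty(value: String): ClusterBySpec =
    ClusterBySpec(Serialization.read[Seq[Seq[String]]](value).map(UnresolvedAttribute(_)))
}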

/**
* A container for bucketing information.
@@ -253,7 +270,8 @@ case class CatalogTable(
tracksPartitionsInCatalog: Boolean = false,
schemaPreservesCase: Boolean = true,
ignoredProperties: Map[String, String] = Map.empty,
viewOriginalText: Option[String] = None) {
viewOriginalText: Option[String] = None,
clusterBySpec: Option[ClusterBySpec] = None) {
Author: We haven't added any field to CatalogTable in the last 5 years; should we avoid adding this new field? Alternatively, we could store the clustering columns in a table property and make that property reserved.

Contributor: Since we have to store it in the table properties anyway at the Hive layer, I think it's simpler to just do it ahead here.

Author: Done. I guess there is no need to make it a reserved property like the properties in TableCatalog?


import CatalogTable._

@@ -33,7 +33,7 @@ import org.apache.spark.{SparkArithmeticException, SparkException}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableIdentifier}
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat}
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, ClusterBySpec}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.{AnyValue, First, Last, PercentileCont, PercentileDisc}
import org.apache.spark.sql.catalyst.parser.SqlBaseParser._
@@ -3241,6 +3241,15 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
})
}

/**
* Create a [[ClusterBySpec]].
*/
override def visitClusterBySpec(ctx: ClusterBySpecContext): ClusterBySpec = withOrigin(ctx) {
val columnNames = ctx.multipartIdentifierList.multipartIdentifier.asScala
.map(typedVisit[Seq[String]]).map(UnresolvedAttribute(_)).toSeq
ClusterBySpec(columnNames)
}

/**
* Convert a property list into a key-value map.
* This should be called through [[visitPropertyKeyValues]] or [[visitPropertyKeys]].
@@ -3341,14 +3350,15 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
* - location
* - comment
* - serde
* - clusterBySpec
*
* Note: Partition transforms are based on existing table schema definition. It can be simple
* column names, or functions like `year(date_col)`. Partition columns are column names with data
* types like `i INT`, which should be appended to the existing table schema.
*/
type TableClauses = (
Seq[Transform], Seq[StructField], Option[BucketSpec], Map[String, String],
OptionList, Option[String], Option[String], Option[SerdeInfo])
OptionList, Option[String], Option[String], Option[SerdeInfo], Option[ClusterBySpec])

/**
* Validate a create table statement and return the [[TableIdentifier]].
@@ -3809,6 +3819,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx)
checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx)
checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx)
checkDuplicateClauses(ctx.clusterBySpec(), "CLUSTER BY", ctx)
checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx)

if (ctx.skewSpec.size > 0) {
@@ -3827,8 +3838,9 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
val comment = visitCommentSpecList(ctx.commentSpec())
val serdeInfo =
getSerdeInfo(ctx.rowFormat.asScala.toSeq, ctx.createFileFormat.asScala.toSeq, ctx)
val clusterBySpec = ctx.clusterBySpec().asScala.headOption.map(visitClusterBySpec)
(partTransforms, partCols, bucketSpec, cleanedProperties, cleanedOptions, newLocation, comment,
serdeInfo)
serdeInfo, clusterBySpec)
}

protected def getSerdeInfo(
@@ -3881,6 +3893,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
* [OPTIONS table_property_list]
* [ROW FORMAT row_format]
* [STORED AS file_format]
* [CLUSTER BY (col_name, col_name, ...)]
* [CLUSTERED BY (col_name, col_name, ...)
* [SORTED BY (col_name [ASC|DESC], ...)]
* INTO num_buckets BUCKETS
@@ -3902,7 +3915,16 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
.map(visitCreateOrReplaceTableColTypeList).getOrElse(Nil)
val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText)
val (partTransforms, partCols, bucketSpec, properties, options, location,
comment, serdeInfo) = visitCreateTableClauses(ctx.createTableClauses())
comment, serdeInfo, clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses())

if (clusterBySpec.isDefined) {
if (partCols.nonEmpty || partTransforms.nonEmpty) {
throw QueryParsingErrors.clusterByWithPartitionedBy(ctx)
}
if (bucketSpec.isDefined) {
throw QueryParsingErrors.clusterByWithBucketing(ctx)
}
}

if (provider.isDefined && serdeInfo.isDefined) {
operationNotAllowed(s"CREATE TABLE ... USING ... ${serdeInfo.get.describe}", ctx)
@@ -3915,7 +3937,10 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
}

val partitioning =
partitionExpressions(partTransforms, partCols, ctx) ++ bucketSpec.map(_.asTransform)
partitionExpressions(partTransforms, partCols, ctx) ++
bucketSpec.map(_.asTransform) ++
clusterBySpec.map(_.asTransform)

val tableSpec = UnresolvedTableSpec(properties, provider, options, location, comment,
serdeInfo, external)

@@ -3958,6 +3983,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
* replace_table_clauses (order insensitive):
* [OPTIONS table_property_list]
* [PARTITIONED BY (partition_fields)]
* [CLUSTER BY (col_name, col_name, ...)]
* [CLUSTERED BY (col_name, col_name, ...)
* [SORTED BY (col_name [ASC|DESC], ...)]
* INTO num_buckets BUCKETS
@@ -3973,18 +4000,31 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
*/
override def visitReplaceTable(ctx: ReplaceTableContext): LogicalPlan = withOrigin(ctx) {
val orCreate = ctx.replaceTableHeader().CREATE() != null
val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo) =
visitCreateTableClauses(ctx.createTableClauses())
val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo,
clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses())
val columns = Option(ctx.createOrReplaceTableColTypeList())
.map(visitCreateOrReplaceTableColTypeList).getOrElse(Nil)

if (clusterBySpec.isDefined) {
if (partCols.nonEmpty || partTransforms.nonEmpty) {
throw QueryParsingErrors.clusterByWithPartitionedBy(ctx)
}
if (bucketSpec.isDefined) {
throw QueryParsingErrors.clusterByWithBucketing(ctx)
}
}

val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText)

if (provider.isDefined && serdeInfo.isDefined) {
operationNotAllowed(s"REPLACE TABLE ... USING ... ${serdeInfo.get.describe}", ctx)
}

val partitioning =
partitionExpressions(partTransforms, partCols, ctx) ++ bucketSpec.map(_.asTransform)
partitionExpressions(partTransforms, partCols, ctx) ++
bucketSpec.map(_.asTransform) ++
clusterBySpec.map(_.asTransform)

val tableSpec = UnresolvedTableSpec(properties, provider, options, location, comment,
serdeInfo, external = false)

@@ -19,13 +19,15 @@ package org.apache.spark.sql.connector.catalog

import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, ClusterBySpec}
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.catalyst.util.{quoteIfNeeded, QuotingUtils}
import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, LogicalExpressions, Transform}
import org.apache.spark.sql.connector.expressions.{BucketTransform, ClusterByTransform, FieldReference, IdentityTransform, LogicalExpressions, Transform}
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
import org.apache.spark.sql.types.StructType

@@ -53,10 +55,18 @@ private[sql] object CatalogV2Implicits {
}
}

implicit class ClusterByHelper(spec: ClusterBySpec) {
def asTransform: Transform = {
val references = spec.columnNames.map(col => FieldReference(col.nameParts))
clusterBy(references.toArray)
}
}
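
A small sketch of the round trip these helpers enable (imports shown for clarity; this all lives inside Spark's sql packages, so visibility is not an issue there):

import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.catalog.ClusterBySpec
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
import org.apache.spark.sql.connector.expressions.Transform

val spec = ClusterBySpec(Seq(UnresolvedAttribute(Seq("a", "b")), UnresolvedAttribute(Seq("c"))))
val transform: Transform = spec.asTransform  // cluster_by(a.b, c)
val (identityCols, bucket, clusterBy) = Seq(transform).convertTransforms
// identityCols is empty, bucket is None, and clusterBy carries the same column names.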

implicit class TransformHelper(transforms: Seq[Transform]) {
def convertTransforms: (Seq[String], Option[BucketSpec]) = {
def convertTransforms: (Seq[String], Option[BucketSpec], Option[ClusterBySpec]) = {
val identityCols = new mutable.ArrayBuffer[String]
var bucketSpec = Option.empty[BucketSpec]
var clusterBySpec = Option.empty[ClusterBySpec]

transforms.map {
case IdentityTransform(FieldReference(Seq(col))) =>
Expand All @@ -73,11 +83,24 @@ private[sql] object CatalogV2Implicits {
sortCol.map(_.fieldNames.mkString("."))))
}

case ClusterByTransform(columnNames) =>
if (clusterBySpec.nonEmpty) {
// AstBuilder guarantees that it only passes down one ClusterByTransform.
throw SparkException.internalError("Cannot have multiple cluster by transforms.")
}
clusterBySpec =
Some(ClusterBySpec(columnNames.map(_.fieldNames).map(UnresolvedAttribute(_))))

case transform =>
throw QueryExecutionErrors.unsupportedPartitionTransformError(transform)
}

(identityCols.toSeq, bucketSpec)
// Parser guarantees that partition and clustering cannot co-exist.
assert(!(identityCols.toSeq.nonEmpty && clusterBySpec.nonEmpty))
// Parser guarantees that bucketing and clustering cannot co-exist.
assert(!(bucketSpec.nonEmpty && clusterBySpec.nonEmpty))

(identityCols.toSeq, bucketSpec, clusterBySpec)
}
}

@@ -52,6 +52,9 @@ private[sql] object LogicalExpressions {
sortedCols: Array[NamedReference]): SortedBucketTransform =
SortedBucketTransform(literal(numBuckets, IntegerType), references, sortedCols)

def clusterBy(references: Array[NamedReference]): ClusterByTransform =
ClusterByTransform(references)

def identity(reference: NamedReference): IdentityTransform = IdentityTransform(reference)

def years(reference: NamedReference): YearsTransform = YearsTransform(reference)
@@ -150,6 +153,41 @@ private[sql] object BucketTransform {
}
}

/**
* This class represents a transform for [[ClusterBySpec]]. This is used to bundle
* ClusterBySpec in CreateTable's partitioning transforms to pass it down to analyzer/delta.
*/
final case class ClusterByTransform(
columnNames: Seq[NamedReference]) extends RewritableTransform {

override val name: String = "cluster_by"

override def references: Array[NamedReference] = {
arguments.collect { case named: NamedReference => named }
Contributor: It's columnNames: Seq[NamedReference], so we can just return columnNames here.

Author: +1

}

override def arguments: Array[Expression] = columnNames.toArray

override def toString: String = s"$name(${arguments.map(_.describe).mkString(", ")})"

override def withReferences(newReferences: Seq[NamedReference]): Transform = {
this.copy(columnNames = newReferences)
}
}

/**
* Convenience extractor for ClusterByTransform.
*/
object ClusterByTransform {
def unapply(transform: Transform): Option[Seq[NamedReference]] =
transform match {
case NamedTransform("cluster_by", arguments) =>
Some(arguments.map(_.asInstanceOf[NamedReference]))
case _ =>
None
}
}
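
For instance, the extractor lets downstream code pick the clustering columns out of a table's transforms (a small sketch):

import org.apache.spark.sql.connector.expressions.{ClusterByTransform, Transform}

def clusteringColumns(partitioning: Seq[Transform]): Seq[String] =
  partitioning.collect {
    case ClusterByTransform(cols) => cols.map(_.fieldNames.mkString("."))
  }.flatten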

private[sql] final case class SortedBucketTransform(
numBuckets: Literal[Int],
columns: Seq[NamedReference],