zedtang commented on code in PR #47301: URL: https://github.com/apache/spark/pull/47301#discussion_r1687483410
########## connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala:
##########
@@ -274,6 +283,18 @@ trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] {
    */
   def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T]
 
+  /**
+   * Clusters the output by the given columns on the file system. The rows with matching values in
+   * the specified clustering columns will be consolidated within the same file.

Review Comment:
   Oops, updated here and below



########## sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala:
##########
@@ -330,6 +350,18 @@ trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] {
    */
   def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T]
 
+  /**
+   * Clusters the output by the given columns on the storage. The rows with matching values in
+   * the specified clustering columns will be consolidated within the same group.
+   *
+   * For instance, if you cluster a dataset by date, the data sharing the same date will be stored
+   * together in a file. This arrangement improves query efficiency when you apply selective
+   * filters to these clustering columns, thanks to data skipping.
+   *
+   * @since 4.0.0
+   */
+  def clusterBy(colName: String, colNames: String*): CreateTableWriter[T]

Review Comment:
   added, also for partitionedBy



########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala:
##########
@@ -209,10 +221,26 @@ object ClusterBySpec {
     normalizeClusterBySpec(schema, clusterBySpec, resolver).toJson
   }
 
+  /**
+   * Converts a ClusterBySpec to a map of table properties used to store the clustering

Review Comment:
   No preference here, this is just a bit more friendly for the call site.



########## connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala:
##########
@@ -274,6 +283,18 @@ trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] {
    */
   def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T]
 
+  /**
+   * Clusters the output by the given columns on the file system. The rows with matching values in
+   * the specified clustering columns will be consolidated within the same file.
+   *
+   * For instance, if you cluster a dataset by date, the data sharing the same date will be stored
+   * together in a file. This arrangement improves query efficiency when you apply selective
+   * filters to these clustering columns, thanks to data skipping.
+   *
+   * @since 4.0.0
+   */
+  def clusterBy(colName: String, colNames: String*): CreateTableWriter[T]

Review Comment:
   done
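For context, here is a minimal usage sketch of the `clusterBy` API discussed in the hunks above. The catalog/table name and sample data are illustrative only; the call shape follows the `CreateTableWriter[T]` signature from the diff, combined with the existing `writeTo`, `using`, and `create` methods of `DataFrameWriterV2`:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative only: assumes a session with a V2 catalog configured so that
// writeTo(...).create() can create the target table.
val spark = SparkSession.builder().appName("clusterBy-sketch").getOrCreate()
import spark.implicits._

val events = Seq(
  ("2024-07-01", "click"),
  ("2024-07-01", "view"),
  ("2024-07-02", "click")
).toDF("date", "event_type")

// Rows sharing the same `date` value are consolidated within the same
// file/group, so selective filters on `date` can benefit from data skipping.
events.writeTo("catalog.db.events")
  .using("parquet")
  .clusterBy("date")
  .create()
```

Unlike `partitionedBy`, which lays data out as one partition per value, clustering consolidates matching rows within files, as the Scaladoc above describes.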
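Since the `interface.scala` hunk is truncated above, here is a rough sketch of the idea behind returning a map of table properties rather than a raw JSON string. All names here are hypothetical stand-ins, not Spark's actual internals (the real `ClusterBySpec` holds `NamedReference` columns and serializes via the `toJson` call shown in the hunk):

```scala
// Hypothetical sketch: ClusterBySpecLike and the property key are made up for
// illustration only.
final case class ClusterBySpecLike(columnNames: Seq[Seq[String]]) {
  // Serialize multi-part column names, e.g. Seq(Seq("a", "b")) -> [["a","b"]]
  def toJson: String =
    columnNames
      .map(_.map(part => "\"" + part + "\"").mkString("[", ",", "]"))
      .mkString("[", ",", "]")
}

object ClusterBySpecLike {
  // Hypothetical property key under which the clustering columns are stored.
  val ClusteringColumnsKey = "clusteringColumns"

  // Wrapping the JSON in a Map keeps the encoding private to the spec object.
  def toProperties(spec: ClusterBySpecLike): Map[String, String] =
    Map(ClusteringColumnsKey -> spec.toJson)
}
```

Returning a `Map` lets a call site simply merge the result into the table's existing properties (e.g. `props ++ ClusterBySpecLike.toProperties(spec)`), which matches the "more friendly for the call site" rationale in the review comment above.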