This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 519b582ca0e [SPARK-45679][CONNECT] Add proto for clusterBy
519b582ca0e is described below
commit 519b582ca0e82f5766dcb92e2ea66be27cb76f9c
Author: Zhen Li <[email protected]>
AuthorDate: Fri Oct 27 10:47:17 2023 +0900
[SPARK-45679][CONNECT] Add proto for clusterBy
### What changes were proposed in this pull request?
This PR adds the proto change needed for clusterBy.
Example of the clusterBy API:
DataFrameWriterV1
```
df.write
.format(...)
.clusterBy("clusteringColumn1", "clusteringColumn2")
.save(...) or saveAsTable(...)
```
DataFrameWriterV2
```
df.writeTo(...).using(...)
.clusterBy("clusteringColumn1", "clusteringColumn2")
.create() or replace() or createOrReplace()
```
### Why are the changes needed?
This is the first PR to introduce the proto change for the feature.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #43544 from zhenlineo/clusterby.
Authored-by: Zhen Li <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../src/main/protobuf/spark/connect/commands.proto | 6 +
python/pyspark/sql/connect/proto/commands_pb2.py | 144 ++++++++++-----------
python/pyspark/sql/connect/proto/commands_pb2.pyi | 18 +++
3 files changed, 96 insertions(+), 72 deletions(-)
diff --git
a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
index 89a82c2c7e8..d120fb7b530 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
@@ -127,6 +127,9 @@ message WriteOperation {
// (Optional) A list of configuration options.
map<string, string> options = 9;
+ // (Optional) Columns used for clustering the table.
+ repeated string clustering_columns = 10;
+
message SaveTable {
// (Required) The table name.
string table_name = 1;
@@ -191,6 +194,9 @@ message WriteOperationV2 {
// (Optional) A condition for overwrite saving mode
Expression overwrite_condition = 8;
+
+ // (Optional) Columns used for clustering the table.
+ repeated string clustering_columns = 9;
}
// Starts write stream operation as streaming query. Query ID and Run ID of
the streaming
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py
b/python/pyspark/sql/connect/proto/commands_pb2.py
index a1cbea09b3c..309d34852d2 100644
--- a/python/pyspark/sql/connect/proto/commands_pb2.py
+++ b/python/pyspark/sql/connect/proto/commands_pb2.py
@@ -35,7 +35,7 @@ from pyspark.sql.connect.proto import relations_pb2 as
spark_dot_connect_dot_rel
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xf5\x07\n\x07\x43ommand\x12]\n\x11register_function\x18\x01
\x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02
\x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x
[...]
+
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xf5\x07\n\x07\x43ommand\x12]\n\x11register_function\x18\x01
\x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02
\x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x
[...]
)
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -74,75 +74,75 @@ if _descriptor._USE_C_DESCRIPTORS == False:
_CREATEDATAFRAMEVIEWCOMMAND._serialized_start = 1681
_CREATEDATAFRAMEVIEWCOMMAND._serialized_end = 1831
_WRITEOPERATION._serialized_start = 1834
- _WRITEOPERATION._serialized_end = 2885
- _WRITEOPERATION_OPTIONSENTRY._serialized_start = 2309
- _WRITEOPERATION_OPTIONSENTRY._serialized_end = 2367
- _WRITEOPERATION_SAVETABLE._serialized_start = 2370
- _WRITEOPERATION_SAVETABLE._serialized_end = 2628
- _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_start = 2504
- _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_end = 2628
- _WRITEOPERATION_BUCKETBY._serialized_start = 2630
- _WRITEOPERATION_BUCKETBY._serialized_end = 2721
- _WRITEOPERATION_SAVEMODE._serialized_start = 2724
- _WRITEOPERATION_SAVEMODE._serialized_end = 2861
- _WRITEOPERATIONV2._serialized_start = 2888
- _WRITEOPERATIONV2._serialized_end = 3701
- _WRITEOPERATIONV2_OPTIONSENTRY._serialized_start = 2309
- _WRITEOPERATIONV2_OPTIONSENTRY._serialized_end = 2367
- _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_start = 3460
- _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_end = 3526
- _WRITEOPERATIONV2_MODE._serialized_start = 3529
- _WRITEOPERATIONV2_MODE._serialized_end = 3688
- _WRITESTREAMOPERATIONSTART._serialized_start = 3704
- _WRITESTREAMOPERATIONSTART._serialized_end = 4504
- _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_start = 2309
- _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_end = 2367
- _STREAMINGFOREACHFUNCTION._serialized_start = 4507
- _STREAMINGFOREACHFUNCTION._serialized_end = 4686
- _WRITESTREAMOPERATIONSTARTRESULT._serialized_start = 4688
- _WRITESTREAMOPERATIONSTARTRESULT._serialized_end = 4809
- _STREAMINGQUERYINSTANCEID._serialized_start = 4811
- _STREAMINGQUERYINSTANCEID._serialized_end = 4876
- _STREAMINGQUERYCOMMAND._serialized_start = 4879
- _STREAMINGQUERYCOMMAND._serialized_end = 5511
- _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 5378
- _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 5422
- _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 5424
- _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 5500
- _STREAMINGQUERYCOMMANDRESULT._serialized_start = 5514
- _STREAMINGQUERYCOMMANDRESULT._serialized_end = 6655
- _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 6097
- _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 6267
- _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 6269
- _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 6341
- _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 6343
- _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 6382
- _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 6385
- _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 6582
- _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start =
6584
- _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 6640
- _STREAMINGQUERYMANAGERCOMMAND._serialized_start = 6658
- _STREAMINGQUERYMANAGERCOMMAND._serialized_end = 7487
- _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_start
= 7189
- _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_end =
7268
-
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_start =
7271
-
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_end =
7476
- _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_start = 7490
- _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_end = 8566
- _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_start = 8098
- _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_end = 8225
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_start =
8227
- _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_end
= 8342
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_start
= 8344
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_end =
8403
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_start
= 8405
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_end
= 8480
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_start
= 8482
-
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_end
= 8551
- _GETRESOURCESCOMMAND._serialized_start = 8568
- _GETRESOURCESCOMMAND._serialized_end = 8589
- _GETRESOURCESCOMMANDRESULT._serialized_start = 8592
- _GETRESOURCESCOMMANDRESULT._serialized_end = 8804
- _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 8708
- _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 8804
+ _WRITEOPERATION._serialized_end = 2932
+ _WRITEOPERATION_OPTIONSENTRY._serialized_start = 2356
+ _WRITEOPERATION_OPTIONSENTRY._serialized_end = 2414
+ _WRITEOPERATION_SAVETABLE._serialized_start = 2417
+ _WRITEOPERATION_SAVETABLE._serialized_end = 2675
+ _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_start = 2551
+ _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_end = 2675
+ _WRITEOPERATION_BUCKETBY._serialized_start = 2677
+ _WRITEOPERATION_BUCKETBY._serialized_end = 2768
+ _WRITEOPERATION_SAVEMODE._serialized_start = 2771
+ _WRITEOPERATION_SAVEMODE._serialized_end = 2908
+ _WRITEOPERATIONV2._serialized_start = 2935
+ _WRITEOPERATIONV2._serialized_end = 3795
+ _WRITEOPERATIONV2_OPTIONSENTRY._serialized_start = 2356
+ _WRITEOPERATIONV2_OPTIONSENTRY._serialized_end = 2414
+ _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_start = 3554
+ _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_end = 3620
+ _WRITEOPERATIONV2_MODE._serialized_start = 3623
+ _WRITEOPERATIONV2_MODE._serialized_end = 3782
+ _WRITESTREAMOPERATIONSTART._serialized_start = 3798
+ _WRITESTREAMOPERATIONSTART._serialized_end = 4598
+ _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_start = 2356
+ _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_end = 2414
+ _STREAMINGFOREACHFUNCTION._serialized_start = 4601
+ _STREAMINGFOREACHFUNCTION._serialized_end = 4780
+ _WRITESTREAMOPERATIONSTARTRESULT._serialized_start = 4782
+ _WRITESTREAMOPERATIONSTARTRESULT._serialized_end = 4903
+ _STREAMINGQUERYINSTANCEID._serialized_start = 4905
+ _STREAMINGQUERYINSTANCEID._serialized_end = 4970
+ _STREAMINGQUERYCOMMAND._serialized_start = 4973
+ _STREAMINGQUERYCOMMAND._serialized_end = 5605
+ _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 5472
+ _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 5516
+ _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 5518
+ _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 5594
+ _STREAMINGQUERYCOMMANDRESULT._serialized_start = 5608
+ _STREAMINGQUERYCOMMANDRESULT._serialized_end = 6749
+ _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 6191
+ _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 6361
+ _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 6363
+ _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 6435
+ _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 6437
+ _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 6476
+ _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 6479
+ _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 6676
+ _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start =
6678
+ _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 6734
+ _STREAMINGQUERYMANAGERCOMMAND._serialized_start = 6752
+ _STREAMINGQUERYMANAGERCOMMAND._serialized_end = 7581
+ _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_start
= 7283
+ _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_end =
7362
+
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_start =
7365
+
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_end =
7570
+ _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_start = 7584
+ _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_end = 8660
+ _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_start = 8192
+ _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_end = 8319
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_start =
8321
+ _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_end
= 8436
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_start
= 8438
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_end =
8497
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_start
= 8499
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_end
= 8574
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_start
= 8576
+
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_end
= 8645
+ _GETRESOURCESCOMMAND._serialized_start = 8662
+ _GETRESOURCESCOMMAND._serialized_end = 8683
+ _GETRESOURCESCOMMANDRESULT._serialized_start = 8686
+ _GETRESOURCESCOMMANDRESULT._serialized_end = 8898
+ _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 8802
+ _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 8898
# @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi
b/python/pyspark/sql/connect/proto/commands_pb2.pyi
index 7855f403e9c..ce7f3e5abfd 100644
--- a/python/pyspark/sql/connect/proto/commands_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi
@@ -489,6 +489,7 @@ class WriteOperation(google.protobuf.message.Message):
PARTITIONING_COLUMNS_FIELD_NUMBER: builtins.int
BUCKET_BY_FIELD_NUMBER: builtins.int
OPTIONS_FIELD_NUMBER: builtins.int
+ CLUSTERING_COLUMNS_FIELD_NUMBER: builtins.int
@property
def input(self) -> pyspark.sql.connect.proto.relations_pb2.Relation:
"""(Required) The output of the `input` relation will be persisted
according to the options."""
@@ -517,6 +518,11 @@ class WriteOperation(google.protobuf.message.Message):
@property
def options(self) ->
google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]:
"""(Optional) A list of configuration options."""
+ @property
+ def clustering_columns(
+ self,
+ ) ->
google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+ """(Optional) Columns used for clustering the table."""
def __init__(
self,
*,
@@ -529,6 +535,7 @@ class WriteOperation(google.protobuf.message.Message):
partitioning_columns: collections.abc.Iterable[builtins.str] | None =
...,
bucket_by: global___WriteOperation.BucketBy | None = ...,
options: collections.abc.Mapping[builtins.str, builtins.str] | None =
...,
+ clustering_columns: collections.abc.Iterable[builtins.str] | None =
...,
) -> None: ...
def HasField(
self,
@@ -556,6 +563,8 @@ class WriteOperation(google.protobuf.message.Message):
b"_source",
"bucket_by",
b"bucket_by",
+ "clustering_columns",
+ b"clustering_columns",
"input",
b"input",
"mode",
@@ -662,6 +671,7 @@ class WriteOperationV2(google.protobuf.message.Message):
TABLE_PROPERTIES_FIELD_NUMBER: builtins.int
MODE_FIELD_NUMBER: builtins.int
OVERWRITE_CONDITION_FIELD_NUMBER: builtins.int
+ CLUSTERING_COLUMNS_FIELD_NUMBER: builtins.int
@property
def input(self) -> pyspark.sql.connect.proto.relations_pb2.Relation:
"""(Required) The output of the `input` relation will be persisted
according to the options."""
@@ -693,6 +703,11 @@ class WriteOperationV2(google.protobuf.message.Message):
@property
def overwrite_condition(self) ->
pyspark.sql.connect.proto.expressions_pb2.Expression:
"""(Optional) A condition for overwrite saving mode"""
+ @property
+ def clustering_columns(
+ self,
+ ) ->
google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+ """(Optional) Columns used for clustering the table."""
def __init__(
self,
*,
@@ -707,6 +722,7 @@ class WriteOperationV2(google.protobuf.message.Message):
table_properties: collections.abc.Mapping[builtins.str, builtins.str]
| None = ...,
mode: global___WriteOperationV2.Mode.ValueType = ...,
overwrite_condition:
pyspark.sql.connect.proto.expressions_pb2.Expression | None = ...,
+ clustering_columns: collections.abc.Iterable[builtins.str] | None =
...,
) -> None: ...
def HasField(
self,
@@ -726,6 +742,8 @@ class WriteOperationV2(google.protobuf.message.Message):
field_name: typing_extensions.Literal[
"_provider",
b"_provider",
+ "clustering_columns",
+ b"clustering_columns",
"input",
b"input",
"mode",
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]