This is an automated email from the ASF dual-hosted git repository.
chengpan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kyuubi-shaded.git
The following commit(s) were added to refs/heads/master by this push:
new c58c858 [KYUUBI-SHADED #49] Create relocated package for spark-connect-rpc
c58c858 is described below
commit c58c858b358b4fb90b06fff3b8147d2592d48c23
Author: Cheng Pan <[email protected]>
AuthorDate: Mon Aug 19 15:36:04 2024 +0800
[KYUUBI-SHADED #49] Create relocated package for spark-connect-rpc
### _Why are the changes needed?_
This PR introduces a new module, `kyuubi-relocated-spark-connect-rpc`, which forks the [Spark Connect proto
files](https://github.com/apache/spark/tree/5b2d2149b615acdd8730547a1f24c2b637222545/sql/connect/common/src/main/protobuf)
and shifts the Java package name from `org.apache.spark.connect.proto` to
`org.apache.kyuubi.shaded.spark.connect.proto`. This is a precondition for Kyuubi to support Spark Connect.
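For illustration only, a minimal sketch of what downstream code looks like once the relocation is in place. The class and setter names are the ones protoc derives from these proto files under the relocated `java_package`; the user values are placeholders:

```java
// Client code compiles against the relocated package only, never org.apache.spark.connect.proto.
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

public class RelocationExample {
  public static void main(String[] args) {
    UserContext ctx = UserContext.newBuilder()
        .setUserId("kyuubi")       // placeholder user id
        .setUserName("kyuubi")     // placeholder user name
        .build();
    System.out.println(ctx);
  }
}
```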
### _How was this patch tested?_
- [ ] Add some test cases that check the changes thoroughly, including negative and positive cases if possible
- [ ] Add screenshots for manual tests if appropriate
- [ ] [Run test](https://kyuubi.readthedocs.io/en/master/develop_tools/testing.html#running-tests) locally before making a pull request
Closes #49 from pan3793/spark-connect-rpc.
316d46f [Cheng Pan] fi
914e1ba [Cheng Pan] Add kyuubi-relocated-spark-connect-rpc
Authored-by: Cheng Pan <[email protected]>
Signed-off-by: Cheng Pan <[email protected]>
---
kyuubi-relocated-spark-connect-rpc/pom.xml | 110 ++
.../src/main/protobuf/README.md | 8 +
.../src/main/protobuf/spark/connect/base.proto | 1103 +++++++++++++++++++
.../src/main/protobuf/spark/connect/catalog.proto | 243 +++++
.../src/main/protobuf/spark/connect/commands.proto | 533 ++++++++++
.../src/main/protobuf/spark/connect/common.proto | 98 ++
.../protobuf/spark/connect/example_plugins.proto | 42 +
.../main/protobuf/spark/connect/expressions.proto | 453 ++++++++
.../main/protobuf/spark/connect/relations.proto | 1120 ++++++++++++++++++++
.../src/main/protobuf/spark/connect/types.proto | 201 ++++
.../src/main/resources/META-INF/NOTICE | 8 +
pom.xml | 9 +
12 files changed, 3928 insertions(+)
diff --git a/kyuubi-relocated-spark-connect-rpc/pom.xml b/kyuubi-relocated-spark-connect-rpc/pom.xml
new file mode 100644
index 0000000..9ade5f4
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/pom.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.kyuubi</groupId>
+ <artifactId>kyuubi-relocated-parent</artifactId>
+ <version>0.5.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>kyuubi-relocated-spark-connect-rpc</artifactId>
+  <description>Relocated Spark Connect RPC classes used by Kyuubi internally.</description>
+
+ <properties>
+ <grpc.version>1.65.1</grpc.version>
+ <protobuf.version>3.25.1</protobuf.version>
+ </properties>
+
+ <dependencyManagement>
+ <dependencies>
+ <dependency>
+ <groupId>io.grpc</groupId>
+ <artifactId>grpc-bom</artifactId>
+ <version>${grpc.version}</version>
+ <type>pom</type>
+ <scope>import</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.protobuf</groupId>
+ <artifactId>protobuf-bom</artifactId>
+ <version>${protobuf.version}</version>
+ <type>pom</type>
+ <scope>import</scope>
+ </dependency>
+ </dependencies>
+ </dependencyManagement>
+
+ <dependencies>
+ <dependency>
+ <groupId>io.grpc</groupId>
+ <artifactId>grpc-protobuf</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>io.grpc</groupId>
+ <artifactId>grpc-stub</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.protobuf</groupId>
+ <artifactId>protobuf-java</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+      <!-- Add protobuf-maven-plugin and use protoc-gen-grpc-java as the code generation plugin -->
+ <plugin>
+ <groupId>org.xolstice.maven.plugins</groupId>
+ <artifactId>protobuf-maven-plugin</artifactId>
+ <version>0.6.1</version>
+ <configuration>
+          <protocArtifact>com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier}</protocArtifact>
+          <pluginId>grpc-java</pluginId>
+          <pluginArtifact>io.grpc:protoc-gen-grpc-java:${grpc.version}:exe:${os.detected.classifier}</pluginArtifact>
+ <protoSourceRoot>src/main/protobuf</protoSourceRoot>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>compile</goal>
+ <goal>compile-custom</goal>
+ <goal>test-compile</goal>
+ </goals>
+ <configuration>
+ <pluginParameter>@generated=omit</pluginParameter>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <!-- Used to resolve variables in the 'version' tag -->
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>flatten-maven-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
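The plugin configuration above runs protoc together with protoc-gen-grpc-java, so in addition to the message classes a `SparkConnectServiceGrpc` stub class is generated under the relocated package. A hedged sketch of bootstrapping a client with grpc-java; the host, port, and plaintext transport are assumptions of this sketch, not part of this commit:

```java
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import org.apache.kyuubi.shaded.spark.connect.proto.SparkConnectServiceGrpc;

public class ConnectClientBootstrap {
  public static void main(String[] args) {
    // 15002 is Spark Connect's conventional default port; adjust for your deployment.
    ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", 15002)
        .usePlaintext()
        .build();

    // Blocking stub for unary and server-streaming RPCs (AnalyzePlan, ExecutePlan, Config, ...).
    SparkConnectServiceGrpc.SparkConnectServiceBlockingStub blockingStub =
        SparkConnectServiceGrpc.newBlockingStub(channel);

    // Async stub is needed for client-streaming RPCs such as AddArtifacts.
    SparkConnectServiceGrpc.SparkConnectServiceStub asyncStub =
        SparkConnectServiceGrpc.newStub(channel);

    channel.shutdown();
  }
}
```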
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md
new file mode 100644
index 0000000..360e547
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md
@@ -0,0 +1,8 @@
+proto files are copied from
+https://github.com/apache/spark/tree/5b2d2149b615acdd8730547a1f24c2b637222545/sql/connect/common/src/main/protobuf
+
+and with one additional change applied to each proto file:
+```patch
+- option java_package = "org.apache.spark.connect.proto"
++ option java_package = "org.apache.kyuubi.shaded.spark.connect.proto"
+```
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto
new file mode 100644
index 0000000..0c3498a
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto
@@ -0,0 +1,1103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "google/protobuf/any.proto";
+import "spark/connect/commands.proto";
+import "spark/connect/common.proto";
+import "spark/connect/expressions.proto";
+import "spark/connect/relations.proto";
+import "spark/connect/types.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// A [[Plan]] is the structure that carries the runtime information for the execution from the
+// client to the server. A [[Plan]] can either be of the type [[Relation]] which is a reference
+// to the underlying logical plan or it can be of the [[Command]] type that is used to execute
+// commands on the server.
+message Plan {
+ oneof op_type {
+ Relation root = 1;
+ Command command = 2;
+ }
+}
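As the comment above describes, a `Plan` is just a oneof over a `Relation` or a `Command`. A small sketch of wrapping an already-built relation or command; the `relation` and `command` values are assumed to come from elsewhere:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.Command;
import org.apache.kyuubi.shaded.spark.connect.proto.Plan;
import org.apache.kyuubi.shaded.spark.connect.proto.Relation;

final class Plans {
  // A query plan: the root relation is what the server executes and returns results for.
  static Plan ofRelation(Relation relation) {
    return Plan.newBuilder().setRoot(relation).build();
  }

  // A command plan: executed for its side effects rather than a relational result.
  static Plan ofCommand(Command command) {
    return Plan.newBuilder().setCommand(command).build();
  }
}
```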
+
+
+
+// User Context is used to refer to one particular user session that is executing
+// queries in the backend.
+message UserContext {
+ string user_id = 1;
+ string user_name = 2;
+
+  // To extend the existing user context message that is used to identify incoming requests,
+  // Spark Connect leverages the Any protobuf type that can be used to inject arbitrary other
+  // messages into this message. Extensions are stored as a `repeated` type to be able to
+  // handle multiple active extensions.
+ repeated google.protobuf.Any extensions = 999;
+}
+
+// Request to perform plan analyze, optionally to explain the plan.
+message AnalyzePlanRequest {
+ // (Required)
+ //
+  // The session_id specifies a spark session for a user id (which is specified
+  // by user_context.user_id). The session_id is set by the client to be able to
+  // collate streaming responses from different queries within the dedicated session.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 17;
+
+ // (Required) User context
+ UserContext user_context = 2;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 3;
+
+ oneof analyze {
+ Schema schema = 4;
+ Explain explain = 5;
+ TreeString tree_string = 6;
+ IsLocal is_local = 7;
+ IsStreaming is_streaming = 8;
+ InputFiles input_files = 9;
+ SparkVersion spark_version = 10;
+ DDLParse ddl_parse = 11;
+ SameSemantics same_semantics = 12;
+ SemanticHash semantic_hash = 13;
+ Persist persist = 14;
+ Unpersist unpersist = 15;
+ GetStorageLevel get_storage_level = 16;
+ }
+
+ message Schema {
+ // (Required) The logical plan to be analyzed.
+ Plan plan = 1;
+ }
+
+ // Explains the input plan based on a configurable mode.
+ message Explain {
+ // (Required) The logical plan to be analyzed.
+ Plan plan = 1;
+
+    // (Required) For analyzePlan rpc calls, configure the mode to explain plan in strings.
+    ExplainMode explain_mode = 2;
+
+ // Plan explanation mode.
+ enum ExplainMode {
+ EXPLAIN_MODE_UNSPECIFIED = 0;
+
+ // Generates only physical plan.
+ EXPLAIN_MODE_SIMPLE = 1;
+
+      // Generates the parsed logical plan, analyzed logical plan, optimized logical plan and physical plan.
+      // The parsed logical plan is an unresolved plan extracted from the query. The analyzed logical plan
+      // resolves unresolvedAttribute and unresolvedRelation into fully typed objects.
+      // The optimized logical plan is transformed through a set of optimization rules, resulting in the
+      // physical plan.
+      EXPLAIN_MODE_EXTENDED = 2;
+
+      // Generates code for the statement, if any, and a physical plan.
+      EXPLAIN_MODE_CODEGEN = 3;
+
+      // If plan node statistics are available, generates a logical plan and also the statistics.
+ EXPLAIN_MODE_COST = 4;
+
+ // Generates a physical plan outline and also node details.
+ EXPLAIN_MODE_FORMATTED = 5;
+ }
+ }
+
+ message TreeString {
+ // (Required) The logical plan to be analyzed.
+ Plan plan = 1;
+
+ // (Optional) Max level of the schema.
+ optional int32 level = 2;
+ }
+
+ message IsLocal {
+ // (Required) The logical plan to be analyzed.
+ Plan plan = 1;
+ }
+
+ message IsStreaming {
+ // (Required) The logical plan to be analyzed.
+ Plan plan = 1;
+ }
+
+ message InputFiles {
+ // (Required) The logical plan to be analyzed.
+ Plan plan = 1;
+ }
+
+ message SparkVersion { }
+
+ message DDLParse {
+ // (Required) The DDL formatted string to be parsed.
+ string ddl_string = 1;
+ }
+
+
+  // Returns `true` when the logical query plans are equal and therefore return the same results.
+ message SameSemantics {
+ // (Required) The plan to be compared.
+ Plan target_plan = 1;
+
+ // (Required) The other plan to be compared.
+ Plan other_plan = 2;
+ }
+
+ message SemanticHash {
+ // (Required) The logical plan to get a hashCode.
+ Plan plan = 1;
+ }
+
+ message Persist {
+ // (Required) The logical plan to persist.
+ Relation relation = 1;
+
+ // (Optional) The storage level.
+ optional StorageLevel storage_level = 2;
+ }
+
+ message Unpersist {
+ // (Required) The logical plan to unpersist.
+ Relation relation = 1;
+
+ // (Optional) Whether to block until all blocks are deleted.
+ optional bool blocking = 2;
+ }
+
+ message GetStorageLevel {
+ // (Required) The logical plan to get the storage level.
+ Relation relation = 1;
+ }
+}
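For illustration, a hedged sketch of building one of these analyze requests, here asking for an extended explain of a plan. The session id and user context values are placeholders:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.AnalyzePlanRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.Plan;
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

final class AnalyzeRequests {
  static AnalyzePlanRequest explainExtended(String sessionId, UserContext userContext, Plan plan) {
    return AnalyzePlanRequest.newBuilder()
        .setSessionId(sessionId)            // client-generated UUID string
        .setUserContext(userContext)
        .setExplain(AnalyzePlanRequest.Explain.newBuilder()
            .setPlan(plan)
            .setExplainMode(AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED))
        .build();
  }
}
```

The response arrives as `AnalyzePlanResponse.Explain` with the explain output rendered as a string.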
+
+// Response to performing analysis of the query. Contains relevant metadata to be able to
+// reason about the performance.
+// Next ID: 16
+message AnalyzePlanResponse {
+ string session_id = 1;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 15;
+
+ oneof result {
+ Schema schema = 2;
+ Explain explain = 3;
+ TreeString tree_string = 4;
+ IsLocal is_local = 5;
+ IsStreaming is_streaming = 6;
+ InputFiles input_files = 7;
+ SparkVersion spark_version = 8;
+ DDLParse ddl_parse = 9;
+ SameSemantics same_semantics = 10;
+ SemanticHash semantic_hash = 11;
+ Persist persist = 12;
+ Unpersist unpersist = 13;
+ GetStorageLevel get_storage_level = 14;
+ }
+
+ message Schema {
+ DataType schema = 1;
+ }
+
+ message Explain {
+ string explain_string = 1;
+ }
+
+ message TreeString {
+ string tree_string = 1;
+ }
+
+ message IsLocal {
+ bool is_local = 1;
+ }
+
+ message IsStreaming {
+ bool is_streaming = 1;
+ }
+
+ message InputFiles {
+ // A best-effort snapshot of the files that compose this Dataset
+ repeated string files = 1;
+ }
+
+ message SparkVersion {
+ string version = 1;
+ }
+
+ message DDLParse {
+ DataType parsed = 1;
+ }
+
+ message SameSemantics {
+ bool result = 1;
+ }
+
+ message SemanticHash {
+ int32 result = 1;
+ }
+
+ message Persist { }
+
+ message Unpersist { }
+
+ message GetStorageLevel {
+ // (Required) The StorageLevel as a result of get_storage_level request.
+ StorageLevel storage_level = 1;
+ }
+}
+
+// A request to be executed by the service.
+message ExecutePlanRequest {
+ // (Required)
+ //
+  // The session_id specifies a spark session for a user id (which is specified
+  // by user_context.user_id). The session_id is set by the client to be able to
+  // collate streaming responses from different queries within the dedicated session.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 8;
+
+ // (Required) User context
+ //
+  // user_context.user_id and session_id both identify a unique remote spark session on the
+  // server side.
+ UserContext user_context = 2;
+
+ // (Optional)
+  // Provide an id for this request. If not provided, it will be generated by the server.
+  // It is returned in every ExecutePlanResponse.operation_id of the ExecutePlan response stream.
+  // The id must be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  optional string operation_id = 6;
+
+ // (Required) The logical plan to be executed / analyzed.
+ Plan plan = 3;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 4;
+
+ // Repeated element for options that can be passed to the request. This
element is currently
+ // unused but allows to pass in an extension value used for arbitrary
options.
+ repeated RequestOption request_options = 5;
+
+ message RequestOption {
+ oneof request_option {
+ ReattachOptions reattach_options = 1;
+ // Extension type for request options
+ google.protobuf.Any extension = 999;
+ }
+ }
+
+ // Tags to tag the given execution with.
+ // Tags cannot contain ',' character and cannot be empty strings.
+ // Used by Interrupt with interrupt.tag.
+ repeated string tags = 7;
+}
+
+// The response of a query, can be one or more for each request. Responses
belonging to the
+// same input query, carry the same `session_id`.
+// Next ID: 17
+message ExecutePlanResponse {
+ string session_id = 1;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 15;
+
+  // Identifies the ExecutePlan execution.
+  // If set by the client in ExecutePlanRequest.operationId, that value is returned.
+  // Otherwise generated by the server.
+  // It is a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string operation_id = 12;
+
+  // Identifies the response in the stream.
+  // The id is a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string response_id = 13;
+
+ // Union type for the different response messages.
+ oneof response_type {
+ ArrowBatch arrow_batch = 2;
+
+ // Special case for executing SQL commands.
+ SqlCommandResult sql_command_result = 5;
+
+ // Response for a streaming query.
+ WriteStreamOperationStartResult write_stream_operation_start_result = 8;
+
+ // Response for commands on a streaming query.
+ StreamingQueryCommandResult streaming_query_command_result = 9;
+
+ // Response for 'SparkContext.resources'.
+ GetResourcesCommandResult get_resources_command_result = 10;
+
+ // Response for commands on the streaming query manager.
+ StreamingQueryManagerCommandResult streaming_query_manager_command_result
= 11;
+
+ // Response for commands on the client side streaming query listener.
+ StreamingQueryListenerEventsResult streaming_query_listener_events_result
= 16;
+
+ // Response type informing if the stream is complete in reattachable
execution.
+ ResultComplete result_complete = 14;
+
+ // Response for command that creates ResourceProfile.
+ CreateResourceProfileCommandResult create_resource_profile_command_result
= 17;
+
+ // (Optional) Intermediate query progress reports.
+ ExecutionProgress execution_progress = 18;
+
+ // Response for command that checkpoints a DataFrame.
+ CheckpointCommandResult checkpoint_command_result = 19;
+
+ // Support arbitrary result objects.
+ google.protobuf.Any extension = 999;
+ }
+
+  // Metrics for the query execution. Typically, this field is only present in the last
+  // batch of results and then represents the overall state of the query execution.
+  Metrics metrics = 4;
+
+ // The metrics observed during the execution of the query plan.
+ repeated ObservedMetrics observed_metrics = 6;
+
+  // (Optional) The Spark schema. This field is available when `collect` is called.
+  DataType schema = 7;
+
+ // A SQL command returns an opaque Relation that can be directly used as
input for the next
+ // call.
+ message SqlCommandResult {
+ Relation relation = 1;
+ }
+
+ // Batch results of metrics.
+ message ArrowBatch {
+ // Count rows in `data`. Must match the number of rows inside `data`.
+ int64 row_count = 1;
+ // Serialized Arrow data.
+ bytes data = 2;
+
+ // If set, row offset of the start of this ArrowBatch in execution results.
+ optional int64 start_offset = 3;
+ }
+
+ message Metrics {
+
+ repeated MetricObject metrics = 1;
+
+ message MetricObject {
+ string name = 1;
+ int64 plan_id = 2;
+ int64 parent = 3;
+ map<string, MetricValue> execution_metrics = 4;
+ }
+
+ message MetricValue {
+ string name = 1;
+ int64 value = 2;
+ string metric_type = 3;
+ }
+ }
+
+ message ObservedMetrics {
+ string name = 1;
+ repeated Expression.Literal values = 2;
+ repeated string keys = 3;
+ int64 plan_id = 4;
+ }
+
+ message ResultComplete {
+    // If present, in a reattachable execution this means that after server sends onComplete,
+    // the execution is complete. If the server sends onComplete without sending a ResultComplete,
+    // it means that there is more, and the client should use ReattachExecute RPC to continue.
+ }
+
+ // This message is used to communicate progress about the query progress
during the execution.
+ message ExecutionProgress {
+ // Captures the progress of each individual stage.
+ repeated StageInfo stages = 1;
+
+ // Captures the currently in progress tasks.
+ int64 num_inflight_tasks = 2;
+
+ message StageInfo {
+ int64 stage_id = 1;
+ int64 num_tasks = 2;
+ int64 num_completed_tasks = 3;
+ int64 input_bytes_read = 4;
+ bool done = 5;
+ }
+ }
+}
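Putting ExecutePlanRequest and ExecutePlanResponse together, a minimal blocking-stub sketch that runs a plan and walks the streamed responses, reading Arrow batches until ResultComplete or the end of the stream. The stub and plan are assumed to be built as in the earlier sketches; decoding the Arrow payload itself is out of scope here:

```java
import java.util.Iterator;
import java.util.UUID;
import org.apache.kyuubi.shaded.spark.connect.proto.ExecutePlanRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.ExecutePlanResponse;
import org.apache.kyuubi.shaded.spark.connect.proto.Plan;
import org.apache.kyuubi.shaded.spark.connect.proto.SparkConnectServiceGrpc;
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

final class ExecuteExample {
  static void execute(SparkConnectServiceGrpc.SparkConnectServiceBlockingStub stub,
                      String sessionId, UserContext userContext, Plan plan) {
    ExecutePlanRequest request = ExecutePlanRequest.newBuilder()
        .setSessionId(sessionId)
        .setUserContext(userContext)
        .setOperationId(UUID.randomUUID().toString()) // optional; echoed back in every response
        .addTags("kyuubi-example")                    // optional; usable later with Interrupt
        .setPlan(plan)
        .build();

    Iterator<ExecutePlanResponse> responses = stub.executePlan(request);
    while (responses.hasNext()) {
      ExecutePlanResponse response = responses.next();
      if (response.hasArrowBatch()) {
        // Arrow IPC payload; hand response.getArrowBatch().getData() to an Arrow reader.
        System.out.println("rows in batch: " + response.getArrowBatch().getRowCount());
      } else if (response.hasResultComplete()) {
        break; // reattachable executions signal completion explicitly
      }
    }
  }
}
```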
+
+// The key-value pair for the config request and response.
+message KeyValue {
+ // (Required) The key.
+ string key = 1;
+ // (Optional) The value.
+ optional string value = 2;
+}
+
+// Request to update or fetch the configurations.
+message ConfigRequest {
+ // (Required)
+ //
+  // The session_id specifies a spark session for a user id (which is specified
+  // by user_context.user_id). The session_id is set by the client to be able to
+  // collate streaming responses from different queries within the dedicated session.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 8;
+
+ // (Required) User context
+ UserContext user_context = 2;
+
+ // (Required) The operation for the config.
+ Operation operation = 3;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 4;
+
+ message Operation {
+ oneof op_type {
+ Set set = 1;
+ Get get = 2;
+ GetWithDefault get_with_default = 3;
+ GetOption get_option = 4;
+ GetAll get_all = 5;
+ Unset unset = 6;
+ IsModifiable is_modifiable = 7;
+ }
+ }
+
+ message Set {
+ // (Required) The config key-value pairs to set.
+ repeated KeyValue pairs = 1;
+ }
+
+ message Get {
+ // (Required) The config keys to get.
+ repeated string keys = 1;
+ }
+
+ message GetWithDefault {
+    // (Required) The config key-value pairs to get. The value will be used as the default value.
+    repeated KeyValue pairs = 1;
+ }
+
+ message GetOption {
+ // (Required) The config keys to get optionally.
+ repeated string keys = 1;
+ }
+
+ message GetAll {
+ // (Optional) The prefix of the config key to get.
+ optional string prefix = 1;
+ }
+
+ message Unset {
+ // (Required) The config keys to unset.
+ repeated string keys = 1;
+ }
+
+ message IsModifiable {
+ // (Required) The config keys to check the config is modifiable.
+ repeated string keys = 1;
+ }
+}
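A hedged sketch of the two most common config operations, setting and getting Spark confs through this message; the conf key shown is only an example:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.ConfigRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.KeyValue;
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

final class ConfigRequests {
  static ConfigRequest set(String sessionId, UserContext userContext, String key, String value) {
    return ConfigRequest.newBuilder()
        .setSessionId(sessionId)
        .setUserContext(userContext)
        .setOperation(ConfigRequest.Operation.newBuilder()
            .setSet(ConfigRequest.Set.newBuilder()
                .addPairs(KeyValue.newBuilder().setKey(key).setValue(value))))
        .build();
  }

  static ConfigRequest get(String sessionId, UserContext userContext, String key) {
    return ConfigRequest.newBuilder()
        .setSessionId(sessionId)
        .setUserContext(userContext)
        .setOperation(ConfigRequest.Operation.newBuilder()
            .setGet(ConfigRequest.Get.newBuilder().addKeys(key)))
        .build();
  }
}
```

The matching ConfigResponse carries the results back in the same KeyValue shape.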
+
+// Response to the config request.
+// Next ID: 5
+message ConfigResponse {
+ string session_id = 1;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 4;
+
+ // (Optional) The result key-value pairs.
+ //
+ // Available when the operation is 'Get', 'GetWithDefault', 'GetOption',
'GetAll'.
+ // Also available for the operation 'IsModifiable' with boolean string
"true" and "false".
+ repeated KeyValue pairs = 2;
+
+ // (Optional)
+ //
+ // Warning messages for deprecated or unsupported configurations.
+ repeated string warnings = 3;
+}
+
+// Request to transfer client-local artifacts.
+message AddArtifactsRequest {
+
+ // (Required)
+ //
+  // The session_id specifies a spark session for a user id (which is specified
+  // by user_context.user_id). The session_id is set by the client to be able to
+  // collate streaming responses from different queries within the dedicated session.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string session_id = 1;
+
+ // User context
+ UserContext user_context = 2;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 7;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 6;
+
+ // A chunk of an Artifact.
+ message ArtifactChunk {
+ // Data chunk.
+ bytes data = 1;
+ // CRC to allow server to verify integrity of the chunk.
+ int64 crc = 2;
+ }
+
+ // An artifact that is contained in a single `ArtifactChunk`.
+ // Generally, this message represents tiny artifacts such as REPL-generated
class files.
+ message SingleChunkArtifact {
+ // The name of the artifact is expected in the form of a "Relative Path"
that is made up of a
+ // sequence of directories and the final file element.
+ // Examples of "Relative Path"s: "jars/test.jar", "classes/xyz.class",
"abc.xyz", "a/b/X.jar".
+ // The server is expected to maintain the hierarchy of files as defined by
their name. (i.e
+ // The relative path of the file on the server's filesystem will be the
same as the name of
+ // the provided artifact)
+ string name = 1;
+ // A single data chunk.
+ ArtifactChunk data = 2;
+ }
+
+ // A number of `SingleChunkArtifact` batched into a single RPC.
+ message Batch {
+ repeated SingleChunkArtifact artifacts = 1;
+ }
+
+ // Signals the beginning/start of a chunked artifact.
+ // A large artifact is transferred through a payload of
`BeginChunkedArtifact` followed by a
+ // sequence of `ArtifactChunk`s.
+ message BeginChunkedArtifact {
+ // Name of the artifact undergoing chunking. Follows the same conventions
as the `name` in
+ // the `Artifact` message.
+ string name = 1;
+ // Total size of the artifact in bytes.
+ int64 total_bytes = 2;
+ // Number of chunks the artifact is split into.
+ // This includes the `initial_chunk`.
+ int64 num_chunks = 3;
+ // The first/initial chunk.
+ ArtifactChunk initial_chunk = 4;
+ }
+
+ // The payload is either a batch of artifacts or a partial chunk of a large
artifact.
+ oneof payload {
+ Batch batch = 3;
+ // The metadata and the initial chunk of a large artifact chunked into
multiple requests.
+ // The server side is notified about the total size of the large artifact
as well as the
+ // number of chunks to expect.
+ BeginChunkedArtifact begin_chunk = 4;
+ // A chunk of an artifact excluding metadata. This can be any chunk of a
large artifact
+ // excluding the first chunk (which is included in `BeginChunkedArtifact`).
+ ArtifactChunk chunk = 5;
+ }
+}
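To make the chunking contract above concrete, a hedged sketch that turns a byte array into the request sequence for one large artifact: a `BeginChunkedArtifact` carrying the first chunk, followed by plain `ArtifactChunk` messages. The chunk size and the use of CRC32 are assumptions of this sketch; sending the requests over the client-streaming AddArtifacts RPC is left to the caller:

```java
import com.google.protobuf.ByteString;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.CRC32;
import org.apache.kyuubi.shaded.spark.connect.proto.AddArtifactsRequest;

final class ArtifactChunks {
  private static final int CHUNK_SIZE = 32 * 1024; // arbitrary example size

  static List<AddArtifactsRequest> chunked(String sessionId, String name, byte[] bytes) {
    List<AddArtifactsRequest> requests = new ArrayList<>();
    for (int offset = 0; offset < bytes.length; offset += CHUNK_SIZE) {
      byte[] part = Arrays.copyOfRange(bytes, offset, Math.min(offset + CHUNK_SIZE, bytes.length));
      CRC32 crc = new CRC32();
      crc.update(part);
      AddArtifactsRequest.ArtifactChunk chunk = AddArtifactsRequest.ArtifactChunk.newBuilder()
          .setData(ByteString.copyFrom(part))
          .setCrc(crc.getValue())
          .build();
      AddArtifactsRequest.Builder request = AddArtifactsRequest.newBuilder().setSessionId(sessionId);
      if (offset == 0) {
        // First message announces the artifact: name, total size, chunk count, initial chunk.
        int numChunks = (bytes.length + CHUNK_SIZE - 1) / CHUNK_SIZE;
        request.setBeginChunk(AddArtifactsRequest.BeginChunkedArtifact.newBuilder()
            .setName(name)                  // e.g. "jars/test.jar"
            .setTotalBytes(bytes.length)
            .setNumChunks(numChunks)
            .setInitialChunk(chunk));
      } else {
        request.setChunk(chunk);            // subsequent chunks carry only data + CRC
      }
      requests.add(request.build());
    }
    return requests;
  }
}
```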
+
+// Response to adding an artifact. Contains relevant metadata to verify
successful transfer of
+// artifact(s).
+// Next ID: 4
+message AddArtifactsResponse {
+ // Session id in which the AddArtifact was running.
+ string session_id = 2;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 3;
+
+ // The list of artifact(s) seen by the server.
+ repeated ArtifactSummary artifacts = 1;
+
+ // Metadata of an artifact.
+ message ArtifactSummary {
+ string name = 1;
+ // Whether the CRC (Cyclic Redundancy Check) is successful on server
verification.
+ // The server discards any artifact that fails the CRC.
+ // If false, the client may choose to resend the artifact specified by
`name`.
+ bool is_crc_successful = 2;
+ }
+}
+
+// Request to get current statuses of artifacts at the server side.
+message ArtifactStatusesRequest {
+ // (Required)
+ //
+  // The session_id specifies a spark session for a user id (which is specified
+  // by user_context.user_id). The session_id is set by the client to be able to
+  // collate streaming responses from different queries within the dedicated session.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 5;
+
+ // User context
+ UserContext user_context = 2;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 3;
+
+ // The name of the artifact is expected in the form of a "Relative Path"
that is made up of a
+ // sequence of directories and the final file element.
+ // Examples of "Relative Path"s: "jars/test.jar", "classes/xyz.class",
"abc.xyz", "a/b/X.jar".
+ // The server is expected to maintain the hierarchy of files as defined by
their name. (i.e
+ // The relative path of the file on the server's filesystem will be the same
as the name of
+ // the provided artifact)
+ repeated string names = 4;
+}
+
+// Response to checking artifact statuses.
+// Next ID: 4
+message ArtifactStatusesResponse {
+ // Session id in which the ArtifactStatus was running.
+ string session_id = 2;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 3;
+ // A map of artifact names to their statuses.
+ map<string, ArtifactStatus> statuses = 1;
+
+ message ArtifactStatus {
+    // Whether the particular artifact exists at the server.
+ bool exists = 1;
+ }
+}
+
+message InterruptRequest {
+ // (Required)
+ //
+  // The session_id specifies a spark session for a user id (which is specified
+  // by user_context.user_id). The session_id is set by the client to be able to
+  // collate streaming responses from different queries within the dedicated session.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`
+  string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 7;
+
+ // (Required) User context
+ UserContext user_context = 2;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 3;
+
+ // (Required) The type of interrupt to execute.
+ InterruptType interrupt_type = 4;
+
+ enum InterruptType {
+ INTERRUPT_TYPE_UNSPECIFIED = 0;
+
+    // Interrupt all running executions within the session with the provided session_id.
+    INTERRUPT_TYPE_ALL = 1;
+
+    // Interrupt all running executions within the session with the provided operation_tag.
+    INTERRUPT_TYPE_TAG = 2;
+
+    // Interrupt the running execution within the session with the provided operation_id.
+    INTERRUPT_TYPE_OPERATION_ID = 3;
+ }
+
+ oneof interrupt {
+    // if interrupt_tag == INTERRUPT_TYPE_TAG, interrupt operation with this tag.
+    string operation_tag = 5;
+
+    // if interrupt_tag == INTERRUPT_TYPE_OPERATION_ID, interrupt operation with this operation_id.
+    string operation_id = 6;
+ }
+}
+
+// Next ID: 4
+message InterruptResponse {
+ // Session id in which the interrupt was running.
+ string session_id = 1;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 3;
+
+ // Operation ids of the executions which were interrupted.
+ repeated string interrupted_ids = 2;
+
+}
+
+message ReattachOptions {
+  // If true, the request can be reattached to using ReattachExecute.
+  // ReattachExecute can be used either if the stream broke with a GRPC network error,
+  // or if the server closed the stream without sending a response with StreamStatus.complete=true.
+  // The server will keep a buffer of responses in case a response is lost, and
+  // ReattachExecute needs to back-track.
+  //
+  // If false, the execution response stream will not be reattachable, and all responses are
+  // immediately released by the server after being sent.
+  bool reattachable = 1;
+}
+
+message ReattachExecuteRequest {
+ // (Required)
+ //
+ // The session_id of the request to reattach to.
+ // This must be an id of existing session.
+ string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 6;
+
+ // (Required) User context
+ //
+  // user_context.user_id and session_id both identify a unique remote spark session on the
+  // server side.
+ UserContext user_context = 2;
+
+ // (Required)
+ // Provide an id of the request to reattach to.
+ // This must be an id of existing operation.
+ string operation_id = 3;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 4;
+
+ // (Optional)
+ // Last already processed response id from the response stream.
+  // After reattach, server will resume the response stream after that response.
+  // If not specified, server will restart the stream from the start.
+  //
+  // Note: server controls the amount of responses that it buffers and it may drop responses
+  // that are far behind the latest returned response, so this can't be used to arbitrarily
+  // scroll back the cursor. If the response is no longer available, this will result in an error.
+  optional string last_response_id = 5;
+}
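A hedged sketch of how these two messages fit together: the original ExecutePlanRequest opts in via ReattachOptions, and after a broken stream the client resumes with ReattachExecuteRequest, passing the last response id it fully processed. Error handling and the surrounding retry loop are omitted:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.ExecutePlanRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.ReattachExecuteRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.ReattachOptions;
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

final class Reattach {
  // Mark an execute request as reattachable before it is first sent.
  static ExecutePlanRequest withReattach(ExecutePlanRequest request) {
    return request.toBuilder()
        .addRequestOptions(ExecutePlanRequest.RequestOption.newBuilder()
            .setReattachOptions(ReattachOptions.newBuilder().setReattachable(true)))
        .build();
  }

  // Resume the response stream after a network error, continuing past lastResponseId.
  static ReattachExecuteRequest resume(String sessionId, UserContext userContext,
                                       String operationId, String lastResponseId) {
    return ReattachExecuteRequest.newBuilder()
        .setSessionId(sessionId)
        .setUserContext(userContext)
        .setOperationId(operationId)
        .setLastResponseId(lastResponseId)
        .build();
  }
}
```

Once a ResultComplete is seen, a ReleaseExecuteRequest with ReleaseAll frees the server-side response buffers.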
+
+message ReleaseExecuteRequest {
+ // (Required)
+ //
+ // The session_id of the request to reattach to.
+ // This must be an id of existing session.
+ string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 7;
+
+ // (Required) User context
+ //
+  // user_context.user_id and session_id both identify a unique remote spark session on the
+  // server side.
+ UserContext user_context = 2;
+
+ // (Required)
+ // Provide an id of the request to reattach to.
+ // This must be an id of existing operation.
+ string operation_id = 3;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 4;
+
+  // Release and close operation completely.
+  // This will also interrupt the query if it is running execution, and wait for it to be torn down.
+  message ReleaseAll {}
+
+  // Release all responses from the operation response stream up to and including
+  // the response given by response_id.
+  // While server determines by itself how much of a buffer of responses to keep, client providing
+  // explicit release calls will help reduce resource consumption.
+  // Noop if response_id not found in cached responses.
+  message ReleaseUntil {
+    string response_id = 1;
+  }
+
+ oneof release {
+ ReleaseAll release_all = 5;
+ ReleaseUntil release_until = 6;
+ }
+}
+
+// Next ID: 4
+message ReleaseExecuteResponse {
+ // Session id in which the release was running.
+ string session_id = 1;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 3;
+
+ // Operation id of the operation on which the release executed.
+ // If the operation couldn't be found (because e.g. it was concurrently
released), will be unset.
+ // Otherwise, it will be equal to the operation_id from request.
+ optional string operation_id = 2;
+}
+
+message ReleaseSessionRequest {
+ // (Required)
+ //
+ // The session_id of the request to reattach to.
+ // This must be an id of existing session.
+ string session_id = 1;
+
+ // (Required) User context
+ //
+  // user_context.user_id and session_id both identify a unique remote spark session on the
+  // server side.
+ UserContext user_context = 2;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 3;
+}
+
+// Next ID: 3
+message ReleaseSessionResponse {
+ // Session id of the session on which the release executed.
+ string session_id = 1;
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 2;
+}
+
+message FetchErrorDetailsRequest {
+
+ // (Required)
+ // The session_id specifies a Spark session for a user identified by
user_context.user_id.
+ // The id should be a UUID string of the format
`00112233-4455-6677-8899-aabbccddeeff`.
+ string session_id = 1;
+
+ // (Optional)
+ //
+ // Server-side generated idempotency key from the previous responses (if
any). Server
+ // can use this to validate that the server side session has not changed.
+ optional string client_observed_server_side_session_id = 5;
+
+ // User context
+ UserContext user_context = 2;
+
+ // (Required)
+ // The id of the error.
+ string error_id = 3;
+
+ // Provides optional information about the client sending the request. This
field
+ // can be used for language or version specific information and is only
intended for
+ // logging purposes and will not be interpreted by the server.
+ optional string client_type = 4;
+}
+
+// Next ID: 5
+message FetchErrorDetailsResponse {
+
+ // Server-side generated idempotency key that the client can use to assert
that the server side
+ // session has not changed.
+ string server_side_session_id = 3;
+
+ string session_id = 4;
+
+ // The index of the root error in errors. The field will not be set if the
error is not found.
+ optional int32 root_error_idx = 1;
+
+ // A list of errors.
+ repeated Error errors = 2;
+
+ message StackTraceElement {
+ // The fully qualified name of the class containing the execution point.
+ string declaring_class = 1;
+
+ // The name of the method containing the execution point.
+ string method_name = 2;
+
+ // The name of the file containing the execution point.
+ optional string file_name = 3;
+
+ // The line number of the source line containing the execution point.
+ int32 line_number = 4;
+ }
+
+ // QueryContext defines the schema for the query context of a SparkThrowable.
+ // It helps users understand where the error occurs while executing queries.
+ message QueryContext {
+ // The type of this query context.
+ enum ContextType {
+ SQL = 0;
+ DATAFRAME = 1;
+ }
+ ContextType context_type = 10;
+
+ // The object type of the query which throws the exception.
+ // If the exception is directly from the main query, it should be an empty
string.
+ // Otherwise, it should be the exact object type in upper case. For
example, a "VIEW".
+ string object_type = 1;
+
+ // The object name of the query which throws the exception.
+ // If the exception is directly from the main query, it should be an empty
string.
+ // Otherwise, it should be the object name. For example, a view name "V1".
+ string object_name = 2;
+
+ // The starting index in the query text which throws the exception. The
index starts from 0.
+ int32 start_index = 3;
+
+ // The stopping index in the query which throws the exception. The index
starts from 0.
+ int32 stop_index = 4;
+
+ // The corresponding fragment of the query which throws the exception.
+ string fragment = 5;
+
+ // The user code (call site of the API) that caused throwing the exception.
+ string call_site = 6;
+
+ // Summary of the exception cause.
+ string summary = 7;
+ }
+
+ // SparkThrowable defines the schema for SparkThrowable exceptions.
+ message SparkThrowable {
+ // Succinct, human-readable, unique, and consistent representation of the
error category.
+ optional string error_class = 1;
+
+ // The message parameters for the error framework.
+ map<string, string> message_parameters = 2;
+
+ // The query context of a SparkThrowable.
+ repeated QueryContext query_contexts = 3;
+
+ // Portable error identifier across SQL engines
+ // If null, error class or SQLSTATE is not set.
+ optional string sql_state = 4;
+ }
+
+ // Error defines the schema for the representing exception.
+ message Error {
+ // The fully qualified names of the exception class and its parent classes.
+ repeated string error_type_hierarchy = 1;
+
+ // The detailed message of the exception.
+ string message = 2;
+
+ // The stackTrace of the exception. It will be set
+ // if the SQLConf spark.sql.connect.serverStacktrace.enabled is true.
+ repeated StackTraceElement stack_trace = 3;
+
+ // The index of the cause error in errors.
+ optional int32 cause_idx = 4;
+
+ // The structured data of a SparkThrowable exception.
+ optional SparkThrowable spark_throwable = 5;
+ }
+}
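A short sketch of how a client would follow up on a failed RPC: given the error id attached to the failure, it asks for the enriched error and walks the cause chain using root_error_idx and cause_idx. How the error id is extracted from the gRPC status is deployment specific and not shown here:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.FetchErrorDetailsRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.FetchErrorDetailsResponse;
import org.apache.kyuubi.shaded.spark.connect.proto.SparkConnectServiceGrpc;
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

final class ErrorDetails {
  static void print(SparkConnectServiceGrpc.SparkConnectServiceBlockingStub stub,
                    String sessionId, UserContext userContext, String errorId) {
    FetchErrorDetailsResponse response = stub.fetchErrorDetails(
        FetchErrorDetailsRequest.newBuilder()
            .setSessionId(sessionId)
            .setUserContext(userContext)
            .setErrorId(errorId)
            .build());

    if (!response.hasRootErrorIdx()) {
      return; // the error is no longer cached on the server
    }
    int idx = response.getRootErrorIdx();
    while (true) {
      FetchErrorDetailsResponse.Error error = response.getErrors(idx);
      String type = error.getErrorTypeHierarchyCount() > 0
          ? error.getErrorTypeHierarchy(0) : "<unknown>";
      System.out.println(type + ": " + error.getMessage());
      if (!error.hasCauseIdx()) {
        break; // end of the cause chain
      }
      idx = error.getCauseIdx();
    }
  }
}
```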
+
+message CheckpointCommandResult {
+ // (Required) The logical plan checkpointed.
+ CachedRemoteRelation relation = 1;
+}
+
+// Main interface for the SparkConnect service.
+service SparkConnectService {
+
+  // Executes a request that contains the query and returns a stream of [[Response]].
+  //
+  // It is guaranteed that there is at least one ARROW batch returned even if the result set is empty.
+  rpc ExecutePlan(ExecutePlanRequest) returns (stream ExecutePlanResponse) {}
+
+  // Analyzes a query and returns a [[AnalyzeResponse]] containing metadata about the query.
+  rpc AnalyzePlan(AnalyzePlanRequest) returns (AnalyzePlanResponse) {}
+
+  // Update or fetch the configurations and returns a [[ConfigResponse]] containing the result.
+  rpc Config(ConfigRequest) returns (ConfigResponse) {}
+
+  // Add artifacts to the session and returns a [[AddArtifactsResponse]] containing metadata about
+  // the added artifacts.
+  rpc AddArtifacts(stream AddArtifactsRequest) returns (AddArtifactsResponse) {}
+
+  // Check statuses of artifacts in the session and returns them in a [[ArtifactStatusesResponse]]
+  rpc ArtifactStatus(ArtifactStatusesRequest) returns (ArtifactStatusesResponse) {}
+
+  // Interrupts running executions
+  rpc Interrupt(InterruptRequest) returns (InterruptResponse) {}
+
+  // Reattach to an existing reattachable execution.
+  // The ExecutePlan must have been started with ReattachOptions.reattachable=true.
+  // If the ExecutePlanResponse stream ends without a ResultComplete message, there is more to
+  // continue. If there is a ResultComplete, the client should use ReleaseExecute with
+  rpc ReattachExecute(ReattachExecuteRequest) returns (stream ExecutePlanResponse) {}
+
+  // Release a reattachable execution, or parts thereof.
+  // The ExecutePlan must have been started with ReattachOptions.reattachable=true.
+  // Non-reattachable executions are released automatically and immediately after the ExecutePlan
+  // RPC and ReleaseExecute may not be used.
+  rpc ReleaseExecute(ReleaseExecuteRequest) returns (ReleaseExecuteResponse) {}
+
+  // Release a session.
+  // All the executions in the session will be released. Any further requests for the session with
+  // that session_id for the given user_id will fail. If the session didn't exist or was already
+  // released, this is a noop.
+  rpc ReleaseSession(ReleaseSessionRequest) returns (ReleaseSessionResponse) {}
+
+  // FetchErrorDetails retrieves the matched exception with details based on a provided error id.
+  rpc FetchErrorDetails(FetchErrorDetailsRequest) returns (FetchErrorDetailsResponse) {}
+}
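To tie the service definition back to the generated stubs, a minimal end-to-end sketch that asks the server for its Spark version over AnalyzePlan; the channel and blocking stub are created as in the earlier bootstrap sketch and the user id is a placeholder:

```java
import java.util.UUID;
import org.apache.kyuubi.shaded.spark.connect.proto.AnalyzePlanRequest;
import org.apache.kyuubi.shaded.spark.connect.proto.AnalyzePlanResponse;
import org.apache.kyuubi.shaded.spark.connect.proto.SparkConnectServiceGrpc;
import org.apache.kyuubi.shaded.spark.connect.proto.UserContext;

final class SparkVersionProbe {
  static String sparkVersion(SparkConnectServiceGrpc.SparkConnectServiceBlockingStub stub) {
    AnalyzePlanResponse response = stub.analyzePlan(
        AnalyzePlanRequest.newBuilder()
            .setSessionId(UUID.randomUUID().toString())
            .setUserContext(UserContext.newBuilder().setUserId("kyuubi"))
            .setSparkVersion(AnalyzePlanRequest.SparkVersion.getDefaultInstance())
            .build());
    return response.getSparkVersion().getVersion();
  }
}
```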
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/catalog.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/catalog.proto
new file mode 100644
index 0000000..06e6282
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/catalog.proto
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "spark/connect/common.proto";
+import "spark/connect/types.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// Catalog messages are marked as unstable.
+message Catalog {
+ oneof cat_type {
+ CurrentDatabase current_database = 1;
+ SetCurrentDatabase set_current_database = 2;
+ ListDatabases list_databases = 3;
+ ListTables list_tables = 4;
+ ListFunctions list_functions = 5;
+ ListColumns list_columns = 6;
+ GetDatabase get_database = 7;
+ GetTable get_table = 8;
+ GetFunction get_function = 9;
+ DatabaseExists database_exists = 10;
+ TableExists table_exists = 11;
+ FunctionExists function_exists = 12;
+ CreateExternalTable create_external_table = 13;
+ CreateTable create_table = 14;
+ DropTempView drop_temp_view = 15;
+ DropGlobalTempView drop_global_temp_view = 16;
+ RecoverPartitions recover_partitions = 17;
+ IsCached is_cached = 18;
+ CacheTable cache_table = 19;
+ UncacheTable uncache_table = 20;
+ ClearCache clear_cache = 21;
+ RefreshTable refresh_table = 22;
+ RefreshByPath refresh_by_path = 23;
+ CurrentCatalog current_catalog = 24;
+ SetCurrentCatalog set_current_catalog = 25;
+ ListCatalogs list_catalogs = 26;
+ }
+}
+
+// See `spark.catalog.currentDatabase`
+message CurrentDatabase { }
+
+// See `spark.catalog.setCurrentDatabase`
+message SetCurrentDatabase {
+ // (Required)
+ string db_name = 1;
+}
+
+// See `spark.catalog.listDatabases`
+message ListDatabases {
+ // (Optional) The pattern that the database name needs to match
+ optional string pattern = 1;
+}
+
+// See `spark.catalog.listTables`
+message ListTables {
+ // (Optional)
+ optional string db_name = 1;
+ // (Optional) The pattern that the table name needs to match
+ optional string pattern = 2;
+}
+
+// See `spark.catalog.listFunctions`
+message ListFunctions {
+ // (Optional)
+ optional string db_name = 1;
+ // (Optional) The pattern that the function name needs to match
+ optional string pattern = 2;
+}
+
+// See `spark.catalog.listColumns`
+message ListColumns {
+ // (Required)
+ string table_name = 1;
+ // (Optional)
+ optional string db_name = 2;
+}
+
+// See `spark.catalog.getDatabase`
+message GetDatabase {
+ // (Required)
+ string db_name = 1;
+}
+
+// See `spark.catalog.getTable`
+message GetTable {
+ // (Required)
+ string table_name = 1;
+ // (Optional)
+ optional string db_name = 2;
+}
+
+// See `spark.catalog.getFunction`
+message GetFunction {
+ // (Required)
+ string function_name = 1;
+ // (Optional)
+ optional string db_name = 2;
+}
+
+// See `spark.catalog.databaseExists`
+message DatabaseExists {
+ // (Required)
+ string db_name = 1;
+}
+
+// See `spark.catalog.tableExists`
+message TableExists {
+ // (Required)
+ string table_name = 1;
+ // (Optional)
+ optional string db_name = 2;
+}
+
+// See `spark.catalog.functionExists`
+message FunctionExists {
+ // (Required)
+ string function_name = 1;
+ // (Optional)
+ optional string db_name = 2;
+}
+
+// See `spark.catalog.createExternalTable`
+message CreateExternalTable {
+ // (Required)
+ string table_name = 1;
+ // (Optional)
+ optional string path = 2;
+ // (Optional)
+ optional string source = 3;
+ // (Optional)
+ optional DataType schema = 4;
+ // Options could be empty for valid data source format.
+ // The map key is case insensitive.
+ map<string, string> options = 5;
+}
+
+// See `spark.catalog.createTable`
+message CreateTable {
+ // (Required)
+ string table_name = 1;
+ // (Optional)
+ optional string path = 2;
+ // (Optional)
+ optional string source = 3;
+ // (Optional)
+ optional string description = 4;
+ // (Optional)
+ optional DataType schema = 5;
+ // Options could be empty for valid data source format.
+ // The map key is case insensitive.
+ map<string, string> options = 6;
+}
+
+// See `spark.catalog.dropTempView`
+message DropTempView {
+ // (Required)
+ string view_name = 1;
+}
+
+// See `spark.catalog.dropGlobalTempView`
+message DropGlobalTempView {
+ // (Required)
+ string view_name = 1;
+}
+
+// See `spark.catalog.recoverPartitions`
+message RecoverPartitions {
+ // (Required)
+ string table_name = 1;
+}
+
+// See `spark.catalog.isCached`
+message IsCached {
+ // (Required)
+ string table_name = 1;
+}
+
+// See `spark.catalog.cacheTable`
+message CacheTable {
+ // (Required)
+ string table_name = 1;
+
+ // (Optional)
+ optional StorageLevel storage_level = 2;
+}
+
+// See `spark.catalog.uncacheTable`
+message UncacheTable {
+ // (Required)
+ string table_name = 1;
+}
+
+// See `spark.catalog.clearCache`
+message ClearCache { }
+
+// See `spark.catalog.refreshTable`
+message RefreshTable {
+ // (Required)
+ string table_name = 1;
+}
+
+// See `spark.catalog.refreshByPath`
+message RefreshByPath {
+ // (Required)
+ string path = 1;
+}
+
+// See `spark.catalog.currentCatalog`
+message CurrentCatalog { }
+
+// See `spark.catalog.setCurrentCatalog`
+message SetCurrentCatalog {
+ // (Required)
+ string catalog_name = 1;
+}
+
+// See `spark.catalog.listCatalogs`
+message ListCatalogs {
+ // (Optional) The pattern that the catalog name needs to match
+ optional string pattern = 1;
+}
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto
new file mode 100644
index 0000000..f84e33e
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto
@@ -0,0 +1,533 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+import "google/protobuf/any.proto";
+import "spark/connect/common.proto";
+import "spark/connect/expressions.proto";
+import "spark/connect/relations.proto";
+
+package spark.connect;
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// A [[Command]] is an operation that is executed by the server that does not directly consume or
+// produce a relational result.
+message Command {
+ oneof command_type {
+ CommonInlineUserDefinedFunction register_function = 1;
+ WriteOperation write_operation = 2;
+ CreateDataFrameViewCommand create_dataframe_view = 3;
+ WriteOperationV2 write_operation_v2 = 4;
+ SqlCommand sql_command = 5;
+ WriteStreamOperationStart write_stream_operation_start = 6;
+ StreamingQueryCommand streaming_query_command = 7;
+ GetResourcesCommand get_resources_command = 8;
+ StreamingQueryManagerCommand streaming_query_manager_command = 9;
+ CommonInlineUserDefinedTableFunction register_table_function = 10;
+ StreamingQueryListenerBusCommand streaming_query_listener_bus_command = 11;
+ CommonInlineUserDefinedDataSource register_data_source = 12;
+ CreateResourceProfileCommand create_resource_profile_command = 13;
+ CheckpointCommand checkpoint_command = 14;
+ RemoveCachedRemoteRelationCommand remove_cached_remote_relation_command =
15;
+ MergeIntoTableCommand merge_into_table_command = 16;
+
+ // This field is used to mark extensions to the protocol. When plugins
generate arbitrary
+ // Commands they can add them here. During the planning the correct
resolution is done.
+ google.protobuf.Any extension = 999;
+
+ }
+}
+
+// A SQL Command is used to trigger the eager evaluation of SQL commands in Spark.
+//
+// When the SQL provided as part of the message is a command it will be immediately evaluated
+// and the result will be collected and returned as part of a LocalRelation. If the result is
+// not a command, the operation will simply return a SQL Relation. This allows the client to be
+// almost oblivious to the server-side behavior.
+message SqlCommand {
+ // (Required) SQL Query.
+ string sql = 1 [deprecated=true];
+
+ // (Optional) A map of parameter names to literal expressions.
+ map<string, Expression.Literal> args = 2 [deprecated=true];
+
+ // (Optional) A sequence of literal expressions for positional parameters in
the SQL query text.
+ repeated Expression.Literal pos_args = 3 [deprecated=true];
+
+ // (Optional) A map of parameter names to expressions.
+ // It cannot coexist with `pos_arguments`.
+ map<string, Expression> named_arguments = 4 [deprecated=true];
+
+ // (Optional) A sequence of expressions for positional parameters in the SQL
query text.
+ // It cannot coexist with `named_arguments`.
+ repeated Expression pos_arguments = 5 [deprecated=true];
+
+ // (Optional) The relation that this SQL command will be built on.
+ Relation input = 6;
+}
+
+// A command that can create DataFrame global temp view or local temp view.
+message CreateDataFrameViewCommand {
+ // (Required) The relation that this view will be built on.
+ Relation input = 1;
+
+ // (Required) View name.
+ string name = 2;
+
+ // (Required) Whether this is global temp view or local temp view.
+ bool is_global = 3;
+
+ // (Required)
+ //
+ // If true, and if the view already exists, updates it; if false, and if the
view
+ // already exists, throws exception.
+ bool replace = 4;
+}
+
+// As writes are not directly handled during analysis and planning, they are
modeled as commands.
+message WriteOperation {
+ // (Required) The output of the `input` relation will be persisted according
to the options.
+ Relation input = 1;
+
+ // (Optional) Format value according to the Spark documentation. Examples
are: text, parquet, delta.
+ optional string source = 2;
+
+ // (Optional)
+ //
+ // The destination of the write operation can be either a path or a table.
+ // If the destination is neither a path nor a table, such as jdbc and noop,
+ // the `save_type` should not be set.
+ oneof save_type {
+ string path = 3;
+ SaveTable table = 4;
+ }
+
+ // (Required) the save mode.
+ SaveMode mode = 5;
+
+ // (Optional) List of columns to sort the output by.
+ repeated string sort_column_names = 6;
+
+ // (Optional) List of columns for partitioning.
+ repeated string partitioning_columns = 7;
+
+ // (Optional) Bucketing specification. Bucketing must set the number of
buckets and the columns
+ // to bucket by.
+ BucketBy bucket_by = 8;
+
+ // (Optional) A list of configuration options.
+ map<string, string> options = 9;
+
+ // (Optional) Columns used for clustering the table.
+ repeated string clustering_columns = 10;
+
+ message SaveTable {
+ // (Required) The table name.
+ string table_name = 1;
+ // (Required) The method to be called to write to the table.
+ TableSaveMethod save_method = 2;
+
+ enum TableSaveMethod {
+ TABLE_SAVE_METHOD_UNSPECIFIED = 0;
+ TABLE_SAVE_METHOD_SAVE_AS_TABLE = 1;
+ TABLE_SAVE_METHOD_INSERT_INTO = 2;
+ }
+ }
+
+ message BucketBy {
+ repeated string bucket_column_names = 1;
+ int32 num_buckets = 2;
+ }
+
+ enum SaveMode {
+ SAVE_MODE_UNSPECIFIED = 0;
+ SAVE_MODE_APPEND = 1;
+ SAVE_MODE_OVERWRITE = 2;
+ SAVE_MODE_ERROR_IF_EXISTS = 3;
+ SAVE_MODE_IGNORE = 4;
+ }
+}
+
+// As writes are not directly handled during analysis and planning, they are modeled as commands.
+message WriteOperationV2 {
+ // (Required) The output of the `input` relation will be persisted according
to the options.
+ Relation input = 1;
+
+ // (Required) The destination of the write operation must be either a path
or a table.
+ string table_name = 2;
+
+ // (Optional) A provider for the underlying output data source. Spark's
default catalog supports
+ // "parquet", "json", etc.
+ optional string provider = 3;
+
+ // (Optional) List of columns for partitioning for output table created by
`create`,
+ // `createOrReplace`, or `replace`
+ repeated Expression partitioning_columns = 4;
+
+ // (Optional) A list of configuration options.
+ map<string, string> options = 5;
+
+ // (Optional) A list of table properties.
+ map<string, string> table_properties = 6;
+
+ // (Required) Write mode.
+ Mode mode = 7;
+
+ enum Mode {
+ MODE_UNSPECIFIED = 0;
+ MODE_CREATE = 1;
+ MODE_OVERWRITE = 2;
+ MODE_OVERWRITE_PARTITIONS = 3;
+ MODE_APPEND = 4;
+ MODE_REPLACE = 5;
+ MODE_CREATE_OR_REPLACE = 6;
+ }
+
+ // (Optional) A condition for overwrite saving mode
+ Expression overwrite_condition = 8;
+
+ // (Optional) Columns used for clustering the table.
+ repeated string clustering_columns = 9;
+}
+
+// Starts write stream operation as streaming query. Query ID and Run ID of the streaming
+// query are returned.
+message WriteStreamOperationStart {
+
+ // (Required) The output of the `input` streaming relation will be written.
+ Relation input = 1;
+
+ // The following fields directly map to API for DataStreamWriter().
+ // Consult API documentation unless explicitly documented here.
+
+ string format = 2;
+ map<string, string> options = 3;
+ repeated string partitioning_column_names = 4;
+
+ oneof trigger {
+ string processing_time_interval = 5;
+ bool available_now = 6;
+ bool once = 7;
+ string continuous_checkpoint_interval = 8;
+ }
+
+ string output_mode = 9;
+ string query_name = 10;
+
+ // The destination is optional. When set, it can be a path or a table name.
+ oneof sink_destination {
+ string path = 11;
+ string table_name = 12;
+ }
+
+ StreamingForeachFunction foreach_writer = 13;
+ StreamingForeachFunction foreach_batch = 14;
+
+ // (Optional) Columns used for clustering the table.
+ repeated string clustering_column_names = 15;
+}
+
+message StreamingForeachFunction {
+ oneof function {
+ PythonUDF python_function = 1;
+ ScalarScalaUDF scala_function = 2;
+ }
+}
+
+message WriteStreamOperationStartResult {
+
+ // (Required) Query instance. See `StreamingQueryInstanceId`.
+ StreamingQueryInstanceId query_id = 1;
+
+ // An optional query name.
+ string name = 2;
+
+ // Optional query started event if there is any listener registered on the
client side.
+ optional string query_started_event_json = 3;
+
+ // TODO: How do we indicate errors?
+ // TODO: Consider adding status, last progress etc here.
+}
+
+// A tuple that uniquely identifies an instance of streaming query run. It consists of `id` that
+// persists across the streaming runs and `run_id` that changes between each run of the
+// streaming query that resumes from the checkpoint.
+message StreamingQueryInstanceId {
+
+ // (Required) The unique id of this query that persists across restarts from checkpoint data.
+ // That is, this id is generated when a query is started for the first time, and
+ // will be the same every time it is restarted from checkpoint data.
+ string id = 1;
+
+ // (Required) The unique id of this run of the query. That is, every start/restart of a query
+ // will generate a unique run_id. Therefore, every time a query is restarted from
+ // checkpoint, it will have the same `id` but different `run_id`s.
+ string run_id = 2;
+}
+
+// Commands for a streaming query.
+message StreamingQueryCommand {
+
+ // (Required) Query instance. See `StreamingQueryInstanceId`.
+ StreamingQueryInstanceId query_id = 1;
+
+ // See documentation for the corresponding API method in StreamingQuery.
+ oneof command {
+ // status() API.
+ bool status = 2;
+ // lastProgress() API.
+ bool last_progress = 3;
+ // recentProgress() API.
+ bool recent_progress = 4;
+ // stop() API. Stops the query.
+ bool stop = 5;
+ // processAllAvailable() API. Waits till all the available data is
processed
+ bool process_all_available = 6;
+ // explain() API. Returns logical and physical plans.
+ ExplainCommand explain = 7;
+ // exception() API. Returns the exception in the query if any.
+ bool exception = 8;
+ // awaitTermination() API. Waits for the termination of the query.
+ AwaitTerminationCommand await_termination = 9;
+ }
+
+ message ExplainCommand {
+ // TODO: Consider reusing Explain from AnalyzePlanRequest message.
+ // We can not do this right now since it base.proto imports this file.
+ bool extended = 1;
+ }
+
+ message AwaitTerminationCommand {
+ optional int64 timeout_ms = 2;
+ }
+}
+
+// Response for commands on a streaming query.
+message StreamingQueryCommandResult {
+ // (Required) Query instance id. See `StreamingQueryInstanceId`.
+ StreamingQueryInstanceId query_id = 1;
+
+ oneof result_type {
+ StatusResult status = 2;
+ RecentProgressResult recent_progress = 3;
+ ExplainResult explain = 4;
+ ExceptionResult exception = 5;
+ AwaitTerminationResult await_termination = 6;
+ }
+
+ message StatusResult {
+ // See documentation for these Scala 'StreamingQueryStatus' struct
+ string status_message = 1;
+ bool is_data_available = 2;
+ bool is_trigger_active = 3;
+ bool is_active = 4;
+ }
+
+ message RecentProgressResult {
+ // Progress reports as an array of json strings.
+ repeated string recent_progress_json = 5;
+ }
+
+ message ExplainResult {
+ // Logical and physical plans as string
+ string result = 1;
+ }
+
+ message ExceptionResult {
+ // (Optional) Exception message as string, maps to the return value of
original
+ // StreamingQueryException's toString method
+ optional string exception_message = 1;
+ // (Optional) Exception error class as string
+ optional string error_class = 2;
+ // (Optional) Exception stack trace as string
+ optional string stack_trace = 3;
+ }
+
+ message AwaitTerminationResult {
+ bool terminated = 1;
+ }
+}
+
+// Commands for the streaming query manager.
+message StreamingQueryManagerCommand {
+
+ // See documentation for the corresponding API method in
StreamingQueryManager.
+ oneof command {
+ // active() API, returns a list of active queries.
+ bool active = 1;
+ // get() API, returns the StreamingQuery identified by id.
+ string get_query = 2;
+ // awaitAnyTermination() API, wait until any query terminates or timeout.
+ AwaitAnyTerminationCommand await_any_termination = 3;
+ // resetTerminated() API.
+ bool reset_terminated = 4;
+ // addListener API.
+ StreamingQueryListenerCommand add_listener = 5;
+ // removeListener API.
+ StreamingQueryListenerCommand remove_listener = 6;
+ // listListeners() API, returns a list of streaming query listeners.
+ bool list_listeners = 7;
+ }
+
+ message AwaitAnyTerminationCommand {
+ // (Optional) The waiting time in milliseconds to wait for any query to
terminate.
+ optional int64 timeout_ms = 1;
+ }
+
+ message StreamingQueryListenerCommand {
+ bytes listener_payload = 1;
+ optional PythonUDF python_listener_payload = 2;
+ string id = 3;
+ }
+}
+
+// Response for commands on the streaming query manager.
+message StreamingQueryManagerCommandResult {
+ oneof result_type {
+ ActiveResult active = 1;
+ StreamingQueryInstance query = 2;
+ AwaitAnyTerminationResult await_any_termination = 3;
+ bool reset_terminated = 4;
+ bool add_listener = 5;
+ bool remove_listener = 6;
+ ListStreamingQueryListenerResult list_listeners = 7;
+ }
+
+ message ActiveResult {
+ repeated StreamingQueryInstance active_queries = 1;
+ }
+
+ message StreamingQueryInstance {
+ // (Required) The id and runId of this query.
+ StreamingQueryInstanceId id = 1;
+ // (Optional) The name of this query.
+ optional string name = 2;
+ }
+
+ message AwaitAnyTerminationResult {
+ bool terminated = 1;
+ }
+
+ message StreamingQueryListenerInstance {
+ bytes listener_payload = 1;
+ }
+
+ message ListStreamingQueryListenerResult {
+ // (Required) Reference IDs of listener instances.
+ repeated string listener_ids = 1;
+ }
+}
+
+// The protocol for client-side StreamingQueryListener.
+// This command will only be set when either the first listener is added to the client, or the last
+// listener is removed from the client.
+// The add_listener_bus_listener command will only be set true in the first case.
+// The remove_listener_bus_listener command will only be set true in the second case.
+message StreamingQueryListenerBusCommand {
+ oneof command {
+ bool add_listener_bus_listener = 1;
+ bool remove_listener_bus_listener = 2;
+ }
+}
+
+// The enum used for client side streaming query listener event
+// There is no QueryStartedEvent defined here,
+// it is added as a field in WriteStreamOperationStartResult
+enum StreamingQueryEventType {
+ QUERY_PROGRESS_UNSPECIFIED = 0;
+ QUERY_PROGRESS_EVENT = 1;
+ QUERY_TERMINATED_EVENT = 2;
+ QUERY_IDLE_EVENT = 3;
+}
+
+// The protocol for the returned events in the long-running response channel.
+message StreamingQueryListenerEvent {
+ // (Required) The json serialized event, all StreamingQueryListener events have a json method
+ string event_json = 1;
+ // (Required) Query event type used by client to decide how to deserialize the event_json
+ StreamingQueryEventType event_type = 2;
+}
+
+message StreamingQueryListenerEventsResult {
+ repeated StreamingQueryListenerEvent events = 1;
+ optional bool listener_bus_listener_added = 2;
+}
+
+// Command to get the output of 'SparkContext.resources'
+message GetResourcesCommand { }
+
+// Response for command 'GetResourcesCommand'.
+message GetResourcesCommandResult {
+ map<string, ResourceInformation> resources = 1;
+}
+
+// Command to create ResourceProfile
+message CreateResourceProfileCommand {
+ // (Required) The ResourceProfile to be built on the server-side.
+ ResourceProfile profile = 1;
+}
+
+// Response for command 'CreateResourceProfileCommand'.
+message CreateResourceProfileCommandResult {
+ // (Required) Server-side generated resource profile id.
+ int32 profile_id = 1;
+}
+
+// Command to remove `CashedRemoteRelation`
+message RemoveCachedRemoteRelationCommand {
+ // (Required) The remote to be related
+ CachedRemoteRelation relation = 1;
+}
+
+message CheckpointCommand {
+ // (Required) The logical plan to checkpoint.
+ Relation relation = 1;
+
+ // (Required) Locally checkpoint using a local temporary
+ // directory in Spark Connect server (Spark Driver)
+ bool local = 2;
+
+ // (Required) Whether to checkpoint this dataframe immediately.
+ bool eager = 3;
+}
+
+message MergeIntoTableCommand {
+ // (Required) The name of the target table.
+ string target_table_name = 1;
+
+ // (Required) The relation of the source table.
+ Relation source_table_plan = 2;
+
+ // (Required) The condition to match the source and target.
+ Expression merge_condition = 3;
+
+ // (Optional) The actions to be taken when the condition is matched.
+ repeated Expression match_actions = 4;
+
+ // (Optional) The actions to be taken when the condition is not matched.
+ repeated Expression not_matched_actions = 5;
+
+ // (Optional) The actions to be taken when the condition is not matched by
source.
+ repeated Expression not_matched_by_source_actions = 6;
+
+ // (Required) Whether to enable schema evolution.
+ bool with_schema_evolution = 7;
+}
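
For illustration only (not part of this patch): once these protos are compiled with the relocated java_package shown in the sibling files, clients build the messages above with ordinary protobuf-java builders; only the import path changes. A minimal sketch of constructing the WriteOperation defined above, assuming the standard protobuf-java code generator (the class name and sample values below are made up for the example):

    import org.apache.kyuubi.shaded.spark.connect.proto.Relation;
    import org.apache.kyuubi.shaded.spark.connect.proto.WriteOperation;

    public class WriteOperationSketch {
      public static void main(String[] args) {
        // Placeholder input; a real client would supply a Read/Project/etc. relation here.
        Relation input = Relation.getDefaultInstance();

        WriteOperation write = WriteOperation.newBuilder()
            .setInput(input)
            .setSource("parquet")                                 // optional data source format
            .setPath("/tmp/out")                                  // oneof save_type: a path destination
            .setMode(WriteOperation.SaveMode.SAVE_MODE_OVERWRITE) // required save mode
            .addPartitioningColumns("dt")                         // repeated partitioning column
            .putOptions("compression", "snappy")                  // free-form writer options
            .build();

        System.out.println(write);
      }
    }
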
diff --git
a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto
new file mode 100644
index 0000000..e156aa4
--- /dev/null
+++
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// StorageLevel for persisting Datasets/Tables.
+message StorageLevel {
+ // (Required) Whether the cache should use disk or not.
+ bool use_disk = 1;
+ // (Required) Whether the cache should use memory or not.
+ bool use_memory = 2;
+ // (Required) Whether the cache should use off-heap or not.
+ bool use_off_heap = 3;
+ // (Required) Whether the cached data is deserialized or not.
+ bool deserialized = 4;
+ // (Required) The number of replicas.
+ int32 replication = 5;
+}
+
+
+// ResourceInformation to hold information about a type of Resource.
+// The corresponding class is 'org.apache.spark.resource.ResourceInformation'
+message ResourceInformation {
+ // (Required) The name of the resource
+ string name = 1;
+ // (Required) An array of strings describing the addresses of the resource.
+ repeated string addresses = 2;
+}
+
+// An executor resource request.
+message ExecutorResourceRequest {
+ // (Required) resource name.
+ string resource_name = 1;
+
+ // (Required) resource amount requesting.
+ int64 amount = 2;
+
+ // Optional script used to discover the resources.
+ optional string discovery_script = 3;
+
+ // Optional vendor, required for some cluster managers.
+ optional string vendor = 4;
+}
+
+// A task resource request.
+message TaskResourceRequest {
+ // (Required) resource name.
+ string resource_name = 1;
+
+ // (Required) resource amount requesting as a double to support fractional
+ // resource requests.
+ double amount = 2;
+}
+
+message ResourceProfile {
+ // (Optional) Resource requests for executors. Mapped from the resource name
+ // (e.g., cores, memory, CPU) to its specific request.
+ map<string, ExecutorResourceRequest> executor_resources = 1;
+
+ // (Optional) Resource requests for tasks. Mapped from the resource name
+ // (e.g., cores, memory, CPU) to its specific request.
+ map<string, TaskResourceRequest> task_resources = 2;
+}
+
+message Origin {
+ // (Required) Indicate the origin type.
+ oneof function {
+ PythonOrigin python_origin = 1;
+ }
+}
+
+message PythonOrigin {
+ // (Required) Name of the origin, for example, the name of the function
+ string fragment = 1;
+
+ // (Required) Callsite to show to end users, for example, stacktrace.
+ string call_site = 2;
+}
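
As a rough usage sketch (not part of this patch), the value messages in common.proto map one-to-one onto builder calls under the relocated package. The snippet below shows a StorageLevel roughly matching Spark's MEMORY_AND_DISK level and a one-GPU ResourceProfile, assuming the standard protobuf-java code generator (class name and values are illustrative):

    import org.apache.kyuubi.shaded.spark.connect.proto.ExecutorResourceRequest;
    import org.apache.kyuubi.shaded.spark.connect.proto.ResourceProfile;
    import org.apache.kyuubi.shaded.spark.connect.proto.StorageLevel;

    public class CommonProtoSketch {
      public static void main(String[] args) {
        // Deserialized in memory, spill to disk, no off-heap, single replica.
        StorageLevel memoryAndDisk = StorageLevel.newBuilder()
            .setUseDisk(true)
            .setUseMemory(true)
            .setUseOffHeap(false)
            .setDeserialized(true)
            .setReplication(1)
            .build();

        // Executor resource request for one GPU, wrapped in a ResourceProfile.
        ResourceProfile profile = ResourceProfile.newBuilder()
            .putExecutorResources("gpu",
                ExecutorResourceRequest.newBuilder()
                    .setResourceName("gpu")
                    .setAmount(1)
                    .build())
            .build();

        System.out.println(memoryAndDisk);
        System.out.println(profile);
      }
    }
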
diff --git
a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/example_plugins.proto
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/example_plugins.proto
new file mode 100644
index 0000000..51faef4
--- /dev/null
+++
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/example_plugins.proto
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+import "spark/connect/relations.proto";
+import "spark/connect/expressions.proto";
+option go_package = "internal/generated";
+
+package spark.connect;
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+
+message ExamplePluginRelation {
+ Relation input = 1;
+ string custom_field = 2;
+
+}
+
+message ExamplePluginExpression {
+ Expression child = 1;
+ string custom_field = 2;
+}
+
+message ExamplePluginCommand {
+ string custom_field = 1;
+}
\ No newline at end of file
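
The example plugin messages above travel through the google.protobuf.Any extension fields defined on Command earlier and on Expression/Relation in the files that follow. A hedged sketch of that round trip, assuming the standard protobuf-java field-name mapping and that the protobuf runtime itself stays under com.google.protobuf (if this module also relocates the protobuf runtime, the Any import moves accordingly; class name and values are illustrative):

    import com.google.protobuf.Any;
    import org.apache.kyuubi.shaded.spark.connect.proto.ExamplePluginRelation;
    import org.apache.kyuubi.shaded.spark.connect.proto.Relation;

    public class PluginExtensionSketch {
      public static void main(String[] args) throws Exception {
        ExamplePluginRelation plugin = ExamplePluginRelation.newBuilder()
            .setCustomField("demo")
            .build();

        // Plugin relations are packed into the Any `extension` field of Relation.
        Relation relation = Relation.newBuilder()
            .setExtension(Any.pack(plugin))
            .build();

        // A server-side plugin unpacks the payload again before planning.
        ExamplePluginRelation unpacked =
            relation.getExtension().unpack(ExamplePluginRelation.class);
        System.out.println(unpacked.getCustomField());
      }
    }
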
diff --git
a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto
new file mode 100644
index 0000000..a11b0c1
--- /dev/null
+++
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+import "google/protobuf/any.proto";
+import "spark/connect/types.proto";
+import "spark/connect/common.proto";
+
+package spark.connect;
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// Expression used to refer to fields, functions and similar. This can be used everywhere
+// expressions in SQL appear.
+message Expression {
+
+ ExpressionCommon common = 18;
+ oneof expr_type {
+ Literal literal = 1;
+ UnresolvedAttribute unresolved_attribute = 2;
+ UnresolvedFunction unresolved_function = 3;
+ ExpressionString expression_string = 4;
+ UnresolvedStar unresolved_star = 5;
+ Alias alias = 6;
+ Cast cast = 7;
+ UnresolvedRegex unresolved_regex = 8;
+ SortOrder sort_order = 9;
+ LambdaFunction lambda_function = 10;
+ Window window = 11;
+ UnresolvedExtractValue unresolved_extract_value = 12;
+ UpdateFields update_fields = 13;
+ UnresolvedNamedLambdaVariable unresolved_named_lambda_variable = 14;
+ CommonInlineUserDefinedFunction common_inline_user_defined_function = 15;
+ CallFunction call_function = 16;
+ NamedArgumentExpression named_argument_expression = 17;
+ MergeAction merge_action = 19;
+ TypedAggregateExpression typed_aggregate_expression = 20;
+
+ // This field is used to mark extensions to the protocol. When plugins generate arbitrary
+ // relations they can add them here. During the planning the correct resolution is done.
+ google.protobuf.Any extension = 999;
+ }
+
+
+ // Expression for the OVER clause or WINDOW clause.
+ message Window {
+
+ // (Required) The window function.
+ Expression window_function = 1;
+
+ // (Optional) The way that input rows are partitioned.
+ repeated Expression partition_spec = 2;
+
+ // (Optional) Ordering of rows in a partition.
+ repeated SortOrder order_spec = 3;
+
+ // (Optional) Window frame in a partition.
+ //
+ // If not set, it will be treated as 'UnspecifiedFrame'.
+ WindowFrame frame_spec = 4;
+
+ // The window frame
+ message WindowFrame {
+
+ // (Required) The type of the frame.
+ FrameType frame_type = 1;
+
+ // (Required) The lower bound of the frame.
+ FrameBoundary lower = 2;
+
+ // (Required) The upper bound of the frame.
+ FrameBoundary upper = 3;
+
+ enum FrameType {
+ FRAME_TYPE_UNDEFINED = 0;
+
+ // RowFrame treats rows in a partition individually.
+ FRAME_TYPE_ROW = 1;
+
+ // RangeFrame treats rows in a partition as groups of peers.
+ // All rows having the same 'ORDER BY' ordering are considered as
peers.
+ FRAME_TYPE_RANGE = 2;
+ }
+
+ message FrameBoundary {
+ oneof boundary {
+ // CURRENT ROW boundary
+ bool current_row = 1;
+
+ // UNBOUNDED boundary.
+ // For lower bound, it will be converted to 'UnboundedPreceding'.
+ // for upper bound, it will be converted to 'UnboundedFollowing'.
+ bool unbounded = 2;
+
+ // This is an expression for future proofing. We are expecting
literals on the server side.
+ Expression value = 3;
+ }
+ }
+ }
+ }
+
+ // SortOrder is used to specify the data ordering, it is normally used in Sort and Window.
+ // It is an unevaluable expression and cannot be evaluated, so can not be used in Projection.
+ message SortOrder {
+ // (Required) The expression to be sorted.
+ Expression child = 1;
+
+ // (Required) The sort direction, should be ASCENDING or DESCENDING.
+ SortDirection direction = 2;
+
+ // (Required) How to deal with NULLs, should be NULLS_FIRST or NULLS_LAST.
+ NullOrdering null_ordering = 3;
+
+ enum SortDirection {
+ SORT_DIRECTION_UNSPECIFIED = 0;
+ SORT_DIRECTION_ASCENDING = 1;
+ SORT_DIRECTION_DESCENDING = 2;
+ }
+
+ enum NullOrdering {
+ SORT_NULLS_UNSPECIFIED = 0;
+ SORT_NULLS_FIRST = 1;
+ SORT_NULLS_LAST = 2;
+ }
+ }
+
+ message Cast {
+ // (Required) the expression to be casted.
+ Expression expr = 1;
+
+ // (Required) the data type that the expr to be casted to.
+ oneof cast_to_type {
+ DataType type = 2;
+ // If this is set, Server will use Catalyst parser to parse this string
to DataType.
+ string type_str = 3;
+ }
+
+ // (Optional) The expression evaluation mode.
+ EvalMode eval_mode = 4;
+
+ enum EvalMode {
+ EVAL_MODE_UNSPECIFIED = 0;
+ EVAL_MODE_LEGACY = 1;
+ EVAL_MODE_ANSI = 2;
+ EVAL_MODE_TRY = 3;
+ }
+ }
+
+ message Literal {
+ oneof literal_type {
+ DataType null = 1;
+ bytes binary = 2;
+ bool boolean = 3;
+
+ int32 byte = 4;
+ int32 short = 5;
+ int32 integer = 6;
+ int64 long = 7;
+ float float = 10;
+ double double = 11;
+ Decimal decimal = 12;
+
+ string string = 13;
+
+ // Date in units of days since the UNIX epoch.
+ int32 date = 16;
+ // Timestamp in units of microseconds since the UNIX epoch.
+ int64 timestamp = 17;
+ // Timestamp in units of microseconds since the UNIX epoch (without
timezone information).
+ int64 timestamp_ntz = 18;
+
+ CalendarInterval calendar_interval = 19;
+ int32 year_month_interval = 20;
+ int64 day_time_interval = 21;
+ Array array = 22;
+ Map map = 23;
+ Struct struct = 24;
+ }
+
+ message Decimal {
+ // the string representation.
+ string value = 1;
+ // The maximum number of digits allowed in the value.
+ // the maximum precision is 38.
+ optional int32 precision = 2;
+ // declared scale of decimal literal
+ optional int32 scale = 3;
+ }
+
+ message CalendarInterval {
+ int32 months = 1;
+ int32 days = 2;
+ int64 microseconds = 3;
+ }
+
+ message Array {
+ DataType element_type = 1;
+ repeated Literal elements = 2;
+ }
+
+ message Map {
+ DataType key_type = 1;
+ DataType value_type = 2;
+ repeated Literal keys = 3;
+ repeated Literal values = 4;
+ }
+
+ message Struct {
+ DataType struct_type = 1;
+ repeated Literal elements = 2;
+ }
+ }
+
+ // An unresolved attribute that is not explicitly bound to a specific column, but the column
+ // is resolved during analysis by name.
+ message UnresolvedAttribute {
+ // (Required) An identifier that will be parsed by Catalyst parser. This should follow the
+ // Spark SQL identifier syntax.
+ string unparsed_identifier = 1;
+
+ // (Optional) The id of corresponding connect plan.
+ optional int64 plan_id = 2;
+
+ // (Optional) The requested column is a metadata column.
+ optional bool is_metadata_column = 3;
+ }
+
+ // An unresolved function is not explicitly bound to one explicit function, but the function
+ // is resolved during analysis following Sparks name resolution rules.
+ message UnresolvedFunction {
+ // (Required) name (or unparsed name for user defined function) for the unresolved function.
+ string function_name = 1;
+
+ // (Optional) Function arguments. Empty arguments are allowed.
+ repeated Expression arguments = 2;
+
+ // (Required) Indicate if this function should be applied on distinct
values.
+ bool is_distinct = 3;
+
+ // (Required) Indicate if this is a user defined function.
+ //
+ // When it is not a user defined function, Connect will use the function name directly.
+ // When it is a user defined function, Connect will parse the function name first.
+ bool is_user_defined_function = 4;
+ }
+
+ // Expression as string.
+ message ExpressionString {
+ // (Required) A SQL expression that will be parsed by Catalyst parser.
+ string expression = 1;
+ }
+
+ // UnresolvedStar is used to expand all the fields of a relation or struct.
+ message UnresolvedStar {
+
+ // (Optional) The target of the expansion.
+ //
+ // If set, it should end with '.*' and will be parsed by
'parseAttributeName'
+ // in the server side.
+ optional string unparsed_target = 1;
+
+ // (Optional) The id of corresponding connect plan.
+ optional int64 plan_id = 2;
+ }
+
+ // Represents all of the input attributes to a given relational operator,
for example in
+ // "SELECT `(id)?+.+` FROM ...".
+ message UnresolvedRegex {
+ // (Required) The column name used to extract column with regex.
+ string col_name = 1;
+
+ // (Optional) The id of corresponding connect plan.
+ optional int64 plan_id = 2;
+ }
+
+ // Extracts a value or values from an Expression
+ message UnresolvedExtractValue {
+ // (Required) The expression to extract value from, can be
+ // Map, Array, Struct or array of Structs.
+ Expression child = 1;
+
+ // (Required) The expression to describe the extraction, can be
+ // key of Map, index of Array, field name of Struct.
+ Expression extraction = 2;
+ }
+
+ // Add, replace or drop a field of `StructType` expression by name.
+ message UpdateFields {
+ // (Required) The struct expression.
+ Expression struct_expression = 1;
+
+ // (Required) The field name.
+ string field_name = 2;
+
+ // (Optional) The expression to add or replace.
+ //
+ // When not set, it means this field will be dropped.
+ Expression value_expression = 3;
+ }
+
+ message Alias {
+ // (Required) The expression that alias will be added on.
+ Expression expr = 1;
+
+ // (Required) a list of name parts for the alias.
+ //
+ // Scalar columns only has one name that presents.
+ repeated string name = 2;
+
+ // (Optional) Alias metadata expressed as a JSON map.
+ optional string metadata = 3;
+ }
+
+ message LambdaFunction {
+ // (Required) The lambda function.
+ //
+ // The function body should use 'UnresolvedAttribute' as arguments, the
sever side will
+ // replace 'UnresolvedAttribute' with 'UnresolvedNamedLambdaVariable'.
+ Expression function = 1;
+
+ // (Required) Function variables. Must contains 1 ~ 3 variables.
+ repeated Expression.UnresolvedNamedLambdaVariable arguments = 2;
+ }
+
+ message UnresolvedNamedLambdaVariable {
+
+ // (Required) a list of name parts for the variable. Must not be empty.
+ repeated string name_parts = 1;
+ }
+}
+
+message ExpressionCommon {
+ // (Required) Keep the information of the origin for this expression such as
stacktrace.
+ Origin origin = 1;
+}
+
+message CommonInlineUserDefinedFunction {
+ // (Required) Name of the user-defined function.
+ string function_name = 1;
+ // (Optional) Indicate if the user-defined function is deterministic.
+ bool deterministic = 2;
+ // (Optional) Function arguments. Empty arguments are allowed.
+ repeated Expression arguments = 3;
+ // (Required) Indicate the function type of the user-defined function.
+ oneof function {
+ PythonUDF python_udf = 4;
+ ScalarScalaUDF scalar_scala_udf = 5;
+ JavaUDF java_udf = 6;
+ }
+}
+
+message PythonUDF {
+ // (Required) Output type of the Python UDF
+ DataType output_type = 1;
+ // (Required) EvalType of the Python UDF
+ int32 eval_type = 2;
+ // (Required) The encoded commands of the Python UDF
+ bytes command = 3;
+ // (Required) Python version being used in the client.
+ string python_ver = 4;
+ // (Optional) Additional includes for the Python UDF.
+ repeated string additional_includes = 5;
+}
+
+message ScalarScalaUDF {
+ // (Required) Serialized JVM object containing UDF definition, input
encoders and output encoder
+ bytes payload = 1;
+ // (Optional) Input type(s) of the UDF
+ repeated DataType inputTypes = 2;
+ // (Required) Output type of the UDF
+ DataType outputType = 3;
+ // (Required) True if the UDF can return null value
+ bool nullable = 4;
+ // (Required) Indicate if the UDF is an aggregate function
+ bool aggregate = 5;
+}
+
+message JavaUDF {
+ // (Required) Fully qualified name of Java class
+ string class_name = 1;
+
+ // (Optional) Output type of the Java UDF
+ optional DataType output_type = 2;
+
+ // (Required) Indicate if the Java user-defined function is an aggregate
function
+ bool aggregate = 3;
+}
+
+message TypedAggregateExpression {
+ // (Required) The aggregate function object packed into bytes.
+ ScalarScalaUDF scalar_scala_udf = 1;
+}
+
+message CallFunction {
+ // (Required) Unparsed name of the SQL function.
+ string function_name = 1;
+
+ // (Optional) Function arguments. Empty arguments are allowed.
+ repeated Expression arguments = 2;
+}
+
+message NamedArgumentExpression {
+ // (Required) The key of the named argument.
+ string key = 1;
+
+ // (Required) The value expression of the named argument.
+ Expression value = 2;
+}
+
+message MergeAction {
+ // (Required) The action type of the merge action.
+ ActionType action_type = 1;
+
+ // (Optional) The condition expression of the merge action.
+ optional Expression condition = 2;
+
+ // (Optional) The assignments of the merge action. Required for ActionTypes
INSERT and UPDATE.
+ repeated Assignment assignments = 3;
+
+ enum ActionType {
+ ACTION_TYPE_INVALID = 0;
+ ACTION_TYPE_DELETE = 1;
+ ACTION_TYPE_INSERT = 2;
+ ACTION_TYPE_INSERT_STAR = 3;
+ ACTION_TYPE_UPDATE = 4;
+ ACTION_TYPE_UPDATE_STAR = 5;
+ }
+
+ message Assignment {
+ // (Required) The key of the assignment.
+ Expression key = 1;
+
+ // (Required) The value of the assignment.
+ Expression value = 2;
+ }
+}
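
To make the Expression tree above concrete, here is a small sketch (not part of this patch) that builds the filter condition `age > 21` out of an UnresolvedAttribute, a Literal and an UnresolvedFunction, assuming the standard protobuf-java code generator under the relocated package (class name and column name are illustrative):

    import org.apache.kyuubi.shaded.spark.connect.proto.Expression;

    public class ExpressionSketch {
      public static void main(String[] args) {
        // Unresolved column reference "age"; the server resolves it by name during analysis.
        Expression ageCol = Expression.newBuilder()
            .setUnresolvedAttribute(
                Expression.UnresolvedAttribute.newBuilder()
                    .setUnparsedIdentifier("age"))
            .build();

        // Integer literal 21.
        Expression lit21 = Expression.newBuilder()
            .setLiteral(Expression.Literal.newBuilder().setInteger(21))
            .build();

        // Unresolved function ">" applied to (age, 21), i.e. the condition age > 21.
        Expression condition = Expression.newBuilder()
            .setUnresolvedFunction(
                Expression.UnresolvedFunction.newBuilder()
                    .setFunctionName(">")
                    .addArguments(ageCol)
                    .addArguments(lit21)
                    .setIsDistinct(false))
            .build();

        System.out.println(condition);
      }
    }
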
diff --git
a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto
new file mode 100644
index 0000000..1535776
--- /dev/null
+++
b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto
@@ -0,0 +1,1120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "google/protobuf/any.proto";
+import "spark/connect/expressions.proto";
+import "spark/connect/types.proto";
+import "spark/connect/catalog.proto";
+import "spark/connect/common.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// The main [[Relation]] type. Fundamentally, a relation is a typed container
+// that has exactly one explicit relation type set.
+//
+// When adding new relation types, they have to be registered here.
+message Relation {
+ RelationCommon common = 1;
+ oneof rel_type {
+ Read read = 2;
+ Project project = 3;
+ Filter filter = 4;
+ Join join = 5;
+ SetOperation set_op = 6;
+ Sort sort = 7;
+ Limit limit = 8;
+ Aggregate aggregate = 9;
+ SQL sql = 10;
+ LocalRelation local_relation = 11;
+ Sample sample = 12;
+ Offset offset = 13;
+ Deduplicate deduplicate = 14;
+ Range range = 15;
+ SubqueryAlias subquery_alias = 16;
+ Repartition repartition = 17;
+ ToDF to_df = 18;
+ WithColumnsRenamed with_columns_renamed = 19;
+ ShowString show_string = 20;
+ Drop drop = 21;
+ Tail tail = 22;
+ WithColumns with_columns = 23;
+ Hint hint = 24;
+ Unpivot unpivot = 25;
+ ToSchema to_schema = 26;
+ RepartitionByExpression repartition_by_expression = 27;
+ MapPartitions map_partitions = 28;
+ CollectMetrics collect_metrics = 29;
+ Parse parse = 30;
+ GroupMap group_map = 31;
+ CoGroupMap co_group_map = 32;
+ WithWatermark with_watermark = 33;
+ ApplyInPandasWithState apply_in_pandas_with_state = 34;
+ HtmlString html_string = 35;
+ CachedLocalRelation cached_local_relation = 36;
+ CachedRemoteRelation cached_remote_relation = 37;
+ CommonInlineUserDefinedTableFunction common_inline_user_defined_table_function = 38;
+ AsOfJoin as_of_join = 39;
+ CommonInlineUserDefinedDataSource common_inline_user_defined_data_source = 40;
+ WithRelations with_relations = 41;
+
+ // NA functions
+ NAFill fill_na = 90;
+ NADrop drop_na = 91;
+ NAReplace replace = 92;
+
+ // stat functions
+ StatSummary summary = 100;
+ StatCrosstab crosstab = 101;
+ StatDescribe describe = 102;
+ StatCov cov = 103;
+ StatCorr corr = 104;
+ StatApproxQuantile approx_quantile = 105;
+ StatFreqItems freq_items = 106;
+ StatSampleBy sample_by = 107;
+
+ // Catalog API (experimental / unstable)
+ Catalog catalog = 200;
+
+ // This field is used to mark extensions to the protocol. When plugins generate arbitrary
+ // relations they can add them here. During the planning the correct resolution is done.
+ google.protobuf.Any extension = 998;
+ Unknown unknown = 999;
+ }
+}
+
+// Used for testing purposes only.
+message Unknown {}
+
+// Common metadata of all relations.
+message RelationCommon {
+ // (Required) Shared relation metadata.
+ string source_info = 1 [deprecated=true];
+
+ // (Optional) A per-client globally unique id for a given connect plan.
+ optional int64 plan_id = 2;
+
+ // (Optional) Keep the information of the origin for this expression such as
stacktrace.
+ Origin origin = 3;
+}
+
+// Relation that uses a SQL query to generate the output.
+message SQL {
+ // (Required) The SQL query.
+ string query = 1;
+
+ // (Optional) A map of parameter names to literal expressions.
+ map<string, Expression.Literal> args = 2 [deprecated=true];
+
+ // (Optional) A sequence of literal expressions for positional parameters in
the SQL query text.
+ repeated Expression.Literal pos_args = 3 [deprecated=true];
+
+ // (Optional) A map of parameter names to expressions.
+ // It cannot coexist with `pos_arguments`.
+ map<string, Expression> named_arguments = 4;
+
+ // (Optional) A sequence of expressions for positional parameters in the SQL
query text.
+ // It cannot coexist with `named_arguments`.
+ repeated Expression pos_arguments = 5;
+}
+
+// Relation of type [[WithRelations]].
+//
+// This relation contains a root plan, and one or more references that are used by the root plan.
+// There are two ways of referencing a relation, by name (through a subquery alias), or by plan_id
+// (using RelationCommon.plan_id).
+//
+// This relation can be used to implement CTEs, describe DAGs, or to reduce tree depth.
+message WithRelations {
+ // (Required) Plan at the root of the query tree. This plan is expected to contain one or more
+ // references. Those references get expanded later on by the engine.
+ Relation root = 1;
+
+ // (Required) Plans referenced by the root plan. Relations in this list are also allowed to
+ // contain references to other relations in this list, as long they do not form cycles.
+ repeated Relation references = 2;
+}
+
+// Relation that reads from a file / table or other data source. Does not have additional
+// inputs.
+message Read {
+ oneof read_type {
+ NamedTable named_table = 1;
+ DataSource data_source = 2;
+ }
+
+ // (Optional) Indicates if this is a streaming read.
+ bool is_streaming = 3;
+
+ message NamedTable {
+ // (Required) Unparsed identifier for the table.
+ string unparsed_identifier = 1;
+
+ // Options for the named table. The map key is case insensitive.
+ map<string, string> options = 2;
+ }
+
+ message DataSource {
+ // (Optional) Supported formats include: parquet, orc, text, json,
parquet, csv, avro.
+ //
+ // If not set, the value from SQL conf 'spark.sql.sources.default' will be
used.
+ optional string format = 1;
+
+ // (Optional) If not set, Spark will infer the schema.
+ //
+ // This schema string should be either DDL-formatted or JSON-formatted.
+ optional string schema = 2;
+
+ // Options for the data source. The context of this map varies based on the
+ // data source format. This options could be empty for valid data source
format.
+ // The map key is case insensitive.
+ map<string, string> options = 3;
+
+ // (Optional) A list of path for file-system backed data sources.
+ repeated string paths = 4;
+
+ // (Optional) Condition in the where clause for each partition.
+ //
+ // This is only supported by the JDBC data source.
+ repeated string predicates = 5;
+ }
+}
+
+// Projection of a bag of expressions for a given input relation.
+//
+// The input relation must be specified.
+// The projected expression can be an arbitrary expression.
+message Project {
+ // (Optional) Input relation is optional for Project.
+ //
+ // For example, `SELECT ABS(-1)` is valid plan without an input plan.
+ Relation input = 1;
+
+ // (Required) A Project requires at least one expression.
+ repeated Expression expressions = 3;
+}
+
+// Relation that applies a boolean expression `condition` on each row of
`input` to produce
+// the output result.
+message Filter {
+ // (Required) Input relation for a Filter.
+ Relation input = 1;
+
+ // (Required) A Filter must have a condition expression.
+ Expression condition = 2;
+}
+
+// Relation of type [[Join]].
+//
+// `left` and `right` must be present.
+message Join {
+ // (Required) Left input relation for a Join.
+ Relation left = 1;
+
+ // (Required) Right input relation for a Join.
+ Relation right = 2;
+
+ // (Optional) The join condition. Could be unset when `using_columns` is
utilized.
+ //
+ // This field does not co-exist with using_columns.
+ Expression join_condition = 3;
+
+ // (Required) The join type.
+ JoinType join_type = 4;
+
+ // Optional. using_columns provides a list of columns that should present on both sides of
+ // the join inputs that this Join will join on. For example A JOIN B USING col_name is
+ // equivalent to A JOIN B on A.col_name = B.col_name.
+ //
+ // This field does not co-exist with join_condition.
+ repeated string using_columns = 5;
+
+ enum JoinType {
+ JOIN_TYPE_UNSPECIFIED = 0;
+ JOIN_TYPE_INNER = 1;
+ JOIN_TYPE_FULL_OUTER = 2;
+ JOIN_TYPE_LEFT_OUTER = 3;
+ JOIN_TYPE_RIGHT_OUTER = 4;
+ JOIN_TYPE_LEFT_ANTI = 5;
+ JOIN_TYPE_LEFT_SEMI = 6;
+ JOIN_TYPE_CROSS = 7;
+ }
+
+ // (Optional) Only used by joinWith. Set the left and right join data types.
+ optional JoinDataType join_data_type = 6;
+
+ message JoinDataType {
+ // If the left data type is a struct.
+ bool is_left_struct = 1;
+ // If the right data type is a struct.
+ bool is_right_struct = 2;
+ }
+}
+
+// Relation of type [[SetOperation]]
+message SetOperation {
+ // (Required) Left input relation for a Set operation.
+ Relation left_input = 1;
+
+ // (Required) Right input relation for a Set operation.
+ Relation right_input = 2;
+
+ // (Required) The Set operation type.
+ SetOpType set_op_type = 3;
+
+ // (Optional) If to remove duplicate rows.
+ //
+ // True to preserve all results.
+ // False to remove duplicate rows.
+ optional bool is_all = 4;
+
+ // (Optional) If to perform the Set operation based on name resolution.
+ //
+ // Only UNION supports this option.
+ optional bool by_name = 5;
+
+ // (Optional) If to perform the Set operation and allow missing columns.
+ //
+ // Only UNION supports this option.
+ optional bool allow_missing_columns = 6;
+
+ enum SetOpType {
+ SET_OP_TYPE_UNSPECIFIED = 0;
+ SET_OP_TYPE_INTERSECT = 1;
+ SET_OP_TYPE_UNION = 2;
+ SET_OP_TYPE_EXCEPT = 3;
+ }
+}
+
+// Relation of type [[Limit]] that is used to `limit` rows from the input
relation.
+message Limit {
+ // (Required) Input relation for a Limit.
+ Relation input = 1;
+
+ // (Required) the limit.
+ int32 limit = 2;
+}
+
+// Relation of type [[Offset]] that is used to read rows staring from the
`offset` on
+// the input relation.
+message Offset {
+ // (Required) Input relation for an Offset.
+ Relation input = 1;
+
+ // (Required) the limit.
+ int32 offset = 2;
+}
+
+// Relation of type [[Tail]] that is used to fetch `limit` rows from the last
of the input relation.
+message Tail {
+ // (Required) Input relation for an Tail.
+ Relation input = 1;
+
+ // (Required) the limit.
+ int32 limit = 2;
+}
+
+// Relation of type [[Aggregate]].
+message Aggregate {
+ // (Required) Input relation for a RelationalGroupedDataset.
+ Relation input = 1;
+
+ // (Required) How the RelationalGroupedDataset was built.
+ GroupType group_type = 2;
+
+ // (Required) Expressions for grouping keys
+ repeated Expression grouping_expressions = 3;
+
+ // (Required) List of values that will be translated to columns in the output DataFrame.
+ repeated Expression aggregate_expressions = 4;
+
+ // (Optional) Pivots a column of the current `DataFrame` and performs the specified aggregation.
+ Pivot pivot = 5;
+
+ // (Optional) List of values that will be translated to columns in the output DataFrame.
+ repeated GroupingSets grouping_sets = 6;
+
+ enum GroupType {
+ GROUP_TYPE_UNSPECIFIED = 0;
+ GROUP_TYPE_GROUPBY = 1;
+ GROUP_TYPE_ROLLUP = 2;
+ GROUP_TYPE_CUBE = 3;
+ GROUP_TYPE_PIVOT = 4;
+ GROUP_TYPE_GROUPING_SETS = 5;
+ }
+
+ message Pivot {
+ // (Required) The column to pivot
+ Expression col = 1;
+
+ // (Optional) List of values that will be translated to columns in the output DataFrame.
+ //
+ // Note that if it is empty, the server side will immediately trigger a job to collect
+ // the distinct values of the column.
+ repeated Expression.Literal values = 2;
+ }
+
+ message GroupingSets {
+ // (Required) Individual grouping set
+ repeated Expression grouping_set = 1;
+ }
+}
+
+// Relation of type [[Sort]].
+message Sort {
+ // (Required) Input relation for a Sort.
+ Relation input = 1;
+
+ // (Required) The ordering expressions
+ repeated Expression.SortOrder order = 2;
+
+ // (Optional) if this is a global sort.
+ optional bool is_global = 3;
+}
+
+
+// Drop specified columns.
+message Drop {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) columns to drop.
+ repeated Expression columns = 2;
+
+ // (Optional) names of columns to drop.
+ repeated string column_names = 3;
+}
+
+
+// Relation of type [[Deduplicate]] which have duplicate rows removed, could
consider either only
+// the subset of columns or all the columns.
+message Deduplicate {
+ // (Required) Input relation for a Deduplicate.
+ Relation input = 1;
+
+ // (Optional) Deduplicate based on a list of column names.
+ //
+ // This field does not co-use with `all_columns_as_keys`.
+ repeated string column_names = 2;
+
+ // (Optional) Deduplicate based on all the columns of the input relation.
+ //
+ // This field does not co-use with `column_names`.
+ optional bool all_columns_as_keys = 3;
+
+ // (Optional) Deduplicate within the time range of watermark.
+ optional bool within_watermark = 4;
+}
+
+// A relation that does not need to be qualified by name.
+message LocalRelation {
+ // (Optional) Local collection data serialized into Arrow IPC streaming
format which contains
+ // the schema of the data.
+ optional bytes data = 1;
+
+ // (Optional) The schema of local data.
+ // It should be either a DDL-formatted type string or a JSON string.
+ //
+ // The server side will update the column names and data types according to
this schema.
+ // If the 'data' is not provided, then this schema will be required.
+ optional string schema = 2;
+}
+
+// A local relation that has been cached already.
+message CachedLocalRelation {
+ // `userId` and `sessionId` fields are deleted since the server must always use the active
+ // session/user rather than arbitrary values provided by the client. It is never valid to access
+ // a local relation from a different session/user.
+ reserved 1, 2;
+ reserved "userId", "sessionId";
+
+ // (Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation.
+ string hash = 3;
+}
+
+// Represents a remote relation that has been cached on server.
+message CachedRemoteRelation {
+ // (Required) ID of the remote related (assigned by the service).
+ string relation_id = 1;
+}
+
+// Relation of type [[Sample]] that samples a fraction of the dataset.
+message Sample {
+ // (Required) Input relation for a Sample.
+ Relation input = 1;
+
+ // (Required) lower bound.
+ double lower_bound = 2;
+
+ // (Required) upper bound.
+ double upper_bound = 3;
+
+ // (Optional) Whether to sample with replacement.
+ optional bool with_replacement = 4;
+
+ // (Required) The random seed.
+ // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details),
+ // however, still keep it 'optional' here for backward compatibility.
+ optional int64 seed = 5;
+
+ // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it.
+ // This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the
+ // provided weights. Otherwise, it is false.
+ bool deterministic_order = 6;
+}
+
+// Relation of type [[Range]] that generates a sequence of integers.
+message Range {
+ // (Optional) Default value = 0
+ optional int64 start = 1;
+
+ // (Required)
+ int64 end = 2;
+
+ // (Required)
+ int64 step = 3;
+
+ // Optional. Default value is assigned by 1) SQL conf
"spark.sql.leafNodeDefaultParallelism" if
+ // it is set, or 2) spark default parallelism.
+ optional int32 num_partitions = 4;
+}
+
+// Relation alias.
+message SubqueryAlias {
+ // (Required) The input relation of SubqueryAlias.
+ Relation input = 1;
+
+ // (Required) The alias.
+ string alias = 2;
+
+ // (Optional) Qualifier of the alias.
+ repeated string qualifier = 3;
+}
+
+// Relation repartition.
+message Repartition {
+ // (Required) The input relation of Repartition.
+ Relation input = 1;
+
+ // (Required) Must be positive.
+ int32 num_partitions = 2;
+
+ // (Optional) Default value is false.
+ optional bool shuffle = 3;
+}
+
+// Compose the string representing rows for output.
+// It will invoke 'Dataset.showString' to compute the results.
+message ShowString {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) Number of rows to show.
+ int32 num_rows = 2;
+
+ // (Required) If set to more than 0, truncates strings to
+ // `truncate` characters and all cells will be aligned right.
+ int32 truncate = 3;
+
+ // (Required) If set to true, prints output rows vertically (one line per
column value).
+ bool vertical = 4;
+}
+
+// Compose the string representing rows for output.
+// It will invoke 'Dataset.htmlString' to compute the results.
+message HtmlString {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) Number of rows to show.
+ int32 num_rows = 2;
+
+ // (Required) If set to more than 0, truncates strings to
+ // `truncate` characters and all cells will be aligned right.
+ int32 truncate = 3;
+}
+
+// Computes specified statistics for numeric and string columns.
+// It will invoke 'Dataset.summary' (same as 'StatFunctions.summary')
+// to compute the results.
+message StatSummary {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) Statistics from to be computed.
+ //
+ // Available statistics are:
+ // count
+ // mean
+ // stddev
+ // min
+ // max
+ // arbitrary approximate percentiles specified as a percentage (e.g. 75%)
+ // count_distinct
+ // approx_count_distinct
+ //
+ // If no statistics are given, this function computes 'count', 'mean', 'stddev', 'min',
+ // 'approximate quartiles' (percentiles at 25%, 50%, and 75%), and 'max'.
+ repeated string statistics = 2;
+}
+
+// Computes basic statistics for numeric and string columns, including count, mean, stddev, min,
+// and max. If no columns are given, this function computes statistics for all numerical or
+// string columns.
+message StatDescribe {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) Columns to compute statistics on.
+ repeated string cols = 2;
+}
+
+// Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
+// It will invoke 'Dataset.stat.crosstab' (same as 'StatFunctions.crossTabulate')
+// to compute the results.
+message StatCrosstab {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The name of the first column.
+ //
+ // Distinct items will make the first item of each row.
+ string col1 = 2;
+
+ // (Required) The name of the second column.
+ //
+ // Distinct items will make the column names of the DataFrame.
+ string col2 = 3;
+}
+
+// Calculate the sample covariance of two numerical columns of a DataFrame.
+// It will invoke 'Dataset.stat.cov' (same as 'StatFunctions.calculateCov') to
compute the results.
+message StatCov {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The name of the first column.
+ string col1 = 2;
+
+ // (Required) The name of the second column.
+ string col2 = 3;
+}
+
+// Calculates the correlation of two columns of a DataFrame. Currently only
supports the Pearson
+// Correlation Coefficient. It will invoke 'Dataset.stat.corr' (same as
+// 'StatFunctions.pearsonCorrelation') to compute the results.
+message StatCorr {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The name of the first column.
+ string col1 = 2;
+
+ // (Required) The name of the second column.
+ string col2 = 3;
+
+ // (Optional) Default value is 'pearson'.
+ //
+ // Currently only supports the Pearson Correlation Coefficient.
+ optional string method = 4;
+}
+
+// Calculates the approximate quantiles of numerical columns of a DataFrame.
+// It will invoke 'Dataset.stat.approxQuantile' (same as
'StatFunctions.approxQuantile')
+// to compute the results.
+message StatApproxQuantile {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The names of the numerical columns.
+ repeated string cols = 2;
+
+ // (Required) A list of quantile probabilities.
+ //
+ // Each number must belong to [0, 1].
+ // For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+ repeated double probabilities = 3;
+
+ // (Required) The relative target precision to achieve (greater than or equal to 0).
+ //
+ // If set to zero, the exact quantiles are computed, which could be very expensive.
+ // Note that values greater than 1 are accepted but give the same result as 1.
+ double relative_error = 4;
+}
+
+// Finding frequent items for columns, possibly with false positives.
+// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems')
+// to compute the results.
+message StatFreqItems {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The names of the columns to search frequent items in.
+ repeated string cols = 2;
+
+ // (Optional) The minimum frequency for an item to be considered `frequent`.
+ // Should be greater than 1e-4.
+ optional double support = 3;
+}
+
+
+// Returns a stratified sample without replacement based on the fraction
+// given on each stratum.
+// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems')
+// to compute the results.
+message StatSampleBy {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The column that defines strata.
+ Expression col = 2;
+
+ // (Required) Sampling fraction for each stratum.
+ //
+ // If a stratum is not specified, we treat its fraction as zero.
+ repeated Fraction fractions = 3;
+
+ // (Required) The random seed.
+ // This field is required to avoid generating mutable dataframes (see
SPARK-48184 for details),
+ // however, still keep it 'optional' here for backward compatibility.
+ optional int64 seed = 5;
+
+ message Fraction {
+ // (Required) The stratum.
+ Expression.Literal stratum = 1;
+
+ // (Required) The fraction value. Must be in [0, 1].
+ double fraction = 2;
+ }
+}
+
+
+// Replaces null values.
+// It will invoke 'Dataset.na.fill' (same as 'DataFrameNaFunctions.fill') to compute the results.
+// Following 3 parameter combinations are supported:
+// 1, 'values' only contains 1 item, 'cols' is empty:
+// replaces null values in all type-compatible columns.
+// 2, 'values' only contains 1 item, 'cols' is not empty:
+// replaces null values in specified columns.
+// 3, 'values' contains more than 1 items, then 'cols' is required to have the same length:
+// replaces each specified column with corresponding value.
+message NAFill {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) Optional list of column names to consider.
+ repeated string cols = 2;
+
+ // (Required) Values to replace null values with.
+ //
+ // Should contain at least 1 item.
+ // Only 4 data types are supported now: bool, long, double, string
+ repeated Expression.Literal values = 3;
+}
+
+
+// Drop rows containing null values.
+// It will invoke 'Dataset.na.drop' (same as 'DataFrameNaFunctions.drop') to
compute the results.
+message NADrop {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) Optional list of column names to consider.
+ //
+ // When it is empty, all the columns in the input relation will be
considered.
+ repeated string cols = 2;
+
+ // (Optional) The minimum number of non-null and non-NaN values required to keep.
+ //
+ // When not set, it is equivalent to the number of considered columns, which means
+ // a row will be kept only if all columns are non-null.
+ //
+ // 'how' options ('all', 'any') can be easily converted to this field:
+ // - 'all' -> set 'min_non_nulls' 1;
+ // - 'any' -> keep 'min_non_nulls' unset;
+ optional int32 min_non_nulls = 3;
+}
+
+
+// Replaces old values with the corresponding values.
+// It will invoke 'Dataset.na.replace' (same as 'DataFrameNaFunctions.replace')
+// to compute the results.
+message NAReplace {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) List of column names to consider.
+ //
+ // When it is empty, all the type-compatible columns in the input relation
will be considered.
+ repeated string cols = 2;
+
+ // (Optional) The value replacement mapping.
+ repeated Replacement replacements = 3;
+
+ message Replacement {
+ // (Required) The old value.
+ //
+ // Only 4 data types are supported now: null, bool, double, string.
+ Expression.Literal old_value = 1;
+
+ // (Required) The new value.
+ //
+ // Should be of the same data type with the old value.
+ Expression.Literal new_value = 2;
+ }
+}
+
+
+// Rename columns on the input relation by the same length of names.
+message ToDF {
+ // (Required) The input relation of RenameColumnsBySameLengthNames.
+ Relation input = 1;
+
+ // (Required)
+ //
+ // The number of columns of the input relation must be equal to the length
+ // of this field. If this is not true, an exception will be returned.
+ repeated string column_names = 2;
+}
+
+
+// Rename columns on the input relation by a map with name to name mapping.
+message WithColumnsRenamed {
+ // (Required) The input relation.
+ Relation input = 1;
+
+
+ // (Optional)
+ //
+ // Renaming column names of input relation from A to B where A is the map key
+ // and B is the map value. This is a no-op if the schema doesn't contain any A. It
+ // does not require all input relation column names to be present as keys.
+ // Duplicated values of B are not allowed.
+ map<string, string> rename_columns_map = 2 [deprecated=true];
+
+ repeated Rename renames = 3;
+
+ message Rename {
+ // (Required) The existing column name.
+ string col_name = 1;
+
+ // (Required) The new column name.
+ string new_col_name = 2;
+ }
+}
+
+// Adding columns or replacing the existing columns that have the same names.
+message WithColumns {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required)
+ //
+ // Given a column name, apply the corresponding expression on the column. If the column
+ // name exists in the input relation, then replace the column. If the column name
+ // does not exist in the input relation, then add it as a new column.
+ //
+ // Only one name part is expected from each Expression.Alias.
+ //
+ // An exception is thrown when duplicated names are present in the mapping.
+ repeated Expression.Alias aliases = 2;
+}
+
+message WithWatermark {
+
+ // (Required) The input relation
+ Relation input = 1;
+
+ // (Required) Name of the column containing event time.
+ string event_time = 2;
+
+ // (Required)
+ string delay_threshold = 3;
+}
+
+// Specify a hint over a relation. Hint should have a name and optional parameters.
+message Hint {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) Hint name.
+ //
+ // Supported Join hints include BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL.
+ //
+ // Supported partitioning hints include COALESCE, REPARTITION, REPARTITION_BY_RANGE.
+ string name = 2;
+
+ // (Optional) Hint parameters.
+ repeated Expression parameters = 3;
+}
+
+// Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set.
+message Unpivot {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) Id columns.
+ repeated Expression ids = 2;
+
+ // (Optional) Value columns to unpivot.
+ optional Values values = 3;
+
+ // (Required) Name of the variable column.
+ string variable_column_name = 4;
+
+ // (Required) Name of the value column.
+ string value_column_name = 5;
+
+ message Values {
+ repeated Expression values = 1;
+ }
+}
+
+message ToSchema {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The user provided schema.
+ //
+ // The Server side will update the dataframe with this schema.
+ DataType schema = 2;
+}
+
+message RepartitionByExpression {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) The partitioning expressions.
+ repeated Expression partition_exprs = 2;
+
+ // (Optional) number of partitions, must be positive.
+ optional int32 num_partitions = 3;
+}
+
+message MapPartitions {
+ // (Required) Input relation for a mapPartitions-equivalent API: mapInPandas, mapInArrow.
+ Relation input = 1;
+
+ // (Required) Input user-defined function.
+ CommonInlineUserDefinedFunction func = 2;
+
+ // (Optional) Whether to use barrier mode execution or not.
+ optional bool is_barrier = 3;
+
+ // (Optional) ResourceProfile id used for the stage level scheduling.
+ optional int32 profile_id = 4;
+}
+
+message GroupMap {
+ // (Required) Input relation for Group Map API: apply, applyInPandas.
+ Relation input = 1;
+
+ // (Required) Expressions for grouping keys.
+ repeated Expression grouping_expressions = 2;
+
+ // (Required) Input user-defined function.
+ CommonInlineUserDefinedFunction func = 3;
+
+ // (Optional) Expressions for sorting. Only used by Scala Sorted Group Map API.
+ repeated Expression sorting_expressions = 4;
+
+ // Below fields are only used by (Flat)MapGroupsWithState
+ // (Optional) Input relation for initial State.
+ Relation initial_input = 5;
+
+ // (Optional) Expressions for grouping keys of the initial state input relation.
+ repeated Expression initial_grouping_expressions = 6;
+
+ // (Optional) True if MapGroupsWithState, false if FlatMapGroupsWithState.
+ optional bool is_map_groups_with_state = 7;
+
+ // (Optional) The output mode of the function.
+ optional string output_mode = 8;
+
+ // (Optional) Timeout configuration for groups that do not receive data for a while.
+ optional string timeout_conf = 9;
+}
+
+message CoGroupMap {
+ // (Required) One input relation for CoGroup Map API - applyInPandas.
+ Relation input = 1;
+
+ // Expressions for grouping keys of the first input relation.
+ repeated Expression input_grouping_expressions = 2;
+
+ // (Required) The other input relation.
+ Relation other = 3;
+
+ // Expressions for grouping keys of the other input relation.
+ repeated Expression other_grouping_expressions = 4;
+
+ // (Required) Input user-defined function.
+ CommonInlineUserDefinedFunction func = 5;
+
+ // (Optional) Expressions for sorting. Only used by Scala Sorted CoGroup Map API.
+ repeated Expression input_sorting_expressions = 6;
+
+ // (Optional) Expressions for sorting. Only used by Scala Sorted CoGroup Map API.
+ repeated Expression other_sorting_expressions = 7;
+}
+
+message ApplyInPandasWithState {
+ // (Required) Input relation for applyInPandasWithState.
+ Relation input = 1;
+
+ // (Required) Expressions for grouping keys.
+ repeated Expression grouping_expressions = 2;
+
+ // (Required) Input user-defined function.
+ CommonInlineUserDefinedFunction func = 3;
+
+ // (Required) Schema for the output DataFrame.
+ string output_schema = 4;
+
+ // (Required) Schema for the state.
+ string state_schema = 5;
+
+ // (Required) The output mode of the function.
+ string output_mode = 6;
+
+ // (Required) Timeout configuration for groups that do not receive data for a while.
+ string timeout_conf = 7;
+}
+
+message CommonInlineUserDefinedTableFunction {
+ // (Required) Name of the user-defined table function.
+ string function_name = 1;
+
+ // (Optional) Whether the user-defined table function is deterministic.
+ bool deterministic = 2;
+
+ // (Optional) Function input arguments. Empty arguments are allowed.
+ repeated Expression arguments = 3;
+
+ // (Required) Type of the user-defined table function.
+ oneof function {
+ PythonUDTF python_udtf = 4;
+ }
+}
+
+message PythonUDTF {
+ // (Optional) Return type of the Python UDTF.
+ optional DataType return_type = 1;
+
+ // (Required) EvalType of the Python UDTF.
+ int32 eval_type = 2;
+
+ // (Required) The encoded commands of the Python UDTF.
+ bytes command = 3;
+
+ // (Required) Python version being used in the client.
+ string python_ver = 4;
+}
+
+message CommonInlineUserDefinedDataSource {
+ // (Required) Name of the data source.
+ string name = 1;
+
+ // (Required) The data source type.
+ oneof data_source {
+ PythonDataSource python_data_source = 2;
+ }
+}
+
+message PythonDataSource {
+ // (Required) The encoded commands of the Python data source.
+ bytes command = 1;
+
+ // (Required) Python version being used in the client.
+ string python_ver = 2;
+}
+
+// Collect arbitrary (named) metrics from a dataset.
+message CollectMetrics {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) Name of the metrics.
+ string name = 2;
+
+ // (Required) The metric sequence.
+ repeated Expression metrics = 3;
+}
+
+message Parse {
+ // (Required) Input relation to Parse. The input is expected to have a single text column.
+ Relation input = 1;
+ // (Required) The expected format of the text.
+ ParseFormat format = 2;
+
+ // (Optional) DataType representing the schema. If not set, Spark will infer the schema.
+ optional DataType schema = 3;
+
+ // Options for the csv/json parser. The map key is case insensitive.
+ map<string, string> options = 4;
+ enum ParseFormat {
+ PARSE_FORMAT_UNSPECIFIED = 0;
+ PARSE_FORMAT_CSV = 1;
+ PARSE_FORMAT_JSON = 2;
+ }
+}
+
+// Relation of type [[AsOfJoin]].
+//
+// `left` and `right` must be present.
+message AsOfJoin {
+ // (Required) Left input relation for a Join.
+ Relation left = 1;
+
+ // (Required) Right input relation for a Join.
+ Relation right = 2;
+
+ // (Required) Field to join on in left DataFrame
+ Expression left_as_of = 3;
+
+ // (Required) Field to join on in right DataFrame
+ Expression right_as_of = 4;
+
+ // (Optional) The join condition. Could be unset when `using_columns` is utilized.
+ //
+ // This field does not co-exist with using_columns.
+ Expression join_expr = 5;
+
+ // Optional. using_columns provides a list of columns that should be present on both sides of
+ // the join inputs that this Join will join on. For example A JOIN B USING col_name is
+ // equivalent to A JOIN B on A.col_name = B.col_name.
+ //
+ // This field does not co-exist with join_condition.
+ repeated string using_columns = 6;
+
+ // (Required) The join type.
+ string join_type = 7;
+
+ // (Optional) The asof tolerance within this range.
+ Expression tolerance = 8;
+
+ // (Required) Whether to allow matching with the same value or not.
+ bool allow_exact_matches = 9;
+
+ // (Required) Whether to search for prior, subsequent, or closest matches.
+ string direction = 10;
+}
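
For orientation, the relations above are ordinary protobuf messages, so a client constructs them with the generated builders. The Java sketch below is illustrative only: it assumes relations.proto carries the same java_package/java_multiple_files options as types.proto further down (so classes such as NADrop and Relation land in org.apache.kyuubi.shaded.spark.connect.proto), and it takes the input Relation as a parameter because the Relation definition sits outside this excerpt.

    // Illustrative sketch only; class locations assume the relocated java_package shown in types.proto.
    import org.apache.kyuubi.shaded.spark.connect.proto.NADrop;
    import org.apache.kyuubi.shaded.spark.connect.proto.Relation;

    public class NADropSketch {
        // Builds a NADrop over an existing relation: consider columns "age" and "name",
        // and keep a row only if it has at least 2 non-null/non-NaN values among them.
        // Leaving min_non_nulls unset would instead mean "drop if any considered column is null".
        static NADrop dropSparseRows(Relation input) {
            return NADrop.newBuilder()
                .setInput(input)
                .addCols("age")
                .addCols("name")
                .setMinNonNulls(2)
                .build();
        }
    }
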
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto
new file mode 100644
index 0000000..d6b4ffb
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// This message describes the logical [[DataType]] of something. It does not carry the value
+// itself but only describes it.
+message DataType {
+ oneof kind {
+ NULL null = 1;
+
+ Binary binary = 2;
+
+ Boolean boolean = 3;
+
+ // Numeric types
+ Byte byte = 4;
+ Short short = 5;
+ Integer integer = 6;
+ Long long = 7;
+
+ Float float = 8;
+ Double double = 9;
+ Decimal decimal = 10;
+
+ // String types
+ String string = 11;
+ Char char = 12;
+ VarChar var_char = 13;
+
+ // Datetime types
+ Date date = 14;
+ Timestamp timestamp = 15;
+ TimestampNTZ timestamp_ntz = 16;
+
+ // Interval types
+ CalendarInterval calendar_interval = 17;
+ YearMonthInterval year_month_interval = 18;
+ DayTimeInterval day_time_interval = 19;
+
+ // Complex types
+ Array array = 20;
+ Struct struct = 21;
+ Map map = 22;
+ Variant variant = 25;
+
+ // UserDefinedType
+ UDT udt = 23;
+
+ // UnparsedDataType
+ Unparsed unparsed = 24;
+ }
+
+ message Boolean {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Byte {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Short {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Integer {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Long {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Float {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Double {
+ uint32 type_variation_reference = 1;
+ }
+
+ message String {
+ uint32 type_variation_reference = 1;
+ string collation = 2;
+ }
+
+ message Binary {
+ uint32 type_variation_reference = 1;
+ }
+
+ message NULL {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Timestamp {
+ uint32 type_variation_reference = 1;
+ }
+
+ message Date {
+ uint32 type_variation_reference = 1;
+ }
+
+ message TimestampNTZ {
+ uint32 type_variation_reference = 1;
+ }
+
+ message CalendarInterval {
+ uint32 type_variation_reference = 1;
+ }
+
+ message YearMonthInterval {
+ optional int32 start_field = 1;
+ optional int32 end_field = 2;
+ uint32 type_variation_reference = 3;
+ }
+
+ message DayTimeInterval {
+ optional int32 start_field = 1;
+ optional int32 end_field = 2;
+ uint32 type_variation_reference = 3;
+ }
+
+ // Start compound types.
+ message Char {
+ int32 length = 1;
+ uint32 type_variation_reference = 2;
+ }
+
+ message VarChar {
+ int32 length = 1;
+ uint32 type_variation_reference = 2;
+ }
+
+ message Decimal {
+ optional int32 scale = 1;
+ optional int32 precision = 2;
+ uint32 type_variation_reference = 3;
+ }
+
+ message StructField {
+ string name = 1;
+ DataType data_type = 2;
+ bool nullable = 3;
+ optional string metadata = 4;
+ }
+
+ message Struct {
+ repeated StructField fields = 1;
+ uint32 type_variation_reference = 2;
+ }
+
+ message Array {
+ DataType element_type = 1;
+ bool contains_null = 2;
+ uint32 type_variation_reference = 3;
+ }
+
+ message Map {
+ DataType key_type = 1;
+ DataType value_type = 2;
+ bool value_contains_null = 3;
+ uint32 type_variation_reference = 4;
+ }
+
+ message Variant {
+ uint32 type_variation_reference = 1;
+ }
+
+ message UDT {
+ string type = 1;
+ optional string jvm_class = 2;
+ optional string python_class = 3;
+ optional string serialized_python_class = 4;
+ DataType sql_type = 5;
+ }
+
+ message Unparsed {
+ // (Required) The unparsed data type string
+ string data_type_string = 1;
+ }
+}
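
As a quick illustration of the DataType message above, here is a hedged Java sketch that builds the schema struct<name: string, age: int> with the relocated classes; method names follow the standard protobuf-java builder conventions generated from this file, and the snippet is not part of the patch itself.

    // Illustrative sketch only: builds struct<name: string NOT NULL, age: int> as a DataType message.
    import org.apache.kyuubi.shaded.spark.connect.proto.DataType;

    public class DataTypeSketch {
        static DataType nameAgeStruct() {
            DataType stringType = DataType.newBuilder()
                .setString(DataType.String.newBuilder())    // oneof kind = string
                .build();
            DataType intType = DataType.newBuilder()
                .setInteger(DataType.Integer.newBuilder())  // oneof kind = integer
                .build();
            return DataType.newBuilder()
                .setStruct(DataType.Struct.newBuilder()
                    .addFields(DataType.StructField.newBuilder()
                        .setName("name").setDataType(stringType).setNullable(false))
                    .addFields(DataType.StructField.newBuilder()
                        .setName("age").setDataType(intType).setNullable(true)))
                .build();
        }
    }
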
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/resources/META-INF/NOTICE b/kyuubi-relocated-spark-connect-rpc/src/main/resources/META-INF/NOTICE
new file mode 100644
index 0000000..b65f4e7
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/resources/META-INF/NOTICE
@@ -0,0 +1,8 @@
+kyuubi-relocated-spark-connect-rpc
+Copyright 2023-2024 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (https://www.apache.org/).
+
+Apache Spark
+Copyright 2014 and onwards The Apache Software Foundation.
diff --git a/pom.xml b/pom.xml
index fef65db..ad18d71 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,6 +54,7 @@
<module>kyuubi-relocated-force-shading</module>
<module>kyuubi-relocated-hive-metastore-client</module>
<module>kyuubi-relocated-hive-service-rpc</module>
+ <module>kyuubi-relocated-spark-connect-rpc</module>
<module>kyuubi-relocated-thrift</module>
<module>kyuubi-relocated-zookeeper-parent</module>
</modules>
@@ -97,6 +98,7 @@
<maven.plugin.surefire.version>3.0.0-M8</maven.plugin.surefire.version>
<maven.plugin.spotless.version>2.30.0</maven.plugin.spotless.version>
<maven.plugin.jacoco.version>0.8.7</maven.plugin.jacoco.version>
+ <maven.plugin.os.version>1.7.1</maven.plugin.os.version>
<maven.plugin.shade.version>3.4.1</maven.plugin.shade.version>
<!-- Needed for consistent times -->
@@ -457,6 +459,13 @@
</executions>
</plugin>
</plugins>
+ <extensions>
+ <extension>
+ <groupId>kr.motd.maven</groupId>
+ <artifactId>os-maven-plugin</artifactId>
+ <version>${maven.plugin.os.version}</version>
+ </extension>
+ </extensions>
</build>
<profiles>
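
One note on the build change above: registering kr.motd.maven:os-maven-plugin as a build extension exposes the ${os.detected.classifier} property, which is the usual way a platform-specific protoc binary gets resolved when a protobuf compilation plugin runs; the compilation plugin configuration itself is outside this excerpt, so that pairing is an assumption here. Once the module is built, the relocated classes behave like any other protobuf-java messages. A minimal, purely illustrative round trip, assuming the standard com.google.protobuf runtime is on the classpath:

    // Illustrative only; if the protobuf runtime is also relocated by this module,
    // the com.google.protobuf import below would change accordingly.
    import com.google.protobuf.InvalidProtocolBufferException;
    import org.apache.kyuubi.shaded.spark.connect.proto.DataType;

    public class RoundTripSketch {
        public static void main(String[] args) throws InvalidProtocolBufferException {
            DataType original = DataType.newBuilder()
                .setString(DataType.String.newBuilder())
                .build();
            byte[] wire = original.toByteArray();        // serialize with the relocated class
            DataType parsed = DataType.parseFrom(wire);  // parse it back
            System.out.println(parsed.hasString());      // prints: true
        }
    }
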