This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new c6cd2cc [SPARK-38625][SQL] DataSource V2: Add APIs for group-based
row-level operations
c6cd2cc is described below
commit c6cd2cc7f92f3905f7cc2d2517d7b7746e286a69
Author: Anton Okolnychyi <[email protected]>
AuthorDate: Thu Mar 24 10:07:43 2022 +0800
[SPARK-38625][SQL] DataSource V2: Add APIs for group-based row-level
operations
### What changes were proposed in this pull request?
This PR contains row-level operation APIs for V2 data sources that can
replace groups of data (e.g. files, partitions). It is a subset of the changes
already reviewed in #35395.
### Why are the changes needed?
These changes are needed to support row-level operations in Spark per SPIP
[SPARK-35801](https://issues.apache.org/jira/browse/SPARK-35801).
### Does this PR introduce _any_ user-facing change?
Yes, this PR adds new Data Source V2 APIs.
### How was this patch tested?
Not applicable.
Closes #35940 from aokolnychyi/spark-38625.
Authored-by: Anton Okolnychyi <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit 6743aaaef9fff46c009cc235dd87d0508c5e5ba8)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalog/SupportsRowLevelOperations.java | 42 ++++++++++
.../sql/connector/write/RowLevelOperation.java | 92 ++++++++++++++++++++++
.../connector/write/RowLevelOperationBuilder.java | 34 ++++++++
.../sql/connector/write/RowLevelOperationInfo.java | 40 ++++++++++
4 files changed, 208 insertions(+)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsRowLevelOperations.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsRowLevelOperations.java
new file mode 100644
index 0000000..323d52f
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsRowLevelOperations.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.catalog;
+
+import org.apache.spark.annotation.Experimental;
+import org.apache.spark.sql.connector.write.RowLevelOperationBuilder;
+import org.apache.spark.sql.connector.write.RowLevelOperation;
+import org.apache.spark.sql.connector.write.RowLevelOperationInfo;
+
+/**
+ * A mix-in interface for {@link Table} row-level operations support. Data sources can implement
+ * this interface to indicate they support rewriting data for DELETE, UPDATE, MERGE operations.
+ *
+ * @since 3.3.0
+ */
+@Experimental
+public interface SupportsRowLevelOperations extends Table {
+ /**
+ * Returns a {@link RowLevelOperationBuilder} to build a {@link RowLevelOperation}.
+ * Spark will call this method while planning DELETE, UPDATE and MERGE operations
+ * that require rewriting data.
+ *
+ * @param info the row-level operation info such as command (e.g. DELETE) and options
+ * @return the row-level operation builder
+ */
+ RowLevelOperationBuilder newRowLevelOperationBuilder(RowLevelOperationInfo
info);
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperation.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperation.java
new file mode 100644
index 0000000..04bbab1
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperation.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.write;
+
+import org.apache.spark.annotation.Experimental;
+import org.apache.spark.sql.connector.expressions.NamedReference;
+import org.apache.spark.sql.connector.read.Scan;
+import org.apache.spark.sql.connector.read.ScanBuilder;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * A logical representation of a data source DELETE, UPDATE, or MERGE operation that requires
+ * rewriting data.
+ *
+ * @since 3.3.0
+ */
+@Experimental
+public interface RowLevelOperation {
+
+ /**
+ * A row-level SQL command.
+ */
+ enum Command {
+ DELETE, UPDATE, MERGE
+ }
+
+ /**
+ * Returns the description of this row-level operation (its class's string form by default).
+ */
+ default String description() {
+ return this.getClass().toString();
+ }
+
+ /**
+ * Returns the SQL command that is being performed.
+ */
+ Command command();
+
+ /**
+ * Returns a {@link ScanBuilder} to configure a {@link Scan} for this row-level operation.
+ * <p>
+ * Data sources fall into two categories: those that can handle a delta of rows and those that
+ * need to replace groups (e.g. partitions, files). Data sources that handle deltas allow Spark
+ * to quickly discard unchanged rows and have no requirements for input scans. Data sources that
+ * replace groups of rows can discard deleted rows but need to keep unchanged rows to be passed
+ * back into the source. This means that scans for such data sources must produce all rows
+ * in a group if any are returned. Some data sources will avoid pushing filters into files (file
+ * granularity), while others will avoid pruning files within a partition (partition granularity).
+ * <p>
+ * For example, if a data source can only replace partitions, all rows from a partition must
+ * be returned by the scan, even if a filter can narrow the set of changes to a single file
+ * in the partition. Similarly, a data source that can swap individual files must produce all
+ * rows from files where at least one record must be changed, not just rows that must be changed.
+ */
+ ScanBuilder newScanBuilder(CaseInsensitiveStringMap options);
+
+ /**
+ * Returns a {@link WriteBuilder} to configure a {@link Write} for this row-level operation.
+ * <p>
+ * Note that Spark will first configure the scan and then the write, allowing data sources to pass
+ * information from the scan to the write. For example, the scan can report which condition was
+ * used to read the data that may be needed by the write under certain isolation levels.
+ * Implementations may capture the built scan or required scan information and then use it
+ * while building the write.
+ */
+ WriteBuilder newWriteBuilder(LogicalWriteInfo info);
+
+ /**
+ * Returns metadata attributes that are required to perform this row-level operation.
+ * <p>
+ * Data sources can use this method to project metadata columns needed for writing
+ * the data back (e.g. metadata columns for grouping data). By default, none are required.
+ */
+ default NamedReference[] requiredMetadataAttributes() {
+ return new NamedReference[0];
+ }
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperationBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperationBuilder.java
new file mode 100644
index 0000000..bc2f577
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperationBuilder.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.write;
+
+import org.apache.spark.annotation.Experimental;
+
+/**
+ * An interface for building a {@link RowLevelOperation}.
+ *
+ * @since 3.3.0
+ */
+@Experimental
+public interface RowLevelOperationBuilder {
+ /**
+ * Returns a {@link RowLevelOperation} that controls how Spark rewrites data
+ * for DELETE, UPDATE, and MERGE commands.
+ */
+ RowLevelOperation build();
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperationInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperationInfo.java
new file mode 100644
index 0000000..e3d7397
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/RowLevelOperationInfo.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.write;
+
+import org.apache.spark.annotation.Experimental;
+import org.apache.spark.sql.connector.write.RowLevelOperation.Command;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * An interface with logical information for a row-level operation such as DELETE, UPDATE, MERGE.
+ *
+ * @since 3.3.0
+ */
+@Experimental
+public interface RowLevelOperationInfo {
+ /**
+ * Returns options that the user specified when performing the row-level operation.
+ */
+ CaseInsensitiveStringMap options();
+
+ /**
+ * Returns the row-level SQL command (e.g. DELETE, UPDATE, MERGE).
+ */
+ Command command();
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]