This is an automated email from the ASF dual-hosted git repository.

fanjia pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/seatunnel.git


The following commit(s) were added to refs/heads/dev by this push:
     new f47495554b [Improve][CDC] Disable exactly_once by default to improve stability (#6244)
f47495554b is described below

commit f47495554b7e04b8fdf3bf5891be80bccae75920
Author: hailin0 <[email protected]>
AuthorDate: Fri Jan 19 08:39:55 2024 +0800

    [Improve][CDC] Disable exactly_once by default to improve stability (#6244)
---
 docs/en/connector-v2/source/MySQL-CDC.md                          | 2 +-
 docs/en/connector-v2/source/Oracle-CDC.md                         | 2 +-
 docs/en/connector-v2/source/Postgre-CDC.md                        | 2 +-
 docs/en/connector-v2/source/SqlServer-CDC.md                      | 2 +-
 .../seatunnel/connectors/cdc/base/option/SourceOptions.java       | 2 +-
 .../resources/oraclecdc_to_oracle_with_custom_primary_key.conf    | 2 ++
 .../postgrescdc_to_postgres_with_custom_primary_key.conf          | 2 +-
 .../sqlservercdc_to_sqlserver_with_custom_primary_key.conf        | 8 +++++++-
 8 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/docs/en/connector-v2/source/MySQL-CDC.md b/docs/en/connector-v2/source/MySQL-CDC.md
index 499830f7fa..850ffb3d58 100644
--- a/docs/en/connector-v2/source/MySQL-CDC.md
+++ b/docs/en/connector-v2/source/MySQL-CDC.md
@@ -171,7 +171,7 @@ When an initial consistent snapshot is made for large 
databases, your establishe
 | chunk-key.even-distribution.factor.lower-bound | Double   | No       | 0.05  
  | The lower bound of the chunk key distribution factor. This factor is used 
to determine whether the table data is evenly distributed. If the distribution 
factor is calculated to be greater than or equal to this lower bound (i.e., 
(MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for 
even distribution. Otherwise, if the distribution factor is less, the table 
will be considered as unev [...]
 | sample-sharding.threshold                      | Integer  | No       | 1000  
  | This configuration specifies the threshold of estimated shard count to 
trigger the sample sharding strategy. When the distribution factor is outside 
the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and 
`chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count 
(calculated as approximate row count / chunk size) exceeds this threshold, the 
sample sharding strategy [...]
 | inverse-sampling.rate                          | Integer  | No       | 1000  
  | The inverse of the sampling rate used in the sample sharding strategy. For 
example, if this value is set to 1000, it means a 1/1000 sampling rate is 
applied during the sampling process. This option provides flexibility in 
controlling the granularity of the sampling, thus affecting the final number of 
shards. It's especially useful when dealing with very large datasets where a 
lower sampling rate is preferr [...]
-| exactly_once                                   | Boolean  | No       | true  
  | Enable exactly once semantic.                                               
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| exactly_once                                   | Boolean  | No       | false 
  | Enable exactly once semantic.                                               
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
 | format                                         | Enum     | No       | 
DEFAULT | Optional output format for MySQL CDC, valid enumerations are 
`DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`.                                           
                                                                                
                                                                                
                                                                                
                             [...]
 | debezium                                       | Config   | No       | -     
  | Pass-through [Debezium's 
properties](https://debezium.io/documentation/reference/1.6/connectors/mysql.html#mysql-connector-properties)
 to Debezium Embedded Engine which is used to capture data changes from MySQL 
server.                                                                         
                                                                                
                                      [...]
 | common-options                                 |          | no       | -     
  | Source plugin common parameters, please refer to [Source Common 
Options](common-options.md) for details                                         
                                                                                
                                                                                
                                                                                
                          [...]
diff --git a/docs/en/connector-v2/source/Oracle-CDC.md b/docs/en/connector-v2/source/Oracle-CDC.md
index f34f6102f6..51375f3492 100644
--- a/docs/en/connector-v2/source/Oracle-CDC.md
+++ b/docs/en/connector-v2/source/Oracle-CDC.md
@@ -228,7 +228,7 @@ exit;
 | chunk-key.even-distribution.factor.lower-bound | Double   | No       | 0.05  
  | The lower bound of the chunk key distribution factor. This factor is used 
to determine whether the table data is evenly distributed. If the distribution 
factor is calculated to be greater than or equal to this lower bound (i.e., 
(MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for 
even distribution. Otherwise, if the distribution factor is less, the table 
will be considered as unev [...]
 | sample-sharding.threshold                      | Integer  | No       | 1000  
  | This configuration specifies the threshold of estimated shard count to 
trigger the sample sharding strategy. When the distribution factor is outside 
the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and 
`chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count 
(calculated as approximate row count / chunk size) exceeds this threshold, the 
sample sharding strategy [...]
 | inverse-sampling.rate                          | Integer  | No       | 1000  
  | The inverse of the sampling rate used in the sample sharding strategy. For 
example, if this value is set to 1000, it means a 1/1000 sampling rate is 
applied during the sampling process. This option provides flexibility in 
controlling the granularity of the sampling, thus affecting the final number of 
shards. It's especially useful when dealing with very large datasets where a 
lower sampling rate is preferr [...]
-| exactly_once                                   | Boolean  | No       | true  
  | Enable exactly once semantic.                                               
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| exactly_once                                   | Boolean  | No       | false 
  | Enable exactly once semantic.                                               
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
 | format                                         | Enum     | No       | 
DEFAULT | Optional output format for Oracle CDC, valid enumerations are 
`DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`.                                           
                                                                                
                                                                                
                                                                                
                            [...]
 | debezium                                       | Config   | No       | -     
  | Pass-through [Debezium's 
properties](https://debezium.io/documentation/reference/1.6/connectors/oracle.html#oracle-connector-properties)
 to Debezium Embedded Engine which is used to capture data changes from Oracle 
server.                                                                         
                                                                                
                                   [...]
 | common-options                                 |          | no       | -     
  | Source plugin common parameters, please refer to [Source Common 
Options](common-options.md) for details                                         
                                                                                
                                                                                
                                                                                
                          [...]
diff --git a/docs/en/connector-v2/source/Postgre-CDC.md b/docs/en/connector-v2/source/Postgre-CDC.md
index ffd776ec53..a9df4d0d08 100644
--- a/docs/en/connector-v2/source/Postgre-CDC.md
+++ b/docs/en/connector-v2/source/Postgre-CDC.md
@@ -99,7 +99,7 @@ ALTER TABLE your_table_name REPLICA IDENTITY FULL;
 | chunk-key.even-distribution.factor.lower-bound | Double   | No       | 0.05  
   | The lower bound of the chunk key distribution factor. This factor is used 
to determine whether the table data is evenly distributed. If the distribution 
factor is calculated to be greater than or equal to this lower bound (i.e., 
(MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for 
even distribution. Otherwise, if the distribution factor is less, the table 
will be considered as une [...]
 | sample-sharding.threshold                      | Integer  | No       | 1000  
   | This configuration specifies the threshold of estimated shard count to 
trigger the sample sharding strategy. When the distribution factor is outside 
the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and 
`chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count 
(calculated as approximate row count / chunk size) exceeds this threshold, the 
sample sharding strateg [...]
 | inverse-sampling.rate                          | Integer  | No       | 1000  
   | The inverse of the sampling rate used in the sample sharding strategy. For 
example, if this value is set to 1000, it means a 1/1000 sampling rate is 
applied during the sampling process. This option provides flexibility in 
controlling the granularity of the sampling, thus affecting the final number of 
shards. It's especially useful when dealing with very large datasets where a 
lower sampling rate is prefer [...]
-| exactly_once                                   | Boolean  | No       | true  
   | Enable exactly once semantic.                                              
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| exactly_once                                   | Boolean  | No       | false 
   | Enable exactly once semantic.                                              
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
 | format                                         | Enum     | No       | 
DEFAULT  | Optional output format for Postgre CDC, valid enumerations are 
`DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`.                                           
                                                                                
                                                                                
                                                                                
                          [...]
 | debezium                                       | Config   | No       | -     
   | Pass-through [Debezium's 
properties](https://debezium.io/documentation/reference/1.6/connectors/postgresql.html#postgresql-connector-properties)
 to Debezium Embedded Engine which is used to capture data changes from Postgre 
server.                                                                         
                                                                                
                         [...]
 | common-options                                 |          | no       | -     
   | Source plugin common parameters, please refer to [Source Common 
Options](common-options.md) for details                                         
                                                                                
                                                                                
                                                                                
                         [...]
diff --git a/docs/en/connector-v2/source/SqlServer-CDC.md b/docs/en/connector-v2/source/SqlServer-CDC.md
index 02cc4c21ac..1c932c28f5 100644
--- a/docs/en/connector-v2/source/SqlServer-CDC.md
+++ b/docs/en/connector-v2/source/SqlServer-CDC.md
@@ -81,7 +81,7 @@ Please download and put SqlServer driver in 
`${SEATUNNEL_HOME}/lib/` dir. For ex
 | chunk-key.even-distribution.factor.lower-bound | Double   | No       | 0.05  
  | The lower bound of the chunk key distribution factor. This factor is used 
to determine whether the table data is evenly distributed. If the distribution 
factor is calculated to be greater than or equal to this lower bound (i.e., 
(MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for 
even distribution. Otherwise, if the distribution factor is less, the table 
will be considered as unev [...]
 | sample-sharding.threshold                      | int      | No       | 1000  
  | This configuration specifies the threshold of estimated shard count to 
trigger the sample sharding strategy. When the distribution factor is outside 
the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and 
`chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count 
(calculated as approximate row count / chunk size) exceeds this threshold, the 
sample sharding strategy [...]
 | inverse-sampling.rate                          | int      | No       | 1000  
  | The inverse of the sampling rate used in the sample sharding strategy. For 
example, if this value is set to 1000, it means a 1/1000 sampling rate is 
applied during the sampling process. This option provides flexibility in 
controlling the granularity of the sampling, thus affecting the final number of 
shards. It's especially useful when dealing with very large datasets where a 
lower sampling rate is preferr [...]
-| exactly_once                                   | Boolean  | No       | true  
  | Enable exactly once semantic.                                               
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| exactly_once                                   | Boolean  | No       | false 
  | Enable exactly once semantic.                                               
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
 | debezium.*                                     | config   | No       | -     
  | Pass-through Debezium's properties to Debezium Embedded Engine which is 
used to capture data changes from SqlServer server.<br/>See more about<br/>the 
[Debezium's SqlServer Connector 
properties](https://debezium.io/documentation/reference/1.6/connectors/sqlserver.html#sqlserver-connector-properties)
                                                                                
                              [...]
 | format                                         | Enum     | No       | 
DEFAULT | Optional output format for SqlServer CDC, valid enumerations are 
"DEFAULT"、"COMPATIBLE_DEBEZIUM_JSON".                                           
                                                                                
                                                                                
                                                                                
                         [...]
 | common-options                                 |          | no       | -     
  | Source plugin common parameters, please refer to [Source Common 
Options](common-options.md) for details.                                        
                                                                                
                                                                                
                                                                                
                          [...]
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/SourceOptions.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/SourceOptions.java
index 99932c21f9..87483d9cff 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/SourceOptions.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/SourceOptions.java
@@ -105,7 +105,7 @@ public class SourceOptions {
     public static final Option<Boolean> EXACTLY_ONCE =
             Options.key("exactly_once")
                     .booleanType()
-                    .defaultValue(true)
+                    .defaultValue(false)
                     .withDescription("Enable exactly once semantic.");
 
     public static OptionRule.Builder getBaseRule() {
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-oracle-e2e/src/test/resources/oraclecdc_to_oracle_with_custom_primary_key.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-oracle-e2e/src/test/resources/oraclecdc_to_oracle_with_custom_primary_key.conf
index 2b6a189ba6..769fab0923 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-oracle-e2e/src/test/resources/oraclecdc_to_oracle_with_custom_primary_key.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-oracle-e2e/src/test/resources/oraclecdc_to_oracle_with_custom_primary_key.conf
@@ -49,6 +49,8 @@ source {
         primaryKeys = ["ID"]
       }
     ]
+
+    exactly_once = true
   }
 }
 
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-postgres-e2e/src/test/resources/postgrescdc_to_postgres_with_custom_primary_key.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-postgres-e2e/src/test/resources/postgrescdc_to_postgres_with_custom_primary_key.conf
index dd7168969e..3250da0db2 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-postgres-e2e/src/test/resources/postgrescdc_to_postgres_with_custom_primary_key.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-postgres-e2e/src/test/resources/postgrescdc_to_postgres_with_custom_primary_key.conf
@@ -37,7 +37,7 @@ source {
     table-names = ["postgres_cdc.inventory.full_types_no_primary_key"]
     base-url = 
"jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF"
     decoding.plugin.name = "decoderbufs"
-    exactly_once = false
+    exactly_once = true
     table-names-config = [
       {
         table = "postgres_cdc.inventory.full_types_no_primary_key"
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_sqlserver_with_custom_primary_key.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_sqlserver_with_custom_primary_key.conf
index 4e9f4e5e73..c3e5189921 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_sqlserver_with_custom_primary_key.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_sqlserver_with_custom_primary_key.conf
@@ -35,7 +35,13 @@ source {
     table-names = ["column_type_test.dbo.full_types_custom_primary_key"]
     base-url = 
"jdbc:sqlserver://sqlserver-host:1433;databaseName=column_type_test"
 
-    exactly_once = false
+    exactly_once = true
+    table-names-config = [
+      {
+        table = "column_type_test.dbo.full_types_custom_primary_key"
+        primaryKeys = ["id"]
+      }
+    ]
   }
 }
 

Reply via email to