This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new c8c25b111d [HUDI-4888] Throw exception if COW table and consistent
hashing bucket index (#7172)
c8c25b111d is described below
commit c8c25b111db03fc87fa24020f6b52fbb3af8315d
Author: Jon Vexler <[email protected]>
AuthorDate: Thu Nov 10 13:42:23 2022 -0500
[HUDI-4888] Throw exception if COW table and consistent hashing bucket
index (#7172)
Co-authored-by: Jonathan Vexler <=>
---
.../org/apache/hudi/config/HoodieIndexConfig.java | 2 +-
.../hudi/common/table/HoodieTableMetaClient.java | 17 ++++++++++++
.../org/apache/hudi/TestHoodieSparkSqlWriter.scala | 30 +++++++++++++++++++++-
3 files changed, 47 insertions(+), 2 deletions(-)
diff --git
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
index 3c8bc636ed..c250e07f33 100644
---
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
+++
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
@@ -255,7 +255,7 @@ public class HoodieIndexConfig extends HoodieConfig {
.withDocumentation("Type of bucket index engine to use. Default is
SIMPLE bucket index, with fixed number of bucket."
+ "Possible options are [SIMPLE | CONSISTENT_HASHING]."
+ "Consistent hashing supports dynamic resizing of the number of
bucket, solving potential data skew and file size "
- + "issues of the SIMPLE hashing engine.");
+ + "issues of the SIMPLE hashing engine. Consistent hashing only
works with MOR tables, only use simple hashing on COW tables.");
/**
* Bucket num equals file groups num in each partition.
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java
b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java
index 87f2410af4..4fbf7c53b7 100644
---
a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java
+++
b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java
@@ -115,6 +115,12 @@ public class HoodieTableMetaClient implements Serializable
{
private FileSystemRetryConfig fileSystemRetryConfig =
FileSystemRetryConfig.newBuilder().build();
protected HoodieMetastoreConfig metastoreConfig;
+ /**
+ * Instantiates a HoodieTableMetaClient.
+ *
+ * This constructor can only be called if the table already exists.
+ *
+ */
protected HoodieTableMetaClient(Configuration conf, String basePath, boolean
loadActiveTimelineOnLoad,
ConsistencyGuardConfig consistencyGuardConfig,
Option<TimelineLayoutVersion> layoutVersion,
String payloadClassName, FileSystemRetryConfig
fileSystemRetryConfig) {
@@ -412,6 +418,17 @@ public class HoodieTableMetaClient implements Serializable
{
throw new HoodieException("Only simple, non-partitioned or complex key
generator are supported when meta-fields are disabled. Used: " + keyGenClass);
}
}
+
+ // Check to make sure it's not a COW table with a consistent hashing bucket index
+ if (tableType == HoodieTableType.COPY_ON_WRITE) {
+ String indexType = properties.getProperty("hoodie.index.type");
+ if (indexType != null && indexType.equals("BUCKET")) {
+ String bucketEngine =
properties.getProperty("hoodie.index.bucket.engine");
+ if (bucketEngine != null && bucketEngine.equals("CONSISTENT_HASHING"))
{
+ throw new HoodieException("Consistent hashing bucket index does not
work with COW table. Use simple bucket index or an MOR table.");
+ }
+ }
+ }
}
/**
diff --git
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala
index 2ce76b2bff..732d8d7ec0 100644
---
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala
+++
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala
@@ -24,7 +24,7 @@ import org.apache.hudi.client.SparkRDDWriteClient
import org.apache.hudi.common.model._
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient,
TableSchemaResolver}
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
-import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
+import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieIndexConfig,
HoodieWriteConfig}
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode
import org.apache.hudi.functional.TestBootstrap
@@ -1137,6 +1137,34 @@ class TestHoodieSparkSqlWriter {
val kg2 = HoodieWriterUtils.getOriginKeyGenerator(m2)
assertTrue(kg2 == classOf[SimpleKeyGenerator].getName)
}
+
+ /**
+ * Test that a consistent hashing bucket index cannot be used on a COW table;
+ * the write is expected to fail with a HoodieException.
+ */
+ @Test
+ def testCOWConsistentHashing(): Unit = {
+ val _spark = spark
+ import _spark.implicits._
+ val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name",
"value", "ts", "dt")
+ val options = Map(
+ DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id",
+ DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts",
+ DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "dt",
+ HoodieIndexConfig.BUCKET_INDEX_ENGINE_TYPE.key -> "CONSISTENT_HASHING",
+ HoodieIndexConfig.INDEX_TYPE.key -> "BUCKET"
+ )
+
+ val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" +
"_1")
+ val exc = intercept[HoodieException] {
+ df.write.format("hudi")
+ .options(options)
+ .option(HoodieWriteConfig.TBL_NAME.key, tableName1)
+ .option(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key,
classOf[NonpartitionedKeyGenerator].getName)
+ .mode(SaveMode.Overwrite).save(tablePath1)
+ }
+ assert(exc.getMessage.contains("Consistent hashing bucket index does not
work with COW table. Use simple bucket index or an MOR table."))
+ }
}
object TestHoodieSparkSqlWriter {