yihua commented on code in PR #11816:
URL: https://github.com/apache/hudi/pull/11816#discussion_r1757633288
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala:
##########
@@ -313,7 +315,9 @@ class HoodieCatalogTable(val spark: SparkSession, var
table: CatalogTable) exten
KeyGeneratorType.valueOf(originTableConfig(HoodieTableConfig.KEY_GENERATOR_TYPE.key)).getClassName)
} else {
val primaryKeys =
table.properties.getOrElse(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName,
table.storage.properties.get(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName)).toString
- val partitions = table.partitionColumnNames.mkString(",")
+ val partitionFieldsOpt =
Option.apply(ConfigUtils.getStringWithAltKeys(originTableConfig.asJava.asInstanceOf[java.util.Map[String,
Object]], HoodieTableConfig.PARTITION_FIELDS))
Review Comment:
`originTableConfig.asJava.asInstanceOf[java.util.Map[String, Object]]` may
incur non-trivial overhead. Could you add a new util method that can be used
directly on a Scala Map?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala:
##########
@@ -212,7 +211,9 @@ class HoodieCatalogTable(val spark: SparkSession, var
table: CatalogTable) exten
} else {
val (recordName, namespace) =
AvroConversionUtils.getAvroRecordNameAndNamespace(table.identifier.table)
val schema = SchemaConverters.toAvroType(dataSchema, nullable = false,
recordName, namespace)
- val partitionColumns = if (table.partitionColumnNames.isEmpty) {
+ val partitionColumns = if
(tableConfigs.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) {
+
ConfigUtils.getStringWithAltKeys(tableConfigs.asJava.asInstanceOf[java.util.Map[String,
Object]], KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)
Review Comment:
Similar here on the conversion.
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -66,29 +66,25 @@ public class KeyGenUtils {
public static KeyGeneratorType inferKeyGeneratorType(
Option<String> recordsKeyFields, String partitionFields) {
boolean autoGenerateRecordKeys = !recordsKeyFields.isPresent();
- if (autoGenerateRecordKeys) {
- return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
- } else {
- if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- int numRecordKeyFields = recordsKeyFields.get().split(",").length;
- if (numPartFields == 1 && numRecordKeyFields == 1) {
- return KeyGeneratorType.SIMPLE;
- }
- return KeyGeneratorType.COMPLEX;
- }
- return KeyGeneratorType.NON_PARTITION;
+ KeyGeneratorType partitionKeyGeneratorType =
inferKeyGeneratorTypeFromPartitionFields(partitionFields);
+ if (autoGenerateRecordKeys || partitionKeyGeneratorType !=
KeyGeneratorType.SIMPLE) {
Review Comment:
Makes sense. In that case, should all the logic of checking record key and
partition fields be put in the same method,
`inferKeyGeneratorTypeFromPartitionFields`? The code in its current shape is
confusing. Still keep a structure like:
```
if (autoGenerateRecordKeys) {
return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
}
...
```
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlWithCustomKeyGenerator.scala:
##########
@@ -442,13 +442,92 @@ class TestSparkSqlWithCustomKeyGenerator extends
HoodieSparkSqlTestBase {
}
}
+ test("Test create table with custom key generator") {
+ withTempDir { tmp => {
+ val tableName = generateTableName
+ val tablePath = tmp.getCanonicalPath + "/" + tableName
+ val writePartitionFields = "ts:timestamp"
+ val dateFormat = "yyyy/MM/dd"
+ val tsGenFunc = (ts: Integer) => TS_FORMATTER_FUNC_WITH_FORMAT.apply(ts,
dateFormat)
+ val customPartitionFunc = (ts: Integer, _: String) => "ts=" +
tsGenFunc.apply(ts)
+
+ spark.sql(
+ s"""
+ |create table ${tableName} (
+ | `id` INT,
+ | `name` STRING,
+ | `price` DECIMAL(5, 1),
+ | `ts` INT,
+ | `segment` STRING
+ |) using hudi
+ |tblproperties (
+ | 'primaryKey' = 'id,name',
+ | 'type' = 'mor',
+ | 'preCombineField'='name',
+ | 'hoodie.datasource.write.keygenerator.class' =
'$CUSTOM_KEY_GEN_CLASS_NAME',
+ | 'hoodie.datasource.write.partitionpath.field' =
'$writePartitionFields',
Review Comment:
Let's file a JIRA for the follow-up and mark it as a fix for the 1.0.0 release.
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -66,29 +66,25 @@ public class KeyGenUtils {
public static KeyGeneratorType inferKeyGeneratorType(
Option<String> recordsKeyFields, String partitionFields) {
boolean autoGenerateRecordKeys = !recordsKeyFields.isPresent();
- if (autoGenerateRecordKeys) {
- return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
- } else {
- if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- int numRecordKeyFields = recordsKeyFields.get().split(",").length;
- if (numPartFields == 1 && numRecordKeyFields == 1) {
- return KeyGeneratorType.SIMPLE;
- }
- return KeyGeneratorType.COMPLEX;
- }
- return KeyGeneratorType.NON_PARTITION;
+ KeyGeneratorType partitionKeyGeneratorType =
inferKeyGeneratorTypeFromPartitionFields(partitionFields);
+ if (autoGenerateRecordKeys || partitionKeyGeneratorType !=
KeyGeneratorType.SIMPLE) {
+ return partitionKeyGeneratorType;
}
+ int numRecordKeyFields = recordsKeyFields.get().split(",").length;
+ return numRecordKeyFields == 1 ? KeyGeneratorType.SIMPLE :
KeyGeneratorType.COMPLEX;
}
// When auto record key gen is enabled, our inference will be based on
partition path only.
- private static KeyGeneratorType inferKeyGeneratorTypeForAutoKeyGen(String
partitionFields) {
+ static KeyGeneratorType inferKeyGeneratorTypeFromPartitionFields(String
partitionFields) {
if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- if (numPartFields == 1) {
+ String[] partitonFields = partitionFields.split(",");
+ if
(partitonFields[0].contains(BaseKeyGenerator.CUSTOM_KEY_GENERATOR_SPLIT_REGEX))
{
+ return KeyGeneratorType.CUSTOM;
Review Comment:
I created HUDI-7613 for you to validate the write/query paths with
CustomKeyGenerator on Flink and Hive.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]