yihua commented on code in PR #11816:
URL: https://github.com/apache/hudi/pull/11816#discussion_r1751101788
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -66,29 +66,25 @@ public class KeyGenUtils {
public static KeyGeneratorType inferKeyGeneratorType(
Option<String> recordsKeyFields, String partitionFields) {
boolean autoGenerateRecordKeys = !recordsKeyFields.isPresent();
- if (autoGenerateRecordKeys) {
- return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
- } else {
- if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- int numRecordKeyFields = recordsKeyFields.get().split(",").length;
- if (numPartFields == 1 && numRecordKeyFields == 1) {
- return KeyGeneratorType.SIMPLE;
- }
- return KeyGeneratorType.COMPLEX;
- }
- return KeyGeneratorType.NON_PARTITION;
+ KeyGeneratorType partitionKeyGeneratorType =
inferKeyGeneratorTypeFromPartitionFields(partitionFields);
+ if (autoGenerateRecordKeys || partitionKeyGeneratorType !=
KeyGeneratorType.SIMPLE) {
+ return partitionKeyGeneratorType;
}
+ int numRecordKeyFields = recordsKeyFields.get().split(",").length;
+ return numRecordKeyFields == 1 ? KeyGeneratorType.SIMPLE :
KeyGeneratorType.COMPLEX;
}
// When auto record key gen is enabled, our inference will be based on
partition path only.
- private static KeyGeneratorType inferKeyGeneratorTypeForAutoKeyGen(String
partitionFields) {
+ static KeyGeneratorType inferKeyGeneratorTypeFromPartitionFields(String
partitionFields) {
if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- if (numPartFields == 1) {
+ String[] partitonFields = partitionFields.split(",");
+ if
(partitonFields[0].contains(BaseKeyGenerator.CUSTOM_KEY_GENERATOR_SPLIT_REGEX))
{
+ return KeyGeneratorType.CUSTOM;
Review Comment:
What about other engines? Do they have similar inference logic that needs to
be fixed?
##########
hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/TestKeyGenUtils.java:
##########
@@ -45,12 +46,68 @@ public void testInferKeyGeneratorType() {
assertEquals(
KeyGeneratorType.COMPLEX,
KeyGenUtils.inferKeyGeneratorType(Option.of("col1,col2"),
"partition1,partition2"));
+
+ assertEquals(
+ KeyGeneratorType.CUSTOM,
+ KeyGenUtils.inferKeyGeneratorType(Option.of("col1"),
"partition1:simple,partition2:timestamp"));
+ assertEquals(
+ KeyGeneratorType.CUSTOM,
+ KeyGenUtils.inferKeyGeneratorType(Option.of("col1,col2"),
"partition1:simple"));
+ assertEquals(
+ KeyGeneratorType.CUSTOM,
+ KeyGenUtils.inferKeyGeneratorType(Option.of("col1,col2"),
"partition1:simple,partition2:timestamp"));
+
assertEquals(
KeyGeneratorType.NON_PARTITION,
KeyGenUtils.inferKeyGeneratorType(Option.of("col1,col2"), ""));
assertEquals(
KeyGeneratorType.NON_PARTITION,
KeyGenUtils.inferKeyGeneratorType(Option.of("col1,col2"), null));
+
+ // Test key generator type with auto generation of record keys
+ assertEquals(
+ KeyGeneratorType.SIMPLE,
+ KeyGenUtils.inferKeyGeneratorType(Option.empty(), "partition1"));
+ assertEquals(
+ KeyGeneratorType.COMPLEX,
+ KeyGenUtils.inferKeyGeneratorType(Option.empty(),
"partition1,partition2"));
+ assertEquals(
+ KeyGeneratorType.CUSTOM,
+ KeyGenUtils.inferKeyGeneratorType(Option.empty(),
"partition1:simple"));
+ assertEquals(
+ KeyGeneratorType.CUSTOM,
+ KeyGenUtils.inferKeyGeneratorType(Option.empty(),
"partition1:simple,partition2:timestamp"));
+ assertEquals(
+ KeyGeneratorType.NON_PARTITION,
+ KeyGenUtils.inferKeyGeneratorType(Option.empty(), ""));
+ assertEquals(
+ KeyGeneratorType.NON_PARTITION,
+ KeyGenUtils.inferKeyGeneratorType(Option.empty(), null));
+ }
+
+ @Test
+ public void testInferKeyGeneratorTypeFromPartitionFields() {
+ assertEquals(
+ KeyGeneratorType.SIMPLE,
+ KeyGenUtils.inferKeyGeneratorTypeFromPartitionFields("partition1"));
+ assertEquals(
+ KeyGeneratorType.COMPLEX,
+
KeyGenUtils.inferKeyGeneratorTypeFromPartitionFields("partition1,partition1"));
Review Comment:
```suggestion
KeyGenUtils.inferKeyGeneratorTypeFromPartitionFields("partition1,partition2"));
```
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala:
##########
@@ -313,7 +315,9 @@ class HoodieCatalogTable(val spark: SparkSession, var
table: CatalogTable) exten
KeyGeneratorType.valueOf(originTableConfig(HoodieTableConfig.KEY_GENERATOR_TYPE.key)).getClassName)
} else {
val primaryKeys =
table.properties.getOrElse(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName,
table.storage.properties.get(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName)).toString
- val partitions = table.partitionColumnNames.mkString(",")
+ val partitionFieldsOpt =
originTableConfig.get(HoodieTableConfig.PARTITION_FIELDS.key)
+
.orElse(sqlOptions.get(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()))
Review Comment:
Avoid using `.key` and use `ConfigUtils` instead so alternative keys are
considered.
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlWithCustomKeyGenerator.scala:
##########
@@ -442,13 +442,92 @@ class TestSparkSqlWithCustomKeyGenerator extends
HoodieSparkSqlTestBase {
}
}
+ test("Test create table with custom key generator") {
+ withTempDir { tmp => {
+ val tableName = generateTableName
+ val tablePath = tmp.getCanonicalPath + "/" + tableName
+ val writePartitionFields = "ts:timestamp"
+ val dateFormat = "yyyy/MM/dd"
+ val tsGenFunc = (ts: Integer) => TS_FORMATTER_FUNC_WITH_FORMAT.apply(ts,
dateFormat)
+ val customPartitionFunc = (ts: Integer, _: String) => "ts=" +
tsGenFunc.apply(ts)
+
+ spark.sql(
+ s"""
+ |create table ${tableName} (
+ | `id` INT,
+ | `name` STRING,
+ | `price` DECIMAL(5, 1),
+ | `ts` INT,
+ | `segment` STRING
+ |) using hudi
+ |tblproperties (
+ | 'primaryKey' = 'id,name',
+ | 'type' = 'mor',
+ | 'preCombineField'='name',
+ | 'hoodie.datasource.write.keygenerator.class' =
'$CUSTOM_KEY_GEN_CLASS_NAME',
+ | 'hoodie.datasource.write.partitionpath.field' =
'$writePartitionFields',
Review Comment:
Can we write a test that does not use
`hoodie.datasource.write.partitionpath.field` in `tblproperties`? For example,
create a table using the Spark datasource with the custom keygen and a
timestamp-based partition field, then use Spark SQL without passing in any
write configs, to make sure the partition path value is generated correctly.
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -66,29 +66,25 @@ public class KeyGenUtils {
public static KeyGeneratorType inferKeyGeneratorType(
Option<String> recordsKeyFields, String partitionFields) {
boolean autoGenerateRecordKeys = !recordsKeyFields.isPresent();
- if (autoGenerateRecordKeys) {
- return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
- } else {
- if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- int numRecordKeyFields = recordsKeyFields.get().split(",").length;
- if (numPartFields == 1 && numRecordKeyFields == 1) {
- return KeyGeneratorType.SIMPLE;
- }
- return KeyGeneratorType.COMPLEX;
- }
- return KeyGeneratorType.NON_PARTITION;
+ KeyGeneratorType partitionKeyGeneratorType =
inferKeyGeneratorTypeFromPartitionFields(partitionFields);
+ if (autoGenerateRecordKeys || partitionKeyGeneratorType !=
KeyGeneratorType.SIMPLE) {
Review Comment:
Should `KeyGeneratorType.COMPLEX` be included too, based on the logic below?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala:
##########
@@ -212,7 +211,9 @@ class HoodieCatalogTable(val spark: SparkSession, var
table: CatalogTable) exten
} else {
val (recordName, namespace) =
AvroConversionUtils.getAvroRecordNameAndNamespace(table.identifier.table)
val schema = SchemaConverters.toAvroType(dataSchema, nullable = false,
recordName, namespace)
- val partitionColumns = if (table.partitionColumnNames.isEmpty) {
+ val partitionColumns = if
(tableConfigs.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) {
+ tableConfigs(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())
+ } else if (table.partitionColumnNames.isEmpty) {
Review Comment:
Same here: avoid `.key` calls and use `ConfigUtils` instead so alternative
keys are considered.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]