yihua commented on code in PR #11816:
URL: https://github.com/apache/hudi/pull/11816#discussion_r1757633288
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala:
##########
@@ -313,7 +315,9 @@ class HoodieCatalogTable(val spark: SparkSession, var
table: CatalogTable) exten
KeyGeneratorType.valueOf(originTableConfig(HoodieTableConfig.KEY_GENERATOR_TYPE.key)).getClassName)
} else {
val primaryKeys =
table.properties.getOrElse(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName,
table.storage.properties.get(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName)).toString
- val partitions = table.partitionColumnNames.mkString(",")
+ val partitionFieldsOpt =
Option.apply(ConfigUtils.getStringWithAltKeys(originTableConfig.asJava.asInstanceOf[java.util.Map[String,
Object]], HoodieTableConfig.PARTITION_FIELDS))
Review Comment:
`originTableConfig.asJava.asInstanceOf[java.util.Map[String, Object]]` may
incur non-trivial overhead. Could you add a new util method that can be used
directly on a Scala Map?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala:
##########
@@ -212,7 +211,9 @@ class HoodieCatalogTable(val spark: SparkSession, var
table: CatalogTable) exten
} else {
val (recordName, namespace) =
AvroConversionUtils.getAvroRecordNameAndNamespace(table.identifier.table)
val schema = SchemaConverters.toAvroType(dataSchema, nullable = false,
recordName, namespace)
- val partitionColumns = if (table.partitionColumnNames.isEmpty) {
+ val partitionColumns = if
(tableConfigs.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) {
+
ConfigUtils.getStringWithAltKeys(tableConfigs.asJava.asInstanceOf[java.util.Map[String,
Object]], KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)
Review Comment:
Similar here on the conversion.
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -66,29 +66,25 @@ public class KeyGenUtils {
public static KeyGeneratorType inferKeyGeneratorType(
Option<String> recordsKeyFields, String partitionFields) {
boolean autoGenerateRecordKeys = !recordsKeyFields.isPresent();
- if (autoGenerateRecordKeys) {
- return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
- } else {
- if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- int numRecordKeyFields = recordsKeyFields.get().split(",").length;
- if (numPartFields == 1 && numRecordKeyFields == 1) {
- return KeyGeneratorType.SIMPLE;
- }
- return KeyGeneratorType.COMPLEX;
- }
- return KeyGeneratorType.NON_PARTITION;
+ KeyGeneratorType partitionKeyGeneratorType =
inferKeyGeneratorTypeFromPartitionFields(partitionFields);
+ if (autoGenerateRecordKeys || partitionKeyGeneratorType !=
KeyGeneratorType.SIMPLE) {
Review Comment:
Makes sense. In that case, should all the logic of checking record key and
partition fields be put in the same method,
`inferKeyGeneratorTypeFromPartitionFields`? The code in its current shape is
confusing. Still keep a structure like:
```
if (autoGenerateRecordKeys) {
return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
}
...
```
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlWithCustomKeyGenerator.scala:
##########
@@ -442,13 +442,92 @@ class TestSparkSqlWithCustomKeyGenerator extends
HoodieSparkSqlTestBase {
}
}
+ test("Test create table with custom key generator") {
+ withTempDir { tmp => {
+ val tableName = generateTableName
+ val tablePath = tmp.getCanonicalPath + "/" + tableName
+ val writePartitionFields = "ts:timestamp"
+ val dateFormat = "yyyy/MM/dd"
+ val tsGenFunc = (ts: Integer) => TS_FORMATTER_FUNC_WITH_FORMAT.apply(ts,
dateFormat)
+ val customPartitionFunc = (ts: Integer, _: String) => "ts=" +
tsGenFunc.apply(ts)
+
+ spark.sql(
+ s"""
+ |create table ${tableName} (
+ | `id` INT,
+ | `name` STRING,
+ | `price` DECIMAL(5, 1),
+ | `ts` INT,
+ | `segment` STRING
+ |) using hudi
+ |tblproperties (
+ | 'primaryKey' = 'id,name',
+ | 'type' = 'mor',
+ | 'preCombineField'='name',
+ | 'hoodie.datasource.write.keygenerator.class' =
'$CUSTOM_KEY_GEN_CLASS_NAME',
+ | 'hoodie.datasource.write.partitionpath.field' =
'$writePartitionFields',
Review Comment:
Let's file a JIRA for the follow-up and mark it as a fix for the 1.0.0 release.
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -66,29 +66,25 @@ public class KeyGenUtils {
public static KeyGeneratorType inferKeyGeneratorType(
Option<String> recordsKeyFields, String partitionFields) {
boolean autoGenerateRecordKeys = !recordsKeyFields.isPresent();
- if (autoGenerateRecordKeys) {
- return inferKeyGeneratorTypeForAutoKeyGen(partitionFields);
- } else {
- if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- int numRecordKeyFields = recordsKeyFields.get().split(",").length;
- if (numPartFields == 1 && numRecordKeyFields == 1) {
- return KeyGeneratorType.SIMPLE;
- }
- return KeyGeneratorType.COMPLEX;
- }
- return KeyGeneratorType.NON_PARTITION;
+ KeyGeneratorType partitionKeyGeneratorType =
inferKeyGeneratorTypeFromPartitionFields(partitionFields);
+ if (autoGenerateRecordKeys || partitionKeyGeneratorType !=
KeyGeneratorType.SIMPLE) {
+ return partitionKeyGeneratorType;
}
+ int numRecordKeyFields = recordsKeyFields.get().split(",").length;
+ return numRecordKeyFields == 1 ? KeyGeneratorType.SIMPLE :
KeyGeneratorType.COMPLEX;
}
// When auto record key gen is enabled, our inference will be based on
partition path only.
- private static KeyGeneratorType inferKeyGeneratorTypeForAutoKeyGen(String
partitionFields) {
+ static KeyGeneratorType inferKeyGeneratorTypeFromPartitionFields(String
partitionFields) {
if (!StringUtils.isNullOrEmpty(partitionFields)) {
- int numPartFields = partitionFields.split(",").length;
- if (numPartFields == 1) {
+ String[] partitonFields = partitionFields.split(",");
+ if
(partitonFields[0].contains(BaseKeyGenerator.CUSTOM_KEY_GENERATOR_SPLIT_REGEX))
{
+ return KeyGeneratorType.CUSTOM;
Review Comment:
I created HUDI-7613 for you to validate the write/query paths with
CustomKeyGenerator on Flink and Hive.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]