suryaprasanna commented on code in PR #18384:
URL: https://github.com/apache/hudi/pull/18384#discussion_r2997454239
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java:
##########
@@ -1738,6 +1741,43 @@ public boolean populateMetaFields() {
return getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS);
}
+ public Set<String> getMetaFieldsToExclude() {
+ String value = getString(HoodieTableConfig.META_FIELDS_TO_EXCLUDE);
+ if (value == null || value.trim().isEmpty()) {
+ return Collections.emptySet();
+ }
+ Set<String> excluded = new HashSet<>();
+ for (String field : value.split(",")) {
+ String trimmed = field.trim();
+ if (!trimmed.isEmpty()) {
+ excluded.add(trimmed);
+ }
+ }
+ return excluded;
+ }
+
+ /**
+ * Returns a boolean[5] indexed by meta field ordinal.
+ * true = populate this meta field, false = write empty string.
+ */
+ public boolean[] getMetaFieldPopulationFlags() {
+ boolean[] flags = new boolean[5];
+ if (!populateMetaFields()) {
+ return flags; // all false
+ }
+ Set<String> excluded = getMetaFieldsToExclude();
+ if (excluded.isEmpty()) {
+ Arrays.fill(flags, true);
+ return flags;
+ }
+ flags[0] = !excluded.contains(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
+ flags[1] = !excluded.contains(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
+ flags[2] = !excluded.contains(HoodieRecord.RECORD_KEY_METADATA_FIELD);
+ flags[3] = !excluded.contains(HoodieRecord.PARTITION_PATH_METADATA_FIELD);
+ flags[4] = !excluded.contains(HoodieRecord.FILENAME_METADATA_FIELD);
+ return flags;
Review Comment:
Do we need to include OPERATION_METADATA_FIELD as well?
##########
hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataCreateHandle.java:
##########
@@ -77,6 +77,7 @@ public class HoodieRowDataCreateHandle implements
Serializable {
private final String fileId;
private final boolean preserveHoodieMetadata;
private final boolean skipMetadataWrite;
+ private final boolean[] populateField;
Review Comment:
Should this be named something like populateIndividualMetaFields instead of
populateField? What do you think?
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java:
##########
@@ -1738,6 +1741,43 @@ public boolean populateMetaFields() {
return getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS);
}
+ public Set<String> getMetaFieldsToExclude() {
Review Comment:
Should this be private?
##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetWriter.java:
##########
@@ -95,11 +108,28 @@ protected void updateRecordMetadata(InternalRow row,
UTF8String recordKey,
String partitionPath,
long recordCount) {
- row.update(COMMIT_TIME_METADATA_FIELD.ordinal(), instantTime);
- row.update(COMMIT_SEQNO_METADATA_FIELD.ordinal(),
UTF8String.fromString(seqIdGenerator.apply(recordCount)));
- row.update(RECORD_KEY_METADATA_FIELD.ordinal(), recordKey);
- // TODO set partition path in ctor
- row.update(PARTITION_PATH_METADATA_FIELD.ordinal(),
UTF8String.fromString(partitionPath));
- row.update(FILENAME_METADATA_FIELD.ordinal(), fileName);
+ if (populateField != null) {
+ if (populateField[0]) {
+ row.update(COMMIT_TIME_METADATA_FIELD.ordinal(), instantTime);
+ }
+ if (populateField[1]) {
+ row.update(COMMIT_SEQNO_METADATA_FIELD.ordinal(),
UTF8String.fromString(seqIdGenerator.apply(recordCount)));
+ }
+ if (populateField[2]) {
+ row.update(RECORD_KEY_METADATA_FIELD.ordinal(), recordKey);
+ }
+ if (populateField[3]) {
+ row.update(PARTITION_PATH_METADATA_FIELD.ordinal(),
UTF8String.fromString(partitionPath));
+ }
+ if (populateField[4]) {
+ row.update(FILENAME_METADATA_FIELD.ordinal(), fileName);
+ }
+ } else {
+ row.update(COMMIT_TIME_METADATA_FIELD.ordinal(), instantTime);
+ row.update(COMMIT_SEQNO_METADATA_FIELD.ordinal(),
UTF8String.fromString(seqIdGenerator.apply(recordCount)));
+ row.update(RECORD_KEY_METADATA_FIELD.ordinal(), recordKey);
+ row.update(PARTITION_PATH_METADATA_FIELD.ordinal(),
UTF8String.fromString(partitionPath));
+ row.update(FILENAME_METADATA_FIELD.ordinal(), fileName);
Review Comment:
Do we need to include OPERATION_METADATA_FIELD as well?
##########
hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java:
##########
@@ -324,6 +324,16 @@ public static final String getDefaultPayloadClassName() {
.withDocumentation("When enabled, populates all meta fields. When
disabled, no meta fields are populated "
+ "and incremental queries will not be functional. This is only
meant to be used for append only/immutable data for batch processing");
+ public static final ConfigProperty<String> META_FIELDS_TO_EXCLUDE =
ConfigProperty
Review Comment:
This would require a table version upgrade; I'm not sure how we want to track
it as part of the next version.
##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetWriter.java:
##########
@@ -44,6 +44,7 @@ public class HoodieSparkParquetWriter extends
HoodieBaseParquetWriter<InternalRo
private final UTF8String instantTime;
private final boolean populateMetaFields;
+ private final boolean[] populateField;
Review Comment:
Should this be named something like populateIndividualMetaFields instead of
populateField? What do you think?
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala:
##########
@@ -110,11 +110,13 @@ object HoodieDatasetBulkInsertHelper
sparkKeyGenerator
}
+ val populateField = config.getMetaFieldPopulationFlags
Review Comment:
Should this be named something like populateIndividualMetaFields instead of
populateField? What do you think?
##########
hudi-hadoop-common/src/main/java/org/apache/hudi/io/storage/hadoop/HoodieAvroOrcWriter.java:
##########
@@ -70,11 +70,17 @@ public class HoodieAvroOrcWriter implements
HoodieAvroFileWriter, Closeable {
private final TaskContextSupplier taskContextSupplier;
private final HoodieOrcConfig orcConfig;
+ private final boolean[] populateField;
private String minRecordKey;
private String maxRecordKey;
public HoodieAvroOrcWriter(String instantTime, StoragePath file,
HoodieOrcConfig config, HoodieSchema schema,
TaskContextSupplier taskContextSupplier) throws
IOException {
+ this(instantTime, file, config, schema, taskContextSupplier, null);
+ }
+
+ public HoodieAvroOrcWriter(String instantTime, StoragePath file,
HoodieOrcConfig config, HoodieSchema schema,
+ TaskContextSupplier taskContextSupplier,
boolean[] populateField) throws IOException {
Review Comment:
Should this be named something like populateIndividualMetaFields instead of
populateField? What do you think?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]