This is an automated email from the ASF dual-hosted git repository.
yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 0f258b1f6c39 fix: Fix duplicate field exception in hive query with
where clause (#14337)
0f258b1f6c39 is described below
commit 0f258b1f6c392a2132b560462ccce3da9063bc52
Author: Shuo Cheng <[email protected]>
AuthorDate: Wed Nov 26 15:10:34 2025 +0800
fix: Fix duplicate field exception in hive query with where clause (#14337)
---
.../HoodieFileGroupReaderBasedRecordReader.java | 8 +++--
...TestHoodieFileGroupReaderBasedRecordReader.java | 34 ++++++++++++++++++++++
2 files changed, 40 insertions(+), 2 deletions(-)
diff --git
a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
index c74ad8dd2e5a..8d06728860f3 100644
---
a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
+++
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
@@ -311,7 +311,8 @@ public class HoodieFileGroupReaderBasedRecordReader
implements RecordReader<Null
return null;
}
- private static Schema createRequestedSchema(Schema tableSchema, JobConf
jobConf) {
+ @VisibleForTesting
+ public static Schema createRequestedSchema(Schema tableSchema, JobConf
jobConf) {
String readCols =
jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
if (StringUtils.isNullOrEmpty(readCols)) {
Schema emptySchema = Schema.createRecord(tableSchema.getName(),
tableSchema.getDoc(),
@@ -330,6 +331,9 @@ public class HoodieFileGroupReaderBasedRecordReader
implements RecordReader<Null
// if they are actually written to the file, then it is ok to read them
from the file
tableSchema.getFields().forEach(f ->
partitionColumns.remove(f.name().toLowerCase(Locale.ROOT)));
return HoodieAvroUtils.generateProjectionSchema(tableSchema,
-
Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c
-> !partitionColumns.contains(c)).collect(Collectors.toList()));
+ // The READ_COLUMN_NAMES_CONF_STR includes all columns from the query,
including those used in the WHERE clause,
+ // so any column referenced in the filter (non-partition) will appear
twice if already present in the projection schema,
+ // so distinct() is used here to deduplicate the read columns.
+
Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c
-> !partitionColumns.contains(c)).distinct().collect(Collectors.toList()));
}
}
diff --git
a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
index 07fc8ca2fd7a..0d16dabc3fc3 100644
---
a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
+++
b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
@@ -21,6 +21,9 @@ package org.apache.hudi.hadoop;
import org.apache.hudi.common.util.collection.ClosableIterator;
+import org.apache.avro.Schema;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
@@ -48,4 +51,35 @@ public class TestHoodieFileGroupReaderBasedRecordReader {
assertEquals(0, recordReader.getProgress());
assertEquals(0, recordReader.getPos());
}
+
+ /**
+ * Test to verify that the functionality handles duplicate column names
+ * that could occur when columns are referenced in both SELECT and WHERE
clauses.
+ * This test ensures the fix for duplicate field exception works correctly.
+ * The fix was applied in the createRequestedSchema method, where .distinct() was
added
+ * to prevent duplicate column names from causing schema generation issues.
+ */
+ @Test
+ void testDuplicateFieldHandlingInHiveQueryWithWhereClause() {
+ JobConf jobConf = new JobConf();
+ // Simulate a query where same column appears in both SELECT and WHERE
clauses
+ // This would result in duplicates in READ_COLUMN_NAMES_CONF_STR like
"field1,field2,field1"
+ jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
"field1,field2,part1,field1");
+ jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "part1");
+
+ String schemaStr = "{\n"
+ + " \"type\": \"record\",\n"
+ + " \"name\": \"testRecord\",\n"
+ + " \"fields\": [\n"
+ + " {\"name\": \"field1\", \"type\": \"string\"},\n"
+ + " {\"name\": \"field2\", \"type\": \"int\"},\n"
+ + " {\"name\": \"field3\", \"type\": \"int\"}\n"
+ + " ]\n"
+ + "}";
+ Schema tableSchema = new Schema.Parser().parse(schemaStr);
+ Schema requestedSchema =
HoodieFileGroupReaderBasedRecordReader.createRequestedSchema(tableSchema,
jobConf);
+ assertEquals(2, requestedSchema.getFields().size());
+ assertEquals("field1", requestedSchema.getFields().get(0).name());
+ assertEquals("field2", requestedSchema.getFields().get(1).name());
+ }
}