This is an automated email from the ASF dual-hosted git repository.
yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 0f258b1f6c39 fix: Fix duplicate field exception in hive query with
where clause (#14337)
0f258b1f6c39 is described below
commit 0f258b1f6c392a2132b560462ccce3da9063bc52
Author: Shuo Cheng <[email protected]>
AuthorDate: Wed Nov 26 15:10:34 2025 +0800
fix: Fix duplicate field exception in hive query with where clause (#14337)
---
.../HoodieFileGroupReaderBasedRecordReader.java | 8 +++--
...TestHoodieFileGroupReaderBasedRecordReader.java | 34 ++++++++++++++++++++++
2 files changed, 40 insertions(+), 2 deletions(-)
diff --git
a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
index c74ad8dd2e5a..8d06728860f3 100644
---
a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
+++
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java
@@ -311,7 +311,8 @@ public class HoodieFileGroupReaderBasedRecordReader
implements RecordReader<Null
return null;
}
- private static Schema createRequestedSchema(Schema tableSchema, JobConf
jobConf) {
+ @VisibleForTesting
+ public static Schema createRequestedSchema(Schema tableSchema, JobConf
jobConf) {
String readCols =
jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
if (StringUtils.isNullOrEmpty(readCols)) {
Schema emptySchema = Schema.createRecord(tableSchema.getName(),
tableSchema.getDoc(),
@@ -330,6 +331,9 @@ public class HoodieFileGroupReaderBasedRecordReader
implements RecordReader<Null
// if they are actually written to the file, then it is ok to read them
from the file
tableSchema.getFields().forEach(f ->
partitionColumns.remove(f.name().toLowerCase(Locale.ROOT)));
return HoodieAvroUtils.generateProjectionSchema(tableSchema,
-
Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c
-> !partitionColumns.contains(c)).collect(Collectors.toList()));
+ // The READ_COLUMN_NAMES_CONF_STR includes all columns from the query,
including those used in the WHERE clause,
+ // so any column referenced in the filter (non-partition) will appear
twice if already present in the projection schema,
+ // so distinct() is used here to deduplicate the read columns.
+
Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c
-> !partitionColumns.contains(c)).distinct().collect(Collectors.toList()));
}
}
diff --git
a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
index 07fc8ca2fd7a..0d16dabc3fc3 100644
---
a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
+++
b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieFileGroupReaderBasedRecordReader.java
@@ -21,6 +21,9 @@ package org.apache.hudi.hadoop;
import org.apache.hudi.common.util.collection.ClosableIterator;
+import org.apache.avro.Schema;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
@@ -48,4 +51,35 @@ public class TestHoodieFileGroupReaderBasedRecordReader {
assertEquals(0, recordReader.getProgress());
assertEquals(0, recordReader.getPos());
}
+
+ /**
+ * Test to verify that the functionality handles duplicate column names
+ * that could occur when columns are referenced in both SELECT and WHERE
clauses.
+ * This test ensures the fix for duplicate field exception works correctly.
+ * The fix was applied in the createRequestedSchema method, where .distinct() was
added
+ * to prevent duplicate column names from causing schema generation issues.
+ */
+ @Test
+ void testDuplicateFieldHandlingInHiveQueryWithWhereClause() {
+ JobConf jobConf = new JobConf();
+ // Simulate a query where same column appears in both SELECT and WHERE
clauses
+ // This would result in duplicates in READ_COLUMN_NAMES_CONF_STR like
"field1,field2,field1"
+ jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
"field1,field2,part1,field1");
+ jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "part1");
+
+ String schemaStr = "{\n"
+ + " \"type\": \"record\",\n"
+ + " \"name\": \"testRecord\",\n"
+ + " \"fields\": [\n"
+ + " {\"name\": \"field1\", \"type\": \"string\"},\n"
+ + " {\"name\": \"field2\", \"type\": \"int\"},\n"
+ + " {\"name\": \"field3\", \"type\": \"int\"}\n"
+ + " ]\n"
+ + "}";
+ Schema tableSchema = new Schema.Parser().parse(schemaStr);
+ Schema requestedSchema =
HoodieFileGroupReaderBasedRecordReader.createRequestedSchema(tableSchema,
jobConf);
+ assertEquals(2, requestedSchema.getFields().size());
+ assertEquals("field1", requestedSchema.getFields().get(0).name());
+ assertEquals("field2", requestedSchema.getFields().get(1).name());
+ }
}