This is an automated email from the ASF dual-hosted git repository.
richox pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/auron.git
The following commit(s) were added to refs/heads/master by this push:
new d357cbc1 [AURON #1780] Fix uppercase fields being read as null from ORC
tables written by Hive (#1781)
d357cbc1 is described below
commit d357cbc100f01374059c4deaa6856af763164e4c
Author: Graceful <[email protected]>
AuthorDate: Wed Dec 31 14:50:19 2025 +0800
[AURON #1780] Fix uppercase fields being read as null from ORC tables
written by Hive (#1781)
<!--
- Start the PR title with the related issue ID, e.g. '[AURON #XXXX]
Short summary...'.
-->
# Which issue does this PR close?
Closes #1780
# Rationale for this change
Fix the issue where fields with uppercase names in ORC files written by Hive are read back as null.
# What changes are included in this PR?
Modify the field-matching logic for ORC file schemas so that, when case sensitivity is disabled, field names are matched case-insensitively.
# Are there any user-facing changes?
no
# How was this patch tested?
Verified with a manual test on a cluster.
---------
Co-authored-by: duanhao-jk <[email protected]>
---
native-engine/auron-jni-bridge/src/conf.rs | 1 +
native-engine/datafusion-ext-plans/src/orc_exec.rs | 24 ++++++++++++++++++++--
.../configuration/SparkAuronConfiguration.java | 5 +++++
.../java/org/apache/spark/sql/auron/AuronConf.java | 2 ++
4 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/native-engine/auron-jni-bridge/src/conf.rs
b/native-engine/auron-jni-bridge/src/conf.rs
index e99037ad..383596d6 100644
--- a/native-engine/auron-jni-bridge/src/conf.rs
+++ b/native-engine/auron-jni-bridge/src/conf.rs
@@ -57,6 +57,7 @@ define_conf!(IntConf, SUGGESTED_BATCH_MEM_SIZE);
define_conf!(IntConf, SUGGESTED_BATCH_MEM_SIZE_KWAY_MERGE);
define_conf!(BooleanConf, ORC_FORCE_POSITIONAL_EVOLUTION);
define_conf!(BooleanConf, ORC_TIMESTAMP_USE_MICROSECOND);
+define_conf!(BooleanConf, ORC_SCHEMA_CASE_SENSITIVE);
define_conf!(IntConf, UDAF_FALLBACK_NUM_UDAFS_TRIGGER_SORT_AGG);
define_conf!(BooleanConf, PARSE_JSON_ERROR_FALLBACK);
define_conf!(StringConf, NATIVE_LOG_LEVEL);
diff --git a/native-engine/datafusion-ext-plans/src/orc_exec.rs
b/native-engine/datafusion-ext-plans/src/orc_exec.rs
index d07e4794..c53cb6b5 100644
--- a/native-engine/datafusion-ext-plans/src/orc_exec.rs
+++ b/native-engine/datafusion-ext-plans/src/orc_exec.rs
@@ -160,6 +160,7 @@ impl ExecutionPlan for OrcExec {
let force_positional_evolution =
conf::ORC_FORCE_POSITIONAL_EVOLUTION.value()?;
let use_microsecond_precision =
conf::ORC_TIMESTAMP_USE_MICROSECOND.value()?;
+ let is_case_sensitive = conf::ORC_SCHEMA_CASE_SENSITIVE.value()?;
let opener: Arc<dyn FileOpener> = Arc::new(OrcOpener {
projection,
@@ -170,6 +171,7 @@ impl ExecutionPlan for OrcExec {
metrics: self.metrics.clone(),
force_positional_evolution,
use_microsecond_precision,
+ is_case_sensitive,
});
let file_stream = Box::pin(FileStream::new(
@@ -217,6 +219,7 @@ struct OrcOpener {
metrics: ExecutionPlanMetricsSet,
force_positional_evolution: bool,
use_microsecond_precision: bool,
+ is_case_sensitive: bool,
}
impl FileOpener for OrcOpener {
@@ -245,6 +248,7 @@ impl FileOpener for OrcOpener {
self.force_positional_evolution,
);
let use_microsecond = self.use_microsecond_precision;
+ let is_case = self.is_case_sensitive;
Ok(Box::pin(async move {
let mut builder = ArrowReaderBuilder::try_new_async(reader)
@@ -259,7 +263,7 @@ impl FileOpener for OrcOpener {
}
let (schema_mapping, projection) =
- schema_adapter.map_schema(builder.file_metadata())?;
+ schema_adapter.map_schema(builder.file_metadata(), is_case)?;
let projection_mask =
ProjectionMask::roots(builder.file_metadata().root_data_type(), projection);
@@ -325,6 +329,7 @@ impl SchemaAdapter {
fn map_schema(
&self,
orc_file_meta: &FileMetadata,
+ is_case_sensitive: bool,
) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
let mut projection =
Vec::with_capacity(self.projected_schema.fields().len());
let mut field_mappings = vec![None;
self.projected_schema.fields().len()];
@@ -363,7 +368,7 @@ impl SchemaAdapter {
}
}
}
- } else {
+ } else if is_case_sensitive {
for named_column in file_named_columns {
if let Some((proj_idx, _)) =
self.projected_schema.fields().find(named_column.name())
@@ -372,6 +377,21 @@ impl SchemaAdapter {
projection.push(named_column.data_type().column_index());
}
}
+ } else {
+ for named_column in file_named_columns {
+ // Case-insensitive field name matching
+ let named_column_name_lower =
named_column.name().to_lowercase();
+ if let Some((proj_idx, _)) = self
+ .projected_schema
+ .fields()
+ .iter()
+ .enumerate()
+ .find(|(_, f)| f.name().to_lowercase() ==
named_column_name_lower)
+ {
+ field_mappings[proj_idx] = Some(projection.len());
+ projection.push(named_column.data_type().column_index());
+ }
+ }
}
Ok((
diff --git
a/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
b/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
index b503b59f..8e66efb1 100644
---
a/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
+++
b/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
@@ -237,6 +237,11 @@ public class SparkAuronConfiguration extends
AuronConfiguration {
.description("use microsecond precision when reading ORC timestamp
columns. ")
.booleanType()
.defaultValue(false);
+ public static final ConfigOption<Boolean> ORC_SCHEMA_CASE_SENSITIVE =
ConfigOptions.key(
+ "auron.orc.schema.caseSensitive.enable")
+ .description("whether ORC file schema matching distinguishes
between uppercase and lowercase. ")
+ .booleanType()
+ .defaultValue(false);
public static final ConfigOption<Boolean> FORCE_SHORT_CIRCUIT_AND_OR =
ConfigOptions.key(
"auron.forceShortCircuitAndOr")
diff --git
a/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
b/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
index b3cbca1a..2943d4b2 100644
--- a/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
+++ b/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
@@ -139,6 +139,8 @@ public enum AuronConf {
// use microsecond precision when reading ORC timestamp columns
ORC_TIMESTAMP_USE_MICROSECOND("spark.auron.orc.timestamp.use.microsecond",
false),
+ ORC_SCHEMA_CASE_SENSITIVE("spark.auron.orc.schema.caseSensitive.enable",
false),
+
NATIVE_LOG_LEVEL("spark.auron.native.log.level", "info");
public final String key;