This is an automated email from the ASF dual-hosted git repository.
richox pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/auron.git
The following commit(s) were added to refs/heads/master by this push:
new d357cbc1 [AURON #1780] Fix uppercase fields being read as null from ORC
tables written by Hive (#1781)
d357cbc1 is described below
commit d357cbc100f01374059c4deaa6856af763164e4c
Author: Graceful <[email protected]>
AuthorDate: Wed Dec 31 14:50:19 2025 +0800
[AURON #1780] Fix uppercase fields being read as null from ORC tables
written by Hive (#1781)
<!--
- Start the PR title with the related issue ID, e.g. '[AURON #XXXX]
Short summary...'.
-->
# Which issue does this PR close?
Closes #1780
# Rationale for this change
Fix the issue where fields with uppercase names in ORC files written by Hive are read back as null.
# What changes are included in this PR?
Modify the field-matching logic for ORC file schemas so that, when case sensitivity is disabled, field names are matched case-insensitively.
# Are there any user-facing changes?
no
# How was this patch tested?
Verified with a manual test on a cluster.
---------
Co-authored-by: duanhao-jk <[email protected]>
---
native-engine/auron-jni-bridge/src/conf.rs | 1 +
native-engine/datafusion-ext-plans/src/orc_exec.rs | 24 ++++++++++++++++++++--
.../configuration/SparkAuronConfiguration.java | 5 +++++
.../java/org/apache/spark/sql/auron/AuronConf.java | 2 ++
4 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/native-engine/auron-jni-bridge/src/conf.rs
b/native-engine/auron-jni-bridge/src/conf.rs
index e99037ad..383596d6 100644
--- a/native-engine/auron-jni-bridge/src/conf.rs
+++ b/native-engine/auron-jni-bridge/src/conf.rs
@@ -57,6 +57,7 @@ define_conf!(IntConf, SUGGESTED_BATCH_MEM_SIZE);
define_conf!(IntConf, SUGGESTED_BATCH_MEM_SIZE_KWAY_MERGE);
define_conf!(BooleanConf, ORC_FORCE_POSITIONAL_EVOLUTION);
define_conf!(BooleanConf, ORC_TIMESTAMP_USE_MICROSECOND);
+define_conf!(BooleanConf, ORC_SCHEMA_CASE_SENSITIVE);
define_conf!(IntConf, UDAF_FALLBACK_NUM_UDAFS_TRIGGER_SORT_AGG);
define_conf!(BooleanConf, PARSE_JSON_ERROR_FALLBACK);
define_conf!(StringConf, NATIVE_LOG_LEVEL);
diff --git a/native-engine/datafusion-ext-plans/src/orc_exec.rs
b/native-engine/datafusion-ext-plans/src/orc_exec.rs
index d07e4794..c53cb6b5 100644
--- a/native-engine/datafusion-ext-plans/src/orc_exec.rs
+++ b/native-engine/datafusion-ext-plans/src/orc_exec.rs
@@ -160,6 +160,7 @@ impl ExecutionPlan for OrcExec {
let force_positional_evolution =
conf::ORC_FORCE_POSITIONAL_EVOLUTION.value()?;
let use_microsecond_precision =
conf::ORC_TIMESTAMP_USE_MICROSECOND.value()?;
+ let is_case_sensitive = conf::ORC_SCHEMA_CASE_SENSITIVE.value()?;
let opener: Arc<dyn FileOpener> = Arc::new(OrcOpener {
projection,
@@ -170,6 +171,7 @@ impl ExecutionPlan for OrcExec {
metrics: self.metrics.clone(),
force_positional_evolution,
use_microsecond_precision,
+ is_case_sensitive,
});
let file_stream = Box::pin(FileStream::new(
@@ -217,6 +219,7 @@ struct OrcOpener {
metrics: ExecutionPlanMetricsSet,
force_positional_evolution: bool,
use_microsecond_precision: bool,
+ is_case_sensitive: bool,
}
impl FileOpener for OrcOpener {
@@ -245,6 +248,7 @@ impl FileOpener for OrcOpener {
self.force_positional_evolution,
);
let use_microsecond = self.use_microsecond_precision;
+ let is_case = self.is_case_sensitive;
Ok(Box::pin(async move {
let mut builder = ArrowReaderBuilder::try_new_async(reader)
@@ -259,7 +263,7 @@ impl FileOpener for OrcOpener {
}
let (schema_mapping, projection) =
- schema_adapter.map_schema(builder.file_metadata())?;
+ schema_adapter.map_schema(builder.file_metadata(), is_case)?;
let projection_mask =
ProjectionMask::roots(builder.file_metadata().root_data_type(), projection);
@@ -325,6 +329,7 @@ impl SchemaAdapter {
fn map_schema(
&self,
orc_file_meta: &FileMetadata,
+ is_case_sensitive: bool,
) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
let mut projection =
Vec::with_capacity(self.projected_schema.fields().len());
let mut field_mappings = vec![None;
self.projected_schema.fields().len()];
@@ -363,7 +368,7 @@ impl SchemaAdapter {
}
}
}
- } else {
+ } else if is_case_sensitive {
for named_column in file_named_columns {
if let Some((proj_idx, _)) =
self.projected_schema.fields().find(named_column.name())
@@ -372,6 +377,21 @@ impl SchemaAdapter {
projection.push(named_column.data_type().column_index());
}
}
+ } else {
+ for named_column in file_named_columns {
+ // Case-insensitive field name matching
+ let named_column_name_lower =
named_column.name().to_lowercase();
+ if let Some((proj_idx, _)) = self
+ .projected_schema
+ .fields()
+ .iter()
+ .enumerate()
+ .find(|(_, f)| f.name().to_lowercase() ==
named_column_name_lower)
+ {
+ field_mappings[proj_idx] = Some(projection.len());
+ projection.push(named_column.data_type().column_index());
+ }
+ }
}
Ok((
diff --git
a/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
b/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
index b503b59f..8e66efb1 100644
---
a/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
+++
b/spark-extension/src/main/java/org/apache/auron/spark/configuration/SparkAuronConfiguration.java
@@ -237,6 +237,11 @@ public class SparkAuronConfiguration extends
AuronConfiguration {
.description("use microsecond precision when reading ORC timestamp
columns. ")
.booleanType()
.defaultValue(false);
+ public static final ConfigOption<Boolean> ORC_SCHEMA_CASE_SENSITIVE =
ConfigOptions.key(
+ "auron.orc.schema.caseSensitive.enable")
+ .description("whether ORC file schema matching distinguishes
between uppercase and lowercase. ")
+ .booleanType()
+ .defaultValue(false);
public static final ConfigOption<Boolean> FORCE_SHORT_CIRCUIT_AND_OR =
ConfigOptions.key(
"auron.forceShortCircuitAndOr")
diff --git
a/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
b/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
index b3cbca1a..2943d4b2 100644
--- a/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
+++ b/spark-extension/src/main/java/org/apache/spark/sql/auron/AuronConf.java
@@ -139,6 +139,8 @@ public enum AuronConf {
// use microsecond precision when reading ORC timestamp columns
ORC_TIMESTAMP_USE_MICROSECOND("spark.auron.orc.timestamp.use.microsecond",
false),
+ ORC_SCHEMA_CASE_SENSITIVE("spark.auron.orc.schema.caseSensitive.enable",
false),
+
NATIVE_LOG_LEVEL("spark.auron.native.log.level", "info");
public final String key;