This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch branch-52
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/branch-52 by this push:
new 72ea8ec086 [branch-52] Fix constant value from stats (#20042) (#20709)
72ea8ec086 is described below
commit 72ea8ec086e59220f6b255ea565e710990ad7967
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Mar 4 17:43:21 2026 -0500
[branch-52] Fix constant value from stats (#20042) (#20709)
- Part of https://github.com/apache/datafusion/issues/20681
- Closes https://github.com/apache/datafusion/issues/20041 on branch-52
This PR:
- Backports https://github.com/apache/datafusion/pull/20042 from
@gabotechs to the `branch-52` line
Co-authored-by: Gabriel <[email protected]>
---
.../core/src/datasource/physical_plan/parquet.rs | 48 ++++++++++++++++++++--
datafusion/datasource-parquet/src/opener.rs | 4 ++
2 files changed, 49 insertions(+), 3 deletions(-)
diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs
index 4703b55ecc..dde40cc060 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet.rs
@@ -38,10 +38,10 @@ mod tests {
use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
use crate::test::object_store::local_unpartitioned_file;
use arrow::array::{
- ArrayRef, AsArray, Date64Array, Int8Array, Int32Array, Int64Array,
StringArray,
- StringViewArray, StructArray, TimestampNanosecondArray,
+ ArrayRef, AsArray, Date64Array, DictionaryArray, Int8Array, Int32Array,
+ Int64Array, StringArray, StringViewArray, StructArray,
TimestampNanosecondArray,
};
- use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder};
+ use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder,
UInt16Type};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::pretty_format_batches;
use arrow_schema::{SchemaRef, TimeUnit};
@@ -2249,6 +2249,48 @@ mod tests {
Ok(())
}
+ /// Tests that constant dictionary columns (where min == max in statistics)
+ /// are correctly handled. This reproduced a bug where the constant value
+ /// from statistics had type Utf8 but the schema expected Dictionary.
+ #[tokio::test]
+ async fn test_constant_dictionary_column_parquet() -> Result<()> {
+ let tmp_dir = TempDir::new()?;
+ let path = tmp_dir.path().to_str().unwrap().to_string() +
"/test.parquet";
+
+ // Write parquet with dictionary column where all values are the same
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "status",
+ DataType::Dictionary(Box::new(DataType::UInt16),
Box::new(DataType::Utf8)),
+ false,
+ )]));
+ let status: DictionaryArray<UInt16Type> =
+ vec!["active", "active"].into_iter().collect();
+ let batch = RecordBatch::try_new(schema.clone(),
vec![Arc::new(status)])?;
+ let file = File::create(&path)?;
+ let props = WriterProperties::builder()
+
.set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page)
+ .build();
+ let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
+ writer.write(&batch)?;
+ writer.close()?;
+
+ // Query the constant dictionary column
+ let ctx = SessionContext::new();
+ ctx.register_parquet("t", &path, ParquetReadOptions::default())
+ .await?;
+ let result = ctx.sql("SELECT status FROM t").await?.collect().await?;
+
+ insta::assert_snapshot!(batches_to_string(&result),@r"
+ +--------+
+ | status |
+ +--------+
+ | active |
+ | active |
+ +--------+
+ ");
+ Ok(())
+ }
+
fn write_file(file: &String) {
let struct_fields = Fields::from(vec![
Field::new("id", DataType::Int64, false),
diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
index 83bdf79c8f..719a3afc76 100644
--- a/datafusion/datasource-parquet/src/opener.rs
+++ b/datafusion/datasource-parquet/src/opener.rs
@@ -696,6 +696,10 @@ fn constant_value_from_stats(
&& !min.is_null()
&& matches!(column_stats.null_count, Precision::Exact(0))
{
+ // Cast to the expected data type if needed (e.g., Utf8 -> Dictionary)
+ if min.data_type() != *data_type {
+ return min.cast_to(data_type).ok();
+ }
return Some(min.clone());
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]