mkleen commented on code in PR #23201:
URL: https://github.com/apache/datafusion/pull/23201#discussion_r3480515647
##########
datafusion/execution/src/cache/mod.rs:
##########
@@ -165,3 +166,124 @@ impl Display for TableScopedPath {
}
}
}
+
+/// A fingerprint of the `file_schema` used to compute a file's statistics.
+///
+/// Captures exactly the attributes that determine the layout and meaning of
+/// `Statistics::column_statistics`: each column's name, data type and
+/// nullability, in order. It deliberately excludes field/schema metadata,
which
+/// cannot affect statistics — including it would needlessly fragment the
cache.
+#[derive(PartialEq, Eq, Hash, Clone, Debug)]
+pub struct SchemaFingerprint(Vec<(String, DataType, bool)>);
+
+impl SchemaFingerprint {
+ /// Builds a fingerprint from the `file_schema` used to compute statistics
+ /// (the schema of the columns physically read, not the full table schema —
+ /// partition columns and their statistics are handled separately).
+ pub fn from_schema(file_schema: &Schema) -> Self {
+ Self(
+ file_schema
+ .fields()
+ .iter()
+ .map(|f| (f.name().clone(), f.data_type().clone(),
f.is_nullable()))
+ .collect(),
+ )
+ }
+}
+
+impl DFHeapSize for SchemaFingerprint {
+ fn heap_size(&self, ctx: &mut DFHeapSizeCtx) -> usize {
+ // `(String, DataType, bool)` has no `DFHeapSize` impl (only 2-tuples
do),
+ // so account for each column by hand. `bool` carries no heap.
+ self.0.capacity() * size_of::<(String, DataType, bool)>()
+ + self
+ .0
+ .iter()
+ .map(|(name, data_type, _)| {
+ name.heap_size(ctx) + data_type.heap_size(ctx)
+ })
+ .sum::<usize>()
+ }
+}
+
+/// Cache key for the file-statistics cache.
+///
+/// Like [`TableScopedPath`] it is scoped by table and path, but it
additionally
+/// carries a [`SchemaFingerprint`]. File statistics are computed against a
+/// specific `file_schema`, so the same path read under different schemas must
+/// not share an entry; the fingerprint keeps those entries distinct while a
+/// repeated read of the same schema still reuses its entry.
+#[derive(PartialEq, Eq, Hash, Clone, Debug)]
+pub struct FileStatisticsCacheKey {
Review Comment:
All file statistics tests, such as the one linked below, are still running on
`TableScopedPath`. Can we change these tests to use `FileStatisticsCacheKey`
as well?
https://github.com/apache/datafusion/blob/bde8e5b24e772210885ea5272e5cb06b52c939c3/datafusion/execution/src/cache/default_cache.rs#L820
##########
datafusion/execution/src/cache/mod.rs:
##########
@@ -165,3 +166,124 @@ impl Display for TableScopedPath {
}
}
}
+
+/// A fingerprint of the `file_schema` used to compute a file's statistics.
+///
+/// Captures exactly the attributes that determine the layout and meaning of
+/// `Statistics::column_statistics`: each column's name, data type and
+/// nullability, in order. It deliberately excludes field/schema metadata,
which
+/// cannot affect statistics — including it would needlessly fragment the
cache.
+#[derive(PartialEq, Eq, Hash, Clone, Debug)]
+pub struct SchemaFingerprint(Vec<(String, DataType, bool)>);
+
+impl SchemaFingerprint {
+ /// Builds a fingerprint from the `file_schema` used to compute statistics
+ /// (the schema of the columns physically read, not the full table schema —
+ /// partition columns and their statistics are handled separately).
+ pub fn from_schema(file_schema: &Schema) -> Self {
+ Self(
+ file_schema
+ .fields()
+ .iter()
+ .map(|f| (f.name().clone(), f.data_type().clone(),
f.is_nullable()))
+ .collect(),
+ )
+ }
+}
+
+impl DFHeapSize for SchemaFingerprint {
Review Comment:
If you add a `DFHeapSize` impl for 3-tuples in `heap_size.rs`, such as:
```rust
impl<A, B, C> DFHeapSize for (A, B, C)
where
A: DFHeapSize,
B: DFHeapSize,
C: DFHeapSize,
{
fn heap_size(&self, ctx: &mut DFHeapSizeCtx) -> usize {
self.0.heap_size(ctx) + self.1.heap_size(ctx) + self.2.heap_size(ctx)
}
}
```
This becomes:
```rust
impl DFHeapSize for SchemaFingerprint {
fn heap_size(&self, ctx: &mut DFHeapSizeCtx) -> usize {
self.0.heap_size(ctx)
}
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]