This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 1042095211 feat: improve string statistics display (#8535)
1042095211 is described below
commit 1042095211caec2cbd0af93b8f4c8a78dff47259
Author: Ashim Sedhain <[email protected]>
AuthorDate: Thu Dec 14 10:40:53 2023 -0600
feat: improve string statistics display (#8535)
GH-8464
---
datafusion-cli/src/functions.rs | 77 +++++++++++++++++++++++++++--------------
datafusion-cli/src/main.rs | 24 +++++++++++++
2 files changed, 75 insertions(+), 26 deletions(-)
diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs
index 24f3399ee2..f8d9ed238b 100644
--- a/datafusion-cli/src/functions.rs
+++ b/datafusion-cli/src/functions.rs
@@ -31,6 +31,7 @@ use datafusion::logical_expr::Expr;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::scalar::ScalarValue;
+use parquet::basic::ConvertedType;
use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;
use parquet::file::statistics::Statistics;
@@ -246,6 +247,52 @@ impl TableProvider for ParquetMetadataTable {
}
}
+fn convert_parquet_statistics(
+ value: &Statistics,
+ converted_type: ConvertedType,
+) -> (String, String) {
+ match (value, converted_type) {
+ (Statistics::Boolean(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::Int32(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::Int64(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::Int96(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::Float(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::Double(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::ByteArray(val), ConvertedType::UTF8) => {
+ let min_bytes = val.min();
+ let max_bytes = val.max();
+ let min = min_bytes
+ .as_utf8()
+ .map(|v| v.to_string())
+ .unwrap_or_else(|_| min_bytes.to_string());
+
+ let max = max_bytes
+ .as_utf8()
+ .map(|v| v.to_string())
+ .unwrap_or_else(|_| max_bytes.to_string());
+ (min, max)
+ }
+ (Statistics::ByteArray(val), _) => (val.min().to_string(), val.max().to_string()),
+ (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => {
+ let min_bytes = val.min();
+ let max_bytes = val.max();
+ let min = min_bytes
+ .as_utf8()
+ .map(|v| v.to_string())
+ .unwrap_or_else(|_| min_bytes.to_string());
+
+ let max = max_bytes
+ .as_utf8()
+ .map(|v| v.to_string())
+ .unwrap_or_else(|_| max_bytes.to_string());
+ (min, max)
+ }
+ (Statistics::FixedLenByteArray(val), _) => {
+ (val.min().to_string(), val.max().to_string())
+ }
+ }
+}
+
pub struct ParquetMetadataFunc {}
impl TableFunctionImpl for ParquetMetadataFunc {
@@ -326,34 +373,12 @@ impl TableFunctionImpl for ParquetMetadataFunc {
num_values_arr.push(column.num_values());
path_in_schema_arr.push(column.column_path().to_string());
type_arr.push(column.column_type().to_string());
+ let converted_type = column.column_descr().converted_type();
+
if let Some(s) = column.statistics() {
let (min_val, max_val) = if s.has_min_max_set() {
- let (min_val, max_val) = match s {
- Statistics::Boolean(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::Int32(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::Int64(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::Int96(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::Float(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::Double(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::ByteArray(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- Statistics::FixedLenByteArray(val) => {
- (val.min().to_string(), val.max().to_string())
- }
- };
+ let (min_val, max_val) =
+ convert_parquet_statistics(s, converted_type);
(Some(min_val), Some(max_val))
} else {
(None, None)
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 8b1a9816af..8b74a797b5 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -420,4 +420,28 @@ mod tests {
Ok(())
}
+
+ #[tokio::test]
+ async fn test_parquet_metadata_works_with_strings() -> Result<(), DataFusionError> {
+ let ctx = SessionContext::new();
+ ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {}));
+
+ // input with string columns
+ let sql =
+ "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')";
+ let df = ctx.sql(sql).await?;
+ let rbs = df.collect().await?;
+
+ let excepted = [
+
+"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+",
+"| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |",
+"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+",
+"| ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | \"String\" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 |",
+"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+"
+ ];
+ assert_batches_eq!(excepted, &rbs);
+
+ Ok(())
+ }
}