adriangb opened a new issue, #13574:
URL: https://github.com/apache/datafusion/issues/13574

   ### Describe the bug
   
   When querying a Parquet file that has a single `Dictionary(UInt32, Utf8)` 
column with bloom filters, datafusion-cli does not use the bloom filters to prune.
   
   ### To Reproduce
   
   ```rust
   use arrow::array::RecordBatch;
   use arrow_schema::DataType;
   use bytes::{BufMut, BytesMut};
   use datafusion::datasource::schema_adapter::DefaultSchemaAdapterFactory;
   use parquet::arrow::ArrowWriter;
   use parquet::file::properties::WriterProperties;
   use std::sync::Arc;
   
   /// Writes `hello_world{suffix}.parquet` containing one row, "Hello, World!",
   /// in a single non-nullable column named `column` of type `data_type`.
   ///
   /// The row is first built as plain `Utf8` and then passed through the schema
   /// mapper that `DefaultSchemaAdapterFactory` produces for the target schema.
   /// The file is written with bloom filters enabled and statistics disabled.
   ///
   /// # Panics
   ///
   /// Panics if building the batch, mapping it, encoding it, or writing the
   /// file fails (every fallible step is unwrapped).
   async fn write_record_batch(data_type: &DataType, suffix: &str) {
       // Target schema: one non-nullable field of the requested type.
       let schema = Arc::new(arrow::datatypes::Schema::new(vec![
           arrow::datatypes::Field::new("column", data_type.clone(), false),
       ]));
       // Source batch: the same field as plain Utf8, holding a single row.
       let batch = RecordBatch::try_new(
           Arc::new(arrow::datatypes::Schema::new(vec![
               arrow::datatypes::Field::new("column", DataType::Utf8, false),
           ])),
           // The literal must stay on one line: a line break inside the quotes
           // would become part of the stored value.
           vec![Arc::new(arrow::array::StringArray::from(vec!["Hello, World!"]))],
       )
       .unwrap();
       // Run the Utf8 batch through the mapper built for the target schema.
       let batch = DefaultSchemaAdapterFactory::from_schema(schema)
           .map_schema(batch.schema().as_ref())
           .unwrap()
           .0
           .map_batch(batch)
           .unwrap();
       let mut buf = BytesMut::new().writer();
       let schema = batch.schema();
       let props = WriterProperties::builder()
           .set_bloom_filter_enabled(true)
           .set_statistics_enabled(parquet::file::properties::EnabledStatistics::None)
           .build();
       {
           // Scope the writer so its mutable borrow of `buf` ends before
           // `buf.into_inner()` is called below.
           let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(props)).unwrap();
           writer.write(&batch).unwrap();
           writer.finish().unwrap();
       }
       tokio::fs::write(
           format!("hello_world{}.parquet", suffix),
           buf.into_inner().freeze(),
       )
       .await
       .unwrap();
   }
   
   #[tokio::main]
   async fn main() {
       // Create a RecordBatch with a single string column with a single row 
containing "Hello, World!"
       let data_type = arrow::datatypes::DataType::Utf8;
       write_record_batch(&data_type, "_plain").await;
       let data_type = arrow::datatypes::DataType::Dictionary(
           Box::new(arrow::datatypes::DataType::Int32),
           Box::new(arrow::datatypes::DataType::Utf8),
       );
       write_record_batch(&data_type, "_dict").await;
       let data_type = arrow::datatypes::DataType::Utf8View;
       write_record_batch(&data_type, "_view").await;
   }
   ```
   
   You can now verify with `parquet bloom-filter -c column -v 'Not Hello, 
World!' hello_world_dict.parquet` that the value can be pruned via bloom 
filters.
   
   But querying it with datafusion-cli using `explain analyze select * from 
'hello_world_dict.parquet' where column = 'Not Hellow, World!';` confirms that 
bloom filters aren't used. They are used for `_plain` or `_view`.
   
   ### Expected behavior
   
   _No response_
   
   ### Additional context
   
   _No response_


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to