alamb commented on code in PR #10716: URL: https://github.com/apache/datafusion/pull/10716#discussion_r1624676825
########## datafusion/core/src/datasource/schema_adapter.rs: ########## @@ -75,9 +75,16 @@ pub trait SchemaAdapter: Send + Sync { /// Creates a `SchemaMapping` that can be used to cast or map the columns /// from the file schema to the table schema. -pub trait SchemaMapper: Send + Sync { +pub trait SchemaMapper: Debug + Send + Sync { /// Adapts a `RecordBatch` to match the `table_schema` using the stored mapping and conversions. fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>; + + /// Adapts a `RecordBatch` that does not have all the columns (as defined in the schema). + /// This method is slower than `map_batch` and should only be used when explicitly needed. Review Comment: I think it would help here to give some context about why this is slower. Here is a suggestion ```diff - /// Adapts a `RecordBatch` that does not have all the columns (as defined in the schema). - /// This method is slower than `map_batch` and should only be used when explicitly needed. + /// Adapts a [`RecordBatch`] that does not have all the columns from the + /// file schema. + /// + /// This method is used when applying a filter to a subset of the columns + /// via an `ArrowPredicate`. + /// + /// This method is slower than `map_batch` as it looks up columns by name.``` ########## datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs: ########## @@ -473,6 +495,74 @@ mod test { ); } + #[test] + fn test_filter_type_coercion() { + let testdata = crate::test_util::parquet_test_data(); + let file = std::fs::File::open(format!("{testdata}/alltypes_plain.parquet")) + .expect("opening file"); + + let reader = SerializedFileReader::new(file).expect("creating reader"); + let metadata = reader.metadata(); + let file_schema = + parquet_to_arrow_schema(metadata.file_metadata().schema_descr(), None) + .expect("parsing schema"); + + // This is the schema we would like to coerce to, + // which is different from the physical schema of the file. 
+ let table_schema = Schema::new(vec![Field::new( + "timestamp_col", + DataType::Timestamp(Nanosecond, Some(Arc::from("UTC"))), + false, + )]); + + let expr = col("timestamp_col").eq(Expr::Literal( + ScalarValue::TimestampNanosecond(Some(1), Some(Arc::from("UTC"))), + )); + let expr = logical2physical(&expr, &table_schema); + let candidate = FilterCandidateBuilder::new(expr, &file_schema, &table_schema) + .build(metadata) + .expect("building candidate") + .expect("candidate expected"); + + let schema_adapter = + DefaultSchemaAdapterFactory {}.create(Arc::new(table_schema)); + let (schema_mapping, _) = schema_adapter + .map_schema(&file_schema) + .expect("creating schema mapping"); + + let mut row_filter = DatafusionArrowPredicate::try_new( + candidate, + &file_schema, + metadata, + Count::new(), + Time::new(), + schema_mapping, + ) + .expect("creating filter predicate"); + + // Create some fake data as if it was from the parquet file + let ts_array = TimestampNanosecondArray::new( + vec![TimestampNanosecondType::parse("2020-01-01T00:00:00") + .expect("should parse")] + .into(), + None, + ); + // We need a matching schema to create a record batch + let batch_schema = Schema::new(vec![Field::new( + "timestamp_col", + DataType::Timestamp(Nanosecond, None), + false, + )]); + + let record_batch = + RecordBatch::try_new(Arc::new(batch_schema), vec![Arc::new(ts_array)]) + .expect("creating record batch"); + + let filtered = row_filter.evaluate(record_batch); + + assert!(filtered.is_ok()); Review Comment: I think it would be good: ## Check the actual results ```suggestion let filtered = row_filter.evaluate(record_batch); let filtered = row_filter.evaluate(record_batch).unwrap(); assert_eq!(filtered, BooleanArray::from(vec![false])); ``` ## Add a negative case Aka add another expression / dataset where the filter returns `true`; ## Consider reworking the use of alltypes_plain.parquet I found it confusing that the only reason the file was used was to get a schema. 
I would suggest either: 1. Simply hard coding the file schema (e.g. to one that had `int_col` and `timestamp(none)`) 2. Use the actual data in the file to prune ```sql (venv) andrewlamb@Andrews-MacBook-Pro-2:~/Software/datafusion2$ datafusion-cli -c 'select * from "./parquet-testing/data/alltypes_plain.parquet"' DataFusion CLI v38.0.0 +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ | 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 | | 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00 | | 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 | | 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00 | | 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 | | 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00 | | 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 | | 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00 | +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org