alamb commented on code in PR #20505: URL: https://github.com/apache/datafusion/pull/20505#discussion_r2884585826
########## datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt: ########## @@ -737,3 +737,108 @@ DROP TABLE t_union_mem; statement ok DROP TABLE t_union_parquet; + +# Cleanup settings +statement ok +set datafusion.optimizer.max_passes = 3; + +statement ok +set datafusion.execution.parquet.pushdown_filters = false; + + +# Regression test for https://github.com/apache/datafusion/issues/20696 +# Multi-column INNER JOIN with dictionary fails +# when parquet pushdown filters are enabled. + + +statement ok +COPY ( + SELECT + to_timestamp_nanos(time_ns) AS time, + arrow_cast(state, 'Dictionary(Int32, Utf8)') AS state, + arrow_cast(city, 'Dictionary(Int32, Utf8)') AS city, + temp + FROM ( + VALUES + (200, 'CA', 'LA', 90.0), + (250, 'MA', 'Boston', 72.4), + (100, 'MA', 'Boston', 70.4), + (350, 'CA', 'LA', 90.0) + ) AS t(time_ns, state, city, temp) +) +TO 'test_files/scratch/parquet_filter_pushdown/issue_20696/h2o/data.parquet'; + +statement ok +COPY ( + SELECT + to_timestamp_nanos(time_ns) AS time, + arrow_cast(state, 'Dictionary(Int32, Utf8)') AS state, + arrow_cast(city, 'Dictionary(Int32, Utf8)') AS city, + temp, + reading + FROM ( + VALUES + (250, 'MA', 'Boston', 53.4, 51.0), + (100, 'MA', 'Boston', 50.4, 50.0) + ) AS t(time_ns, state, city, temp, reading) +) +TO 'test_files/scratch/parquet_filter_pushdown/issue_20696/o2/data.parquet'; + +statement ok +CREATE EXTERNAL TABLE h2o_parquet_20696 STORED AS PARQUET +LOCATION 'test_files/scratch/parquet_filter_pushdown/issue_20696/h2o/'; + +statement ok +CREATE EXTERNAL TABLE o2_parquet_20696 STORED AS PARQUET +LOCATION 'test_files/scratch/parquet_filter_pushdown/issue_20696/o2/'; + +# Query should work both with and without filters +statement ok +set datafusion.execution.parquet.pushdown_filters = false; + +query RRR +SELECT + h2o_parquet_20696.temp AS h2o_temp, + o2_parquet_20696.temp AS o2_temp, + o2_parquet_20696.reading +FROM h2o_parquet_20696 +INNER JOIN o2_parquet_20696 + ON h2o_parquet_20696.time = 
o2_parquet_20696.time + AND h2o_parquet_20696.state = o2_parquet_20696.state + AND h2o_parquet_20696.city = o2_parquet_20696.city +WHERE h2o_parquet_20696.time >= '1970-01-01T00:00:00.000000050Z' + AND h2o_parquet_20696.time <= '1970-01-01T00:00:00.000000300Z'; +---- +72.4 53.4 51 +70.4 50.4 50 + + +statement ok +set datafusion.execution.parquet.pushdown_filters = true; + +query RRR Review Comment: Prior to this fix, this test fails with: ``` Parquet error: External: Compute error: Error evaluating filter predicate: ArrowError(InvalidArgumentError("Can't compare arrays of different types"), Some("")) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
