hiltontj commented on code in PR #8560: URL: https://github.com/apache/arrow-datafusion/pull/8560#discussion_r1429314659
########## datafusion/sqllogictest/test_files/parquet.slt: ########## @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# TESTS FOR PARQUET FILES + +# Setup basic alltypes_plain table: + +statement ok +CREATE EXTERNAL TABLE alltypes_plain ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../../parquet-testing/data/alltypes_plain.parquet' + +# Test a basic query: + +query IT +SELECT id, CAST(string_col AS varchar) FROM alltypes_plain +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + +# Explain query on the un-ordered table, expect no "output_ordering" clause in physical_plan -> ParquetExec: + +query TT +EXPLAIN SELECT int_col, string_col +FROM alltypes_plain +ORDER BY string_col, int_col +LIMIT 10 +---- +logical_plan +Limit: skip=0, fetch=10 +--Sort: alltypes_plain.string_col ASC NULLS LAST, alltypes_plain.int_col ASC NULLS LAST, fetch=10 +----TableScan: alltypes_plain projection=[int_col, string_col] +physical_plan +GlobalLimitExec: skip=0, fetch=10 +--SortExec: TopK(fetch=10), expr=[string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] +----ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[int_col, string_col] + +# Setup alltypes_plain, with an order clause: + +statement ok +CREATE EXTERNAL TABLE alltypes_plain_with_order ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (string_col ASC NULLS LAST, int_col NULLS LAST) +LOCATION '../../parquet-testing/data/alltypes_plain.parquet' + +# Explain query on the ordered table, expect to see the "output_ordering" clause in physical_plan -> ParquetExec: + +query TT +EXPLAIN SELECT int_col, string_col +FROM alltypes_plain_with_order +ORDER BY string_col, int_col +LIMIT 10 +---- +logical_plan +Limit: skip=0, fetch=10 +--Sort: alltypes_plain_with_order.string_col ASC NULLS LAST, alltypes_plain_with_order.int_col ASC NULLS LAST, fetch=10 +----TableScan: alltypes_plain_with_order projection=[int_col, string_col] +physical_plan +GlobalLimitExec: skip=0, fetch=10 +--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST] + +# Setup alltypes_plain, from the directory, with ordering clause: + +statement ok +CREATE EXTERNAL TABLE alltypes_plain_from_dir ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (string_col ASC NULLS LAST, int_col NULLS LAST) +PARTITIONED BY (string_col, int_col) +LOCATION '../../parquet-testing/data/alltypes_dir' Review Comment: @alamb - that approach was just what I was looking for, thank you. I was able to use `COPY` to generate the files needed to test partitioning and output ordering in the physical plan, as the original `parquet_with_sort_order_specified.rs` test was doing. This eliminates reliance on files elsewhere in the project, and makes the testing data visible in the `.slt` file, which seems desirable. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
