hiltontj commented on code in PR #8560: URL: https://github.com/apache/arrow-datafusion/pull/8560#discussion_r1429202281
########## datafusion/sqllogictest/test_files/parquet.slt: ########## @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# TESTS FOR PARQUET FILES + +# Setup basic alltypes_plain table: + +statement ok +CREATE EXTERNAL TABLE alltypes_plain ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +LOCATION '../../parquet-testing/data/alltypes_plain.parquet' + +# Test a basic query: + +query IT +SELECT id, CAST(string_col AS varchar) FROM alltypes_plain +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + +# Explain query on the un-ordered table, expect no "output_ordering" clause in physical_plan -> ParquetExec: + +query TT +EXPLAIN SELECT int_col, string_col +FROM alltypes_plain +ORDER BY string_col, int_col +LIMIT 10 +---- +logical_plan +Limit: skip=0, fetch=10 +--Sort: alltypes_plain.string_col ASC NULLS LAST, alltypes_plain.int_col ASC NULLS LAST, fetch=10 +----TableScan: alltypes_plain projection=[int_col, string_col] +physical_plan +GlobalLimitExec: skip=0, fetch=10 +--SortExec: TopK(fetch=10), expr=[string_col@1 ASC NULLS LAST,int_col@0 ASC NULLS LAST] +----ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[int_col, string_col] + +# Setup alltypes_plain, with an order clause: + +statement ok +CREATE EXTERNAL TABLE alltypes_plain_with_order ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (string_col ASC NULLS LAST, int_col NULLS LAST) +LOCATION '../../parquet-testing/data/alltypes_plain.parquet' + +# Explain query on the ordered table, expect to see the "output_ordering" clause in physical_plan -> ParquetExec: + +query TT +EXPLAIN SELECT int_col, string_col +FROM alltypes_plain_with_order +ORDER BY string_col, int_col +LIMIT 10 +---- +logical_plan +Limit: skip=0, fetch=10 +--Sort: alltypes_plain_with_order.string_col ASC NULLS LAST, alltypes_plain_with_order.int_col ASC NULLS LAST, fetch=10 +----TableScan: alltypes_plain_with_order projection=[int_col, string_col] +physical_plan +GlobalLimitExec: skip=0, fetch=10 +--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST] + +# Setup alltypes_plain, from the directory, with ordering clause: + +statement ok +CREATE EXTERNAL TABLE alltypes_plain_from_dir ( + id INT NOT NULL, + bool_col BOOLEAN NOT NULL, + tinyint_col TINYINT NOT NULL, + smallint_col SMALLINT NOT NULL, + int_col INT NOT NULL, + bigint_col BIGINT NOT NULL, + float_col FLOAT NOT NULL, + double_col DOUBLE NOT NULL, + date_string_col BYTEA NOT NULL, + string_col VARCHAR NOT NULL, + timestamp_col TIMESTAMP NOT NULL, +) +STORED AS PARQUET +WITH HEADER ROW +WITH ORDER (string_col ASC NULLS LAST, int_col NULLS LAST) +PARTITIONED BY (string_col, int_col) +LOCATION '../../parquet-testing/data/alltypes_dir' Review Comment: @alamb - thank you for the direction, that is an excellent example. I will incorporate that here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
