This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new ddfc282008 Add tests for sqllogictest prioritization (#20656)
ddfc282008 is described below
commit ddfc282008587b15947ff54c91497638df1ca2d4
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Mar 4 17:42:27 2026 -0500
Add tests for sqllogictest prioritization (#20656)
## Which issue does this PR close?
- Follow on to https://github.com/apache/datafusion/pull/20576
## Rationale for this change
Implement suggestions from @kosiew in
https://github.com/apache/datafusion/pull/20576#discussion_r2870761276:
> Can we add a deterministic tie-breaker in sort_tests (for equal
priority) using relative_path, e.g. sort_by_key(|f| (priority,
f.relative_path.clone())) to keep run order stable?
>
> This would also benefit from a small unit test covering:
>
> prioritized files are first,
> non-prioritized files keep deterministic ordering
## What changes are included in this PR?
1. Add deterministic tie breaker so tests are always run in the same
order
2. Add a test for the ordering
Note that in order to actually write these tests I needed to pull the
TestFile and some related logic into its own module
## Are these changes tested?
Yes by CI
I also ran the tests locally via
```shell
cargo test --profile=ci --test sqllogictests
```
I also double checked sqllogictests
```shell
INCLUDE_SQLITE=true cargo test --profile release-nonlto --test sqllogictests
```
## Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
---
datafusion/sqllogictest/bin/sqllogictests.rs | 156 ++++------------------
datafusion/sqllogictest/src/lib.rs | 2 +
datafusion/sqllogictest/src/test_file.rs | 186 +++++++++++++++++++++++++++
3 files changed, 213 insertions(+), 131 deletions(-)
diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs
b/datafusion/sqllogictest/bin/sqllogictests.rs
index b5a382ca6a..38a7635042 100644
--- a/datafusion/sqllogictest/bin/sqllogictests.rs
+++ b/datafusion/sqllogictest/bin/sqllogictests.rs
@@ -18,9 +18,8 @@
use clap::{ColorChoice, Parser, ValueEnum};
use datafusion::common::instant::Instant;
use datafusion::common::utils::get_available_parallelism;
-use datafusion::common::{
- DataFusionError, HashMap, Result, exec_datafusion_err, exec_err,
-};
+use datafusion::common::{DataFusionError, Result, exec_datafusion_err,
exec_err};
+use datafusion_sqllogictest::TestFile;
use datafusion_sqllogictest::{
CurrentlyExecutingSqlTracker, DataFusion, DataFusionSubstraitRoundTrip,
Filter,
TestContext, df_value_validator, read_dir_recursive, setup_scratch_dir,
@@ -44,13 +43,12 @@ use crate::postgres_container::{
};
use datafusion::common::runtime::SpawnedTask;
use futures::FutureExt;
-use std::ffi::OsStr;
use std::fs;
use std::io::{IsTerminal, stderr, stdout};
use std::path::{Path, PathBuf};
use std::str::FromStr;
+use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::{Arc, LazyLock};
use std::time::Duration;
#[cfg(feature = "postgres")]
@@ -59,6 +57,7 @@ mod postgres_container;
const TEST_DIRECTORY: &str = "test_files/";
const DATAFUSION_TESTING_TEST_DIRECTORY: &str =
"../../datafusion-testing/data/";
const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_";
+const TPCH_PREFIX: &str = "tpch";
const SQLITE_PREFIX: &str = "sqlite";
const ERRS_PER_FILE_LIMIT: usize = 10;
const TIMING_DEBUG_SLOW_FILES_ENV: &str = "SLT_TIMING_DEBUG_SLOW_FILES";
@@ -77,55 +76,6 @@ struct FileTiming {
elapsed: Duration,
}
-/// TEST PRIORITY
-///
-/// Heuristically prioritize some test to run earlier.
-///
-/// Prioritizes test to run earlier if they are known to be long running (as
-/// each test file itself is run sequentially, but multiple test files are run
-/// in parallel.
-///
-/// Tests not listed here will run after the listed tests in an arbitrary
order.
-///
-/// You can find the top longest running tests by running `--timing-summary`
mode.
-/// For example
-///
-/// ```shell
-/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top
-/// ...
-/// Per-file elapsed summary (deterministic):
-/// 1. 5.375s push_down_filter_regression.slt
-/// 2. 3.174s aggregate.slt
-/// 3. 3.158s imdb.slt
-/// 4. 2.793s joins.slt
-/// 5. 2.505s array.slt
-/// 6. 2.265s aggregate_skip_partial.slt
-/// 7. 2.260s window.slt
-/// 8. 1.677s group_by.slt
-/// 9. 0.973s datetime/timestamps.slt
-/// 10. 0.822s cte.slt
-/// ```
-static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
- [
- (PathBuf::from("push_down_filter_regression.slt"), 0), // longest
running, so run first.
- (PathBuf::from("aggregate.slt"), 1),
- (PathBuf::from("joins.slt"), 2),
- (PathBuf::from("imdb.slt"), 3),
- (PathBuf::from("array.slt"), 4),
- (PathBuf::from("aggregate_skip_partial.slt"), 5),
- (PathBuf::from("window.slt"), 6),
- (PathBuf::from("group_by.slt"), 7),
- (PathBuf::from("datetime/timestamps.slt"), 8),
- (PathBuf::from("cte.slt"), 9),
- ]
- .into_iter()
- .collect()
-});
-
-/// Default priority for tests not in the TEST_PRIORITY map. Tests with lower
-/// priority values run first.
-static DEFAULT_PRIORITY: usize = 100;
-
pub fn main() -> Result<()> {
tokio::runtime::Builder::new_multi_thread()
.enable_all()
@@ -832,91 +782,35 @@ async fn run_complete_file_with_postgres(
plan_err!("Can not run with postgres as postgres feature is not enabled")
}
-/// Represents a parsed test file
-#[derive(Debug)]
-struct TestFile {
- /// The absolute path to the file
- pub path: PathBuf,
- /// The relative path of the file (used for display)
- pub relative_path: PathBuf,
-}
-
-impl TestFile {
- fn new(path: PathBuf) -> Self {
- let p = path.to_string_lossy();
- let relative_path = PathBuf::from(if p.starts_with(TEST_DIRECTORY) {
- p.strip_prefix(TEST_DIRECTORY).unwrap()
- } else if p.starts_with(DATAFUSION_TESTING_TEST_DIRECTORY) {
- p.strip_prefix(DATAFUSION_TESTING_TEST_DIRECTORY).unwrap()
- } else {
- ""
- });
-
- Self {
- path,
- relative_path,
- }
- }
-
- fn is_slt_file(&self) -> bool {
- self.path.extension() == Some(OsStr::new("slt"))
- }
-
- fn check_sqlite(&self, options: &Options) -> bool {
- if !self.relative_path.starts_with(SQLITE_PREFIX) {
- return true;
- }
-
- options.include_sqlite
- }
-
- fn check_tpch(&self, options: &Options) -> bool {
- if !self.relative_path.starts_with("tpch") {
- return true;
- }
+fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
+ let prefixes: &[&str] = if options.include_sqlite {
+ &[TEST_DIRECTORY, DATAFUSION_TESTING_TEST_DIRECTORY]
+ } else {
+ &[TEST_DIRECTORY]
+ };
- options.include_tpch
- }
-}
+ let directories = prefixes
+ .iter()
+ .map(|prefix| {
+ read_dir_recursive(prefix).map_err(|e| {
+ exec_datafusion_err!("Error reading test directory {prefix}:
{e}")
+ })
+ })
+ .collect::<Result<Vec<_>>>()?;
-fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
- let mut paths = read_dir_recursive(TEST_DIRECTORY)?
+ let mut paths = directories
.into_iter()
- .map(TestFile::new)
+ .flatten()
+ .map(|p| TestFile::new(p, prefixes))
.filter(|f| options.check_test_file(&f.path))
.filter(|f| f.is_slt_file())
- .filter(|f| f.check_tpch(options))
- .filter(|f| f.check_sqlite(options))
+ .filter(|f| !f.relative_path_starts_with(TPCH_PREFIX) ||
options.include_tpch)
+ .filter(|f| !f.relative_path_starts_with(SQLITE_PREFIX) ||
options.include_sqlite)
.filter(|f| options.check_pg_compat_file(f.path.as_path()))
.collect::<Vec<_>>();
- if options.include_sqlite {
- let mut sqlite_paths =
read_dir_recursive(DATAFUSION_TESTING_TEST_DIRECTORY)?
- .into_iter()
- .map(TestFile::new)
- .filter(|f| options.check_test_file(&f.path))
- .filter(|f| f.is_slt_file())
- .filter(|f| f.check_sqlite(options))
- .filter(|f| options.check_pg_compat_file(f.path.as_path()))
- .collect::<Vec<_>>();
-
- paths.append(&mut sqlite_paths)
- }
- Ok(sort_tests(paths))
-}
-
-/// Sort the tests heuristically by order of "priority"
-///
-/// Prioritizes test to run earlier if they are known to be long running (as
-/// each test file itself is run sequentially, but multiple test files are run
-/// in parallel.
-fn sort_tests(mut tests: Vec<TestFile>) -> Vec<TestFile> {
- tests.sort_by_key(|f| {
- TEST_PRIORITY
- .get(&f.relative_path)
- .unwrap_or(&DEFAULT_PRIORITY)
- });
- tests
+ paths.sort_unstable();
+ Ok(paths)
}
/// Parsed command line options
diff --git a/datafusion/sqllogictest/src/lib.rs
b/datafusion/sqllogictest/src/lib.rs
index f228ee89ab..bb12c58bdc 100644
--- a/datafusion/sqllogictest/src/lib.rs
+++ b/datafusion/sqllogictest/src/lib.rs
@@ -27,6 +27,7 @@
//! DataFusion sqllogictest driver
mod engines;
+mod test_file;
pub use engines::CurrentlyExecutingSqlTracker;
pub use engines::DFColumnType;
@@ -46,4 +47,5 @@ mod util;
pub use filters::*;
pub use test_context::TestContext;
+pub use test_file::TestFile;
pub use util::*;
diff --git a/datafusion/sqllogictest/src/test_file.rs
b/datafusion/sqllogictest/src/test_file.rs
new file mode 100644
index 0000000000..c44cae1336
--- /dev/null
+++ b/datafusion/sqllogictest/src/test_file.rs
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::ffi::OsStr;
+use std::path::{Path, PathBuf};
+use std::sync::LazyLock;
+
+/// Represents a parsed test file
+///
+/// Note there is a custom Ord implementation that sorts test files by:
+/// 1. Hard coded test priority (lower runs first),
+/// 2. Relative path as deterministic tie-breaker.
+#[derive(Debug, PartialEq, Eq)]
+pub struct TestFile {
+ /// The absolute path to the file
+ pub path: PathBuf,
+ /// The relative path of the file (used for display)
+ pub relative_path: PathBuf,
+}
+
+impl TestFile {
+ /// Create a new [`TestFile`] from the given path, stripping any of the
+ /// known test directory prefixes for the relative path.
+ pub fn new(path: PathBuf, prefixes: &[&str]) -> Self {
+ let p = path.to_string_lossy();
+ for prefix in prefixes {
+ if p.starts_with(prefix) {
+ let relative_path =
PathBuf::from(p.strip_prefix(prefix).unwrap());
+ return Self {
+ path,
+ relative_path,
+ };
+ }
+ }
+ let relative_path = PathBuf::from("");
+
+ Self {
+ path,
+ relative_path,
+ }
+ }
+
+ /// Returns true if the file has a .slt extension, indicating it is a
sqllogictest file.
+ pub fn is_slt_file(&self) -> bool {
+ self.path.extension() == Some(OsStr::new("slt"))
+ }
+
+ /// Returns true if the relative path starts with the given prefix, which
+ /// can be used to filter tests by subdirectory or filename patterns.
+ pub fn relative_path_starts_with(&self, prefix: impl AsRef<Path>) -> bool {
+ self.relative_path.starts_with(prefix)
+ }
+}
+
+impl PartialOrd for TestFile {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for TestFile {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ let self_path = &self.relative_path;
+ let other_path = &other.relative_path;
+
+ let priority_self =
TEST_PRIORITY.get(self_path).unwrap_or(&DEFAULT_PRIORITY);
+ let priority_other =
TEST_PRIORITY.get(other_path).unwrap_or(&DEFAULT_PRIORITY);
+
+ priority_self
+ .cmp(priority_other)
+ .then_with(|| self_path.cmp(other_path)) // Tie-breaker:
lexicographic order of relative paths.
+ // Final tie-breaker keeps Ord consistent with Eq when relative
paths collide.
+ .then_with(|| self.path.cmp(&other.path))
+ }
+}
+
+/// TEST PRIORITY
+///
+/// Heuristically prioritize some test to run earlier.
+///
+/// Prioritizes test to run earlier if they are known to be long running (as
+/// each test file itself is run sequentially, but multiple test files are run
+/// in parallel.
+///
+/// Tests not listed here will run after the listed tests in deterministic
+/// lexicographic order by relative path.
+///
+/// You can find the top longest running tests by running `--timing-summary`
+/// mode. For example
+///
+/// ```shell
+/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top
+/// ...
+/// Per-file elapsed summary (deterministic):
+/// 1. 3.568s aggregate.slt
+/// 2. 3.464s joins.slt
+/// 3. 3.336s imdb.slt
+/// 4. 3.085s push_down_filter_regression.slt
+/// 5. 2.926s aggregate_skip_partial.slt
+/// 6. 2.453s array.slt
+/// 7. 2.399s window.slt
+/// 8. 2.198s group_by.slt
+/// 9. 1.281s clickbench.slt
+/// 10. 1.058s datetime/timestamps.slt
+/// ```
+const TEST_PRIORITY_ENTRIES: &[&str] = &[
+ "aggregate.slt", // longest-running files go first
+ "joins.slt",
+ "imdb.slt",
+ "push_down_filter_regression.slt",
+ "aggregate_skip_partial.slt",
+ "array.slt",
+ "window.slt",
+ "group_by.slt",
+ "clickbench.slt",
+ "datetime/timestamps.slt",
+];
+
+/// Default priority for tests not in the priority map. Tests with lower
+/// priority values run first.
+const DEFAULT_PRIORITY: usize = 100;
+
+static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
+ TEST_PRIORITY_ENTRIES
+ .iter()
+ .enumerate()
+ .map(|(priority, path)| (PathBuf::from(path), priority))
+ .collect()
+});
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn prioritized_files_are_first() {
+ let mut input = vec!["z_unlisted.slt", "a_unlisted.slt"];
+ input.extend(TEST_PRIORITY_ENTRIES.iter());
+ input.push("q_unlisted.slt");
+
+ let mut sorted = to_test_files(input);
+ sorted.sort_unstable();
+
+ println!("Sorted input: {sorted:?}");
+
+ // the prioritized files should be first, in the order specified by
TEST_PRIORITY_ENTRIES
+ for file in sorted.iter().take(TEST_PRIORITY_ENTRIES.len()) {
+ assert!(
+ TEST_PRIORITY.contains_key(&file.relative_path),
+ "Expected prioritized file {file:?} not found in input
{sorted:?}"
+ );
+ }
+ // last three files should be the unlisted ones in deterministic order
+ let expected_files =
+ to_test_files(["a_unlisted.slt", "q_unlisted.slt",
"z_unlisted.slt"]);
+ assert!(
+ sorted.ends_with(&expected_files),
+ "Expected unlisted files {expected_files:?} at the end in
deterministic order of {sorted:?}"
+ );
+ }
+
+ fn to_test_files<'a>(files: impl IntoIterator<Item = &'a str>) ->
Vec<TestFile> {
+ files
+ .into_iter()
+ .map(|f| TestFile {
+ path: PathBuf::from(f),
+ relative_path: PathBuf::from(f),
+ })
+ .collect()
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]