Re: [PR] Make aggr fuzzer query builder more configurable [datafusion]

via GitHub Fri, 25 Apr 2025 13:09:30 -0700


Rachelint commented on code in PR #15851:
URL: https://github.com/apache/datafusion/pull/15851#discussion_r2060734020



##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs:
##########
@@ -0,0 +1,371 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{collections::HashSet, str::FromStr};
+
+use rand::{seq::SliceRandom, thread_rng, Rng};
+
+/// Random aggregate query builder
+///
+/// Creates queries like
+/// ```sql
+/// SELECT AGG(..) FROM table_name GROUP BY <group_by_columns>
+///```
+#[derive(Debug, Default, Clone)]
+pub struct QueryBuilder {
+    // ===================================
+    // Table settings
+    // ===================================
+    /// The name of the table to query
+    table_name: String,
+
+    // ===================================
+    // Grouping settings
+    // ===================================
+    /// Columns to be used in randomly generate `groupings`
+    ///
+    /// # Example
+    ///
+    /// Columns:
+    ///
+    /// ```text
+    ///   [a,b,c,d]
+    /// ```
+    ///
+    /// And randomly generated `groupings` (at least 1 column)
+    /// can be:
+    ///
+    /// ```text
+    ///   [a]
+    ///   [a,b]
+    ///   [a,b,d]
+    ///   ...
+    /// ```
+    ///
+    /// So the finally generated sqls will be:
+    ///
+    /// ```text
+    ///   SELECT aggr FROM t GROUP BY a;
+    ///   SELECT aggr FROM t GROUP BY a,b;
+    ///   SELECT aggr FROM t GROUP BY a,b,d;
+    ///   ...
+    /// ```
+    group_by_columns: Vec<String>,
+
+    /// Max columns num in randomly generated `groupings`
+    max_group_by_columns: usize,
+
+    /// The sort keys of dataset
+    ///
+    /// Due to optimizations will be triggered when all or some
+    /// grouping columns are the sort keys of dataset.
+    /// So it is necessary to randomly generate some `groupings` basing on
+    /// dataset sort keys for test coverage.
+    ///
+    /// # Example
+    ///
+    /// Dataset including columns [a,b,c], and sorted by [a,b]
+    ///
+    /// And we may generate sqls to try covering the sort-optimization cases 
like:
+    ///
+    /// ```text
+    ///   SELECT aggr FROM t GROUP BY b; // no permutation case
+    ///   SELECT aggr FROM t GROUP BY a,c; // partial permutation case
+    ///   SELECT aggr FROM t GROUP BY a,b,c; // full permutation case
+    ///   ...
+    /// ```
+    ///
+    /// More details can see [`GroupOrdering`].
+    ///
+    /// [`GroupOrdering`]:  
datafusion_physical_plan::aggregates::order::GroupOrdering
+    ///
+    dataset_sort_keys: Vec<Vec<String>>,
+
+    /// If we will also test the no grouping case like:
+    ///
+    /// ```text
+    ///   SELECT aggr FROM t;
+    /// ```
+    ///
+    no_grouping: bool,
+
+    // ====================================
+    // Aggregation function settings
+    // ====================================
+    /// Aggregate functions to be used in the query
+    /// (function_name, is_distinct)
+    aggregate_functions: Vec<(String, bool)>,
+
+    /// Possible columns for arguments in the aggregate functions
+    ///
+    /// Assumes each
+    arguments: Vec<String>,
+}
+
+impl QueryBuilder {
+    pub fn new() -> Self {
+        Self {
+            no_grouping: true,
+            max_group_by_columns: 3,
+            ..Default::default()
+        }
+    }
+
+    /// return the table name if any
+    pub fn table_name(&self) -> &str {
+        &self.table_name
+    }
+
+    /// Set the table name for the query builder
+    pub fn with_table_name(mut self, table_name: impl Into<String>) -> Self {
+        self.table_name = table_name.into();
+        self
+    }
+
+    /// Add a new possible aggregate function to the query builder
+    pub fn with_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), false));
+        self
+    }
+
+    /// Add a new possible `DISTINCT` aggregate function to the query
+    ///
+    /// This is different than `with_aggregate_function` because only certain
+    /// aggregates support `DISTINCT`
+    pub fn with_distinct_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), true));
+        self
+    }
+
+    /// Set the columns to be used in the group bys clauses
+    pub fn set_group_by_columns<'a>(
+        mut self,
+        group_by: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        self.group_by_columns = 
group_by.into_iter().map(String::from).collect();
+        self
+    }
+
+    /// Add one or more columns to be used as an argument in the aggregate 
functions
+    pub fn with_aggregate_arguments<'a>(
+        mut self,
+        arguments: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        let arguments = arguments.into_iter().map(String::from);
+        self.arguments.extend(arguments);
+        self
+    }
+
+    /// Add max columns num in group by(default: 3), for example if it is set 
to 1,
+    /// the generated sql will group by at most 1 column
+    #[allow(dead_code)]
+    pub fn with_max_group_by_columns(mut self, group_by_columns: usize) -> 
Self {
+        self.max_group_by_columns = group_by_columns;
+        self
+    }
+
+    /// Add sort keys of dataset if any, then the builder will generate 
queries basing on it
+    /// to cover the sort-optimization cases
+    pub fn with_dataset_sort_keys(mut self, dataset_sort_keys: 
Vec<Vec<String>>) -> Self {

Review Comment:
   However, I noticed that the SQL-picking logic currently does have a problem.
   
   For example, assume dataset t1 is sorted by [a, b], and dataset t2 is sorted by [c, d].
   
   It is possible that
   - we randomly pick the SQL `select xxx from xxx group by c,d` for t1,
   - but pick `select xxx from xxx group by a,b` for t2.
   
   In that case, neither query groups by the sort keys of the dataset it runs against, so the sort-optimization path is never covered.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Make aggr fuzzer query builder more configurable [datafusion]

Reply via email to