(arrow-datafusion) branch main updated: Remove need for sort in new_with_metadata (#8855)

dheres Mon, 15 Jan 2024 00:21:58 -0800

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new ff728d6c75 Remove need for sort in new_with_metadata (#8855)
ff728d6c75 is described below

commit ff728d6c75eb0eef048d1b2f61a73bf750d2814e
Author: Simon Vandel Sillesen <[email protected]>
AuthorDate: Mon Jan 15 08:21:38 2024 +0000

    Remove need for sort in new_with_metadata (#8855)
    
    BTreeSet iterates in sorted (deterministic) order, so we don't need to sort
    
    Speeds up benchmarks in sql_planner.rs by 3-8%
---
 datafusion/common/src/dfschema.rs | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 85b97aac03..c715fad112 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -18,7 +18,7 @@
 //! DFSchema is an extended schema struct that DataFusion uses to provide support for
 //! fields with optional relation names.
 
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap};
 use std::convert::TryFrom;
 use std::fmt::{Display, Formatter};
 use std::hash::Hash;
@@ -135,8 +135,8 @@ impl DFSchema {
         fields: Vec<DFField>,
         metadata: HashMap<String, String>,
     ) -> Result<Self> {
-        let mut qualified_names = HashSet::new();
-        let mut unqualified_names = HashSet::new();
+        let mut qualified_names = BTreeSet::new();
+        let mut unqualified_names = BTreeSet::new();
 
         for field in &fields {
             if let Some(qualifier) = field.qualifier() {
@@ -148,14 +148,8 @@ impl DFSchema {
             }
         }
 
-        // check for mix of qualified and unqualified field with same unqualified name
-        // note that we need to sort the contents of the HashSet first so that errors are
-        // deterministic
-        let mut qualified_names = qualified_names
-            .iter()
-            .map(|(l, r)| (l.to_owned(), r.to_owned()))
-            .collect::<Vec<(&OwnedTableReference, &String)>>();
-        qualified_names.sort();
+        // Check for mix of qualified and unqualified fields with same unqualified name.
+        // The BTreeSet storage makes sure that errors are reported in deterministic order.
         for (qualifier, name) in &qualified_names {
             if unqualified_names.contains(name) {
                 return _schema_err!(SchemaError::AmbiguousReference {

(arrow-datafusion) branch main updated: Remove need for sort in new_with_metadata (#8855)

Reply via email to