Re: [PR] Reduce clone of `Statistics` in `ListingTable` and `PartitionedFile` [datafusion]

via GitHub Mon, 05 Aug 2024 12:16:36 -0700


alamb commented on code in PR #11802:
URL: https://github.com/apache/datafusion/pull/11802#discussion_r1704504377



##########
datafusion/core/src/datasource/listing/mod.rs:
##########
@@ -78,10 +78,11 @@ pub struct PartitionedFile {
     ///
     /// DataFusion relies on these statistics for planning (in particular to 
sort file groups),
     /// so if they are incorrect, incorrect answers may result.
-    pub statistics: Option<Statistics>,
+    pub statistics: Option<Arc<Statistics>>,

Review Comment:
   💯  This alone will likely avoid a bunch of copying



##########
datafusion/core/src/datasource/statistics.rs:
##########
@@ -238,45 +214,61 @@ fn min_max_aggregate_data_type(input_type: &DataType) -> 
&DataType {
 /// If the given value is numerically greater than the original maximum value,
 /// return the new maximum value with appropriate exactness information.
 fn set_max_if_greater(
-    max_nominee: Precision<ScalarValue>,
-    max_values: Precision<ScalarValue>,
-) -> Precision<ScalarValue> {
-    match (&max_values, &max_nominee) {
-        (Precision::Exact(val1), Precision::Exact(val2)) if val1 < val2 => 
max_nominee,
+    max_nominee: &Precision<ScalarValue>,
+    max_value: &mut Precision<ScalarValue>,
+) {
+    match (&max_value, max_nominee) {
+        (Precision::Exact(val1), Precision::Exact(val2)) if val1 < val2 => {
+            *max_value = max_nominee.clone();
+        }
         (Precision::Exact(val1), Precision::Inexact(val2))
         | (Precision::Inexact(val1), Precision::Inexact(val2))
         | (Precision::Inexact(val1), Precision::Exact(val2))
             if val1 < val2 =>
         {
-            max_nominee.to_inexact()
+            *max_value = max_nominee.clone().to_inexact();
+        }
+        (Precision::Exact(_), Precision::Absent) => {
+            let exact_max = mem::take(max_value);
+            *max_value = exact_max.to_inexact();
+        }
+        (Precision::Absent, Precision::Exact(_)) => {
+            *max_value = max_nominee.clone().to_inexact();
+        }
+        (Precision::Absent, Precision::Inexact(_)) => {
+            *max_value = max_nominee.clone();
         }
-        (Precision::Exact(_), Precision::Absent) => max_values.to_inexact(),
-        (Precision::Absent, Precision::Exact(_)) => max_nominee.to_inexact(),
-        (Precision::Absent, Precision::Inexact(_)) => max_nominee,
-        (Precision::Absent, Precision::Absent) => Precision::Absent,
-        _ => max_values,
+        _ => {}
     }
 }
 
 /// If the given value is numerically lesser than the original minimum value,
 /// return the new minimum value with appropriate exactness information.
 fn set_min_if_lesser(
-    min_nominee: Precision<ScalarValue>,
-    min_values: Precision<ScalarValue>,
-) -> Precision<ScalarValue> {
-    match (&min_values, &min_nominee) {
-        (Precision::Exact(val1), Precision::Exact(val2)) if val1 > val2 => 
min_nominee,
+    min_nominee: &Precision<ScalarValue>,

Review Comment:
   💯  to reduce this copy



##########
datafusion/core/src/datasource/listing/mod.rs:
##########
@@ -159,6 +160,24 @@ impl From<ObjectMeta> for PartitionedFile {
     }
 }
 
+impl Default for PartitionedFile {

Review Comment:
   Isn't this the same as `#[derive(Default)]`?



##########
datafusion/core/src/datasource/statistics.rs:
##########
@@ -90,38 +91,28 @@ pub async fn get_statistics_with_limit(
                 // counts across all the files in question. If any file does 
not
                 // provide any information or provides an inexact value, we 
demote
                 // the statistic precision to inexact.
-                num_rows = add_row_stats(file_stats.num_rows, num_rows);
+                num_rows = add_row_stats(file_stats.num_rows.clone(), 
num_rows);
 
                 total_byte_size =
-                    add_row_stats(file_stats.total_byte_size, total_byte_size);
+                    add_row_stats(file_stats.total_byte_size.clone(), 
total_byte_size);

Review Comment:
   I double checked that stats here are `Precision<usize>` (and thus this clone 
is not a performance problem)



##########
datafusion/core/src/datasource/statistics.rs:
##########
@@ -90,38 +91,28 @@ pub async fn get_statistics_with_limit(
                 // counts across all the files in question. If any file does 
not
                 // provide any information or provides an inexact value, we 
demote
                 // the statistic precision to inexact.
-                num_rows = add_row_stats(file_stats.num_rows, num_rows);
+                num_rows = add_row_stats(file_stats.num_rows.clone(), 
num_rows);
 
                 total_byte_size =
-                    add_row_stats(file_stats.total_byte_size, total_byte_size);
+                    add_row_stats(file_stats.total_byte_size.clone(), 
total_byte_size);

Review Comment:
   I also made a small experiment to see if an alternate formulation where it 
might be clearer that the copy is not occuring (last commit in 
https://github.com/apache/datafusion/pull/11828) 



##########
datafusion/core/src/datasource/listing/helpers.rs:
##########
@@ -139,8 +140,8 @@ pub fn split_files(
     // effectively this is div with rounding up instead of truncating
     let chunk_size = (partitioned_files.len() + n - 1) / n;
     partitioned_files
-        .chunks(chunk_size)
-        .map(|c| c.to_vec())
+        .chunks_mut(chunk_size)
+        .map(|c| c.iter_mut().map(mem::take).collect())
         .collect()
 }

Review Comment:
   I think this could also be forumulated with `drain()` and this avoid the 
need for `Default`: 
https://doc.rust-lang.org/std/vec/struct.Vec.html#method.drain
   
   Here is a POC of it working: https://github.com/apache/datafusion/pull/11829
   
   ```rust
       let mut chunks = Vec::with_capacity(n);
   
       let mut current_chunk = Vec::with_capacity(chunk_size);
       for file in partitioned_files.drain(..) {
           current_chunk.push(file);
           if current_chunk.len() == chunk_size {
               chunks.push(mem::take(&mut current_chunk));
           }
       }
       if !current_chunk.is_empty() {
           chunks.push(current_chunk)
       }
       chunks
   ```
   
   (I don't know if this matters for performance)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Reduce clone of `Statistics` in `ListingTable` and `PartitionedFile` [datafusion]

Reply via email to