alamb commented on code in PR #4521:
URL: https://github.com/apache/arrow-datafusion/pull/4521#discussion_r1050098588


##########
datafusion/core/src/datasource/listing/table.rs:
##########
@@ -747,6 +747,30 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn load_table_stats_when_no_stats() -> Result<()> {
+        let testdata = crate::test_util::parquet_test_data();
+        let filename = format!("{}/{}", testdata, "alltypes_plain.parquet");
+        let table_path = ListingTableUrl::parse(filename).unwrap();
+
+        let ctx = SessionContext::new();
+        let state = ctx.state();
+
+        let opt = 
ListingOptions::new(Arc::new(ParquetFormat::new(ctx.config_options())))
+            .with_collect_stat(false);
+        let schema = opt.infer_schema(&state, &table_path).await?;
+        let config = ListingTableConfig::new(table_path)
+            .with_listing_options(opt)
+            .with_schema(schema);
+        let table = ListingTable::try_new(config)?;
+
+        let exec = table.scan(&state, None, &[], None).await?;
+        assert_eq!(exec.statistics().num_rows, None);

Review Comment:
   👍 



##########
datafusion/core/src/datasource/mod.rs:
##########
@@ -47,29 +47,42 @@ use futures::StreamExt;
 /// Get all files as well as the file level summary statistics (no statistic 
for partition columns).
 /// If the optional `limit` is provided, only as many files as are needed to
 /// read up to `limit` rows are included.
-/// TODO fix case where `num_rows` and `total_byte_size` are not defined (stat 
should be None instead of Some(0))
 pub async fn get_statistics_with_limit(
     all_files: impl Stream<Item = Result<(PartitionedFile, Statistics)>>,
     file_schema: SchemaRef,
     limit: Option<usize>,
 ) -> Result<(Vec<PartitionedFile>, Statistics)> {
     let mut result_files = vec![];
 
-    let mut total_byte_size = 0;
     let mut null_counts = vec![0; file_schema.fields().len()];
     let mut has_statistics = false;
     let (mut max_values, mut min_values) = create_max_min_accs(&file_schema);
 
-    let mut num_rows = 0;
     let mut is_exact = true;
+
+    // The number of rows and the total byte size can be calculated as long as
+    // at least one file provides them; any file missing a value contributes
+    // zero to that sum. If no file provides a value, the corresponding
+    // statistic is left as `None` (omitted) rather than reported as zero.
+    let mut num_rows = None;
+    let mut total_byte_size = None;
+
     // fusing the stream allows us to call next safely even once it is finished
     let mut all_files = Box::pin(all_files.fuse());
     while let Some(res) = all_files.next().await {
         let (file, file_stats) = res?;
         result_files.push(file);
         is_exact &= file_stats.is_exact;
-        num_rows += file_stats.num_rows.unwrap_or(0);
-        total_byte_size += file_stats.total_byte_size.unwrap_or(0);
+        num_rows = if let Some(num_rows) = num_rows {
+            Some(num_rows + file_stats.num_rows.unwrap_or(0))
+        } else {
+            file_stats.num_rows
+        };
+        total_byte_size = if let Some(total_byte_size) = total_byte_size {
+            Some(total_byte_size + file_stats.total_byte_size.unwrap_or(0))
+        } else {
+            file_stats.total_byte_size
+        };

Review Comment:
   If you are into a more functional style of coding, you can do something like
the following as well:
   
   ```suggestion
           num_rows = num_rows
               .map(|num_rows| num_rows + file_stats.num_rows.unwrap_or(0))
               .or(file_stats.num_rows);
           total_byte_size = total_byte_size
               .map(|total_byte_size| total_byte_size + 
file_stats.total_byte_size.unwrap_or(0))
               .or(file_stats.total_byte_size);
   ```
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to