This is an automated email from the ASF dual-hosted git repository.

xushiyan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hudi-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 2981486  feat: add base file records' in-memory size to `FileStats` (#140)
2981486 is described below

commit 298148636b30782ed2cbc538f90604c92bd12039
Author: Shiyan Xu <[email protected]>
AuthorDate: Sun Sep 15 00:16:12 2024 -0500

    feat: add base file records' in-memory size to `FileStats` (#140)
---
 crates/core/src/file_group/mod.rs     | 10 +++++++++-
 crates/core/src/storage/file_stats.rs |  3 ++-
 python/hudi/_internal.pyi             |  1 +
 python/src/internal.rs                |  7 ++++++-
 python/tests/test_table_read.py       |  1 +
 5 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/crates/core/src/file_group/mod.rs b/crates/core/src/file_group/mod.rs
index 3dd1af3..c947de4 100644
--- a/crates/core/src/file_group/mod.rs
+++ b/crates/core/src/file_group/mod.rs
@@ -103,7 +103,15 @@ impl FileSlice {
                 .get_parquet_file_metadata(&self.base_file_relative_path())
                 .await?;
             let num_records = parquet_meta.file_metadata().num_rows();
-            let stats = FileStats { num_records };
+            let size_bytes = parquet_meta
+                .row_groups()
+                .iter()
+                .map(|rg| rg.total_byte_size())
+                .sum::<i64>();
+            let stats = FileStats {
+                num_records,
+                size_bytes,
+            };
             self.base_file.stats = Some(stats);
         }
         Ok(())
diff --git a/crates/core/src/storage/file_stats.rs b/crates/core/src/storage/file_stats.rs
index 19a7000..b0d2bcb 100644
--- a/crates/core/src/storage/file_stats.rs
+++ b/crates/core/src/storage/file_stats.rs
@@ -17,7 +17,8 @@
  * under the License.
  */
 
-#[derive(Clone, Debug, Default, Eq, PartialEq)]
+#[derive(Clone, Debug, Default)]
 pub struct FileStats {
     pub num_records: i64,
+    pub size_bytes: i64,
 }
diff --git a/python/hudi/_internal.pyi b/python/hudi/_internal.pyi
index 0f83aee..acf2e16 100644
--- a/python/hudi/_internal.pyi
+++ b/python/hudi/_internal.pyi
@@ -29,6 +29,7 @@ class HudiFileSlice:
     base_file_name: str
     base_file_size: int
     num_records: int
+    size_bytes: int
 
     def base_file_relative_path(self) -> str: ...
 
diff --git a/python/src/internal.rs b/python/src/internal.rs
index 141a74e..7c52930 100644
--- a/python/src/internal.rs
+++ b/python/src/internal.rs
@@ -44,6 +44,8 @@ pub struct HudiFileSlice {
     base_file_size: usize,
     #[pyo3(get)]
     num_records: i64,
+    #[pyo3(get)]
+    size_bytes: i64,
 }
 
 #[cfg(not(tarpaulin))]
@@ -69,7 +71,9 @@ fn convert_file_slice(f: &FileSlice) -> HudiFileSlice {
     let commit_time = f.base_file.commit_time.to_string();
     let base_file_name = f.base_file.info.name.clone();
     let base_file_size = f.base_file.info.size;
-    let num_records = f.base_file.stats.clone().unwrap_or_default().num_records;
+    let stats = f.base_file.stats.clone().unwrap_or_default();
+    let num_records = stats.num_records;
+    let size_bytes = stats.size_bytes;
     HudiFileSlice {
         file_group_id,
         partition_path,
@@ -77,6 +81,7 @@ fn convert_file_slice(f: &FileSlice) -> HudiFileSlice {
         base_file_name,
         base_file_size,
         num_records,
+        size_bytes,
     }
 }
 
diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py
index c3c84c9..6a517e5 100644
--- a/python/tests/test_table_read.py
+++ b/python/tests/test_table_read.py
@@ -55,6 +55,7 @@ def test_sample_table(get_sample_table):
         "20240402144910683",
     }
     assert all(f.num_records == 1 for f in file_slices)
+    assert all(f.size_bytes > 0 for f in file_slices)
     file_slice_paths = [f.base_file_relative_path() for f in file_slices]
     assert set(file_slice_paths) == {
         "chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",

Reply via email to