This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hudi-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 2981486 feat: add base file records' in-memory size to `FileStats` (#140)
2981486 is described below
commit 298148636b30782ed2cbc538f90604c92bd12039
Author: Shiyan Xu <[email protected]>
AuthorDate: Sun Sep 15 00:16:12 2024 -0500
feat: add base file records' in-memory size to `FileStats` (#140)
---
crates/core/src/file_group/mod.rs | 10 +++++++++-
crates/core/src/storage/file_stats.rs | 3 ++-
python/hudi/_internal.pyi | 1 +
python/src/internal.rs | 7 ++++++-
python/tests/test_table_read.py | 1 +
5 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/crates/core/src/file_group/mod.rs b/crates/core/src/file_group/mod.rs
index 3dd1af3..c947de4 100644
--- a/crates/core/src/file_group/mod.rs
+++ b/crates/core/src/file_group/mod.rs
@@ -103,7 +103,15 @@ impl FileSlice {
.get_parquet_file_metadata(&self.base_file_relative_path())
.await?;
let num_records = parquet_meta.file_metadata().num_rows();
- let stats = FileStats { num_records };
+ let size_bytes = parquet_meta
+ .row_groups()
+ .iter()
+ .map(|rg| rg.total_byte_size())
+ .sum::<i64>();
+ let stats = FileStats {
+ num_records,
+ size_bytes,
+ };
self.base_file.stats = Some(stats);
}
Ok(())
diff --git a/crates/core/src/storage/file_stats.rs b/crates/core/src/storage/file_stats.rs
index 19a7000..b0d2bcb 100644
--- a/crates/core/src/storage/file_stats.rs
+++ b/crates/core/src/storage/file_stats.rs
@@ -17,7 +17,8 @@
* under the License.
*/
-#[derive(Clone, Debug, Default, Eq, PartialEq)]
+#[derive(Clone, Debug, Default)]
pub struct FileStats {
pub num_records: i64,
+ pub size_bytes: i64,
}
diff --git a/python/hudi/_internal.pyi b/python/hudi/_internal.pyi
index 0f83aee..acf2e16 100644
--- a/python/hudi/_internal.pyi
+++ b/python/hudi/_internal.pyi
@@ -29,6 +29,7 @@ class HudiFileSlice:
base_file_name: str
base_file_size: int
num_records: int
+ size_bytes: int
def base_file_relative_path(self) -> str: ...
diff --git a/python/src/internal.rs b/python/src/internal.rs
index 141a74e..7c52930 100644
--- a/python/src/internal.rs
+++ b/python/src/internal.rs
@@ -44,6 +44,8 @@ pub struct HudiFileSlice {
base_file_size: usize,
#[pyo3(get)]
num_records: i64,
+ #[pyo3(get)]
+ size_bytes: i64,
}
#[cfg(not(tarpaulin))]
@@ -69,7 +71,9 @@ fn convert_file_slice(f: &FileSlice) -> HudiFileSlice {
let commit_time = f.base_file.commit_time.to_string();
let base_file_name = f.base_file.info.name.clone();
let base_file_size = f.base_file.info.size;
- let num_records = f.base_file.stats.clone().unwrap_or_default().num_records;
+ let stats = f.base_file.stats.clone().unwrap_or_default();
+ let num_records = stats.num_records;
+ let size_bytes = stats.size_bytes;
HudiFileSlice {
file_group_id,
partition_path,
@@ -77,6 +81,7 @@ fn convert_file_slice(f: &FileSlice) -> HudiFileSlice {
base_file_name,
base_file_size,
num_records,
+ size_bytes,
}
}
diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py
index c3c84c9..6a517e5 100644
--- a/python/tests/test_table_read.py
+++ b/python/tests/test_table_read.py
@@ -55,6 +55,7 @@ def test_sample_table(get_sample_table):
"20240402144910683",
}
assert all(f.num_records == 1 for f in file_slices)
+ assert all(f.size_bytes > 0 for f in file_slices)
file_slice_paths = [f.base_file_relative_path() for f in file_slices]
assert set(file_slice_paths) == {
"chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",