leaves12138 commented on code in PR #22:
URL: https://github.com/apache/paimon-mosaic/pull/22#discussion_r3276159732


##########
include/mosaic.hpp:
##########
@@ -295,22 +298,30 @@ class Reader {
         return out;
     }
 
-    std::vector<ColumnStatistics> get_row_group_statistics(uint32_t rg_index) 
const {
+    std::vector<ColumnStatistics> get_row_group_statistics(uint32_t rg_index) {
         uint32_t n = 0;
         check(mosaic_reader_row_group_num_stats(handle_, rg_index, &n));
+        if (n == 0) return {};
+        std::vector<const char*> names(n);
+        std::vector<uint64_t> null_counts(n);
+        std::vector<const uint8_t*> min_ptrs(n);
+        std::vector<size_t> min_lens(n);
+        std::vector<const uint8_t*> max_ptrs(n);
+        std::vector<size_t> max_lens(n);
+        check(mosaic_reader_row_group_stats(handle_, rg_index,
+            names.data(), null_counts.data(),
+            min_ptrs.data(), min_lens.data(),
+            max_ptrs.data(), max_lens.data()));
         std::vector<ColumnStatistics> result;
         result.reserve(n);
         for (uint32_t i = 0; i < n; i++) {
             ColumnStatistics s;
-            check(mosaic_reader_row_group_stat_column_index(handle_, rg_index, 
i, &s.column_index));
-            check(mosaic_reader_row_group_stat_null_count(handle_, rg_index, 
i, &s.null_count));
-            size_t min_len = 0, max_len = 0;
-            const uint8_t* min_ptr = mosaic_reader_row_group_stat_min(handle_, 
rg_index, i, &min_len);
-            const uint8_t* max_ptr = mosaic_reader_row_group_stat_max(handle_, 
rg_index, i, &max_len);
-            if (min_ptr && min_len > 0)
-                s.min_value.assign(min_ptr, min_ptr + min_len);
-            if (max_ptr && max_len > 0)
-                s.max_value.assign(max_ptr, max_ptr + max_len);
+            s.column_name = names[i] ? names[i] : "";
+            s.null_count = null_counts[i];
+            if (min_ptrs[i] && min_lens[i] > 0)

Review Comment:
   Same zero-length stats issue exists in the C++ wrapper. Empty string min/max 
values have length 0, so this skips assigning them; then 
`ColumnStatistics::has_min_max()` (based on `!min_value.empty()`) reports 
false. The wrapper needs to preserve min/max presence separately from byte 
length, for example by adding explicit `has_min`/`has_max` flags (or optional 
byte vectors) and assigning even when `*_lens[i] == 0` if the pointer is 
non-null. Please fix both writer and reader stats paths and add a C++ test for 
an empty-string min/max.



##########
python/mosaic/mosaic.py:
##########
@@ -59,6 +59,32 @@ def _check_error(msg="operation failed"):
     raise RuntimeError(msg)
 
 
+def _fetch_rg_stats(num_stats_fn, stats_fn, handle, rg_index):
+    n_out = ctypes.c_uint32(0)
+    rc = num_stats_fn(handle, rg_index, ctypes.byref(n_out))
+    if rc != 0:
+        _check_error("num_stats failed")
+    n = n_out.value
+    if n == 0:
+        return {}
+    names = (ctypes.c_char_p * n)()
+    null_counts = (ctypes.c_uint64 * n)()
+    min_ptrs = (ctypes.POINTER(ctypes.c_uint8) * n)()
+    min_lens = (ctypes.c_size_t * n)()
+    max_ptrs = (ctypes.POINTER(ctypes.c_uint8) * n)()
+    max_lens = (ctypes.c_size_t * n)()
+    rc = stats_fn(handle, rg_index, names, null_counts, min_ptrs, min_lens, 
max_ptrs, max_lens)
+    if rc != 0:
+        _check_error("row_group_stats failed")
+    result = {}
+    for i in range(n):
+        col_name = names[i].decode("utf-8")
+        min_val = ctypes.string_at(min_ptrs[i], min_lens[i]) if min_ptrs[i] 
and min_lens[i] > 0 else None

Review Comment:
   This treats a valid zero-length stats value as missing. For a UTF8 stats 
column whose minimum is the empty string, the native side returns a non-null 
pointer with `min_lens[i] == 0`, but this branch converts it to `None`, so 
`has_min_max` becomes false even though the row group is not all-null. Please 
use pointer presence to distinguish missing values from empty values, e.g. call 
`ctypes.string_at(min_ptrs[i], min_lens[i])` whenever `min_ptrs[i]` is 
non-null, even if the length is zero. The same applies to `max_val`, and it 
would be good to add a Python test with stats over `["", "b"]`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to