leaves12138 commented on code in PR #22:
URL: https://github.com/apache/paimon-mosaic/pull/22#discussion_r3276159732
##########
include/mosaic.hpp:
##########
@@ -295,22 +298,30 @@ class Reader {
return out;
}
- std::vector<ColumnStatistics> get_row_group_statistics(uint32_t rg_index)
const {
+ std::vector<ColumnStatistics> get_row_group_statistics(uint32_t rg_index) {
uint32_t n = 0;
check(mosaic_reader_row_group_num_stats(handle_, rg_index, &n));
+ if (n == 0) return {};
+ std::vector<const char*> names(n);
+ std::vector<uint64_t> null_counts(n);
+ std::vector<const uint8_t*> min_ptrs(n);
+ std::vector<size_t> min_lens(n);
+ std::vector<const uint8_t*> max_ptrs(n);
+ std::vector<size_t> max_lens(n);
+ check(mosaic_reader_row_group_stats(handle_, rg_index,
+ names.data(), null_counts.data(),
+ min_ptrs.data(), min_lens.data(),
+ max_ptrs.data(), max_lens.data()));
std::vector<ColumnStatistics> result;
result.reserve(n);
for (uint32_t i = 0; i < n; i++) {
ColumnStatistics s;
- check(mosaic_reader_row_group_stat_column_index(handle_, rg_index,
i, &s.column_index));
- check(mosaic_reader_row_group_stat_null_count(handle_, rg_index,
i, &s.null_count));
- size_t min_len = 0, max_len = 0;
- const uint8_t* min_ptr = mosaic_reader_row_group_stat_min(handle_,
rg_index, i, &min_len);
- const uint8_t* max_ptr = mosaic_reader_row_group_stat_max(handle_,
rg_index, i, &max_len);
- if (min_ptr && min_len > 0)
- s.min_value.assign(min_ptr, min_ptr + min_len);
- if (max_ptr && max_len > 0)
- s.max_value.assign(max_ptr, max_ptr + max_len);
+ s.column_name = names[i] ? names[i] : "";
+ s.null_count = null_counts[i];
+ if (min_ptrs[i] && min_lens[i] > 0)
Review Comment:
Same zero-length stats issue exists in the C++ wrapper. Empty string min/max
values have length 0, so this skips assigning them; then
`ColumnStatistics::has_min_max()` (based on `!min_value.empty()`) reports
false. The wrapper needs to preserve min/max presence separately from byte
length, for example by adding explicit `has_min`/`has_max` flags (or optional
byte vectors) and assigning even when `*_lens[i] == 0` if the pointer is
non-null. Please fix both writer and reader stats paths and add a C++ test for
an empty-string min/max.
##########
python/mosaic/mosaic.py:
##########
@@ -59,6 +59,32 @@ def _check_error(msg="operation failed"):
raise RuntimeError(msg)
+def _fetch_rg_stats(num_stats_fn, stats_fn, handle, rg_index):
+ n_out = ctypes.c_uint32(0)
+ rc = num_stats_fn(handle, rg_index, ctypes.byref(n_out))
+ if rc != 0:
+ _check_error("num_stats failed")
+ n = n_out.value
+ if n == 0:
+ return {}
+ names = (ctypes.c_char_p * n)()
+ null_counts = (ctypes.c_uint64 * n)()
+ min_ptrs = (ctypes.POINTER(ctypes.c_uint8) * n)()
+ min_lens = (ctypes.c_size_t * n)()
+ max_ptrs = (ctypes.POINTER(ctypes.c_uint8) * n)()
+ max_lens = (ctypes.c_size_t * n)()
+ rc = stats_fn(handle, rg_index, names, null_counts, min_ptrs, min_lens,
max_ptrs, max_lens)
+ if rc != 0:
+ _check_error("row_group_stats failed")
+ result = {}
+ for i in range(n):
+ col_name = names[i].decode("utf-8")
+ min_val = ctypes.string_at(min_ptrs[i], min_lens[i]) if min_ptrs[i]
and min_lens[i] > 0 else None
Review Comment:
This treats a valid zero-length stats value as missing. For a UTF8 stats
column whose minimum is the empty string, the native side returns a non-null
pointer with `min_lens[i] == 0`, but this branch converts it to `None`, so
`has_min_max` becomes false even though the row group is not all-null. Please
use pointer presence to distinguish missing values from empty values, e.g. call
`ctypes.string_at(min_ptrs[i], min_lens[i])` whenever `min_ptrs[i]` is
non-null, even if the length is zero. The same applies to `max_val`, and it
would be good to add a Python test with stats over `["", "b"]`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]