This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 6c24acdbf ORC-1645: [C++] Evaluate stripe stats before load stripe
footer
6c24acdbf is described below
commit 6c24acdbf232b5323b7138d93b51806f3fa9fb01
Author: Smith Cruise <[email protected]>
AuthorDate: Mon Mar 4 11:24:32 2024 -0800
ORC-1645: [C++] Evaluate stripe stats before load stripe footer
### What changes were proposed in this pull request?
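Stripe statistics are stored in the metadata of the ORC file tail, which
means we can evaluate a stripe's statistics before loading that stripe's footer.
When the statistics show that the stripe can be skipped, its footer is never
read, which saves one IO request per skipped stripe.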
https://issues.apache.org/jira/browse/ORC-1645
### Why are the changes needed?
To reduce the number of IO operations (IOPS) issued when reading.
### How was this patch tested?
Passed the unit tests (UT).
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #1835 from Smith-Cruise/improve-metadata.
Authored-by: Smith Cruise <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/Reader.cc | 38 ++++++++++++++++++++------------------
1 file changed, 20 insertions(+), 18 deletions(-)
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 162f69e10..4e02f171a 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -1050,23 +1050,24 @@ namespace orc {
<< ", footerLength=" << currentStripeInfo.footer_length() << ")";
throw ParseError(msg.str());
}
- currentStripeFooter = getStripeFooter(currentStripeInfo,
*contents.get());
rowsInCurrentStripe = currentStripeInfo.number_of_rows();
processingStripe = currentStripe;
- if (sargsApplier) {
- bool isStripeNeeded = true;
- if (contents->metadata) {
- const auto& currentStripeStats =
-
contents->metadata->stripe_stats(static_cast<int>(currentStripe));
- // skip this stripe after stats fail to satisfy sargs
- uint64_t stripeRowGroupCount =
- (rowsInCurrentStripe + footer->row_index_stride() - 1) /
footer->row_index_stride();
- isStripeNeeded =
- sargsApplier->evaluateStripeStatistics(currentStripeStats,
stripeRowGroupCount);
- }
+ bool isStripeNeeded = true;
+ // If PPD enabled and stripe stats existed, evaulate it first
+ if (sargsApplier && contents->metadata) {
+ const auto& currentStripeStats =
+ contents->metadata->stripe_stats(static_cast<int>(currentStripe));
+ // skip this stripe after stats fail to satisfy sargs
+ uint64_t stripeRowGroupCount =
+ (rowsInCurrentStripe + footer->row_index_stride() - 1) /
footer->row_index_stride();
+ isStripeNeeded =
+ sargsApplier->evaluateStripeStatistics(currentStripeStats,
stripeRowGroupCount);
+ }
- if (isStripeNeeded) {
+ if (isStripeNeeded) {
+ currentStripeFooter = getStripeFooter(currentStripeInfo,
*contents.get());
+ if (sargsApplier) {
// read row group statistics and bloom filters of current stripe
loadStripeIndex();
@@ -1078,11 +1079,12 @@ namespace orc {
}
isStripeNeeded = false;
}
- if (!isStripeNeeded) {
- // advance to next stripe when current stripe has no matching rows
- currentStripe += 1;
- currentRowInStripe = 0;
- }
+ }
+
+ if (!isStripeNeeded) {
+ // advance to next stripe when current stripe has no matching rows
+ currentStripe += 1;
+ currentRowInStripe = 0;
}
} while (sargsApplier && currentStripe < lastStripe);