This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new cf85da122 ORC-2022: [C++] Add support to use dictionary for IN
expression
cf85da122 is described below
commit cf85da12204c399dc4af72007e8397d65fac45e9
Author: luffy-zh <[email protected]>
AuthorDate: Wed Nov 5 10:54:04 2025 +0800
ORC-2022: [C++] Add support to use dictionary for IN expression
### What changes were proposed in this pull request?
Use column dictionaries to evaluate IN predicates.
### Why are the changes needed?
Optimize IN predicate pruning: consult column dictionary (when reasonably
sized) instead of relying on less effective min/max statistics.
### How was this patch tested?
Unit tests in TestPredicatePushdown.cc verify this change.
### Was this patch authored or co-authored using generative AI tooling?
NO.
Closes #2453 from luffy-zh/ORC-2022.
Lead-authored-by: luffy-zh <[email protected]>
Co-authored-by: shouzhi <[email protected]>
Co-authored-by: Hao Zou <[email protected]>
Co-authored-by: Gang Wu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
c++/include/orc/Reader.hh | 15 +++++
c++/include/orc/sargs/Literal.hh | 3 +
c++/src/CMakeLists.txt | 1 +
c++/src/ColumnReader.cc | 70 ++++++++-------------
c++/src/ColumnReader.hh | 7 +++
c++/src/DictionaryLoader.cc | 100 ++++++++++++++++++++++++++++++
c++/src/DictionaryLoader.hh | 57 +++++++++++++++++
c++/src/Options.hh | 12 ++++
c++/src/Reader.cc | 107 ++++++++++++++++++++++++++++++--
c++/src/Reader.hh | 12 ++++
c++/src/StripeStream.cc | 5 ++
c++/src/StripeStream.hh | 2 +
c++/src/meson.build | 1 +
c++/src/sargs/Literal.cc | 5 ++
c++/src/sargs/SargsApplier.cc | 125 +++++++++++++++++++++++++++++++++++++-
c++/src/sargs/SargsApplier.hh | 50 ++++++++++++---
c++/test/MockStripeStreams.hh | 1 +
c++/test/TestPredicatePushdown.cc | 119 ++++++++++++++++++++++++++++++++++++
c++/test/TestSargsApplier.cc | 12 ++--
19 files changed, 641 insertions(+), 63 deletions(-)
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 506f088d6..5a8899453 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -408,6 +408,21 @@ namespace orc {
* Get the number of stripes to look ahead for small stripe prefetch.
*/
uint64_t getSmallStripeLookAheadLimit() const;
+
+ /**
+ * Set the maximum dictionary size threshold for evaluation.
+ *
+ * Dictionaries with more entries than this threshold will not be
evaluated.
+ * 0 to disable dictionary filtering.
+ *
+ * Defaults to 0.
+ */
+ RowReaderOptions& setDictionaryFilteringSizeThreshold(uint32_t threshold);
+
+ /**
+ * Get the dictionary filtering size threshold.
+ */
+ uint32_t getDictionaryFilteringSizeThreshold() const;
};
class RowReader;
diff --git a/c++/include/orc/sargs/Literal.hh b/c++/include/orc/sargs/Literal.hh
index f7d37005a..821790145 100644
--- a/c++/include/orc/sargs/Literal.hh
+++ b/c++/include/orc/sargs/Literal.hh
@@ -22,6 +22,8 @@
#include "orc/Int128.hh"
#include "orc/Vector.hh"
+#include <string_view>
+
namespace orc {
/**
@@ -123,6 +125,7 @@ namespace orc {
Timestamp getTimestamp() const;
double getFloat() const;
std::string getString() const;
+ std::string_view getStringView() const;
bool getBool() const;
Decimal getDecimal() const;
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 2f81bb802..a1fd549ce 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -169,6 +169,7 @@ set(SOURCE_FILES
ConvertColumnReader.cc
CpuInfoUtil.cc
Dictionary.cc
+ DictionaryLoader.cc
Exceptions.cc
Geospatial.cc
Int128.cc
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index d087482f9..8a29418c3 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -16,15 +16,18 @@
* limitations under the License.
*/
-#include "orc/Int128.hh"
+#include "ColumnReader.hh"
+
+#include <cmath>
#include "Adaptor.hh"
#include "ByteRLE.hh"
-#include "ColumnReader.hh"
#include "ConvertColumnReader.hh"
+#include "DictionaryLoader.hh"
#include "RLE.hh"
#include "SchemaEvolution.hh"
#include "orc/Exceptions.hh"
+#include "orc/Int128.hh"
#include <math.h>
#include <iostream>
@@ -36,19 +39,6 @@ namespace orc {
// PASS
}
- inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
- switch (static_cast<int64_t>(kind)) {
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- return RleVersion_1;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return RleVersion_2;
- default:
- throw ParseError("Unknown encoding in convertRleVersion");
- }
- }
-
ColumnReader::ColumnReader(const Type& type, StripeStreams& stripe)
: columnId(type.getColumnId()),
memoryPool(stripe.getMemoryPool()),
@@ -519,7 +509,10 @@ namespace orc {
std::unique_ptr<RleDecoder> rle_;
public:
- StringDictionaryColumnReader(const Type& type, StripeStreams& stipe);
+ StringDictionaryColumnReader(const Type& type, StripeStreams& stripe);
+
+ StringDictionaryColumnReader(const Type& type, StripeStreams& stripe,
+ const std::shared_ptr<StringDictionary>
dictionary);
~StringDictionaryColumnReader() override;
uint64_t skip(uint64_t numValues) override;
@@ -533,39 +526,23 @@ namespace orc {
StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type,
StripeStreams&
stripe)
- : ColumnReader(type, stripe), dictionary_(new
StringDictionary(stripe.getMemoryPool())) {
+ : StringDictionaryColumnReader(type, stripe, nullptr) {}
+
+ StringDictionaryColumnReader::StringDictionaryColumnReader(
+ const Type& type, StripeStreams& stripe, const
std::shared_ptr<StringDictionary> dictionary)
+ : ColumnReader(type, stripe), dictionary_(dictionary) {
RleVersion rleVersion =
convertRleVersion(stripe.getEncoding(columnId).kind());
- uint32_t dictSize = stripe.getEncoding(columnId).dictionary_size();
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) {
throw ParseError("DATA stream not found in StringDictionaryColumn");
}
rle_ = createRleDecoder(std::move(stream), false, rleVersion, memoryPool,
metrics);
- stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
- if (dictSize > 0 && stream == nullptr) {
- throw ParseError("LENGTH stream not found in StringDictionaryColumn");
- }
- std::unique_ptr<RleDecoder> lengthDecoder =
- createRleDecoder(std::move(stream), false, rleVersion, memoryPool,
metrics);
- dictionary_->dictionaryOffset.resize(dictSize + 1);
- int64_t* lengthArray = dictionary_->dictionaryOffset.data();
- lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
- lengthArray[0] = 0;
- for (uint32_t i = 1; i < dictSize + 1; ++i) {
- if (lengthArray[i] < 0) {
- throw ParseError("Negative dictionary entry length");
- }
- lengthArray[i] += lengthArray[i - 1];
- }
- int64_t blobSize = lengthArray[dictSize];
- dictionary_->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
- std::unique_ptr<SeekableInputStream> blobStream =
- stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
- if (blobSize > 0 && blobStream == nullptr) {
- throw ParseError("DICTIONARY_DATA stream not found in
StringDictionaryColumn");
+
+ // If no dictionary was provided, load it
+ if (!dictionary_) {
+ dictionary_ = loadStringDictionary(columnId, stripe, memoryPool);
}
- readFully(dictionary_->dictionaryBlob.data(), blobSize, blobStream.get());
}
StringDictionaryColumnReader::~StringDictionaryColumnReader() {
@@ -1717,8 +1694,15 @@ namespace orc {
case GEOGRAPHY:
switch
(static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return std::make_unique<StringDictionaryColumnReader>(type,
stripe);
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2: {
+ // Check if we have a pre-loaded dictionary we can use
+ auto dictionary = stripe.getSharedDictionary(type.getColumnId());
+ if (dictionary) {
+ return std::make_unique<StringDictionaryColumnReader>(type,
stripe, dictionary);
+ } else {
+ return std::unique_ptr<ColumnReader>(new
StringDictionaryColumnReader(type, stripe));
+ }
+ }
case proto::ColumnEncoding_Kind_DIRECT:
case proto::ColumnEncoding_Kind_DIRECT_V2:
return std::make_unique<StringDirectColumnReader>(type, stripe);
diff --git a/c++/src/ColumnReader.hh b/c++/src/ColumnReader.hh
index f0f3fe1b5..4a3fe2ac4 100644
--- a/c++/src/ColumnReader.hh
+++ b/c++/src/ColumnReader.hh
@@ -97,6 +97,13 @@ namespace orc {
* @return the number of scale digits
*/
virtual int32_t getForcedScaleOnHive11Decimal() const = 0;
+
+ /**
+ * Get a shared dictionary for the given column if available.
+ * @param columnId the id of the column
+ * @return shared pointer to the StringDictionary or nullptr if not
available
+ */
+ virtual std::shared_ptr<StringDictionary> getSharedDictionary(uint64_t
columnId) const = 0;
/**
* Whether decimals that have precision <=18 are encoded as fixed scale
and values
diff --git a/c++/src/DictionaryLoader.cc b/c++/src/DictionaryLoader.cc
new file mode 100644
index 000000000..428d288d5
--- /dev/null
+++ b/c++/src/DictionaryLoader.cc
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DictionaryLoader.hh"
+#include "RLE.hh"
+
+namespace orc {
+
+ namespace {
+
+ // Helper function to read data fully from a stream
+ void readFully(char* buffer, int64_t bufferSize, SeekableInputStream*
stream) {
+ int64_t posn = 0;
+ while (posn < bufferSize) {
+ const void* chunk;
+ int length;
+ if (!stream->Next(&chunk, &length)) {
+ throw ParseError("bad read in readFully");
+ }
+ if (posn + length > bufferSize) {
+ throw ParseError("Corrupt dictionary blob");
+ }
+ memcpy(buffer + posn, chunk, static_cast<size_t>(length));
+ posn += length;
+ }
+ }
+
+ } // namespace
+
+ std::shared_ptr<StringDictionary> loadStringDictionary(uint64_t columnId,
StripeStreams& stripe,
+ MemoryPool& pool) {
+ // Get encoding information
+ proto::ColumnEncoding encoding = stripe.getEncoding(columnId);
+ RleVersion rleVersion = convertRleVersion(encoding.kind());
+ uint32_t dictSize = encoding.dictionary_size();
+
+ // Create the dictionary object
+ auto dictionary = std::make_shared<StringDictionary>(pool);
+
+ // Read LENGTH stream to get dictionary entry lengths
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
+ if (dictSize > 0 && stream == nullptr) {
+ std::stringstream ss;
+ ss << "LENGTH stream not found in StringDictionaryColumn for column " <<
columnId;
+ throw ParseError(ss.str());
+ }
+ std::unique_ptr<RleDecoder> lengthDecoder =
+ createRleDecoder(std::move(stream), false, rleVersion, pool,
stripe.getReaderMetrics());
+
+ // Decode dictionary entry lengths
+ dictionary->dictionaryOffset.resize(dictSize + 1);
+ int64_t* lengthArray = dictionary->dictionaryOffset.data();
+ lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
+ lengthArray[0] = 0;
+
+ // Convert lengths to cumulative offsets
+ for (uint32_t i = 1; i < dictSize + 1; ++i) {
+ if (lengthArray[i] < 0) {
+ std::stringstream ss;
+ ss << "Negative dictionary entry length for column " << columnId;
+ throw ParseError(ss.str());
+ }
+ lengthArray[i] += lengthArray[i - 1];
+ }
+
+ int64_t blobSize = lengthArray[dictSize];
+
+ // Read DICTIONARY_DATA stream to get dictionary content
+ dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
+ std::unique_ptr<SeekableInputStream> blobStream =
+ stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
+ if (blobSize > 0 && blobStream == nullptr) {
+ std::stringstream ss;
+ ss << "DICTIONARY_DATA stream not found in StringDictionaryColumn for
column " << columnId;
+ throw ParseError(ss.str());
+ }
+
+ // Read the dictionary blob
+ readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get());
+
+ return dictionary;
+ }
+
+} // namespace orc
\ No newline at end of file
diff --git a/c++/src/DictionaryLoader.hh b/c++/src/DictionaryLoader.hh
new file mode 100644
index 000000000..64df45afe
--- /dev/null
+++ b/c++/src/DictionaryLoader.hh
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_DICTIONARY_LOADER_HH
+#define ORC_DICTIONARY_LOADER_HH
+
+#include "ColumnReader.hh"
+#include "orc/Vector.hh"
+
+namespace orc {
+
+ /**
+ * Load a string dictionary for a single column from a stripe.
+ * This function reads the LENGTH and DICTIONARY_DATA streams and populates
+ * the StringDictionary structure. It automatically uses ReadCache if
available
+ * through the StripeStreams interface.
+ *
+ * @param columnId the column ID to load the dictionary for
+ * @param stripe the StripeStreams interface providing access to streams
+ * @param pool the memory pool to use for allocating the dictionary
+ * @return a shared pointer to the loaded StringDictionary, or nullptr if
loading fails
+ */
+ std::shared_ptr<StringDictionary> loadStringDictionary(uint64_t columnId,
StripeStreams& stripe,
+ MemoryPool& pool);
+
+ // Helper function to convert encoding kind to RLE version
+ inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
+ switch (static_cast<int64_t>(kind)) {
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ return RleVersion_1;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return RleVersion_2;
+ default:
+ throw ParseError("Unknown encoding in convertRleVersion");
+ }
+ }
+
+} // namespace orc
+
+#endif
diff --git a/c++/src/Options.hh b/c++/src/Options.hh
index b71edcd42..c0abf190c 100644
--- a/c++/src/Options.hh
+++ b/c++/src/Options.hh
@@ -25,6 +25,7 @@
#include "io/Cache.hh"
+#include <cstdint>
#include <iostream>
#include <limits>
@@ -156,6 +157,7 @@ namespace orc {
bool throwOnSchemaEvolutionOverflow;
bool enableAsyncPrefetch;
uint64_t smallStripeLookAheadLimit;
+ uint32_t dictionaryFilteringSizeThreshold;
RowReaderOptionsPrivate() {
selection = ColumnSelection_NONE;
@@ -169,6 +171,7 @@ namespace orc {
throwOnSchemaEvolutionOverflow = false;
enableAsyncPrefetch = false;
smallStripeLookAheadLimit = 8;
+ dictionaryFilteringSizeThreshold = 0;
}
};
@@ -362,6 +365,15 @@ namespace orc {
return privateBits_->smallStripeLookAheadLimit;
}
+ RowReaderOptions&
RowReaderOptions::setDictionaryFilteringSizeThreshold(uint32_t threshold) {
+ privateBits_->dictionaryFilteringSizeThreshold = threshold;
+ return *this;
+ }
+
+ uint32_t RowReaderOptions::getDictionaryFilteringSizeThreshold() const {
+ return privateBits_->dictionaryFilteringSizeThreshold;
+ }
+
} // namespace orc
#endif
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index ab4c5047d..4fd1a73a9 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -19,7 +19,9 @@
#include "Reader.hh"
#include "Adaptor.hh"
#include "BloomFilter.hh"
+#include "DictionaryLoader.hh"
#include "Options.hh"
+#include "RLE.hh"
#include "Statistics.hh"
#include "StripeStream.hh"
#include "Utils.hh"
@@ -32,6 +34,7 @@
#include <set>
#include <sstream>
#include <string>
+#include <unordered_map>
#include <vector>
namespace orc {
@@ -347,9 +350,10 @@ namespace orc {
// prepare SargsApplier if SearchArgument is available
if (opts.getSearchArgument() && footer_->row_index_stride() > 0) {
sargs_ = opts.getSearchArgument();
- sargsApplier_.reset(
- new SargsApplier(*contents_->schema, sargs_.get(),
footer_->row_index_stride(),
- getWriterVersionImpl(contents.get()),
contents_->readerMetrics));
+ sargsApplier_ = std::make_unique<SargsApplier>(
+ *contents_->schema, sargs_.get(), footer_->row_index_stride(),
+ getWriterVersionImpl(contents.get()),
opts.getDictionaryFilteringSizeThreshold(),
+ contents_->readerMetrics, &schemaEvolution_);
}
skipBloomFilters_ = hasBadBloomFilters();
@@ -1119,10 +1123,97 @@ namespace orc {
return getStripeSize(stripeInfo) <= threshold;
}
+ /**
+ * Load stripe dictionaries for dictionary-based predicate pushdown.
+ * Only loads dictionaries for STRING/VARCHAR/CHAR columns with IN
expressions.
+ */
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
loadStripeDictionaries(
+ const proto::Footer& footer, const std::vector<bool>& selectedColumns,
+ const std::vector<uint64_t>& columnsWithInExpr, StripeStreams& stripe,
+ size_t dictSizeThreshold) {
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
dictionaries;
+
+ // Only load dictionaries for selected columns with IN expressions
+ for (uint64_t colId : columnsWithInExpr) {
+ if (!selectedColumns[colId] || colId >=
static_cast<uint64_t>(footer.types_size())) {
+ continue;
+ }
+
+ auto encoding = stripe.getEncoding(colId);
+ if (encoding.kind() != proto::ColumnEncoding_Kind_DICTIONARY &&
+ encoding.kind() != proto::ColumnEncoding_Kind_DICTIONARY_V2) {
+ continue;
+ }
+
+ auto typeKind = footer.types(static_cast<int>(colId)).kind();
+ if (typeKind != proto::Type_Kind_STRING && typeKind !=
proto::Type_Kind_VARCHAR &&
+ typeKind != proto::Type_Kind_CHAR) {
+ continue;
+ }
+
+ if (encoding.dictionary_size() > dictSizeThreshold) {
+ continue;
+ }
+
+ dictionaries[colId] = loadStringDictionary(colId, stripe,
stripe.getMemoryPool());
+ }
+
+ return dictionaries;
+ }
+
+ // Evaluate dictionaries for the current stripe to determine if it can be
+ // skipped.
+ bool evaluateStripeDictionaries(RowReaderImpl& reader, const proto::Footer&
footer,
+ const std::vector<bool>& selectedColumns,
+ const proto::StripeFooter& stripeFooter,
+ const proto::StripeInformation& stripeInfo,
+ uint64_t currentStripe, SargsApplier*
sargsApplier,
+ const Timezone& localTimezone, const
Timezone& readerTimezone) {
+ const std::vector<uint64_t>& columnsWithInExpr =
sargsApplier->getColumnsWithInExpressions();
+ if (columnsWithInExpr.empty()) {
+ return true;
+ }
+
+ const Timezone& writerTimezone = stripeFooter.has_writer_timezone()
+ ?
getTimezoneByName(stripeFooter.writer_timezone())
+ : localTimezone;
+ StripeStreamsImpl stripeStreams(reader, currentStripe, stripeInfo,
stripeFooter,
+ stripeInfo.offset(),
*reader.getFileContents().stream,
+ writerTimezone, readerTimezone);
+
+ auto dictionaries =
+ loadStripeDictionaries(footer, selectedColumns, columnsWithInExpr,
stripeStreams,
+
sargsApplier->getDictionaryFilteringSizeThreshold());
+ if (!dictionaries.empty()) {
+ // Store the loaded dictionaries for use by ColumnReaders
+ reader.setSharedDictionaries(dictionaries);
+
+ return sargsApplier->evaluateColumnDictionaries(dictionaries);
+ }
+
+ return true;
+ }
+
+ std::shared_ptr<StringDictionary>
RowReaderImpl::getSharedDictionary(uint64_t columnId) const {
+ auto it = sharedDictionaries_.find(columnId);
+ if (it != sharedDictionaries_.end()) {
+ return it->second;
+ }
+ return nullptr;
+ }
+
+ void RowReaderImpl::setSharedDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries) {
+ for (const auto& pair : dictionaries) {
+ sharedDictionaries_[pair.first] = pair.second;
+ }
+ }
+
void RowReaderImpl::startNextStripe() {
reader_.reset(); // ColumnReaders use lots of memory; free old memory
first
rowIndexes_.clear();
bloomFilterIndex_.clear();
+ sharedDictionaries_.clear(); // Clear dictionaries from previous stripe
// evaluate file statistics if it exists
if (sargsApplier_ &&
@@ -1164,7 +1255,15 @@ namespace orc {
if (isStripeNeeded) {
currentStripeFooter_ = getStripeFooter(currentStripeInfo_,
*contents_.get());
- if (sargsApplier_) {
+
+ if (sargsApplier_ &&
sargsApplier_->getDictionaryFilteringSizeThreshold() > 0) {
+ // evaluate dictionaries for predicate pushdown
+ isStripeNeeded = evaluateStripeDictionaries(
+ *this, *footer_, selectedColumns_, currentStripeFooter_,
currentStripeInfo_,
+ currentStripe_, sargsApplier_.get(), localTimezone_,
readerTimezone_);
+ }
+
+ if (sargsApplier_ && isStripeNeeded) {
// read row group statistics and bloom filters of current stripe
loadStripeIndex();
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 966281cce..132f92ebb 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -193,6 +193,9 @@ namespace orc {
// match read and file types
SchemaEvolution schemaEvolution_;
+ // Dictionary optimization
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
sharedDictionaries_;
+
// load stripe index if not done so
void loadStripeIndex();
@@ -266,6 +269,15 @@ namespace orc {
std::shared_ptr<ReadRangeCache> getReadCache() const {
return contents_->readCache;
}
+
+ // Method to set shared dictionaries from external functions
+ void setSharedDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries);
+
+ // Method to get a shared dictionary by column id
+ std::shared_ptr<StringDictionary> getSharedDictionary(uint64_t columnId)
const;
+
+ private:
};
class ReaderImpl : public Reader {
diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc
index a5609f762..f12a28b2c 100644
--- a/c++/src/StripeStream.cc
+++ b/c++/src/StripeStream.cc
@@ -174,4 +174,9 @@ namespace orc {
streamOffset, stripeFooter_->streams(static_cast<int>(streamId)));
}
+ std::shared_ptr<StringDictionary> StripeStreamsImpl::getSharedDictionary(
+ uint64_t columnId) const {
+ return reader_.getSharedDictionary(columnId);
+ }
+
} // namespace orc
diff --git a/c++/src/StripeStream.hh b/c++/src/StripeStream.hh
index 2d26f8575..eb31d77aa 100644
--- a/c++/src/StripeStream.hh
+++ b/c++/src/StripeStream.hh
@@ -81,6 +81,8 @@ namespace orc {
int32_t getForcedScaleOnHive11Decimal() const override;
const SchemaEvolution* getSchemaEvolution() const override;
+
+ std::shared_ptr<StringDictionary> getSharedDictionary(uint64_t columnId)
const override;
};
/**
diff --git a/c++/src/meson.build b/c++/src/meson.build
index 885df0072..6dfea9ab6 100644
--- a/c++/src/meson.build
+++ b/c++/src/meson.build
@@ -151,6 +151,7 @@ source_files += files(
'ConvertColumnReader.cc',
'CpuInfoUtil.cc',
'Dictionary.cc',
+ 'DictionaryLoader.cc',
'Exceptions.cc',
'Geospatial.cc',
'Int128.cc',
diff --git a/c++/src/sargs/Literal.cc b/c++/src/sargs/Literal.cc
index f36db7943..f323c061a 100644
--- a/c++/src/sargs/Literal.cc
+++ b/c++/src/sargs/Literal.cc
@@ -293,6 +293,11 @@ namespace orc {
return std::string(value_.Buffer, size_);
}
+ std::string_view Literal::getStringView() const {
+ validate(isNull_, type_, PredicateDataType::STRING);
+ return std::string_view(value_.Buffer, size_);
+ }
+
bool Literal::getBool() const {
validate(isNull_, type_, PredicateDataType::BOOLEAN);
return value_.BooleanVal;
diff --git a/c++/src/sargs/SargsApplier.cc b/c++/src/sargs/SargsApplier.cc
index b3085964d..5c7aa10ef 100644
--- a/c++/src/sargs/SargsApplier.cc
+++ b/c++/src/sargs/SargsApplier.cc
@@ -17,7 +17,11 @@
*/
#include "SargsApplier.hh"
+#include "Dictionary.hh"
+#include "sargs/PredicateLeaf.hh"
+
#include <numeric>
+#include <set>
namespace orc {
@@ -39,17 +43,21 @@ namespace orc {
SargsApplier::SargsApplier(const Type& type, const SearchArgument*
searchArgument,
uint64_t rowIndexStride, WriterVersion
writerVersion,
- ReaderMetrics* metrics, const SchemaEvolution*
schemaEvolution)
+ size_t dictionaryFilteringSizeThreshold,
ReaderMetrics* metrics,
+ const SchemaEvolution* schemaEvolution)
: type_(type),
searchArgument_(searchArgument),
schemaEvolution_(schemaEvolution),
rowIndexStride_(rowIndexStride),
writerVersion_(writerVersion),
+ dictionaryFilteringSizeThreshold_(dictionaryFilteringSizeThreshold),
hasEvaluatedFileStats_(false),
fileStatsEvalResult_(true),
metrics_(metrics) {
const SearchArgumentImpl* sargs = dynamic_cast<const
SearchArgumentImpl*>(searchArgument_);
+ std::set<uint64_t> columnsWithInExpr;
+
// find the mapping from predicate leaves to columns
const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
filterColumns_.resize(leaves.size(), INVALID_COLUMN_ID);
@@ -59,7 +67,16 @@ namespace orc {
} else {
filterColumns_[i] = leaves[i].getColumnId();
}
+
+ if (leaves[i].getOperator() == PredicateLeaf::Operator::IN) {
+ uint64_t columnId = filterColumns_[i];
+ if (columnId != INVALID_COLUMN_ID) {
+ columnsWithInExpr.insert(columnId);
+ }
+ }
}
+
+ columnsWithInExpr_.assign(columnsWithInExpr.begin(),
columnsWithInExpr.end());
}
bool SargsApplier::pickRowGroups(uint64_t rowsInStripe,
@@ -185,4 +202,110 @@ namespace orc {
}
return fileStatsEvalResult_;
}
+
+ TruthValue SargsApplier::evaluateDictionaryForColumn(const StringDictionary&
dictionary,
+ const PredicateLeaf&
leaf) const {
+ // Only handle IN expressions for dictionary filtering
+ if (leaf.getOperator() != PredicateLeaf::Operator::IN) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ const std::vector<Literal>& literals = leaf.getLiteralList();
+ if (literals.empty()) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ // Pre-compute string views for literals to avoid repeated function calls
+ std::vector<std::string_view> literalViews;
+ literalViews.reserve(literals.size());
+ for (const auto& literal : literals) {
+ literalViews.emplace_back(literal.getStringView());
+ }
+
+ // Check if any dictionary entry matches any literal in the IN list
+ const int64_t* offsets = dictionary.dictionaryOffset.data();
+ const char* blob = dictionary.dictionaryBlob.data();
+ size_t dictSize = dictionary.dictionaryOffset.size() - 1;
+
+ // Use a set to store matching dictionary entries
+ size_t matchedEntriesCount = 0;
+
+ for (size_t i = 0; i < dictSize; ++i) {
+ int64_t start = offsets[i];
+ int64_t length = offsets[i + 1] - start;
+ std::string_view dictEntry(blob + start, static_cast<size_t>(length));
+
+ // Check if this dictionary entry matches any literal in the IN list
+ for (const auto& literalView : literalViews) {
+ if (dictEntry == literalView) {
+ matchedEntriesCount++;
+ break;
+ }
+ }
+ }
+
+ // If all dictionary entries match, return YES
+ if (matchedEntriesCount == dictSize) {
+ return TruthValue::YES;
+ }
+ // If any dictionary entry matches, stripe might contain matching rows
+ else if (matchedEntriesCount != 0) {
+ return TruthValue::YES_NO_NULL;
+ }
+ // No dictionary entry matches any literal in the IN list - skip stripe
+ else {
+ return TruthValue::NO;
+ }
+ }
+
+ bool SargsApplier::evaluateColumnDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries) {
+ const SearchArgumentImpl* sargs = dynamic_cast<const
SearchArgumentImpl*>(searchArgument_);
+ if (sargs == nullptr) {
+ return true; // Cannot evaluate, assume stripe is needed
+ }
+
+ const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
+ std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL);
+
+ // Evaluate each predicate leaf against dictionaries (only IN expressions)
+ for (size_t pred = 0; pred != leaves.size(); ++pred) {
+ uint64_t columnId = filterColumns_[pred];
+
+ // Only evaluate IN expressions
+ if (leaves[pred].getOperator() != PredicateLeaf::Operator::IN) {
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Check if this column has a dictionary
+ auto dictIter = dictionaries.find(columnId);
+ if (columnId == INVALID_COLUMN_ID || dictIter == dictionaries.cend() ||
+ dictIter->second == nullptr) {
+ // No dictionary for this column, cannot evaluate
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Check if schema evolution makes PPD unsafe
+ if (schemaEvolution_ &&
!schemaEvolution_->isSafePPDConversion(columnId)) {
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Check if this is a STRING/VARCHAR/CHAR column
+ PredicateDataType predType = leaves[pred].getType();
+ if (predType != PredicateDataType::STRING) {
+ // Only string types are supported for dictionary filtering
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Evaluate the IN expression against the dictionary
+ leafValues[pred] = evaluateDictionaryForColumn(*dictIter->second,
leaves[pred]);
+ }
+
+ return isNeeded(searchArgument_->evaluate(leafValues));
+ }
+
} // namespace orc
diff --git a/c++/src/sargs/SargsApplier.hh b/c++/src/sargs/SargsApplier.hh
index 65c8dec83..b9e85d508 100644
--- a/c++/src/sargs/SargsApplier.hh
+++ b/c++/src/sargs/SargsApplier.hh
@@ -19,25 +19,24 @@
#ifndef ORC_SARGSAPPLIER_HH
#define ORC_SARGSAPPLIER_HH
-#include <orc/Common.hh>
+#include "SchemaEvolution.hh"
#include "orc/BloomFilter.hh"
+#include "orc/Common.hh"
#include "orc/Reader.hh"
#include "orc/Type.hh"
-#include "wrap/orc-proto-wrapper.hh"
-
#include "sargs/SearchArgument.hh"
-#include "SchemaEvolution.hh"
-
#include <unordered_map>
namespace orc {
+ struct StringDictionary;
+
class SargsApplier {
public:
SargsApplier(const Type& type, const SearchArgument* searchArgument,
uint64_t rowIndexStride,
- WriterVersion writerVersion, ReaderMetrics* metrics,
- const SchemaEvolution* schemaEvolution = nullptr);
+ WriterVersion writerVersion, size_t
dictionaryFilteringSizeThreshold,
+ ReaderMetrics* metrics, const SchemaEvolution*
schemaEvolution);
/**
* Evaluate search argument on file statistics
@@ -60,6 +59,18 @@ namespace orc {
bool evaluateStripeStatistics(const proto::StripeStatistics& stripeStats,
uint64_t stripeRowGroupCount);
+ /**
+ * Evaluate search argument on column dictionaries (only IN expressions)
+ * If dictionary entries don't satisfy the sargs,
+ * the EvaluatedRowGroupCount of Reader Metrics will be updated.
+ * Otherwise, Reader Metrics will not be updated and
+ * will require further evaluation.
+ * @param dictionaries map from column ID to StringDictionary
+ * @return true if any dictionary entry satisfies the sargs
+ */
+ bool evaluateColumnDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries);
+
/**
* TODO: use proto::RowIndex and proto::BloomFilter to do the evaluation
* Pick the row groups that we need to load from the current stripe.
@@ -114,11 +125,29 @@ namespace orc {
}
}
+ /**
+ * Get list of column IDs that have IN expressions for dictionary filtering
+ */
+ const std::vector<uint64_t>& getColumnsWithInExpressions() const {
+ return columnsWithInExpr_;
+ }
+
+ /**
+ * Get the dictionary filtering size threshold
+ */
+ size_t getDictionaryFilteringSizeThreshold() const {
+ return dictionaryFilteringSizeThreshold_;
+ }
+
private:
// evaluate column statistics in the form of protobuf::RepeatedPtrField
typedef ::google::protobuf::RepeatedPtrField<proto::ColumnStatistics>
PbColumnStatistics;
bool evaluateColumnStatistics(const PbColumnStatistics& colStats) const;
+ // Helper method to evaluate IN expression against a dictionary
+ TruthValue evaluateDictionaryForColumn(const StringDictionary& dictionary,
+ const PredicateLeaf& leaf) const;
+
friend class TestSargsApplier_findColumnTest_Test;
friend class TestSargsApplier_findArrayColumnTest_Test;
friend class TestSargsApplier_findMapColumnTest_Test;
@@ -128,10 +157,13 @@ namespace orc {
const Type& type_;
const SearchArgument* searchArgument_;
const SchemaEvolution* schemaEvolution_;
- uint64_t rowIndexStride_;
- WriterVersion writerVersion_;
+ const uint64_t rowIndexStride_;
+ const WriterVersion writerVersion_;
+ const uint32_t dictionaryFilteringSizeThreshold_;
// column ids for each predicate leaf in the search argument
std::vector<uint64_t> filterColumns_;
+ // column ids that have IN expressions for dictionary filtering
+ std::vector<uint64_t> columnsWithInExpr_;
// Map from RowGroup index to the next skipped row of the selected range it
// locates. If the RowGroup is not selected, set the value to 0.
diff --git a/c++/test/MockStripeStreams.hh b/c++/test/MockStripeStreams.hh
index dd32ad599..a62ed4394 100644
--- a/c++/test/MockStripeStreams.hh
+++ b/c++/test/MockStripeStreams.hh
@@ -41,6 +41,7 @@ namespace orc {
MOCK_CONST_METHOD0(getForcedScaleOnHive11Decimal, int32_t());
MOCK_CONST_METHOD0(isDecimalAsLong, bool());
MOCK_CONST_METHOD0(getSchemaEvolution, const SchemaEvolution*());
+    MOCK_CONST_METHOD1(getSharedDictionary, std::shared_ptr<StringDictionary>(uint64_t));
MemoryPool& getMemoryPool() const override;
diff --git a/c++/test/TestPredicatePushdown.cc
b/c++/test/TestPredicatePushdown.cc
index 5c8ed14e7..427745a5f 100644
--- a/c++/test/TestPredicatePushdown.cc
+++ b/c++/test/TestPredicatePushdown.cc
@@ -548,4 +548,123 @@ namespace orc {
TestFirstStripeSelectedWithStripeStats(reader.get(), pos);
}
}
+
+ // Create a test file with dictionary-encoded string columns for testing
+ // dictionary-based predicate pushdown
+ void createDictionaryTestFile(MemoryOutputStream& memStream) {
+ MemoryPool* pool = getDefaultPool();
+    auto type = std::unique_ptr<Type>(
+        Type::buildTypeFromString("struct<id:bigint,category:string,status:string>"));
+ WriterOptions options;
+ options
+ .setStripeSize(512) // Small stripe size to create multiple stripes
+ .setCompressionBlockSize(1024)
+ .setMemoryBlockSize(64)
+ .setCompression(CompressionKind_ZLIB)
+ .setMemoryPool(pool)
+ .setDictionaryKeySizeThreshold(1.0)
+ .setRowIndexStride(1000);
+
+ auto writer = createWriter(*type, &memStream, options);
+ auto batch = writer->createRowBatch(1000);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+ auto& idBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+    auto& categoryBatch = dynamic_cast<StringVectorBatch&>(*structBatch.fields[1]);
+    auto& statusBatch = dynamic_cast<StringVectorBatch&>(*structBatch.fields[2]);
+
+ const char* categories[] = {"A", "B", "C", "X", "Y"};
+ const char* statuses[] = {"active", "inactive", "pending"};
+
+ char categoryBuffer[10000];
+ char statusBuffer[20000];
+ uint64_t categoryOffset = 0;
+ uint64_t statusOffset = 0;
+
+ for (uint64_t i = 0; i < 10000; ++i) {
+ idBatch.data[i % 1000] = static_cast<int64_t>(i);
+
+ const char* category = categories[i % 5];
+ size_t catLen = strlen(category);
+ memcpy(categoryBuffer + categoryOffset, category, catLen);
+ categoryBatch.data[i % 1000] = categoryBuffer + categoryOffset;
+ categoryBatch.length[i % 1000] = static_cast<int64_t>(catLen);
+ categoryOffset += catLen;
+
+ const char* status = statuses[i % 3];
+ size_t statusLen = strlen(status);
+ memcpy(statusBuffer + statusOffset, status, statusLen);
+ statusBatch.data[i % 1000] = statusBuffer + statusOffset;
+ statusBatch.length[i % 1000] = static_cast<int64_t>(statusLen);
+ statusOffset += statusLen;
+
+ if ((i + 1) % 1000 == 0) {
+ structBatch.numElements = 1000;
+ idBatch.numElements = 1000;
+ categoryBatch.numElements = 1000;
+ statusBatch.numElements = 1000;
+ writer->add(*batch);
+ categoryOffset = 0;
+ statusOffset = 0;
+ }
+ }
+
+ writer->close();
+ }
+
+ TEST(TestPredicatePushdown, testDictionaryFiltering) {
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+ MemoryPool* pool = getDefaultPool();
+ createDictionaryTestFile(memStream);
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    ReaderOptions readerOptions;
+    readerOptions.setMemoryPool(*pool);
+    std::unique_ptr<Reader> reader = createReader(std::move(inStream), readerOptions);
+ EXPECT_EQ(10000, reader->getNumberOfRows());
+ EXPECT_TRUE(reader->getNumberOfStripes() > 1);
+
+ struct Param {
+ uint32_t dictSizeThreshold;
+ uint64_t expectedRowsRead;
+ };
+
+ // Test filtering all rows when dictionary size threshold is large enough
+ for (const auto& param : {Param{0, 10000}, Param{10, 0}}) {
+ RowReaderOptions options;
+ options.searchArgument(
+ SearchArgumentFactory::newBuilder()
+              ->in("category", PredicateDataType::STRING, {Literal("M", 1), Literal("N", 1)})
+ .build());
+ if (param.dictSizeThreshold > 0) {
+ options.setDictionaryFilteringSizeThreshold(param.dictSizeThreshold);
+ }
+ auto rowReader = reader->createRowReader(options);
+ auto batch = rowReader->createRowBatch(1000);
+ uint64_t rowsRead = 0;
+ while (rowReader->next(*batch)) {
+ rowsRead += batch->numElements;
+ }
+ EXPECT_EQ(rowsRead, param.expectedRowsRead);
+ }
+
+ // Test filtering with matching values
+ for (const auto& param : {Param{0, 10000}, Param{100, 10000}}) {
+ RowReaderOptions options;
+ options.searchArgument(
+ SearchArgumentFactory::newBuilder()
+              ->in("category", PredicateDataType::STRING, {Literal("A", 1), Literal("C", 1)})
+ .build());
+ if (param.dictSizeThreshold > 0) {
+ options.setDictionaryFilteringSizeThreshold(param.dictSizeThreshold);
+ }
+ auto rowReader = reader->createRowReader(options);
+ auto batch = rowReader->createRowBatch(1000);
+ uint64_t rowsRead = 0;
+ while (rowReader->next(*batch)) {
+ rowsRead += batch->numElements;
+ }
+ EXPECT_EQ(rowsRead, param.expectedRowsRead);
+ }
+ }
+
} // namespace orc
diff --git a/c++/test/TestSargsApplier.cc b/c++/test/TestSargsApplier.cc
index 7105c738e..d64d5863d 100644
--- a/c++/test/TestSargsApplier.cc
+++ b/c++/test/TestSargsApplier.cc
@@ -93,7 +93,7 @@ namespace orc {
// evaluate row group index
ReaderMetrics metrics;
SchemaEvolution se(nullptr, type.get());
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics, &se);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, &se);
EXPECT_TRUE(applier.pickRowGroups(4000, rowIndexes, {}));
const auto& nextSkippedRows = applier.getNextSkippedRows();
EXPECT_EQ(4, nextSkippedRows.size());
@@ -122,7 +122,7 @@ namespace orc {
*stripeStats.add_col_stats() = createIntStats(0L, 10L);
*stripeStats.add_col_stats() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_FALSE(applier.evaluateStripeStatistics(stripeStats, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -136,7 +136,7 @@ namespace orc {
*stripeStats.add_col_stats() = createIntStats(0L, 50L);
*stripeStats.add_col_stats() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_TRUE(applier.evaluateStripeStatistics(stripeStats, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -150,7 +150,7 @@ namespace orc {
*footer.add_statistics() = createIntStats(0L, 10L);
*footer.add_statistics() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_FALSE(applier.evaluateFileStatistics(footer, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -164,7 +164,7 @@ namespace orc {
*footer.add_statistics() = createIntStats(0L, 50L);
*footer.add_statistics() = createIntStats(0L, 30L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_FALSE(applier.evaluateFileStatistics(footer, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -178,7 +178,7 @@ namespace orc {
*footer.add_statistics() = createIntStats(0L, 50L);
*footer.add_statistics() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_TRUE(applier.evaluateFileStatistics(footer, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);