ffacs commented on code in PR #2453:
URL: https://github.com/apache/orc/pull/2453#discussion_r2472070495
##########
c++/src/ColumnReader.cc:
##########
@@ -1717,8 +1703,16 @@ namespace orc {
case GEOGRAPHY:
switch
(static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return std::make_unique<StringDictionaryColumnReader>(type,
stripe);
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2: {
+ // Check if we have a pre-loaded dictionary we can use
+ auto dictionary = stripe.getSharedDictionary(type.getColumnId());
+ if (dictionary) {
+ return std::unique_ptr<ColumnReader>(
+ new StringDictionaryColumnReader(type, stripe, dictionary));
Review Comment:
std::make_unique would be better.
##########
c++/test/TestPredicatePushdown.cc:
##########
@@ -548,4 +548,122 @@ namespace orc {
TestFirstStripeSelectedWithStripeStats(reader.get(), pos);
}
}
+
+ // Create a test file with dictionary-encoded string columns for testing
+ // dictionary-based predicate pushdown
+ void createDictionaryTestFile(MemoryOutputStream& memStream) {
+ MemoryPool* pool = getDefaultPool();
+ auto type = std::unique_ptr<Type>(
+
Type::buildTypeFromString("struct<id:bigint,category:string,status:string>"));
+ WriterOptions options;
+ options
+ .setStripeSize(64 * 1024) // Small stripe size to create multiple
stripes
Review Comment:
Let's add a assert to make sure there are more than one stripes, since the
pattern of data is to easy for encoding.
##########
c++/src/sargs/SargsApplier.cc:
##########
@@ -185,4 +202,97 @@ namespace orc {
}
return fileStatsEvalResult_;
}
+
+ TruthValue SargsApplier::evaluateDictionaryForColumn(const StringDictionary&
dictionary,
+ const PredicateLeaf&
leaf) const {
+ // Only handle IN expressions for dictionary filtering
+ if (leaf.getOperator() != PredicateLeaf::Operator::IN) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ const std::vector<Literal>& literals = leaf.getLiteralList();
+ if (literals.empty()) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ // Pre-compute string views for literals to avoid repeated function calls
+ std::vector<std::string_view> literalViews;
+ literalViews.reserve(literals.size());
+ for (const auto& literal : literals) {
+ literalViews.emplace_back(literal.getStringView());
+ }
+
+ // Check if any dictionary entry matches any literal in the IN list
+ const int64_t* offsets = dictionary.dictionaryOffset.data();
+ const char* blob = dictionary.dictionaryBlob.data();
+ size_t dictSize = dictionary.dictionaryOffset.size() - 1;
+
+ for (size_t i = 0; i < dictSize; ++i) {
+ int64_t start = offsets[i];
+ int64_t length = offsets[i + 1] - start;
+ std::string_view dictEntry(blob + start, static_cast<size_t>(length));
+
+ // Check if this dictionary entry matches any literal in the IN list
+ for (const auto& literalView : literalViews) {
+ if (dictEntry == literalView) {
+ // Found a match - stripe might contain matching rows
+ return TruthValue::YES_NO_NULL;
Review Comment:
How about calculating the size of the intersect of literals and the
dictionary, by which we return TruthValue more precisely?
##########
c++/src/ColumnReader.cc:
##########
@@ -529,43 +524,34 @@ namespace orc {
void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char*
notNull) override;
void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>&
positions) override;
+
+ // Method to set dictionary if we have a pre-loaded one
+ void setDictionary(std::shared_ptr<StringDictionary> dictionary) {
Review Comment:
Where is the function used?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]