This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new cf85da122 ORC-2022: [C++] Add support to use dictionary for IN
expression
cf85da122 is described below
commit cf85da12204c399dc4af72007e8397d65fac45e9
Author: luffy-zh <[email protected]>
AuthorDate: Wed Nov 5 10:54:04 2025 +0800
ORC-2022: [C++] Add support to use dictionary for IN expression
### What changes were proposed in this pull request?
Use column dictionaries to evaluate IN predicates.
### Why are the changes needed?
Optimize IN predicate pruning: consult column dictionary (when reasonably
sized) instead of relying on less effective min/max statistics.
### How was this patch tested?
Unit tests in TestPredicatePushdown.cc verify this change.
### Was this patch authored or co-authored using generative AI tooling?
NO.
Closes #2453 from luffy-zh/ORC-2022.
Lead-authored-by: luffy-zh <[email protected]>
Co-authored-by: shouzhi <[email protected]>
Co-authored-by: Hao Zou <[email protected]>
Co-authored-by: Gang Wu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
c++/include/orc/Reader.hh | 15 +++++
c++/include/orc/sargs/Literal.hh | 3 +
c++/src/CMakeLists.txt | 1 +
c++/src/ColumnReader.cc | 70 ++++++++-------------
c++/src/ColumnReader.hh | 7 +++
c++/src/DictionaryLoader.cc | 100 ++++++++++++++++++++++++++++++
c++/src/DictionaryLoader.hh | 57 +++++++++++++++++
c++/src/Options.hh | 12 ++++
c++/src/Reader.cc | 107 ++++++++++++++++++++++++++++++--
c++/src/Reader.hh | 12 ++++
c++/src/StripeStream.cc | 5 ++
c++/src/StripeStream.hh | 2 +
c++/src/meson.build | 1 +
c++/src/sargs/Literal.cc | 5 ++
c++/src/sargs/SargsApplier.cc | 125 +++++++++++++++++++++++++++++++++++++-
c++/src/sargs/SargsApplier.hh | 50 ++++++++++++---
c++/test/MockStripeStreams.hh | 1 +
c++/test/TestPredicatePushdown.cc | 119 ++++++++++++++++++++++++++++++++++++
c++/test/TestSargsApplier.cc | 12 ++--
19 files changed, 641 insertions(+), 63 deletions(-)
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 506f088d6..5a8899453 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -408,6 +408,21 @@ namespace orc {
* Get the number of stripes to look ahead for small stripe prefetch.
*/
uint64_t getSmallStripeLookAheadLimit() const;
+
+ /**
+ * Set the maximum dictionary size threshold for evaluation.
+ *
+ * Dictionaries with more entries than this threshold will not be
evaluated.
+ * 0 to disable dictionary filtering.
+ *
+ * Defaults to 0.
+ */
+ RowReaderOptions& setDictionaryFilteringSizeThreshold(uint32_t threshold);
+
+ /**
+ * Get the dictionary filtering size threshold.
+ */
+ uint32_t getDictionaryFilteringSizeThreshold() const;
};
class RowReader;
diff --git a/c++/include/orc/sargs/Literal.hh b/c++/include/orc/sargs/Literal.hh
index f7d37005a..821790145 100644
--- a/c++/include/orc/sargs/Literal.hh
+++ b/c++/include/orc/sargs/Literal.hh
@@ -22,6 +22,8 @@
#include "orc/Int128.hh"
#include "orc/Vector.hh"
+#include <string_view>
+
namespace orc {
/**
@@ -123,6 +125,7 @@ namespace orc {
Timestamp getTimestamp() const;
double getFloat() const;
std::string getString() const;
+ std::string_view getStringView() const;
bool getBool() const;
Decimal getDecimal() const;
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 2f81bb802..a1fd549ce 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -169,6 +169,7 @@ set(SOURCE_FILES
ConvertColumnReader.cc
CpuInfoUtil.cc
Dictionary.cc
+ DictionaryLoader.cc
Exceptions.cc
Geospatial.cc
Int128.cc
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index d087482f9..8a29418c3 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -16,15 +16,18 @@
* limitations under the License.
*/
-#include "orc/Int128.hh"
+#include "ColumnReader.hh"
+
+#include <cmath>
#include "Adaptor.hh"
#include "ByteRLE.hh"
-#include "ColumnReader.hh"
#include "ConvertColumnReader.hh"
+#include "DictionaryLoader.hh"
#include "RLE.hh"
#include "SchemaEvolution.hh"
#include "orc/Exceptions.hh"
+#include "orc/Int128.hh"
#include <math.h>
#include <iostream>
@@ -36,19 +39,6 @@ namespace orc {
// PASS
}
- inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
- switch (static_cast<int64_t>(kind)) {
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- return RleVersion_1;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return RleVersion_2;
- default:
- throw ParseError("Unknown encoding in convertRleVersion");
- }
- }
-
ColumnReader::ColumnReader(const Type& type, StripeStreams& stripe)
: columnId(type.getColumnId()),
memoryPool(stripe.getMemoryPool()),
@@ -519,7 +509,10 @@ namespace orc {
std::unique_ptr<RleDecoder> rle_;
public:
- StringDictionaryColumnReader(const Type& type, StripeStreams& stipe);
+ StringDictionaryColumnReader(const Type& type, StripeStreams& stripe);
+
+ StringDictionaryColumnReader(const Type& type, StripeStreams& stripe,
+ const std::shared_ptr<StringDictionary>
dictionary);
~StringDictionaryColumnReader() override;
uint64_t skip(uint64_t numValues) override;
@@ -533,39 +526,23 @@ namespace orc {
StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type,
StripeStreams&
stripe)
- : ColumnReader(type, stripe), dictionary_(new
StringDictionary(stripe.getMemoryPool())) {
+ : StringDictionaryColumnReader(type, stripe, nullptr) {}
+
+ StringDictionaryColumnReader::StringDictionaryColumnReader(
+ const Type& type, StripeStreams& stripe, const
std::shared_ptr<StringDictionary> dictionary)
+ : ColumnReader(type, stripe), dictionary_(dictionary) {
RleVersion rleVersion =
convertRleVersion(stripe.getEncoding(columnId).kind());
- uint32_t dictSize = stripe.getEncoding(columnId).dictionary_size();
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) {
throw ParseError("DATA stream not found in StringDictionaryColumn");
}
rle_ = createRleDecoder(std::move(stream), false, rleVersion, memoryPool,
metrics);
- stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
- if (dictSize > 0 && stream == nullptr) {
- throw ParseError("LENGTH stream not found in StringDictionaryColumn");
- }
- std::unique_ptr<RleDecoder> lengthDecoder =
- createRleDecoder(std::move(stream), false, rleVersion, memoryPool,
metrics);
- dictionary_->dictionaryOffset.resize(dictSize + 1);
- int64_t* lengthArray = dictionary_->dictionaryOffset.data();
- lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
- lengthArray[0] = 0;
- for (uint32_t i = 1; i < dictSize + 1; ++i) {
- if (lengthArray[i] < 0) {
- throw ParseError("Negative dictionary entry length");
- }
- lengthArray[i] += lengthArray[i - 1];
- }
- int64_t blobSize = lengthArray[dictSize];
- dictionary_->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
- std::unique_ptr<SeekableInputStream> blobStream =
- stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
- if (blobSize > 0 && blobStream == nullptr) {
- throw ParseError("DICTIONARY_DATA stream not found in
StringDictionaryColumn");
+
+ // If no dictionary was provided, load it
+ if (!dictionary_) {
+ dictionary_ = loadStringDictionary(columnId, stripe, memoryPool);
}
- readFully(dictionary_->dictionaryBlob.data(), blobSize, blobStream.get());
}
StringDictionaryColumnReader::~StringDictionaryColumnReader() {
@@ -1717,8 +1694,15 @@ namespace orc {
case GEOGRAPHY:
switch
(static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return std::make_unique<StringDictionaryColumnReader>(type,
stripe);
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2: {
+ // Check if we have a pre-loaded dictionary we can use
+ auto dictionary = stripe.getSharedDictionary(type.getColumnId());
+ if (dictionary) {
+ return std::make_unique<StringDictionaryColumnReader>(type,
stripe, dictionary);
+ } else {
+ return std::unique_ptr<ColumnReader>(new
StringDictionaryColumnReader(type, stripe));
+ }
+ }
case proto::ColumnEncoding_Kind_DIRECT:
case proto::ColumnEncoding_Kind_DIRECT_V2:
return std::make_unique<StringDirectColumnReader>(type, stripe);
diff --git a/c++/src/ColumnReader.hh b/c++/src/ColumnReader.hh
index f0f3fe1b5..4a3fe2ac4 100644
--- a/c++/src/ColumnReader.hh
+++ b/c++/src/ColumnReader.hh
@@ -97,6 +97,13 @@ namespace orc {
* @return the number of scale digits
*/
virtual int32_t getForcedScaleOnHive11Decimal() const = 0;
+
+ /**
+ * Get a shared dictionary for the given column if available.
+ * @param columnId the id of the column
+ * @return shared pointer to the StringDictionary or nullptr if not
available
+ */
+ virtual std::shared_ptr<StringDictionary> getSharedDictionary(uint64_t
columnId) const = 0;
/**
* Whether decimals that have precision <=18 are encoded as fixed scale
and values
diff --git a/c++/src/DictionaryLoader.cc b/c++/src/DictionaryLoader.cc
new file mode 100644
index 000000000..428d288d5
--- /dev/null
+++ b/c++/src/DictionaryLoader.cc
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DictionaryLoader.hh"
+#include "RLE.hh"
+
+namespace orc {
+
+ namespace {
+
+ // Helper function to read data fully from a stream
+ void readFully(char* buffer, int64_t bufferSize, SeekableInputStream*
stream) {
+ int64_t posn = 0;
+ while (posn < bufferSize) {
+ const void* chunk;
+ int length;
+ if (!stream->Next(&chunk, &length)) {
+ throw ParseError("bad read in readFully");
+ }
+ if (posn + length > bufferSize) {
+ throw ParseError("Corrupt dictionary blob");
+ }
+ memcpy(buffer + posn, chunk, static_cast<size_t>(length));
+ posn += length;
+ }
+ }
+
+ } // namespace
+
+ std::shared_ptr<StringDictionary> loadStringDictionary(uint64_t columnId,
StripeStreams& stripe,
+ MemoryPool& pool) {
+ // Get encoding information
+ proto::ColumnEncoding encoding = stripe.getEncoding(columnId);
+ RleVersion rleVersion = convertRleVersion(encoding.kind());
+ uint32_t dictSize = encoding.dictionary_size();
+
+ // Create the dictionary object
+ auto dictionary = std::make_shared<StringDictionary>(pool);
+
+ // Read LENGTH stream to get dictionary entry lengths
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
+ if (dictSize > 0 && stream == nullptr) {
+ std::stringstream ss;
+ ss << "LENGTH stream not found in StringDictionaryColumn for column " <<
columnId;
+ throw ParseError(ss.str());
+ }
+ std::unique_ptr<RleDecoder> lengthDecoder =
+ createRleDecoder(std::move(stream), false, rleVersion, pool,
stripe.getReaderMetrics());
+
+ // Decode dictionary entry lengths
+ dictionary->dictionaryOffset.resize(dictSize + 1);
+ int64_t* lengthArray = dictionary->dictionaryOffset.data();
+ lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
+ lengthArray[0] = 0;
+
+ // Convert lengths to cumulative offsets
+ for (uint32_t i = 1; i < dictSize + 1; ++i) {
+ if (lengthArray[i] < 0) {
+ std::stringstream ss;
+ ss << "Negative dictionary entry length for column " << columnId;
+ throw ParseError(ss.str());
+ }
+ lengthArray[i] += lengthArray[i - 1];
+ }
+
+ int64_t blobSize = lengthArray[dictSize];
+
+ // Read DICTIONARY_DATA stream to get dictionary content
+ dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
+ std::unique_ptr<SeekableInputStream> blobStream =
+ stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
+ if (blobSize > 0 && blobStream == nullptr) {
+ std::stringstream ss;
+ ss << "DICTIONARY_DATA stream not found in StringDictionaryColumn for
column " << columnId;
+ throw ParseError(ss.str());
+ }
+
+ // Read the dictionary blob
+ readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get());
+
+ return dictionary;
+ }
+
+} // namespace orc
\ No newline at end of file
diff --git a/c++/src/DictionaryLoader.hh b/c++/src/DictionaryLoader.hh
new file mode 100644
index 000000000..64df45afe
--- /dev/null
+++ b/c++/src/DictionaryLoader.hh
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_DICTIONARY_LOADER_HH
+#define ORC_DICTIONARY_LOADER_HH
+
+#include "ColumnReader.hh"
+#include "orc/Vector.hh"
+
+namespace orc {
+
+ /**
+ * Load a string dictionary for a single column from a stripe.
+ * This function reads the LENGTH and DICTIONARY_DATA streams and populates
+ * the StringDictionary structure. It automatically uses ReadCache if
available
+ * through the StripeStreams interface.
+ *
+ * @param columnId the column ID to load the dictionary for
+ * @param stripe the StripeStreams interface providing access to streams
+ * @param pool the memory pool to use for allocating the dictionary
+ * @return a shared pointer to the loaded StringDictionary, or nullptr if
loading fails
+ */
+ std::shared_ptr<StringDictionary> loadStringDictionary(uint64_t columnId,
StripeStreams& stripe,
+ MemoryPool& pool);
+
+ // Helper function to convert encoding kind to RLE version
+ inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
+ switch (static_cast<int64_t>(kind)) {
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ return RleVersion_1;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return RleVersion_2;
+ default:
+ throw ParseError("Unknown encoding in convertRleVersion");
+ }
+ }
+
+} // namespace orc
+
+#endif
diff --git a/c++/src/Options.hh b/c++/src/Options.hh
index b71edcd42..c0abf190c 100644
--- a/c++/src/Options.hh
+++ b/c++/src/Options.hh
@@ -25,6 +25,7 @@
#include "io/Cache.hh"
+#include <cstdint>
#include <iostream>
#include <limits>
@@ -156,6 +157,7 @@ namespace orc {
bool throwOnSchemaEvolutionOverflow;
bool enableAsyncPrefetch;
uint64_t smallStripeLookAheadLimit;
+ uint32_t dictionaryFilteringSizeThreshold;
RowReaderOptionsPrivate() {
selection = ColumnSelection_NONE;
@@ -169,6 +171,7 @@ namespace orc {
throwOnSchemaEvolutionOverflow = false;
enableAsyncPrefetch = false;
smallStripeLookAheadLimit = 8;
+ dictionaryFilteringSizeThreshold = 0;
}
};
@@ -362,6 +365,15 @@ namespace orc {
return privateBits_->smallStripeLookAheadLimit;
}
+ RowReaderOptions&
RowReaderOptions::setDictionaryFilteringSizeThreshold(uint32_t threshold) {
+ privateBits_->dictionaryFilteringSizeThreshold = threshold;
+ return *this;
+ }
+
+ uint32_t RowReaderOptions::getDictionaryFilteringSizeThreshold() const {
+ return privateBits_->dictionaryFilteringSizeThreshold;
+ }
+
} // namespace orc
#endif
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index ab4c5047d..4fd1a73a9 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -19,7 +19,9 @@
#include "Reader.hh"
#include "Adaptor.hh"
#include "BloomFilter.hh"
+#include "DictionaryLoader.hh"
#include "Options.hh"
+#include "RLE.hh"
#include "Statistics.hh"
#include "StripeStream.hh"
#include "Utils.hh"
@@ -32,6 +34,7 @@
#include <set>
#include <sstream>
#include <string>
+#include <unordered_map>
#include <vector>
namespace orc {
@@ -347,9 +350,10 @@ namespace orc {
// prepare SargsApplier if SearchArgument is available
if (opts.getSearchArgument() && footer_->row_index_stride() > 0) {
sargs_ = opts.getSearchArgument();
- sargsApplier_.reset(
- new SargsApplier(*contents_->schema, sargs_.get(),
footer_->row_index_stride(),
- getWriterVersionImpl(contents.get()),
contents_->readerMetrics));
+ sargsApplier_ = std::make_unique<SargsApplier>(
+ *contents_->schema, sargs_.get(), footer_->row_index_stride(),
+ getWriterVersionImpl(contents.get()),
opts.getDictionaryFilteringSizeThreshold(),
+ contents_->readerMetrics, &schemaEvolution_);
}
skipBloomFilters_ = hasBadBloomFilters();
@@ -1119,10 +1123,97 @@ namespace orc {
return getStripeSize(stripeInfo) <= threshold;
}
+ /**
+ * Load stripe dictionaries for dictionary-based predicate pushdown.
+ * Only loads dictionaries for STRING/VARCHAR/CHAR columns with IN
expressions.
+ */
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
loadStripeDictionaries(
+ const proto::Footer& footer, const std::vector<bool>& selectedColumns,
+ const std::vector<uint64_t>& columnsWithInExpr, StripeStreams& stripe,
+ size_t dictSizeThreshold) {
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
dictionaries;
+
+ // Only load dictionaries for selected columns with IN expressions
+ for (uint64_t colId : columnsWithInExpr) {
+ if (!selectedColumns[colId] || colId >=
static_cast<uint64_t>(footer.types_size())) {
+ continue;
+ }
+
+ auto encoding = stripe.getEncoding(colId);
+ if (encoding.kind() != proto::ColumnEncoding_Kind_DICTIONARY &&
+ encoding.kind() != proto::ColumnEncoding_Kind_DICTIONARY_V2) {
+ continue;
+ }
+
+ auto typeKind = footer.types(static_cast<int>(colId)).kind();
+ if (typeKind != proto::Type_Kind_STRING && typeKind !=
proto::Type_Kind_VARCHAR &&
+ typeKind != proto::Type_Kind_CHAR) {
+ continue;
+ }
+
+ if (encoding.dictionary_size() > dictSizeThreshold) {
+ continue;
+ }
+
+ dictionaries[colId] = loadStringDictionary(colId, stripe,
stripe.getMemoryPool());
+ }
+
+ return dictionaries;
+ }
+
+ // Evaluate dictionaries for the current stripe to determine if it can be
+ // skipped.
+ bool evaluateStripeDictionaries(RowReaderImpl& reader, const proto::Footer&
footer,
+ const std::vector<bool>& selectedColumns,
+ const proto::StripeFooter& stripeFooter,
+ const proto::StripeInformation& stripeInfo,
+ uint64_t currentStripe, SargsApplier*
sargsApplier,
+ const Timezone& localTimezone, const
Timezone& readerTimezone) {
+ const std::vector<uint64_t>& columnsWithInExpr =
sargsApplier->getColumnsWithInExpressions();
+ if (columnsWithInExpr.empty()) {
+ return true;
+ }
+
+ const Timezone& writerTimezone = stripeFooter.has_writer_timezone()
+ ?
getTimezoneByName(stripeFooter.writer_timezone())
+ : localTimezone;
+ StripeStreamsImpl stripeStreams(reader, currentStripe, stripeInfo,
stripeFooter,
+ stripeInfo.offset(),
*reader.getFileContents().stream,
+ writerTimezone, readerTimezone);
+
+ auto dictionaries =
+ loadStripeDictionaries(footer, selectedColumns, columnsWithInExpr,
stripeStreams,
+
sargsApplier->getDictionaryFilteringSizeThreshold());
+ if (!dictionaries.empty()) {
+ // Store the loaded dictionaries for use by ColumnReaders
+ reader.setSharedDictionaries(dictionaries);
+
+ return sargsApplier->evaluateColumnDictionaries(dictionaries);
+ }
+
+ return true;
+ }
+
+ std::shared_ptr<StringDictionary>
RowReaderImpl::getSharedDictionary(uint64_t columnId) const {
+ auto it = sharedDictionaries_.find(columnId);
+ if (it != sharedDictionaries_.end()) {
+ return it->second;
+ }
+ return nullptr;
+ }
+
+ void RowReaderImpl::setSharedDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries) {
+ for (const auto& pair : dictionaries) {
+ sharedDictionaries_[pair.first] = pair.second;
+ }
+ }
+
void RowReaderImpl::startNextStripe() {
reader_.reset(); // ColumnReaders use lots of memory; free old memory
first
rowIndexes_.clear();
bloomFilterIndex_.clear();
+ sharedDictionaries_.clear(); // Clear dictionaries from previous stripe
// evaluate file statistics if it exists
if (sargsApplier_ &&
@@ -1164,7 +1255,15 @@ namespace orc {
if (isStripeNeeded) {
currentStripeFooter_ = getStripeFooter(currentStripeInfo_,
*contents_.get());
- if (sargsApplier_) {
+
+ if (sargsApplier_ &&
sargsApplier_->getDictionaryFilteringSizeThreshold() > 0) {
+ // evaluate dictionaries for predicate pushdown
+ isStripeNeeded = evaluateStripeDictionaries(
+ *this, *footer_, selectedColumns_, currentStripeFooter_,
currentStripeInfo_,
+ currentStripe_, sargsApplier_.get(), localTimezone_,
readerTimezone_);
+ }
+
+ if (sargsApplier_ && isStripeNeeded) {
// read row group statistics and bloom filters of current stripe
loadStripeIndex();
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 966281cce..132f92ebb 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -193,6 +193,9 @@ namespace orc {
// match read and file types
SchemaEvolution schemaEvolution_;
+ // Dictionary optimization
+ std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>
sharedDictionaries_;
+
// load stripe index if not done so
void loadStripeIndex();
@@ -266,6 +269,15 @@ namespace orc {
std::shared_ptr<ReadRangeCache> getReadCache() const {
return contents_->readCache;
}
+
+ // Method to set shared dictionaries from external functions
+ void setSharedDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries);
+
+ // Method to get a shared dictionary by column id
+ std::shared_ptr<StringDictionary> getSharedDictionary(uint64_t columnId)
const;
+
+ private:
};
class ReaderImpl : public Reader {
diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc
index a5609f762..f12a28b2c 100644
--- a/c++/src/StripeStream.cc
+++ b/c++/src/StripeStream.cc
@@ -174,4 +174,9 @@ namespace orc {
streamOffset, stripeFooter_->streams(static_cast<int>(streamId)));
}
+ std::shared_ptr<StringDictionary> StripeStreamsImpl::getSharedDictionary(
+ uint64_t columnId) const {
+ return reader_.getSharedDictionary(columnId);
+ }
+
} // namespace orc
diff --git a/c++/src/StripeStream.hh b/c++/src/StripeStream.hh
index 2d26f8575..eb31d77aa 100644
--- a/c++/src/StripeStream.hh
+++ b/c++/src/StripeStream.hh
@@ -81,6 +81,8 @@ namespace orc {
int32_t getForcedScaleOnHive11Decimal() const override;
const SchemaEvolution* getSchemaEvolution() const override;
+
+ std::shared_ptr<StringDictionary> getSharedDictionary(uint64_t columnId)
const override;
};
/**
diff --git a/c++/src/meson.build b/c++/src/meson.build
index 885df0072..6dfea9ab6 100644
--- a/c++/src/meson.build
+++ b/c++/src/meson.build
@@ -151,6 +151,7 @@ source_files += files(
'ConvertColumnReader.cc',
'CpuInfoUtil.cc',
'Dictionary.cc',
+ 'DictionaryLoader.cc',
'Exceptions.cc',
'Geospatial.cc',
'Int128.cc',
diff --git a/c++/src/sargs/Literal.cc b/c++/src/sargs/Literal.cc
index f36db7943..f323c061a 100644
--- a/c++/src/sargs/Literal.cc
+++ b/c++/src/sargs/Literal.cc
@@ -293,6 +293,11 @@ namespace orc {
return std::string(value_.Buffer, size_);
}
+ std::string_view Literal::getStringView() const {
+ validate(isNull_, type_, PredicateDataType::STRING);
+ return std::string_view(value_.Buffer, size_);
+ }
+
bool Literal::getBool() const {
validate(isNull_, type_, PredicateDataType::BOOLEAN);
return value_.BooleanVal;
diff --git a/c++/src/sargs/SargsApplier.cc b/c++/src/sargs/SargsApplier.cc
index b3085964d..5c7aa10ef 100644
--- a/c++/src/sargs/SargsApplier.cc
+++ b/c++/src/sargs/SargsApplier.cc
@@ -17,7 +17,11 @@
*/
#include "SargsApplier.hh"
+#include "Dictionary.hh"
+#include "sargs/PredicateLeaf.hh"
+
#include <numeric>
+#include <set>
namespace orc {
@@ -39,17 +43,21 @@ namespace orc {
SargsApplier::SargsApplier(const Type& type, const SearchArgument*
searchArgument,
uint64_t rowIndexStride, WriterVersion
writerVersion,
- ReaderMetrics* metrics, const SchemaEvolution*
schemaEvolution)
+ size_t dictionaryFilteringSizeThreshold,
ReaderMetrics* metrics,
+ const SchemaEvolution* schemaEvolution)
: type_(type),
searchArgument_(searchArgument),
schemaEvolution_(schemaEvolution),
rowIndexStride_(rowIndexStride),
writerVersion_(writerVersion),
+ dictionaryFilteringSizeThreshold_(dictionaryFilteringSizeThreshold),
hasEvaluatedFileStats_(false),
fileStatsEvalResult_(true),
metrics_(metrics) {
const SearchArgumentImpl* sargs = dynamic_cast<const
SearchArgumentImpl*>(searchArgument_);
+ std::set<uint64_t> columnsWithInExpr;
+
// find the mapping from predicate leaves to columns
const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
filterColumns_.resize(leaves.size(), INVALID_COLUMN_ID);
@@ -59,7 +67,16 @@ namespace orc {
} else {
filterColumns_[i] = leaves[i].getColumnId();
}
+
+ if (leaves[i].getOperator() == PredicateLeaf::Operator::IN) {
+ uint64_t columnId = filterColumns_[i];
+ if (columnId != INVALID_COLUMN_ID) {
+ columnsWithInExpr.insert(columnId);
+ }
+ }
}
+
+ columnsWithInExpr_.assign(columnsWithInExpr.begin(),
columnsWithInExpr.end());
}
bool SargsApplier::pickRowGroups(uint64_t rowsInStripe,
@@ -185,4 +202,110 @@ namespace orc {
}
return fileStatsEvalResult_;
}
+
+ TruthValue SargsApplier::evaluateDictionaryForColumn(const StringDictionary&
dictionary,
+ const PredicateLeaf&
leaf) const {
+ // Only handle IN expressions for dictionary filtering
+ if (leaf.getOperator() != PredicateLeaf::Operator::IN) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ const std::vector<Literal>& literals = leaf.getLiteralList();
+ if (literals.empty()) {
+ return TruthValue::YES_NO_NULL;
+ }
+
+ // Pre-compute string views for literals to avoid repeated function calls
+ std::vector<std::string_view> literalViews;
+ literalViews.reserve(literals.size());
+ for (const auto& literal : literals) {
+ literalViews.emplace_back(literal.getStringView());
+ }
+
+ // Check if any dictionary entry matches any literal in the IN list
+ const int64_t* offsets = dictionary.dictionaryOffset.data();
+ const char* blob = dictionary.dictionaryBlob.data();
+ size_t dictSize = dictionary.dictionaryOffset.size() - 1;
+
+ // Use a set to store matching dictionary entries
+ size_t matchedEntriesCount = 0;
+
+ for (size_t i = 0; i < dictSize; ++i) {
+ int64_t start = offsets[i];
+ int64_t length = offsets[i + 1] - start;
+ std::string_view dictEntry(blob + start, static_cast<size_t>(length));
+
+ // Check if this dictionary entry matches any literal in the IN list
+ for (const auto& literalView : literalViews) {
+ if (dictEntry == literalView) {
+ matchedEntriesCount++;
+ break;
+ }
+ }
+ }
+
+ // If all dictionary entries match, return YES
+ if (matchedEntriesCount == dictSize) {
+ return TruthValue::YES;
+ }
+ // If any dictionary entry matches, stripe might contain matching rows
+ else if (matchedEntriesCount != 0) {
+ return TruthValue::YES_NO_NULL;
+ }
+ // No dictionary entry matches any literal in the IN list - skip stripe
+ else {
+ return TruthValue::NO;
+ }
+ }
+
+ bool SargsApplier::evaluateColumnDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries) {
+ const SearchArgumentImpl* sargs = dynamic_cast<const
SearchArgumentImpl*>(searchArgument_);
+ if (sargs == nullptr) {
+ return true; // Cannot evaluate, assume stripe is needed
+ }
+
+ const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
+ std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL);
+
+ // Evaluate each predicate leaf against dictionaries (only IN expressions)
+ for (size_t pred = 0; pred != leaves.size(); ++pred) {
+ uint64_t columnId = filterColumns_[pred];
+
+ // Only evaluate IN expressions
+ if (leaves[pred].getOperator() != PredicateLeaf::Operator::IN) {
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Check if this column has a dictionary
+ auto dictIter = dictionaries.find(columnId);
+ if (columnId == INVALID_COLUMN_ID || dictIter == dictionaries.cend() ||
+ dictIter->second == nullptr) {
+ // No dictionary for this column, cannot evaluate
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Check if schema evolution makes PPD unsafe
+ if (schemaEvolution_ &&
!schemaEvolution_->isSafePPDConversion(columnId)) {
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Check if this is a STRING/VARCHAR/CHAR column
+ PredicateDataType predType = leaves[pred].getType();
+ if (predType != PredicateDataType::STRING) {
+ // Only string types are supported for dictionary filtering
+ leafValues[pred] = TruthValue::YES_NO_NULL;
+ continue;
+ }
+
+ // Evaluate the IN expression against the dictionary
+ leafValues[pred] = evaluateDictionaryForColumn(*dictIter->second,
leaves[pred]);
+ }
+
+ return isNeeded(searchArgument_->evaluate(leafValues));
+ }
+
} // namespace orc
diff --git a/c++/src/sargs/SargsApplier.hh b/c++/src/sargs/SargsApplier.hh
index 65c8dec83..b9e85d508 100644
--- a/c++/src/sargs/SargsApplier.hh
+++ b/c++/src/sargs/SargsApplier.hh
@@ -19,25 +19,24 @@
#ifndef ORC_SARGSAPPLIER_HH
#define ORC_SARGSAPPLIER_HH
-#include <orc/Common.hh>
+#include "SchemaEvolution.hh"
#include "orc/BloomFilter.hh"
+#include "orc/Common.hh"
#include "orc/Reader.hh"
#include "orc/Type.hh"
-#include "wrap/orc-proto-wrapper.hh"
-
#include "sargs/SearchArgument.hh"
-#include "SchemaEvolution.hh"
-
#include <unordered_map>
namespace orc {
+ struct StringDictionary;
+
class SargsApplier {
public:
SargsApplier(const Type& type, const SearchArgument* searchArgument,
uint64_t rowIndexStride,
- WriterVersion writerVersion, ReaderMetrics* metrics,
- const SchemaEvolution* schemaEvolution = nullptr);
+ WriterVersion writerVersion, size_t
dictionaryFilteringSizeThreshold,
+ ReaderMetrics* metrics, const SchemaEvolution*
schemaEvolution);
/**
* Evaluate search argument on file statistics
@@ -60,6 +59,18 @@ namespace orc {
bool evaluateStripeStatistics(const proto::StripeStatistics& stripeStats,
uint64_t stripeRowGroupCount);
+ /**
+ * Evaluate search argument on column dictionaries (only IN expressions)
+ * If dictionary entries don't satisfy the sargs,
+ * the EvaluatedRowGroupCount of Reader Metrics will be updated.
+ * Otherwise, Reader Metrics will not be updated and
+ * will require further evaluation.
+ * @param dictionaries map from column ID to StringDictionary
+ * @return true if any dictionary entry satisfies the sargs
+ */
+ bool evaluateColumnDictionaries(
+ const std::unordered_map<uint64_t, std::shared_ptr<StringDictionary>>&
dictionaries);
+
/**
* TODO: use proto::RowIndex and proto::BloomFilter to do the evaluation
* Pick the row groups that we need to load from the current stripe.
@@ -114,11 +125,29 @@ namespace orc {
}
}
+ /**
+ * Get list of column IDs that have IN expressions for dictionary filtering
+ */
+ const std::vector<uint64_t>& getColumnsWithInExpressions() const {
+ return columnsWithInExpr_;
+ }
+
+ /**
+ * Get the dictionary filtering size threshold
+ */
+ size_t getDictionaryFilteringSizeThreshold() const {
+ return dictionaryFilteringSizeThreshold_;
+ }
+
private:
// evaluate column statistics in the form of protobuf::RepeatedPtrField
typedef ::google::protobuf::RepeatedPtrField<proto::ColumnStatistics>
PbColumnStatistics;
bool evaluateColumnStatistics(const PbColumnStatistics& colStats) const;
+ // Helper method to evaluate IN expression against a dictionary
+ TruthValue evaluateDictionaryForColumn(const StringDictionary& dictionary,
+ const PredicateLeaf& leaf) const;
+
friend class TestSargsApplier_findColumnTest_Test;
friend class TestSargsApplier_findArrayColumnTest_Test;
friend class TestSargsApplier_findMapColumnTest_Test;
@@ -128,10 +157,13 @@ namespace orc {
const Type& type_;
const SearchArgument* searchArgument_;
const SchemaEvolution* schemaEvolution_;
- uint64_t rowIndexStride_;
- WriterVersion writerVersion_;
+ const uint64_t rowIndexStride_;
+ const WriterVersion writerVersion_;
+ const uint32_t dictionaryFilteringSizeThreshold_;
// column ids for each predicate leaf in the search argument
std::vector<uint64_t> filterColumns_;
+ // column ids that have IN expressions for dictionary filtering
+ std::vector<uint64_t> columnsWithInExpr_;
// Map from RowGroup index to the next skipped row of the selected range it
// locates. If the RowGroup is not selected, set the value to 0.
diff --git a/c++/test/MockStripeStreams.hh b/c++/test/MockStripeStreams.hh
index dd32ad599..a62ed4394 100644
--- a/c++/test/MockStripeStreams.hh
+++ b/c++/test/MockStripeStreams.hh
@@ -41,6 +41,7 @@ namespace orc {
MOCK_CONST_METHOD0(getForcedScaleOnHive11Decimal, int32_t());
MOCK_CONST_METHOD0(isDecimalAsLong, bool());
MOCK_CONST_METHOD0(getSchemaEvolution, const SchemaEvolution*());
+    MOCK_CONST_METHOD1(getSharedDictionary, std::shared_ptr<StringDictionary>(uint64_t));
MemoryPool& getMemoryPool() const override;
diff --git a/c++/test/TestPredicatePushdown.cc
b/c++/test/TestPredicatePushdown.cc
index 5c8ed14e7..427745a5f 100644
--- a/c++/test/TestPredicatePushdown.cc
+++ b/c++/test/TestPredicatePushdown.cc
@@ -548,4 +548,123 @@ namespace orc {
TestFirstStripeSelectedWithStripeStats(reader.get(), pos);
}
}
+
+ // Create a test file with dictionary-encoded string columns for testing
+ // dictionary-based predicate pushdown
+ void createDictionaryTestFile(MemoryOutputStream& memStream) {
+ MemoryPool* pool = getDefaultPool();
+    auto type = std::unique_ptr<Type>(
+        Type::buildTypeFromString("struct<id:bigint,category:string,status:string>"));
+ WriterOptions options;
+ options
+ .setStripeSize(512) // Small stripe size to create multiple stripes
+ .setCompressionBlockSize(1024)
+ .setMemoryBlockSize(64)
+ .setCompression(CompressionKind_ZLIB)
+ .setMemoryPool(pool)
+ .setDictionaryKeySizeThreshold(1.0)
+ .setRowIndexStride(1000);
+
+ auto writer = createWriter(*type, &memStream, options);
+ auto batch = writer->createRowBatch(1000);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+ auto& idBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+    auto& categoryBatch = dynamic_cast<StringVectorBatch&>(*structBatch.fields[1]);
+    auto& statusBatch = dynamic_cast<StringVectorBatch&>(*structBatch.fields[2]);
+
+ const char* categories[] = {"A", "B", "C", "X", "Y"};
+ const char* statuses[] = {"active", "inactive", "pending"};
+
+ char categoryBuffer[10000];
+ char statusBuffer[20000];
+ uint64_t categoryOffset = 0;
+ uint64_t statusOffset = 0;
+
+ for (uint64_t i = 0; i < 10000; ++i) {
+ idBatch.data[i % 1000] = static_cast<int64_t>(i);
+
+ const char* category = categories[i % 5];
+ size_t catLen = strlen(category);
+ memcpy(categoryBuffer + categoryOffset, category, catLen);
+ categoryBatch.data[i % 1000] = categoryBuffer + categoryOffset;
+ categoryBatch.length[i % 1000] = static_cast<int64_t>(catLen);
+ categoryOffset += catLen;
+
+ const char* status = statuses[i % 3];
+ size_t statusLen = strlen(status);
+ memcpy(statusBuffer + statusOffset, status, statusLen);
+ statusBatch.data[i % 1000] = statusBuffer + statusOffset;
+ statusBatch.length[i % 1000] = static_cast<int64_t>(statusLen);
+ statusOffset += statusLen;
+
+ if ((i + 1) % 1000 == 0) {
+ structBatch.numElements = 1000;
+ idBatch.numElements = 1000;
+ categoryBatch.numElements = 1000;
+ statusBatch.numElements = 1000;
+ writer->add(*batch);
+ categoryOffset = 0;
+ statusOffset = 0;
+ }
+ }
+
+ writer->close();
+ }
+
+ TEST(TestPredicatePushdown, testDictionaryFiltering) {
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+ MemoryPool* pool = getDefaultPool();
+ createDictionaryTestFile(memStream);
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    ReaderOptions readerOptions;
+    readerOptions.setMemoryPool(*pool);
+    std::unique_ptr<Reader> reader = createReader(std::move(inStream), readerOptions);
+ EXPECT_EQ(10000, reader->getNumberOfRows());
+ EXPECT_TRUE(reader->getNumberOfStripes() > 1);
+
+ struct Param {
+ uint32_t dictSizeThreshold;
+ uint64_t expectedRowsRead;
+ };
+
+ // Test filtering all rows when dictionary size threshold is large enough
+ for (const auto& param : {Param{0, 10000}, Param{10, 0}}) {
+ RowReaderOptions options;
+ options.searchArgument(
+ SearchArgumentFactory::newBuilder()
+              ->in("category", PredicateDataType::STRING, {Literal("M", 1), Literal("N", 1)})
+ .build());
+ if (param.dictSizeThreshold > 0) {
+ options.setDictionaryFilteringSizeThreshold(param.dictSizeThreshold);
+ }
+ auto rowReader = reader->createRowReader(options);
+ auto batch = rowReader->createRowBatch(1000);
+ uint64_t rowsRead = 0;
+ while (rowReader->next(*batch)) {
+ rowsRead += batch->numElements;
+ }
+ EXPECT_EQ(rowsRead, param.expectedRowsRead);
+ }
+
+ // Test filtering with matching values
+ for (const auto& param : {Param{0, 10000}, Param{100, 10000}}) {
+ RowReaderOptions options;
+ options.searchArgument(
+ SearchArgumentFactory::newBuilder()
+              ->in("category", PredicateDataType::STRING, {Literal("A", 1), Literal("C", 1)})
+ .build());
+ if (param.dictSizeThreshold > 0) {
+ options.setDictionaryFilteringSizeThreshold(param.dictSizeThreshold);
+ }
+ auto rowReader = reader->createRowReader(options);
+ auto batch = rowReader->createRowBatch(1000);
+ uint64_t rowsRead = 0;
+ while (rowReader->next(*batch)) {
+ rowsRead += batch->numElements;
+ }
+ EXPECT_EQ(rowsRead, param.expectedRowsRead);
+ }
+ }
+
} // namespace orc
diff --git a/c++/test/TestSargsApplier.cc b/c++/test/TestSargsApplier.cc
index 7105c738e..d64d5863d 100644
--- a/c++/test/TestSargsApplier.cc
+++ b/c++/test/TestSargsApplier.cc
@@ -93,7 +93,7 @@ namespace orc {
// evaluate row group index
ReaderMetrics metrics;
SchemaEvolution se(nullptr, type.get());
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics, &se);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, &se);
EXPECT_TRUE(applier.pickRowGroups(4000, rowIndexes, {}));
const auto& nextSkippedRows = applier.getNextSkippedRows();
EXPECT_EQ(4, nextSkippedRows.size());
@@ -122,7 +122,7 @@ namespace orc {
*stripeStats.add_col_stats() = createIntStats(0L, 10L);
*stripeStats.add_col_stats() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_FALSE(applier.evaluateStripeStatistics(stripeStats, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -136,7 +136,7 @@ namespace orc {
*stripeStats.add_col_stats() = createIntStats(0L, 50L);
*stripeStats.add_col_stats() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_TRUE(applier.evaluateStripeStatistics(stripeStats, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -150,7 +150,7 @@ namespace orc {
*footer.add_statistics() = createIntStats(0L, 10L);
*footer.add_statistics() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_FALSE(applier.evaluateFileStatistics(footer, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -164,7 +164,7 @@ namespace orc {
*footer.add_statistics() = createIntStats(0L, 50L);
*footer.add_statistics() = createIntStats(0L, 30L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_FALSE(applier.evaluateFileStatistics(footer, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);
@@ -178,7 +178,7 @@ namespace orc {
*footer.add_statistics() = createIntStats(0L, 50L);
*footer.add_statistics() = createIntStats(0L, 50L);
ReaderMetrics metrics;
-    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, &metrics);
+    SargsApplier applier(*type, sarg.get(), 1000, WriterVersion_ORC_135, 0, &metrics, nullptr);
EXPECT_TRUE(applier.evaluateFileStatistics(footer, 1));
EXPECT_EQ(metrics.SelectedRowGroupCount.load(), 0);
EXPECT_EQ(metrics.EvaluatedRowGroupCount.load(), 1);