(doris-thirdparty) branch orc updated: [feature] Changes for optimize complex type column reading with column pruning. (#369)

kakachen Sat, 22 Nov 2025 08:20:40 -0800

This is an automated email from the ASF dual-hosted git repository.

kakachen pushed a commit to branch orc
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/orc by this push:
     new e32697077ca [feature] Changes for optimize complex type column reading 
with column pruning. (#369)
e32697077ca is described below

commit e32697077ca674d30e201cc339aeb50870522144
Author: Qi Chen <[email protected]>
AuthorDate: Sun Nov 23 00:20:26 2025 +0800

    [feature] Changes for optimize complex type column reading with column 
pruning. (#369)
    
    1. Modify the RowReaderOptions::includeTypes() function to prevent it from 
automatically selecting the parent node.
    2. Add RowReaderOptions::filterTypes().
    3. Fix orc struct reader late materialization in StructColumnReader::skip().
---
 c++/include/orc/Reader.hh | 22 +++++++++++++-
 c++/src/ColumnReader.cc   |  5 +--
 c++/src/Options.hh        | 15 +++++++++
 c++/src/Reader.cc         | 77 +++++++++++++++++++++++++++++++++++++++++++++--
 c++/src/Reader.hh         |  2 +-
 c++/src/TypeImpl.cc       |  1 +
 6 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 96a431faae8..1ab60b1d2c5 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -178,7 +178,6 @@ namespace orc {
     /**
      * Selects which type ids to read. The root type is always 0 and the
      * rest of the types are labeled in a preorder traversal of the tree.
-     * The parent types are automatically selected, but the children are not.
      *
      * This option clears any previous setting of the selected columns or
      * types.
@@ -206,6 +205,17 @@ namespace orc {
      */
     RowReaderOptions& filter(const std::list<std::string>& filterColNames);
 
+    /**
+     * Selects which type ids to filter. The root type is always 0 and the
+     * rest of the types are labeled in a preorder traversal of the tree.
+     *
+     * This option clears any previous setting of the filter columns or
+     * types.
+     * @param types a list of the type ids to filter
+     * @return this
+     */
+    RowReaderOptions& filterTypes(const std::list<uint64_t>& types);
+
     /**
      * A map type of <typeId, ReadIntent>.
      */
@@ -308,6 +318,16 @@ namespace orc {
      */
     const std::list<std::string>& getFilterColNames() const;
 
+    /**
+     * Were the filter type ids set?
+     */
+    bool getFilterTypeIdsSet() const;
+
+    /**
+     * Get the list of filter type ids.
+     */
+    const std::list<uint64_t>& getFilterTypeIds() const;
+
     /**
      * Get the start of the range for the data being processed.
      * @return if not set, return 0
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 4dde99917c0..c8b03fc435e 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -1158,9 +1158,10 @@ namespace orc {
   }
 
   uint64_t StructColumnReader::skip(uint64_t numValues, const ReadPhase& 
readPhase) {
-    if (readPhase.contains(this->type.getReaderCategory())) {
-      numValues = ColumnReader::skip(numValues, readPhase);
+    if (!readPhase.contains(this->type.getReaderCategory())) {
+      return 0;
     }
+    numValues = ColumnReader::skip(numValues, readPhase);
     for (auto& ptr : children) {
       if (shouldProcessChild(ptr->getType().getReaderCategory(), readPhase)) {
         ptr->skip(numValues, readPhase);
diff --git a/c++/src/Options.hh b/c++/src/Options.hh
index 40a583e9397..5cd2b49ffac 100644
--- a/c++/src/Options.hh
+++ b/c++/src/Options.hh
@@ -238,6 +238,13 @@ namespace orc {
     return *this;
   }
 
+  RowReaderOptions& RowReaderOptions::filterTypes(const std::list<uint64_t>& 
types) {
+    privateBits->filter = ColumnFilter_TYPE_IDS;
+    privateBits->filterColumnIndexes.assign(types.begin(), types.end());
+    privateBits->filterColumnNames.clear();
+    return *this;
+  }
+
   RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
     privateBits->dataStart = offset;
     privateBits->dataLength = length;
@@ -268,6 +275,14 @@ namespace orc {
     return privateBits->filterColumnNames;
   }
 
+  bool RowReaderOptions::getFilterTypeIdsSet() const {
+    return privateBits->filter == ColumnFilter_TYPE_IDS;
+  }
+
+  const std::list<uint64_t>& RowReaderOptions::getFilterTypeIds() const {
+    return privateBits->filterColumnIndexes;
+  }
+
   uint64_t RowReaderOptions::getOffset() const {
     return privateBits->dataStart;
   }
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 82cd3e0f0d7..d5f83fc3590 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -176,22 +176,36 @@ namespace orc {
            field != options.getInclude().end(); ++field) {
         updateSelectedByFieldId(selectedColumns, *field);
       }
+      selectParents(selectedColumns, *contents->schema.get());
     } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) 
{
       for (std::list<std::string>::const_iterator field = 
options.getIncludeNames().begin();
            field != options.getIncludeNames().end(); ++field) {
         updateSelectedByName(selectedColumns, *field);
       }
+      selectParents(selectedColumns, *contents->schema.get());
     } else if (options.getTypeIdsSet()) {
       const RowReaderOptions::IdReadIntentMap idReadIntentMap = 
options.getIdReadIntentMap();
       for (std::list<uint64_t>::const_iterator typeId = 
options.getInclude().begin();
            typeId != options.getInclude().end(); ++typeId) {
-        updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
+        if (!idReadIntentMap.empty()) {
+          updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
+          selectParents(selectedColumns, *contents->schema.get());
+        } else {
+          if (*typeId < selectedColumns.size()) {
+            // Only select the specified type ID, do not automatically select 
children or parents
+            selectedColumns[*typeId] = true;
+          } else {
+            std::stringstream buffer;
+            buffer << "Invalid type id selected " << *typeId << " out of " << 
selectedColumns.size();
+            throw ParseError(buffer.str());
+          }
+        }
       }
     } else {
       // default is to select all columns
       std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+      selectParents(selectedColumns, *contents->schema.get());
     }
-    selectParents(selectedColumns, *contents->schema.get());
     selectedColumns[0] = true;  // column 0 is selected by default
   }
 
@@ -374,6 +388,65 @@ namespace orc {
         processChildren(type);
       }
 
+      startReadPhase = ReadPhase::LEADERS;
+      readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
+      readerContext->setFilterCallback(std::move(filterColIds), filter);
+    } else if (opts.getFilterTypeIdsSet()) {
+      // Handle filter by type IDs
+      const std::list<uint64_t>& filterTypeIds = opts.getFilterTypeIds();
+
+      for (const auto& typeId : filterTypeIds) {
+        if (typeId >= idTypeMap.size()) {
+          std::stringstream buffer;
+          buffer << "Invalid type id for filter " << typeId << " out of " << 
idTypeMap.size();
+          throw ParseError(buffer.str());
+        }
+
+        Type* type = idTypeMap[typeId];
+
+        // Process current node and all its parent nodes
+        // Leaf -> FILTER_CHILD, LIST/MAP -> FILTER_COMPOUND_ELEMENT, else FILTER_PARENT
+        Type* current = type;
+        while (current != nullptr) {
+          if (current->getSubtypeCount() == 0) {
+            current->setReaderCategory(ReaderCategory::FILTER_CHILD);
+          } else if (current->getKind() == TypeKind::LIST
+                     || current->getKind() == TypeKind::MAP) {
+            
current->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
+          } else {
+            current->setReaderCategory(ReaderCategory::FILTER_PARENT);
+          }
+          filterColIds.emplace(current->getColumnId());
+          current = current->getParent();
+        }
+
+        // Process all child nodes of the current node
+        // Child nodes: leaf -> FILTER_CHILD, LIST/MAP -> FILTER_COMPOUND_ELEMENT, else FILTER_PARENT
+        std::function<void(Type*)> processChildren = [&processChildren](Type* 
node) {
+          if (node == nullptr) return;
+
+          // Iterate through all child nodes
+          for (int i = 0; i < node->getSubtypeCount(); ++i) {
+            Type* child = node->getSubtype(i);
+            if (child->getSubtypeCount() == 0) {
+              // Leaf node (no children)
+              child->setReaderCategory(ReaderCategory::FILTER_CHILD);
+            } else if (child->getKind() == TypeKind::LIST
+                       || child->getKind() == TypeKind::MAP) {
+              
child->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
+              // Recursively process its children
+              processChildren(child);
+            } else {
+              // Non-leaf node (has children)
+              child->setReaderCategory(ReaderCategory::FILTER_PARENT);
+              // Recursively process its children
+              processChildren(child);
+            }
+          }
+        };
+        processChildren(type);
+      }
+
       startReadPhase = ReadPhase::LEADERS;
       readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
       readerContext->setFilterCallback(std::move(filterColIds), filter);
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 1fd429be86a..05990851a74 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -212,7 +212,7 @@ namespace orc {
     ReadPhase startReadPhase;
     bool needsFollowColumnsRead;
 
-    std::map<uint64_t, const Type*> idTypeMap;
+    std::map<uint64_t, Type*> idTypeMap;
     std::map<std::string, Type*> nameTypeMap;
     std::vector<std::string> columns;
 
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index a85e32bcd1f..a418fa94f24 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -591,6 +591,7 @@ namespace orc {
         throw NotImplementedYet("Unknown type kind");
     }
     result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
+    result->setReaderCategory(fileType->getReaderCategory());
     for (auto& key : fileType->getAttributeKeys()) {
       const auto& value = fileType->getAttributeValue(key);
       result->setAttribute(key, value);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris-thirdparty) branch orc updated: [feature] Changes for optimize complex type column reading with column pruning. (#369)

Reply via email to