This is an automated email from the ASF dual-hosted git repository. kakachen pushed a commit to branch cq_test in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
commit c5c6731dc246a4dc579ced4fe89efde3952696f6 Author: kakachen <[email protected]> AuthorDate: Thu Oct 16 11:33:07 2025 +0800 Init commit. --- c++/include/orc/Reader.hh | 22 +++++++++++++++++++ c++/src/ColumnReader.cc | 8 +++++-- c++/src/Options.hh | 15 +++++++++++++ c++/src/Reader.cc | 55 +++++++++++++++++++++++++++++++++++++++++------ c++/src/Reader.hh | 2 +- c++/src/TypeImpl.cc | 1 + 6 files changed, 94 insertions(+), 9 deletions(-) diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 96a431faae8..1fa3b9bf733 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -206,6 +206,18 @@ namespace orc { */ RowReaderOptions& filter(const std::list<std::string>& filterColNames); + /** + * Selects which type ids to filter. The root type is always 0 and the + * rest of the types are labeled in a preorder traversal of the tree. + * The parent types are automatically selected, but the children are not. + * + * This option clears any previous setting of the filter columns or + * types. + * @param types a list of the type ids to filter + * @return this + */ + RowReaderOptions& filterTypes(const std::list<uint64_t>& types); + /** * A map type of <typeId, ReadIntent>. */ @@ -308,6 +320,16 @@ namespace orc { */ const std::list<std::string>& getFilterColNames() const; + /** + * Were the filter type ids set? + */ + bool getFilterTypeIdsSet() const; + + /** + * Get the list of filter type ids. + */ + const std::list<uint64_t>& getFilterTypeIds() const; + /** * Get the start of the range for the data being processed. * @return if not set, return 0 diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index 4dde99917c0..bc1179ff13f 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -1158,9 +1158,13 @@ namespace orc { } uint64_t StructColumnReader::skip(uint64_t numValues, const ReadPhase& readPhase) { - if (readPhase.contains(this->type.getReaderCategory())) { - numValues = ColumnReader::skip(numValues, readPhase); + if (!readPhase.contains(this->type.getReaderCategory())) { + return 0; } + // if (readPhase.contains(this->type.getReaderCategory())) { + // numValues = ColumnReader::skip(numValues, readPhase); + // } + numValues = ColumnReader::skip(numValues, readPhase); for (auto& ptr : children) { if (shouldProcessChild(ptr->getType().getReaderCategory(), readPhase)) { ptr->skip(numValues, readPhase); diff --git a/c++/src/Options.hh b/c++/src/Options.hh index 40a583e9397..5cd2b49ffac 100644 --- a/c++/src/Options.hh +++ b/c++/src/Options.hh @@ -238,6 +238,13 @@ namespace orc { return *this; } + RowReaderOptions& RowReaderOptions::filterTypes(const std::list<uint64_t>& types) { + privateBits->filter = ColumnFilter_TYPE_IDS; + privateBits->filterColumnIndexes.assign(types.begin(), types.end()); + privateBits->filterColumnNames.clear(); + return *this; + } + RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { privateBits->dataStart = offset; privateBits->dataLength = length; @@ -268,6 +275,14 @@ namespace orc { return privateBits->filterColumnNames; } + bool RowReaderOptions::getFilterTypeIdsSet() const { + return privateBits->filter == ColumnFilter_TYPE_IDS; + } + + const std::list<uint64_t>& RowReaderOptions::getFilterTypeIds() const { + return privateBits->filterColumnIndexes; + } + uint64_t RowReaderOptions::getOffset() const { return privateBits->dataStart; } diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 82cd3e0f0d7..673b668b1df 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -337,9 +337,6 @@ namespace orc { while (current != nullptr) { if (current->getSubtypeCount() == 0) { current->setReaderCategory(ReaderCategory::FILTER_CHILD); - } else if (current->getKind() == TypeKind::LIST - || current->getKind() == TypeKind::MAP) { - current->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT); } else { current->setReaderCategory(ReaderCategory::FILTER_PARENT); } @@ -358,11 +355,57 @@ namespace orc { if (child->getSubtypeCount() == 0) { // Leaf node (no children) child->setReaderCategory(ReaderCategory::FILTER_CHILD); - } else if (child->getKind() == TypeKind::LIST - || child->getKind() == TypeKind::MAP) { - child->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT); + } else { + // Non-leaf node (has children) + child->setReaderCategory(ReaderCategory::FILTER_PARENT); // Recursively process its children processChildren(child); + } + } + }; + processChildren(type); + } + + startReadPhase = ReadPhase::LEADERS; + readerContext = std::unique_ptr<ReaderContext>(new ReaderContext()); + readerContext->setFilterCallback(std::move(filterColIds), filter); + } else if (opts.getFilterTypeIdsSet()) { + // Handle filter by type IDs + const std::list<uint64_t>& filterTypeIds = opts.getFilterTypeIds(); + + for (const auto& typeId : filterTypeIds) { + if (typeId >= idTypeMap.size()) { + std::stringstream buffer; + buffer << "Invalid type id for filter " << typeId << " out of " << idTypeMap.size(); + throw ParseError(buffer.str()); + } + + Type* type = idTypeMap[typeId]; + + // Process current node and all its parent nodes + // Set FILTER_CHILD for leaf nodes and FILTER_PARENT for non-leaf nodes + Type* current = type; + while (current != nullptr) { + if (current->getSubtypeCount() == 0) { + current->setReaderCategory(ReaderCategory::FILTER_CHILD); + } else { + current->setReaderCategory(ReaderCategory::FILTER_PARENT); + } + filterColIds.emplace(current->getColumnId()); + current = current->getParent(); + } + + // Process all child nodes of the current node + // For child nodes: set FILTER_CHILD if it's a leaf, FILTER_PARENT if it has children + std::function<void(Type*)> processChildren = [&processChildren](Type* node) { + if (node == nullptr) return; + + // Iterate through all child nodes + for (int i = 0; i < node->getSubtypeCount(); ++i) { + Type* child = node->getSubtype(i); + if (child->getSubtypeCount() == 0) { + // Leaf node (no children) + child->setReaderCategory(ReaderCategory::FILTER_CHILD); } else { // Non-leaf node (has children) child->setReaderCategory(ReaderCategory::FILTER_PARENT); diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 1fd429be86a..05990851a74 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -212,7 +212,7 @@ namespace orc { ReadPhase startReadPhase; bool needsFollowColumnsRead; - std::map<uint64_t, const Type*> idTypeMap; + std::map<uint64_t, Type*> idTypeMap; std::map<std::string, Type*> nameTypeMap; std::vector<std::string> columns; diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index a85e32bcd1f..a418fa94f24 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -591,6 +591,7 @@ namespace orc { throw NotImplementedYet("Unknown type kind"); } result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); + result->setReaderCategory(fileType->getReaderCategory()); for (auto& key : fileType->getAttributeKeys()) { const auto& value = fileType->getAttributeValue(key); result->setAttribute(key, value); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
