This is an automated email from the ASF dual-hosted git repository.
kakachen pushed a commit to branch orc
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/orc by this push:
new e32697077ca [feature] Changes to optimize complex type column reading
with column pruning. (#369)
e32697077ca is described below
commit e32697077ca674d30e201cc339aeb50870522144
Author: Qi Chen <[email protected]>
AuthorDate: Sun Nov 23 00:20:26 2025 +0800
[feature] Changes to optimize complex type column reading with column
pruning. (#369)
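1. Modify RowReaderOptions::includeTypes() so that it no longer
automatically selects parent nodes (parents are still selected when a
read-intent map is supplied).
2. Add RowReaderOptions::filterTypes().
3. Fix ORC struct reader late materialization in StructColumnReader::skip():
it now returns 0 without skipping children when the struct's reader
category is not part of the current read phase.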
---
c++/include/orc/Reader.hh | 22 +++++++++++++-
c++/src/ColumnReader.cc | 5 +--
c++/src/Options.hh | 15 +++++++++
c++/src/Reader.cc | 77 +++++++++++++++++++++++++++++++++++++++++++++--
c++/src/Reader.hh | 2 +-
c++/src/TypeImpl.cc | 1 +
6 files changed, 116 insertions(+), 6 deletions(-)
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 96a431faae8..1ab60b1d2c5 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -178,7 +178,6 @@ namespace orc {
/**
* Selects which type ids to read. The root type is always 0 and the
* rest of the types are labeled in a preorder traversal of the tree.
- * The parent types are automatically selected, but the children are not.
*
* This option clears any previous setting of the selected columns or
* types.
@@ -206,6 +205,17 @@ namespace orc {
*/
RowReaderOptions& filter(const std::list<std::string>& filterColNames);
+ /**
+ * Selects which type ids to filter. The root type is always 0 and the
+ * rest of the types are labeled in a preorder traversal of the tree.
+ *
+ * This option clears any previous setting of the filter columns or
+ * types.
+ * @param types a list of the type ids to filter
+ * @return this
+ */
+ RowReaderOptions& filterTypes(const std::list<uint64_t>& types);
+
/**
* A map type of <typeId, ReadIntent>.
*/
@@ -308,6 +318,16 @@ namespace orc {
*/
const std::list<std::string>& getFilterColNames() const;
+ /**
+ * Were the filter type ids set?
+ */
+ bool getFilterTypeIdsSet() const;
+
+ /**
+ * Get the list of filter type ids.
+ */
+ const std::list<uint64_t>& getFilterTypeIds() const;
+
/**
* Get the start of the range for the data being processed.
* @return if not set, return 0
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 4dde99917c0..c8b03fc435e 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -1158,9 +1158,10 @@ namespace orc {
}
uint64_t StructColumnReader::skip(uint64_t numValues, const ReadPhase&
readPhase) {
- if (readPhase.contains(this->type.getReaderCategory())) {
- numValues = ColumnReader::skip(numValues, readPhase);
+ if (!readPhase.contains(this->type.getReaderCategory())) {
+ return 0;
}
+ numValues = ColumnReader::skip(numValues, readPhase);
for (auto& ptr : children) {
if (shouldProcessChild(ptr->getType().getReaderCategory(), readPhase)) {
ptr->skip(numValues, readPhase);
diff --git a/c++/src/Options.hh b/c++/src/Options.hh
index 40a583e9397..5cd2b49ffac 100644
--- a/c++/src/Options.hh
+++ b/c++/src/Options.hh
@@ -238,6 +238,13 @@ namespace orc {
return *this;
}
+ RowReaderOptions& RowReaderOptions::filterTypes(const std::list<uint64_t>&
types) {
+ privateBits->filter = ColumnFilter_TYPE_IDS;
+ privateBits->filterColumnIndexes.assign(types.begin(), types.end());
+ privateBits->filterColumnNames.clear();
+ return *this;
+ }
+
RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
privateBits->dataStart = offset;
privateBits->dataLength = length;
@@ -268,6 +275,14 @@ namespace orc {
return privateBits->filterColumnNames;
}
+ bool RowReaderOptions::getFilterTypeIdsSet() const {
+ return privateBits->filter == ColumnFilter_TYPE_IDS;
+ }
+
+ const std::list<uint64_t>& RowReaderOptions::getFilterTypeIds() const {
+ return privateBits->filterColumnIndexes;
+ }
+
uint64_t RowReaderOptions::getOffset() const {
return privateBits->dataStart;
}
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 82cd3e0f0d7..d5f83fc3590 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -176,22 +176,36 @@ namespace orc {
field != options.getInclude().end(); ++field) {
updateSelectedByFieldId(selectedColumns, *field);
}
+ selectParents(selectedColumns, *contents->schema.get());
} else if (contents->schema->getKind() == STRUCT && options.getNamesSet())
{
for (std::list<std::string>::const_iterator field =
options.getIncludeNames().begin();
field != options.getIncludeNames().end(); ++field) {
updateSelectedByName(selectedColumns, *field);
}
+ selectParents(selectedColumns, *contents->schema.get());
} else if (options.getTypeIdsSet()) {
const RowReaderOptions::IdReadIntentMap idReadIntentMap =
options.getIdReadIntentMap();
for (std::list<uint64_t>::const_iterator typeId =
options.getInclude().begin();
typeId != options.getInclude().end(); ++typeId) {
- updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
+ if (!idReadIntentMap.empty()) {
+ updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
+ selectParents(selectedColumns, *contents->schema.get());
+ } else {
+ if (*typeId < selectedColumns.size()) {
+ // Only select the specified type ID, do not automatically select
children or parents
+ selectedColumns[*typeId] = true;
+ } else {
+ std::stringstream buffer;
+ buffer << "Invalid type id selected " << *typeId << " out of " <<
selectedColumns.size();
+ throw ParseError(buffer.str());
+ }
+ }
}
} else {
// default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+ selectParents(selectedColumns, *contents->schema.get());
}
- selectParents(selectedColumns, *contents->schema.get());
selectedColumns[0] = true; // column 0 is selected by default
}
@@ -374,6 +388,65 @@ namespace orc {
processChildren(type);
}
+ startReadPhase = ReadPhase::LEADERS;
+ readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
+ readerContext->setFilterCallback(std::move(filterColIds), filter);
+ } else if (opts.getFilterTypeIdsSet()) {
+ // Handle filter by type IDs
+ const std::list<uint64_t>& filterTypeIds = opts.getFilterTypeIds();
+
+ for (const auto& typeId : filterTypeIds) {
+ if (typeId >= idTypeMap.size()) {
+ std::stringstream buffer;
+ buffer << "Invalid type id for filter " << typeId << " out of " <<
idTypeMap.size();
+ throw ParseError(buffer.str());
+ }
+
+ Type* type = idTypeMap[typeId];
+
+ // Process current node and all its parent nodes
+ // Set FILTER_CHILD for leaf nodes and FILTER_PARENT for non-leaf nodes
+ Type* current = type;
+ while (current != nullptr) {
+ if (current->getSubtypeCount() == 0) {
+ current->setReaderCategory(ReaderCategory::FILTER_CHILD);
+ } else if (current->getKind() == TypeKind::LIST
+ || current->getKind() == TypeKind::MAP) {
+
current->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
+ } else {
+ current->setReaderCategory(ReaderCategory::FILTER_PARENT);
+ }
+ filterColIds.emplace(current->getColumnId());
+ current = current->getParent();
+ }
+
+ // Process all child nodes of the current node
+ // For child nodes: set FILTER_CHILD if it's a leaf, FILTER_PARENT if
it has children
+ std::function<void(Type*)> processChildren = [&processChildren](Type*
node) {
+ if (node == nullptr) return;
+
+ // Iterate through all child nodes
+ for (int i = 0; i < node->getSubtypeCount(); ++i) {
+ Type* child = node->getSubtype(i);
+ if (child->getSubtypeCount() == 0) {
+ // Leaf node (no children)
+ child->setReaderCategory(ReaderCategory::FILTER_CHILD);
+ } else if (child->getKind() == TypeKind::LIST
+ || child->getKind() == TypeKind::MAP) {
+
child->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
+ // Recursively process its children
+ processChildren(child);
+ } else {
+ // Non-leaf node (has children)
+ child->setReaderCategory(ReaderCategory::FILTER_PARENT);
+ // Recursively process its children
+ processChildren(child);
+ }
+ }
+ };
+ processChildren(type);
+ }
+
startReadPhase = ReadPhase::LEADERS;
readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
readerContext->setFilterCallback(std::move(filterColIds), filter);
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 1fd429be86a..05990851a74 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -212,7 +212,7 @@ namespace orc {
ReadPhase startReadPhase;
bool needsFollowColumnsRead;
- std::map<uint64_t, const Type*> idTypeMap;
+ std::map<uint64_t, Type*> idTypeMap;
std::map<std::string, Type*> nameTypeMap;
std::vector<std::string> columns;
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index a85e32bcd1f..a418fa94f24 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -591,6 +591,7 @@ namespace orc {
throw NotImplementedYet("Unknown type kind");
}
result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
+ result->setReaderCategory(fileType->getReaderCategory());
for (auto& key : fileType->getAttributeKeys()) {
const auto& value = fileType->getAttributeValue(key);
result->setAttribute(key, value);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]