rizaon commented on a change in pull request #990:
URL: https://github.com/apache/orc/pull/990#discussion_r780619426
##########
File path: c++/include/orc/Common.hh
##########
@@ -122,6 +122,11 @@ namespace orc {
StreamKind_BLOOM_FILTER_UTF8 = 8
};
+ enum ArrayReadIntent {
Review comment:
Renamed back to `ReadIntent`, gave description comments, and TODO for
MAP and UNION type.
##########
File path: c++/include/orc/Reader.hh
##########
@@ -149,6 +149,26 @@ namespace orc {
*/
RowReaderOptions& includeTypes(const std::list<uint64_t>& types);
+ /**
+ * A map of <typeId, ArrayReadIntent> used as a parameter in
+ * RowReaderOptions::includeTypesWithIntents.
Review comment:
Deleted in latest commit.
##########
File path: c++/include/orc/Reader.hh
##########
@@ -149,6 +149,26 @@ namespace orc {
*/
RowReaderOptions& includeTypes(const std::list<uint64_t>& types);
+ /**
+ * A map of <typeId, ArrayReadIntent> used as a parameter in
+ * RowReaderOptions::includeTypesWithIntents.
+ */
+ typedef std::map<uint64_t, ArrayReadIntent> TypeReadIntents;
+
+ /**
+ * Selects which type ids to read and specific ArrayReadIntents for each
+ * type id. The root type is always 0 and the rest of the types are
+ * labeled in a preorder traversal of the tree. The parent types are
+ * automatically selected, but the children are not.
+ *
+ * This option clears any previous setting of the selected columns or
+ * types.
+ * @param typesWithIntents a map of <typeId, ArrayReadIntent>.
Review comment:
Deleted in latest commit.
##########
File path: c++/include/orc/Reader.hh
##########
@@ -266,6 +286,12 @@ namespace orc {
* Get desired timezone to return data of timestamp type
*/
const std::string& getTimezoneName() const;
+
+ /**
+ * Get the <typeId, ArrayReadIntent> map that was supplied through
Review comment:
Deleted in latest commit.
##########
File path: c++/include/orc/Reader.hh
##########
@@ -565,6 +591,15 @@ namespace orc {
*/
virtual void seekToRow(uint64_t rowNumber) = 0;
+ /**
+ * Get an ArrayReadIntent for a given typeId.
+ * @param typeId the type id to look up.
+ * @return an ArrayReadIntent that was specified for given typeId through
+ * RowReaderOptions::includeTypesWithIntents. If no ArrayReadIntent was
+ * specified for typeId, return ArrayReadIntent_ALL.
+ */
+ virtual ArrayReadIntent getReadIntent(uint64_t typeId) const = 0;
Review comment:
Deleted in latest commit.
##########
File path: c++/src/ColumnReader.cc
##########
@@ -993,6 +993,9 @@ namespace orc {
void nextInternal(ColumnVectorBatch& rowBatch,
uint64_t numValues,
char *notNull);
+
+ void readArrayIndices(const int64_t *arrayOffsets, uint64_t arrayCount,
Review comment:
Deleted in latest commit.
##########
File path: c++/src/Reader.cc
##########
@@ -290,6 +291,15 @@ namespace orc {
return selectedColumns;
}
+ ArrayReadIntent RowReaderImpl::getReadIntent(uint64_t typeId) const {
+ auto elem = readIntents.find(typeId);
+ if (elem == readIntents.end()) {
+ return ArrayReadIntent_ALL;
+ } else {
+ return elem->second;
Review comment:
Created 4 new test:
- TestReadIntent.testListAll
- TestReadIntent.testListOffsets
- TestReadIntent.testListAllAndOffsets
- TestReadIntent.testListConflictingIntent
##########
File path: c++/src/ColumnReader.cc
##########
@@ -1006,9 +1009,12 @@ namespace orc {
if (stream == nullptr)
throw ParseError("LENGTH stream not found in List column");
rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
- const Type& childType = *type.getSubtype(0);
- if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
- child = buildReader(childType, stripe);
+ ArrayReadIntent intent = stripe.getReadIntent(columnId);
+ if (intent == ArrayReadIntent_ALL) {
+ const Type &childType = *type.getSubtype(0);
+ if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
+ child = buildReader(childType, stripe);
+ }
Review comment:
I replace this with 4 new tests at `c++/test/TestReader.cc`.
However, they stop at column selection verification and do not read a row
batch.
##########
File path: c++/src/Options.hh
##########
@@ -174,20 +175,36 @@ namespace orc {
privateBits->selection = ColumnSelection_FIELD_IDS;
privateBits->includedColumnIndexes.assign(include.begin(), include.end());
privateBits->includedColumnNames.clear();
+ privateBits->readIntents.clear();
return *this;
}
RowReaderOptions& RowReaderOptions::include(const std::list<std::string>&
include) {
privateBits->selection = ColumnSelection_NAMES;
privateBits->includedColumnNames.assign(include.begin(), include.end());
privateBits->includedColumnIndexes.clear();
+ privateBits->readIntents.clear();
return *this;
}
RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>&
types) {
privateBits->selection = ColumnSelection_TYPE_IDS;
privateBits->includedColumnIndexes.assign(types.begin(), types.end());
privateBits->includedColumnNames.clear();
+ privateBits->readIntents.clear();
+ return *this;
+ }
+
+ RowReaderOptions&
+ RowReaderOptions::includeTypesWithIntents(const TypeReadIntents&
typesAndIntents) {
+ privateBits->selection = ColumnSelection_TYPE_IDS;
+ privateBits->includedColumnIndexes.clear();
+ privateBits->readIntents.clear();
+ for (const auto& typeIntentPair : typesAndIntents ) {
+ privateBits->readIntents[typeIntentPair.first] = typeIntentPair.second;
Review comment:
It is now resolved by relying on `ColumnSelector::selectParents()` to
override the decision.
##########
File path: c++/src/Vector.cc
##########
@@ -242,7 +242,6 @@ namespace orc {
}
return false;
}
-
Review comment:
Reverted.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]