rizaon commented on a change in pull request #990:
URL: https://github.com/apache/orc/pull/990#discussion_r783509278
##########
File path: c++/test/TestReader.cc
##########
@@ -137,4 +139,89 @@ namespace orc {
CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11");
CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++");
}
+
+ /**
+ * Read complextypes_iceberg.orc and verify the resolved selections.
+ *
+ * The ORC file has the following schema:
+ * struct<
+ * id:bigint,
+ * int_array:array<int>,
+ * int_array_array:array<array<int>>,
+ * int_map:map<string,int>,
+ * int_map_array:array<map<string,int>>,
+ * nested_struct:struct<
+ * a:int,
+ * b:array<int>,
+ * c:struct<
+ * d:array<array<struct<
+ * e:int,
+ * f:string
+ * >>>
+ * >,
+ * g:map<string,struct<
+ * h:struct<
+ * i:array<double>
+ * >
+ * >>
+ * >
+ * >
+ *
+ * @param readIntents TypeReadIntents describing the section.
Review comment:
I believe this has been fixed in commit
[11d66a2](https://github.com/apache/orc/pull/990/commits/11d66a2035f1d91346e1e44986e7ffd4e20e5b0f)
##########
File path: c++/test/TestReader.cc
##########
@@ -137,4 +139,89 @@ namespace orc {
CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11");
CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++");
}
+
+ /**
+ * Read complextypes_iceberg.orc and verify the resolved selections.
+ *
+ * The ORC file has the following schema:
+ * struct<
+ * id:bigint,
+ * int_array:array<int>,
+ * int_array_array:array<array<int>>,
+ * int_map:map<string,int>,
+ * int_map_array:array<map<string,int>>,
+ * nested_struct:struct<
+ * a:int,
+ * b:array<int>,
+ * c:struct<
+ * d:array<array<struct<
+ * e:int,
+ * f:string
+ * >>>
+ * >,
+ * g:map<string,struct<
+ * h:struct<
+ * i:array<double>
+ * >
+ * >>
+ * >
+ * >
+ *
+ * @param readIntents TypeReadIntents describing the section.
+ * @param expectedSelection expected TypeIds that will be selected from given
+ * readIntents.
+ */
+ void verifySelection(const RowReaderOptions::TypeReadIntents &readIntents,
+ const std::vector<uint32_t> &expectedSelection) {
+ std::string fileName = "complextypes_iceberg.orc";
+ std::stringstream ss;
+ if (const char* example_dir = std::getenv("ORC_EXAMPLE_DIR")) {
+ ss << example_dir;
+ } else {
+ ss << "../../../examples";
+ }
+ ss << "/" << fileName;
+ ReaderOptions readerOpts;
+ std::unique_ptr<Reader> reader =
+ createReader(readLocalFile(ss.str().c_str()), readerOpts);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(readIntents);
+ std::unique_ptr<RowReader> rowReader =
+ reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
+ false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testListAll) {
+ // select all of int_array_array.
+ verifySelection({{4, ReadIntent_ALL}}, {0, 4, 5, 6});
+ }
+
+ TEST(TestReadIntent, testListOffsets) {
+ // select only the offsets of int_array_array.
+ verifySelection({{4, ReadIntent_OFFSETS}}, {0, 4});
Review comment:
OK. I will add such a test in my next commit.
##########
File path: c++/test/TestReader.cc
##########
@@ -137,4 +139,89 @@ namespace orc {
CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11");
CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++");
}
+
+ /**
+ * Read complextypes_iceberg.orc and verify the resolved selections.
+ *
+ * The ORC file has the following schema:
+ * struct<
+ * id:bigint,
+ * int_array:array<int>,
+ * int_array_array:array<array<int>>,
+ * int_map:map<string,int>,
+ * int_map_array:array<map<string,int>>,
+ * nested_struct:struct<
+ * a:int,
+ * b:array<int>,
+ * c:struct<
+ * d:array<array<struct<
+ * e:int,
+ * f:string
+ * >>>
+ * >,
+ * g:map<string,struct<
+ * h:struct<
+ * i:array<double>
+ * >
+ * >>
+ * >
+ * >
+ *
+ * @param readIntents TypeReadIntents describing the section.
+ * @param expectedSelection expected TypeIds that will be selected from given
+ * readIntents.
+ */
+ void verifySelection(const RowReaderOptions::TypeReadIntents &readIntents,
+ const std::vector<uint32_t> &expectedSelection) {
+ std::string fileName = "complextypes_iceberg.orc";
+ std::stringstream ss;
+ if (const char* example_dir = std::getenv("ORC_EXAMPLE_DIR")) {
+ ss << example_dir;
+ } else {
+ ss << "../../../examples";
+ }
+ ss << "/" << fileName;
+ ReaderOptions readerOpts;
+ std::unique_ptr<Reader> reader =
+ createReader(readLocalFile(ss.str().c_str()), readerOpts);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(readIntents);
+ std::unique_ptr<RowReader> rowReader =
+ reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
+ false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testListAll) {
+ // select all of int_array_array.
+ verifySelection({{4, ReadIntent_ALL}}, {0, 4, 5, 6});
+ }
+
+ TEST(TestReadIntent, testListOffsets) {
+ // select only the offsets of int_array_array.
+ verifySelection({{4, ReadIntent_OFFSETS}}, {0, 4});
+
+ // select only the offsets of int_array_array.item.
+ verifySelection({{4, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}},
+ {0, 4, 5});
Review comment:
complextypes_iceberg.orc is the most complex ORC file in the example dir
that I can find. I can't find any that has 3 nested arrays.
Would you like me to create a new ORC file? Is there any tool that I can use?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]