emkornfield commented on a change in pull request #8177:
URL: https://github.com/apache/arrow/pull/8177#discussion_r488815645
##########
File path: cpp/src/parquet/level_conversion_test.cc
##########
@@ -108,22 +114,245 @@ TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {
TEST(DefinitionLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) {
std::vector<uint8_t> validity_bitmap(/*count*/ 8, 0);
- int64_t null_count = 5;
- int64_t values_read = 1;
+ ValidityBitmapInputOutput io;
+ io.values_read_upper_bound = 64;
+ io.values_read = 1;
+ io.null_count = 5;
+ io.valid_bits = validity_bitmap.data();
+ io.valid_bits_offset = 1;
+
+ LevelInfo level_info;
+ level_info.repeated_ancestor_def_level = 1;
+ level_info.def_level = 2;
+ level_info.rep_level = 1;
// All zeros should be ignored, ones should be unset in the bitmp and 2
should be set.
std::vector<int16_t> def_levels = {0, 0, 0, 2, 2, 1, 0, 2};
- DefinitionLevelsToBitmap(
- def_levels.data(), def_levels.size(), /*max_definition_level=*/2,
- /*max_repetition_level=*/1, &values_read, &null_count,
validity_bitmap.data(),
- /*valid_bits_offset=*/1);
+ DefinitionLevelsToBitmap(def_levels.data(), def_levels.size(), level_info,
&io);
EXPECT_EQ(BitmapToString(validity_bitmap, /*bit_count=*/8), "01101000");
for (size_t x = 1; x < validity_bitmap.size(); x++) {
EXPECT_EQ(validity_bitmap[x], 0) << "index: " << x;
}
- EXPECT_EQ(null_count, /*5 + 1 =*/6);
- EXPECT_EQ(values_read, 4); // value should get overwritten.
+ EXPECT_EQ(io.null_count, /*5 + 1 =*/6);
+ EXPECT_EQ(io.values_read, 4); // value should get overwritten.
+}
+
+class MultiLevelTestData {
+ public:
+ // Triply nested list values borrow from write_path
+ // [null, [[1 , null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ std::vector<int16_t> def_levels_{2, 7, 6, 7, 5, 3, // first row
+ 5, 5, 7, 7, 2, 7, // second row
+ 0, // third row
+ 1};
+ std::vector<int16_t> rep_levels_{0, 1, 3, 3, 2, 1, // first row
+ 0, 1, 2, 3, 1, 1, // second row
+ 0, 0};
+};
+
+template <typename ConverterType>
+class NestedListTest : public testing::Test {
+ public:
+ MultiLevelTestData test_data_;
+ ConverterType converter_;
+};
+
+template <typename ListType>
+struct RepDefLevelConverter {
+ using ListLengthType = ListType;
+ ListLengthType* ComputeListInfo(const MultiLevelTestData& test_data,
+ LevelInfo level_info,
ValidityBitmapInputOutput* output,
+ ListType* lengths) {
+ ConvertDefRepLevelsToList(test_data.def_levels_.data(),
test_data.rep_levels_.data(),
+ test_data.def_levels_.size(), level_info,
output, lengths);
+ return lengths + output->values_read;
+ }
+};
+
+using ConverterTypes =
+ ::testing::Types<RepDefLevelConverter</*list_length_type=*/int32_t>,
+ RepDefLevelConverter</*list_length_type=*/int64_t>>;
+TYPED_TEST_CASE(NestedListTest, ConverterTypes);
+
+TYPED_TEST(NestedListTest, OuterMostTest) {
+ // [null, [[1 , null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ // -> 4 outer most lists (len(3), len(4), null, len(0))
+ LevelInfo level_info;
+ level_info.rep_level = 1;
+ level_info.def_level = 2;
+
+ std::vector<typename TypeParam::ListLengthType> lengths(5, 0);
+ uint64_t validity_output;
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = 4;
+ validity_io.valid_bits = reinterpret_cast<uint8_t*>(&validity_output);
+ typename TypeParam::ListLengthType* next_position =
this->converter_.ComputeListInfo(
+ this->test_data_, level_info, &validity_io, lengths.data());
+
+ EXPECT_THAT(next_position, lengths.data() + 4);
+ EXPECT_THAT(lengths, testing::ElementsAre(0, 3, 7, 7, 7));
+
+ EXPECT_EQ(validity_io.values_read, 4);
+ EXPECT_EQ(validity_io.null_count, 1);
+ EXPECT_EQ(BitmapToString(validity_io.valid_bits, /*length=*/4), "1101");
+}
+
+TYPED_TEST(NestedListTest, MiddleListTest) {
+ // [null, [[1 , null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ // -> middle lists (null, len(2), len(0),
+ // len(1), len(2), null, len(1),
+ // N/A,
+ // N/A
+ LevelInfo level_info;
+ level_info.rep_level = 2;
+ level_info.def_level = 4;
+ level_info.repeated_ancestor_def_level = 2;
+
+ std::vector<typename TypeParam::ListLengthType> lengths(8, 0);
+ uint64_t validity_output;
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = 7;
+ validity_io.valid_bits = reinterpret_cast<uint8_t*>(&validity_output);
+ typename TypeParam::ListLengthType* next_position =
this->converter_.ComputeListInfo(
+ this->test_data_, level_info, &validity_io, lengths.data());
+
+ EXPECT_THAT(next_position, lengths.data() + 7);
+ EXPECT_THAT(lengths, testing::ElementsAre(0, 0, 2, 2, 3, 5, 5, 6));
+
+ EXPECT_EQ(validity_io.values_read, 7);
+ EXPECT_EQ(validity_io.null_count, 2);
+ EXPECT_EQ(BitmapToString(validity_io.valid_bits, /*length=*/7), "0111101");
+}
+
+TYPED_TEST(NestedListTest, InnerMostListTest) {
+ // [null, [[1, null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ // -> 4 inner lists (N/A, [len(3), len(0)], N/A
+ // len(0), [len(0), len(2)], N/A, len(1),
+ // N/A,
+ // N/A
+ LevelInfo level_info;
+ level_info.rep_level = 3;
+ level_info.def_level = 6;
+ level_info.repeated_ancestor_def_level = 4;
+
+ std::vector<typename TypeParam::ListLengthType> lengths(7, 0);
+ uint64_t validity_output;
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = 6;
+ validity_io.valid_bits = reinterpret_cast<uint8_t*>(&validity_output);
+ typename TypeParam::ListLengthType* next_position =
this->converter_.ComputeListInfo(
+ this->test_data_, level_info, &validity_io, lengths.data());
+
+ EXPECT_THAT(next_position, lengths.data() + 6);
+ EXPECT_THAT(lengths, testing::ElementsAre(0, 3, 3, 3, 3, 5, 6));
+
+ EXPECT_EQ(validity_io.values_read, 6);
+ EXPECT_EQ(validity_io.null_count, 0);
+ EXPECT_EQ(BitmapToString(validity_io.valid_bits, /*length=*/6), "111111");
+}
Review comment:
sorry, I didn't respond directly here. I'll refactor the tests tonight.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]