emkornfield commented on a change in pull request #8177:
URL: https://github.com/apache/arrow/pull/8177#discussion_r489170674
##########
File path: cpp/src/parquet/level_conversion_test.cc
##########
@@ -108,22 +114,245 @@ TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {
TEST(DefinitionLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) {
std::vector<uint8_t> validity_bitmap(/*count*/ 8, 0);
- int64_t null_count = 5;
- int64_t values_read = 1;
+ ValidityBitmapInputOutput io;
+ io.values_read_upper_bound = 64;
+ io.values_read = 1;
+ io.null_count = 5;
+ io.valid_bits = validity_bitmap.data();
+ io.valid_bits_offset = 1;
+
+ LevelInfo level_info;
+ level_info.repeated_ancestor_def_level = 1;
+ level_info.def_level = 2;
+ level_info.rep_level = 1;
// All zeros should be ignored, ones should be unset in the bitmp and 2
should be set.
std::vector<int16_t> def_levels = {0, 0, 0, 2, 2, 1, 0, 2};
- DefinitionLevelsToBitmap(
- def_levels.data(), def_levels.size(), /*max_definition_level=*/2,
- /*max_repetition_level=*/1, &values_read, &null_count,
validity_bitmap.data(),
- /*valid_bits_offset=*/1);
+ DefinitionLevelsToBitmap(def_levels.data(), def_levels.size(), level_info,
&io);
EXPECT_EQ(BitmapToString(validity_bitmap, /*bit_count=*/8), "01101000");
for (size_t x = 1; x < validity_bitmap.size(); x++) {
EXPECT_EQ(validity_bitmap[x], 0) << "index: " << x;
}
- EXPECT_EQ(null_count, /*5 + 1 =*/6);
- EXPECT_EQ(values_read, 4); // value should get overwritten.
+ EXPECT_EQ(io.null_count, /*5 + 1 =*/6);
+ EXPECT_EQ(io.values_read, 4); // value should get overwritten.
+}
+
+class MultiLevelTestData {
+ public:
+ // Triply nested list values borrow from write_path
+ // [null, [[1 , null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ std::vector<int16_t> def_levels_{2, 7, 6, 7, 5, 3, // first row
+ 5, 5, 7, 7, 2, 7, // second row
+ 0, // third row
+ 1};
+ std::vector<int16_t> rep_levels_{0, 1, 3, 3, 2, 1, // first row
+ 0, 1, 2, 3, 1, 1, // second row
+ 0, 0};
+};
+
+template <typename ConverterType>
+class NestedListTest : public testing::Test {
+ public:
+ MultiLevelTestData test_data_;
+ ConverterType converter_;
+};
+
+template <typename ListType>
+struct RepDefLevelConverter {
+ using ListLengthType = ListType;
+ ListLengthType* ComputeListInfo(const MultiLevelTestData& test_data,
+ LevelInfo level_info,
ValidityBitmapInputOutput* output,
+ ListType* lengths) {
+ ConvertDefRepLevelsToList(test_data.def_levels_.data(),
test_data.rep_levels_.data(),
+ test_data.def_levels_.size(), level_info,
output, lengths);
+ return lengths + output->values_read;
+ }
+};
+
+using ConverterTypes =
+ ::testing::Types<RepDefLevelConverter</*list_length_type=*/int32_t>,
+ RepDefLevelConverter</*list_length_type=*/int64_t>>;
+TYPED_TEST_CASE(NestedListTest, ConverterTypes);
+
+TYPED_TEST(NestedListTest, OuterMostTest) {
+ // [null, [[1 , null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ // -> 4 outer most lists (len(3), len(4), null, len(0))
+ LevelInfo level_info;
+ level_info.rep_level = 1;
+ level_info.def_level = 2;
+
+ std::vector<typename TypeParam::ListLengthType> lengths(5, 0);
+ uint64_t validity_output;
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = 4;
+ validity_io.valid_bits = reinterpret_cast<uint8_t*>(&validity_output);
+ typename TypeParam::ListLengthType* next_position =
this->converter_.ComputeListInfo(
+ this->test_data_, level_info, &validity_io, lengths.data());
+
+ EXPECT_THAT(next_position, lengths.data() + 4);
+ EXPECT_THAT(lengths, testing::ElementsAre(0, 3, 7, 7, 7));
+
+ EXPECT_EQ(validity_io.values_read, 4);
+ EXPECT_EQ(validity_io.null_count, 1);
+ EXPECT_EQ(BitmapToString(validity_io.valid_bits, /*length=*/4), "1101");
+}
+
+TYPED_TEST(NestedListTest, MiddleListTest) {
+ // [null, [[1 , null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ // -> middle lists (null, len(2), len(0),
+ // len(1), len(2), null, len(1),
+ // N/A,
+ // N/A
+ LevelInfo level_info;
+ level_info.rep_level = 2;
+ level_info.def_level = 4;
+ level_info.repeated_ancestor_def_level = 2;
+
+ std::vector<typename TypeParam::ListLengthType> lengths(8, 0);
+ uint64_t validity_output;
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = 7;
+ validity_io.valid_bits = reinterpret_cast<uint8_t*>(&validity_output);
+ typename TypeParam::ListLengthType* next_position =
this->converter_.ComputeListInfo(
+ this->test_data_, level_info, &validity_io, lengths.data());
+
+ EXPECT_THAT(next_position, lengths.data() + 7);
+ EXPECT_THAT(lengths, testing::ElementsAre(0, 0, 2, 2, 3, 5, 5, 6));
+
+ EXPECT_EQ(validity_io.values_read, 7);
+ EXPECT_EQ(validity_io.null_count, 2);
+ EXPECT_EQ(BitmapToString(validity_io.valid_bits, /*length=*/7), "0111101");
+}
+
+TYPED_TEST(NestedListTest, InnerMostListTest) {
+ // [null, [[1, null, 3], []], []],
+ // [[[]], [[], [1, 2]], null, [[3]]],
+ // null,
+ // []
+ // -> 4 inner lists (N/A, [len(3), len(0)], N/A
+ // len(0), [len(0), len(2)], N/A, len(1),
+ // N/A,
+ // N/A
+ LevelInfo level_info;
+ level_info.rep_level = 3;
+ level_info.def_level = 6;
+ level_info.repeated_ancestor_def_level = 4;
+
+ std::vector<typename TypeParam::ListLengthType> lengths(7, 0);
+ uint64_t validity_output;
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = 6;
+ validity_io.valid_bits = reinterpret_cast<uint8_t*>(&validity_output);
+ typename TypeParam::ListLengthType* next_position =
this->converter_.ComputeListInfo(
+ this->test_data_, level_info, &validity_io, lengths.data());
+
+ EXPECT_THAT(next_position, lengths.data() + 6);
+ EXPECT_THAT(lengths, testing::ElementsAre(0, 3, 3, 3, 3, 5, 6));
+
+ EXPECT_EQ(validity_io.values_read, 6);
+ EXPECT_EQ(validity_io.null_count, 0);
+ EXPECT_EQ(BitmapToString(validity_io.valid_bits, /*length=*/6), "111111");
+}
Review comment:
For this, I think if we have good coverage at the reconstruction level
it is likely better. I refactored the tests so if we do identify bugs in the
future it will be easy to add.
My main motivation here is I missed a couple of bugs by focusing heavily on
the analogous code and not end-to-end (this was partially hampered by not
having the read side done).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]