AntoinePrv commented on code in PR #47294: URL: https://github.com/apache/arrow/pull/47294#discussion_r2355358556
########## cpp/src/arrow/util/rle_encoding_test.cc: ########## @@ -602,20 +1038,126 @@ struct GetBatchSpacedTestCase { int bit_width; }; -TEST(RleDecoder, GetBatchSpaced) { - uint32_t kSeed = 1337; - ::arrow::random::RandomArrayGenerator rand(kSeed); - - std::vector<GetBatchSpacedTestCase<int32_t>> int32_cases{ - {1, 100000, 0.01, 1}, {1, 100000, 0.1, 1}, {1, 100000, 0.5, 1}, - {4, 100000, 0.05, 3}, {100, 100000, 0.05, 7}, +template <typename T> +void DoTestGetBatchSpacedRoundtrip() { + using Data = DataTestRleBitPacked<T>; + using ArrowType = typename Data::ArrowType; + using RandomPart = typename Data::RandomPart; + using NullPart = typename Data::NullPart; + using RepeatPart = typename Data::RepeatPart; + + std::vector<Data> test_cases = { + { + {RandomPart{/* max=*/1, /* size=*/400, /* null_proba= */ 0.1}}, + /* bit_width= */ 1, + }, + { + { + RandomPart{/* max=*/7, /* size=*/10037, /* null_proba= */ 0.0}, + NullPart{/* size= */ 1153}, + RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.5}, + }, + /* bit_width= */ 3, + }, + { + { + NullPart{/* size= */ 80}, + RandomPart{/* max=*/static_cast<T>(1023), /* size=*/800, + /* null_proba= */ 0.01}, + NullPart{/* size= */ 1023}, + }, + /* bit_width= */ 11, + }, + { + {RepeatPart{/* value=*/13, /* size=*/100000, /* null_proba= */ 0.01}}, + /* bit_width= */ 10, + }, + { + { + NullPart{/* size= */ 1024}, + RepeatPart{/* value=*/static_cast<T>(10000), /* size=*/100000, + /* null_proba= */ 0.1}, + NullPart{/* size= */ 77}, + }, + /* bit_width= */ 23, + }, + { + { + RepeatPart{/* value=*/13, /* size=*/100000, /* null_proba= */ 0.0}, + NullPart{/* size= */ 1153}, + RepeatPart{/* value=*/72, /* size=*/100799, /* null_proba= */ 0.5}, + }, + /* bit_width= */ 10, + }, + { + { + RandomPart{/* max=*/1, /* size=*/1013, /* null_proba= */ 0.01}, + NullPart{/* size=*/8}, + RepeatPart{1, /* size= */ 256, /* null_proba= */ 0.1}, + NullPart{/* size=*/128}, + RepeatPart{0, /* size= */ 256, /* null_proba= */ 0.0}, + NullPart{/* 
size=*/15}, + RandomPart{/* max=*/1, /* size=*/8 * 1024, /* null_proba= */ 0.01}, + }, + /* bit_width= */ 1, + }, }; - for (auto case_ : int32_cases) { - auto arr = rand.Int32(case_.size, /*min=*/0, case_.max_value, case_.null_probability); - CheckRoundTripSpaced<Int32Type>(*arr, case_.bit_width); - CheckRoundTripSpaced<Int32Type>(*arr->Slice(1), case_.bit_width); + + ::arrow::random::RandomArrayGenerator rand(/* seed= */ 12); + // FRAGILE: we create a dictionary large enough so that any encoded value from the + // previous test cases can be used as an index in the dictionary. + // Its size must be increased accordingly if larger values are encoded in the test + // cases. + auto dict = std::static_pointer_cast<arrow::FloatArray>(rand.Float32(20000, -1.0, 1.0)); + + // Number of bits available in T to write a positive integer. + constexpr int kBitsAvailable = 8 * sizeof(T) - (std::is_signed_v<T> ? 1 : 0); Review Comment: No, for a given bit_width, we test with all possible integer types that can contain it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org