adamreeve commented on code in PR #48546:
URL: https://github.com/apache/arrow/pull/48546#discussion_r2629163652
##########
cpp/src/parquet/arrow/generate_fuzz_corpus.cc:
##########
@@ -369,6 +378,100 @@ Result<std::vector<Column>> ExampleColumns(int32_t
length, double null_probabili
return columns;
}
+template <typename T>
+constexpr auto kMin = std::numeric_limits<T>::lowest();
+template <typename T>
+constexpr auto kMax = std::numeric_limits<T>::max();
+
+// Generate columns for physical types along with their supported encodings
+Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings(
+ int32_t length, double null_probability = 0.2) {
+ const EncodingVector kIntEncodings = {Encoding::PLAIN,
Encoding::RLE_DICTIONARY,
+ Encoding::DELTA_BINARY_PACKED,
+ Encoding::BYTE_STREAM_SPLIT};
+ const EncodingVector kFloatEncodings = {Encoding::PLAIN,
Encoding::RLE_DICTIONARY,
+ Encoding::BYTE_STREAM_SPLIT};
+ const EncodingVector kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
+ const EncodingVector kByteArrayEncodings = {Encoding::PLAIN,
Encoding::RLE_DICTIONARY,
+
Encoding::DELTA_LENGTH_BYTE_ARRAY,
+ Encoding::DELTA_BYTE_ARRAY};
+ const EncodingVector kFixedLenByteArrayEncodings = {
+ Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BYTE_ARRAY,
+ Encoding::BYTE_STREAM_SPLIT};
+
+ std::vector<ColumnWithEncodings> columns;
+
+ random::RandomArrayGenerator gen(42);
+ auto name_gen = Column::NameGenerator();
+
+ for (const double true_probability : {0.0, 0.001, 0.01, 0.5, 0.999}) {
+ columns.push_back(
+ {{name_gen(), gen.Boolean(length, true_probability, null_probability)},
+ kBooleanEncodings});
+ }
+
+ columns.push_back(
+ {{name_gen(), gen.Int32(length, -100, 100, null_probability)},
kIntEncodings});
+ columns.push_back(
+ {{name_gen(), gen.Int32(length, kMin<int32_t>, kMax<int32_t>,
null_probability)},
+ kIntEncodings});
+ columns.push_back({{name_gen(), gen.Int64(length, -100'000, 100'000,
null_probability)},
+ kIntEncodings});
+ columns.push_back(
+ {{name_gen(), gen.Int64(length, kMin<int64_t>, kMax<int64_t>,
null_probability)},
+ kIntEncodings});
+
+ // XXX should we add INT96? It's deprecated, only supports PLAIN and is
featured in
+ // the parquet-testing files.
Review Comment:
I guess it depends on the reason for having the fuzz tests. If the goal is
to find bugs that could be security risks, then it probably does make sense to
include INT96, unless there are plans to completely remove support for it any
time soon.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]