lidavidm commented on a change in pull request #9715:
URL: https://github.com/apache/arrow/pull/9715#discussion_r596087724
##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -558,5 +584,248 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(std::shared_ptr<DataType> t
return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish();
}
+namespace {
+template <typename T>
+typename T::c_type GetMetadata(const KeyValueMetadata* metadata, const std::string& key,
+ typename T::c_type default_value) {
+ if (!metadata) return default_value;
+ const auto index = metadata->FindKey(key);
+ if (index < 0) return default_value;
+ const auto& value = metadata->value(index);
+ typename T::c_type output{};
+ auto type = checked_pointer_cast<T>(TypeTraits<T>::type_singleton());
+ if (!internal::ParseValue(*type, value.data(), value.length(), &output)) {
+ ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value));
+ }
+ return output;
+}
+
+Result<std::shared_ptr<Array>> GenerateArray(const Field& field, int64_t length,
+ RandomArrayGenerator* generator) {
+#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \
+ case VIEW_TYPE::type_id: { \
+ const BASE_TYPE::c_type min_value = GetMetadata<BASE_TYPE>( \
+ field.metadata().get(), "min", std::numeric_limits<BASE_TYPE::c_type>::min()); \
+ const BASE_TYPE::c_type max_value = GetMetadata<BASE_TYPE>( \
+ field.metadata().get(), "max", std::numeric_limits<BASE_TYPE::c_type>::max()); \
+ return generator->Numeric<BASE_TYPE>(length, min_value, max_value, null_probability) \
+ ->View(field.type()); \
+ }
+#define GENERATE_INTEGRAL_CASE(ARROW_TYPE) \
+ GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE)
+#define GENERATE_FLOATING_CASE(ARROW_TYPE, GENERATOR_FUNC) \
+ case ARROW_TYPE::type_id: { \
+ const ARROW_TYPE::c_type min_value = GetMetadata<ARROW_TYPE>( \
+ field.metadata().get(), "min", std::numeric_limits<ARROW_TYPE::c_type>::min()); \
+ const ARROW_TYPE::c_type max_value = GetMetadata<ARROW_TYPE>( \
+ field.metadata().get(), "max", std::numeric_limits<ARROW_TYPE::c_type>::max()); \
+ const double nan_probability = \
+ GetMetadata<DoubleType>(field.metadata().get(), "nan_probability", 0); \
+ return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \
+ nan_probability); \
+ }
+
+ const double null_probability =
+ field.nullable()
+ ? GetMetadata<DoubleType>(field.metadata().get(), "null_probability", 0.01)
+ : 0.0;
+ switch (field.type()->id()) {
+ case Type::type::NA:
+ return std::make_shared<NullArray>(length);
+
+ case Type::type::BOOL: {
+ const double true_probability =
+ GetMetadata<DoubleType>(field.metadata().get(), "true_probability", 0.5);
+ return generator->Boolean(length, true_probability, null_probability);
+ }
+
+ GENERATE_INTEGRAL_CASE(UInt8Type);
+ GENERATE_INTEGRAL_CASE(Int8Type);
+ GENERATE_INTEGRAL_CASE(UInt16Type);
+ GENERATE_INTEGRAL_CASE(Int16Type);
+ GENERATE_INTEGRAL_CASE(UInt32Type);
+ GENERATE_INTEGRAL_CASE(Int32Type);
+ GENERATE_INTEGRAL_CASE(UInt64Type);
+ GENERATE_INTEGRAL_CASE(Int64Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType);
+ GENERATE_FLOATING_CASE(FloatType, Float32);
+ GENERATE_FLOATING_CASE(DoubleType, Float64);
+
+ case Type::type::STRING:
+ case Type::type::BINARY: {
+ const int32_t min_length = GetMetadata<Int32Type>(field.metadata().get(), "min", 0);
+ const int32_t max_length =
+ GetMetadata<Int32Type>(field.metadata().get(), "max", 1024);
+ const int32_t unique_values =
+ GetMetadata<Int32Type>(field.metadata().get(), "unique", -1);
+ if (unique_values > 0) {
+ return generator
+ ->StringWithRepeats(length, unique_values, min_length, max_length,
+ null_probability)
+ ->View(field.type());
+ }
+ return generator->String(length, min_length, max_length, null_probability)
+ ->View(field.type());
+ }
+
+ case Type::type::DECIMAL128:
+ case Type::type::DECIMAL256:
+ case Type::type::FIXED_SIZE_BINARY: {
+ auto byte_width =
+ internal::checked_pointer_cast<FixedSizeBinaryType>(field.type())->byte_width();
+ return generator->FixedSizeBinary(length, byte_width, null_probability)
+ ->View(field.type());
+ }
+
+ GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Date32Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Date64Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, TimestampType);
+ GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Time32Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Time64Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int32Type, MonthIntervalType);
+
+ // This isn't as flexible as it could be, but the array-of-structs layout of this
+ // type means it's not a (useful) composition of other generators
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType);
+
+ case Type::type::LIST: {
+ const int32_t values_length = GetMetadata<Int32Type>(
+ field.metadata().get(), "values", static_cast<int32_t>(length));
Review comment:
I was mostly mirroring the existing generator functions which presented
different APIs, but I'll see if I can't consolidate the cases further for
list/string and their large variants. (And maybe then map can delegate to the
list case.)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]