mathyingzhou commented on a change in pull request #8648: URL: https://github.com/apache/arrow/pull/8648#discussion_r612990896
########## File path: cpp/src/arrow/adapters/orc/adapter_test.cc ########## @@ -58,6 +82,182 @@ class MemoryOutputStream : public liborc::OutputStream { uint64_t length_, natural_write_size_; }; +std::shared_ptr<Buffer> GenerateFixedDifferenceBuffer(int32_t fixed_length, + int64_t length) { + BufferBuilder builder; + int32_t offsets[length]; + ARROW_EXPECT_OK(builder.Resize(4 * length)); + for (int32_t i = 0; i < length; i++) { + offsets[i] = fixed_length * i; + } + ARROW_EXPECT_OK(builder.Append(offsets, 4 * length)); + std::shared_ptr<Buffer> buffer; + ARROW_EXPECT_OK(builder.Finish(&buffer)); + return buffer; +} + +std::shared_ptr<Array> CastFixedSizeBinaryArrayToBinaryArray( + std::shared_ptr<Array> array) { + auto fixed_size_binary_array = std::static_pointer_cast<FixedSizeBinaryArray>(array); + std::shared_ptr<Buffer> value_offsets = GenerateFixedDifferenceBuffer( + fixed_size_binary_array->byte_width(), array->length() + 1); + return std::make_shared<BinaryArray>(array->length(), value_offsets, + array->data()->buffers[1], + array->data()->buffers[0]); +} + +template <typename TargetArrayType> +std::shared_ptr<Array> CastInt64ArrayToTemporalArray( + const std::shared_ptr<DataType>& type, std::shared_ptr<Array> array) { + std::shared_ptr<ArrayData> new_array_data = + ArrayData::Make(type, array->length(), array->data()->buffers); + return std::make_shared<TargetArrayType>(new_array_data); +} + +template <typename T, typename U> +void randintpartition(int64_t n, T sum, std::vector<U>* out) { + const int random_seed = 0; + std::default_random_engine gen(random_seed); + out->resize(n, static_cast<T>(0)); + T remaining_sum = sum; + std::generate(out->begin(), out->end() - 1, [&gen, &remaining_sum] { + std::uniform_int_distribution<T> d(static_cast<T>(0), remaining_sum); + auto res = d(gen); + remaining_sum -= res; + return static_cast<U>(res); + }); + (*out)[n - 1] += remaining_sum; + std::random_shuffle(out->begin(), out->end()); +} + 
+Result<std::shared_ptr<Array>> GenerateRandomDate64Array(int64_t size, + double null_probability) { + arrow::random::RandomArrayGenerator rand(kRandomSeed); + return CastInt64ArrayToTemporalArray<Date64Array>( + date64(), rand.Int64(size, kMilliMin, kMilliMax, null_probability)); +} + +Result<std::shared_ptr<Array>> GenerateRandomTimestampArray(int64_t size, Review comment: @emkornfield @lidavidm Since I rely on the fact that real DATE64 and TIMESTAMP values (with a UNIT other than NANO) can be cast to TIMESTAMP (using NANO) without overflowing int64_t (because ORC essentially only supports NANO; see TimestampVectorBatch in https://orc.apache.org/docs/core-cpp.html), I don't think `arrow::random::RandomArrayGenerator.ArrayOf` can be used. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org