pitrou commented on a change in pull request #11864:
URL: https://github.com/apache/arrow/pull/11864#discussion_r773118529
##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -420,6 +420,30 @@ struct ARROW_EXPORT Utf8NormalizeOptions : public
FunctionOptions {
Form form;
};
+class ARROW_EXPORT RandomOptions : public FunctionOptions {
+ public:
+ enum Initializer { SystemRandom, Seed };
+
+ static RandomOptions FromSystemRandom(int64_t length) {
+ return RandomOptions{length, SystemRandom, 0};
+ }
+ static RandomOptions FromSeed(int64_t length, uint32_t seed) {
+ return RandomOptions{length, Seed, seed};
+ }
+
+ RandomOptions(int64_t length, Initializer initializer, uint32_t seed);
+ RandomOptions();
+ constexpr static char const kTypeName[] = "RandomOptions";
+ static RandomOptions Defaults() { return RandomOptions(); }
+
+ /// The length of the array returned. Negative is invalid.
+ int64_t length;
+ /// The type of initialization for random number generation - system or
provided seed.
+ Initializer initializer;
+ /// The seed value used to initialize the random number generation.
+ uint32_t seed;
Review comment:
Do we want to make this `uint64_t` to allow passing more entropy? Or
would the higher bits be ignored?
##########
File path: cpp/src/arrow/compute/kernels/scalar_random_test.cc
##########
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/api.h"
+#include "arrow/compute/kernels/test_util.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+namespace compute {
+
+namespace {
+
+void TestRandomWithOptions(const RandomOptions& random_options) {
+ ASSERT_OK_AND_ASSIGN(Datum result, CallFunction("random", {},
&random_options));
+ const auto result_array = result.make_array();
+ ValidateOutput(*result_array);
+ ASSERT_EQ(result_array->length(), random_options.length);
+ ASSERT_EQ(result_array->null_count(), 0);
+ AssertTypeEqual(result_array->type(), float64());
+
+ if (random_options.length > 0) {
+ // verify E(X), E(X^2) is near theory
+ double sum = 0, square_sum = 0;
+ const double* values = result_array->data()->GetValues<double>(1);
+ for (int64_t i = 0; i < random_options.length; ++i) {
+ const double value = values[i];
+ ASSERT_GE(value, 0);
+ ASSERT_LT(value, 1);
+ sum += value;
+ square_sum += value * value;
+ }
+ const double E_X = 0.5;
+ const double E_X2 = 1.0 / 12 + E_X * E_X;
+ ASSERT_NEAR(sum / random_options.length, E_X, E_X * 0.02);
+ ASSERT_NEAR(square_sum / random_options.length, E_X2, E_X2 * 0.02);
+ }
+}
+
+} // namespace
+
+TEST(TestRandom, Seed) {
+ const int kCount = 100000;
+ auto random_options = RandomOptions::FromSeed(/*length=*/kCount, /*seed=*/0);
+ TestRandomWithOptions(random_options);
+}
+
+TEST(TestRandom, SystemRandom) {
+ const int kCount = 100000;
+ auto random_options = RandomOptions::FromSystemRandom(/*length=*/kCount);
+ TestRandomWithOptions(random_options);
+}
Review comment:
Can you add a test that calling SystemRandom several times (including
from different threads) produces different results?
##########
File path: cpp/src/arrow/compute/kernels/scalar_random_test.cc
##########
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/api.h"
+#include "arrow/compute/kernels/test_util.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+namespace compute {
+
+namespace {
+
+void TestRandomWithOptions(const RandomOptions& random_options) {
+ ASSERT_OK_AND_ASSIGN(Datum result, CallFunction("random", {},
&random_options));
+ const auto result_array = result.make_array();
+ ValidateOutput(*result_array);
+ ASSERT_EQ(result_array->length(), random_options.length);
+ ASSERT_EQ(result_array->null_count(), 0);
+ AssertTypeEqual(result_array->type(), float64());
+
+ if (random_options.length > 0) {
+ // verify E(X), E(X^2) is near theory
Review comment:
Neat!
##########
File path: cpp/src/arrow/compute/kernels/scalar_random.cc
##########
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <mutex>
+#include <random>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
+#include "arrow/util/pcg_random.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+// Generates a random floating point number in range [0, 1).
+double generate_uniform(random::pcg64_fast& rng) {
+ // This equation is copied from numpy. It calculates `rng() / 2^64` and
+ // the return value is strictly less than 1.
+ static_assert(random::pcg64_fast::min() == 0ULL, "");
+ static_assert(random::pcg64_fast::max() == ~0ULL, "");
+ return (rng() >> 11) * (1.0 / 9007199254740992.0);
+}
+
+using RandomState = OptionsWrapper<RandomOptions>;
+
+Status ExecRandom(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static random::pcg64_fast seed_gen((std::random_device{})());
Review comment:
It seems this seeds with a generator with a single 32-bit number.
Perhaps we can change this to seed with an appropriate number of bits.
Apparently the PCG library can help with this, see the mention of
`seed_seq_from` under https://www.pcg-random.org/using-pcg-cpp.html#id2
##########
File path: cpp/src/arrow/compute/kernels/scalar_random.cc
##########
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <mutex>
+#include <random>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
+#include "arrow/util/pcg_random.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+// Generates a random floating point number in range [0, 1).
+double generate_uniform(random::pcg64_fast& rng) {
Review comment:
Nit: as per our coding guidelines, when an argument is mutable, pass it
by pointer rather than by reference (i.e. `random::pcg64_fast* rng`).
##########
File path: cpp/src/arrow/compute/kernels/scalar_random.cc
##########
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <mutex>
+#include <random>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
+#include "arrow/util/pcg_random.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+// Generates a random floating point number in range [0, 1).
+double generate_uniform(random::pcg64_fast& rng) {
+ // This equation is copied from numpy. It calculates `rng() / 2^64` and
+ // the return value is strictly less than 1.
+ static_assert(random::pcg64_fast::min() == 0ULL, "");
+ static_assert(random::pcg64_fast::max() == ~0ULL, "");
+ return (rng() >> 11) * (1.0 / 9007199254740992.0);
+}
+
+using RandomState = OptionsWrapper<RandomOptions>;
+
+Status ExecRandom(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static random::pcg64_fast seed_gen((std::random_device{})());
+ static std::mutex seed_gen_mutex;
+
+ random::pcg64_fast gen;
+ const RandomOptions& options = RandomState::Get(ctx);
+ if (options.length < 0) {
+ return Status::Invalid("Negative number of elements");
+ }
+
+ auto out_data = ArrayData::Make(float64(), options.length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(options.length * sizeof(double)));
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+
+ if (options.initializer == RandomOptions::Seed) {
+ gen.seed(options.seed);
+ } else {
+ std::lock_guard<std::mutex> seed_gen_lock(seed_gen_mutex);
+ gen.seed(seed_gen());
+ }
+ for (int64_t i = 0; i < options.length; ++i) {
+ out_buffer[i] = generate_uniform(gen);
+ }
+ *out = std::move(out_data);
+ return Status::OK();
+}
+
+const FunctionDoc random_doc{
+ "Generates a number in range [0, 1)",
Review comment:
```suggestion
"Generate numbers in the range [0, 1)",
```
##########
File path: cpp/src/arrow/compute/kernels/scalar_random.cc
##########
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <mutex>
+#include <random>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
+#include "arrow/util/pcg_random.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+// Generates a random floating point number in range [0, 1).
+double generate_uniform(random::pcg64_fast& rng) {
+ // This equation is copied from numpy. It calculates `rng() / 2^64` and
+ // the return value is strictly less than 1.
+ static_assert(random::pcg64_fast::min() == 0ULL, "");
+ static_assert(random::pcg64_fast::max() == ~0ULL, "");
+ return (rng() >> 11) * (1.0 / 9007199254740992.0);
+}
+
+using RandomState = OptionsWrapper<RandomOptions>;
+
+Status ExecRandom(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static random::pcg64_fast seed_gen((std::random_device{})());
+ static std::mutex seed_gen_mutex;
+
+ random::pcg64_fast gen;
+ const RandomOptions& options = RandomState::Get(ctx);
+ if (options.length < 0) {
+ return Status::Invalid("Negative number of elements");
+ }
+
+ auto out_data = ArrayData::Make(float64(), options.length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(options.length * sizeof(double)));
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+
+ if (options.initializer == RandomOptions::Seed) {
+ gen.seed(options.seed);
+ } else {
+ std::lock_guard<std::mutex> seed_gen_lock(seed_gen_mutex);
+ gen.seed(seed_gen());
+ }
+ for (int64_t i = 0; i < options.length; ++i) {
+ out_buffer[i] = generate_uniform(gen);
+ }
+ *out = std::move(out_data);
+ return Status::OK();
+}
+
+const FunctionDoc random_doc{
+ "Generates a number in range [0, 1)",
Review comment:
(note we use infinitives, not present tense in doc summaries)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]