benibus commented on code in PR #14552:
URL: https://github.com/apache/arrow/pull/14552#discussion_r1012095512


##########
cpp/src/arrow/json/test_common.h:
##########
@@ -133,31 +150,72 @@ struct GenerateImpl {
     return Status::NotImplemented("random generation of arrays of type ", t);
   }
 
+  static bool GetBool(Engine& e, double probability) {
+    return std::uniform_real_distribution<>{0, 1}(e) < probability;
+  }
+
+  static bool MaybeWriteNull(Engine& e, Writer* writer, double probability) {
+    bool outcome = GetBool(e, probability);
+    if (outcome) writer->Null();
+    return outcome;
+  }
+
+  static Status WriteValue(const DataType& type, Engine& e, Writer* writer,
+                           const GenerateOptions& options) {
+    GenerateImpl visitor = {e, *writer, options};
+    return VisitTypeInline(type, &visitor);
+  }
+
   Engine& e;
   rj::Writer<rj::StringBuffer>& writer;
+  const GenerateOptions& options;
 };
 
 template <typename Engine>
 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
-                              Writer* writer) {
-  if (std::uniform_real_distribution<>{0, 1}(e) < .2) {
-    // one out of 5 chance of null, anywhere
-    writer->Null();
+                              Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+  if (Impl::MaybeWriteNull(e, writer, options.null_probability)) {
     return Status::OK();
   }
-  GenerateImpl<Engine> visitor = {e, *writer};
-  return VisitTypeInline(*type, &visitor);
+  return Impl::WriteValue(*type, e, writer, options);
 }
 
 template <typename Engine>
 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& 
fields,
-                              Engine& e, Writer* writer) {
+                              Engine& e, Writer* writer, const 
GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+
   RETURN_NOT_OK(OK(writer->StartObject()));
-  for (const auto& f : fields) {
-    writer->Key(f->name().c_str());
-    RETURN_NOT_OK(Generate(f->type(), e, writer));
+  if (fields.empty()) {
+    return OK(writer->EndObject(0));
+  }
+
+  // Indices of the fields we plan to actually write, in order
+  std::vector<size_t> indices;
+  indices.reserve(static_cast<size_t>(fields.size() * 
options.field_probability));
+
+  for (size_t i = 0; i < fields.size(); ++i) {
+    if (Impl::GetBool(e, options.field_probability)) {
+      indices.push_back(i);
+    }
+  }
+  if (options.randomize_field_order) {
+    std::shuffle(indices.begin(), indices.end(), e);
   }
-  return OK(writer->EndObject(static_cast<int>(fields.size())));
+
+  // Scale the likelyhood of null values to account for any fields we've 
dropped
+  double null_probability = options.null_probability;
+  null_probability *= static_cast<double>(indices.size()) / fields.size();

Review Comment:
   Yeah... that calculation is egregiously wrong, actually. `null_probability` 
doesn't need to be adjusted at all. Fortunately, that simplifies everything 
else.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to