This is an automated email from the ASF dual-hosted git repository.

wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 679d58fa6d970065f6c656ffcfd504794c2be516
Author: Daniel Becker <[email protected]>
AuthorDate: Thu Jun 22 18:05:19 2023 +0200

    IMPALA-12238: RandomNestedDataGenerator should take a seed argument
    
    RandomNestedDataGenerator can be used to produce parquet files with
    random data from Avro schemas. This change makes it possible to provide
    a seed value for the random generator so the generated files are
    reproducible. The seed can be given as the last (optional) command line
    argument. It is parsed as a Java 'long'.
    
    Testing:
     - manually verified that when run with the same arguments (including
       the seed), the data generator produces the same results
    
    Change-Id: Iee33604bbfe12895100afbd0f98ac302dee9a238
    Reviewed-on: http://gerrit.cloudera.org:8080/20136
    Reviewed-by: Csaba Ringhofer <[email protected]>
    Tested-by: Daniel Becker <[email protected]>
---
 .../datagenerator/RandomNestedDataGenerator.java    | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java b/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java
index 9a3cb7894..fc68847c4 100644
--- a/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java
+++ b/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java
@@ -24,6 +24,7 @@ import java.lang.StringBuilder;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.HashMap;
+import java.util.Optional;
 import java.util.Random;
 import java.util.Date;
 
@@ -54,9 +55,10 @@ public class RandomNestedDataGenerator {
   public static final Double CHANCE_UNIQUE = 0.02;
 
   private static void generateDataToFile(
-      String schemaFile, int targetNumElements, String outputFile) throws IOException {
+      String schemaFile, int targetNumElements, String outputFile, Optional<Long> seed)
+      throws IOException {
     buildCache();
-    rand = new Random();
+    rand = seed.isPresent() ? new Random(seed.get()) : new Random();
     Schema schema = new Schema.Parser().parse(new File(schemaFile));
     Configuration conf = new Configuration();
     conf.set("parquet.avro.write-old-list-structure", "false");
@@ -251,8 +253,10 @@ public class RandomNestedDataGenerator {
   }
 
   public static void main(String[] args) throws Exception {
-    if (args.length != 4) {
-      System.err.println("Arguments: schema_file num_elements list_len output_file");
+    final int num_args = args.length;
+    if (num_args < 4 || num_args > 5) {
+      System.err.println(
+          "Arguments: schema_file num_elements list_len output_file [random_seed]");
       System.exit(1);
     }
     String schemaFile = args[0];
@@ -260,6 +264,13 @@ public class RandomNestedDataGenerator {
     numListItems = Integer.valueOf(args[2]);
     String outputFile = args[3];
 
-    generateDataToFile(schemaFile, numElements, outputFile);
+    Optional<Long> seed;
+    if (num_args > 4) {
+      seed = Optional.of(Long.valueOf(args[4]));
+    } else {
+      seed = Optional.empty();
+    }
+
+    generateDataToFile(schemaFile, numElements, outputFile, seed);
   }
 }

Reply via email to