vinothchandar commented on a change in pull request #623: Hudi Test Suite
URL: https://github.com/apache/incubator-hudi/pull/623#discussion_r276877085
 
 

 ##########
 File path: 
hoodie-bench/src/main/java/com/uber/hoodie/integrationsuite/generator/GenericRecordFullPayloadGenerator.java
 ##########
 @@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2019 Uber Technologies, Inc. ([email protected])
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ */
+
+package com.uber.hoodie.integrationsuite.generator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.uber.hoodie.common.util.collection.Pair;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.UUID;
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+/**
+ * This is a GenericRecord payload generator that generates full generic 
records {@link GenericRecord}.
+ * Every field of a generic record created using this generator contains a 
random value.
+ */
+public class GenericRecordFullPayloadGenerator implements Serializable {
+
+  public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 1024; // 1 MB
+  private static final int DEFAULT_ENTRIES_FOR_COLLECTIONS = 10;
+  private static Logger log = 
LogManager.getLogger(GenericRecordFullPayloadGenerator.class);
+  // NOTE(review): unseeded Random — generated payloads differ run to run; confirm determinism is not required for the test suite.
+  protected final Random random = new Random();
+  // The source schema used to generate a payload
+  // NOTE(review): transient fields in a Serializable class are null after deserialization; confirm instances are never serialized and then reused.
+  private final transient Schema baseSchema;
+  // Used to validate a generic record
+  private final transient GenericData genericData = new GenericData();
+  // Number of more bytes to add based on the estimated full record payload 
size and min payload size
+  private int numberOfBytesToAdd;
+  // If more elements should be packed to meet the minPayloadSize
+  private boolean shouldAddMore;
+  // How many complex fields have we visited that can help us pack more 
entries and increase the size of the record
+  private int numberOfComplexFields;
+  // The size of a full record where every field of a generic record created 
contains 1 random value
+  private int estimatedFullPayloadSize;
+
+  /**
+   * Creates a generator targeting the default minimum payload size of 1 MB.
+   *
+   * @param schema the Avro schema every generated record conforms to
+   */
+  public GenericRecordFullPayloadGenerator(Schema schema) {
+    this(schema, DEFAULT_PAYLOAD_SIZE);
+  }
+
+  /**
+   * Creates a generator targeting a minimum serialized payload size.
+   *
+   * @param schema the Avro schema every generated record conforms to
+   * @param minPayloadSize desired minimum payload size in bytes; when the estimated
+   *        full-record size falls short, extra bytes are packed into complex
+   *        (collection) fields, so a schema without any complex fields cannot
+   *        reach the requested minimum (a warning is logged in that case)
+   */
+  public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) {
+    // Estimate the size of one fully-populated record and count the complex fields
+    // that can absorb extra entries for padding.
+    Pair<Integer, Integer> sizeInfo = new 
GenericRecordFullPayloadSizeEstimator(schema)
+        .typeEstimateAndNumComplexFields();
+    this.estimatedFullPayloadSize = sizeInfo.getLeft();
+    this.numberOfComplexFields = sizeInfo.getRight();
+    this.baseSchema = schema;
+    // Pad only when a single full record is estimated to be smaller than the minimum.
+    this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize;
+    if (this.shouldAddMore) {
+      this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize;
+      if (numberOfComplexFields < 1) {
+        log.warn("The schema does not have any collections/complex fields. 
Cannot achieve minPayloadSize => "
+            + minPayloadSize);
+      }
+    }
+  }
+
+  /**
+   * Returns whether the given schema is treated as a primitive by this generator,
+   * i.e. it is not an ARRAY, MAP, RECORD or UNION.
+   *
+   * <p>NOTE(review): ENUM and FIXED also fall on the "primitive" side of this
+   * check — confirm that is intended by the value-generation code.
+   *
+   * @param localSchema the Avro schema to classify
+   * @return true when the schema's type is none of ARRAY, MAP, RECORD, UNION
+   */
+  protected static boolean isPrimitive(Schema localSchema) {
+    // Return the condition directly instead of an if/else that returns true/false.
+    return localSchema.getType() != Type.ARRAY
+        && localSchema.getType() != Type.MAP
+        && localSchema.getType() != Type.RECORD
+        && localSchema.getType() != Type.UNION;
+  }
+
+  public GenericRecord getNewPayload() {
+    return convert(baseSchema);
 
 Review comment:
   how fast is this generation? I tried to use an external avro data generator 
before and it was just too painfully slow that I had to write a simplified one.
   
   Just confirming: this generation is done, persisted to DFS/Kafka, and the 
delta streamer then reads from that sink again? 
   My concern is that if we inline this generation with the Hudi DAG and the 
conversion is slow, it will look like the ingestion/writing itself is slow.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to