rdblue commented on a change in pull request #828:
URL: https://github.com/apache/iceberg/pull/828#discussion_r434860692



##########
File path: spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
##########
@@ -442,4 +474,226 @@ private static BigInteger randomUnscaled(int precision, 
Random random) {
 
     return new BigInteger(sb.toString());
   }
+
+  private static class DictionaryEncodedDataGenerator extends 
RandomDataGenerator {
+
+    private DictionaryEncodedDataGenerator(Schema schema, long seed) {
+      super(schema, seed);
+    }
+
+    @Override
+    public Object primitive(Type.PrimitiveType primitive) {
+      Object result = generateDictionaryEncodablePrimitive(primitive, random);
+      return super.getPrimitive(primitive, result);
+    }
+
+    @SuppressWarnings("checkstyle:CyclomaticComplexity")
+    private static Object 
generateDictionaryEncodablePrimitive(Type.PrimitiveType primitive, Random 
random) {
+      // 3 choices
+      int choice = random.nextInt(3);
+      switch (primitive.typeId()) {
+        case BOOLEAN:
+          return true; // doesn't really matter for booleans since they are 
not dictionary encoded
+
+        case INTEGER:
+          switch (choice) {
+            case 0:
+              return 0;
+            case 1:
+              return 1;
+            case 2:
+              return 2;
+          }
+
+        case LONG:
+          switch (choice) {
+            case 0:
+              return 0L;
+            case 1:
+              return 1L;
+            case 2:
+              return 2L;
+          }
+
+        case FLOAT:
+          switch (choice) {
+            case 0:
+              return 0.0f;
+            case 1:
+              return 1.0f;
+            case 2:
+              return 2.0f;
+          }
+
+        case DOUBLE:
+          switch (choice) {
+            case 0:
+              return 0.0d;
+            case 1:
+              return 1.0d;
+            case 2:
+              return 2.0d;
+          }
+
+        case DATE:
+          switch (choice) {
+            case 0:
+              return 0;
+            case 1:
+              return 1;
+            case 2:
+              return 2;
+          }
+
+        case TIME:
+          switch (choice) {
+            case 0:
+              return 0L;
+            case 1:
+              return 1L;
+            case 2:
+              return 2L;
+          }
+
+        case TIMESTAMP:
+          switch (choice) {
+            case 0:
+              return 0L;
+            case 1:
+              return 1L;
+            case 2:
+              return 2L;
+          }
+
+        case STRING:
+          switch (choice) {
+            case 0:
+              return UTF8String.fromString("0");
+            case 1:
+              return UTF8String.fromString("1");
+            case 2:
+              return UTF8String.fromString("2");
+          }
+
+        case FIXED:
+          byte[] fixed = new byte[((Types.FixedType) primitive).length()];
+          switch (choice) {
+            case 0:
+              fixed[0] = 0;
+              return fixed;
+            case 1:
+              fixed[0] = 1;
+              return fixed;
+            case 2:
+              fixed[0] = 2;
+              return fixed;
+          }
+
+        case BINARY:
+          byte[] binary = new byte[4];
+          switch (choice) {
+            case 0:
+              binary[0] = 0;
+              return binary;
+            case 1:
+              binary[0] = 1;
+              return binary;
+            case 2:
+              binary[0] = 2;
+              return binary;
+          }
+
+        case DECIMAL:
+          Types.DecimalType type = (Types.DecimalType) primitive;
+          switch (choice) {
+            case 0:
+              BigInteger unscaled = new BigInteger("1");
+              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+            case 1:
+              unscaled = new BigInteger("2");
+              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+            case 2:
+              unscaled = new BigInteger("3");
+              return Decimal.apply(new BigDecimal(unscaled, type.scale()));
+          }
+
+        default:
+          throw new IllegalArgumentException(
+              "Cannot generate random value for unknown type: " + primitive);
+      }
+    }
+  }
+
+  private static class DictionaryFallbackToPlainEncodingDataGenerator extends 
RandomDataGenerator {
+    private final long numValues;
+    private final float fraction;
+    private int current;
+
+    private DictionaryFallbackToPlainEncodingDataGenerator(Schema schema, long 
seed, int numRecords, float fraction) {

Review comment:
       I think this would be easier to understand if it were a combination of 
the normal generator and the dictionary-encoded generator: generate 
dictionary-encodable values for a fixed number of records, then fall back to 
the normal random values. Then you would only need the two existing ways to 
generate primitives. Something like this, where 
`generateDictionaryEncodablePrimitive()` is what I pasted for 
`DictionaryEncodedDataGenerator` above:
   
   ```java
     private static class DictionaryEncodedDataGenerator extends 
RandomDataGenerator {
       private DictionaryEncodedDataGenerator(Schema schema, long seed) {
         super(schema, seed);
       }
   
       @Override
       protected Object randomValue(Type.PrimitiveType primitive, Random 
random) {
         return generateDictionaryEncodablePrimitive(primitive, random);
       }
     }
   
     private static class FallbackDataGenerator extends RandomDataGenerator {
       private final long dictionaryEncodedRows;
       private long rowCount = 0;
   
       private FallbackDataGenerator(Schema schema, long seed, int 
numDictionaryEncoded) {
         super(schema, seed);
         this.dictionaryEncodedRows = numDictionaryEncoded;
       }
   
       @Override
       protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
         this.rowCount += 1;
         if (rowCount > dictionaryEncodedRows) {
           return generatePrimitive(primitive, rand);
         } else {
           return generateDictionaryEncodablePrimitive(primitive, rand);
         }
       }
     }
   ```
   
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to