[GitHub] [druid] imply-cheddar commented on a diff in pull request #13803: nested columns + arrays = array columns!

via GitHub Tue, 14 Mar 2023 00:32:45 -0700


imply-cheddar commented on code in PR #13803:
URL: https://github.com/apache/druid/pull/13803#discussion_r1135011007



##########
processing/src/main/java/org/apache/druid/data/input/impl/DelimitedInputFormat.java:
##########
@@ -41,6 +42,18 @@
 public class DelimitedInputFormat extends FlatTextInputFormat
 {
   public static final String TYPE_KEY = "tsv";
+
+  public static DelimitedInputFormat ofColumns(String... columns)
+  {
+    return new DelimitedInputFormat(
+        Arrays.asList(columns),
+        null,
+        null,
+        false,
+        false,
+        0
+    );
+  }

Review Comment:
   Location nit: does this need to be in `main` rather than `test`?



##########
processing/src/main/java/org/apache/druid/data/input/impl/MapInputRowParser.java:
##########
@@ -77,22 +76,36 @@ public static InputRow parse(InputRowSchema inputRowSchema, 
Map<String, Object>
    * 3) If isIncludeAllDimensions is not set and {@link 
DimensionsSpec#getDimensionNames()} is empty,
    *    the dimensions in the given map is returned.
    *
-   * In any case, the returned list does not include any dimensions in {@link 
DimensionsSpec#getDimensionExclusions()}.
+   * In any case, the returned list does not include any dimensions in {@link 
DimensionsSpec#getDimensionExclusions()}
+   * or {@link TimestampSpec#getTimestampColumn()}.
    */
   private static List<String> findDimensions(
+      TimestampSpec timestampSpec,
       DimensionsSpec dimensionsSpec,
       Map<String, Object> rawInputRow
   )
   {
     if (dimensionsSpec.isIncludeAllDimensions()) {
       LinkedHashSet<String> dimensions = new 
LinkedHashSet<>(dimensionsSpec.getDimensionNames());
-      dimensions.addAll(Sets.difference(rawInputRow.keySet(), 
dimensionsSpec.getDimensionExclusions()));
+      for (String field : rawInputRow.keySet()) {
+        if (timestampSpec.getTimestampColumn().equals(field) || 
dimensionsSpec.getDimensionExclusions().contains(field)) {

Review Comment:
   can we just get these two things once?



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -437,41 +471,51 @@ public Class<?> classOfObject()
       this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
     }
 
-    private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable 
Object value)
+    private StructuredDataProcessor.ProcessedLiteral<?> 
processValue(ExprEval<?> eval)
     {
-      // null value is always added to the global dictionary as id 0, so we 
can ignore them here
-      if (value != null) {
-        // why not
-        ExprEval<?> eval = ExprEval.bestEffortOf(value);
-        final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
-        switch (columnType.getType()) {
-          case LONG:
-            globalDimensionDictionary.addLongValue(eval.asLong());
-            typeSet.add(ColumnType.LONG);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asLong(),
-                StructuredDataProcessor.getLongObjectEstimateSize()
-            );
-          case DOUBLE:
-            globalDimensionDictionary.addDoubleValue(eval.asDouble());
-            typeSet.add(ColumnType.DOUBLE);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asDouble(),
-                StructuredDataProcessor.getDoubleObjectEstimateSize()
-            );
-          case STRING:
-          default:
-            final String asString = eval.asString();
-            globalDimensionDictionary.addStringValue(asString);
-            typeSet.add(ColumnType.STRING);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asString(),
-                StructuredDataProcessor.estimateStringSize(asString)
-            );
-        }
+      final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+      int sizeEstimate;
+      switch (columnType.getType()) {
+        case LONG:
+          typeSet.add(ColumnType.LONG);
+          sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+          return new StructuredDataProcessor.ProcessedLiteral<>(eval.asLong(), 
sizeEstimate);
+        case DOUBLE:
+          typeSet.add(ColumnType.DOUBLE);
+          sizeEstimate = 
globalDimensionDictionary.addDoubleValue(eval.asDouble());
+          return new 
StructuredDataProcessor.ProcessedLiteral<>(eval.asDouble(), sizeEstimate);
+        case ARRAY:
+          // skip empty arrays for now, they will always be called 'string' 
arrays, which isn't very helpful here since
+          // it will pollute the type set
+          Preconditions.checkNotNull(columnType.getElementType(), "Array 
element type must not be null");

Review Comment:
   If this were to ever happen, I think I'd want to know which field was the 
bad one.



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -411,7 +445,7 @@ public Object getObject()
         if (0 <= dimIndex && dimIndex < dims.length) {
           final StructuredData data = (StructuredData) dims[dimIndex];
           if (data != null) {
-            return data.getValue();
+            return ExprEval.bestEffortOf(data.getValue()).value();

Review Comment:
   Is this counting on coercion or something?



##########
processing/src/main/java/org/apache/druid/data/input/impl/JsonInputFormat.java:
##########
@@ -41,6 +41,14 @@ public class JsonInputFormat extends NestedInputFormat
 {
   public static final String TYPE_KEY = "json";
 
+  public static final JsonInputFormat DEFAULT = new JsonInputFormat(
+      JSONPathSpec.DEFAULT,
+      null,
+      null,
+      null,
+      null
+  );
+

Review Comment:
   Location nit: does this need to be in `main` rather than `test`?



##########
processing/src/main/java/org/apache/druid/query/metadata/SegmentAnalyzer.java:
##########
@@ -134,6 +134,11 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)
             analysis = analyzeStringColumn(capabilities, storageAdapter, 
columnName);
           }
           break;
+        case ARRAY:
+          // todo (clint): this is wack, but works for now because arrays are 
always nested complex columns...

Review Comment:
   Instead of a todo like this.  How about an if check that validates that it's 
a nested column and, if it's not a nested column, throws an Exception that is 
clearly saying that the code is currently written assuming that the arrays are 
delivered only by nested columns?



##########
processing/src/main/java/org/apache/druid/query/expression/NestedDataExpressions.java:
##########
@@ -480,11 +480,25 @@ public Expr apply(List<Expr> args)
       final StructuredDataProcessor processor = new StructuredDataProcessor()
       {
         @Override
-        public StructuredDataProcessor.ProcessedLiteral<?> 
processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+        public StructuredDataProcessor.ProcessedLiteral<?> 
processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object 
fieldValue)
         {
           // do nothing, we only want the list of fields returned by this 
processor
           return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
         }
+
+        @Nullable
+        @Override
+        public ProcessedLiteral<?> processArrayOfLiteralsField(
+            ArrayList<NestedPathPart> fieldPath,
+            @Nullable Object maybeArrayOfLiterals
+        )
+        {
+          ExprEval<?> eval = ExprEval.bestEffortOf(maybeArrayOfLiterals);
+          if (eval.type().isArray() && 
eval.type().getElementType().isPrimitive()) {
+            return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
+          }

Review Comment:
   The if condition makes me believe that we found an array and should be 
joyously celebrating, but the return value is a `NULL_LITERAL`, why is it 
correct to return a `NULL_LITERAL` and how is that different from just 
returning `null`?



##########
extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/NestedColumnParquetReaderTest.java:
##########
@@ -181,7 +181,7 @@ public void 
testNestedColumnSchemalessNestedTestFileNoNested() throws IOExceptio
     );
 
     List<InputRow> rows = readAllRows(reader);
-    Assert.assertEquals(ImmutableList.of("dim1", "metric1", "timestamp"), 
rows.get(0).getDimensions());
+    Assert.assertEquals(ImmutableList.of("dim1", "metric1"), 
rows.get(0).getDimensions());

Review Comment:
   Why the change in expectation?



##########
processing/src/main/java/org/apache/druid/query/expression/NestedDataExpressions.java:
##########
@@ -480,11 +480,25 @@ public Expr apply(List<Expr> args)
       final StructuredDataProcessor processor = new StructuredDataProcessor()
       {
         @Override
-        public StructuredDataProcessor.ProcessedLiteral<?> 
processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+        public StructuredDataProcessor.ProcessedLiteral<?> 
processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object 
fieldValue)
         {
           // do nothing, we only want the list of fields returned by this 
processor
           return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
         }
+
+        @Nullable
+        @Override
+        public ProcessedLiteral<?> processArrayOfLiteralsField(
+            ArrayList<NestedPathPart> fieldPath,
+            @Nullable Object maybeArrayOfLiterals
+        )
+        {
+          ExprEval<?> eval = ExprEval.bestEffortOf(maybeArrayOfLiterals);

Review Comment:
   Would it make sense to break `ExprEval.bestEffortOf` into a bunch of checks 
for different groups of expected types (i.e. `ExprEval.maybeLiteral()` and 
`ExprEval.maybeArray`, etc.).  Calls to `bestEffortOf` can cascade through, but 
places like this that already know some of what they expect can call the one 
that more aligns with expectations?



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -59,16 +62,43 @@ public class NestedDataColumnIndexer implements 
DimensionIndexer<StructuredData,
   protected final StructuredDataProcessor indexerProcessor = new 
StructuredDataProcessor()
   {
     @Override
-    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> 
fieldPath, Object fieldValue)
+    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> 
fieldPath, @Nullable Object fieldValue)
     {
-      final String fieldName = 
NestedPathFinder.toNormalizedJsonPath(fieldPath);
-      LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
-      if (fieldIndexer == null) {
-        estimatedFieldKeySize += 
StructuredDataProcessor.estimateStringSize(fieldName);
-        fieldIndexer = new LiteralFieldIndexer(globalDictionary);
-        fieldIndexers.put(fieldName, fieldIndexer);
+      // null value is always added to the global dictionary as id 0, so we 
can ignore them here
+      if (fieldValue != null) {
+        // why not
+        final String fieldName = 
NestedPathFinder.toNormalizedJsonPath(fieldPath);
+        ExprEval<?> eval = ExprEval.bestEffortOf(fieldValue);
+        LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
+        if (fieldIndexer == null) {
+          estimatedFieldKeySize += 
StructuredDataProcessor.estimateStringSize(fieldName);
+          fieldIndexer = new LiteralFieldIndexer(globalDictionary);
+          fieldIndexers.put(fieldName, fieldIndexer);
+        }
+        return fieldIndexer.processValue(eval);
       }
-      return fieldIndexer.processValue(fieldValue);
+      return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
+    }
+
+    @Nullable
+    @Override
+    public ProcessedLiteral<?> processArrayOfLiteralsField(
+        ArrayList<NestedPathPart> fieldPath,
+        Object maybeArrayOfLiterals
+    )
+    {
+      final ExprEval<?> maybeLiteralArray = 
ExprEval.bestEffortOf(maybeArrayOfLiterals);
+      if (maybeLiteralArray.type().isArray() && 
maybeLiteralArray.type().getElementType().isPrimitive()) {
+        final String fieldName = 
NestedPathFinder.toNormalizedJsonPath(fieldPath);
+        LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
+        if (fieldIndexer == null) {
+          estimatedFieldKeySize += 
StructuredDataProcessor.estimateStringSize(fieldName);
+          fieldIndexer = new LiteralFieldIndexer(globalDictionary);
+          fieldIndexers.put(fieldName, fieldIndexer);
+        }
+        return fieldIndexer.processValue(maybeLiteralArray);
+      }
+      return null;

Review Comment:
   This looks a lot like code in NestedDataExpressions, except this doesn't 
return the `NULL_LITERAL`.  I find myself wondering if the 
NestedDataExpressions code shouldn't look like this?



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -59,16 +62,43 @@ public class NestedDataColumnIndexer implements 
DimensionIndexer<StructuredData,
   protected final StructuredDataProcessor indexerProcessor = new 
StructuredDataProcessor()
   {
     @Override
-    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> 
fieldPath, Object fieldValue)
+    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> 
fieldPath, @Nullable Object fieldValue)
     {
-      final String fieldName = 
NestedPathFinder.toNormalizedJsonPath(fieldPath);
-      LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
-      if (fieldIndexer == null) {
-        estimatedFieldKeySize += 
StructuredDataProcessor.estimateStringSize(fieldName);
-        fieldIndexer = new LiteralFieldIndexer(globalDictionary);
-        fieldIndexers.put(fieldName, fieldIndexer);
+      // null value is always added to the global dictionary as id 0, so we 
can ignore them here
+      if (fieldValue != null) {
+        // why not

Review Comment:
   Because I don't know "why?"  (if this comment is attempting to add 
information, more words please).



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -145,6 +175,10 @@ public DimensionSelector makeDimensionSelector(
     final int dimIndex = desc.getIndex();
     final ColumnValueSelector<?> rootLiteralSelector = 
getRootLiteralValueSelector(currEntry, dimIndex);
     if (rootLiteralSelector != null) {
+      final LiteralFieldIndexer root = 
fieldIndexers.get(NestedPathFinder.JSON_PATH_ROOT);
+      if (root.getTypes().getSingleType().isArray()) {
+        throw new UnsupportedOperationException("Not supported");
+      }

Review Comment:
   I'm reading this as "if all we have are root-level entries and they are 
always arrays, then throw a UOE exception".  I'm pretty sure I'm reading it 
wrong, but wishing the error message gave me more context without me feeling 
like I need to expand lines on the review to know what this is validating.



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -437,41 +471,51 @@ public Class<?> classOfObject()
       this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
     }
 
-    private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable 
Object value)
+    private StructuredDataProcessor.ProcessedLiteral<?> 
processValue(ExprEval<?> eval)
     {
-      // null value is always added to the global dictionary as id 0, so we 
can ignore them here
-      if (value != null) {
-        // why not
-        ExprEval<?> eval = ExprEval.bestEffortOf(value);
-        final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
-        switch (columnType.getType()) {
-          case LONG:
-            globalDimensionDictionary.addLongValue(eval.asLong());
-            typeSet.add(ColumnType.LONG);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asLong(),
-                StructuredDataProcessor.getLongObjectEstimateSize()
-            );
-          case DOUBLE:
-            globalDimensionDictionary.addDoubleValue(eval.asDouble());
-            typeSet.add(ColumnType.DOUBLE);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asDouble(),
-                StructuredDataProcessor.getDoubleObjectEstimateSize()
-            );
-          case STRING:
-          default:
-            final String asString = eval.asString();
-            globalDimensionDictionary.addStringValue(asString);
-            typeSet.add(ColumnType.STRING);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asString(),
-                StructuredDataProcessor.estimateStringSize(asString)
-            );
-        }
+      final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+      int sizeEstimate;
+      switch (columnType.getType()) {
+        case LONG:
+          typeSet.add(ColumnType.LONG);
+          sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+          return new StructuredDataProcessor.ProcessedLiteral<>(eval.asLong(), 
sizeEstimate);
+        case DOUBLE:
+          typeSet.add(ColumnType.DOUBLE);
+          sizeEstimate = 
globalDimensionDictionary.addDoubleValue(eval.asDouble());
+          return new 
StructuredDataProcessor.ProcessedLiteral<>(eval.asDouble(), sizeEstimate);
+        case ARRAY:
+          // skip empty arrays for now, they will always be called 'string' 
arrays, which isn't very helpful here since
+          // it will pollute the type set

Review Comment:
   How does it do the skipping of empties?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [druid] imply-cheddar commented on a diff in pull request #13803: nested columns + arrays = array columns!

Reply via email to