[GitHub] [druid] kfaraz commented on a diff in pull request #12428: Fix indexMerger to respect the includeAllDimensions flag

GitBox Tue, 12 Apr 2022 04:18:24 -0700


kfaraz commented on code in PR #12428:
URL: https://github.com/apache/druid/pull/12428#discussion_r848297249



##########
core/src/test/java/org/apache/druid/data/input/impl/JsonInputFormatTest.java:
##########
@@ -72,4 +72,26 @@ public void testEquals()
               .withIgnoredFields("objectMapper")
               .verify();
   }
+
+  @Test
+  public void testUseFieldDiscovery_setKeepNullColumnsByDefault()
+  {
+    final JsonInputFormat format = new JsonInputFormat(
+        new JSONPathSpec(true, null),
+        null,
+        null
+    );
+    Assert.assertTrue(format.isKeepNullColumns());
+  }
+
+  @Test
+  public void testUseFieldDiscovery_doNotChangeKeepNullColumnsUserSets()

Review Comment:
   Nit: should we also add a test where `JsonPathSpec.useFieldDiscovery = 
false` ?



##########
indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionMultiPhaseParallelIndexingWithNullColumnTest.java:
##########
@@ -165,37 +335,105 @@ public void 
testIngestNullColumn_storeEmptyColumnsOff_shouldNotStoreEmptyColumns
 
     Set<DataSegment> segments = 
getIndexingServiceClient().getPublishedSegments(task);
     Assert.assertFalse(segments.isEmpty());
+    final List<DimensionSchema> expectedDimensions = 
DimensionsSpec.getDefaultSchemas(
+        Collections.singletonList("ts")
+    );
     for (DataSegment segment : segments) {
-      Assert.assertFalse(segment.getDimensions().contains("unknownDim"));
+      Assert.assertEquals(expectedDimensions.size(), 
segment.getDimensions().size());
+      for (int i = 0; i < expectedDimensions.size(); i++) {
+        Assert.assertEquals(expectedDimensions.get(i).getName(), 
segment.getDimensions().get(i));
+      }
     }
   }
 
   private InputSource getInputSource() throws JsonProcessingException
   {
     final ObjectMapper mapper = getObjectMapper();
-    final List<Map<String, Object>> rows = ImmutableList.of(
-        ImmutableMap.of(
-            "ts", "2022-01-01",
-            "dim1", "val1",
-            "dim2", "val11"
-        ),
-        ImmutableMap.of(
-            "ts", "2022-01-02",
-            "dim1", "val2",
-            "dim2", "val12"
-        ),
-        ImmutableMap.of(
-            "ts", "2022-01-03",
-            "dim1", "val3",
-            "dim2", "val13"
-        )
-    );
+    final List<Map<String, Object>> rows = new ArrayList<>();

Review Comment:
   Nit: this part might be more readable if we add a private method that 
returns a row given the input values i.e.
   ```
   private Map<String, Object> createTestRow(String ts, String... dims) {
      // return map with "ts" -> ts, "dim1" -> dims[0] and so on.
   }
   ```



##########
processing/src/main/java/org/apache/druid/segment/IndexMergerV9.java:
##########
@@ -1460,4 +1456,33 @@ private static <T extends TimeAndDimsPointer> T 
reorderRowPointerColumns(
       );
     }
   }
+
+  private static class DimensionMergerUtil

Review Comment:
   +1
   
   In that vein, maybe `DimensionsSpecInspector` or `DimensionStorageChecker` 
would be a better name?



##########
indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionMultiPhaseParallelIndexingWithNullColumnTest.java:
##########
@@ -165,37 +335,105 @@ public void 
testIngestNullColumn_storeEmptyColumnsOff_shouldNotStoreEmptyColumns
 
     Set<DataSegment> segments = 
getIndexingServiceClient().getPublishedSegments(task);
     Assert.assertFalse(segments.isEmpty());
+    final List<DimensionSchema> expectedDimensions = 
DimensionsSpec.getDefaultSchemas(
+        Collections.singletonList("ts")
+    );
     for (DataSegment segment : segments) {
-      Assert.assertFalse(segment.getDimensions().contains("unknownDim"));
+      Assert.assertEquals(expectedDimensions.size(), 
segment.getDimensions().size());
+      for (int i = 0; i < expectedDimensions.size(); i++) {
+        Assert.assertEquals(expectedDimensions.get(i).getName(), 
segment.getDimensions().get(i));
+      }
     }
   }
 
   private InputSource getInputSource() throws JsonProcessingException
   {
     final ObjectMapper mapper = getObjectMapper();
-    final List<Map<String, Object>> rows = ImmutableList.of(
-        ImmutableMap.of(
-            "ts", "2022-01-01",
-            "dim1", "val1",
-            "dim2", "val11"
-        ),
-        ImmutableMap.of(
-            "ts", "2022-01-02",
-            "dim1", "val2",
-            "dim2", "val12"
-        ),
-        ImmutableMap.of(
-            "ts", "2022-01-03",
-            "dim1", "val3",
-            "dim2", "val13"
-        )
-    );
+    final List<Map<String, Object>> rows = new ArrayList<>();
+    Map<String, Object> row;
+    for (int i = 0; i < 3; i++) {
+      row = new HashMap<>();
+      row.put("ts", StringUtils.format("2022-01-%02d", i + 1));
+      for (int j = 0; j < 2; j++) {
+        row.put("dim" + (j + 1), "val" + (j + 1));
+      }
+      row.put("dim3", null);
+      rows.add(row);
+    }
+    row = new HashMap<>();
+    row.put("ts", "2022-01-04");
+    row.put("dim1", null);
+    row.put("dim2", null);
+    row.put("dim3", null);
+    row.put("nested", ImmutableMap.of("k", "v"));
+    rows.add(row);
     final String data = StringUtils.format(
-        "%s\n%s\n%s\n",
+        "%s\n%s\n%s\n%s\n",
         mapper.writeValueAsString(rows.get(0)),
         mapper.writeValueAsString(rows.get(1)),
-        mapper.writeValueAsString(rows.get(2))
+        mapper.writeValueAsString(rows.get(2)),
+        mapper.writeValueAsString(rows.get(3))
     );
-    return new InlineInputSource(data);
+
+    return new SplittableInlineDataSource(ImmutableList.of(data));
+  }
+
+  /**
+   * Splittable inlineDataSource to run tests with range partitioning which 
requires the inputSource to be splittable.
+   */
+  private static final class SplittableInlineDataSource implements 
SplittableInputSource<String>
+  {
+    private final List<String> data;
+
+    @JsonCreator

Review Comment:
   Do these tests need to serialize/deserialize to run the underlying tasks?



##########
indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionMultiPhaseParallelIndexingWithNullColumnTest.java:
##########
@@ -19,37 +19,61 @@
 
 package org.apache.druid.indexing.common.task.batch.parallel;
 
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
 import org.apache.druid.data.input.InputFormat;
+import org.apache.druid.data.input.InputRowSchema;
 import org.apache.druid.data.input.InputSource;
+import org.apache.druid.data.input.InputSourceReader;
+import org.apache.druid.data.input.InputSplit;
+import org.apache.druid.data.input.SplitHintSpec;
+import org.apache.druid.data.input.impl.ByteEntity;
 import org.apache.druid.data.input.impl.DimensionSchema;
 import org.apache.druid.data.input.impl.DimensionsSpec;
 import org.apache.druid.data.input.impl.InlineInputSource;
+import org.apache.druid.data.input.impl.InputEntityIteratingReader;
 import org.apache.druid.data.input.impl.JsonInputFormat;
+import org.apache.druid.data.input.impl.SplittableInputSource;
 import org.apache.druid.data.input.impl.TimestampSpec;
 import org.apache.druid.indexer.TaskState;
+import org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec;
 import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
+import org.apache.druid.indexer.partitions.PartitionsSpec;
 import org.apache.druid.indexing.common.LockGranularity;
 import org.apache.druid.indexing.common.task.Tasks;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.StringUtils;
 import org.apache.druid.java.util.common.granularity.Granularities;
+import org.apache.druid.java.util.common.parsers.JSONPathFieldSpec;
+import org.apache.druid.java.util.common.parsers.JSONPathFieldType;
+import org.apache.druid.java.util.common.parsers.JSONPathSpec;
 import org.apache.druid.segment.indexing.DataSchema;
 import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
 import org.apache.druid.timeline.DataSegment;
 import org.joda.time.Interval;
 import org.junit.Assert;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
+import javax.annotation.Nullable;
+import java.io.File;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.stream.Stream;
 
+@RunWith(Parameterized.class)
 public class HashPartitionMultiPhaseParallelIndexingWithNullColumnTest extends 
AbstractMultiPhaseParallelIndexingTest

Review Comment:
   Nit: We should probably rename this test class now that it is a 
parameterized test and runs for `DimensionRangePartitionsSpec` too.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [druid] kfaraz commented on a diff in pull request #12428: Fix indexMerger to respect the includeAllDimensions flag

Reply via email to