github-advanced-security[bot] commented on code in PR #18944:
URL: https://github.com/apache/druid/pull/18944#discussion_r2722465584
########## indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopNullValueIngestionTest.java: ##########
@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.indexer;
+
+import com.fasterxml.jackson.databind.InjectableValues;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.jsontype.NamedType;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.io.FileUtils;
+import org.apache.druid.data.input.InputRow;
+import org.apache.druid.data.input.impl.DimensionsSpec;
+import org.apache.druid.data.input.impl.JSONParseSpec;
+import org.apache.druid.data.input.impl.StringInputRowParser;
+import org.apache.druid.data.input.impl.TimestampSpec;
+import org.apache.druid.indexer.granularity.UniformGranularitySpec;
+import org.apache.druid.indexer.hadoop.DatasourceRecordReader;
+import org.apache.druid.jackson.DefaultObjectMapper;
+import org.apache.druid.java.util.common.Intervals;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.granularity.Granularities;
+import org.apache.druid.query.aggregation.CountAggregatorFactory;
+import org.apache.druid.segment.IndexIO;
+import org.apache.druid.segment.QueryableIndex;
+import org.apache.druid.segment.QueryableIndexCursorFactory;
+import org.apache.druid.segment.indexing.DataSchema;
+import org.apache.druid.segment.loading.LocalDataSegmentPuller;
+import org.apache.druid.segment.realtime.WindowedCursorFactory;
+import org.apache.druid.segment.transform.TransformSpec;
+import org.apache.druid.timeline.DataSegment.PruneSpecsHolder;
+import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec;
+import org.apache.druid.timeline.partition.HashPartitionFunction;
+import org.joda.time.Interval;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * End-to-end test for null value handling in Hadoop batch ingestion.
+ * Tests that null values in multi-value dimensions are preserved through
+ * the ingestion pipeline (serde, indexing, segment creation).
+ */
+public class HadoopNullValueIngestionTest
+{
+  private static final ObjectMapper MAPPER;
+  private static final IndexIO INDEX_IO;
+  private static final Interval INTERVAL = Intervals.of("2024-01-01T00:00:00Z/P1D");
+
+  static {
+    MAPPER = new DefaultObjectMapper();
+    MAPPER.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
+    InjectableValues inject = new InjectableValues.Std()
+        .addValue(ObjectMapper.class, MAPPER)
+        .addValue(PruneSpecsHolder.class, PruneSpecsHolder.DEFAULT);
+    MAPPER.setInjectableValues(inject);
+    INDEX_IO = HadoopDruidIndexerConfig.INDEX_IO;
+  }
+
+  @Rule
+  public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+  /**
+   * Tests that null values in multi-value string dimensions are preserved.
+   * This tests the fix for the issue where null values were being serialized
+   * as the string "null" instead of being preserved as actual null.
+   */
+  @Test
+  public void testMultiValueDimensionWithNullValues() throws Exception
+  {
+    File tmpDir = temporaryFolder.newFolder();
+    File dataFile = new File(tmpDir, "data.json");
+
+    // Create test data with null values in multi-value dimension
+    // Using unique dim values to identify rows
+    List<String> data = ImmutableList.of(
+        "{\"ts\":\"2024-01-01T00:00:00Z\", \"dim\": \"mixed_null\", \"tags\": [\"a\", null, \"b\"]}",
+        "{\"ts\":\"2024-01-01T01:00:00Z\", \"dim\": \"only_null\", \"tags\": [null]}",
+        "{\"ts\":\"2024-01-01T02:00:00Z\", \"dim\": \"no_null\", \"tags\": [\"x\", \"y\"]}"
+    );
+    FileUtils.writeLines(dataFile, data);
+
+    HadoopDruidIndexerConfig config = makeConfig(dataFile, tmpDir, ImmutableList.of("dim", "tags"));
+
+    // Run ingestion
+    IndexGeneratorJob job = new IndexGeneratorJob(config);
+    Assert.assertTrue("Ingestion job should succeed", JobHelper.runJobs(ImmutableList.of(job)));
+
+    List<DataSegmentAndIndexZipFilePath> dataSegmentAndIndexZipFilePaths =
+        IndexGeneratorJob.getPublishedSegmentAndIndexZipFilePaths(config);
+    JobHelper.renameIndexFilesForSegments(config.getSchema(), dataSegmentAndIndexZipFilePaths);
+
+    // Load and verify the segment
+    File segmentFolder = new File(
+        StringUtils.format(
+            "%s/%s/%s_%s/%s/0",
+            config.getSchema().getIOConfig().getSegmentOutputPath(),
+            config.getSchema().getDataSchema().getDataSource(),
+            INTERVAL.getStart().toString(),
+            INTERVAL.getEnd().toString(),
+            config.getSchema().getTuningConfig().getVersion()
+        )
+    );
+    Assert.assertTrue("Segment folder should exist", segmentFolder.exists());
+
+    File indexZip = new File(segmentFolder, "index.zip");
+    Assert.assertTrue("index.zip should exist", indexZip.exists());
+
+    File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
+    new LocalDataSegmentPuller().getSegmentFiles(indexZip, tmpUnzippedSegmentDir);
+
+    QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
+
+    // Read rows from the segment
+    DatasourceRecordReader.SegmentReader segmentReader = new DatasourceRecordReader.SegmentReader(
+        ImmutableList.of(new WindowedCursorFactory(new QueryableIndexCursorFactory(index), INTERVAL)),
+        TransformSpec.NONE,
+        ImmutableList.of("dim", "tags"),
+        ImmutableList.of("count"),
+        null
+    );
+
+    List<InputRow> rows = new ArrayList<>();
+    while (segmentReader.hasMore()) {
+      rows.add(segmentReader.nextRow());
+    }
+    segmentReader.close();
+
+    Assert.assertEquals("Should have 3 rows", 3, rows.size());
+
+    // Find rows by their dim values
+    InputRow mixedNullRow = null;
+    InputRow onlyNullRow = null;
+    InputRow noNullRow = null;
+
+    for (InputRow row : rows) {
+      List<String> dimVals = row.getDimension("dim");
+      if (dimVals != null && !dimVals.isEmpty()) {
+        String dimVal = dimVals.get(0);
+        if ("mixed_null".equals(dimVal)) {
+          mixedNullRow = row;
+        } else if ("only_null".equals(dimVal)) {
+          onlyNullRow = row;
+        } else if ("no_null".equals(dimVal)) {
+          noNullRow = row;
+        }
+      }
+    }
+
+    Assert.assertNotNull("Should find mixed_null row", mixedNullRow);
+    Assert.assertNotNull("Should find only_null row", onlyNullRow);
+    Assert.assertNotNull("Should find no_null row", noNullRow);
+
+    // Verify mixed_null row: tags = ["a", null, "b"]
+    // Note: Multi-value dimensions are sorted when stored in segments.
+    // Null sorts first, so ["a", null, "b"] becomes [null, "a", "b"]
+    List<String> mixedTags = mixedNullRow.getDimension("tags");
+    Assert.assertEquals("mixed_null row tags should have 3 elements", 3, mixedTags.size());
+    // The key assertion: null is preserved as actual null, not converted to string "null"
+    Assert.assertTrue(
+        "Should contain a null value (not string 'null')",
+        mixedTags.contains(null)
+    );
+    Assert.assertFalse(
+        "Should NOT contain string 'null' - that would indicate incorrect serialization",
+        mixedTags.contains("null")
+    );
+    // Verify all expected values are present (accounting for sort order)
+    Assert.assertTrue("Should contain 'a'", mixedTags.contains("a"));
+    Assert.assertTrue("Should contain 'b'", mixedTags.contains("b"));
+    // Verify sorted order: null sorts first
+    Assert.assertNull("First element should be null (null sorts first)", mixedTags.get(0));
+    Assert.assertEquals("Second element should be 'a'", "a", mixedTags.get(1));
+    Assert.assertEquals("Third element should be 'b'", "b", mixedTags.get(2));
+
+    // Verify only_null row: tags = [null]
+    // When a dimension has only null values, Druid returns an empty list from getDimension()
+    // This is expected Druid behavior - the key is that it doesn't return ["null"] (string)
+    List<String> onlyNullTags = onlyNullRow.getDimension("tags");
+    Assert.assertTrue(
+        "only_null row tags should be empty (Druid returns empty list for all-null dimension)",
+        onlyNullTags.isEmpty()
+    );
+    Assert.assertFalse(
+        "Should NOT contain string 'null' - that would indicate incorrect serialization",
+        onlyNullTags.contains("null")
+    );
+
+    // Verify no_null row: tags = ["x", "y"]
+    List<String> noNullTags = noNullRow.getDimension("tags");
+    Assert.assertEquals(Arrays.asList("x", "y"), noNullTags);
+    Assert.assertFalse("Should not contain null", noNullTags.contains(null));
+  }
+
+  /**
+   * Tests that a completely null dimension value is handled correctly.
+   */
+  @Test
+  public void testNullDimensionValue() throws Exception
+  {
+    File tmpDir = temporaryFolder.newFolder();
+    File dataFile = new File(tmpDir, "data.json");
+
+    // Create test data where "tags" dimension is null (not present or explicitly null)
+    List<String> data = ImmutableList.of(
+        "{\"ts\":\"2024-01-01T00:00:00Z\", \"dim\": \"row1\", \"tags\": null}",
+        "{\"ts\":\"2024-01-01T01:00:00Z\", \"dim\": \"row2\"}",
+        "{\"ts\":\"2024-01-01T02:00:00Z\", \"dim\": \"row3\", \"tags\": \"value\"}"
+    );
+    FileUtils.writeLines(dataFile, data);
+
+    HadoopDruidIndexerConfig config = makeConfig(dataFile, tmpDir, ImmutableList.of("dim", "tags"));
+
+    // Run ingestion
+    IndexGeneratorJob job = new IndexGeneratorJob(config);
+    Assert.assertTrue("Ingestion job should succeed", JobHelper.runJobs(ImmutableList.of(job)));
+
+    List<DataSegmentAndIndexZipFilePath> dataSegmentAndIndexZipFilePaths =
+        IndexGeneratorJob.getPublishedSegmentAndIndexZipFilePaths(config);
+    JobHelper.renameIndexFilesForSegments(config.getSchema(), dataSegmentAndIndexZipFilePaths);
+
+    // Load and verify the segment
+    File segmentFolder = new File(
+        StringUtils.format(
+            "%s/%s/%s_%s/%s/0",
+            config.getSchema().getIOConfig().getSegmentOutputPath(),
+            config.getSchema().getDataSchema().getDataSource(),
+            INTERVAL.getStart().toString(),
+            INTERVAL.getEnd().toString(),
+            config.getSchema().getTuningConfig().getVersion()
+        )
+    );
+
+    File indexZip = new File(segmentFolder, "index.zip");
+    Assert.assertTrue("index.zip should exist", indexZip.exists());
+
+    File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
+    new LocalDataSegmentPuller().getSegmentFiles(indexZip, tmpUnzippedSegmentDir);
+
+    QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
+
+    // Read rows from the segment
+    DatasourceRecordReader.SegmentReader segmentReader = new DatasourceRecordReader.SegmentReader(
+        ImmutableList.of(new WindowedCursorFactory(new QueryableIndexCursorFactory(index), INTERVAL)),
+        TransformSpec.NONE,
+        ImmutableList.of("dim", "tags"),
+        ImmutableList.of("count"),
+        null
+    );
+
+    List<InputRow> rows = new ArrayList<>();
+    while (segmentReader.hasMore()) {
+      rows.add(segmentReader.nextRow());
+    }
+    segmentReader.close();
+
+    Assert.assertEquals("Should have 3 rows", 3, rows.size());
+
+    // Sort rows by dim to ensure consistent ordering
+    rows.sort(Comparator.comparing(a -> String.valueOf(a.getDimension("dim").get(0))));
+
+    // Row 1: tags = null
+    InputRow row1 = rows.get(0);
+    Assert.assertEquals("row1", row1.getDimension("dim").get(0));
+    List<String> tags1 = row1.getDimension("tags");
+    // Null dimension returns empty list or list with null
+    Assert.assertTrue(
+        "Null dimension should return empty or single-null list",
+        tags1.isEmpty() || (tags1.size() == 1 && tags1.get(0) == null)
+    );
+
+    // Row 2: tags not present (missing)
+    InputRow row2 = rows.get(1);
+    Assert.assertEquals("row2", row2.getDimension("dim").get(0));
+    List<String> tags2 = row2.getDimension("tags");
+    Assert.assertTrue(
+        "Missing dimension should return empty or single-null list",
+        tags2.isEmpty() || (tags2.size() == 1 && tags2.get(0) == null)
+    );
+
+    // Row 3: tags = "value"
+    InputRow row3 = rows.get(2);
+    Assert.assertEquals("row3", row3.getDimension("dim").get(0));
+    Assert.assertEquals(Collections.singletonList("value"), row3.getDimension("tags"));
+  }
+
+  private HadoopDruidIndexerConfig makeConfig(File dataFile, File tmpDir, List<String> dimensions) throws Exception
+  {
+    File outputDir = new File(tmpDir, "output");
+
+    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(
+        new HadoopIngestionSpec(
+            DataSchema.builder()
+                      .withDataSource("test_null_values")
+                      .withParserMap(MAPPER.convertValue(
+                          new StringInputRowParser(
+                              new JSONParseSpec(
+                                  new TimestampSpec("ts", "iso", null),
+                                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(dimensions)),
+                                  null,
+                                  null,
+                                  null
+                              ),
+                              null
+                          ),
+                          Map.class
+                      ))

Review Comment:
   ## Deprecated method or constructor invocation
   Invoking [Builder.withParserMap](1) should be avoided because it has been deprecated.
   [Show more details](https://github.com/apache/druid/security/code-scanning/10785)
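   One way both deprecation alerts on this builder chain (this one and the `withObjectMapper` alert further down) could be resolved is to pass the specs to the builder directly instead of serializing a `StringInputRowParser` into a parser map. The following is only a sketch: it assumes `DataSchema.Builder` exposes `withTimestamp(TimestampSpec)` and `withDimensions(DimensionsSpec)` on this branch, reuses the test's `dimensions` and `INTERVAL`, and assumes the Hadoop indexing path in this test does not strictly require the legacy parser map.

   ```java
   // Sketch, not the PR's code: assumes DataSchema.Builder#withTimestamp and
   // #withDimensions are available here, and that the Hadoop indexer can run
   // this test without the legacy parser map.
   DataSchema schema = DataSchema.builder()
       .withDataSource("test_null_values")
       .withTimestamp(new TimestampSpec("ts", "iso", null))                              // was inside JSONParseSpec
       .withDimensions(new DimensionsSpec(DimensionsSpec.getDefaultSchemas(dimensions))) // was inside JSONParseSpec
       .withAggregators(new CountAggregatorFactory("count"))
       .withGranularity(new UniformGranularitySpec(
           Granularities.DAY,
           Granularities.NONE,
           ImmutableList.of(INTERVAL)
       ))
       .build(); // no withParserMap(...) and no withObjectMapper(...)
   ```

   If the Hadoop indexer still needs the parser map to build its input rows, keeping the deprecated calls and dismissing the alerts may be the pragmatic choice; a short comment in `makeConfig` explaining that trade-off would help future readers.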
########## indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopNullValueIngestionTest.java: ##########
+    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(
+        new HadoopIngestionSpec(
+            DataSchema.builder()
+                      .withDataSource("test_null_values")
+                      .withParserMap(MAPPER.convertValue(
+                          new StringInputRowParser(
+                              new JSONParseSpec(
+                                  new TimestampSpec("ts", "iso", null),
+                                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(dimensions)),
+                                  null,
+                                  null,
+                                  null
+                              ),
+                              null
+                          ),
+                          Map.class
+                      ))
+                      .withAggregators(new CountAggregatorFactory("count"))
+                      .withGranularity(new UniformGranularitySpec(
+                          Granularities.DAY,
+                          Granularities.NONE,
+                          ImmutableList.of(INTERVAL)
+                      ))
+                      .withObjectMapper(MAPPER)

Review Comment:
   ## Deprecated method or constructor invocation
   Invoking [Builder.withObjectMapper](1) should be avoided because it has been deprecated.
   [Show more details](https://github.com/apache/druid/security/code-scanning/10784)

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
