This is an automated email from the ASF dual-hosted git repository.

lqc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 3da05fbf7f Add `optimizeDictionaryType` config to automatically choose 
dictionary type (#14444)
3da05fbf7f is described below

commit 3da05fbf7fd7bbb0e6b8149fd60c588a8c757e7a
Author: Christopher Peck <[email protected]>
AuthorDate: Thu Nov 21 13:01:42 2024 -0800

    Add `optimizeDictionaryType` config to automatically choose dictionary type 
(#14444)
    
    * add optimizeDictionaryType
    
    * make backwards compat
---
 .../creator/impl/SegmentColumnarIndexCreator.java  |  8 +++
 .../impl/SegmentIndexCreationDriverImpl.java       |  5 +-
 .../index/dictionary/DictionaryIndexType.java      | 15 ++++
 .../segment/index/loader/ForwardIndexHandler.java  |  4 +-
 .../converter/RealtimeSegmentConverterTest.java    | 81 ++++++++++++++++++++++
 .../spi/creator/SegmentGeneratorConfig.java        | 10 +++
 .../segment/spi/index/DictionaryIndexConfig.java   |  7 ++
 .../pinot/spi/config/table/IndexingConfig.java     | 14 ++++
 .../spi/utils/builder/TableConfigBuilder.java      | 21 ++++++
 9 files changed, 161 insertions(+), 4 deletions(-)

diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
index 69e81303ea..7f6c3a15f5 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -182,6 +182,14 @@ public class SegmentColumnarIndexCreator implements 
SegmentCreator {
           LOGGER.info("Creating dictionary index in column {}.{} even when it 
is disabled in config",
               segmentCreationSpec.getTableName(), columnName);
         }
+
+        // override dictionary type if configured to do so
+        if (_config.isOptimizeDictionaryType()) {
+          LOGGER.info("Overriding dictionary type for column: {} using 
var-length dictionary: {}", columnName,
+              columnIndexCreationInfo.isUseVarLengthDictionary());
+          dictConfig = new DictionaryIndexConfig(dictConfig, 
columnIndexCreationInfo.isUseVarLengthDictionary());
+        }
+
         SegmentDictionaryCreator creator =
             new 
DictionaryIndexPlugin().getIndexType().createIndexCreator(context, dictConfig);
 
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
index 6e3a0fb266..ae7c0e71a2 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
@@ -553,9 +553,8 @@ public class SegmentIndexCreationDriverImpl implements 
SegmentIndexCreationDrive
       ColumnStatistics columnProfile = 
_segmentStats.getColumnProfileFor(column);
       DictionaryIndexConfig dictionaryIndexConfig = 
indexConfigsMap.get(column).getConfig(StandardIndexes.dictionary());
       boolean createDictionary = dictionaryIndexConfig.isDisabled();
-      boolean useVarLengthDictionary =
-          dictionaryIndexConfig.getUseVarLengthDictionary() || 
DictionaryIndexType.shouldUseVarLengthDictionary(
-              storedType, columnProfile);
+      boolean useVarLengthDictionary = 
dictionaryIndexConfig.getUseVarLengthDictionary()
+          || 
DictionaryIndexType.optimizeTypeShouldUseVarLengthDictionary(storedType, 
columnProfile);
       Object defaultNullValue = fieldSpec.getDefaultNullValue();
       if (storedType == DataType.BYTES) {
         defaultNullValue = new ByteArray((byte[]) defaultNullValue);
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
index 179444bab0..37e0f4f436 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
@@ -176,6 +176,21 @@ public class DictionaryIndexType
     return false;
   }
 
+  /**
+   * Similar to shouldUseVarLengthDictionary, but also checks STRING type. 
Separated due to backwards compatibility
+   * concerns.
+   */
+  public static boolean 
optimizeTypeShouldUseVarLengthDictionary(FieldSpec.DataType columnStoredType,
+      ColumnStatistics profile) {
+    if (columnStoredType == FieldSpec.DataType.BYTES || columnStoredType == 
FieldSpec.DataType.BIG_DECIMAL
+        || columnStoredType == FieldSpec.DataType.STRING) {
+      return !profile.isFixedLength();
+    }
+
+    return false;
+  }
+
+
   /**
    * This function evaluates whether to override dictionary (i.e use 
noDictionary)
    * for a column even when its explicitly configured. This evaluation is for 
both dimension and metric
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
index 84d5df6dbc..8d22a74ebe 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
@@ -883,8 +883,10 @@ public class ForwardIndexHandler extends BaseIndexHandler {
 
       DictionaryIndexConfig dictConf = 
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.dictionary());
 
+      boolean optimizeDictionaryType = 
_tableConfig.getIndexingConfig().isOptimizeDictionaryType();
       boolean useVarLength = dictConf.getUseVarLengthDictionary() || 
DictionaryIndexType.shouldUseVarLengthDictionary(
-          reader.getStoredType(), statsCollector);
+          reader.getStoredType(), statsCollector) || (optimizeDictionaryType
+          && 
DictionaryIndexType.optimizeTypeShouldUseVarLengthDictionary(reader.getStoredType(),
 statsCollector));
       SegmentDictionaryCreator dictionaryCreator = new 
SegmentDictionaryCreator(existingColMetadata.getFieldSpec(),
           _segmentDirectory.getSegmentMetadata().getIndexDir(), useVarLength);
 
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
index d0f9ef90d2..06581baabd 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
@@ -452,6 +452,87 @@ public class RealtimeSegmentConverterTest {
     segmentFile.destroy();
   }
 
+  @DataProvider
+  public static Object[][] optimizeDictionaryTypeParams() {
+    // Format: {optimizeDictionaryType, expectedCRC}, crc is used here to 
check the correct dictionary type was used
+    return new Object[][]{
+        {true, "2653526366"},
+        {false, "2948830084"},
+    };
+  }
+
+  @Test(dataProvider = "optimizeDictionaryTypeParams")
+  public void testOptimizeDictionaryTypeConversion(Object[] params)
+      throws Exception {
+    File tmpDir = new File(TMP_DIR, "tmp_" + System.currentTimeMillis());
+    TableConfig tableConfig =
+        new TableConfigBuilder(TableType.REALTIME).setTableName("testTable")
+            .setTimeColumnName(DATE_TIME_COLUMN)
+            .setColumnMajorSegmentBuilderEnabled(true)
+            .setOptimizeDictionaryType((boolean) params[0])
+            .build();
+    Schema schema = new Schema.SchemaBuilder()
+        .addSingleValueDimension(STRING_COLUMN1, FieldSpec.DataType.STRING)
+        .addSingleValueDimension(STRING_COLUMN2, FieldSpec.DataType.STRING)
+        .addDateTime(DATE_TIME_COLUMN, FieldSpec.DataType.LONG, 
"1:MILLISECONDS:EPOCH", "1:MILLISECONDS")
+        .build();
+
+    String tableNameWithType = tableConfig.getTableName();
+    String segmentName = "testTable__0__0__123456";
+
+    RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
+        new 
RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
+            
.setStreamName(tableNameWithType).setSchema(schema).setTimeColumnName(DATE_TIME_COLUMN).setCapacity(1000)
+            .setIndex(Sets.newHashSet(Sets.newHashSet(STRING_COLUMN1, 
STRING_COLUMN2)), StandardIndexes.dictionary(),
+                DictionaryIndexConfig.DEFAULT)
+            
.setSegmentZKMetadata(getSegmentZKMetadata(segmentName)).setOffHeap(true)
+            .setMemoryManager(new DirectMemoryManager(segmentName))
+            .setStatsHistory(RealtimeSegmentStatsHistory.deserialzeFrom(new 
File(tmpDir, "stats")))
+            .setConsumerDir(new File(tmpDir, "consumerDir").getAbsolutePath());
+
+    // create mutable segment impl
+    MutableSegmentImpl mutableSegmentImpl = new 
MutableSegmentImpl(realtimeSegmentConfigBuilder.build(), null);
+    List<GenericRow> rows = new ArrayList<>();
+    for (int i = 0; i < 10; i++) {
+      GenericRow row = new GenericRow();
+      row.putValue(STRING_COLUMN1, "string" + i * 10); // var length
+      row.putValue(STRING_COLUMN2, "string" + i % 10); // fixed length
+      row.putValue(DATE_TIME_COLUMN, 1697814309L + i);
+      rows.add(row);
+    }
+    for (GenericRow row : rows) {
+      mutableSegmentImpl.index(row, null);
+    }
+
+    File outputDir = new File(tmpDir, "outputDir");
+    SegmentZKPropsConfig segmentZKPropsConfig = new SegmentZKPropsConfig();
+    segmentZKPropsConfig.setStartOffset("1");
+    segmentZKPropsConfig.setEndOffset("100");
+    RealtimeSegmentConverter converter =
+        new RealtimeSegmentConverter(mutableSegmentImpl, segmentZKPropsConfig, 
outputDir.getAbsolutePath(), schema,
+            tableNameWithType, tableConfig, segmentName, false);
+    converter.build(SegmentVersion.v3, null);
+
+
+    File indexDir = new File(outputDir, segmentName);
+    SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(indexDir);
+    assertEquals(segmentMetadata.getCrc(), params[1]);
+
+    assertEquals(segmentMetadata.getVersion(), SegmentVersion.v3);
+    assertEquals(segmentMetadata.getTotalDocs(), rows.size());
+    assertEquals(segmentMetadata.getTimeColumn(), DATE_TIME_COLUMN);
+    assertEquals(segmentMetadata.getTimeUnit(), TimeUnit.MILLISECONDS);
+
+    long expectedStartTime = (long) rows.get(0).getValue(DATE_TIME_COLUMN);
+    assertEquals(segmentMetadata.getStartTime(), expectedStartTime);
+    long expectedEndTime = (long) rows.get(rows.size() - 
1).getValue(DATE_TIME_COLUMN);
+    assertEquals(segmentMetadata.getEndTime(), expectedEndTime);
+
+    
assertTrue(segmentMetadata.getAllColumns().containsAll(schema.getColumnNames()));
+    assertEquals(segmentMetadata.getStartOffset(), "1");
+    assertEquals(segmentMetadata.getEndOffset(), "100");
+  }
+
   @DataProvider
   public static Object[][] reuseParams() {
     List<Boolean> enabledColumnMajorSegmentBuildParams = Arrays.asList(false, 
true);
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
index 5affe8ac4f..fff8782634 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
@@ -112,6 +112,7 @@ public class SegmentGeneratorConfig implements Serializable 
{
   private boolean _failOnEmptySegment = false;
   private boolean _optimizeDictionary = false;
   private boolean _optimizeDictionaryForMetrics = false;
+  private boolean _optimizeDictionaryType = false;
   private double _noDictionarySizeRatioThreshold = 
IndexingConfig.DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD;
   private Double _noDictionaryCardinalityRatioThreshold;
   private boolean _realtimeConversion = false;
@@ -168,6 +169,7 @@ public class SegmentGeneratorConfig implements Serializable 
{
     _defaultNullHandlingEnabled = indexingConfig.isNullHandlingEnabled();
     _optimizeDictionary = indexingConfig.isOptimizeDictionary();
     _optimizeDictionaryForMetrics = 
indexingConfig.isOptimizeDictionaryForMetrics();
+    _optimizeDictionaryType = indexingConfig.isOptimizeDictionaryType();
     _noDictionarySizeRatioThreshold = 
indexingConfig.getNoDictionarySizeRatioThreshold();
     _noDictionaryCardinalityRatioThreshold = 
indexingConfig.getNoDictionaryCardinalityRatioThreshold();
 
@@ -655,6 +657,14 @@ public class SegmentGeneratorConfig implements 
Serializable {
     _optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
   }
 
+  public boolean isOptimizeDictionaryType() {
+    return _optimizeDictionaryType;
+  }
+
+  public void setOptimizeDictionaryType(boolean optimizeDictionaryType) {
+    _optimizeDictionaryType = optimizeDictionaryType;
+  }
+
   public double getNoDictionarySizeRatioThreshold() {
     return _noDictionarySizeRatioThreshold;
   }
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
index 3b27f17026..df3909d104 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
@@ -48,6 +48,13 @@ public class DictionaryIndexConfig extends IndexConfig {
     this(false, onHeap, useVarLengthDictionary, intern);
   }
 
+  /**
+   * Constructor for patching an existing config, but overrides the 
useVarLengthDictionary property with the input
+   */
+  public DictionaryIndexConfig(DictionaryIndexConfig base, boolean 
useVarLengthDictionary) {
+    this(base.isEnabled(), base.isOnHeap(), useVarLengthDictionary, 
base.getIntern());
+  }
+
   @JsonCreator
   public DictionaryIndexConfig(@JsonProperty("disabled") Boolean disabled, 
@JsonProperty("onHeap") Boolean onHeap,
       @JsonProperty("useVarLengthDictionary") @Nullable Boolean 
useVarLengthDictionary,
diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java 
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
index 4b86dfa9b3..5beb126e0d 100644
--- 
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
+++ 
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
@@ -79,6 +79,12 @@ public class IndexingConfig extends BaseJsonConfig {
    */
   private boolean _optimizeDictionaryForMetrics;
 
+  /**
+   * Optimize the dictionary type for var width columns, if values are all the 
same length then use a fixed-width
+   * dictionary. Else, use a var-width dictionary.
+   */
+  private boolean _optimizeDictionaryType;
+
   private double _noDictionarySizeRatioThreshold = 
DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD;
 
   // Used in conjunction with `optimizeDictionary`, if cardinality / total 
docs is less than the threshold,
@@ -371,6 +377,14 @@ public class IndexingConfig extends BaseJsonConfig {
     _optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
   }
 
+  public boolean isOptimizeDictionaryType() {
+    return _optimizeDictionaryType;
+  }
+
+  public void setOptimizeDictionaryType(boolean optimizeDictionaryType) {
+    _optimizeDictionaryType = optimizeDictionaryType;
+  }
+
   public double getNoDictionarySizeRatioThreshold() {
     return _noDictionarySizeRatioThreshold;
   }
diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
index dc8fb2ae8a..5e9d915cfc 100644
--- 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
+++ 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
@@ -105,10 +105,13 @@ public class TableConfigBuilder {
   private List<StarTreeIndexConfig> _starTreeIndexConfigs;
   private List<String> _jsonIndexColumns;
   private boolean _aggregateMetrics;
+  private boolean _optimizeDictionary;
   private boolean _optimizeDictionaryForMetrics;
   // This threshold determines if dictionary should be enabled or not for a 
metric column and is relevant
   // only when _optimizeDictionaryForMetrics is set to true.
+  private boolean _optimizeDictionaryType;
   private double _noDictionarySizeRatioThreshold;
+  private double _noDictionaryCardinalityRatioThreshold;
 
   private TableCustomConfig _customConfig;
   private QuotaConfig _quotaConfig;
@@ -259,16 +262,31 @@ public class TableConfigBuilder {
     return this;
   }
 
+  public TableConfigBuilder setOptimizeDictionary(boolean optimizeDictionary) {
+    _optimizeDictionary = optimizeDictionary;
+    return this;
+  }
+
   public TableConfigBuilder setOptimizeDictionaryForMetrics(boolean 
optimizeDictionaryForMetrics) {
     _optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
     return this;
   }
 
+  public TableConfigBuilder setOptimizeDictionaryType(boolean 
optimizeDictionaryType) {
+    _optimizeDictionaryType = optimizeDictionaryType;
+    return this;
+  }
+
   public TableConfigBuilder setNoDictionarySizeRatioThreshold(double 
noDictionarySizeRatioThreshold) {
     _noDictionarySizeRatioThreshold = noDictionarySizeRatioThreshold;
     return this;
   }
 
+  public TableConfigBuilder setNoDictionaryCardinalityRatioThreshold(double 
noDictionaryCardinalityRatioThreshold) {
+    _noDictionaryCardinalityRatioThreshold = 
noDictionaryCardinalityRatioThreshold;
+    return this;
+  }
+
   public TableConfigBuilder setCreateInvertedIndexDuringSegmentGeneration(
       boolean createInvertedIndexDuringSegmentGeneration) {
     _createInvertedIndexDuringSegmentGeneration = 
createInvertedIndexDuringSegmentGeneration;
@@ -464,8 +482,11 @@ public class TableConfigBuilder {
     indexingConfig.setStarTreeIndexConfigs(_starTreeIndexConfigs);
     indexingConfig.setJsonIndexColumns(_jsonIndexColumns);
     indexingConfig.setAggregateMetrics(_aggregateMetrics);
+    indexingConfig.setOptimizeDictionary(_optimizeDictionary);
     
indexingConfig.setOptimizeDictionaryForMetrics(_optimizeDictionaryForMetrics);
+    indexingConfig.setOptimizeDictionaryType(_optimizeDictionaryType);
     
indexingConfig.setNoDictionarySizeRatioThreshold(_noDictionarySizeRatioThreshold);
+    
indexingConfig.setNoDictionaryCardinalityRatioThreshold(_noDictionaryCardinalityRatioThreshold);
     indexingConfig.setTierOverwrites(_tierOverwrites);
 
     if (_customConfig == null) {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to