This is an automated email from the ASF dual-hosted git repository.
lqc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 3da05fbf7f Add `optimizeDictionaryType` config to automatically choose
dictionary type (#14444)
3da05fbf7f is described below
commit 3da05fbf7fd7bbb0e6b8149fd60c588a8c757e7a
Author: Christopher Peck <[email protected]>
AuthorDate: Thu Nov 21 13:01:42 2024 -0800
Add `optimizeDictionaryType` config to automatically choose dictionary type
(#14444)
* add optimizeDictionaryType
* make backwards compat
---
.../creator/impl/SegmentColumnarIndexCreator.java | 8 +++
.../impl/SegmentIndexCreationDriverImpl.java | 5 +-
.../index/dictionary/DictionaryIndexType.java | 15 ++++
.../segment/index/loader/ForwardIndexHandler.java | 4 +-
.../converter/RealtimeSegmentConverterTest.java | 81 ++++++++++++++++++++++
.../spi/creator/SegmentGeneratorConfig.java | 10 +++
.../segment/spi/index/DictionaryIndexConfig.java | 7 ++
.../pinot/spi/config/table/IndexingConfig.java | 14 ++++
.../spi/utils/builder/TableConfigBuilder.java | 21 ++++++
9 files changed, 161 insertions(+), 4 deletions(-)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
index 69e81303ea..7f6c3a15f5 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -182,6 +182,14 @@ public class SegmentColumnarIndexCreator implements
SegmentCreator {
LOGGER.info("Creating dictionary index in column {}.{} even when it
is disabled in config",
segmentCreationSpec.getTableName(), columnName);
}
+
+ // override dictionary type if configured to do so
+ if (_config.isOptimizeDictionaryType()) {
+ LOGGER.info("Overriding dictionary type for column: {} using
var-length dictionary: {}", columnName,
+ columnIndexCreationInfo.isUseVarLengthDictionary());
+ dictConfig = new DictionaryIndexConfig(dictConfig,
columnIndexCreationInfo.isUseVarLengthDictionary());
+ }
+
SegmentDictionaryCreator creator =
new
DictionaryIndexPlugin().getIndexType().createIndexCreator(context, dictConfig);
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
index 6e3a0fb266..ae7c0e71a2 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java
@@ -553,9 +553,8 @@ public class SegmentIndexCreationDriverImpl implements
SegmentIndexCreationDrive
ColumnStatistics columnProfile =
_segmentStats.getColumnProfileFor(column);
DictionaryIndexConfig dictionaryIndexConfig =
indexConfigsMap.get(column).getConfig(StandardIndexes.dictionary());
boolean createDictionary = dictionaryIndexConfig.isDisabled();
- boolean useVarLengthDictionary =
- dictionaryIndexConfig.getUseVarLengthDictionary() ||
DictionaryIndexType.shouldUseVarLengthDictionary(
- storedType, columnProfile);
+ boolean useVarLengthDictionary =
dictionaryIndexConfig.getUseVarLengthDictionary()
+ ||
DictionaryIndexType.optimizeTypeShouldUseVarLengthDictionary(storedType,
columnProfile);
Object defaultNullValue = fieldSpec.getDefaultNullValue();
if (storedType == DataType.BYTES) {
defaultNullValue = new ByteArray((byte[]) defaultNullValue);
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
index 179444bab0..37e0f4f436 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/dictionary/DictionaryIndexType.java
@@ -176,6 +176,21 @@ public class DictionaryIndexType
return false;
}
+ /**
+ * Similar to shouldUseVarLengthDictionary, but also checks STRING type.
Separated due to backwards compatibility
+ * concerns.
+ */
+ public static boolean
optimizeTypeShouldUseVarLengthDictionary(FieldSpec.DataType columnStoredType,
+ ColumnStatistics profile) {
+ if (columnStoredType == FieldSpec.DataType.BYTES || columnStoredType ==
FieldSpec.DataType.BIG_DECIMAL
+ || columnStoredType == FieldSpec.DataType.STRING) {
+ return !profile.isFixedLength();
+ }
+
+ return false;
+ }
+
+
/**
* This function evaluates whether to override dictionary (i.e use
noDictionary)
* for a column even when its explicitly configured. This evaluation is for
both dimension and metric
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
index 84d5df6dbc..8d22a74ebe 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
@@ -883,8 +883,10 @@ public class ForwardIndexHandler extends BaseIndexHandler {
DictionaryIndexConfig dictConf =
_fieldIndexConfigs.get(column).getConfig(StandardIndexes.dictionary());
+ boolean optimizeDictionaryType =
_tableConfig.getIndexingConfig().isOptimizeDictionaryType();
boolean useVarLength = dictConf.getUseVarLengthDictionary() ||
DictionaryIndexType.shouldUseVarLengthDictionary(
- reader.getStoredType(), statsCollector);
+ reader.getStoredType(), statsCollector) || (optimizeDictionaryType
+ &&
DictionaryIndexType.optimizeTypeShouldUseVarLengthDictionary(reader.getStoredType(),
statsCollector));
SegmentDictionaryCreator dictionaryCreator = new
SegmentDictionaryCreator(existingColMetadata.getFieldSpec(),
_segmentDirectory.getSegmentMetadata().getIndexDir(), useVarLength);
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
index d0f9ef90d2..06581baabd 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
@@ -452,6 +452,87 @@ public class RealtimeSegmentConverterTest {
segmentFile.destroy();
}
+ @DataProvider
+ public static Object[][] optimizeDictionaryTypeParams() {
+ // Format: {optimizeDictionaryType, expectedCRC}, crc is used here to
check the correct dictionary type was used
+ return new Object[][]{
+ {true, "2653526366"},
+ {false, "2948830084"},
+ };
+ }
+
+ @Test(dataProvider = "optimizeDictionaryTypeParams")
+ public void testOptimizeDictionaryTypeConversion(Object[] params)
+ throws Exception {
+ File tmpDir = new File(TMP_DIR, "tmp_" + System.currentTimeMillis());
+ TableConfig tableConfig =
+ new TableConfigBuilder(TableType.REALTIME).setTableName("testTable")
+ .setTimeColumnName(DATE_TIME_COLUMN)
+ .setColumnMajorSegmentBuilderEnabled(true)
+ .setOptimizeDictionaryType((boolean) params[0])
+ .build();
+ Schema schema = new Schema.SchemaBuilder()
+ .addSingleValueDimension(STRING_COLUMN1, FieldSpec.DataType.STRING)
+ .addSingleValueDimension(STRING_COLUMN2, FieldSpec.DataType.STRING)
+ .addDateTime(DATE_TIME_COLUMN, FieldSpec.DataType.LONG,
"1:MILLISECONDS:EPOCH", "1:MILLISECONDS")
+ .build();
+
+ String tableNameWithType = tableConfig.getTableName();
+ String segmentName = "testTable__0__0__123456";
+
+ RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
+ new
RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
+
.setStreamName(tableNameWithType).setSchema(schema).setTimeColumnName(DATE_TIME_COLUMN).setCapacity(1000)
+ .setIndex(Sets.newHashSet(Sets.newHashSet(STRING_COLUMN1,
STRING_COLUMN2)), StandardIndexes.dictionary(),
+ DictionaryIndexConfig.DEFAULT)
+
.setSegmentZKMetadata(getSegmentZKMetadata(segmentName)).setOffHeap(true)
+ .setMemoryManager(new DirectMemoryManager(segmentName))
+ .setStatsHistory(RealtimeSegmentStatsHistory.deserialzeFrom(new
File(tmpDir, "stats")))
+ .setConsumerDir(new File(tmpDir, "consumerDir").getAbsolutePath());
+
+ // create mutable segment impl
+ MutableSegmentImpl mutableSegmentImpl = new
MutableSegmentImpl(realtimeSegmentConfigBuilder.build(), null);
+ List<GenericRow> rows = new ArrayList<>();
+ for (int i = 0; i < 10; i++) {
+ GenericRow row = new GenericRow();
+ row.putValue(STRING_COLUMN1, "string" + i * 10); // var length
+ row.putValue(STRING_COLUMN2, "string" + i % 10); // fixed length
+ row.putValue(DATE_TIME_COLUMN, 1697814309L + i);
+ rows.add(row);
+ }
+ for (GenericRow row : rows) {
+ mutableSegmentImpl.index(row, null);
+ }
+
+ File outputDir = new File(tmpDir, "outputDir");
+ SegmentZKPropsConfig segmentZKPropsConfig = new SegmentZKPropsConfig();
+ segmentZKPropsConfig.setStartOffset("1");
+ segmentZKPropsConfig.setEndOffset("100");
+ RealtimeSegmentConverter converter =
+ new RealtimeSegmentConverter(mutableSegmentImpl, segmentZKPropsConfig,
outputDir.getAbsolutePath(), schema,
+ tableNameWithType, tableConfig, segmentName, false);
+ converter.build(SegmentVersion.v3, null);
+
+
+ File indexDir = new File(outputDir, segmentName);
+ SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(indexDir);
+ assertEquals(segmentMetadata.getCrc(), params[1]);
+
+ assertEquals(segmentMetadata.getVersion(), SegmentVersion.v3);
+ assertEquals(segmentMetadata.getTotalDocs(), rows.size());
+ assertEquals(segmentMetadata.getTimeColumn(), DATE_TIME_COLUMN);
+ assertEquals(segmentMetadata.getTimeUnit(), TimeUnit.MILLISECONDS);
+
+ long expectedStartTime = (long) rows.get(0).getValue(DATE_TIME_COLUMN);
+ assertEquals(segmentMetadata.getStartTime(), expectedStartTime);
+ long expectedEndTime = (long) rows.get(rows.size() -
1).getValue(DATE_TIME_COLUMN);
+ assertEquals(segmentMetadata.getEndTime(), expectedEndTime);
+
+
assertTrue(segmentMetadata.getAllColumns().containsAll(schema.getColumnNames()));
+ assertEquals(segmentMetadata.getStartOffset(), "1");
+ assertEquals(segmentMetadata.getEndOffset(), "100");
+ }
+
@DataProvider
public static Object[][] reuseParams() {
List<Boolean> enabledColumnMajorSegmentBuildParams = Arrays.asList(false,
true);
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
index 5affe8ac4f..fff8782634 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
@@ -112,6 +112,7 @@ public class SegmentGeneratorConfig implements Serializable
{
private boolean _failOnEmptySegment = false;
private boolean _optimizeDictionary = false;
private boolean _optimizeDictionaryForMetrics = false;
+ private boolean _optimizeDictionaryType = false;
private double _noDictionarySizeRatioThreshold =
IndexingConfig.DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD;
private Double _noDictionaryCardinalityRatioThreshold;
private boolean _realtimeConversion = false;
@@ -168,6 +169,7 @@ public class SegmentGeneratorConfig implements Serializable
{
_defaultNullHandlingEnabled = indexingConfig.isNullHandlingEnabled();
_optimizeDictionary = indexingConfig.isOptimizeDictionary();
_optimizeDictionaryForMetrics =
indexingConfig.isOptimizeDictionaryForMetrics();
+ _optimizeDictionaryType = indexingConfig.isOptimizeDictionaryType();
_noDictionarySizeRatioThreshold =
indexingConfig.getNoDictionarySizeRatioThreshold();
_noDictionaryCardinalityRatioThreshold =
indexingConfig.getNoDictionaryCardinalityRatioThreshold();
@@ -655,6 +657,14 @@ public class SegmentGeneratorConfig implements
Serializable {
_optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
}
+ public boolean isOptimizeDictionaryType() {
+ return _optimizeDictionaryType;
+ }
+
+ public void setOptimizeDictionaryType(boolean optimizeDictionaryType) {
+ _optimizeDictionaryType = optimizeDictionaryType;
+ }
+
public double getNoDictionarySizeRatioThreshold() {
return _noDictionarySizeRatioThreshold;
}
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
index 3b27f17026..df3909d104 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/DictionaryIndexConfig.java
@@ -48,6 +48,13 @@ public class DictionaryIndexConfig extends IndexConfig {
this(false, onHeap, useVarLengthDictionary, intern);
}
+ /**
+ * Constructor for patching an existing config, but overrides the
useVarLengthDictionary property with the input
+ */
+ public DictionaryIndexConfig(DictionaryIndexConfig base, boolean
useVarLengthDictionary) {
+ this(base.isEnabled(), base.isOnHeap(), useVarLengthDictionary,
base.getIntern());
+ }
+
@JsonCreator
public DictionaryIndexConfig(@JsonProperty("disabled") Boolean disabled,
@JsonProperty("onHeap") Boolean onHeap,
@JsonProperty("useVarLengthDictionary") @Nullable Boolean
useVarLengthDictionary,
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
index 4b86dfa9b3..5beb126e0d 100644
---
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
+++
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
@@ -79,6 +79,12 @@ public class IndexingConfig extends BaseJsonConfig {
*/
private boolean _optimizeDictionaryForMetrics;
+ /**
+ * Optimize the dictionary type for var width columns, if values are all the
same length then use a fixed-width
+ * dictionary. Else, use a var-width dictionary.
+ */
+ private boolean _optimizeDictionaryType;
+
private double _noDictionarySizeRatioThreshold =
DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD;
// Used in conjunction with `optimizeDictionary`, if cardinality / total
docs is less than the threshold,
@@ -371,6 +377,14 @@ public class IndexingConfig extends BaseJsonConfig {
_optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
}
+ public boolean isOptimizeDictionaryType() {
+ return _optimizeDictionaryType;
+ }
+
+ public void setOptimizeDictionaryType(boolean optimizeDictionaryType) {
+ _optimizeDictionaryType = optimizeDictionaryType;
+ }
+
public double getNoDictionarySizeRatioThreshold() {
return _noDictionarySizeRatioThreshold;
}
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
index dc8fb2ae8a..5e9d915cfc 100644
---
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
+++
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java
@@ -105,10 +105,13 @@ public class TableConfigBuilder {
private List<StarTreeIndexConfig> _starTreeIndexConfigs;
private List<String> _jsonIndexColumns;
private boolean _aggregateMetrics;
+ private boolean _optimizeDictionary;
private boolean _optimizeDictionaryForMetrics;
// This threshold determines if dictionary should be enabled or not for a
metric column and is relevant
// only when _optimizeDictionaryForMetrics is set to true.
+ private boolean _optimizeDictionaryType;
private double _noDictionarySizeRatioThreshold;
+ private double _noDictionaryCardinalityRatioThreshold;
private TableCustomConfig _customConfig;
private QuotaConfig _quotaConfig;
@@ -259,16 +262,31 @@ public class TableConfigBuilder {
return this;
}
+ public TableConfigBuilder setOptimizeDictionary(boolean optimizeDictionary) {
+ _optimizeDictionary = optimizeDictionary;
+ return this;
+ }
+
public TableConfigBuilder setOptimizeDictionaryForMetrics(boolean
optimizeDictionaryForMetrics) {
_optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
return this;
}
+ public TableConfigBuilder setOptimizeDictionaryType(boolean
optimizeDictionaryType) {
+ _optimizeDictionaryType = optimizeDictionaryType;
+ return this;
+ }
+
public TableConfigBuilder setNoDictionarySizeRatioThreshold(double
noDictionarySizeRatioThreshold) {
_noDictionarySizeRatioThreshold = noDictionarySizeRatioThreshold;
return this;
}
+ public TableConfigBuilder setNoDictionaryCardinalityRatioThreshold(double
noDictionaryCardinalityRatioThreshold) {
+ _noDictionaryCardinalityRatioThreshold =
noDictionaryCardinalityRatioThreshold;
+ return this;
+ }
+
public TableConfigBuilder setCreateInvertedIndexDuringSegmentGeneration(
boolean createInvertedIndexDuringSegmentGeneration) {
_createInvertedIndexDuringSegmentGeneration =
createInvertedIndexDuringSegmentGeneration;
@@ -464,8 +482,11 @@ public class TableConfigBuilder {
indexingConfig.setStarTreeIndexConfigs(_starTreeIndexConfigs);
indexingConfig.setJsonIndexColumns(_jsonIndexColumns);
indexingConfig.setAggregateMetrics(_aggregateMetrics);
+ indexingConfig.setOptimizeDictionary(_optimizeDictionary);
indexingConfig.setOptimizeDictionaryForMetrics(_optimizeDictionaryForMetrics);
+ indexingConfig.setOptimizeDictionaryType(_optimizeDictionaryType);
indexingConfig.setNoDictionarySizeRatioThreshold(_noDictionarySizeRatioThreshold);
+
indexingConfig.setNoDictionaryCardinalityRatioThreshold(_noDictionaryCardinalityRatioThreshold);
indexingConfig.setTierOverwrites(_tierOverwrites);
if (_customConfig == null) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]