This is an automated email from the ASF dual-hosted git repository. siddteotia pushed a commit to branch hotfix-0530 in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/hotfix-0530 by this push: new 54cac4a Two changes: 54cac4a is described below commit 54cac4a84c713b98fef7d2a3e77d0bac2f9fb2ae Author: Siddharth Teotia <steo...@steotia-mn1.linkedin.biz> AuthorDate: Sat May 30 14:14:38 2020 -0700 Two changes: (1) PR https://github.com/apache/incubator-pinot/pull/5256 added support for deriving num docs per chunk for var byte raw index create from column length. This was specifically done as part of supporting text blobs. For use cases that don't want this feature and are high QPS, see a negative impact since size of chunk increases (earlier value of numDocsPerChunk was hardcoded to 1000) and based on the access pattern we might end up uncompressing a bigger chunk to get values for a set of docIds. We have made this change configurable. So the default behaviour is same as old (1000 docs per chunk) (2) PR https://github.com/apache/incubator-pinot/pull/4791 added support for noDict for STRING/BYTES in consuming segments. There is a particular impact of this change on the use cases that have set noDict on their STRING dimension columns for other performance reasons and also want metricsAggregation. These use cases don't get to aggregateMetrics because the new implementation was able to honor their table config setting of noDict on STRING/BYTES. Without metrics aggregation, memory pressure increases. So to continue aggregating metrics for such cases, we will create dictionary even if the column is part of noDictionary set from table config. --- .../generator/SegmentGeneratorConfig.java | 15 ++++++++++++ .../indexsegment/mutable/MutableSegmentImpl.java | 28 ++++++++++++++++++++-- .../creator/impl/SegmentColumnarIndexCreator.java | 22 +++++++++++++++-- .../fwd/SingleValueVarByteRawIndexCreator.java | 10 +++++++- .../apache/pinot/spi/config/table/FieldConfig.java | 1 + 5 files changed, 71 insertions(+), 5 deletions(-) diff --git a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java index 59531fe..9af9c16 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/generator/SegmentGeneratorConfig.java @@ -102,6 +102,9 @@ public class SegmentGeneratorConfig { private boolean _skipTimeValueCheck = false; private boolean _nullHandlingEnabled = false; + // constructed from FieldConfig + private Map<String, Map<String, String>> _columnProperties = new HashMap<>(); + @Deprecated public SegmentGeneratorConfig() { } @@ -174,12 +177,24 @@ public class SegmentGeneratorConfig { _invertedIndexCreationColumns = indexingConfig.getInvertedIndexColumns(); } + List<FieldConfig> fieldConfigList = tableConfig.getFieldConfigList(); + if (fieldConfigList != null) { + for (FieldConfig fieldConfig : fieldConfigList) { + _columnProperties.put(fieldConfig.getName(), fieldConfig.getProperties()); + } + } + extractTextIndexColumnsFromTableConfig(tableConfig); _nullHandlingEnabled = indexingConfig.isNullHandlingEnabled(); } } + @Nonnull + public Map<String, Map<String, String>> getColumnProperties() { + return _columnProperties; + } + /** * Set time column details using the given time column */ diff --git a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/mutable/MutableSegmentImpl.java b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/mutable/MutableSegmentImpl.java index 07e5ec9..1f3f2e0 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/mutable/MutableSegmentImpl.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/indexsegment/mutable/MutableSegmentImpl.java @@ -330,8 +330,32 @@ public class MutableSegmentImpl implements MutableSegment { */ private boolean isNoDictionaryColumn(Set<String> noDictionaryColumns, Set<String> invertedIndexColumns, Set<String> textIndexColumns, FieldSpec fieldSpec, String column) { - return textIndexColumns.contains(column) || (noDictionaryColumns.contains(column) && fieldSpec.isSingleValueField() - && !invertedIndexColumns.contains(column)); + if (textIndexColumns.contains(column)) { + // text column is no dictionary currently + return true; + } + FieldSpec.DataType dataType = fieldSpec.getDataType(); + if (noDictionaryColumns.contains(column)) { + // Earlier we didn't support noDict in consuming segments for STRING and BYTES columns. + // So even if the user had the column in noDictionaryColumns set in table config, we still + // created dictionary in consuming segments. + // Later on we added this support. There is a particular impact of this change on the use cases + // that have set noDict on their STRING dimension columns for other performance + // reasons and also want metricsAggregation. These use cases don't get to + // aggregateMetrics because the new implementation is able to honor their table config setting + // of noDict on STRING/BYTES. Without metrics aggregation, memory pressure increases. + // So to continue aggregating metrics for such cases, we will create dictionary even + // if the column is part of noDictionary set from table config + if (fieldSpec instanceof DimensionFieldSpec && _aggregateMetrics && (dataType == FieldSpec.DataType.STRING || + dataType == FieldSpec.DataType.BYTES)) { + return false; + } + // MV columns are always dictionary encoded + // So don't create dictionary + return fieldSpec.isSingleValueField() && !invertedIndexColumns.contains(column); + } + // column is not a part of noDictionary set, so create dictionary + return false; } public SegmentPartitionConfig getSegmentPartitionConfig() { diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java index 12e6feb..496acc3 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java @@ -53,6 +53,7 @@ import org.apache.pinot.core.segment.creator.impl.inv.OffHeapBitmapInvertedIndex import org.apache.pinot.core.segment.creator.impl.inv.OnHeapBitmapInvertedIndexCreator; import org.apache.pinot.core.segment.creator.impl.inv.text.LuceneTextIndexCreator; import org.apache.pinot.core.segment.creator.impl.nullvalue.NullValueVectorCreator; +import org.apache.pinot.spi.config.table.FieldConfig; import org.apache.pinot.spi.data.DateTimeFieldSpec; import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.data.FieldSpec.FieldType; @@ -193,9 +194,10 @@ public class SegmentColumnarIndexCreator implements SegmentCreator { getColumnCompressionType(segmentCreationSpec, fieldSpec); // Initialize forward index creator + boolean deriveNumChunksForVarByteRawIndex = shouldDeriveNumChunksForRawIndex(columnName, segmentCreationSpec.getColumnProperties()); _forwardIndexCreatorMap.put(columnName, getRawIndexCreatorForColumn(_indexDir, compressionType, columnName, fieldSpec.getDataType(), totalDocs, - indexCreationInfo.getLengthOfLongestEntry())); + indexCreationInfo.getLengthOfLongestEntry(), deriveNumChunksForVarByteRawIndex)); // Initialize text index creator if (_textIndexColumns.contains(columnName)) { @@ -213,6 +215,14 @@ public class SegmentColumnarIndexCreator implements SegmentCreator { } } + private boolean shouldDeriveNumChunksForRawIndex(String columnName, Map<String, Map<String, String>> columnProperties) { + if (columnProperties != null) { + Map<String, String> properties = columnProperties.get(columnName); + return properties != null && Boolean.parseBoolean(properties.get(FieldConfig.DERIVE_NUM_CHUNKS_RAW_INDEX_KEY)); + } + return false; + } + /** * Helper method that returns compression type to use based on segment creation spec and field type. * <ul> @@ -539,6 +549,13 @@ public class SegmentColumnarIndexCreator implements SegmentCreator { ChunkCompressorFactory.CompressionType compressionType, String column, FieldSpec.DataType dataType, int totalDocs, int lengthOfLongestEntry) throws IOException { + return getRawIndexCreatorForColumn(file, compressionType, column, dataType, totalDocs, lengthOfLongestEntry, false); + } + + public static SingleValueRawIndexCreator getRawIndexCreatorForColumn(File file, + ChunkCompressorFactory.CompressionType compressionType, String column, FieldSpec.DataType dataType, int totalDocs, + int lengthOfLongestEntry, boolean deriveNumChunksForVarByteRawIndex) + throws IOException { SingleValueRawIndexCreator indexCreator; switch (dataType) { @@ -561,7 +578,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator { case STRING: case BYTES: indexCreator = - new SingleValueVarByteRawIndexCreator(file, compressionType, column, totalDocs, lengthOfLongestEntry); + new SingleValueVarByteRawIndexCreator(file, compressionType, column, totalDocs, lengthOfLongestEntry, + deriveNumChunksForVarByteRawIndex); break; default: diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java index a8d7e6b..51c46a8 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java @@ -28,6 +28,7 @@ import org.apache.pinot.core.segment.creator.impl.V1Constants; public class SingleValueVarByteRawIndexCreator extends BaseSingleValueRawIndexCreator { + private static final int DEFAULT_NUM_DOCS_PER_CHUNK = 1000; private static final int TARGET_MAX_CHUNK_SIZE = 1024 * 1024; private final VarByteChunkSingleValueWriter _indexWriter; @@ -35,8 +36,15 @@ public class SingleValueVarByteRawIndexCreator extends BaseSingleValueRawIndexCr public SingleValueVarByteRawIndexCreator(File baseIndexDir, ChunkCompressorFactory.CompressionType compressionType, String column, int totalDocs, int maxLength) throws IOException { + this(baseIndexDir, compressionType, column, totalDocs, maxLength, false); + } + + public SingleValueVarByteRawIndexCreator(File baseIndexDir, ChunkCompressorFactory.CompressionType compressionType, + String column, int totalDocs, int maxLength, boolean useLargeIndexFormat) + throws IOException { File file = new File(baseIndexDir, column + V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); - _indexWriter = new VarByteChunkSingleValueWriter(file, compressionType, totalDocs, getNumDocsPerChunk(maxLength), maxLength); + int numDocsPerChunk = useLargeIndexFormat ? getNumDocsPerChunk(maxLength) : DEFAULT_NUM_DOCS_PER_CHUNK; + _indexWriter = new VarByteChunkSingleValueWriter(file, compressionType, totalDocs, numDocsPerChunk, maxLength); } @VisibleForTesting diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java index 8e0d2e0..75cc28b 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java @@ -36,6 +36,7 @@ public class FieldConfig extends BaseJsonConfig { public static String BLOOM_FILTER_COLUMN_KEY = "bloom.filter"; public static String ON_HEAP_DICTIONARY_COLUMN_KEY = "onheap.dictionary"; public static String VAR_LENGTH_DICTIONARY_COLUMN_KEY = "var.length.dictionary"; + public static String DERIVE_NUM_CHUNKS_RAW_INDEX_KEY = "derive.num.chunks.raw.index"; public static String TEXT_INDEX_REALTIME_READER_REFRESH_KEY = "text.index.realtime.reader.refresh"; // Lucene creates a query result cache if this option is enabled --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org