yashmayya commented on code in PR #13037:
URL: https://github.com/apache/pinot/pull/13037#discussion_r1593366648
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java:
##########
@@ -757,57 +753,157 @@ private void createDerivedColumnV1Indices(String column,
FunctionEvaluator funct
} else {
useVarLengthDictionary =
_indexLoadingConfig.getVarLengthDictionaryColumns().contains(column);
}
- indexCreationInfo = new ColumnIndexCreationInfo(statsCollector,
true, useVarLengthDictionary, true,
- new ByteArray((byte[]) fieldSpec.getDefaultNullValue()));
+ indexCreationInfo = new ColumnIndexCreationInfo(statsCollector,
createDictionary, useVarLengthDictionary,
+ true, new ByteArray((byte[]) fieldSpec.getDefaultNullValue()));
break;
}
default:
throw new IllegalStateException();
}
- // Create dictionary
- try (SegmentDictionaryCreator dictionaryCreator = new
SegmentDictionaryCreator(fieldSpec, _indexDir,
- indexCreationInfo.isUseVarLengthDictionary())) {
-
dictionaryCreator.build(indexCreationInfo.getSortedUniqueElementsArray());
+ if (createDictionary) {
+ createDerivedColumnForwardIndexWithDictionary(column, fieldSpec,
outputValues, indexCreationInfo);
+ } else {
+ createDerivedColumnForwardIndexWithoutDictionary(column, fieldSpec,
outputValues, indexCreationInfo);
+ }
+ } finally {
+ for (ValueReader valueReader : valueReaders) {
+ valueReader.close();
+ }
+ }
+ }
+
+ /**
+ * Helper method to create the dictionary and forward indices for a column
with derived values.
+ */
+ private void createDerivedColumnForwardIndexWithDictionary(String column,
FieldSpec fieldSpec, Object[] outputValues,
+ ColumnIndexCreationInfo indexCreationInfo) throws Exception {
+
+ // Create dictionary
+ try (SegmentDictionaryCreator dictionaryCreator = new
SegmentDictionaryCreator(fieldSpec, _indexDir,
+ indexCreationInfo.isUseVarLengthDictionary())) {
+
dictionaryCreator.build(indexCreationInfo.getSortedUniqueElementsArray());
+
+ int numDocs = outputValues.length;
- // Create forward index
- int cardinality = indexCreationInfo.getDistinctValueCount();
+ // Create forward index
+ boolean isSingleValue = fieldSpec.isSingleValueField();
+
+ try (ForwardIndexCreator forwardIndexCreator
+ = getForwardIndexCreator(fieldSpec, indexCreationInfo, numDocs,
column, true)) {
if (isSingleValue) {
- try (ForwardIndexCreator forwardIndexCreator =
indexCreationInfo.isSorted()
- ? new SingleValueSortedForwardIndexCreator(_indexDir, column,
cardinality)
- : new SingleValueUnsortedForwardIndexCreator(_indexDir, column,
cardinality, numDocs)) {
- for (int i = 0; i < numDocs; i++) {
-
forwardIndexCreator.putDictId(dictionaryCreator.indexOfSV(outputValues[i]));
- }
+ for (Object outputValue : outputValues) {
+
forwardIndexCreator.putDictId(dictionaryCreator.indexOfSV(outputValue));
}
} else {
- DictIdCompressionType dictIdCompressionType = null;
- FieldIndexConfigs fieldIndexConfig =
_indexLoadingConfig.getFieldIndexConfig(column);
- if (fieldIndexConfig != null) {
- ForwardIndexConfig forwardIndexConfig =
fieldIndexConfig.getConfig(new ForwardIndexPlugin().getIndexType());
- if (forwardIndexConfig != null) {
- dictIdCompressionType =
forwardIndexConfig.getDictIdCompressionType();
- }
- }
- try (ForwardIndexCreator forwardIndexCreator = dictIdCompressionType
== DictIdCompressionType.MV_ENTRY_DICT
- ? new MultiValueEntryDictForwardIndexCreator(_indexDir, column,
cardinality, numDocs)
- : new MultiValueUnsortedForwardIndexCreator(_indexDir, column,
cardinality, numDocs,
- indexCreationInfo.getTotalNumberOfEntries())) {
- for (int i = 0; i < numDocs; i++) {
-
forwardIndexCreator.putDictIdMV(dictionaryCreator.indexOfMV(outputValues[i]));
- }
+ for (Object outputValue : outputValues) {
+
forwardIndexCreator.putDictIdMV(dictionaryCreator.indexOfMV(outputValue));
}
}
-
// Add the column metadata
SegmentColumnarIndexCreator.addColumnMetadataInfo(_segmentProperties,
column, indexCreationInfo, numDocs,
fieldSpec, true, dictionaryCreator.getNumBytesPerEntry());
}
- } finally {
- for (ValueReader valueReader : valueReaders) {
- valueReader.close();
+ }
+ }
+
+ /**
+ * Helper method to create a forward index for a raw encoded column with
derived values.
+ */
+ private void createDerivedColumnForwardIndexWithoutDictionary(String column,
FieldSpec fieldSpec,
+ Object[] outputValues, ColumnIndexCreationInfo indexCreationInfo)
+ throws Exception {
+
+ // Create forward index
+ int numDocs = outputValues.length;
+ boolean isSingleValue = fieldSpec.isSingleValueField();
+
+ try (ForwardIndexCreator forwardIndexCreator
+ = getForwardIndexCreator(fieldSpec, indexCreationInfo, numDocs,
column, false)) {
+ if (isSingleValue) {
+ for (Object outputValue : outputValues) {
+ switch (fieldSpec.getDataType().getStoredType()) {
+ // Casts are safe here because we've already done the conversion
in createDerivedColumnV1Indices
+ case INT:
+ forwardIndexCreator.putInt((int) outputValue);
+ break;
+ case LONG:
+ forwardIndexCreator.putLong((long) outputValue);
+ break;
+ case FLOAT:
+ forwardIndexCreator.putFloat((float) outputValue);
+ break;
+ case DOUBLE:
+ forwardIndexCreator.putDouble((double) outputValue);
+ break;
+ case BIG_DECIMAL:
+ forwardIndexCreator.putBigDecimal((BigDecimal) outputValue);
+ break;
+ case STRING:
+ forwardIndexCreator.putString((String) outputValue);
+ break;
+ case BYTES:
+ forwardIndexCreator.putBytes((byte[]) outputValue);
+ break;
+ default:
+ throw new IllegalStateException();
+ }
+ }
+ } else {
+ for (Object outputValue : outputValues) {
+ switch (fieldSpec.getDataType().getStoredType()) {
+ // Casts are safe here because we've already done the conversion
in createDerivedColumnV1Indices
+ case INT:
+ forwardIndexCreator.putIntMV(ArrayUtils.toPrimitive((Integer[])
outputValue));
Review Comment:
We're using the primitive wrapper classes
[here](https://github.com/apache/pinot/blob/363a03eea82a66aef4af724625dc714676018aa5/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java#L612)
for instance because in the dictionary MV case, we're casting to `Object[]`
which isn't possible with primitive arrays. So we could add separate branches
in each case
[here](https://github.com/apache/pinot/blob/363a03eea82a66aef4af724625dc714676018aa5/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java#L603)
for multi-value fields to either create a primitive array (for raw encoded
column) or an array of primitive wrapper classes (for dictionary encoded
column). But then, I think we'd also need to update all the column statistics
collector classes, which currently only handle two cases - the entry is either
something that can be casted to an `Obj
ect[]` (i.e., an `Integer[]` for `IntColumnPreIndexStatsCollector`), or else
its treated as a single-value integer. Primitive arrays (`int[]` for instance)
don't seem to be currently handled appropriately (since they can't be cast to
`Object[]`) - see
[here](https://github.com/apache/pinot/blob/363a03eea82a66aef4af724625dc714676018aa5/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/stats/IntColumnPreIndexStatsCollector.java#L38-L61)
for example.
I can certainly make that change here in this PR though if you think it
makes sense. Alternatively, I'd be happy to file a follow-up PR to make this
optimization and related changes too. Thoughts?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]