imply-cheddar commented on code in PR #13803:
URL: https://github.com/apache/druid/pull/13803#discussion_r1148813190
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -426,55 +471,71 @@ public Class<?> classOfObject()
};
}
- static class LiteralFieldIndexer
+ static class FieldIndexer
{
private final GlobalDimensionDictionary globalDimensionDictionary;
- private final NestedLiteralTypeInfo.MutableTypeSet typeSet;
+ private final NestedFieldTypeInfo.MutableTypeSet typeSet;
- LiteralFieldIndexer(GlobalDimensionDictionary globalDimensionDictionary)
+ FieldIndexer(GlobalDimensionDictionary globalDimensionDictionary)
{
this.globalDimensionDictionary = globalDimensionDictionary;
- this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
+ this.typeSet = new NestedFieldTypeInfo.MutableTypeSet();
}
- private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable
Object value)
+ private StructuredDataProcessor.ProcessedValue<?> processValue(ExprEval<?>
eval)
{
- // null value is always added to the global dictionary as id 0, so we
can ignore them here
- if (value != null) {
- // why not
- ExprEval<?> eval = ExprEval.bestEffortOf(value);
- final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
- switch (columnType.getType()) {
- case LONG:
- globalDimensionDictionary.addLongValue(eval.asLong());
- typeSet.add(ColumnType.LONG);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asLong(),
- StructuredDataProcessor.getLongObjectEstimateSize()
- );
- case DOUBLE:
- globalDimensionDictionary.addDoubleValue(eval.asDouble());
- typeSet.add(ColumnType.DOUBLE);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asDouble(),
- StructuredDataProcessor.getDoubleObjectEstimateSize()
- );
- case STRING:
- default:
- final String asString = eval.asString();
- globalDimensionDictionary.addStringValue(asString);
- typeSet.add(ColumnType.STRING);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asString(),
- StructuredDataProcessor.estimateStringSize(asString)
- );
- }
+ final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+ int sizeEstimate;
+ switch (columnType.getType()) {
+ case LONG:
+ typeSet.add(ColumnType.LONG);
+ sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+ return new StructuredDataProcessor.ProcessedValue<>(eval.asLong(),
sizeEstimate);
+ case DOUBLE:
+ typeSet.add(ColumnType.DOUBLE);
+ sizeEstimate =
globalDimensionDictionary.addDoubleValue(eval.asDouble());
+ return new StructuredDataProcessor.ProcessedValue<>(eval.asDouble(),
sizeEstimate);
+ case ARRAY:
+ // sanity check, this should never happen
+ Preconditions.checkNotNull(
+ columnType.getElementType(),
+ "Array type [%s] for value [%s] missing element type, how did
this possibly happen?",
+ eval.type(),
+ eval.valueOrDefault()
+ );
+ switch (columnType.getElementType().getType()) {
+ case LONG:
+ typeSet.add(ColumnType.LONG_ARRAY);
+ final Object[] longArray = eval.asArray();
+ sizeEstimate = globalDimensionDictionary.addLongArray(longArray);
+ return new StructuredDataProcessor.ProcessedValue<>(longArray,
sizeEstimate);
+ case DOUBLE:
+ typeSet.add(ColumnType.DOUBLE_ARRAY);
+ final Object[] doubleArray = eval.asArray();
+ sizeEstimate =
globalDimensionDictionary.addDoubleArray(doubleArray);
+ return new StructuredDataProcessor.ProcessedValue<>(doubleArray,
sizeEstimate);
+ case STRING:
+ final Object[] stringArray = eval.asArray();
+ // empty arrays and arrays with all nulls are detected as string
arrays, but dont count them as part of
+ // the type set
+ if (stringArray.length > 0 &&
!Arrays.stream(stringArray).allMatch(Objects::isNull)) {
+ typeSet.add(ColumnType.STRING_ARRAY);
+ }
+ sizeEstimate =
globalDimensionDictionary.addStringArray(stringArray);
+ return new StructuredDataProcessor.ProcessedValue<>(stringArray,
sizeEstimate);
+ default:
+ throw new IAE("Unhandled type: %s", columnType);
Review Comment:
Ah, so `array-of-arrays` should be handled in `StructuredDataProcessor`
instead?
Even after this PR, the handling of `array-of-arrays` and `array-of-objects`
is still a bit muddy, right? (it's falling back to the "an array is just an
object with numbers for field names" behavior?)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]