imply-cheddar commented on code in PR #13803:
URL: https://github.com/apache/druid/pull/13803#discussion_r1135011007
##########
processing/src/main/java/org/apache/druid/data/input/impl/DelimitedInputFormat.java:
##########
@@ -41,6 +42,18 @@
public class DelimitedInputFormat extends FlatTextInputFormat
{
public static final String TYPE_KEY = "tsv";
+
+ public static DelimitedInputFormat ofColumns(String... columns)
+ {
+ return new DelimitedInputFormat(
+ Arrays.asList(columns),
+ null,
+ null,
+ false,
+ false,
+ 0
+ );
+ }
Review Comment:
Location nit: does this need to be in `main` rather than `test`?
##########
processing/src/main/java/org/apache/druid/data/input/impl/MapInputRowParser.java:
##########
@@ -77,22 +76,36 @@ public static InputRow parse(InputRowSchema inputRowSchema,
Map<String, Object>
* 3) If isIncludeAllDimensions is not set and {@link
DimensionsSpec#getDimensionNames()} is empty,
* the dimensions in the given map is returned.
*
- * In any case, the returned list does not include any dimensions in {@link
DimensionsSpec#getDimensionExclusions()}.
+ * In any case, the returned list does not include any dimensions in {@link
DimensionsSpec#getDimensionExclusions()}
+ * or {@link TimestampSpec#getTimestampColumn()}.
*/
private static List<String> findDimensions(
+ TimestampSpec timestampSpec,
DimensionsSpec dimensionsSpec,
Map<String, Object> rawInputRow
)
{
if (dimensionsSpec.isIncludeAllDimensions()) {
LinkedHashSet<String> dimensions = new
LinkedHashSet<>(dimensionsSpec.getDimensionNames());
- dimensions.addAll(Sets.difference(rawInputRow.keySet(),
dimensionsSpec.getDimensionExclusions()));
+ for (String field : rawInputRow.keySet()) {
+ if (timestampSpec.getTimestampColumn().equals(field) ||
dimensionsSpec.getDimensionExclusions().contains(field)) {
Review Comment:
can we just get these two things once?
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -437,41 +471,51 @@ public Class<?> classOfObject()
this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
}
- private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable
Object value)
+ private StructuredDataProcessor.ProcessedLiteral<?>
processValue(ExprEval<?> eval)
{
- // null value is always added to the global dictionary as id 0, so we
can ignore them here
- if (value != null) {
- // why not
- ExprEval<?> eval = ExprEval.bestEffortOf(value);
- final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
- switch (columnType.getType()) {
- case LONG:
- globalDimensionDictionary.addLongValue(eval.asLong());
- typeSet.add(ColumnType.LONG);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asLong(),
- StructuredDataProcessor.getLongObjectEstimateSize()
- );
- case DOUBLE:
- globalDimensionDictionary.addDoubleValue(eval.asDouble());
- typeSet.add(ColumnType.DOUBLE);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asDouble(),
- StructuredDataProcessor.getDoubleObjectEstimateSize()
- );
- case STRING:
- default:
- final String asString = eval.asString();
- globalDimensionDictionary.addStringValue(asString);
- typeSet.add(ColumnType.STRING);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asString(),
- StructuredDataProcessor.estimateStringSize(asString)
- );
- }
+ final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+ int sizeEstimate;
+ switch (columnType.getType()) {
+ case LONG:
+ typeSet.add(ColumnType.LONG);
+ sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+ return new StructuredDataProcessor.ProcessedLiteral<>(eval.asLong(),
sizeEstimate);
+ case DOUBLE:
+ typeSet.add(ColumnType.DOUBLE);
+ sizeEstimate =
globalDimensionDictionary.addDoubleValue(eval.asDouble());
+ return new
StructuredDataProcessor.ProcessedLiteral<>(eval.asDouble(), sizeEstimate);
+ case ARRAY:
+ // skip empty arrays for now, they will always be called 'string'
arrays, which isn't very helpful here since
+ // it will pollute the type set
+ Preconditions.checkNotNull(columnType.getElementType(), "Array
element type must not be null");
Review Comment:
If this were to ever happen, I think I'd want to know which field was the
bad one.
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -411,7 +445,7 @@ public Object getObject()
if (0 <= dimIndex && dimIndex < dims.length) {
final StructuredData data = (StructuredData) dims[dimIndex];
if (data != null) {
- return data.getValue();
+ return ExprEval.bestEffortOf(data.getValue()).value();
Review Comment:
Is this counting on coercion or something?
##########
processing/src/main/java/org/apache/druid/data/input/impl/JsonInputFormat.java:
##########
@@ -41,6 +41,14 @@ public class JsonInputFormat extends NestedInputFormat
{
public static final String TYPE_KEY = "json";
+ public static final JsonInputFormat DEFAULT = new JsonInputFormat(
+ JSONPathSpec.DEFAULT,
+ null,
+ null,
+ null,
+ null
+ );
+
Review Comment:
Location nit: does this need to be in `main` rather than `test`?
##########
processing/src/main/java/org/apache/druid/query/metadata/SegmentAnalyzer.java:
##########
@@ -134,6 +134,11 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)
analysis = analyzeStringColumn(capabilities, storageAdapter,
columnName);
}
break;
+ case ARRAY:
+ // todo (clint): this is wack, but works for now because arrays are
always nested complex columns...
Review Comment:
Instead of a todo like this. How about an if check that validates that it's
a nested column and, if it's not a nested column, throws an Exception that is
clearly saying that the code is currently written assuming that the arrays are
delivered only by nested columns?
##########
processing/src/main/java/org/apache/druid/query/expression/NestedDataExpressions.java:
##########
@@ -480,11 +480,25 @@ public Expr apply(List<Expr> args)
final StructuredDataProcessor processor = new StructuredDataProcessor()
{
@Override
- public StructuredDataProcessor.ProcessedLiteral<?>
processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+ public StructuredDataProcessor.ProcessedLiteral<?>
processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object
fieldValue)
{
// do nothing, we only want the list of fields returned by this
processor
return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
}
+
+ @Nullable
+ @Override
+ public ProcessedLiteral<?> processArrayOfLiteralsField(
+ ArrayList<NestedPathPart> fieldPath,
+ @Nullable Object maybeArrayOfLiterals
+ )
+ {
+ ExprEval<?> eval = ExprEval.bestEffortOf(maybeArrayOfLiterals);
+ if (eval.type().isArray() &&
eval.type().getElementType().isPrimitive()) {
+ return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
+ }
Review Comment:
The if condition makes me believe that we found an array and should be
joyously celebrating, but the return value is a `NULL_LITERAL`, why is it
correct to return a `NULL_LITERAL` and how is that different from just
returning `null`?
##########
extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/NestedColumnParquetReaderTest.java:
##########
@@ -181,7 +181,7 @@ public void
testNestedColumnSchemalessNestedTestFileNoNested() throws IOExceptio
);
List<InputRow> rows = readAllRows(reader);
- Assert.assertEquals(ImmutableList.of("dim1", "metric1", "timestamp"),
rows.get(0).getDimensions());
+ Assert.assertEquals(ImmutableList.of("dim1", "metric1"),
rows.get(0).getDimensions());
Review Comment:
Why the change in expectation?
##########
processing/src/main/java/org/apache/druid/query/expression/NestedDataExpressions.java:
##########
@@ -480,11 +480,25 @@ public Expr apply(List<Expr> args)
final StructuredDataProcessor processor = new StructuredDataProcessor()
{
@Override
- public StructuredDataProcessor.ProcessedLiteral<?>
processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+ public StructuredDataProcessor.ProcessedLiteral<?>
processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object
fieldValue)
{
// do nothing, we only want the list of fields returned by this
processor
return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
}
+
+ @Nullable
+ @Override
+ public ProcessedLiteral<?> processArrayOfLiteralsField(
+ ArrayList<NestedPathPart> fieldPath,
+ @Nullable Object maybeArrayOfLiterals
+ )
+ {
+ ExprEval<?> eval = ExprEval.bestEffortOf(maybeArrayOfLiterals);
Review Comment:
Would it make sense to break `ExprEval.bestEffortOf` into a bunch of checks
for different groups of expected types (i.e. `ExprEval.maybeLiteral()` and
`ExprEval.maybeArray`, etc.). Calls to `bestEffortOf` can cascade through, but
places like this that already know some of what they expect can call the one
that more aligns with expectations?
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -59,16 +62,43 @@ public class NestedDataColumnIndexer implements
DimensionIndexer<StructuredData,
protected final StructuredDataProcessor indexerProcessor = new
StructuredDataProcessor()
{
@Override
- public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart>
fieldPath, Object fieldValue)
+ public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart>
fieldPath, @Nullable Object fieldValue)
{
- final String fieldName =
NestedPathFinder.toNormalizedJsonPath(fieldPath);
- LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
- if (fieldIndexer == null) {
- estimatedFieldKeySize +=
StructuredDataProcessor.estimateStringSize(fieldName);
- fieldIndexer = new LiteralFieldIndexer(globalDictionary);
- fieldIndexers.put(fieldName, fieldIndexer);
+ // null value is always added to the global dictionary as id 0, so we
can ignore them here
+ if (fieldValue != null) {
+ // why not
+ final String fieldName =
NestedPathFinder.toNormalizedJsonPath(fieldPath);
+ ExprEval<?> eval = ExprEval.bestEffortOf(fieldValue);
+ LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
+ if (fieldIndexer == null) {
+ estimatedFieldKeySize +=
StructuredDataProcessor.estimateStringSize(fieldName);
+ fieldIndexer = new LiteralFieldIndexer(globalDictionary);
+ fieldIndexers.put(fieldName, fieldIndexer);
+ }
+ return fieldIndexer.processValue(eval);
}
- return fieldIndexer.processValue(fieldValue);
+ return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
+ }
+
+ @Nullable
+ @Override
+ public ProcessedLiteral<?> processArrayOfLiteralsField(
+ ArrayList<NestedPathPart> fieldPath,
+ Object maybeArrayOfLiterals
+ )
+ {
+ final ExprEval<?> maybeLiteralArray =
ExprEval.bestEffortOf(maybeArrayOfLiterals);
+ if (maybeLiteralArray.type().isArray() &&
maybeLiteralArray.type().getElementType().isPrimitive()) {
+ final String fieldName =
NestedPathFinder.toNormalizedJsonPath(fieldPath);
+ LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
+ if (fieldIndexer == null) {
+ estimatedFieldKeySize +=
StructuredDataProcessor.estimateStringSize(fieldName);
+ fieldIndexer = new LiteralFieldIndexer(globalDictionary);
+ fieldIndexers.put(fieldName, fieldIndexer);
+ }
+ return fieldIndexer.processValue(maybeLiteralArray);
+ }
+ return null;
Review Comment:
This looks a lot like code in NestedDataExpressions, except this doesn't
return the `NULL_LITERAL`. I find myself wondering if the
NestedDataExpressions code shouldn't look like this?
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -59,16 +62,43 @@ public class NestedDataColumnIndexer implements
DimensionIndexer<StructuredData,
protected final StructuredDataProcessor indexerProcessor = new
StructuredDataProcessor()
{
@Override
- public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart>
fieldPath, Object fieldValue)
+ public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart>
fieldPath, @Nullable Object fieldValue)
{
- final String fieldName =
NestedPathFinder.toNormalizedJsonPath(fieldPath);
- LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
- if (fieldIndexer == null) {
- estimatedFieldKeySize +=
StructuredDataProcessor.estimateStringSize(fieldName);
- fieldIndexer = new LiteralFieldIndexer(globalDictionary);
- fieldIndexers.put(fieldName, fieldIndexer);
+ // null value is always added to the global dictionary as id 0, so we
can ignore them here
+ if (fieldValue != null) {
+ // why not
Review Comment:
Because I don't know "why?" (if this comment is attempting to add
information, more words please).
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -145,6 +175,10 @@ public DimensionSelector makeDimensionSelector(
final int dimIndex = desc.getIndex();
final ColumnValueSelector<?> rootLiteralSelector =
getRootLiteralValueSelector(currEntry, dimIndex);
if (rootLiteralSelector != null) {
+ final LiteralFieldIndexer root =
fieldIndexers.get(NestedPathFinder.JSON_PATH_ROOT);
+ if (root.getTypes().getSingleType().isArray()) {
+ throw new UnsupportedOperationException("Not supported");
+ }
Review Comment:
I'm reading this as "if all we have are root-level entries and they are
always arrays, then throw a UOE exception". I'm pretty sure I'm reading it
wrong, but wishing the error message gave me more context without me feeling
like I need to expand lines on the review to know what this is validating.
##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -437,41 +471,51 @@ public Class<?> classOfObject()
this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
}
- private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable
Object value)
+ private StructuredDataProcessor.ProcessedLiteral<?>
processValue(ExprEval<?> eval)
{
- // null value is always added to the global dictionary as id 0, so we
can ignore them here
- if (value != null) {
- // why not
- ExprEval<?> eval = ExprEval.bestEffortOf(value);
- final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
- switch (columnType.getType()) {
- case LONG:
- globalDimensionDictionary.addLongValue(eval.asLong());
- typeSet.add(ColumnType.LONG);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asLong(),
- StructuredDataProcessor.getLongObjectEstimateSize()
- );
- case DOUBLE:
- globalDimensionDictionary.addDoubleValue(eval.asDouble());
- typeSet.add(ColumnType.DOUBLE);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asDouble(),
- StructuredDataProcessor.getDoubleObjectEstimateSize()
- );
- case STRING:
- default:
- final String asString = eval.asString();
- globalDimensionDictionary.addStringValue(asString);
- typeSet.add(ColumnType.STRING);
- return new StructuredDataProcessor.ProcessedLiteral<>(
- eval.asString(),
- StructuredDataProcessor.estimateStringSize(asString)
- );
- }
+ final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+ int sizeEstimate;
+ switch (columnType.getType()) {
+ case LONG:
+ typeSet.add(ColumnType.LONG);
+ sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+ return new StructuredDataProcessor.ProcessedLiteral<>(eval.asLong(),
sizeEstimate);
+ case DOUBLE:
+ typeSet.add(ColumnType.DOUBLE);
+ sizeEstimate =
globalDimensionDictionary.addDoubleValue(eval.asDouble());
+ return new
StructuredDataProcessor.ProcessedLiteral<>(eval.asDouble(), sizeEstimate);
+ case ARRAY:
+ // skip empty arrays for now, they will always be called 'string'
arrays, which isn't very helpful here since
+ // it will pollute the type set
Review Comment:
How does it do the skipping of empties?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]