kgyrtkirk commented on code in PR #15962:
URL: https://github.com/apache/druid/pull/15962#discussion_r1521333936
##########
sql/src/main/java/org/apache/druid/sql/calcite/planner/CalcitePlanner.java:
##########
@@ -403,6 +403,7 @@ private SqlValidator
createSqlValidator(CalciteCatalogReader catalogReader)
SqlValidator.Config.DEFAULT.withConformance(connectionConfig.conformance())
.withLenientOperatorLookup(connectionConfig.lenientOperatorLookup())
.withIdentifierExpansion(true);
+ //.withTypeCoercionEnabled(false);
Review Comment:
nit: commented line
##########
extensions-core/druid-catalog/src/test/java/org/apache/druid/catalog/sql/CatalogIngestionTest.java:
##########
@@ -171,4 +196,313 @@ public void testInsertHourGrainWithDay()
)
.verify();
}
+
+ /**
+ * If the segment grain is given in the catalog and absent in the
PARTITIONED BY clause in the query, then use the
+ * value from the catalog.
+ */
+ @Test
+ public void testReplaceHourGrainPartitonedByFromCatalog()
+ {
+ testIngestionQuery()
+ .sql("REPLACE INTO hourDs OVERWRITE ALL\n" +
+ "SELECT * FROM foo")
+ .authentication(CalciteTests.SUPER_USER_AUTH_RESULT)
+ .expectTarget("hourDs", FOO_SIGNATURE)
+ .expectResources(dataSourceWrite("hourDs"), dataSourceRead("foo"))
+ .expectQuery(
+ newScanQueryBuilder()
+ .dataSource("foo")
+ .intervals(querySegmentSpec(Filtration.eternity()))
+ .columns("__time", "cnt", "dim1", "dim2", "extra1", "extra2",
"extra3", "m1", "m2")
+ .context(queryContextWithGranularity(Granularities.HOUR))
+ .build()
+ )
+ .verify();
+ }
+
+ /**
+ * If the segment grain is given in the catalog, and also by PARTITIONED BY,
then
+ * the query value is used.
+ */
+ @Test
+ public void testReplaceHourGrainWithDayPartitonedByFromQuery()
+ {
+ testIngestionQuery()
+ .sql("REPLACE INTO hourDs OVERWRITE ALL\n" +
+ "SELECT * FROM foo\n" +
+ "PARTITIONED BY day")
+ .authentication(CalciteTests.SUPER_USER_AUTH_RESULT)
+ .expectTarget("hourDs", FOO_SIGNATURE)
+ .expectResources(dataSourceWrite("hourDs"), dataSourceRead("foo"))
+ .expectQuery(
+ newScanQueryBuilder()
+ .dataSource("foo")
+ .intervals(querySegmentSpec(Filtration.eternity()))
+ .columns("__time", "cnt", "dim1", "dim2", "extra1", "extra2",
"extra3", "m1", "m2")
+ .context(queryContextWithGranularity(Granularities.DAY))
+ .build()
+ )
+ .verify();
+ }
+
+ /**
+ * Attempt to verify that types specified in the catalog are pushed down to
+ * MSQ. At present, Druid does not have the tools needed to do a full
push-down.
+ * We have to accept a good-enough push-down: that the produced type is at
least
+ * compatible with the desired type.
+ */
+ @Test
+ public void testInsertIntoCatalogTable()
+ {
+ ExternalDataSource externalDataSource = new ExternalDataSource(
+ new InlineInputSource("2022-12-26T12:34:56,extra,10,\"20\",foo\n"),
+ new CsvInputFormat(ImmutableList.of("a", "b", "c", "d", "e"), null,
false, false, 0),
+ RowSignature.builder()
+ .add("a", ColumnType.STRING)
+ .add("b", ColumnType.STRING)
+ .add("c", ColumnType.LONG)
+ .add("d", ColumnType.STRING)
+ .add("e", ColumnType.STRING)
+ .build()
+ );
+ final RowSignature signature = RowSignature.builder()
+ .add("__time", ColumnType.LONG)
+ .add("dim1", ColumnType.STRING)
+ .add("cnt", ColumnType.LONG)
+ .add("m1", ColumnType.DOUBLE)
+ .add("extra2", ColumnType.LONG)
+ .add("extra3", ColumnType.STRING)
+ .build();
+ testIngestionQuery()
+ .sql("INSERT INTO foo\n" +
+ "SELECT\n" +
+ " TIME_PARSE(a) AS __time,\n" +
+ " b AS dim1,\n" +
+ " 1 AS cnt,\n" +
+ " c AS m1,\n" +
+ " CAST(d AS BIGINT) AS extra2,\n" +
+ " e AS extra3\n" +
+ "FROM TABLE(inline(\n" +
+ " data => ARRAY['2022-12-26T12:34:56,extra,10,\"20\",foo'],\n" +
+ " format => 'csv'))\n" +
+ " (a VARCHAR, b VARCHAR, c BIGINT, d VARCHAR, e VARCHAR)\n" +
+ "PARTITIONED BY ALL TIME")
+ .authentication(CalciteTests.SUPER_USER_AUTH_RESULT)
+ .expectTarget("foo", signature)
+ .expectResources(dataSourceWrite("foo"),
Externals.externalRead("EXTERNAL"))
+ .expectQuery(
+ newScanQueryBuilder()
+ .dataSource(externalDataSource)
+ .intervals(querySegmentSpec(Filtration.eternity()))
+ .virtualColumns(
+ expressionVirtualColumn("v0",
"timestamp_parse(\"a\",null,'UTC')", ColumnType.LONG),
+ expressionVirtualColumn("v1", "1", ColumnType.LONG),
+ expressionVirtualColumn("v2", "CAST(\"c\", 'DOUBLE')",
ColumnType.DOUBLE),
+ expressionVirtualColumn("v3", "CAST(\"d\", 'LONG')",
ColumnType.LONG)
+ )
+ // Scan query lists columns in alphabetical order independent
of the
+ // SQL project list or the defined schema. Here we just check
that the
+ // set of columns is correct, but not their order.
+ .columns("b", "e", "v0", "v1", "v2", "v3")
Review Comment:
it's hard to interpret this plan like this... what will permute `v0` to be the
1st column?
shouldn't that be in the plan? ...or the rename of the columns to their
output names?
##########
sql/src/main/java/org/apache/druid/sql/calcite/planner/DruidSqlValidator.java:
##########
@@ -226,6 +235,20 @@ public void validateInsert(final SqlInsert insert)
// Determine the output (target) schema.
final RelDataType targetType = validateTargetType(scope, insertNs, insert,
sourceType, tableMetadata);
+ // WITH node type is computed to be the type of the body recursively in
+ // org.apache.calcite.sql2rel.SqlToRelConverter.convertQuery}. If this
computed type
+ // is different than the type validated and stored for the node in memory
a nasty relational
+ // algebra error will occur in
org.apache.calcite.sql2rel.SqlToRelConverter.checkConvertedType.
+ // During the validateTargetType call above, the WITH body node validated
type may be updated
+ // with any coercions applied. We update the validated node type of the
WITH node here so
+ // that they are consistent.
+ if (source instanceof SqlWith) {
+ final RelDataType withBodyType = getValidatedNodeTypeIfKnown(((SqlWith)
source).body);
Review Comment:
I wonder if this is the right approach; is the issue coming from something
like:
* the WITH may supply `null`s
* the target table doesn't accept `null` values
I wonder why not make this `replaceWithDefault` sacred by adding
`COALESCE(colVal, 0)` and similar crap; so that Calcite is also aware of it... the
runtime could remove the `coalesce` if it's known that it's pointless anyway
##########
sql/src/main/java/org/apache/druid/sql/calcite/planner/DruidSqlValidator.java:
##########
@@ -432,11 +456,18 @@ private RelDataType validateTargetType(
fields.add(Pair.of(colName, sourceField.getType()));
continue;
}
- RelDataType relType =
typeFactory.createSqlType(SqlTypeName.get(sqlTypeName));
- fields.add(Pair.of(
- colName,
- typeFactory.createTypeWithNullability(relType, true)
- ));
+ if (NullHandling.replaceWithDefault()) {
Review Comment:
I wonder if the resultset may contain `null` even when
`replaceWithDefault()` is `true`
fyi: `RowSignatures#toRelDataType` creates nullable strings regardless of
`replaceWithDefault`
##########
sql/src/main/java/org/apache/druid/sql/calcite/planner/DruidSqlValidator.java:
##########
@@ -449,6 +480,94 @@ private RelDataType validateTargetType(
return targetType;
}
+ @Override
+ protected void checkTypeAssignment(
+ @Nullable SqlValidatorScope sourceScope,
+ SqlValidatorTable table,
+ RelDataType sourceRowType,
+ RelDataType targetRowType,
+ final SqlNode query
+ )
+ {
+ if (SqlTypeUtil.equalAsStructSansNullability(typeFactory,
+ sourceRowType, targetRowType, null)) {
+ // Returns early if source and target row type equals sans nullability.
Review Comment:
is this necessary? inserting a `non-null` into a `nullable` column is not the
same as the opposite
##########
sql/src/main/java/org/apache/druid/sql/calcite/planner/DruidSqlValidator.java:
##########
@@ -449,6 +480,94 @@ private RelDataType validateTargetType(
return targetType;
}
+ @Override
+ protected void checkTypeAssignment(
+ @Nullable SqlValidatorScope sourceScope,
+ SqlValidatorTable table,
+ RelDataType sourceRowType,
+ RelDataType targetRowType,
+ final SqlNode query
+ )
+ {
+ if (SqlTypeUtil.equalAsStructSansNullability(typeFactory,
+ sourceRowType, targetRowType, null)) {
+ // Returns early if source and target row type equals sans nullability.
+ return;
+ }
+ final List<RelDataTypeField> sourceFields = sourceRowType.getFieldList();
+ List<RelDataTypeField> targetFields = targetRowType.getFieldList();
+ final int sourceCount = sourceFields.size();
+ for (int i = 0; i < sourceCount; ++i) {
+ RelDataType sourceFielRelDataType = sourceFields.get(i).getType();
+ RelDataType targetFieldRelDataType = targetFields.get(i).getType();
+ ColumnType sourceFieldColumnType =
Calcites.getColumnTypeForRelDataType(sourceFielRelDataType);
+ ColumnType targetFieldColumnType =
Calcites.getColumnTypeForRelDataType(targetFieldRelDataType);
+
+ if (targetFieldColumnType !=
ColumnType.leastRestrictiveType(targetFieldColumnType, sourceFieldColumnType)) {
+ SqlNode node = getNthExpr(query, i, sourceCount);
+ String targetTypeString;
+ String sourceTypeString;
+ if (SqlTypeUtil.areCharacterSetsMismatched(
+ sourceFielRelDataType,
+ targetFieldRelDataType)) {
+ sourceTypeString = sourceFielRelDataType.getFullTypeString();
+ targetTypeString = targetFieldRelDataType.getFullTypeString();
+ } else {
+ sourceTypeString = sourceFielRelDataType.toString();
+ targetTypeString = targetFieldRelDataType.toString();
+ }
+ throw newValidationError(node,
+ Static.RESOURCE.typeNotAssignable(
+ targetFields.get(i).getName(), targetTypeString,
+ sourceFields.get(i).getName(), sourceTypeString));
+ }
+ }
+ // the call to base class definition will insert implicit casts /
coercions where needed.
+ super.checkTypeAssignment(sourceScope, table, sourceRowType,
targetRowType, query);
+ }
+
+ /**
+ * Locates the n'th expression in an INSERT or UPDATE query.
+ *
+ * @param query Query
+ * @param ordinal Ordinal of expression
+ * @param sourceCount Number of expressions
+ * @return Ordinal'th expression, never null
+ */
+ private static SqlNode getNthExpr(SqlNode query, int ordinal, int
sourceCount)
Review Comment:
I wonder if there is a better way for this; I see that it's hard to get
access to this as that part of the validation was already done
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]