clintropolis commented on code in PR #15920:
URL: https://github.com/apache/druid/pull/15920#discussion_r1522590886
##########
docs/querying/arrays.md:
##########
@@ -71,19 +71,61 @@ The following shows an example `dimensionsSpec` for native
ingestion of the data
### SQL-based ingestion
-Arrays can also be inserted with [SQL-based
ingestion](../multi-stage-query/index.md) when you include a query context
parameter
[`"arrayIngestMode":"array"`](../multi-stage-query/reference.md#context-parameters).
+#### `arrayIngestMode`
+
+Arrays can be inserted with [SQL-based
ingestion](../multi-stage-query/index.md) when you include the query context
+parameter `arrayIngestMode: array`.
+
+When `arrayIngestMode` is `array`, SQL ARRAY types are stored using Druid
array columns. This is recommended for new
+tables.
+
+When `arrayIngestMode` is `mvd`, SQL `VARCHAR ARRAY` are implicitly wrapped in
[`ARRAY_TO_MV`](sql-functions.md#array_to_mv).
+This causes them to be stored as [multi-value
strings](multi-value-dimensions.md), using the same `STRING` column type
+as regular scalar strings. SQL `BIGINT ARRAY` and `DOUBLE ARRAY` cannot be
loaded under `arrayIngestMode: mvd`. This
+is the default behavior when `arrayIngestMode` is not provided in your query
context, although the default behavior
+may change to `array` in a future release.
Review Comment:
nit: will change
##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/DimensionSchemaUtils.java:
##########
@@ -57,76 +57,112 @@ public static DimensionSchema
createDimensionSchemaForExtern(final String column
);
}
+ /**
+ * Create a dimension schema for a dimension column, given the type that it
was assigned in the query, and the
+ * current values of {@link MultiStageQueryContext#CTX_USE_AUTO_SCHEMAS} and
+ * {@link MultiStageQueryContext#CTX_ARRAY_INGEST_MODE}.
+ *
+ * @param column column name
+ * @param queryType type of the column from the query
+ * @param useAutoType active value of {@link
MultiStageQueryContext#CTX_USE_AUTO_SCHEMAS}
+ * @param arrayIngestMode active value of {@link
MultiStageQueryContext#CTX_ARRAY_INGEST_MODE}
+ */
public static DimensionSchema createDimensionSchema(
final String column,
- @Nullable final ColumnType type,
+ @Nullable final ColumnType queryType,
boolean useAutoType,
ArrayIngestMode arrayIngestMode
)
{
if (useAutoType) {
// for complex types that are not COMPLEX<json>, we still want to use
the handler since 'auto' typing
// only works for the 'standard' built-in types
- if (type != null && type.is(ValueType.COMPLEX) &&
!ColumnType.NESTED_DATA.equals(type)) {
- final ColumnCapabilities capabilities =
ColumnCapabilitiesImpl.createDefault().setType(type);
+ if (queryType != null && queryType.is(ValueType.COMPLEX) &&
!ColumnType.NESTED_DATA.equals(queryType)) {
+ final ColumnCapabilities capabilities =
ColumnCapabilitiesImpl.createDefault().setType(queryType);
return DimensionHandlerUtils.getHandlerFromCapabilities(column,
capabilities, null)
.getDimensionSchema(capabilities);
}
- if (type != null && (type.isPrimitive() || type.isPrimitiveArray())) {
- return new AutoTypeColumnSchema(column, type);
+ if (queryType != null && (queryType.isPrimitive() ||
queryType.isPrimitiveArray())) {
+ return new AutoTypeColumnSchema(column, queryType);
}
return new AutoTypeColumnSchema(column, null);
} else {
- // if schema information is not available, create a string dimension
- if (type == null) {
- return new StringDimensionSchema(column);
- } else if (type.getType() == ValueType.STRING) {
- return new StringDimensionSchema(column);
- } else if (type.getType() == ValueType.LONG) {
+ // dimensionType may not be identical to queryType, depending on
arrayIngestMode.
+ final ColumnType dimensionType = getDimensionType(queryType,
arrayIngestMode);
+
+ if (dimensionType.getType() == ValueType.STRING) {
+ return new StringDimensionSchema(
+ column,
+ queryType != null && queryType.isArray()
+ ? DimensionSchema.MultiValueHandling.ARRAY
+ : DimensionSchema.MultiValueHandling.SORTED_ARRAY,
+ null
+ );
+ } else if (dimensionType.getType() == ValueType.LONG) {
return new LongDimensionSchema(column);
- } else if (type.getType() == ValueType.FLOAT) {
+ } else if (dimensionType.getType() == ValueType.FLOAT) {
return new FloatDimensionSchema(column);
- } else if (type.getType() == ValueType.DOUBLE) {
+ } else if (dimensionType.getType() == ValueType.DOUBLE) {
return new DoubleDimensionSchema(column);
- } else if (type.getType() == ValueType.ARRAY) {
- ValueType elementType = type.getElementType().getType();
- if (elementType == ValueType.STRING) {
- if (arrayIngestMode == ArrayIngestMode.NONE) {
- throw InvalidInput.exception(
- "String arrays can not be ingested when '%s' is set to '%s'.
Set '%s' in query context "
- + "to 'array' to ingest the string array as an array, or
ingest it as an MVD by explicitly casting the "
- + "array to an MVD with ARRAY_TO_MV function.",
- MultiStageQueryContext.CTX_ARRAY_INGEST_MODE,
- StringUtils.toLowerCase(arrayIngestMode.name()),
- MultiStageQueryContext.CTX_ARRAY_INGEST_MODE
- );
- } else if (arrayIngestMode == ArrayIngestMode.MVD) {
- return new StringDimensionSchema(column,
DimensionSchema.MultiValueHandling.ARRAY, null);
- } else {
- // arrayIngestMode == ArrayIngestMode.ARRAY would be true
- return new AutoTypeColumnSchema(column, type);
- }
- } else if (elementType.isNumeric()) {
- // ValueType == LONG || ValueType == FLOAT || ValueType == DOUBLE
- if (arrayIngestMode == ArrayIngestMode.ARRAY) {
- return new AutoTypeColumnSchema(column, type);
- } else {
- throw InvalidInput.exception(
- "Numeric arrays can only be ingested when '%s' is set to
'array' in the MSQ query's context. "
- + "Current value of the parameter [%s]",
- MultiStageQueryContext.CTX_ARRAY_INGEST_MODE,
- StringUtils.toLowerCase(arrayIngestMode.name())
- );
- }
- } else {
- throw new ISE("Cannot create dimension for type [%s]",
type.toString());
- }
+ } else if (dimensionType.getType() == ValueType.ARRAY) {
+ return new AutoTypeColumnSchema(column, dimensionType);
} else {
- final ColumnCapabilities capabilities =
ColumnCapabilitiesImpl.createDefault().setType(type);
+ final ColumnCapabilities capabilities =
ColumnCapabilitiesImpl.createDefault().setType(dimensionType);
return DimensionHandlerUtils.getHandlerFromCapabilities(column,
capabilities, null)
.getDimensionSchema(capabilities);
}
}
}
+
+ /**
+ * Based on a type from a query result, get the type of dimension we should
write.
+ *
+ * @throws org.apache.druid.error.DruidException if there is some problem
+ */
+ public static ColumnType getDimensionType(
+ @Nullable final ColumnType queryType,
+ final ArrayIngestMode arrayIngestMode
+ )
+ {
+ if (queryType == null) {
+ // if schema information is not available, create a string dimension
+ return ColumnType.STRING;
Review Comment:
I wonder if this should return `null` to make `createDimensionSchema`
decide to use the string schema (in case down the line we ever want to switch
that to the auto schema)
##########
docs/querying/arrays.md:
##########
@@ -71,19 +71,61 @@ The following shows an example `dimensionsSpec` for native
ingestion of the data
### SQL-based ingestion
-Arrays can also be inserted with [SQL-based
ingestion](../multi-stage-query/index.md) when you include a query context
parameter
[`"arrayIngestMode":"array"`](../multi-stage-query/reference.md#context-parameters).
+#### `arrayIngestMode`
+
+Arrays can be inserted with [SQL-based
ingestion](../multi-stage-query/index.md) when you include the query context
+parameter `arrayIngestMode: array`.
+
+When `arrayIngestMode` is `array`, SQL ARRAY types are stored using Druid
array columns. This is recommended for new
+tables.
+
+When `arrayIngestMode` is `mvd`, SQL `VARCHAR ARRAY` are implicitly wrapped in
[`ARRAY_TO_MV`](sql-functions.md#array_to_mv).
+This causes them to be stored as [multi-value
strings](multi-value-dimensions.md), using the same `STRING` column type
+as regular scalar strings. SQL `BIGINT ARRAY` and `DOUBLE ARRAY` cannot be
loaded under `arrayIngestMode: mvd`. This
+is the default behavior when `arrayIngestMode` is not provided in your query
context, although the default behavior
+may change to `array` in a future release.
+
+When `arrayIngestMode` is `none`, Druid throws an exception when trying to
store any type of arrays. This mode is most
+useful when set in the system default query context with
`druid.query.default.context.arrayIngestMode = none`, in cases
+where the cluster administrator wants SQL query authors to explicitly provide
one or the other in their query context.
+
+The following table summarizes the differences in SQL ARRAY handling between
`arrayIngestMode: array` and
+`arrayIngestMode: mvd`.
+
+| SQL type | Stored type when `arrayIngestMode: array` | Stored type when
`arrayIngestMode: mvd` (default) |
+|---|---|---|
+|`VARCHAR ARRAY`|`ARRAY<STRING>`|[multi-value
`STRING`](multi-value-dimensions.md)|
+|`BIGINT ARRAY`|`ARRAY<LONG>`|not possible (validation error)|
+|`DOUBLE ARRAY`|`ARRAY<DOUBLE>`|not possible (validation error)|
+
+In either mode, you can explicitly wrap string arrays in `ARRAY_TO_MV` to
cause them to be stored as
+[multi-value strings](multi-value-dimensions.md).
+
+When validating a SQL INSERT or REPLACE statement that contains arrays, Druid
checks whether the statement would lead
+to mixing string arrays and multi-value strings in the same column. If this
condition is detected, the statement fails
+validation unless the column is named under the `skipTypeVerification` context
parameter. This parameter can be either
+a comma-separated list of column names, or a JSON array in string form. This
validation is done to prevent accidentally
+mixing arrays and multi-value strings in the same column.
+
+#### Examples
+
+Set [`arrayIngestMode: array`](#arrayingestmode) in your query context to run
the following examples.
-For example, to insert the data used in this document:
```sql
REPLACE INTO "array_example" OVERWRITE ALL
WITH "ext" AS (
SELECT *
FROM TABLE(
EXTERN(
'{"type":"inline","data":"{\"timestamp\": \"2023-01-01T00:00:00\",
\"label\": \"row1\", \"arrayString\": [\"a\", \"b\"], \"arrayLong\":[1,
null,3], \"arrayDouble\":[1.1, 2.2, null]}\n{\"timestamp\":
\"2023-01-01T00:00:00\", \"label\": \"row2\", \"arrayString\": [null, \"b\"],
\"arrayLong\":null, \"arrayDouble\":[999, null, 5.5]}\n{\"timestamp\":
\"2023-01-01T00:00:00\", \"label\": \"row3\", \"arrayString\": [],
\"arrayLong\":[1, 2, 3], \"arrayDouble\":[null, 2.2, 1.1]} \n{\"timestamp\":
\"2023-01-01T00:00:00\", \"label\": \"row4\", \"arrayString\": [\"a\", \"b\"],
\"arrayLong\":[1, 2, 3], \"arrayDouble\":[]}\n{\"timestamp\":
\"2023-01-01T00:00:00\", \"label\": \"row5\", \"arrayString\": null,
\"arrayLong\":[], \"arrayDouble\":null}"}',
- '{"type":"json"}',
- '[{"name":"timestamp", "type":"STRING"},{"name":"label",
"type":"STRING"},{"name":"arrayString",
"type":"ARRAY<STRING>"},{"name":"arrayLong",
"type":"ARRAY<LONG>"},{"name":"arrayDouble", "type":"ARRAY<DOUBLE>"}]'
+ '{"type":"json"}'
)
+ ) EXTEND (
+ "timestamp" VARCHAR,
+ "label" VARCHAR,
+ "arrayString" VARCHAR ARRAY,
+ "arrayLong" BIGINT ARRAY,
+ "arrayDouble" DOUBLE ARRAY
Review Comment:
Thanks for fixing these up — I wrote these docs before I fixed up `EXTEND` to
support array types :metal:
##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskSqlEngine.java:
##########
@@ -307,6 +337,124 @@ private static void validateLimitAndOffset(final RelNode
topRel, final boolean l
}
}
+ /**
+ * Validate that the query does not include any type changes from string to
array or vice versa.
+ *
+ * These type changes tend to cause problems due to mixing of multi-value
strings and string arrays. In particular,
+ * many queries written in the "classic MVD" style (treating MVDs as if they
were regular strings) will fail when
+ * MVDs and arrays are mixed. So, we detect them as invalid.
+ *
+ * @param rootRel root rel
+ * @param fieldMappings field mappings from {@link #validateInsert(RelNode,
List, Table, PlannerContext)}
+ * @param targetTable table we are inserting (or replacing) into, if any
+ * @param plannerContext planner context
+ */
+ private static void validateTypeChanges(
+ final RelNode rootRel,
+ final List<Pair<Integer, String>> fieldMappings,
+ @Nullable final Table targetTable,
+ final PlannerContext plannerContext
+ )
+ {
+ if (targetTable == null) {
+ return;
Review Comment:
>Maybe when we change the default, we can also consider making it so when
people explicitly set arrayIngestMode: mvd, they need to use ARRAY_TO_MV. (I
think that's what you're suggesting here.)
Yeah, I'm mostly thinking about it in terms of forcing people to write the
correct queries as soon as possible in either mode, to make things less painful
down the road. I was thinking it would be nice to prevent bad habits for new
tables before they have a chance to get started, since those habits are going to
have to change.
It's OK to wait until we change the default, I guess — this change should
allow us to do that much sooner (Druid 30, hopefully).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]