This is an automated email from the ASF dual-hosted git repository.

karan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new e6a11707cb Adding query stack fault to MSQ to capture native query 
errors. (#13926)
e6a11707cb is described below

commit e6a11707cba7284783860fc1155877676cfae1f0
Author: Karan Kumar <[email protected]>
AuthorDate: Wed Apr 5 16:29:10 2023 +0530

    Adding query stack fault to MSQ to capture native query errors. (#13926)
    
    
    * Add a new fault "QueryRuntimeError" to MSQ engine to capture native query 
errors.
    * Fixed a bug in MSQ fault tolerance where workers were being retried if 
`UnexpectedMultiValueDimensionException` was thrown.
    * Any exception from the query runtime whose package name starts with 
`org.apache.druid.query` is now reported as a QueryRuntimeError.
---
 docs/multi-stage-query/reference.md                |  1 +
 .../java/org/apache/druid/msq/exec/MSQTasks.java   |  9 +--
 .../apache/druid/msq/guice/MSQIndexingModule.java  |  2 +
 .../druid/msq/indexing/error/MSQErrorReport.java   | 12 +++-
 .../msq/indexing/error/QueryRuntimeFault.java      | 79 ++++++++++++++++++++++
 .../org/apache/druid/msq/exec/MSQInsertTest.java   |  7 +-
 .../org/apache/druid/msq/exec/MSQSelectTest.java   | 19 +++---
 .../msq/indexing/error/MSQErrorReportTest.java     | 77 +++++++++++++++++++++
 .../msq/indexing/error/MSQFaultSerdeTest.java      |  2 +
 9 files changed, 187 insertions(+), 21 deletions(-)

diff --git a/docs/multi-stage-query/reference.md 
b/docs/multi-stage-query/reference.md
index 12e56eda31..d34f3e0cf9 100644
--- a/docs/multi-stage-query/reference.md
+++ b/docs/multi-stage-query/reference.md
@@ -750,6 +750,7 @@ The following table describes error codes you may encounter 
in the `multiStageQu
 | <a name="error_InsertTimeOutOfBounds">`InsertTimeOutOfBounds`</a> | A 
REPLACE query generated a timestamp outside the bounds of the TIMESTAMP 
parameter for your OVERWRITE WHERE clause.<br /> <br />To avoid this error, 
verify that the you specified is valid. | `interval`: time chunk interval 
corresponding to the out-of-bounds timestamp |
 | <a name="error_InvalidNullByte">`InvalidNullByte`</a> | A string column 
included a null byte. Null bytes in strings are not permitted. | `column`: The 
column that included the null byte |
 | <a name="error_QueryNotSupported">`QueryNotSupported`</a> | QueryKit could 
not translate the provided native query to a multi-stage query.<br /> <br 
/>This can happen if the query uses features that aren't supported, like 
GROUPING SETS. | |
+| <a name="error_QueryRuntimeError">`QueryRuntimeError`</a> | MSQ uses the 
native query engine to run the leaf stages. This error indicates that the 
failure occurred in the native query runtime.<br /> <br /> Since this is a 
generic error, check the logs for the error message and stack trace to figure 
out the next course of action. If you are stuck, consider raising a GitHub 
issue for assistance. | `baseErrorMessage`: The error message from the native 
query runtime. |
 | <a name="error_RowTooLarge">`RowTooLarge`</a> | The query tried to process a 
row that was too large to write to a single frame. See the [Limits](#limits) 
table for specific limits on frame size. Note that the effective maximum row 
size is smaller than the maximum frame size due to alignment considerations 
during frame writing. | `maxFrameSize`: The limit on the frame size. |
 | <a name="error_TaskStartTimeout">`TaskStartTimeout`</a> | Unable to launch 
`numTasks` tasks within `timeout` milliseconds.<br /><br />There may be 
insufficient available slots to start all the worker tasks simultaneously. Try 
splitting up your query into smaller chunks using a smaller value of 
[`maxNumTasks`](#context-parameters). Another option is to increase capacity. | 
`numTasks`: The number of tasks attempted to launch.<br /><br />`timeout`: 
Timeout, in milliseconds, that was exceeded. |
 | <a name="error_TooManyAttemptsForJob">`TooManyAttemptsForJob`</a> | Total 
relaunch attempt count across all workers exceeded max relaunch attempt limit. 
See the [Limits](#limits) table for the specific limit. | `maxRelaunchCount`: 
Max number of relaunches across all the workers defined in the 
[Limits](#limits) section. <br /><br /> `currentRelaunchCount`: current 
relaunch counter for the job across all workers. <br /><br /> `taskId`: Latest 
task id which failed <br /> <br /> `rootError [...]
diff --git 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java
 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java
index 623d4c737f..3564ea59a9 100644
--- 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java
+++ 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java
@@ -33,6 +33,7 @@ import org.apache.druid.msq.indexing.error.MSQErrorReport;
 import org.apache.druid.msq.indexing.error.MSQException;
 import org.apache.druid.msq.indexing.error.MSQFault;
 import org.apache.druid.msq.indexing.error.MSQFaultUtils;
+import org.apache.druid.msq.indexing.error.QueryRuntimeFault;
 import org.apache.druid.msq.indexing.error.UnknownFault;
 import org.apache.druid.msq.indexing.error.WorkerFailedFault;
 import org.apache.druid.msq.indexing.error.WorkerRpcFailedFault;
@@ -162,12 +163,12 @@ public class MSQTasks
   /**
    * Builds an error report from a possible controller error report and a 
possible worker error report. Both may be
    * null, in which case this function will return a report with {@link 
UnknownFault}.
-   *
+   * <br/>
    * We only include a single {@link MSQErrorReport} in the task report, 
because it's important that a query have
    * a single {@link MSQFault} explaining why it failed. To aid debugging
    * in cases where we choose the controller error over the worker error, 
we'll log the worker error too, even though
    * it doesn't appear in the report.
-   *
+   * <br/>
    * Logic: we prefer the controller exception unless it's {@link 
WorkerFailedFault}, {@link WorkerRpcFailedFault},
    * or {@link CanceledFault}. In these cases we prefer the worker error 
report. This ensures we get the best, most
    * useful exception even when the controller cancels worker tasks after a 
failure. (As tasks are canceled one by
@@ -228,8 +229,8 @@ public class MSQTasks
     logMessage.append(": 
").append(MSQFaultUtils.generateMessageWithErrorCode(errorReport.getFault()));
 
     if (errorReport.getExceptionStackTrace() != null) {
-      if (errorReport.getFault() instanceof UnknownFault) {
-        // Log full stack trace for unknown faults.
+      if (errorReport.getFault() instanceof UnknownFault || 
errorReport.getFault() instanceof QueryRuntimeFault) {
+        // Log full stack trace for unknown and QueryStack faults
         logMessage.append('\n').append(errorReport.getExceptionStackTrace());
       } else {
         // Log first line only (error class, message) for known faults, to 
avoid polluting logs.
diff --git 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQIndexingModule.java
 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQIndexingModule.java
index 49656729a0..ea6a3c5bdb 100644
--- 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQIndexingModule.java
+++ 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQIndexingModule.java
@@ -53,6 +53,7 @@ import 
org.apache.druid.msq.indexing.error.InvalidNullByteFault;
 import org.apache.druid.msq.indexing.error.MSQFault;
 import org.apache.druid.msq.indexing.error.NotEnoughMemoryFault;
 import org.apache.druid.msq.indexing.error.QueryNotSupportedFault;
+import org.apache.druid.msq.indexing.error.QueryRuntimeFault;
 import org.apache.druid.msq.indexing.error.RowTooLargeFault;
 import org.apache.druid.msq.indexing.error.TaskStartTimeoutFault;
 import org.apache.druid.msq.indexing.error.TooManyAttemptsForJob;
@@ -114,6 +115,7 @@ public class MSQIndexingModule implements DruidModule
       InvalidNullByteFault.class,
       NotEnoughMemoryFault.class,
       QueryNotSupportedFault.class,
+      QueryRuntimeFault.class,
       RowTooLargeFault.class,
       TaskStartTimeoutFault.class,
       TooManyBucketsFault.class,
diff --git 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQErrorReport.java
 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQErrorReport.java
index 31bc2753aa..2ed375c791 100644
--- 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQErrorReport.java
+++ 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQErrorReport.java
@@ -26,8 +26,10 @@ import com.google.common.base.Preconditions;
 import com.google.common.base.Throwables;
 import org.apache.druid.frame.processor.FrameRowTooLargeException;
 import org.apache.druid.frame.write.UnsupportedColumnTypeException;
+import org.apache.druid.java.util.common.StringUtils;
 import org.apache.druid.java.util.common.parsers.ParseException;
 import org.apache.druid.msq.statistics.TooManyBucketsException;
+import 
org.apache.druid.query.groupby.epinephelinae.UnexpectedMultiValueDimensionException;
 
 import javax.annotation.Nullable;
 import java.util.Objects;
@@ -47,7 +49,7 @@ public class MSQErrorReport
   MSQErrorReport(
       @JsonProperty("taskId") final String taskId,
       @JsonProperty("host") @Nullable final String host,
-      @JsonProperty("stageNumber") final Integer stageNumber,
+      @JsonProperty("stageNumber") @Nullable final Integer stageNumber,
       @JsonProperty("error") final MSQFault fault,
       @JsonProperty("exceptionStackTrace") @Nullable final String 
exceptionStackTrace
   )
@@ -190,6 +192,14 @@ public class MSQErrorReport
         return new TooManyBucketsFault(((TooManyBucketsException) 
cause).getMaxBuckets());
       } else if (cause instanceof FrameRowTooLargeException) {
         return new RowTooLargeFault(((FrameRowTooLargeException) 
cause).getMaxFrameSize());
+      } else if (cause instanceof UnexpectedMultiValueDimensionException) {
+        return new QueryRuntimeFault(StringUtils.format(
+            "Column [%s] is a multi value string. Please wrap the column using 
MV_TO_ARRAY() to proceed further.",
+            ((UnexpectedMultiValueDimensionException) cause).getDimensionName()
+        ), cause.getMessage());
+      } else if 
(cause.getClass().getPackage().getName().startsWith("org.apache.druid.query")) {
+        // catch all for all query runtime exception faults.
+        return new QueryRuntimeFault(e.getMessage(), null);
       } else {
         cause = cause.getCause();
       }
diff --git 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/QueryRuntimeFault.java
 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/QueryRuntimeFault.java
new file mode 100644
index 0000000000..2a34130579
--- /dev/null
+++ 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/QueryRuntimeFault.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.msq.indexing.error;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+
+import javax.annotation.Nullable;
+import java.util.Objects;
+
+/**
+ * Fault to throw when the error comes from the druid native query runtime 
while running in the MSQ engine .
+ */
+@JsonTypeName(QueryRuntimeFault.CODE)
+public class QueryRuntimeFault extends BaseMSQFault
+{
+  public static final String CODE = "QueryRuntimeError";
+  @Nullable
+  private final String baseErrorMessage;
+
+
+  @JsonCreator
+  public QueryRuntimeFault(
+      @JsonProperty("errorMessage") String errorMessage,
+      @Nullable @JsonProperty("baseErrorMessage") String baseErrorMessage
+  )
+  {
+    super(CODE, errorMessage);
+    this.baseErrorMessage = baseErrorMessage;
+  }
+
+  @JsonProperty
+  @Nullable
+  public String getBaseErrorMessage()
+  {
+    return baseErrorMessage;
+  }
+
+  @Override
+  public boolean equals(Object o)
+  {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    if (!super.equals(o)) {
+      return false;
+    }
+    QueryRuntimeFault that = (QueryRuntimeFault) o;
+    return Objects.equals(baseErrorMessage, that.baseErrorMessage);
+  }
+
+  @Override
+  public int hashCode()
+  {
+    return Objects.hash(super.hashCode(), baseErrorMessage);
+  }
+
+}
diff --git 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java
 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java
index f90f43902f..bdafa57f04 100644
--- 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java
+++ 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java
@@ -496,11 +496,8 @@ public class MSQInsertTest extends MSQTestBase
                      .setQueryContext(localContext)
                      .setExpectedExecutionErrorMatcher(CoreMatchers.allOf(
                          CoreMatchers.instanceOf(ISE.class),
-                         
ThrowableMessageMatcher.hasMessage(!FAULT_TOLERANCE.equals(contextName)
-                                                            ? 
CoreMatchers.containsString(
-                             "Encountered multi-value dimension [dim3] that 
cannot be processed with 'groupByEnableMultiValueUnnesting' set to false.")
-                                                            :
-                                                            
CoreMatchers.containsString("exceeded max relaunch count")
+                         
ThrowableMessageMatcher.hasMessage(CoreMatchers.containsString(
+                             "Column [dim3] is a multi value string. Please 
wrap the column using MV_TO_ARRAY() to proceed further.")
                          )
                      ))
                      .verifyExecutionError();
diff --git 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java
 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java
index 14cb7acbd6..6d006cb080 100644
--- 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java
+++ 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java
@@ -1275,12 +1275,8 @@ public class MSQSelectTest extends MSQTestBase
         .setQueryContext(localContext)
         .setExpectedExecutionErrorMatcher(CoreMatchers.allOf(
             CoreMatchers.instanceOf(ISE.class),
-            ThrowableMessageMatcher.hasMessage(
-                !FAULT_TOLERANCE.equals(contextName)
-                ? CoreMatchers.containsString(
-                    "Encountered multi-value dimension [dim3] that cannot be 
processed with 'groupByEnableMultiValueUnnesting' set to false.")
-                :
-                CoreMatchers.containsString("exceeded max relaunch count")
+            ThrowableMessageMatcher.hasMessage(CoreMatchers.containsString(
+                "Column [dim3] is a multi value string. Please wrap the column 
using MV_TO_ARRAY() to proceed further.")
             )
         ))
         .verifyExecutionError();
@@ -1450,11 +1446,8 @@ public class MSQSelectTest extends MSQTestBase
         .setExpectedExecutionErrorMatcher(CoreMatchers.allOf(
             CoreMatchers.instanceOf(ISE.class),
             ThrowableMessageMatcher.hasMessage(
-                !FAULT_TOLERANCE.equals(contextName)
-                ? CoreMatchers.containsString(
+                CoreMatchers.containsString(
                     "Encountered multi-value dimension [dim3] that cannot be 
processed with 'groupByEnableMultiValueUnnesting' set to false.")
-                :
-                CoreMatchers.containsString("exceeded max relaunch count")
             )
         ))
         .verifyExecutionError();
@@ -1581,7 +1574,11 @@ public class MSQSelectTest extends MSQTestBase
   @Test
   public void testMultiValueStringWithIncorrectType() throws IOException
   {
-    final File toRead = 
MSQTestFileUtils.getResourceAsTemporaryFile(temporaryFolder, this, 
"/unparseable-mv-string-array.json");
+    final File toRead = MSQTestFileUtils.getResourceAsTemporaryFile(
+        temporaryFolder,
+        this,
+        "/unparseable-mv-string-array.json"
+    );
     final String toReadAsJson = 
queryFramework().queryJsonMapper().writeValueAsString(toRead.getAbsolutePath());
 
     RowSignature rowSignature = RowSignature.builder()
diff --git 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQErrorReportTest.java
 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQErrorReportTest.java
new file mode 100644
index 0000000000..c042658505
--- /dev/null
+++ 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQErrorReportTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.msq.indexing.error;
+
+import org.apache.druid.frame.processor.FrameRowTooLargeException;
+import org.apache.druid.frame.write.UnsupportedColumnTypeException;
+import org.apache.druid.java.util.common.parsers.ParseException;
+import org.apache.druid.msq.statistics.TooManyBucketsException;
+import org.apache.druid.query.QueryTimeoutException;
+import 
org.apache.druid.query.groupby.epinephelinae.UnexpectedMultiValueDimensionException;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class MSQErrorReportTest
+{
+
+  public static final String ERROR_MESSAGE = "test";
+
+  @Test
+  public void testErrorReportFault()
+  {
+    Assert.assertEquals(UnknownFault.forException(null), 
MSQErrorReport.getFaultFromException(null));
+
+    MSQException msqException = new MSQException(null, 
UnknownFault.forMessage(ERROR_MESSAGE));
+    Assert.assertEquals(msqException.getFault(), 
MSQErrorReport.getFaultFromException(msqException));
+
+    ParseException parseException = new ParseException(null, ERROR_MESSAGE);
+    Assert.assertEquals(
+        new CannotParseExternalDataFault(ERROR_MESSAGE),
+        MSQErrorReport.getFaultFromException(parseException)
+    );
+
+    UnsupportedColumnTypeException columnTypeException = new 
UnsupportedColumnTypeException(ERROR_MESSAGE, null);
+    Assert.assertEquals(
+        new ColumnTypeNotSupportedFault(ERROR_MESSAGE, null),
+        MSQErrorReport.getFaultFromException(columnTypeException)
+    );
+
+    TooManyBucketsException tooManyBucketsException = new 
TooManyBucketsException(10);
+    Assert.assertEquals(new TooManyBucketsFault(10), 
MSQErrorReport.getFaultFromException(tooManyBucketsException));
+
+    FrameRowTooLargeException tooLargeException = new 
FrameRowTooLargeException(10);
+    Assert.assertEquals(new RowTooLargeFault(10), 
MSQErrorReport.getFaultFromException(tooLargeException));
+
+    UnexpectedMultiValueDimensionException mvException = new 
UnexpectedMultiValueDimensionException(ERROR_MESSAGE);
+    Assert.assertEquals(QueryRuntimeFault.CODE, 
MSQErrorReport.getFaultFromException(mvException).getErrorCode());
+
+    QueryTimeoutException queryException = new 
QueryTimeoutException(ERROR_MESSAGE);
+    Assert.assertEquals(
+        new QueryRuntimeFault(ERROR_MESSAGE, null),
+        MSQErrorReport.getFaultFromException(queryException)
+    );
+
+    RuntimeException runtimeException = new RuntimeException(ERROR_MESSAGE);
+    Assert.assertEquals(
+        UnknownFault.forException(runtimeException),
+        MSQErrorReport.getFaultFromException(runtimeException)
+    );
+  }
+}
diff --git 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java
 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java
index 1ad4d08b42..8342510c76 100644
--- 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java
+++ 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java
@@ -63,6 +63,8 @@ public class MSQFaultSerdeTest
     assertFaultSerde(new InvalidNullByteFault("the column"));
     assertFaultSerde(new NotEnoughMemoryFault(1000, 1000, 900, 1, 2));
     assertFaultSerde(QueryNotSupportedFault.INSTANCE);
+    assertFaultSerde(new QueryRuntimeFault("new error", "base error"));
+    assertFaultSerde(new QueryRuntimeFault("new error", null));
     assertFaultSerde(new RowTooLargeFault(1000));
     assertFaultSerde(new TaskStartTimeoutFault(10, 11));
     assertFaultSerde(new TooManyBucketsFault(10));


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to