kfaraz commented on code in PR #18510:
URL: https://github.com/apache/druid/pull/18510#discussion_r2373935603
##########
indexing-service/src/main/java/org/apache/druid/indexing/common/task/NoopTask.java:
##########
@@ -29,33 +29,38 @@
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.server.security.ResourceAction;
import javax.annotation.Nonnull;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
/**
*/
public class NoopTask extends AbstractTask implements
PendingSegmentAllocatingTask
{
public static final String TYPE = "noop";
+ public static final String NOOP_TASK_EVENT_STARTED = "noopTask-started";
private static final int DEFAULT_RUN_TIME = 2500;
@JsonIgnore
private final long runTime;
+ @JsonIgnore
+ private final AtomicBoolean aborted = new AtomicBoolean();
+
@JsonCreator
public NoopTask(
@JsonProperty("id") String id,
@JsonProperty("groupId") String groupId,
@JsonProperty("dataSource") String dataSource,
@JsonProperty("runTime") long runTimeMillis,
@JsonProperty("isReadyTime") long isReadyTime,
- @JsonProperty("context") Map<String, Object> context
- )
+ @JsonProperty("context") Map<String, Object> context)
Review Comment:
   Nit: not needed. The older style was cleaner.
##########
indexing-service/src/main/java/org/apache/druid/indexing/common/task/NoopTask.java:
##########
@@ -29,33 +29,38 @@
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.server.security.ResourceAction;
import javax.annotation.Nonnull;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
/**
*/
public class NoopTask extends AbstractTask implements
PendingSegmentAllocatingTask
{
public static final String TYPE = "noop";
+ public static final String NOOP_TASK_EVENT_STARTED = "noopTask-started";
private static final int DEFAULT_RUN_TIME = 2500;
@JsonIgnore
private final long runTime;
+ @JsonIgnore
Review Comment:
   Nit: I think this annotation is not required. A private field with no
   getter and no explicit `@JsonProperty` is skipped from serialization by
   default, so `@JsonIgnore` is redundant here.
##########
indexing-service/src/main/java/org/apache/druid/indexing/common/task/NoopTask.java:
##########
@@ -29,33 +29,38 @@
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.server.security.ResourceAction;
import javax.annotation.Nonnull;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
/**
*/
public class NoopTask extends AbstractTask implements
PendingSegmentAllocatingTask
{
public static final String TYPE = "noop";
+ public static final String NOOP_TASK_EVENT_STARTED = "noopTask-started";
Review Comment:
Nit:
```suggestion
public static final String EVENT_STARTED = "task/noop/started";
```
##########
indexing-service/src/main/java/org/apache/druid/indexing/common/task/NoopTask.java:
##########
@@ -97,13 +102,22 @@ public boolean isReady(TaskActionClient taskActionClient)
@Override
public void stopGracefully(TaskConfig taskConfig)
{
+ aborted.set(true);
}
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
{
- Thread.sleep(runTime);
- return TaskStatus.success(getId());
+
toolbox.getEmitter().emit(ServiceMetricEvent.builder().setMetric(NOOP_TASK_EVENT_STARTED,
1));
+ long endTime = System.currentTimeMillis() + runTime;
+ while (endTime > System.currentTimeMillis() && !aborted.get()) {
+ Thread.sleep(100);
+ }
Review Comment:
   How about we use a `countDownLatch.await(runTime, TimeUnit.MILLISECONDS)`
   instead?
   The latch will be counted down in `stopGracefully()`.
   So we will wait on the latch until stop is requested or until the runTime
   elapses, whichever comes first.
##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/server/HttpRemoteTaskRunnerWorkerFailTest.java:
##########
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.server;
+
+import org.apache.druid.client.indexing.TaskStatusResponse;
+import org.apache.druid.common.utils.IdUtils;
+import org.apache.druid.indexer.TaskState;
+import org.apache.druid.indexing.common.task.NoopTask;
+import org.apache.druid.indexing.overlord.hrtr.HttpRemoteTaskRunner;
+import org.apache.druid.query.DruidMetrics;
+import org.apache.druid.segment.TestDataSource;
+import org.apache.druid.testing.embedded.EmbeddedBroker;
+import org.apache.druid.testing.embedded.EmbeddedCoordinator;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.EmbeddedIndexer;
+import org.apache.druid.testing.embedded.EmbeddedOverlord;
+import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class HttpRemoteTaskRunnerWorkerFailTest extends EmbeddedClusterTestBase
+{
+ private final EmbeddedOverlord overlord = new EmbeddedOverlord();
+ private final EmbeddedIndexer indexer = new
EmbeddedIndexer().addProperty("druid.worker.capacity", "3");
Review Comment:
Probably not needed for this test since we are going to run a single task.
```suggestion
private final EmbeddedIndexer indexer = new EmbeddedIndexer();
```
##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/IndexTaskTest.java:
##########
@@ -99,6 +99,9 @@ public void test_runIndexTask_forInlineDatasource()
}
cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator,
broker);
+ broker.latchableEmitter().waitForMetricEvent(
+ event -> event.hasDimension(DruidMetrics.DATASOURCE, dataSource)
+ );
Review Comment:
   Is this needed? The call on the preceding line
   (`waitForAllSegmentsToBeAvailable`) should already be taking care of
   segment availability.
##########
indexing-service/src/main/java/org/apache/druid/indexing/common/task/NoopTask.java:
##########
@@ -97,13 +102,22 @@ public boolean isReady(TaskActionClient taskActionClient)
@Override
public void stopGracefully(TaskConfig taskConfig)
{
+ aborted.set(true);
}
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
{
- Thread.sleep(runTime);
- return TaskStatus.success(getId());
+
toolbox.getEmitter().emit(ServiceMetricEvent.builder().setMetric(NOOP_TASK_EVENT_STARTED,
1));
+ long endTime = System.currentTimeMillis() + runTime;
+ while (endTime > System.currentTimeMillis() && !aborted.get()) {
Review Comment:
Just curious as to why we needed to maintain the `aborted` field.
When we stop the `Indexer`, it should ideally interrupt all its tasks, thus
triggering a `task/run/time` `FAILED` event.
Was that not happening?
##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/server/HttpRemoteTaskRunnerWorkerFailTest.java:
##########
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.server;
+
+import org.apache.druid.client.indexing.TaskStatusResponse;
+import org.apache.druid.common.utils.IdUtils;
+import org.apache.druid.indexer.TaskState;
+import org.apache.druid.indexing.common.task.NoopTask;
+import org.apache.druid.indexing.overlord.hrtr.HttpRemoteTaskRunner;
+import org.apache.druid.query.DruidMetrics;
+import org.apache.druid.segment.TestDataSource;
+import org.apache.druid.testing.embedded.EmbeddedBroker;
+import org.apache.druid.testing.embedded.EmbeddedCoordinator;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.EmbeddedIndexer;
+import org.apache.druid.testing.embedded.EmbeddedOverlord;
+import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class HttpRemoteTaskRunnerWorkerFailTest extends EmbeddedClusterTestBase
+{
+ private final EmbeddedOverlord overlord = new EmbeddedOverlord();
+ private final EmbeddedIndexer indexer = new
EmbeddedIndexer().addProperty("druid.worker.capacity", "3");
+
+ @Override
+ public EmbeddedDruidCluster createCluster()
+ {
+ return EmbeddedDruidCluster.withEmbeddedDerbyAndZookeeper()
+ .useLatchableEmitter()
+ .addServer(new EmbeddedCoordinator())
+ .addServer(new EmbeddedBroker())
+ .addServer(overlord)
+ .addServer(indexer);
+ }
+
+ @Test
+ public void test_overlord_marksTaskAsFailed_ifIndexerCrashes() throws
Exception
+ {
+ final String taskId = IdUtils.newTaskId("sim_test_noop",
TestDataSource.WIKI, null);
+ cluster.callApi().onLeaderOverlord(
+ o -> {
+ return o.runTask(taskId, new NoopTask(taskId, null, null, 8000L, 0L,
null));
+ }
+ );
+    // wait for the overlord to dispatch the task and the worker to start it
+ indexer.latchableEmitter().waitForMetricEvent(
+ event -> event.hasMetricName(NoopTask.NOOP_TASK_EVENT_STARTED),
+ 1000
Review Comment:
Any specific reason to not use the default timeout (10s)?
Lower timeouts may cause flakiness in the tests.
##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/server/HttpRemoteTaskRunnerWorkerFailTest.java:
##########
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.server;
+
+import org.apache.druid.client.indexing.TaskStatusResponse;
+import org.apache.druid.common.utils.IdUtils;
+import org.apache.druid.indexer.TaskState;
+import org.apache.druid.indexing.common.task.NoopTask;
+import org.apache.druid.indexing.overlord.hrtr.HttpRemoteTaskRunner;
+import org.apache.druid.query.DruidMetrics;
+import org.apache.druid.segment.TestDataSource;
+import org.apache.druid.testing.embedded.EmbeddedBroker;
+import org.apache.druid.testing.embedded.EmbeddedCoordinator;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.EmbeddedIndexer;
+import org.apache.druid.testing.embedded.EmbeddedOverlord;
+import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class HttpRemoteTaskRunnerWorkerFailTest extends EmbeddedClusterTestBase
+{
+ private final EmbeddedOverlord overlord = new EmbeddedOverlord();
+ private final EmbeddedIndexer indexer = new
EmbeddedIndexer().addProperty("druid.worker.capacity", "3");
+
+ @Override
+ public EmbeddedDruidCluster createCluster()
+ {
+ return EmbeddedDruidCluster.withEmbeddedDerbyAndZookeeper()
+ .useLatchableEmitter()
+ .addServer(new EmbeddedCoordinator())
+ .addServer(new EmbeddedBroker())
+ .addServer(overlord)
+ .addServer(indexer);
+ }
+
+ @Test
+ public void test_overlord_marksTaskAsFailed_ifIndexerCrashes() throws
Exception
+ {
+ final String taskId = IdUtils.newTaskId("sim_test_noop",
TestDataSource.WIKI, null);
+ cluster.callApi().onLeaderOverlord(
+ o -> {
+ return o.runTask(taskId, new NoopTask(taskId, null, null, 8000L, 0L,
null));
+ }
Review Comment:
Nit: simpler lambda
```suggestion
o -> o.runTask(taskId, new NoopTask(taskId, null, null, 8000L, 0L,
null))
```
##########
indexing-service/src/main/java/org/apache/druid/indexing/common/task/NoopTask.java:
##########
@@ -97,13 +102,22 @@ public boolean isReady(TaskActionClient taskActionClient)
@Override
public void stopGracefully(TaskConfig taskConfig)
{
+ aborted.set(true);
}
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
{
- Thread.sleep(runTime);
- return TaskStatus.success(getId());
+
toolbox.getEmitter().emit(ServiceMetricEvent.builder().setMetric(NOOP_TASK_EVENT_STARTED,
1));
Review Comment:
Shorthand:
```suggestion
emitMetric(toolbox.getEmitter(), NOOP_TASK_EVENT_STARTED, 1);
```
##########
indexing-service/src/main/java/org/apache/druid/indexing/overlord/hrtr/HttpRemoteTaskRunner.java:
##########
@@ -1543,6 +1550,14 @@ public void taskAddedOrUpdated(final TaskAnnouncement
announcement, final Worker
HttpRemoteTaskRunnerWorkItem.State.RUNNING
);
tasks.put(taskId, taskItem);
+ final ServiceMetricEvent.Builder metricBuilder = new
ServiceMetricEvent.Builder();
+ metricBuilder.setDimension(DruidMetrics.TASK_ID, taskId);
+ emitter.emit(
+ metricBuilder.setMetric(
+ TASK_UNKNOWN_COUNT,
+ (long) 1
+ )
Review Comment:
Nit: can be done in a single line.
##########
indexing-service/src/main/java/org/apache/druid/indexing/overlord/hrtr/HttpRemoteTaskRunner.java:
##########
@@ -611,17 +612,18 @@ void addWorker(final Worker worker)
// tasks that we think are running on this worker. Provide that
information to WorkerHolder that
// manages the task syncing with that worker.
for (Map.Entry<String, HttpRemoteTaskRunnerWorkItem> e :
tasks.entrySet()) {
- if (e.getValue().getState() ==
HttpRemoteTaskRunnerWorkItem.State.RUNNING) {
- Worker w = e.getValue().getWorker();
- if (w != null && w.getHost().equals(worker.getHost()) &&
e.getValue().getTask() != null) {
- expectedAnnouncements.add(
- TaskAnnouncement.create(
- e.getValue().getTask(),
- TaskStatus.running(e.getKey()),
- e.getValue().getLocation()
- )
- );
- }
+ HttpRemoteTaskRunnerWorkItem workItem = e.getValue();
+ if (workItem.isRunningOnWorker(worker)) {
+ expectedAnnouncements.add(
+ TaskAnnouncement.create(
+ workItem.getTaskId(),
+ workItem.getTaskType(),
+ null,
Review Comment:
No, I just meant the one on line 620.
Resolving this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]