suneet-s commented on a change in pull request #10359: URL: https://github.com/apache/druid/pull/10359#discussion_r502021664
########## File path: indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMetrics.java ########## @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.worker.shuffle; + +import com.google.common.annotations.VisibleForTesting; +import com.google.errorprone.annotations.concurrent.GuardedBy; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * Shuffle metrcis for middleManagers and indexers. This class is thread-safe because shuffle can be performed by + * multiple HTTP threads while a monitoring thread periodically emits the snapshot of metrics. + * + * @see ShuffleResource + * @see org.apache.druid.java.util.metrics.MonitorScheduler + */ +public class ShuffleMetrics +{ + /** + * This lock is used to synchronize accesses to the reference to {@link #datasourceMetrics} and the + * {@link PerDatasourceShuffleMetrics} values of the map. This means, + * + * - Any updates on PerDatasourceShuffleMetrics in the map (and thus its key as well) should be synchronized + * under this lock. + * - Any updates on the reference to datasourceMetrics should be synchronized under this lock. 
+ */ + private final Object lock = new Object(); + + /** + * A map of (datasource name) -> {@link PerDatasourceShuffleMetrics}. This map is replaced with an empty map + * whenever a snapshot is taken since the map can keep growing over time otherwise. For concurrent access pattern, + * see {@link #shuffleRequested} and {@link #snapshotAndReset()}. + */ + @GuardedBy("lock") Review comment: Ah - that makes sense. Thanks for the explanation ########## File path: indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMetrics.java ########## @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.worker.shuffle; + +import com.google.common.annotations.VisibleForTesting; +import com.google.errorprone.annotations.concurrent.GuardedBy; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * Shuffle metrcis for middleManagers and indexers. This class is thread-safe because shuffle can be performed by + * multiple HTTP threads while a monitoring thread periodically emits the snapshot of metrics. 
+ * + * @see ShuffleResource + * @see org.apache.druid.java.util.metrics.MonitorScheduler + */ +public class ShuffleMetrics +{ + /** + * This lock is used to synchronize accesses to the reference to {@link #datasourceMetrics} and the + * {@link PerDatasourceShuffleMetrics} values of the map. This means, + * + * - Any updates on PerDatasourceShuffleMetrics in the map (and thus its key as well) should be synchronized + * under this lock. + * - Any updates on the reference to datasourceMetrics should be synchronized under this lock. + */ + private final Object lock = new Object(); + + /** + * A map of (datasource name) -> {@link PerDatasourceShuffleMetrics}. This map is replaced with an empty map + * whenever a snapshot is taken since the map can keep growing over time otherwise. For concurrent access pattern, + * see {@link #shuffleRequested} and {@link #snapshotAndReset()}. + */ + @GuardedBy("lock") + private Map<String, PerDatasourceShuffleMetrics> datasourceMetrics = new HashMap<>(); + + /** + * This method is called whenever a new shuffle is requested. Multiple tasks can request shuffle at the same time, + * while the monitoring thread takes a snapshot of the metrics. There is a happens-before relationship between + * shuffleRequested and {@link #snapshotAndReset()}. + */ + public void shuffleRequested(String supervisorTaskId, long fileLength) + { + synchronized (lock) { Review comment: I like this approach a lot 🤘 ########## File path: indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleModule.java ########## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.worker.shuffle; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Provides; +import org.apache.druid.guice.Jerseys; +import org.apache.druid.guice.LazySingleton; +import org.apache.druid.java.util.metrics.MonitorScheduler; + +import java.util.Optional; + +public class ShuffleModule implements Module +{ + @Override + public void configure(Binder binder) + { + Jerseys.addResource(binder, ShuffleResource.class); + } + + /** + * {@link ShuffleMetrics} is used in {@link ShuffleResource} and {@link ShuffleMonitor} to collect metrics + * and report them, respectively. Unlike ShuffleResource, ShuffleMonitor can be created via a user config + * ({@link org.apache.druid.server.metrics.MonitorsConfig}) in potentially any node types, where it is not + * possible to create ShuffleMetrics. This method checks the {@link MonitorScheduler} if ShuffleMonitor is + * registered on it, and sets the proper ShuffleMetrics. + */ + @Provides + @LazySingleton + public Optional<ShuffleMetrics> getShuffleMetrics(MonitorScheduler monitorScheduler) + { + // ShuffleMonitor cannot be registered dynamically, but can only via the static configuration (MonitorsConfig). + // As a result, it is safe to check only one time if it is registered in MonitorScheduler. + final Optional<ShuffleMonitor> maybeMonitor = monitorScheduler.findMonitor(ShuffleMonitor.class); Review comment: I see that `MonitorScheduler` has a `removeMonitor` method, and ShuffleMetrics is provided as a Singleton. 
Can someone remove the ShuffleMonitor while Druid is running? If they do that, how would it impact ShuffleMetrics being reported? ########## File path: indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleModule.java ########## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.worker.shuffle; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Provides; +import org.apache.druid.guice.Jerseys; +import org.apache.druid.guice.LazySingleton; +import org.apache.druid.java.util.metrics.MonitorScheduler; + +import java.util.Optional; + +public class ShuffleModule implements Module +{ + @Override + public void configure(Binder binder) + { + Jerseys.addResource(binder, ShuffleResource.class); Review comment: Can you add a ModuleTest that validates that `ShuffleResource` and `Optional<ShuffleMetrics>` are injectable? 
I think I've written `AuthorizerMapperModuleTest` that would be a similar example ########## File path: indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMonitor.java ########## @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.worker.shuffle; + +import org.apache.druid.indexing.worker.shuffle.ShuffleMetrics.PerDatasourceShuffleMetrics; +import org.apache.druid.java.util.emitter.service.ServiceEmitter; +import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; +import org.apache.druid.java.util.emitter.service.ServiceMetricEvent.Builder; +import org.apache.druid.java.util.metrics.AbstractMonitor; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.checkerframework.checker.nullness.qual.Nullable; + +import java.util.Map; + +public class ShuffleMonitor extends AbstractMonitor +{ + private static final String SUPERVISOR_TASK_ID_DIMENSION = "supervisorTaskId"; + private static final String SHUFFLE_BYTES_KEY = "ingest/shuffle/bytes"; + private static final String SHUFFLE_REQUESTS_KEY = "ingest/shuffle/requests"; + + /** + * ShuffleMonitor can be instantiated in any node types if it is defined in + * {@link org.apache.druid.server.metrics.MonitorsConfig}. Since {@link ShuffleMetrics} is defined + * in the `indexing-service` module, some node types (such as broker) would fail to create it + * if they don't have required dependencies. To avoid this problem, this variable is lazily initialized + * only in the node types which has the {@link ShuffleModule}. 
+ */ + @MonotonicNonNull + private ShuffleMetrics shuffleMetrics; + + public void setShuffleMetrics(ShuffleMetrics shuffleMetrics) + { + this.shuffleMetrics = shuffleMetrics; + } + + @Nullable + public ShuffleMetrics getShuffleMetrics() + { + return shuffleMetrics; + } + + @Override + public boolean doMonitor(ServiceEmitter emitter) + { + if (shuffleMetrics != null) { + final Map<String, PerDatasourceShuffleMetrics> snapshot = shuffleMetrics.snapshotAndReset(); + snapshot.forEach((supervisorTaskId, perDatasourceShuffleMetrics) -> { + final Builder metricBuilder = ServiceMetricEvent + .builder() + .setDimension(SUPERVISOR_TASK_ID_DIMENSION, supervisorTaskId); + emitter.emit(metricBuilder.build(SHUFFLE_BYTES_KEY, perDatasourceShuffleMetrics.getShuffleBytes())); + emitter.emit(metricBuilder.build(SHUFFLE_REQUESTS_KEY, perDatasourceShuffleMetrics.getShuffleRequests())); + }); + } + return true; Review comment: Should we add unit tests for this function? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
