m-trieu commented on code in PR #31133:
URL: https://github.com/apache/beam/pull/31133#discussion_r1591858198


##########
runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCache.java:
##########
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.streaming;
+
+import static java.util.stream.Collectors.toConcurrentMap;
+import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList;
+
+import com.google.api.services.dataflow.model.MapTask;
+import java.io.PrintWriter;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutionException;
+import java.util.function.Function;
+import javax.annotation.concurrent.ThreadSafe;
+import org.apache.beam.runners.dataflow.worker.apiary.FixMultiOutputInfosOnParDoInstructions;
+import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider;
+import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig;
+import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor;
+import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache;
+import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget;
+import org.apache.beam.sdk.annotations.Internal;
+import org.apache.beam.sdk.fn.IdGenerator;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheBuilder;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheLoader;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.LoadingCache;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Cache of {@link String} computationId to {@link ComputationState}. */
+@Internal
+@ThreadSafe
+public final class ComputationStateCache implements StatusDataProvider {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ComputationStateCache.class);
+
+  private final LoadingCache<String, ComputationState> computationCache;
+
+  /**
+   * Fix up MapTask representation because MultiOutputInfos are missing from system generated
+   * ParDoInstructions.
+   */
+  private final Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions;
+
+  private ComputationStateCache(
+      LoadingCache<String, ComputationState> computationCache,
+      Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions) {
+    this.computationCache = computationCache;
+    this.fixMultiOutputInfosOnParDoInstructions = fixMultiOutputInfosOnParDoInstructions;
+  }
+
+  public static ComputationStateCache create(
+      ComputationConfig.Fetcher computationConfigFetcher,
+      BoundedQueueExecutor workUnitExecutor,
+      Function<String, WindmillStateCache.ForComputation> perComputationStateCacheViewFactory,
+      IdGenerator idGenerator) {
+    Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions =
+        new FixMultiOutputInfosOnParDoInstructions(idGenerator);
+    ConcurrentMap<String, String> stateNameMap = new ConcurrentHashMap<>();
+    return new ComputationStateCache(
+        CacheBuilder.newBuilder()
+            .build(
+                newComputationStateCacheLoader(
+                    computationConfigFetcher,
+                    workUnitExecutor,
+                    perComputationStateCacheViewFactory,
+                    fixMultiOutputInfosOnParDoInstructions,
+                    stateNameMap)),
+        fixMultiOutputInfosOnParDoInstructions);
+  }
+
+  @VisibleForTesting
+  public static ComputationStateCache forTesting(
+      ComputationConfig.Fetcher computationConfigFetcher,
+      BoundedQueueExecutor workUnitExecutor,
+      Function<String, WindmillStateCache.ForComputation> perComputationStateCacheViewFactory,
+      IdGenerator idGenerator,
+      ConcurrentMap<String, String> stateNameMap) {
+    Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions =
+        new FixMultiOutputInfosOnParDoInstructions(idGenerator);
+    return new ComputationStateCache(
+        CacheBuilder.newBuilder()
+            .build(
+                newComputationStateCacheLoader(
+                    computationConfigFetcher,
+                    workUnitExecutor,
+                    perComputationStateCacheViewFactory,
+                    fixMultiOutputInfosOnParDoInstructions,
+                    stateNameMap)),
+        fixMultiOutputInfosOnParDoInstructions);
+  }
+
+  private static CacheLoader<String, ComputationState> newComputationStateCacheLoader(
+      ComputationConfig.Fetcher computationConfigFetcher,
+      BoundedQueueExecutor workUnitExecutor,
+      Function<String, WindmillStateCache.ForComputation> perComputationStateCacheViewFactory,
+      Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions,
+      ConcurrentMap<String, String> stateNameMap) {
+    return new CacheLoader<String, ComputationState>() {
+      @Override
+      public ComputationState load(String computationId) {
+        // LoadingCache load(K key) will throw an exception if we return null here,
+        // throw ComputationStateNotFoundException to represent semantics better.
+        ComputationConfig computationConfig =
+            computationConfigFetcher
+                .getConfig(computationId)
+                .orElseThrow(() -> new ComputationStateNotFoundException(computationId));
+        stateNameMap.putAll(computationConfig.stateNameMap());
+        return new ComputationState(
+            computationId,
+            fixMultiOutputInfosOnParDoInstructions.apply(computationConfig.mapTask()),
+            workUnitExecutor,
+            !computationConfig.userTransformToStateFamilyName().isEmpty()
+                ? computationConfig.userTransformToStateFamilyName()
+                : stateNameMap,
+            perComputationStateCacheViewFactory.apply(computationId));
+      }
+    };
+  }
+
+  /**
+   * Returns the {@link ComputationState} associated with the given computationId. May perform IO if
+   * a value is not present, and it is possible that after IO is performed there is no value
+   * correlated with that computationId.
+   */
+  public Optional<ComputationState> get(String computationId) {
+    try {
+      return Optional.ofNullable(computationCache.get(computationId));
+    } catch (ExecutionException | ComputationStateNotFoundException e) {
+      if (e.getCause() instanceof ComputationStateNotFoundException) {
+        LOG.error(
+            "Trying to fetch unknown computation={}, known computations are 
{}.",
+            computationId,
+            ImmutableSet.copyOf(computationCache.asMap().keySet()));
+      } else {
+        LOG.warn("Error occurred fetching computation for computationId={}", 
computationId, e);
+      }
+    }
+
+    return Optional.empty();
+  }
+
+  public Optional<ComputationState> getIfPresent(String computationId) {
+    return Optional.ofNullable(computationCache.getIfPresent(computationId));
+  }
+
+  /** Returns a read-only view of all computations. */
+  public ImmutableList<ComputationState> getAllComputations() {

Review Comment:
   done
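
A note on the exception handling in the excerpt above: Guava's LoadingCache rejects a null result from its loader, so the loader signals a missing computation by throwing, and a checked exception thrown inside load(...) reaches callers of cache.get(...) wrapped in ExecutionException, which is why get(...) inspects e.getCause(). Below is a minimal, self-contained sketch of that wrapping behavior; it uses plain Guava rather than Beam's vendored copy, and NotFoundException, the string-valued cache, and the key are invented for illustration only.

    import com.google.common.cache.CacheBuilder;
    import com.google.common.cache.CacheLoader;
    import com.google.common.cache.LoadingCache;
    import java.util.Optional;
    import java.util.concurrent.ExecutionException;

    public final class LoadingCacheExceptionSketch {

      // Hypothetical checked exception used to demonstrate ExecutionException wrapping.
      static final class NotFoundException extends Exception {
        NotFoundException(String key) {
          super("no value for " + key);
        }
      }

      public static void main(String[] args) {
        LoadingCache<String, String> cache =
            CacheBuilder.newBuilder()
                .build(
                    new CacheLoader<String, String>() {
                      @Override
                      public String load(String key) throws NotFoundException {
                        // A CacheLoader may not return null, so a missing value is
                        // reported by throwing instead.
                        throw new NotFoundException(key);
                      }
                    });

        Optional<String> value;
        try {
          value = Optional.ofNullable(cache.get("unknownKey"));
        } catch (ExecutionException e) {
          // The checked exception thrown by load(...) surfaces here with the original
          // exception as the cause, so callers inspect getCause() to classify the failure.
          System.out.println("load failed, cause: " + e.getCause().getClass().getSimpleName());
          value = Optional.empty();
        }
        System.out.println("value present: " + value.isPresent());
      }
    }
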



##########
runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCache.java:
##########
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.streaming;
+
+import static java.util.stream.Collectors.toConcurrentMap;
+import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList;
+
+import com.google.api.services.dataflow.model.MapTask;
+import java.io.PrintWriter;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutionException;
+import java.util.function.Function;
+import javax.annotation.concurrent.ThreadSafe;
+import org.apache.beam.runners.dataflow.worker.apiary.FixMultiOutputInfosOnParDoInstructions;
+import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider;
+import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig;
+import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor;
+import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache;
+import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget;
+import org.apache.beam.sdk.annotations.Internal;
+import org.apache.beam.sdk.fn.IdGenerator;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheBuilder;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheLoader;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.LoadingCache;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Cache of {@link String} computationId to {@link ComputationState}. */
+@Internal
+@ThreadSafe
+public final class ComputationStateCache implements StatusDataProvider {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ComputationStateCache.class);
+
+  private final LoadingCache<String, ComputationState> computationCache;
+
+  /**
+   * Fix up MapTask representation because MultiOutputInfos are missing from system generated
+   * ParDoInstructions.
+   */
+  private final Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions;
+
+  private ComputationStateCache(
+      LoadingCache<String, ComputationState> computationCache,
+      Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions) {
+    this.computationCache = computationCache;
+    this.fixMultiOutputInfosOnParDoInstructions = fixMultiOutputInfosOnParDoInstructions;
+  }
+
+  public static ComputationStateCache create(
+      ComputationConfig.Fetcher computationConfigFetcher,
+      BoundedQueueExecutor workUnitExecutor,
+      Function<String, WindmillStateCache.ForComputation> perComputationStateCacheViewFactory,
+      IdGenerator idGenerator) {
+    Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions =
+        new FixMultiOutputInfosOnParDoInstructions(idGenerator);
+    ConcurrentMap<String, String> stateNameMap = new ConcurrentHashMap<>();
+    return new ComputationStateCache(
+        CacheBuilder.newBuilder()
+            .build(
+                newComputationStateCacheLoader(
+                    computationConfigFetcher,
+                    workUnitExecutor,
+                    perComputationStateCacheViewFactory,
+                    fixMultiOutputInfosOnParDoInstructions,
+                    stateNameMap)),
+        fixMultiOutputInfosOnParDoInstructions);
+  }
+
+  @VisibleForTesting
+  public static ComputationStateCache forTesting(
+      ComputationConfig.Fetcher computationConfigFetcher,
+      BoundedQueueExecutor workUnitExecutor,
+      Function<String, WindmillStateCache.ForComputation> perComputationStateCacheViewFactory,
+      IdGenerator idGenerator,
+      ConcurrentMap<String, String> stateNameMap) {
+    Function<MapTask, MapTask> fixMultiOutputInfosOnParDoInstructions =
+        new FixMultiOutputInfosOnParDoInstructions(idGenerator);
+    return new ComputationStateCache(

Review Comment:
   done
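
For orientation, a small caller-side sketch of the two Optional-returning lookups this class exposes; only ComputationStateCache.get(...) and getIfPresent(...) come from the PR, while the wrapper class and its method names are invented for illustration.

    import java.util.Optional;
    import org.apache.beam.runners.dataflow.worker.streaming.ComputationState;
    import org.apache.beam.runners.dataflow.worker.streaming.ComputationStateCache;

    final class ComputationLookupSketch {
      private ComputationLookupSketch() {}

      /** Looks the computation up, allowing the cache to fetch config on a miss. */
      static Optional<ComputationState> resolve(ComputationStateCache cache, String computationId) {
        // get(...) may perform IO (a config fetch) on a cache miss and can still come
        // back empty if the computation is unknown after the fetch.
        return cache.get(computationId);
      }

      /** Checks only what is already cached; never triggers a config fetch. */
      static boolean isAlreadyCached(ComputationStateCache cache, String computationId) {
        return cache.getIfPresent(computationId).isPresent();
      }
    }

Returning Optional rather than null keeps the "unknown computation" case explicit at call sites.
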



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
