Pochatkin commented on code in PR #3104: URL: https://github.com/apache/ignite-3/pull/3104#discussion_r1471130288
########## modules/client-handler/src/main/java/org/apache/ignite/client/handler/requests/compute/ClientComputeExecuteRequest.java: ########## @@ -50,29 +56,82 @@ public class ClientComputeExecuteRequest { public static CompletableFuture<Void> process( ClientMessageUnpacker in, ClientMessagePacker out, - IgniteCompute compute, + IgniteComputeInternal compute, ClusterService cluster, - NotificationSender notificationSender) { - var nodeName = in.tryUnpackNil() ? null : in.unpackString(); - - var node = nodeName == null - ? cluster.topologyService().localMember() - : cluster.topologyService().getByConsistentId(nodeName); - - if (node == null) { - throw new IgniteException("Specified node is not present in the cluster: " + nodeName); - } + NotificationSender notificationSender + ) { + Set<ClusterNode> candidates = unpackCandidateNodes(in, cluster); List<DeploymentUnit> deploymentUnits = unpackDeploymentUnits(in); String jobClassName = in.unpackString(); JobExecutionOptions options = JobExecutionOptions.builder().priority(in.unpackInt()).maxRetries(in.unpackInt()).build(); Object[] args = unpackArgs(in); - JobExecution<Object> execution = compute.executeAsync(Set.of(node), deploymentUnits, jobClassName, options, args); + ClusterNode localNode = cluster.topologyService().localMember(); + ClusterNode targetNode = selectTargetNode(candidates, localNode); + candidates.remove(targetNode); + + JobExecution<Object> execution = compute.executeAsyncWithFailover(targetNode, candidates, + deploymentUnits, jobClassName, options, args); sendResultAndStatus(execution, notificationSender); return execution.idAsync().thenAccept(out::packUuid); } + private static Set<ClusterNode> unpackCandidateNodes(ClientMessageUnpacker in, ClusterService cluster) { + Set<String> candidateIds = Arrays.stream(in.unpackObjectArrayFromBinaryTuple()) + .map(String.class::cast) + .collect(Collectors.toSet()); + + Set<ClusterNode> candidates = new HashSet<>(); + List<String> notFoundNodes = new 
ArrayList<>(); + candidateIds.forEach(id -> { + ClusterNode node = cluster.topologyService().getByConsistentId(id); + if (node == null) { + notFoundNodes.add(id); + } else { + candidates.add(node); + } + }); + + if (!notFoundNodes.isEmpty()) { + throw new IgniteException("Specified nodes are not present in the cluster: " + notFoundNodes); + } + return candidates; + } + + /** + * Selects a random node from the set of candidates, preferably not a local node. Review Comment: >If the coordinator fails, the job will fail for the client anyway (due to connection loss). And the client will retry it. This is not the same as a failover mechanism; it is a client retry mechanism. Let me explain the difference here: 1. Failover is an internal layer, and we keep the same ID for the Compute Job when a worker node dies. The retry mechanism doesn't support this. 2. The same logic applies to startTime. 3. If the priority of the job was changed before the failure, then a retry from the client will submit the job with the old priority, because the client is not aware of the priority change. etc. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@ignite.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org