[ https://issues.apache.org/jira/browse/STORM-1279?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15189549#comment-15189549 ]

ASF GitHub Bot commented on STORM-1279:
---------------------------------------

Github user revans2 commented on a diff in the pull request:

    https://github.com/apache/storm/pull/1184#discussion_r55712301
  
    --- Diff: storm-core/src/jvm/org/apache/storm/daemon/supervisor/SyncProcessEvent.java ---
    @@ -0,0 +1,669 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.storm.daemon.supervisor;
    +
    +import org.apache.commons.io.FileUtils;
    +import org.apache.commons.lang.StringUtils;
    +import org.apache.storm.Config;
    +import org.apache.storm.container.cgroup.CgroupManager;
    +import org.apache.storm.generated.ExecutorInfo;
    +import org.apache.storm.generated.LSWorkerHeartbeat;
    +import org.apache.storm.generated.LocalAssignment;
    +import org.apache.storm.generated.WorkerResources;
    +import org.apache.storm.utils.ConfigUtils;
    +import org.apache.storm.utils.LocalState;
    +import org.apache.storm.utils.Time;
    +import org.apache.storm.utils.Utils;
    +import org.eclipse.jetty.util.ConcurrentHashSet;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
    +import org.yaml.snakeyaml.Yaml;
    +
    +import java.io.File;
    +import java.io.FileWriter;
    +import java.io.IOException;
    +import java.util.*;
    +
    +/**
    + * 1. to kill are those in allocated that are dead or disallowed
    + * 2. kill the ones that should be dead - read pids, kill -9 and individually remove file - rmr heartbeat dir, rmdir pid dir, rmdir id dir (catch exception and log)
    + * 3. of the rest, figure out what assignments aren't yet satisfied
    + * 4. generate new worker ids, write new "approved workers" to LS
    + * 5. create local dir for worker id
    + * 6. launch new workers (give worker-id, port, and supervisor-id)
    + * 7. wait for workers to launch
    + */
    +public class SyncProcessEvent extends ShutdownWork implements Runnable {
    +
    +    private static final Logger LOG = LoggerFactory.getLogger(SyncProcessEvent.class);
    +
    +    private LocalState localState;
    +
    +    private SupervisorData supervisorData;
    +
    +    private class ProcessExitCallback implements Utils.ExitCodeCallable {
    +        private final String logPrefix;
    +        private final String workerId;
    +
    +        public ProcessExitCallback(String logPrefix, String workerId) {
    +            this.logPrefix = logPrefix;
    +            this.workerId = workerId;
    +        }
    +
    +        @Override
    +        public Object call() throws Exception {
    +            return null;
    +        }
    +
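    +        // Invoked with the worker's exit code when the process dies; marking
    +        // the worker dead lets the next sync pass shut it down and clean up.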
    +        @Override
    +        public Object call(int exitCode) {
    +            LOG.info("{} exited with code: {}", logPrefix, exitCode);
    +            supervisorData.getDeadWorkers().add(workerId);
    +            return null;
    +        }
    +    }
    +
    +    public SyncProcessEvent() {
    +    }
    +
    +    public SyncProcessEvent(SupervisorData supervisorData) {
    +        init(supervisorData);
    +    }
    +
    +    // TODO: init() exists only for the local supervisor; remove it after porting worker.clj to Java
    +    public void init(SupervisorData supervisorData){
    +        this.supervisorData = supervisorData;
    +        this.localState = supervisorData.getLocalState();
    +    }
    +
    +
    +    /**
    +     * 1. to kill are those in allocated that are dead or disallowed
    +     * 2. kill the ones that should be dead - read pids, kill -9 and individually remove file - rmr heartbeat dir, rmdir pid dir, rmdir id dir (catch exception and log)
    +     * 3. of the rest, figure out what assignments aren't yet satisfied
    +     * 4. generate new worker ids, write new "approved workers" to LS
    +     * 5. create local dir for worker id
    +     * 6. launch new workers (give worker-id, port, and supervisor-id)
    +     * 7. wait for workers to launch
    +     */
    +    @Override
    +    public void run() {
    +        LOG.debug("Syncing processes");
    +        try {
    +            Map conf = supervisorData.getConf();
    +            Map<Integer, LocalAssignment> assignedExecutors = localState.getLocalAssignmentsMap();
    +
    +            if (assignedExecutors == null) {
    +                assignedExecutors = new HashMap<>();
    +            }
    +            int now = Time.currentTimeSecs();
    +
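    +            // Compute each worker's state from its heartbeat; only valid workers keep their ports.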
    +            Map<String, StateHeartbeat> localWorkerStats = getLocalWorkerStats(supervisorData, assignedExecutors, now);
    +
    +            Set<String> keeperWorkerIds = new HashSet<>();
    +            Set<Integer> keepPorts = new HashSet<>();
    +            for (Map.Entry<String, StateHeartbeat> entry : localWorkerStats.entrySet()) {
    +                StateHeartbeat stateHeartbeat = entry.getValue();
    +                if (stateHeartbeat.getState() == State.valid) {
    +                    keeperWorkerIds.add(entry.getKey());
    +                    keepPorts.add(stateHeartbeat.getHeartbeat().get_port());
    +                }
    +            }
    +            Map<Integer, LocalAssignment> reassignExecutors = getReassignExecutors(assignedExecutors, keepPorts);
    +            Map<Integer, String> newWorkerIds = new HashMap<>();
    +            for (Integer port : reassignExecutors.keySet()) {
    +                newWorkerIds.put(port, Utils.uuid());
    +            }
    +            LOG.debug("Syncing processes");
    +            LOG.debug("Assigned executors: {}", assignedExecutors);
    +            LOG.debug("Allocated: {}", localWorkerStats);
    +
    +            for (Map.Entry<String, StateHeartbeat> entry : localWorkerStats.entrySet()) {
    +                StateHeartbeat stateHeartbeat = entry.getValue();
    +                if (stateHeartbeat.getState() != State.valid) {
    +                    LOG.info("Shutting down and clearing state for id {}, 
Current supervisor time: {}, State: {}, Heartbeat: {}", entry.getKey(), now,
    +                            stateHeartbeat.getState(), 
stateHeartbeat.getHeartbeat());
    +                    shutWorker(supervisorData, entry.getKey());
    +                }
    +            }
    +            // start new workers
    +            Map<String, Integer> newWorkerPortToIds = startNewWorkers(newWorkerIds, reassignExecutors);
    +
    +            Map<String, Integer> allWorkerPortToIds = new HashMap<>();
    +            Map<String, Integer> approvedWorkers = localState.getApprovedWorkers();
    +            for (String keeper : keeperWorkerIds) {
    +                allWorkerPortToIds.put(keeper, approvedWorkers.get(keeper));
    +            }
    +            allWorkerPortToIds.putAll(newWorkerPortToIds);
    +            localState.setApprovedWorkers(allWorkerPortToIds);
    +            waitForWorkersLaunch(conf, newWorkerPortToIds.keySet());
    +
    +        } catch (Exception e) {
    +            LOG.error("Failed Sync Process", e);
    +            throw Utils.wrapInRuntime(e);
    +        }
    +
    +    }
    +
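    +    // Polls each new worker's local heartbeat until it appears or the supervisor timeout elapses.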
    +    protected void waitForWorkersLaunch(Map conf, Set<String> workerIds) throws Exception {
    +        int startTime = Time.currentTimeSecs();
    +        int timeOut = (int) conf.get(Config.NIMBUS_SUPERVISOR_TIMEOUT_SECS);
    +        for (String workerId : workerIds) {
    +            LocalState localState = ConfigUtils.workerState(conf, workerId);
    +            while (true) {
    +                LSWorkerHeartbeat hb = localState.getWorkerHeartBeat();
    +                if (hb != null || (Time.currentTimeSecs() - startTime) > timeOut)
    +                    break;
    +                LOG.info("{} still hasn't started", workerId);
    +                Time.sleep(500);
    +            }
    +            if (localState.getWorkerHeartBeat() == null) {
    +                LOG.info("Worker {} failed to start", workerId);
    +            }
    +        }
    +    }
    +
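    +    // Assignments on ports not covered by a healthy worker must be launched again.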
    +    protected Map<Integer, LocalAssignment> getReassignExecutors(Map<Integer, LocalAssignment> assignExecutors, Set<Integer> keepPorts) {
    +        Map<Integer, LocalAssignment> reassignExecutors = new HashMap<>();
    +        reassignExecutors.putAll(assignExecutors);
    +        for (Integer port : keepPorts) {
    +            reassignExecutors.remove(port);
    +        }
    +        return reassignExecutors;
    +    }
    +
    +
    +    /**
    +     * Returns a map from worker id to worker heartbeat. If the heartbeat is null, the worker is dead.
    +     *
    +     * @param assignedExecutors
    +     * @return
    +     * @throws Exception
    +     */
    +    public Map<String, StateHeartbeat> getLocalWorkerStats(SupervisorData supervisorData, Map<Integer, LocalAssignment> assignedExecutors, int now) throws Exception {
    +        Map<String, StateHeartbeat> workerIdHbstate = new HashMap<>();
    +        Map conf = supervisorData.getConf();
    +        LocalState localState = supervisorData.getLocalState();
    +        Map<String, LSWorkerHeartbeat> idToHeartbeat = SupervisorUtils.readWorkerHeartbeats(conf);
    +        Map<String, Integer> approvedWorkers = localState.getApprovedWorkers();
    +        Set<String> approvedIds = new HashSet<>();
    +        if (approvedWorkers != null) {
    +            approvedIds.addAll(approvedWorkers.keySet());
    +        }
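    +        // Classify every known worker: notStarted, disallowed, timedOut, or valid.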
    +        for (Map.Entry<String, LSWorkerHeartbeat> entry : idToHeartbeat.entrySet()) {
    +            String workerId = entry.getKey();
    +            LSWorkerHeartbeat whb = entry.getValue();
    +            State state;
    +            if (whb == null) {
    +                state = State.notStarted;
    +            } else if (!approvedIds.contains(workerId) || !matchesAssignment(whb, assignedExecutors)) {
    +                state = State.disallowed;
    +            } else if (supervisorData.getDeadWorkers().contains(workerId)) {
    +                LOG.info("Worker Process {} has died", workerId);
    +                state = State.timedOut;
    +            } else if (SupervisorUtils.isWorkerHbTimedOut(now, whb, conf)) {
    +                state = State.timedOut;
    +            } else {
    +                state = State.valid;
    +            }
    +            LOG.debug("Worker:{} state:{} WorkerHeartbeat:{} at supervisor 
time-secs {}", workerId, state, whb, now);
    +            workerIdHbstate.put(workerId, new StateHeartbeat(state, whb));
    +        }
    +        return workerIdHbstate;
    +    }
    +
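    +    // A heartbeat matches its assignment when the topology id and the executor set (minus the system executor) agree.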
    +    protected boolean matchesAssignment(LSWorkerHeartbeat whb, Map<Integer, LocalAssignment> assignedExecutors) {
    +        LocalAssignment localAssignment = assignedExecutors.get(whb.get_port());
    +        if (localAssignment == null || !localAssignment.get_topology_id().equals(whb.get_topology_id())) {
    +            return false;
    +        }
    +        List<ExecutorInfo> executorInfos = new ArrayList<>();
    +        executorInfos.addAll(whb.get_executors());
    +        // remove SYSTEM_EXECUTOR_ID
    +        executorInfos.remove(new ExecutorInfo(-1, -1));
    +        List<ExecutorInfo> localExecutorInfos = localAssignment.get_executors();
    +
    +        if (localExecutorInfos.size() != executorInfos.size())
    +            return false;
    +
    +        for (ExecutorInfo executorInfo : localExecutorInfos) {
    +            if (!executorInfos.contains(executorInfo))
    +                return false;
    +        }
    +        return true;
    +    }
    +
    +    /**
    +     * Launch a worker in local mode.
    +     */
    +    protected void launchWorker(SupervisorData supervisorData, String stormId, Long port, String workerId, WorkerResources resources) throws IOException {
    +        // TODO: port this function after worker.clj is ported to Java
    +    }
    +
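    +    // Builds the worker classpath; TOPOLOGY_CLASSPATH may be a single String or a List<String>.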
    +    protected String getWorkerClassPath(String stormJar, Map stormConf) {
    +        List<String> topoClasspath = new ArrayList<>();
    +        Object object = stormConf.get(Config.TOPOLOGY_CLASSPATH);
    +
    +        if (object instanceof List) {
    +            topoClasspath.addAll((List<String>) object);
    +        } else if (object instanceof String) {
    +            topoClasspath.add((String) object);
    +        } else {
    +            //ignore
    --- End diff --
    
    The original code was
    
    ```
    (let [...
              topo-classpath (if-let [cp (storm-conf TOPOLOGY-CLASSPATH)]
                               [cp]
                               [])
              ... (Utils/addToClasspath something topo-classpath) ...
    ```
    
    The signature of `Utils.addToClasspath` is
    
    ```
    public static String addToClasspath(String classpath, Collection<String> paths);
    ```
    
    If `cp` was anything but a `String` in that code we would get a `ClassCastException` that would bring down the supervisor. This code is even more lenient and supports both `String` and `List<String>`, which fixes a bug in our translation of the Utils code.
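    
    To make the failure mode concrete, here is a simplified sketch (the values are hypothetical; only the `instanceof` handling mirrors the diff):
    
    ```
    // topology.classpath may legally be a single string or a list of strings
    Object cp = Arrays.asList("/opt/libs/a.jar", "/opt/libs/b.jar");
    
    // the strict translation assumed a String, so a cast like the following
    // would throw ClassCastException for a list value and kill the supervisor:
    // String single = (String) cp;
    
    // the lenient handling in this diff accepts both shapes
    List<String> topoClasspath = new ArrayList<>();
    if (cp instanceof List) {
        topoClasspath.addAll((List<String>) cp);
    } else if (cp instanceof String) {
        topoClasspath.add((String) cp);
    }
    ```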
    
    You are right that we don't want a bad user config to bring down a supervisor, but if nimbus is doing the right thing and validating the config before it goes to the supervisor we should never see this happen. `TOPOLOGY_CLASSPATH` is annotated with `@isStringOrStringList`, so if we ever see `object` as something that is not a `String`, `List<String>`, or `null`, it means we have a bug in our code somewhere, and silently ignoring it feels wrong to me.
    
    I would be fine with us logging an error if it is something different, but not totally ignoring it.
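    
    For example, something along these lines in the final `else` branch (a sketch of the suggestion, not code from this PR):
    
    ```
    } else if (object != null) {
        // nimbus validates @isStringOrStringList, so this branch should be
        // unreachable; hitting it means a bug in our code, so leave a trace
        LOG.error("{} must be a String or a List<String>, got an instance of {}",
                Config.TOPOLOGY_CLASSPATH, object.getClass().getName());
    }
    ```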


> port backtype.storm.daemon.supervisor to java
> ---------------------------------------------
>
>                 Key: STORM-1279
>                 URL: https://issues.apache.org/jira/browse/STORM-1279
>             Project: Apache Storm
>          Issue Type: New Feature
>          Components: storm-core
>            Reporter: Robert Joseph Evans
>            Assignee: John Fang
>              Labels: java-migration, jstorm-merger
>         Attachments: Discussion about supervisor.pdf
>
>
> https://github.com/apache/storm/tree/jstorm-import/jstorm-core/src/main/java/com/alibaba/jstorm/daemon/supervisor
>  as an example
> backtype.storm.event usage should be replaced with built-in java threadpools.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
