Michael Blow has uploaded a new change for review.

  https://asterix-gerrit.ics.uci.edu/2097

Change subject: [ASTERIXDB-1076][HYR] Prevent node death false positives
......................................................................

[ASTERIXDB-1076][HYR] Prevent node death false positives

- Measure actual time since last heartbeat touched, not based on number
  of dead cycle detections since last heartbeat received
- Update heartbeat touch on job result received, in addition to when
  heartbeat data is received
- Minor refactoring in NC/CC config

Change-Id: Idb1abcc2b783b192b88ed988d398fcfe763531e9
---
M 
asterixdb/asterix-app/src/main/java/org/apache/asterix/api/common/AsterixHyracksIntegrationUtil.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/ClusterControllerService.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/NodeControllerState.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/cluster/NodeManager.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/AbstractTaskLifecycleWork.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/RegisterNodeWork.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/CCConfig.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/ControllerConfig.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
M 
hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NodeControllerService.java
10 files changed, 38 insertions(+), 25 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb 
refs/changes/97/2097/1

diff --git 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/api/common/AsterixHyracksIntegrationUtil.java
 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/api/common/AsterixHyracksIntegrationUtil.java
index f5e94b1..ecf25eb 100644
--- 
a/asterixdb/asterix-app/src/main/java/org/apache/asterix/api/common/AsterixHyracksIntegrationUtil.java
+++ 
b/asterixdb/asterix-app/src/main/java/org/apache/asterix/api/common/AsterixHyracksIntegrationUtil.java
@@ -166,7 +166,7 @@
         ncConfig.getConfigManager().processConfig();
 
         // get initial partitions from config
-        String[] nodeStores = 
ncConfig.getAppConfig().getStringArray(NCConfig.Option.IODEVICES);
+        String[] nodeStores = 
ncConfig.getNodeScopedAppConfig().getStringArray(NCConfig.Option.IODEVICES);
         if (nodeStores == null) {
             throw new IllegalStateException("Couldn't find stores for NC: " + 
ncConfig.getNodeId());
         }
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/ClusterControllerService.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/ClusterControllerService.java
index dfc79ed..a3fbb70 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/ClusterControllerService.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/ClusterControllerService.java
@@ -207,7 +207,7 @@
         webServer.start();
         info = new ClusterControllerInfo(ccConfig.getClientListenAddress(), 
ccConfig.getClientListenPort(),
                 webServer.getListeningPort());
-        timer.schedule(sweeper, 0, ccConfig.getHeartbeatPeriod());
+        timer.schedule(sweeper, 0, ccConfig.getHeartbeatPeriodMillis());
         jobLog.open();
         startApplication();
 
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/NodeControllerState.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/NodeControllerState.java
index 7be6524..b045002 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/NodeControllerState.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/NodeControllerState.java
@@ -141,7 +141,7 @@
 
     private int rrdPtr;
 
-    private int lastHeartbeatDuration;
+    private long lastHeartbeatNanoTime;
 
     private NodeCapacity capacity;
 
@@ -210,7 +210,7 @@
     }
 
     public synchronized void notifyHeartbeat(HeartbeatData hbData) {
-        lastHeartbeatDuration = 0;
+        touchHeartbeat();
         hbTime[rrdPtr] = System.currentTimeMillis();
         if (hbData != null) {
             heapInitSize[rrdPtr] = hbData.heapInitSize;
@@ -247,8 +247,16 @@
         }
     }
 
-    public int incrementLastHeartbeatDuration() {
-        return lastHeartbeatDuration++;
+    public void touchHeartbeat() {
+        lastHeartbeatNanoTime = System.nanoTime();
+    }
+
+    public long nanosSinceLastHeartbeat() {
+        return System.nanoTime() - lastHeartbeatNanoTime;
+    }
+
+    public long getLastHeartbeatNanoTime() {
+        return lastHeartbeatNanoTime;
     }
 
     public NodeControllerRemoteProxy getNodeController() {
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/cluster/NodeManager.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/cluster/NodeManager.java
index 2d43d42..a380967 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/cluster/NodeManager.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/cluster/NodeManager.java
@@ -29,6 +29,7 @@
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 import java.util.logging.Logger;
 
 import org.apache.commons.lang3.tuple.Pair;
@@ -147,11 +148,13 @@
         Set<String> deadNodes = new HashSet<>();
         Set<JobId> affectedJobIds = new HashSet<>();
         Iterator<Map.Entry<String, NodeControllerState>> nodeIterator = 
nodeRegistry.entrySet().iterator();
+        long deadNodeNanosThreshold = TimeUnit.MILLISECONDS
+                .toNanos(ccConfig.getHeartbeatMaxMisses() * 
ccConfig.getHeartbeatPeriodMillis());
         while (nodeIterator.hasNext()) {
             Map.Entry<String, NodeControllerState> entry = nodeIterator.next();
             String nodeId = entry.getKey();
             NodeControllerState state = entry.getValue();
-            if (state.incrementLastHeartbeatDuration() >= 
ccConfig.getHeartbeatMaxMisses()) {
+            if (state.nanosSinceLastHeartbeat() >= deadNodeNanosThreshold) {
                 deadNodes.add(nodeId);
                 affectedJobIds.addAll(state.getActiveJobIds());
                 // Removes the node from node map.
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/AbstractTaskLifecycleWork.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/AbstractTaskLifecycleWork.java
index 3babf00..446bfd1 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/AbstractTaskLifecycleWork.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/AbstractTaskLifecycleWork.java
@@ -27,6 +27,7 @@
 import org.apache.hyracks.api.job.ActivityCluster;
 import org.apache.hyracks.api.job.JobId;
 import org.apache.hyracks.control.cc.ClusterControllerService;
+import org.apache.hyracks.control.cc.NodeControllerState;
 import org.apache.hyracks.control.cc.job.ActivityPlan;
 import org.apache.hyracks.control.cc.job.IJobManager;
 import org.apache.hyracks.control.cc.job.JobRun;
@@ -75,6 +76,10 @@
                 }
             }
         }
+        final NodeControllerState ncState = 
ccs.getNodeManager().getNodeControllerState(nodeId);
+        if (ncState != null) {
+            ncState.touchHeartbeat();
+        }
     }
 
     protected abstract void performEvent(TaskAttempt ta);
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/RegisterNodeWork.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/RegisterNodeWork.java
index 5866ba5..d1d2208 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/RegisterNodeWork.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-cc/src/main/java/org/apache/hyracks/control/cc/work/RegisterNodeWork.java
@@ -67,7 +67,7 @@
             NodeParameters params = new NodeParameters();
             params.setClusterControllerInfo(ccs.getClusterControllerInfo());
             params.setDistributedState(ccs.getContext().getDistributedState());
-            params.setHeartbeatPeriod(ccs.getCCConfig().getHeartbeatPeriod());
+            
params.setHeartbeatPeriod(ccs.getCCConfig().getHeartbeatPeriodMillis());
             
params.setProfileDumpPeriod(ccs.getCCConfig().getProfileDumpPeriod());
             result = new CCNCFunctions.NodeRegistrationResult(params, null);
             ccs.getJobIdFactory().ensureMinimumId(reg.getMaxJobId() + 1);
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/CCConfig.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/CCConfig.java
index cbc6146..3ea0d6b 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/CCConfig.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/CCConfig.java
@@ -176,8 +176,6 @@
         }
     }
 
-    private final ConfigManager configManager;
-
     private List<String> appArgs = new ArrayList<>();
 
     public CCConfig() {
@@ -186,7 +184,6 @@
 
     public CCConfig(ConfigManager configManager) {
         super(configManager);
-        this.configManager = configManager;
         configManager.register(Option.class);
         configManager.registerArgsListener(appArgs::addAll);
     }
@@ -205,10 +202,6 @@
      */
     public Ini getIni() {
         return configManager.toIni(false);
-    }
-
-    public ConfigManager getConfigManager() {
-        return configManager;
     }
 
     // QQQ Note that clusterListenAddress is *not directly used* yet. Both
@@ -270,7 +263,7 @@
         configManager.set(Option.CONSOLE_LISTEN_PORT, consoleListenPort);
     }
 
-    public int getHeartbeatPeriod() {
+    public int getHeartbeatPeriodMillis() {
         return getAppConfig().getInt(Option.HEARTBEAT_PERIOD);
     }
 
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/ControllerConfig.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/ControllerConfig.java
index 1745e2a..19c89e0 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/ControllerConfig.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/ControllerConfig.java
@@ -86,6 +86,10 @@
         return configManager.getAppConfig();
     }
 
+    public ConfigManager getConfigManager() {
+        return configManager;
+    }
+
     public String getConfigFile() {
         return getAppConfig().getString(ControllerConfig.Option.CONFIG_FILE);
     }
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
index bd5895e..e8c96d4 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
@@ -247,12 +247,7 @@
         return appArgs.toArray(new String[appArgs.size()]);
     }
 
-    public ConfigManager getConfigManager() {
-        return configManager;
-    }
-
-    @Override
-    public IApplicationConfig getAppConfig() {
+    public IApplicationConfig getNodeScopedAppConfig() {
         return appConfig;
     }
 
diff --git 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NodeControllerService.java
 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NodeControllerService.java
index 4a2c2e9..933895e 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NodeControllerService.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NodeControllerService.java
@@ -366,7 +366,8 @@
     }
 
     private void startApplication() throws Exception {
-        serviceCtx = new NCServiceContext(this, serverCtx, ioManager, id, 
memoryManager, lccm, ncConfig.getAppConfig());
+        serviceCtx = new NCServiceContext(this, serverCtx, ioManager, id, 
memoryManager, lccm,
+                ncConfig.getNodeScopedAppConfig());
         application.init(serviceCtx);
         executor = 
Executors.newCachedThreadPool(serviceCtx.getThreadFactory());
         application.start(ncConfig.getAppArgsArray());
@@ -559,7 +560,7 @@
 
             hbData.diskReads = ioCounter.getReads();
             hbData.diskWrites = ioCounter.getWrites();
-            hbData.numCores = Runtime.getRuntime().availableProcessors() - 1; 
// Reserves one core for heartbeats.
+            hbData.numCores = Runtime.getRuntime().availableProcessors();
 
             try {
                 cc.nodeHeartbeat(id, hbData);
@@ -568,7 +569,11 @@
             } catch (InterruptedException e) {
                 throw e;
             } catch (Exception e) {
-                LOGGER.log(Level.SEVERE, "Exception sending heartbeat; will 
retry after 1s", e);
+                if (LOGGER.isLoggable(Level.FINE)) {
+                    LOGGER.log(Level.FINE, "Exception sending heartbeat; will 
retry after 1s", e);
+                } else {
+                    LOGGER.log(Level.SEVERE, "Exception sending heartbeat; 
will retry after 1s: " + e.toString());
+                }
                 return false;
             }
         }

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/2097
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idb1abcc2b783b192b88ed988d398fcfe763531e9
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Michael Blow <[email protected]>

Reply via email to