johnyangk commented on a change in pull request #72: [Nemo-59] Skewed data-aware executor allocation
URL: https://github.com/apache/incubator-nemo/pull/72#discussion_r200861495
 
 

 ##########
 File path: runtime/common/src/main/java/edu/snu/nemo/runtime/common/optimizer/pass/runtime/DataSkewRuntimePass.java
 ##########
 @@ -75,52 +81,75 @@ public PhysicalPlan apply(final PhysicalPlan originalPlan,
         .collect(Collectors.toList());
 
     // Get number of evaluators of the next stage (number of blocks).
-    final Integer taskListSize = optimizationEdges.stream().findFirst().orElseThrow(() ->
+    final Integer reducerTaskNum = optimizationEdges.stream().findFirst().orElseThrow(() ->
         new RuntimeException("optimization edges are empty")).getDst().getTaskIds().size();
 
     // Calculate keyRanges.
-    final List<KeyRange> keyRanges = calculateHashRanges(metricData.right(), taskListSize);
+    final List<KeyRange> keyRanges = calculateHashRanges(metricData.right(), reducerTaskNum);
 
     // Overwrite the previously assigned hash value range in the physical DAG with the new range.
     optimizationEdges.forEach(optimizationEdge -> {
       // Update the information.
-      final List<KeyRange> taskIdxToHashRange = new ArrayList<>();
-      IntStream.range(0, taskListSize).forEach(i -> taskIdxToHashRange.add(keyRanges.get(i)));
+      final Map<Integer, KeyRange> taskIdxToHashRange = new HashMap<>();
+      for (int taskIdx = 0; taskIdx < reducerTaskNum; taskIdx++) {
+        taskIdxToHashRange.put(taskIdx, keyRanges.get(taskIdx));
+      }
       optimizationEdge.setTaskIdxToKeyRange(taskIdxToHashRange);
     });
 
     return new PhysicalPlan(originalPlan.getId(), physicalDAGBuilder.build());
   }
 
+  private boolean containsSkewedHash(final List<Integer> skewedHashes,
+                                     final int startingHash, final int finishingHash) {
+    for (int h = startingHash; h < finishingHash; h++) {
+      if (skewedHashes.contains(h)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   /**
    * Method for calculating key ranges to evenly distribute the skewed metric data.
    *
    * @param aggregatedMetricData the metric data.
-   * @param taskListSize the size of the task list.
+   * @param taskNum the size of the task list.
    * @return the list of key ranges calculated.
    */
   @VisibleForTesting
   public List<KeyRange> calculateHashRanges(final Map<Integer, Long> aggregatedMetricData,
-                                            final Integer taskListSize) {
+                                            final Integer taskNum) {
     // NOTE: aggregatedMetricDataMap is made up of a map of (hash value, blockSize).
     // Get the max hash value.
     final int maxHashValue = aggregatedMetricData.keySet().stream()
         .max(Integer::compareTo)
         .orElseThrow(() -> new DynamicOptimizationException("Cannot find max hash value among blocks."));
 
+    // Identify skewed hashes.
+    List<Map.Entry<Integer, Long>> sortedMetricData = aggregatedMetricData.entrySet().stream()
+        .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
+        .collect(Collectors.toList());
+
+    List<Integer> skewedHashes = new ArrayList<>();
+    for (int i = 0; i < numSkewedHashes; i++) {
+      skewedHashes.add(sortedMetricData.get(i).getKey());
+      LOG.info("Skewed hash: Hash {} Size {}", 
sortedMetricData.get(i).getKey(), sortedMetricData.get(i).getValue());
+    }
+
     // Do the optimization using the information derived above.
     final Long totalSize = aggregatedMetricData.values().stream().mapToLong(n -> n).sum(); // get total size
-    final Long idealSizePerTask = totalSize / taskListSize; // and derive the ideal size per task
-    LOG.info("idealSizePerTask {} = {}(totalSize) / {}(taskListSize)",
-        idealSizePerTask, totalSize, taskListSize);
+    final Long idealSizePerTask = totalSize / taskNum; // and derive the ideal size per task
+    LOG.info("idealSizePerTask {} = {}(totalSize) / {}(taskNum)",
+        idealSizePerTask, totalSize, taskNum);
 
-    // find HashRanges to apply (for each blocks of each block).
-    final List<KeyRange> keyRanges = new ArrayList<>(taskListSize);
+    final List<KeyRange> keyRanges = new ArrayList<>(taskNum);
     int startingHashValue = 0;
     int finishingHashValue = 1; // initial values
     Long currentAccumulatedSize = aggregatedMetricData.getOrDefault(startingHashValue, 0L);
-    for (int i = 1; i <= taskListSize; i++) {
-      if (i != taskListSize) {
+    Long prevAccumulatedSize = 0L;
+    for (int i = 1; i <= taskNum; i++) {
+      if (i != taskNum) {
         final Long idealAccumulatedSize = idealSizePerTask * i; // where we should end
         // find the point while adding up one by one.
 
 Review comment:
   Could you elaborate a little more in the comment on what the 'point', 'one', and 'step' mean?
   Maybe introduce some more private helper methods?
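   For illustration only, here is a rough sketch of the kind of helper I have in mind. The names (`advanceFinishingHash`, `hashToBlockSize`) are made up for this comment and it deliberately avoids Nemo types, so please treat it as pseudocode for the idea rather than a drop-in change:
   ```java
   import java.util.HashMap;
   import java.util.Map;

   final class KeyRangeSketch {

     /**
      * One "step" = include one more hash value in the current range.
      * Keeps stepping from {@code finishingHash} until the running total reaches the
      * ideal accumulated size for this task (the 'point' where the range should end),
      * then returns the exclusive end of the range.
      */
     static int advanceFinishingHash(final Map<Integer, Long> hashToBlockSize,
                                     final int finishingHash,
                                     final int maxHashValue,
                                     final long accumulatedSoFar,
                                     final long idealAccumulatedSize) {
       long accumulated = accumulatedSoFar;
       int hash = finishingHash;
       while (hash <= maxHashValue && accumulated < idealAccumulatedSize) {
         accumulated += hashToBlockSize.getOrDefault(hash, 0L); // add one more hash's block size
         hash++;
       }
       return hash; // exclusive end of the current key range
     }

     // Tiny usage example with block sizes for hash values 0..3.
     public static void main(final String[] args) {
       final Map<Integer, Long> hashToBlockSize = new HashMap<>();
       hashToBlockSize.put(0, 10L);
       hashToBlockSize.put(1, 90L);
       hashToBlockSize.put(2, 10L);
       hashToBlockSize.put(3, 10L);
       // The first range stops right after the skewed hash 1 (10 + 90 >= ideal 60).
       final int end = advanceFinishingHash(hashToBlockSize, 0, 3, 0L, 60L);
       System.out.println("First key range is [0, " + end + ")"); // prints [0, 2)
     }
   }
   ```
   With a helper like this, the main loop could read as "advance the finishing hash one step at a time until the accumulated size reaches the ideal size for this task", which would spell out what the 'point', 'one', and 'step' refer to.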

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services
