mboehm7 commented on a change in pull request #881: spark wip for review
URL: https://github.com/apache/systemml/pull/881#discussion_r407232599
 
 

 ##########
 File path: scripts/staging/slicing/sparked/sparked_union_slicer.py
 ##########
 @@ -0,0 +1,50 @@
+from pyspark import SparkContext
+
+from slicing.base.top_k import Topk
+from slicing.sparked import sparked_utils
+from slicing.sparked.sparked_utils import update_top_k
+
+
+def process(all_features, predictions, loss, sc, debug, alpha, k, w, 
loss_type, enumerator):
+    top_k = Topk(k)
+    cur_lvl = 0
+    levels = []
+    all_features = list(all_features)
+    first_level = {}
+    first_tasks = sc.parallelize(all_features)
+    SparkContext.broadcast(sc, top_k)
+    init_slices = first_tasks.mapPartitions(lambda features: 
sparked_utils.make_first_level(features, predictions, loss, top_k,
+                                                        alpha, k, w, 
loss_type)).map(lambda node: (node.key, node)).collect()
+    first_level.update(init_slices)
+    update_top_k(first_level, top_k, alpha, predictions)
+    SparkContext.broadcast(sc, top_k)
+    SparkContext.broadcast(sc, first_level)
+    levels.append(first_level)
+    cur_lvl = 1
+    top_k.print_topk()
+    SparkContext.broadcast(sc, top_k)
+    while len(levels[cur_lvl - 1]) > 0:
+        cur_lvl_res = {}
+        for left in range(int(cur_lvl / 2) + 1):
+            right = cur_lvl - left - 1
+            partitions = sc.parallelize(levels[left].values())
+            mapped = partitions.mapPartitions(lambda nodes: 
sparked_utils.nodes_enum(nodes, levels[right].values(), predictions,loss,
 
 Review comment:
   Would it be possible to only broadcast the nodes that have been created in the 
last iteration, as all other levels have already been broadcast before?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to