This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 3d8fe86e99 [SYSTEMDS-3373] Fix hash robustness of lineage cache probing
3d8fe86e99 is described below

commit 3d8fe86e99a7e677e361b89a0273c9f8cf032292
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat May 7 22:54:26 2022 +0200

    [SYSTEMDS-3373] Fix hash robustness of lineage cache probing
    
    This patch fixes special cases with very long repeated lineage traces,
    which created repeating hash values and thus required deep comparisons
    and thus super-linear complexity. We now include the height (distance
    from leaves) into the hash computation which significantly reduces the
    chances of such repetitions. For 70,000 iterations of a single op,
    this patch improved the total runtime from 142s to 1.9s. Furthermore,
    we now have tests to check this issue. Thanks to David for catching
    this edge case.
---
 .../apache/sysds/runtime/lineage/LineageCache.java |  4 +-
 .../sysds/runtime/lineage/LineageCacheEntry.java   |  2 +-
 .../runtime/lineage/LineageCacheEviction.java      |  2 +-
 .../apache/sysds/runtime/lineage/LineageItem.java  | 56 +++++++-----------
 .../sysds/runtime/lineage/LineageItemUtils.java    |  2 +-
 .../functions/lineage/MiscLineageProbingTest.java  | 69 ++++++++++++++++++++++
 src/test/scripts/functions/lineage/MiscProbe1.dml  |  7 +++
 src/test/scripts/functions/lineage/MiscProbe2.dml  |  7 +++
 8 files changed, 110 insertions(+), 39 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index 7575b0aa0b..3ea7d3d143 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -216,7 +216,7 @@ public class LineageCache
                        LineageItem li = new LineageItem(opcode, liInputs);
                        // set _distLeaf2Node for this special lineage item to 1
                        // to save it from early eviction if DAGHEIGHT policy 
is selected
-                       li.setDistLeaf2Node(1);
+                       li.setHeight(1);
                        LineageCacheEntry e = null;
                        synchronized(_cache) {
                                if (LineageCache.probe(li)) {
@@ -301,7 +301,7 @@ public class LineageCache
                        return new 
FederatedResponse(FederatedResponse.ResponseType.ERROR);
 
                LineageItem li = udf.getLineageItem(ec).getValue();
-               li.setDistLeaf2Node(1); //to save from early eviction
+               li.setHeight(1); //to save from early eviction
                LineageCacheEntry e = null;
                synchronized(_cache) {
                        if (probe(li))
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index ea99cf8255..962c7d5307 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -215,7 +215,7 @@ public class LineageCacheEntry {
        }
        
        protected synchronized long getDagHeight() {
-               return _key.getDistLeaf2Node();
+               return _key.getHeight();
        }
        
        protected synchronized double getCostNsize() {
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
index 3fca108b84..267440d44a 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
@@ -234,7 +234,7 @@ public class LineageCacheEviction
                                System.out.println(" spill time = " + 
getDiskSpillEstimate(e) * 1000);
                                System.out.print("dim = " + 
e.getMBValue().getNumRows() + " " + e.getMBValue().getNumColumns());
                                System.out.print(" size = " + 
getDiskSizeEstimate(e));
-                               System.out.println(" DAG height = " + 
e._key.getDistLeaf2Node());
+                               System.out.println(" DAG height = " + 
e._key.getHeight());
                        }
 
                        if (spilltime < 
LineageCacheConfig.MIN_SPILL_TIME_ESTIMATE) {
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java
index 31284f755d..311dae2a86 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java
@@ -20,6 +20,7 @@
 package org.apache.sysds.runtime.lineage;
 
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Stack;
@@ -35,11 +36,13 @@ public class LineageItem {
        private final String _opcode;
        private final String _data;
        private LineageItem[] _inputs;
+       private long _height = 0; //distance leaf to node
        private int _hash = 0;
        private LineageItem _dedupPatch;
-       private long _distLeaf2Node;
        private final BooleanArray32 _specialValueBits;  // TODO: Move this to 
a new subclass
        // map from thread id to visited flag to allow concurrent checks 
through the lineage trace
+       
+       //TODO replace with thread local concurrent hashmap per worker
        private Map<Long, Boolean> _visited = new ConcurrentHashMap<>();
        
        public enum LineageItemType {Literal, Creation, Instruction, Dedup}
@@ -100,11 +103,12 @@ public class LineageItem {
                _opcode = opcode;
                _data = data;
                _inputs = inputs;
+               // store the distance of this node from the leaves. 
(O(#inputs)) operation
+               _height = ((inputs != null && inputs.length != 0) ?
+                       Arrays.stream(inputs).map(l -> 
l._height).max(Long::compare).get() : 0) + 1;
                // materialize hash on construction 
                // (constant time operation if input hashes constructed)
                _hash = hashCode();
-               // store the distance of this node from the leaves. 
(O(#inputs)) operation
-               _distLeaf2Node = distLeaf2Node();
                _specialValueBits = new BooleanArray32(specialValueBits);
        }
        
@@ -127,8 +131,16 @@ public class LineageItem {
                return _data;
        }
        
-       public void fixHash() {
-               _hash = 0;
+       public long getHeight() {
+               return _height;
+       }
+       
+       public void setHeight(long height) {
+               _height = height;
+       }
+       
+       public void resetHash() {
+               _hash = 0; //enable recomputation
                _hash = hashCode();
        }
 
@@ -154,25 +166,6 @@ public class LineageItem {
                _specialValueBits.setValue(value);
        }
 
-       private long distLeaf2Node() {
-               // Derive height only if the corresponding reuse
-               // policy is selected, otherwise set -1.
-               if (LineageCacheConfig.ReuseCacheType.isNone()
-                       || !LineageCacheConfig.isDagHeightBased())
-                       return -1;
-
-               if (_inputs != null && _inputs.length > 0) {
-                       // find the input with highest height
-                       long maxDistance = _inputs[0].getDistLeaf2Node();
-                       for (int i=1; i<_inputs.length; i++)
-                               if (_inputs[i].getDistLeaf2Node() > maxDistance)
-                                       maxDistance = 
_inputs[i].getDistLeaf2Node();
-                       return maxDistance + 1;
-               }
-               else
-                       return 1;  //leaf node
-       }
-       
        public long getId() {
                return _id;
        }
@@ -193,14 +186,6 @@ public class LineageItem {
                return _opcode.startsWith(LineageItemUtils.LPLACEHOLDER);
        }
        
-       public void setDistLeaf2Node(long d) {
-               _distLeaf2Node = d;
-       }
-       
-       public long getDistLeaf2Node() {
-               return _distLeaf2Node;
-       }
-       
        public LineageItem getDedupPatch() {
                return _dedupPatch;
        }
@@ -299,6 +284,8 @@ public class LineageItem {
                                ret = li1._opcode.equals(li2._opcode);
                                ret &= li1._data.equals(li2._data);
                        }
+                       //check hash including height as pre-filter
+                       //(for special cases the height is later set to 1 but 
the hash is materialized)
                        ret &= (li1.hashCode() == li2.hashCode());
                        if (!ret) break;
 
@@ -428,8 +415,9 @@ public class LineageItem {
                                _opcode.hashCode(), _data.hashCode());
                        if (_inputs != null)
                                for (LineageItem li : _inputs)
-                                       h = 
UtilFunctions.intHashCodeRobust(li.hashCode(), h);
-                       _hash = h;
+                                       h = UtilFunctions.intHashCodeRobust(h, 
li.hashCode());
+                       _hash = UtilFunctions.intHashCodeRobust(h,
+                               UtilFunctions.longHashCode(_height));
                }
                return _hash;
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java
index 9897e0d99d..4955684f4e 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java
@@ -414,7 +414,7 @@ public class LineageItemUtils {
                }
 
                //fix the hash codes bottom-up, as the inputs have changed
-               root.fixHash();
+               root.resetHash();
                root.setVisited();
        }
 
diff --git 
a/src/test/java/org/apache/sysds/test/functions/lineage/MiscLineageProbingTest.java
 
b/src/test/java/org/apache/sysds/test/functions/lineage/MiscLineageProbingTest.java
new file mode 100644
index 0000000000..5404b25fd0
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/lineage/MiscLineageProbingTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.lineage;
+
+import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing;
+import org.apache.sysds.runtime.lineage.Lineage;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class MiscLineageProbingTest extends LineageBase {
+       
+       protected static final String TEST_DIR = "functions/lineage/";
+       protected static final String TEST_NAME1 = "MiscProbe1";
+       protected static final String TEST_NAME2 = "MiscProbe2";
+       
+       protected String TEST_CLASS_DIR = TEST_DIR + 
MiscLineageProbingTest.class.getSimpleName() + "/";
+       
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_NAME1, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1));
+               addTestConfiguration(TEST_NAME2, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME2));
+       }
+       
+       @Test
+       public void testLineageTraceLineage1() {
+               testLineageTrace(TEST_NAME1);
+       }
+       
+       @Test
+       public void testLineageTraceLineage2() {
+               testLineageTrace(TEST_NAME2);
+       }
+
+       public void testLineageTrace(String testName) {
+               try {
+                       getAndLoadTestConfiguration(testName);
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = HOME + testName + ".dml";
+                       programArgs = new String[]{"-explain","-stats", 
"--lineage", "reuse"};
+                       
+                       Timing time = new Timing(true);
+                       runTest(true, false, null, -1);
+                       Assert.assertTrue(time.stop() < 20000);
+               }
+               finally {
+                       Lineage.setLinReuseNone();
+               }
+       }
+}
diff --git a/src/test/scripts/functions/lineage/MiscProbe1.dml 
b/src/test/scripts/functions/lineage/MiscProbe1.dml
new file mode 100644
index 0000000000..5f19f14fdd
--- /dev/null
+++ b/src/test/scripts/functions/lineage/MiscProbe1.dml
@@ -0,0 +1,7 @@
+
+X = rand(rows=10, cols=8, seed=1234);
+n = 200000;
+for(counter in 1:n) { # create lineage trace
+  X = X + 0.1;
+}
+print(sum(X));
\ No newline at end of file
diff --git a/src/test/scripts/functions/lineage/MiscProbe2.dml 
b/src/test/scripts/functions/lineage/MiscProbe2.dml
new file mode 100644
index 0000000000..4204c59469
--- /dev/null
+++ b/src/test/scripts/functions/lineage/MiscProbe2.dml
@@ -0,0 +1,7 @@
+
+X = rand(rows=10, cols=8, seed=1234);
+n = 200000;
+for(counter in 1:n) { # create lineage trace
+  X = 0.1 + X;
+}
+print(sum(X));
\ No newline at end of file

Reply via email to