This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 096ca06a68 [SYSTEMDS-3498] Fix unique() for inputs with many distinct
items
096ca06a68 is described below
commit 096ca06a686904a2422765dd0f45444c0fa4f446
Author: Matthias Boehm <[email protected]>
AuthorDate: Fri Feb 10 19:42:31 2023 +0100
[SYSTEMDS-3498] Fix unique() for inputs with many distinct items
This patch fixes the correctness of the existing unique() operation
for obtaining the set of distinct items. So far the distinct items
were placed by some heuristics of tall/wide matrices into the output,
but if the output size exceeded the spark block size, crashed. R's
unique semantics are to deduplicate values or rows (for multi-column
inputs). For now, this patch ensures correctness but only supports
input vectors which can be extended in the future.
---
.../sysds/runtime/matrix/data/LibMatrixSketch.java | 94 ++++++----------------
.../org/apache/sysds/test/AutomatedTestBase.java | 4 +-
.../sysds/test/functions/unique/UniqueRowCol.java | 74 +----------------
3 files changed, 26 insertions(+), 146 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
index 3793564dbe..a848ef9480 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixSketch.java
@@ -21,40 +21,39 @@ package org.apache.sysds.runtime.matrix.data;
import org.apache.commons.lang.NotImplementedException;
import org.apache.sysds.common.Types;
-import org.apache.sysds.hops.OptimizerUtils;
-import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
-import java.util.List;
public class LibMatrixSketch {
- private enum MatrixShape {
- SKINNY, // rows > cols
- WIDE, // rows < cols
- }
-
public static MatrixBlock getUniqueValues(MatrixBlock blkIn,
Types.Direction dir) {
-
- int R = blkIn.getNumRows();
- int C = blkIn.getNumColumns();
- List<HashSet<Double>> hashSets = new ArrayList<>();
-
- MatrixShape matrixShape = (R >= C)? MatrixShape.SKINNY :
MatrixShape.WIDE;
- MatrixBlock blkOut;
- switch (dir)
- {
+ //similar to R's unique, this operation takes a matrix and
computes the
+ //unique values (or rows in case of multiple column inputs)
+
+ int rlen = blkIn.getNumRows();
+ int clen = blkIn.getNumColumns();
+
+ MatrixBlock blkOut = null;
+ switch (dir) {
case RowCol:
+ if( clen != 1 )
+ throw new
NotImplementedException("Unique only support single-column vectors yet");
+ // TODO optimize for dense/sparse/compressed
(once multi-column support added)
+
+ // obtain set of unique items (dense input
vector)
HashSet<Double> hashSet = new HashSet<>();
- // TODO optimize for sparse and compressed
inputs
- for (int i=0; i<R; ++i) {
- for (int j=0; j<C; ++j) {
- hashSet.add(blkIn.getValue(i,
j));
- }
+ for( int i=0; i<rlen; i++ ) {
+ hashSet.add(blkIn.quickGetValue(i, 0));
+ }
+
+ // allocate output block and place values
+ int rlen2 = hashSet.size();
+ blkOut = new MatrixBlock(rlen2, 1,
false).allocateBlock();
+ Iterator<Double> iter = hashSet.iterator();
+ for( int i=0; i<rlen2; i++ ) {
+ blkOut.quickSetValue(i, 0, iter.next());
}
- hashSets.add(hashSet);
- blkOut = serializeRowCol(hashSets, dir,
matrixShape);
break;
case Row:
@@ -67,51 +66,4 @@ public class LibMatrixSketch {
return blkOut;
}
-
- private static MatrixBlock serializeRowCol(List<HashSet<Double>>
hashSets, Types.Direction dir, MatrixShape matrixShape) {
-
- if (dir != Types.Direction.RowCol) {
- throw new IllegalArgumentException("Unrecognized
direction: " + dir);
- }
-
- MatrixBlock blkOut;
-
- if (hashSets.isEmpty()) {
- throw new IllegalArgumentException("Corrupt sketch:
metadata cannot be empty");
- }
-
- int R, C;
- HashSet<Double> hashSet = hashSets.get(0);
- Iterator<Double> iter = hashSet.iterator();
-
- if (hashSet.size() <= OptimizerUtils.DEFAULT_BLOCKSIZE) {
- if (matrixShape == MatrixShape.SKINNY) {
- // Rx1 column vector
- R = hashSet.size();
- C = 1;
- } else { // WIDE
- // 1xC row vector
- R = 1;
- C = hashSet.size();
- }
- } else {
- if (matrixShape == MatrixShape.SKINNY) {
- R = OptimizerUtils.DEFAULT_BLOCKSIZE;
- C = (hashSet.size() /
OptimizerUtils.DEFAULT_BLOCKSIZE) + 1;
- } else { // WIDE
- R = (hashSet.size() /
OptimizerUtils.DEFAULT_BLOCKSIZE) + 1;
- C = OptimizerUtils.DEFAULT_BLOCKSIZE;
- }
- }
-
- blkOut = new MatrixBlock(R, C, false);
- for (int i=0; i<R; ++i) {
- // C is guaranteed to be > 0
- for (int j=0; j<C; ++j) {
- blkOut.setValue(i, j, iter.next());
- }
- }
-
- return blkOut;
- }
}
diff --git a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
index b7f272cdbd..4a20b25290 100644
--- a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
@@ -1947,8 +1947,8 @@ public abstract class AutomatedTestBase {
TestUtils.compareDMLScalarWithJavaScalar(javaFile, dmlFile, epsilon);
}
else {
- TestUtils
-
.compareDMLMatrixWithJavaMatrixRowsOutOfOrder(comparisonFiles[i],
outputDirectories[i], epsilon);
+
TestUtils.compareDMLMatrixWithJavaMatrixRowsOutOfOrder(
+ comparisonFiles[i],
outputDirectories[i], epsilon);
}
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
index a8b2fc1ba7..7f6ad0ff56 100644
--- a/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
+++ b/src/test/java/org/apache/sysds/test/functions/unique/UniqueRowCol.java
@@ -66,80 +66,8 @@ public class UniqueRowCol extends UniqueBase {
@Test
public void testWideSmallCP() {
- double[][] inputMatrix = {{1,1,6,9,4,2,0,9,0,0,4,4}};
+ double[][] inputMatrix =
{{1},{1},{6},{9},{4},{2},{0},{9},{0},{0},{4},{4}};
double[][] expectedMatrix = {{1,6,9,4,2,0}};
uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
}
-
- @Test
- public void testSquareLargeCP() {
- double[][] inputMatrix = new double[1000][1000];
- // Input is a 1000 x 1000 matrix:
- // [1, 1, ..., 1, 2, 2, .., 2]
- // [1, 1, ..., 1, 2, 2, .., 2]
- // ..
- // [1, 1, ..., 1, 2, 2, .., 2]
- // [2, 2, ..., 2, 1, 1, .., 1]
- // [2, 2, ..., 2, 1, 1, .., 1]
- // ..
- // [2, 2, ..., 2, 1, 1, .., 1]
- for (int i=0; i<500; ++i) {
- for (int j=0; j<500; ++j) {
- inputMatrix[i][j] = 1;
- inputMatrix[i+500][j+500] = 1;
- }
- }
- for (int i=500; i<1000; ++i) {
- for (int j=0; j<500; ++j) {
- inputMatrix[i][j] = 2;
- inputMatrix[i-500][j+500] = 2;
- }
- }
- // Expect the output to be a skinny matrix due to the following
condition in code:
- // (R >= C)? LibMatrixSketch.MatrixShape.SKINNY :
LibMatrixSketch.MatrixShape.WIDE;
- double[][] expectedMatrix = {{1},{2}};
- uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
- }
-
- @Test
- public void testSkinnyLargeCP() {
- double[][] inputMatrix = new double[2000][2];
- // Input is a 2000 x 2 matrix:
- // [1, 2]
- // [1, 2]
- // ..
- // [1, 2]
- // [2, 1]
- // [2, 1]
- // ..
- // [2, 1]
- for (int i=0; i<1000; ++i) {
- inputMatrix[i][0] = 1;
- inputMatrix[i][1] = 2;
- }
- for (int i=1000; i<2000; ++i) {
- inputMatrix[i][0] = 2;
- inputMatrix[i][1] = 1;
- }
- double[][] expectedMatrix = {{1}, {2}};
- uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
- }
-
- @Test
- public void testWideLargeCP() {
- double[][] inputMatrix = new double[2][2000];
- // Input is a 2 x 2000 matrix:
- // [1, 1, ..., 1, 2, 2, .., 2]
- // [2, 2, ..., 2, 1, 1, .., 1]
- for (int j=0; j<1000; ++j) {
- inputMatrix[0][j] = 1;
- inputMatrix[1][j+1000] = 1;
- }
- for (int j=1000; j<2000; ++j) {
- inputMatrix[0][j] = 2;
- inputMatrix[1][j-1000] = 2;
- }
- double[][] expectedMatrix = {{1,2}};
- uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
- }
}