This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new a0b195e [SYSTEMDS-2656] Fix robustness spark transform encode (empty
partitions)
a0b195e is described below
commit a0b195eebe8d703875b9e2cb6016041ef959ed19
Author: Matthias Boehm <[email protected]>
AuthorDate: Wed Sep 2 22:36:09 2020 +0200
[SYSTEMDS-2656] Fix robustness spark transform encode (empty partitions)
This patch fixes an edge case of spark transform encode (specifically
recode and dummy code) when spark partitions are completely empty.
---
...ltiReturnParameterizedBuiltinSPInstruction.java | 4 +-
.../runtime/transform/encode/EncoderRecode.java | 10 +++--
.../test/functions/builtin/BuiltinMiceTest.java | 44 +++++++++-------------
3 files changed, 26 insertions(+), 32 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
index df5f16b..5dd7b7f 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
@@ -254,9 +254,11 @@ public class MultiReturnParameterizedBuiltinSPInstruction
extends ComputationSPI
throws Exception
{
//build meta data (e.g., recode maps)
- if( _raEncoder != null )
+ if( _raEncoder != null ) {
+ _raEncoder.prepareBuildPartial();
while( iter.hasNext() )
_raEncoder.buildPartial(iter.next()._2());
+ }
//output recode maps as columnID - token pairs
ArrayList<Tuple2<Integer,Object>> ret = new
ArrayList<>();
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderRecode.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderRecode.java
index e195835..d8d524a 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderRecode.java
@@ -121,14 +121,16 @@ public class EncoderRecode extends Encoder
protected void putCode(HashMap<String,Long> map, String key) {
map.put(key, Long.valueOf(map.size()+1));
}
+
+ public void prepareBuildPartial() {
+ //ensure allocated partial recode map
+ if( _rcdMapsPart == null )
+ _rcdMapsPart = new HashMap<>();
+ }
public void buildPartial(FrameBlock in) {
if( !isApplicable() )
return;
-
- //ensure allocated partial recode map
- if( _rcdMapsPart == null )
- _rcdMapsPart = new HashMap<>();
//construct partial recode map (tokens w/o codes)
//iterate over columns for sequential access
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
index a647918..ab76824 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
@@ -51,42 +51,34 @@ public class BuiltinMiceTest extends AutomatedTestBase {
runMiceNominalTest(mask, 1, false, LopProperties.ExecType.CP);
}
-// @Test
-// public void testMiceMixSpark() {
-// double[][] mask = {{ 0.0, 0.0, 1.0, 1.0, 0.0}};
-// runMiceNominalTest(mask, 1, LopProperties.ExecType.SPARK);
-// }
-
@Test
public void testMiceNumberCP() {
double[][] mask = {{ 0.0, 0.0, 0.0, 0.0, 0.0}};
runMiceNominalTest(mask, 2, false, LopProperties.ExecType.CP);
}
-// @Test
-// public void testMiceNumberSpark() {
-// double[][] mask = {{ 0.0, 0.0, 0.0, 0.0, 0.0}};
-// runMiceNominalTest(mask, 2, LopProperties.ExecType.SPARK);
-// }
-
@Test
public void testMiceCategoricalCP() {
double[][] mask = {{ 1.0, 1.0, 1.0, 1.0, 1.0}};
runMiceNominalTest(mask, 3, false, LopProperties.ExecType.CP);
}
-// @Test
-// public void testMiceCategoricalSpark() {
-// double[][] mask = {{ 1.0, 1.0, 1.0, 1.0, 1.0}};
-// runMiceNominalTest(mask, 3, LopProperties.ExecType.SPARK);
-// }
-
@Test
public void testMiceMixLineageReuseCP() {
double[][] mask = {{ 0.0, 0.0, 1.0, 1.0, 0.0}};
runMiceNominalTest(mask, 1, true, LopProperties.ExecType.CP);
}
+ //added a single, relatively-fast spark test, others seem infeasible
+ //as forcing every operation to spark takes too long for complex,
+ //composite builtins like mice.
+
+ @Test
+ public void testMiceNumberSpark() {
+ double[][] mask = {{ 0.0, 0.0, 0.0, 0.0, 0.0}};
+ runMiceNominalTest(mask, 2, false,
LopProperties.ExecType.SPARK);
+ }
+
private void runMiceNominalTest(double[][] mask, int testType, boolean
lineage, LopProperties.ExecType instType) {
Types.ExecMode platformOld = setExecMode(instType);
try {
@@ -94,10 +86,10 @@ public class BuiltinMiceTest extends AutomatedTestBase {
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[]{"-nvargs", "X=" + DATASET,
"Mask="+input("M"),
- "iteration=" + iter, "dataN=" +
output("N"), "dataC=" + output("C")};
+ "iteration=" + iter, "dataN=" + output("N"),
"dataC=" + output("C")};
if (lineage) {
- String[] lin = new String[]
{"-stats","-lineage", ReuseCacheType.REUSE_HYBRID.name().toLowerCase()};
- programArgs = (String[])
ArrayUtils.addAll(programArgs, lin);
+ programArgs = (String[])
ArrayUtils.addAll(programArgs, new String[] {
+ "-stats","-lineage",
ReuseCacheType.REUSE_HYBRID.name().toLowerCase()});
}
writeInputMatrixWithMTD("M", mask, true);
@@ -125,18 +117,16 @@ public class BuiltinMiceTest extends AutomatedTestBase {
}
}
- private void testNumericOutput()
- {
+ private void testNumericOutput() {
//compare matrices
HashMap<MatrixValue.CellIndex, Double> dmlfileN =
readDMLMatrixFromHDFS("N");
HashMap<MatrixValue.CellIndex, Double> rfileN =
readRMatrixFromFS("N");
// compare numerical imputations
TestUtils.compareMatrices(dmlfileN, rfileN, eps, "Stat-DML",
"Stat-R");
-
}
- private void testCategoricalOutput()
- {
+
+ private void testCategoricalOutput() {
HashMap<MatrixValue.CellIndex, Double> dmlfileC =
readDMLMatrixFromHDFS("C");
HashMap<MatrixValue.CellIndex, Double> rfileC =
readRMatrixFromFS("C");
@@ -154,4 +144,4 @@ public class BuiltinMiceTest extends AutomatedTestBase {
else
Assert.fail("categorical test fails, the true value
count is less than 98%");
}
-}
\ No newline at end of file
+}