[SYSTEMML-1824] Fix transformapply by colnames w/ meta data superset 

This patch improves the robustness of transformapply. Until now, we
required the input data and the meta data to have the same number of
columns. With this change, we also support meta data whose columns are a
superset of the input columns, provided that the transform specification
uses column names (which allows a join by name).


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2c9694de
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2c9694de
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2c9694de

Branch: refs/heads/master
Commit: 2c9694decfe7908cc57f567ee8ff0d279a37d34f
Parents: 61a0931
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Tue Aug 1 19:22:03 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Tue Aug 1 19:30:57 2017 -0700

----------------------------------------------------------------------
 ...ReturnParameterizedBuiltinSPInstruction.java |   2 +-
 .../ParameterizedBuiltinSPInstruction.java      |   2 +-
 .../sysml/runtime/matrix/data/FrameBlock.java   |  12 ++-
 .../transform/encode/EncoderFactory.java        |  24 ++++-
 .../runtime/transform/meta/TfMetaUtils.java     |   8 +-
 .../sysml/runtime/util/DataConverter.java       |   2 +-
 .../TransformFrameEncodeApplySubsetTest.java    | 101 +++++++++++++++++++
 .../TransformFrameEncodeApplySubset.dml         |  32 ++++++
 .../functions/transform/ZPackageSuite.java      |   1 +
 9 files changed, 174 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
index 254c3d7..586d282 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
@@ -116,7 +116,7 @@ public class MultiReturnParameterizedBuiltinSPInstruction 
extends ComputationSPI
                        String spec = ec.getScalarInput(input2.getName(), 
input2.getValueType(), input2.isLiteral()).getStringValue();
                        MatrixCharacteristics mcIn = 
sec.getMatrixCharacteristics(input1.getName());
                        MatrixCharacteristics mcOut = 
sec.getMatrixCharacteristics(output.getName());
-                       String[] colnames = 
!TfMetaUtils.isIDSpecification(spec) ?
+                       String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
                                        in.lookup(1L).get(0).getColumnNames() : 
null; 
                                        
                        //step 1: build transform meta data

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
index e62dd60..01f4512 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
@@ -438,7 +438,7 @@ public class ParameterizedBuiltinSPInstruction  extends 
ComputationSPInstruction
                        FrameBlock meta = 
sec.getFrameInput(params.get("meta"));                
                        MatrixCharacteristics mcIn = 
sec.getMatrixCharacteristics(params.get("target"));
                        MatrixCharacteristics mcOut = 
sec.getMatrixCharacteristics(output.getName());
-                       String[] colnames = 
!TfMetaUtils.isIDSpecification(params.get("spec")) ?
+                       String[] colnames = 
!TfMetaUtils.isIDSpec(params.get("spec")) ?
                                        in.lookup(1L).get(0).getColumnNames() : 
null; 
                        
                        //compute omit offset map for block shifts

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index 5e6404b..10b39fe 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -477,7 +477,7 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
                _numRows = cols[0].length;
        }
 
-       public Object getColumn(int c) {
+       public Object getColumnData(int c) {
                switch(_schema[c]) {
                        case STRING:  return ((StringArray)_coldata[c])._data; 
                        case BOOLEAN: return ((BooleanArray)_coldata[c])._data;
@@ -487,6 +487,16 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
                }
        }
        
+       public Array getColumn(int c) {
+               return _coldata[c]; 
+       }
+       
+       public void setColumn(int c, Array column) {
+               if( _coldata == null )
+                       _coldata = new Array[getNumColumns()];
+               _coldata[c] = column; 
+       }
+       
        /**
         * Get a row iterator over the frame where all fields are encoded
         * as strings independent of their value types.  

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 5e0a178..df506e0 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -45,9 +45,9 @@ public class EncoderFactory
        }
 
        @SuppressWarnings("unchecked")
-       public static Encoder createEncoder(String spec,  String[] colnames, 
ValueType[] schema, FrameBlock meta) 
+       public static Encoder createEncoder(String spec, String[] colnames, 
ValueType[] schema, FrameBlock meta) 
                throws DMLRuntimeException 
-       {       
+       {
                Encoder encoder = null;
                int clen = schema.length;
                
@@ -93,10 +93,26 @@ public class EncoderFactory
                        }
                        
                        //create composite decoder of all created encoders
-                       //and initialize meta data (recode, dummy, bin, mv)
                        encoder = new EncoderComposite(lencoders);
-                       if( meta != null )
+                       
+                       //initialize meta data w/ robustness for superset of 
cols
+                       if( meta != null ) {
+                               String[] colnames2 = meta.getColumnNames();
+                               if( !TfMetaUtils.isIDSpec(jSpec) && 
colnames!=null && colnames2!=null 
+                                       && !ArrayUtils.isEquals(colnames, 
colnames2) ) 
+                               {
+                                       //create temporary meta frame block w/ 
shallow column copy
+                                       FrameBlock meta2 = new 
FrameBlock(meta.getSchema(), colnames2);
+                                       meta2.setNumRows(meta.getNumRows());
+                                       for( int i=0; i<colnames.length; i++ ) {
+                                               int pos = 
Arrays.binarySearch(colnames2, colnames[i]);
+                                               meta2.setColumn(i, 
meta.getColumn(pos));
+                                               meta2.setColumnMetadata(i, 
meta.getColumnMetadata(pos));
+                                       }
+                                       meta = meta2;
+                               }
                                encoder.initMetaData(meta);
+                       }
                }
                catch(Exception ex) {
                        throw new DMLRuntimeException(ex);

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java 
b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
index afb7ee9..abaad26 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
@@ -50,15 +50,19 @@ import org.apache.wink.json4j.JSONObject;
 
 public class TfMetaUtils 
 {
-       public static boolean isIDSpecification(String spec) throws 
DMLRuntimeException {
+       public static boolean isIDSpec(String spec) throws DMLRuntimeException {
                try {
                        JSONObject jSpec = new JSONObject(spec);
-                       return jSpec.containsKey("ids") && 
jSpec.getBoolean("ids");
+                       return isIDSpec(jSpec);
                }
                catch(JSONException ex) {
                        throw new DMLRuntimeException(ex);
                }
        }
+       
+       public static boolean isIDSpec(JSONObject spec) throws JSONException {
+               return spec.containsKey("ids") && spec.getBoolean("ids");
+       }
 
        public static boolean containsOmitSpec(String spec, String[] colnames) 
throws DMLRuntimeException {
                return (TfMetaUtils.parseJsonIDList(spec, colnames, 
TfUtils.TXMETHOD_OMIT).length > 0); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/DataConverter.java 
b/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
index ac1e80e..10f043b 100644
--- a/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
+++ b/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
@@ -528,7 +528,7 @@ public class DataConverter
                        double[][] a = new double[n][];
                        double[] c = mb.getDenseBlock();
                        for( int j=0; j<n; j++ )
-                               a[j] = (double[])frame.getColumn(j);            
        
+                               a[j] = (double[])frame.getColumnData(j);        
                
                        int blocksizeIJ = 16; //blocks of a+overhead/c in L1 
cache
                        for( int bi=0; bi<m; bi+=blocksizeIJ )
                                for( int bj=0; bj<n; bj+=blocksizeIJ ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
new file mode 100644
index 0000000..b06bf92
--- /dev/null
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.transform;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase 
+{
+       private final static String TEST_NAME1 = 
"TransformFrameEncodeApplySubset";
+       private final static String TEST_DIR = "functions/transform/";
+       private final static String TEST_CLASS_DIR = TEST_DIR + 
TransformFrameEncodeApplySubsetTest.class.getSimpleName() + "/";
+       
+       //dataset and transform tasks without missing values
+       private final static String DATASET1    = "homes3/homes.csv";
+       
+       @Override
+       public void setUp()  {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_NAME1, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "y" }) );
+       }
+       
+       @Test
+       public void testHomesRecodeColnamesSingleNodeCSV() {
+               runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", true);
+       }
+       
+       @Test
+       public void testHomesRecodeColnamesSparkCSV() {
+               runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", true);
+       }
+       
+       @Test
+       public void testHomesRecodeColnamesHybridCSV() {
+               runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true);
+       }
+       
+       
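+       /**
+        * Runs the transform encode/apply subset script and checks its output.
+        * @param rt runtime platform to execute the script on
+        * @param ofmt output format; anything other than "csv" is rejected
+        * @param colnames column-name flag (not read by this method)
+        */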
+       private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, boolean 
colnames)
+       {
+               //set runtime platform
+               RUNTIME_PLATFORM rtold = rtplatform;
+               rtplatform = rt;
+
+               boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+               if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == 
RUNTIME_PLATFORM.HYBRID_SPARK)
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+               if( !ofmt.equals("csv") )
+                       throw new RuntimeException("Unsupported test output 
format");
+               
+               try
+               {
+                       getAndLoadTestConfiguration(TEST_NAME1);
+                       
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
+                       programArgs = new String[]{"-explain", 
"recompile_hops", "-args", 
+                               HOME + "input/" + DATASET1, output("R") };
+       
+                       runTest(true, false, null, -1); 
+                       
+                       //check output 
+                       Assert.assertEquals(Double.valueOf(148), 
+                               readDMLMatrixFromHDFS("R").get(new 
CellIndex(1,1)));
+               }
+               finally {
+                       rtplatform = rtold;
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+               }
+       }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
----------------------------------------------------------------------
diff --git 
a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml 
b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
new file mode 100644
index 0000000..1e55af4
--- /dev/null
+++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+F = read($1, data_type="frame", format="csv");
+
+spec = "{ids: false, recode: [ zipcode, district, view ]}";
+[X, M] = transformencode(target=F, spec=spec);
+
+spec2 = "{ids: false, recode: [ district ]}";
+X2 = transformapply(target=F[,2], spec=spec2, meta=M);
+
+R = as.matrix(sum(X[,2]==X2));
+
+write(R, $2);

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
----------------------------------------------------------------------
diff --git 
a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
 
b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
index 645a468..736cfc7 100644
--- 
a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
+++ 
b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
@@ -30,6 +30,7 @@ import org.junit.runners.Suite;
        TransformCSVFrameEncodeDecodeTest.class,
        TransformCSVFrameEncodeReadTest.class,
        TransformEncodeDecodeTest.class,
+       TransformFrameEncodeApplySubsetTest.class,
        TransformFrameEncodeApplyTest.class,
        TransformFrameEncodeDecodeTest.class,
        TransformFrameEncodeDecodeTokenTest.class,

Reply via email to