[SYSTEMML-1824] Fix transformapply by colnames w/ meta data superset. This patch improves the robustness of transformapply. So far, we required the input data and the meta data to have the same number of columns. With this change, we also support meta data whose columns are a superset of the input data's columns, provided that the transform specification uses column names (which allows a join by name).
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2c9694de Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2c9694de Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2c9694de Branch: refs/heads/master Commit: 2c9694decfe7908cc57f567ee8ff0d279a37d34f Parents: 61a0931 Author: Matthias Boehm <mboe...@gmail.com> Authored: Tue Aug 1 19:22:03 2017 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Tue Aug 1 19:30:57 2017 -0700 ---------------------------------------------------------------------- ...ReturnParameterizedBuiltinSPInstruction.java | 2 +- .../ParameterizedBuiltinSPInstruction.java | 2 +- .../sysml/runtime/matrix/data/FrameBlock.java | 12 ++- .../transform/encode/EncoderFactory.java | 24 ++++- .../runtime/transform/meta/TfMetaUtils.java | 8 +- .../sysml/runtime/util/DataConverter.java | 2 +- .../TransformFrameEncodeApplySubsetTest.java | 101 +++++++++++++++++++ .../TransformFrameEncodeApplySubset.dml | 32 ++++++ .../functions/transform/ZPackageSuite.java | 1 + 9 files changed, 174 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java index 254c3d7..586d282 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java @@ -116,7 +116,7 @@ public class 
MultiReturnParameterizedBuiltinSPInstruction extends ComputationSPI String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue(); MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName()); MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); - String[] colnames = !TfMetaUtils.isIDSpecification(spec) ? + String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null; //step 1: build transform meta data http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java index e62dd60..01f4512 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java @@ -438,7 +438,7 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction FrameBlock meta = sec.getFrameInput(params.get("meta")); MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target")); MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); - String[] colnames = !TfMetaUtils.isIDSpecification(params.get("spec")) ? + String[] colnames = !TfMetaUtils.isIDSpec(params.get("spec")) ? 
in.lookup(1L).get(0).getColumnNames() : null; //compute omit offset map for block shifts http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index 5e6404b..10b39fe 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -477,7 +477,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable _numRows = cols[0].length; } - public Object getColumn(int c) { + public Object getColumnData(int c) { switch(_schema[c]) { case STRING: return ((StringArray)_coldata[c])._data; case BOOLEAN: return ((BooleanArray)_coldata[c])._data; @@ -487,6 +487,16 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable } } + public Array getColumn(int c) { + return _coldata[c]; + } + + public void setColumn(int c, Array column) { + if( _coldata == null ) + _coldata = new Array[getNumColumns()]; + _coldata[c] = column; + } + /** * Get a row iterator over the frame where all fields are encoded * as strings independent of their value types. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java index 5e0a178..df506e0 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java @@ -45,9 +45,9 @@ public class EncoderFactory } @SuppressWarnings("unchecked") - public static Encoder createEncoder(String spec, String[] colnames, ValueType[] schema, FrameBlock meta) + public static Encoder createEncoder(String spec, String[] colnames, ValueType[] schema, FrameBlock meta) throws DMLRuntimeException - { + { Encoder encoder = null; int clen = schema.length; @@ -93,10 +93,26 @@ public class EncoderFactory } //create composite decoder of all created encoders - //and initialize meta data (recode, dummy, bin, mv) encoder = new EncoderComposite(lencoders); - if( meta != null ) + + //initialize meta data w/ robustness for superset of cols + if( meta != null ) { + String[] colnames2 = meta.getColumnNames(); + if( !TfMetaUtils.isIDSpec(jSpec) && colnames!=null && colnames2!=null + && !ArrayUtils.isEquals(colnames, colnames2) ) + { + //create temporary meta frame block w/ shallow column copy + FrameBlock meta2 = new FrameBlock(meta.getSchema(), colnames2); + meta2.setNumRows(meta.getNumRows()); + for( int i=0; i<colnames.length; i++ ) { + int pos = Arrays.binarySearch(colnames2, colnames[i]); + meta2.setColumn(i, meta.getColumn(pos)); + meta2.setColumnMetadata(i, meta.getColumnMetadata(pos)); + } + meta = meta2; + } encoder.initMetaData(meta); + } } catch(Exception ex) { throw new DMLRuntimeException(ex); 
http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java index afb7ee9..abaad26 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java +++ b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java @@ -50,15 +50,19 @@ import org.apache.wink.json4j.JSONObject; public class TfMetaUtils { - public static boolean isIDSpecification(String spec) throws DMLRuntimeException { + public static boolean isIDSpec(String spec) throws DMLRuntimeException { try { JSONObject jSpec = new JSONObject(spec); - return jSpec.containsKey("ids") && jSpec.getBoolean("ids"); + return isIDSpec(jSpec); } catch(JSONException ex) { throw new DMLRuntimeException(ex); } } + + public static boolean isIDSpec(JSONObject spec) throws JSONException { + return spec.containsKey("ids") && spec.getBoolean("ids"); + } public static boolean containsOmitSpec(String spec, String[] colnames) throws DMLRuntimeException { return (TfMetaUtils.parseJsonIDList(spec, colnames, TfUtils.TXMETHOD_OMIT).length > 0); http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/util/DataConverter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/DataConverter.java b/src/main/java/org/apache/sysml/runtime/util/DataConverter.java index ac1e80e..10f043b 100644 --- a/src/main/java/org/apache/sysml/runtime/util/DataConverter.java +++ b/src/main/java/org/apache/sysml/runtime/util/DataConverter.java @@ -528,7 +528,7 @@ public class DataConverter double[][] a = new double[n][]; double[] c = mb.getDenseBlock(); for( int j=0; j<n; j++ ) - a[j] = 
(double[])frame.getColumn(j); + a[j] = (double[])frame.getColumnData(j); int blocksizeIJ = 16; //blocks of a+overhead/c in L1 cache for( int bi=0; bi<m; bi+=blocksizeIJ ) for( int bj=0; bj<n; bj+=blocksizeIJ ) { http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java new file mode 100644 index 0000000..b06bf92 --- /dev/null +++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.test.integration.functions.transform; + +import org.junit.Assert; +import org.junit.Test; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; +import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysml.test.integration.AutomatedTestBase; +import org.apache.sysml.test.integration.TestConfiguration; +import org.apache.sysml.test.utils.TestUtils; + +public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase +{ + private final static String TEST_NAME1 = "TransformFrameEncodeApplySubset"; + private final static String TEST_DIR = "functions/transform/"; + private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplySubsetTest.class.getSimpleName() + "/"; + + //dataset and transform tasks without missing values + private final static String DATASET1 = "homes3/homes.csv"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "y" }) ); + } + + @Test + public void testHomesRecodeColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", true); + } + + @Test + public void testHomesRecodeColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", true); + } + + @Test + public void testHomesRecodeColnamesHybridCSV() { + runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true); + } + + + /** + * + * @param rt + * @param ofmt + * @param dataset + */ + private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, boolean colnames) + { + //set runtime platform + RUNTIME_PLATFORM rtold = rtplatform; + rtplatform = rt; + + boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG; + if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) + DMLScript.USE_LOCAL_SPARK_CONFIG = true; + + if( !ofmt.equals("csv") ) + throw new 
RuntimeException("Unsupported test output format"); + + try + { + getAndLoadTestConfiguration(TEST_NAME1); + + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME1 + ".dml"; + programArgs = new String[]{"-explain", "recompile_hops", "-args", + HOME + "input/" + DATASET1, output("R") }; + + runTest(true, false, null, -1); + + //check output + Assert.assertEquals(Double.valueOf(148), + readDMLMatrixFromHDFS("R").get(new CellIndex(1,1))); + } + finally { + rtplatform = rtold; + DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml new file mode 100644 index 0000000..1e55af4 --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +F = read($1, data_type="frame", format="csv"); + +spec = "{ids: false, recode: [ zipcode, district, view ]}"; +[X, M] = transformencode(target=F, spec=spec); + +spec2 = "{ids: false, recode: [ district ]}"; +X2 = transformapply(target=F[,2], spec=spec2, meta=M); + +R = as.matrix(sum(X[,2]==X2)); + +write(R, $2); http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java ---------------------------------------------------------------------- diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java index 645a468..736cfc7 100644 --- a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java +++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java @@ -30,6 +30,7 @@ import org.junit.runners.Suite; TransformCSVFrameEncodeDecodeTest.class, TransformCSVFrameEncodeReadTest.class, TransformEncodeDecodeTest.class, + TransformFrameEncodeApplySubsetTest.class, TransformFrameEncodeApplyTest.class, TransformFrameEncodeDecodeTest.class, TransformFrameEncodeDecodeTokenTest.class,