Repository: incubator-systemml Updated Branches: refs/heads/master ab45af17c -> 468e472a6
[SYSTEMML-569] Frame transform encode/decode/apply w/ column name specs Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/468e472a Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/468e472a Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/468e472a Branch: refs/heads/master Commit: 468e472a61d9ae7e1746ca1b48ad43e7ac4d79d8 Parents: ab45af1 Author: Matthias Boehm <[email protected]> Authored: Sun Jul 10 21:28:57 2016 -0700 Committer: Matthias Boehm <[email protected]> Committed: Mon Jul 11 11:25:03 2016 -0700 ---------------------------------------------------------------------- ...ReturnParameterizedBuiltinCPInstruction.java | 5 +- .../cp/ParameterizedBuiltinCPInstruction.java | 7 +- ...ReturnParameterizedBuiltinSPInstruction.java | 14 ++- .../ParameterizedBuiltinSPInstruction.java | 16 +-- .../spark/utils/FrameRDDConverterUtils.java | 20 ++-- .../sysml/runtime/transform/BinAgent.java | 8 +- .../sysml/runtime/transform/DummycodeAgent.java | 4 +- .../sysml/runtime/transform/MVImputeAgent.java | 4 +- .../sysml/runtime/transform/OmitAgent.java | 5 +- .../sysml/runtime/transform/RecodeAgent.java | 6 +- .../apache/sysml/runtime/transform/TfUtils.java | 13 ++- .../transform/decode/DecoderFactory.java | 6 +- .../transform/encode/EncoderFactory.java | 30 +++--- .../runtime/transform/meta/TfMetaUtils.java | 86 ++++++++++----- .../functions/jmlc/FrameReadMetaTest.java | 2 +- .../TransformFrameEncodeApplyTest.java | 107 ++++++++++++++----- .../TransformFrameEncodeDecodeTest.java | 44 ++++++-- .../input/homes3/homes.tfspec_bin2.json | 5 + .../input/homes3/homes.tfspec_dummy2.json | 2 + .../input/homes3/homes.tfspec_impute2.json | 10 ++ .../input/homes3/homes.tfspec_omit2.json | 2 + .../input/homes3/homes.tfspec_recode2.json | 2 + 22 files changed, 276 insertions(+), 122 deletions(-) 
---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/instructions/cp/MultiReturnParameterizedBuiltinCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/MultiReturnParameterizedBuiltinCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/MultiReturnParameterizedBuiltinCPInstruction.java index af076f7..05cf19e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/MultiReturnParameterizedBuiltinCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/MultiReturnParameterizedBuiltinCPInstruction.java @@ -20,6 +20,7 @@ package org.apache.sysml.runtime.instructions.cp; import java.util.ArrayList; +import java.util.List; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; @@ -81,11 +82,13 @@ public class MultiReturnParameterizedBuiltinCPInstruction extends ComputationCPI //obtain and pin input frame FrameBlock fin = ec.getFrameInput(input1.getName()); String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue(); + List<String> colnames = fin.getColumnNames(); //execute block transform encode - Encoder encoder = EncoderFactory.createEncoder(spec, fin.getNumColumns(), null); + Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fin.getNumColumns(), null); MatrixBlock data = encoder.encode(fin, new MatrixBlock(fin.getNumRows(), fin.getNumColumns(), false)); //build and apply FrameBlock meta = encoder.getMetaData(new FrameBlock(fin.getNumColumns(), ValueType.STRING)); + meta.setColumnNames(colnames); //release input and outputs ec.releaseFrameInput(input1.getName()); 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java index eca1627..a245d10 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java @@ -20,6 +20,7 @@ package org.apache.sysml.runtime.instructions.cp; import java.util.HashMap; +import java.util.List; import org.apache.sysml.lops.Lop; import org.apache.sysml.parser.ParameterizedBuiltinFunctionExpression; @@ -269,9 +270,10 @@ public class ParameterizedBuiltinCPInstruction extends ComputationCPInstruction //acquire locks FrameBlock data = ec.getFrameInput(params.get("target")); FrameBlock meta = ec.getFrameInput(params.get("meta")); + List<String> colNames = data.getColumnNames(); //compute transformapply - Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), data.getNumColumns(), meta); + Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colNames, data.getNumColumns(), meta); MatrixBlock mbout = encoder.apply(data, new MatrixBlock(data.getNumRows(), data.getNumColumns(), false)); //release locks @@ -283,9 +285,10 @@ public class ParameterizedBuiltinCPInstruction extends ComputationCPInstruction //acquire locks MatrixBlock data = ec.getMatrixInput(params.get("target")); FrameBlock meta = ec.getFrameInput(params.get("meta")); + List<String> colnames = meta.getColumnNames(); //compute transformdecode - Decoder decoder = DecoderFactory.createDecoder(getParameterMap().get("spec"), null, meta); + Decoder decoder = 
DecoderFactory.createDecoder(getParameterMap().get("spec"), colnames, null, meta); FrameBlock fbout = decoder.decode(data, new FrameBlock(meta.getNumColumns(), ValueType.STRING)); //release locks http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java index a53f673..9b35daa 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java @@ -24,6 +24,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import java.util.List; import java.util.Map.Entry; import org.apache.spark.Accumulator; @@ -126,9 +127,11 @@ public class MultiReturnParameterizedBuiltinSPInstruction extends ComputationSPI String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue(); MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName()); MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); - + List<String> colnames = !TfMetaUtils.isIDSpecification(spec) ? 
+ in.lookup(1L).get(0).getColumnNames() : null; + //step 1: build transform meta data - Encoder encoderBuild = EncoderFactory.createEncoder(spec, + Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), null); Accumulator<Long> accMax = sec.getSparkContext().accumulator(0L, new MaxAcc()); @@ -148,17 +151,18 @@ public class MultiReturnParameterizedBuiltinSPInstruction extends ComputationSPI FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo); FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns()); meta.recomputeColumnCardinality(); //recompute num distinct items per column + meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames()); //step 2: transform apply (similar to spark transformapply) //compute omit offset map for block shifts TfOffsetMap omap = null; - if( TfMetaUtils.containsOmitSpec(spec) ) { + if( TfMetaUtils.containsOmitSpec(spec, colnames) ) { omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair( - new RDDTransformApplyOffsetFunction(spec)).collect())); + new RDDTransformApplyOffsetFunction(spec, colnames)).collect())); } //create encoder broadcast (avoiding replication per task) - Encoder encoder = EncoderFactory.createEncoder(spec, + Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), meta); mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java index 25a4078..f880f57 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java @@ -21,6 +21,7 @@ package org.apache.sysml.runtime.instructions.spark; import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.function.Function; @@ -428,16 +429,18 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction FrameBlock meta = sec.getFrameInput(params.get("meta")); MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target")); MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); + List<String> colnames = !TfMetaUtils.isIDSpecification(params.get("spec")) ? + in.lookup(1L).get(0).getColumnNames() : null; //compute omit offset map for block shifts TfOffsetMap omap = null; - if( TfMetaUtils.containsOmitSpec(params.get("spec")) ) { + if( TfMetaUtils.containsOmitSpec(params.get("spec"), colnames) ) { omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair( - new RDDTransformApplyOffsetFunction(params.get("spec"))).collect())); + new RDDTransformApplyOffsetFunction(params.get("spec"), colnames)).collect())); } //create encoder broadcast (avoiding replication per task) - Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), + Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colnames, fo.getSchema(), (int)fo.getNumColumns(), meta); mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder); @@ -460,6 +463,7 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction JavaPairRDD<MatrixIndexes,MatrixBlock> 
in = sec.getBinaryBlockRDDHandleForVariable(params.get("target")); MatrixCharacteristics mc = sec.getMatrixCharacteristics(params.get("target")); FrameBlock meta = sec.getFrameInput(params.get("meta")); + List<String> colnames = meta.getColumnNames(); //reblock if necessary (clen > bclen) if( mc.getCols() > mc.getNumColBlocks() ) { @@ -469,7 +473,7 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction } //construct decoder and decode individual matrix blocks - Decoder decoder = DecoderFactory.createDecoder(params.get("spec"), null, meta); + Decoder decoder = DecoderFactory.createDecoder(params.get("spec"), colnames, null, meta); JavaPairRDD<Long,FrameBlock> out = in.mapToPair( new RDDTransformDecodeFunction(decoder, meta.getNumColumns(), mc.getRowsPerBlock())); @@ -769,9 +773,9 @@ public class ParameterizedBuiltinSPInstruction extends ComputationSPInstruction private int[] _omitColList = null; - public RDDTransformApplyOffsetFunction(String spec) { + public RDDTransformApplyOffsetFunction(String spec, List<String> colnames) { try { - _omitColList = TfMetaUtils.parseJsonIDList(spec, TfUtils.TXMETHOD_OMIT); + _omitColList = TfMetaUtils.parseJsonIDList(spec, colnames, TfUtils.TXMETHOD_OMIT); } catch (DMLRuntimeException e) { throw new RuntimeException(e); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java index b63724a..c640d4d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java @@ -25,6 +25,7 @@ import 
java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; +import java.util.Arrays; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; @@ -394,10 +395,9 @@ public class FrameRDDConverterUtils private String _delim = null; private boolean _fill = false; private int _maxRowsPerBlock = -1; + private List<String> _colnames = null; - - public CSVToBinaryBlockFunction(MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill) - { + public CSVToBinaryBlockFunction(MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill) { _clen = mc.getCols(); _hasHeader = hasHeader; _delim = delim; @@ -420,10 +420,12 @@ public class FrameRDDConverterUtils Tuple2<Text,Long> tmp = arg0.next(); String row = tmp._1().toString(); long rowix = tmp._2(); - if(!_hasHeader) // In case there is no header, rowindex to be adjusted to base 1. - ++rowix; - if(_hasHeader && rowix == 0) //Skip header + if(!_hasHeader) // In case there is no header, rowindex to be adjusted to base 1. + rowix++; + if(_hasHeader && rowix == 0) { //Skip header + _colnames = Arrays.asList(row.split(_delim)); continue; + } if( iRowsInBlock == 0 || iRowsInBlock == _maxRowsPerBlock) { if( iRowsInBlock == _maxRowsPerBlock ) @@ -436,7 +438,7 @@ public class FrameRDDConverterUtils String[] parts = IOUtilFunctions.split(row, _delim); boolean emptyFound = false; mb[0].appendRow(parts); - ++iRowsInBlock; + iRowsInBlock++; //sanity check empty cells filled w/ values IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(row, _fill, emptyFound); @@ -453,7 +455,9 @@ public class FrameRDDConverterUtils { //compute row block index and number of column blocks ix[0] = new LongWritable(rowix); - mb[0] = new FrameBlock((int)_clen, ValueType.STRING); + mb[0] = new FrameBlock((int)_clen, ValueType.STRING); + if( _colnames != null ) + mb[0].setColumnNames(_colnames); } // Flushes current state of filled column blocks to output list. 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/BinAgent.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/BinAgent.java b/src/main/java/org/apache/sysml/runtime/transform/BinAgent.java index ad7cbfc..ee0e56a 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/BinAgent.java +++ b/src/main/java/org/apache/sysml/runtime/transform/BinAgent.java @@ -68,10 +68,10 @@ public class BinAgent extends Encoder super( null, clen ); } - public BinAgent(JSONObject parsedSpec, int clen) + public BinAgent(JSONObject parsedSpec, List<String> colnames, int clen) throws JSONException, IOException { - this(parsedSpec, clen, false); + this(parsedSpec, colnames, clen, false); } /** @@ -81,7 +81,7 @@ public class BinAgent extends Encoder * @throws JSONException * @throws IOException */ - public BinAgent(JSONObject parsedSpec, int clen, boolean colsOnly) + public BinAgent(JSONObject parsedSpec, List<String> colnames, int clen, boolean colsOnly) throws JSONException, IOException { super( null, clen ); @@ -89,7 +89,7 @@ public class BinAgent extends Encoder return; if( colsOnly ) { - List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec); + List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames); initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0]))); } else http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/DummycodeAgent.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/DummycodeAgent.java b/src/main/java/org/apache/sysml/runtime/transform/DummycodeAgent.java index aeeb62b..b51d639 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/DummycodeAgent.java +++ 
b/src/main/java/org/apache/sysml/runtime/transform/DummycodeAgent.java @@ -64,11 +64,11 @@ public class DummycodeAgent extends Encoder super(list, clen); } - public DummycodeAgent(JSONObject parsedSpec, int clen) throws JSONException { + public DummycodeAgent(JSONObject parsedSpec, List<String> colnames, int clen) throws JSONException { super(null, clen); if ( parsedSpec.containsKey(TfUtils.TXMETHOD_DUMMYCODE) ) { - int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, TfUtils.TXMETHOD_DUMMYCODE); + int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE); initColList(collist); } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/MVImputeAgent.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/MVImputeAgent.java b/src/main/java/org/apache/sysml/runtime/transform/MVImputeAgent.java index 68896ac..1266ced 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/MVImputeAgent.java +++ b/src/main/java/org/apache/sysml/runtime/transform/MVImputeAgent.java @@ -98,13 +98,13 @@ public class MVImputeAgent extends Encoder public KahanObject[] getMeans_scnomv() { return _scnomvMeanList; } public CM_COV_Object[] getVars_scnomv() { return _scnomvVarList; } - public MVImputeAgent(JSONObject parsedSpec, int clen) + public MVImputeAgent(JSONObject parsedSpec, List<String> colnames, int clen) throws JSONException { super(null, clen); //handle column list - int[] collist = TfMetaUtils.parseJsonObjectIDList(parsedSpec, TfUtils.TXMETHOD_IMPUTE); + int[] collist = TfMetaUtils.parseJsonObjectIDList(parsedSpec, colnames, TfUtils.TXMETHOD_IMPUTE); initColList(collist); //handle method list http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/OmitAgent.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/OmitAgent.java b/src/main/java/org/apache/sysml/runtime/transform/OmitAgent.java index 186ac7a..de6d59f 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/OmitAgent.java +++ b/src/main/java/org/apache/sysml/runtime/transform/OmitAgent.java @@ -21,6 +21,7 @@ package org.apache.sysml.runtime.transform; import java.io.IOException; import java.util.Iterator; +import java.util.List; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -49,13 +50,13 @@ public class OmitAgent extends Encoder super(list, clen); } - public OmitAgent(JSONObject parsedSpec, int clen) + public OmitAgent(JSONObject parsedSpec, List<String> colnames, int clen) throws JSONException { super(null, clen); if (!parsedSpec.containsKey(TfUtils.TXMETHOD_OMIT)) return; - int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, TfUtils.TXMETHOD_OMIT); + int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_OMIT); initColList(collist); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/RecodeAgent.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/RecodeAgent.java b/src/main/java/org/apache/sysml/runtime/transform/RecodeAgent.java index 5abe9db..edfdff4 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/RecodeAgent.java +++ b/src/main/java/org/apache/sysml/runtime/transform/RecodeAgent.java @@ -59,19 +59,19 @@ public class RecodeAgent extends Encoder private HashMap<Integer, HashMap<String, Long>> _rcdMaps = new HashMap<Integer, HashMap<String, Long>>(); private HashMap<Integer, HashMap<String,String>> _finalMaps = null; - public RecodeAgent(JSONObject parsedSpec, int clen) + public RecodeAgent(JSONObject parsedSpec, List<String> colnames, int 
clen) throws JSONException { super(null, clen); int rcdCount = 0; if( parsedSpec.containsKey(TfUtils.TXMETHOD_RECODE) ) { - int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, TfUtils.TXMETHOD_RECODE); + int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_RECODE); rcdCount = initColList(collist); } if ( parsedSpec.containsKey(TfUtils.TXMETHOD_MVRCD)) { - _mvrcdList = TfMetaUtils.parseJsonIDList(parsedSpec, TfUtils.TXMETHOD_MVRCD); + _mvrcdList = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_MVRCD); rcdCount += _mvrcdList.length; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/TfUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/TfUtils.java b/src/main/java/org/apache/sysml/runtime/transform/TfUtils.java index 24bde22..2b63797 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/TfUtils.java +++ b/src/main/java/org/apache/sysml/runtime/transform/TfUtils.java @@ -23,6 +23,7 @@ import java.io.EOFException; import java.io.IOException; import java.io.Serializable; import java.util.Arrays; +import java.util.List; import java.util.regex.Pattern; import org.apache.hadoop.filecache.DistributedCache; @@ -135,7 +136,7 @@ public class TfUtils implements Serializable{ } _NAstrings = TfUtils.parseNAStrings(job); _spec = job.get(MRJobConfiguration.TF_SPEC); - _oa = new OmitAgent(new JSONObject(_spec), -1); + _oa = new OmitAgent(new JSONObject(_spec), null, -1); } // called from GenTFMtdMapper, ApplyTf (Hadoop) @@ -244,11 +245,13 @@ public class TfUtils implements Serializable{ private void createAgents(JSONObject spec, String[] naStrings) throws IOException, JSONException { - _oa = new OmitAgent(spec, _numInputCols); + List<String> colnames = Arrays.asList(_outputColumnNames); + + _oa = new OmitAgent(spec, colnames, _numInputCols); _mia = new 
MVImputeAgent(spec, naStrings, _numInputCols); - _ra = new RecodeAgent(spec, _numInputCols); - _ba = new BinAgent(spec, _numInputCols); - _da = new DummycodeAgent(spec, _numInputCols); + _ra = new RecodeAgent(spec, colnames, _numInputCols); + _ba = new BinAgent(spec, colnames, _numInputCols); + _da = new DummycodeAgent(spec, colnames, _numInputCols); } public void setupAgents(OmitAgent oa, MVImputeAgent mia, RecodeAgent ra, BinAgent ba, DummycodeAgent da) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java index 78f1ad2..f276015 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java +++ b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java @@ -46,7 +46,7 @@ public class DecoderFactory * @throws DMLRuntimeException */ @SuppressWarnings("unchecked") - public static Decoder createDecoder(String spec, List<ValueType> schema, FrameBlock meta) + public static Decoder createDecoder(String spec, List<String> colnames, List<ValueType> schema, FrameBlock meta) throws DMLRuntimeException { Decoder decoder = null; @@ -64,9 +64,9 @@ public class DecoderFactory //create decoders 'recode', 'dummy' and 'pass-through' List<Integer> rcIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_RECODE))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_RECODE))); List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_DUMMYCODE))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs)); List<Integer> 
ptIDs = new ArrayList<Integer>(CollectionUtils .subtract(UtilFunctions.getSequenceList(1, schema.size(), 1), rcIDs)); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java index ae98bae..8adea7b 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java @@ -48,8 +48,8 @@ public class EncoderFactory * @return * @throws DMLRuntimeException */ - public static Encoder createEncoder(String spec, int clen, FrameBlock meta) throws DMLRuntimeException { - return createEncoder(spec, Collections.nCopies(clen, ValueType.STRING), meta); + public static Encoder createEncoder(String spec, List<String> colnames, int clen, FrameBlock meta) throws DMLRuntimeException { + return createEncoder(spec, colnames, Collections.nCopies(clen, ValueType.STRING), meta); } /** @@ -61,9 +61,9 @@ public class EncoderFactory * @return * @throws DMLRuntimeException */ - public static Encoder createEncoder(String spec, List<ValueType> schema, int clen, FrameBlock meta) throws DMLRuntimeException { + public static Encoder createEncoder(String spec, List<String> colnames, List<ValueType> schema, int clen, FrameBlock meta) throws DMLRuntimeException { List<ValueType> lschema = (schema==null) ? 
Collections.nCopies(clen, ValueType.STRING) : schema; - return createEncoder(spec, lschema, meta); + return createEncoder(spec, colnames, lschema, meta); } @@ -75,7 +75,7 @@ public class EncoderFactory * @throws DMLRuntimeException */ @SuppressWarnings("unchecked") - public static Encoder createEncoder(String spec, List<ValueType> schema, FrameBlock meta) + public static Encoder createEncoder(String spec, List<String> colnames, List<ValueType> schema, FrameBlock meta) throws DMLRuntimeException { Encoder encoder = null; @@ -89,21 +89,21 @@ public class EncoderFactory //prepare basic id lists (recode, dummycode, pass-through) //note: any dummycode column requires recode as preparation List<Integer> rcIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_RECODE))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_RECODE))); List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_DUMMYCODE))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs)); - List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec); + List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames); List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils.subtract( CollectionUtils.subtract(UtilFunctions.getSequenceList(1, clen, 1), rcIDs), binIDs)); List<Integer> oIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_OMIT))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_OMIT))); List<Integer> mvIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonObjectIDList(jSpec, TfUtils.TXMETHOD_IMPUTE))); + TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_IMPUTE))); //create individual encoders if( !rcIDs.isEmpty() ) { - RecodeAgent ra = new RecodeAgent(jSpec, clen); + RecodeAgent ra = new 
RecodeAgent(jSpec, colnames, clen); ra.setColList(ArrayUtils.toPrimitive(rcIDs.toArray(new Integer[0]))); lencoders.add(ra); } @@ -111,13 +111,13 @@ public class EncoderFactory lencoders.add(new EncoderPassThrough( ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen)); if( !dcIDs.isEmpty() ) - lencoders.add(new DummycodeAgent(jSpec, schema.size())); + lencoders.add(new DummycodeAgent(jSpec, colnames, schema.size())); if( !binIDs.isEmpty() ) - lencoders.add(new BinAgent(jSpec, schema.size(), true)); + lencoders.add(new BinAgent(jSpec, colnames, schema.size(), true)); if( !oIDs.isEmpty() ) - lencoders.add(new OmitAgent(jSpec, schema.size())); + lencoders.add(new OmitAgent(jSpec, colnames, schema.size())); if( !mvIDs.isEmpty() ) { - MVImputeAgent ma = new MVImputeAgent(jSpec, schema.size()); + MVImputeAgent ma = new MVImputeAgent(jSpec, colnames, schema.size()); ma.initRecodeIDList(rcIDs); lencoders.add(ma); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java index 82396f6..de883f3 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java +++ b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java @@ -57,23 +57,41 @@ public class TfMetaUtils * @return * @throws DMLRuntimeException */ - public static boolean containsOmitSpec(String spec) throws DMLRuntimeException { - return (TfMetaUtils.parseJsonIDList(spec, TfUtils.TXMETHOD_OMIT).length > 0); + public static boolean isIDSpecification(String spec) throws DMLRuntimeException { + try { + JSONObject jSpec = new JSONObject(spec); + return jSpec.containsKey("ids") && jSpec.getBoolean("ids"); + } + catch(JSONException ex) { + throw new 
DMLRuntimeException(ex); + } + } + + /** + * + * @param spec + * @param colnames + * @return + * @throws DMLRuntimeException + */ + public static boolean containsOmitSpec(String spec, List<String> colnames) throws DMLRuntimeException { + return (TfMetaUtils.parseJsonIDList(spec, colnames, TfUtils.TXMETHOD_OMIT).length > 0); } /** * * @param spec + * @param colnames * @param group * @return - * @throws JSONException + * @throws DMLRuntimeException */ - public static int[] parseJsonIDList(String spec, String group) + public static int[] parseJsonIDList(String spec, List<String> colnames, String group) throws DMLRuntimeException { try { JSONObject jSpec = new JSONObject(spec); - return parseJsonIDList(jSpec, group); + return parseJsonIDList(jSpec, colnames, group); } catch(JSONException ex) { throw new DMLRuntimeException(ex); @@ -82,29 +100,39 @@ public class TfMetaUtils /** * TODO consolidate external and internal json spec definitions - * - * @param parsedSpec + * + * @param spec + * @param colnames * @param group * @return - * @throws JSONException + * @throws JSONException */ - public static int[] parseJsonIDList(JSONObject spec, String group) + public static int[] parseJsonIDList(JSONObject spec, List<String> colnames, String group) throws JSONException { int[] colList = new int[0]; + boolean ids = spec.containsKey("ids") && spec.getBoolean("ids"); if( spec.containsKey(group) ) { //parse attribute-array or plain array of IDs JSONArray attrs = null; - if( spec.get(group) instanceof JSONObject ) + if( spec.get(group) instanceof JSONObject ) { attrs = (JSONArray) ((JSONObject)spec.get(group)).get(TfUtils.JSON_ATTRS); + ids = true; //file-based transform outputs ids w/o id tags + } else attrs = (JSONArray)spec.get(group); //construct ID list array colList = new int[attrs.size()]; - for(int i=0; i < colList.length; i++) - colList[i] = UtilFunctions.toInt(attrs.get(i)); + for(int i=0; i < colList.length; i++) { + colList[i] = ids ? 
UtilFunctions.toInt(attrs.get(i)) : + (colnames.indexOf(attrs.get(i)) + 1); + if( colList[i] <= 0 ) { + throw new RuntimeException("Specified column '" + + attrs.get(i)+"' does not exist."); + } + } //ensure ascending order of column IDs Arrays.sort(colList); @@ -120,10 +148,11 @@ public class TfMetaUtils * @return * @throws JSONException */ - public static int[] parseJsonObjectIDList(JSONObject spec, String group) + public static int[] parseJsonObjectIDList(JSONObject spec, List<String> colnames, String group) throws JSONException { int[] colList = new int[0]; + boolean ids = spec.containsKey("ids") && spec.getBoolean("ids"); if( spec.containsKey(group) && spec.get(group) instanceof JSONArray ) { @@ -131,7 +160,12 @@ public class TfMetaUtils colList = new int[colspecs.size()]; for(int j=0; j<colspecs.size(); j++) { JSONObject colspec = (JSONObject) colspecs.get(j); - colList[j] = colspec.getInt("id"); + colList[j] = ids ? colspec.getInt("id") : + (colnames.indexOf(colspec.get("name")) + 1); + if( colList[j] <= 0 ) { + throw new RuntimeException("Specified column '" + + colspec.get(ids?"id":"name")+"' does not exist."); + } } //ensure ascending order of column IDs @@ -188,8 +222,8 @@ public class TfMetaUtils } //get list of recode ids - List<Integer> recodeIDs = parseRecodeColIDs(spec); - List<Integer> binIDs = parseBinningColIDs(spec); + List<Integer> recodeIDs = parseRecodeColIDs(spec, colnames); + List<Integer> binIDs = parseBinningColIDs(spec, colnames); //create frame block from in-memory strings return convertToTransformMetaDataFrame(rows, colnames, recodeIDs, binIDs, meta, mvmeta); @@ -243,8 +277,8 @@ public class TfMetaUtils } //get list of recode ids - List<Integer> recodeIDs = parseRecodeColIDs(spec); - List<Integer> binIDs = parseBinningColIDs(spec); + List<Integer> recodeIDs = parseRecodeColIDs(spec, colnames); + List<Integer> binIDs = parseBinningColIDs(spec, colnames); //create frame block from in-memory strings return 
convertToTransformMetaDataFrame(rows, colnames, recodeIDs, binIDs, meta, mvmeta); @@ -327,7 +361,7 @@ public class TfMetaUtils * @throws IOException */ @SuppressWarnings("unchecked") - private static List<Integer> parseRecodeColIDs(String spec) + private static List<Integer> parseRecodeColIDs(String spec, List<String> colnames) throws IOException { if( spec == null ) @@ -339,9 +373,9 @@ public class TfMetaUtils //parse json transform specification for recode col ids JSONObject jSpec = new JSONObject(spec); List<Integer> rcIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_RECODE))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_RECODE))); List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_DUMMYCODE))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); specRecodeIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs)); } catch(Exception ex) { @@ -357,12 +391,12 @@ public class TfMetaUtils * @return * @throws IOException */ - public static List<Integer> parseBinningColIDs(String spec) + public static List<Integer> parseBinningColIDs(String spec, List<String> colnames) throws IOException { try { JSONObject jSpec = new JSONObject(spec); - return parseBinningColIDs(jSpec); + return parseBinningColIDs(jSpec, colnames); } catch(JSONException ex) { throw new IOException(ex); @@ -375,17 +409,17 @@ public class TfMetaUtils * @return * @throws IOException */ - public static List<Integer> parseBinningColIDs(JSONObject jSpec) + public static List<Integer> parseBinningColIDs(JSONObject jSpec, List<String> colnames) throws IOException { try { if( jSpec.containsKey(TfUtils.TXMETHOD_BIN) && jSpec.get(TfUtils.TXMETHOD_BIN) instanceof JSONArray ) { return Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonObjectIDList(jSpec, TfUtils.TXMETHOD_BIN))); + TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, 
TfUtils.TXMETHOD_BIN))); } else { //internally generates return Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, TfUtils.TXMETHOD_BIN))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN))); } } catch(JSONException ex) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java index 3fad934..b0eb3ce 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java @@ -179,7 +179,7 @@ public class FrameReadMetaTest extends AutomatedTestBase throws DMLRuntimeException { List<Integer> collist = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(spec, TfUtils.TXMETHOD_RECODE))); + TfMetaUtils.parseJsonIDList(spec, M.getColumnNames(), TfUtils.TXMETHOD_RECODE))); HashMap<String,Long>[] ret = new HashMap[M.getNumColumns()]; Iterator<Object[]> iter = M.getObjectRowIterator(); while( iter.hasNext() ) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java index b61060b..27d58f9 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java +++ 
b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java @@ -39,13 +39,18 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase //dataset and transform tasks without missing values private final static String DATASET1 = "homes3/homes.csv"; private final static String SPEC1 = "homes3/homes.tfspec_recode.json"; + private final static String SPEC1b = "homes3/homes.tfspec_recode2.json"; private final static String SPEC2 = "homes3/homes.tfspec_dummy.json"; + private final static String SPEC2b = "homes3/homes.tfspec_dummy2.json"; private final static String SPEC3 = "homes3/homes.tfspec_bin.json"; //incl recode + private final static String SPEC3b = "homes3/homes.tfspec_bin2.json"; //incl recode //dataset and transform tasks with missing values private final static String DATASET2 = "homes/homes.csv"; private final static String SPEC4 = "homes3/homes.tfspec_impute.json"; + private final static String SPEC4b = "homes3/homes.tfspec_impute2.json"; private final static String SPEC5 = "homes3/homes.tfspec_omit.json"; + private final static String SPEC5b = "homes3/homes.tfspec_omit2.json"; public enum TransformType { RECODE, @@ -62,62 +67,112 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase } @Test - public void testHomesRecodeSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE); + public void testHomesRecodeIDsSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE, false); } @Test - public void testHomesRecodeSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE); + public void testHomesRecodeIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE, false); } @Test - public void testHomesDummycodeSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.DUMMY); + public void testHomesDummycodeIDsSingleNodeCSV() { + 
runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.DUMMY, false); } @Test - public void testHomesDummycodeSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.DUMMY); + public void testHomesDummycodeIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.DUMMY, false); } @Test - public void testHomesBinningSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.BIN); + public void testHomesBinningIDsSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.BIN, false); } @Test - public void testHomesBinningSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.BIN); + public void testHomesBinningIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.BIN, false); } @Test - public void testHomesOmitSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.OMIT); + public void testHomesOmitIDsSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.OMIT, false); } @Test - public void testHomesOmitSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.OMIT); + public void testHomesOmitIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.OMIT, false); } @Test - public void testHomesImputeSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.IMPUTE); + public void testHomesImputeIDsSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.IMPUTE, false); } @Test - public void testHomesImputeSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.IMPUTE); + public void testHomesImputeIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.IMPUTE, false); } + @Test + public void testHomesRecodeColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", 
TransformType.RECODE, true); + } + + @Test + public void testHomesRecodeColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE, true); + } + + @Test + public void testHomesDummycodeColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.DUMMY, true); + } + + @Test + public void testHomesDummycodeColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.DUMMY, true); + } + + @Test + public void testHomesBinningColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.BIN, true); + } + + @Test + public void testHomesBinningColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.BIN, true); + } + + @Test + public void testHomesOmitColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.OMIT, true); + } + + @Test + public void testHomesOmitColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.OMIT, true); + } + + @Test + public void testHomesImputeColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.IMPUTE, true); + } + + @Test + public void testHomesImputeColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.IMPUTE, true); + } + /** * * @param rt * @param ofmt * @param dataset */ - private void runTransformTest( RUNTIME_PLATFORM rt, String ofmt, TransformType type ) + private void runTransformTest( RUNTIME_PLATFORM rt, String ofmt, TransformType type, boolean colnames ) { //set runtime platform RUNTIME_PLATFORM rtold = rtplatform; @@ -131,11 +186,11 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase //set transform specification String SPEC = null; String DATASET = null; switch( type ) { - case RECODE: SPEC = SPEC1; DATASET = DATASET1; break; - case DUMMY: SPEC = SPEC2; DATASET = DATASET1; break; - case BIN: SPEC = SPEC3; DATASET = 
DATASET1; break; - case IMPUTE: SPEC = SPEC4; DATASET = DATASET2; break; - case OMIT: SPEC = SPEC5; DATASET = DATASET2; break; + case RECODE: SPEC = colnames?SPEC1b:SPEC1; DATASET = DATASET1; break; + case DUMMY: SPEC = colnames?SPEC2b:SPEC2; DATASET = DATASET1; break; + case BIN: SPEC = colnames?SPEC3b:SPEC3; DATASET = DATASET1; break; + case IMPUTE: SPEC = colnames?SPEC4b:SPEC4; DATASET = DATASET2; break; + case OMIT: SPEC = colnames?SPEC5b:SPEC5; DATASET = DATASET2; break; } if( !ofmt.equals("csv") ) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeDecodeTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeDecodeTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeDecodeTest.java index 09f497e..0bdf4da 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeDecodeTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeDecodeTest.java @@ -42,7 +42,9 @@ public class TransformFrameEncodeDecodeTest extends AutomatedTestBase //dataset and transform tasks without missing values private final static String DATASET1 = "homes3/homes.csv"; private final static String SPEC1 = "homes3/homes.tfspec_recode.json"; + private final static String SPEC1b = "homes3/homes.tfspec_recode2.json"; private final static String SPEC2 = "homes3/homes.tfspec_dummy.json"; + private final static String SPEC2b = "homes3/homes.tfspec_dummy2.json"; public enum TransformType { RECODE, @@ -60,23 +62,43 @@ public class TransformFrameEncodeDecodeTest extends AutomatedTestBase } @Test - public void testHomesRecodeSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE); + public void 
testHomesRecodeIDsSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE, false); } @Test - public void testHomesRecodeSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE); + public void testHomesRecodeIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE, false); } @Test - public void testHomesDummycodeSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.DUMMY); + public void testHomesDummycodeIDsSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.DUMMY, false); } @Test - public void testHomesDummycodeSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.DUMMY); + public void testHomesDummycodeIDsSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.DUMMY, false); + } + + @Test + public void testHomesRecodeColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE, true); + } + + @Test + public void testHomesRecodeColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE, true); + } + + @Test + public void testHomesDummycodeColnamesSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.DUMMY, true); + } + + @Test + public void testHomesDummycodeColnamesSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.DUMMY, true); } /** @@ -85,7 +107,7 @@ public class TransformFrameEncodeDecodeTest extends AutomatedTestBase * @param ofmt * @param dataset */ - private void runTransformTest( RUNTIME_PLATFORM rt, String ofmt, TransformType type ) + private void runTransformTest( RUNTIME_PLATFORM rt, String ofmt, TransformType type, boolean colnames ) { //set runtime platform RUNTIME_PLATFORM rtold = rtplatform; @@ -99,8 +121,8 @@ public class TransformFrameEncodeDecodeTest extends AutomatedTestBase //set transform specification 
String SPEC = null; String DATASET = null; switch( type ) { - case RECODE: SPEC = SPEC1; DATASET = DATASET1; break; - case DUMMY: SPEC = SPEC2; DATASET = DATASET1; break; + case RECODE: SPEC = colnames?SPEC1b:SPEC1; DATASET = DATASET1; break; + case DUMMY: SPEC = colnames?SPEC2b:SPEC2; DATASET = DATASET1; break; default: throw new RuntimeException("Unsupported transform type for encode/decode test."); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_bin2.json ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_bin2.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_bin2.json new file mode 100644 index 0000000..a5a5375 --- /dev/null +++ b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_bin2.json @@ -0,0 +1,5 @@ +{ + "recode": [ "zipcode", "district", "view" ], "bin": [ + { "name": "saleprice" , "method": "equi-width", "numbins": 3 } + ,{ "name": "sqft", "method": "equi-width", "numbins": 4 }] + } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_dummy2.json ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_dummy2.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_dummy2.json new file mode 100644 index 0000000..917fab6 --- /dev/null +++ b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_dummy2.json @@ -0,0 +1,2 @@ +{ + "dummycode": [ "district", "view", "zipcode" ] } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_impute2.json ---------------------------------------------------------------------- diff 
--git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_impute2.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_impute2.json new file mode 100644 index 0000000..5fae74b --- /dev/null +++ b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_impute2.json @@ -0,0 +1,10 @@ +{ + "impute": [ + { "name": "zipcode", "method": "global_mode" } + ,{ "name": "district", "method": "constant", "value": "south" } + ,{ "name": "numbedrooms", "method": "constant", "value": "2" } + ,{ "name": "numbathrooms", "method": "constant", "value": "1" } + ,{ "name": "floors", "method": "constant", "value": "1" } + ,{ "name": "view", "method": "global_mode" } + ,{ "name": "askingprice", "method": "global_mean" } +], "recode": [ "district", "view" ] } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_omit2.json ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_omit2.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_omit2.json new file mode 100644 index 0000000..2114958 --- /dev/null +++ b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_omit2.json @@ -0,0 +1,2 @@ +{ + "omit": [ "zipcode","district","numbedrooms","numbathrooms","floors","view","saleprice","askingprice" ], "recode": [ "district", "view" ] } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/468e472a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode2.json ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode2.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode2.json new file mode 100644 index 0000000..d446628 --- /dev/null +++ 
b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode2.json @@ -0,0 +1,2 @@ +{ + "recode": [ "zipcode", "district", "view" ] } \ No newline at end of file
