[SYSTEMML-1850] Fix robustness of transformdecode w/ delimiter tokens. This patch makes the construction and splitting of recode map entries consistent and robust for special tokens that include the delimiter (i.e., Lop.DATATYPE_PREFIX) itself.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4c3eab89 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4c3eab89 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4c3eab89 Branch: refs/heads/master Commit: 4c3eab89f32355b2d5d898db4411d9cde6eb8c08 Parents: 585afa2 Author: Matthias Boehm <[email protected]> Authored: Thu Aug 17 20:46:50 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Fri Aug 18 14:15:42 2017 -0700 ---------------------------------------------------------------------- .../sysml/runtime/matrix/data/FrameBlock.java | 9 ++------- .../runtime/transform/decode/DecoderRecode.java | 6 +++--- .../runtime/transform/encode/EncoderRecode.java | 18 ++++++++++++++++-- 3 files changed, 21 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/4c3eab89/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index 45ad26c..b3dc311 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -35,11 +35,11 @@ import java.util.Map; import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.io.Writable; -import org.apache.sysml.lops.Lop; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.CacheBlock; import org.apache.sysml.runtime.io.IOUtilFunctions; +import org.apache.sysml.runtime.transform.encode.EncoderRecode; import org.apache.sysml.runtime.util.IndexRange; import org.apache.sysml.runtime.util.UtilFunctions; @@ 
-1049,12 +1049,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable for( int i=0; i<getNumRows(); i++ ) { Object val = ldata.get(i); if( val != null ) { - // Instead of using splitCSV which is forcing string with RFC-4180 format, - // using Lop.DATATYPE_PREFIX separator to split token and code - String[] tmp = new String[2]; - int pos = val.toString().lastIndexOf(Lop.DATATYPE_PREFIX); - tmp[0] = val.toString().substring(0, pos); - tmp[1] = val.toString().substring(pos+1); + String[] tmp = EncoderRecode.splitRecodeMapEntry(val.toString()); map.put(tmp[0], Long.parseLong(tmp[1])); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/4c3eab89/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderRecode.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderRecode.java index c53704b..f6e8471 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderRecode.java +++ b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderRecode.java @@ -21,12 +21,12 @@ package org.apache.sysml.runtime.transform.decode; import java.util.HashMap; -import org.apache.sysml.lops.Lop; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.transform.TfUtils; +import org.apache.sysml.runtime.transform.encode.EncoderRecode; import org.apache.sysml.runtime.util.UtilFunctions; /** @@ -82,9 +82,9 @@ public class DecoderRecode extends Decoder for( int i=0; i<meta.getNumRows(); i++ ) { if( meta.get(i, _colList[j]-1)==null ) break; //reached end of recode map - String[] tmp = meta.get(i, _colList[j]-1).toString().split(Lop.DATATYPE_PREFIX); + String[] tmp = 
EncoderRecode.splitRecodeMapEntry(meta.get(i, _colList[j]-1).toString()); Object obj = UtilFunctions.stringToObject(_schema[_colList[j]-1], tmp[0]); - map.put(Long.parseLong(tmp[1]), obj); + map.put(Long.parseLong(tmp[1]), obj); } _rcMaps[j] = map; } http://git-wip-us.apache.org/repos/asf/systemml/blob/4c3eab89/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java index 526d31e..3acf640 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java @@ -188,11 +188,25 @@ public class EncoderRecode extends Encoder /** * Returns the Recode map entry which consists of concatenation of code, delimiter and token. + * * @param token is part of Recode map - * @param code is code for token - * @return the concatenation of code and token with delimiter in between + * @param code is code for token + * @return the concatenation of token and code with delimiter in between */ public static String constructRecodeMapEntry(String token, Long code) { return token + Lop.DATATYPE_PREFIX + code.toString(); } + + /** + * Splits a Recode map entry into its token and code. + * + * @param value concatenation of token and code with delimiter in between + * @return string array of token and code + */ + public static String[] splitRecodeMapEntry(String value) { + // Instead of using splitCSV which is forcing string with RFC-4180 format, + // using Lop.DATATYPE_PREFIX separator to split token and code + int pos = value.toString().lastIndexOf(Lop.DATATYPE_PREFIX); + return new String[] {value.substring(0, pos), value.substring(pos+1)}; + } }
