Repository: incubator-systemml Updated Branches: refs/heads/master 1b4f1ec4d -> ca4e2600e
[SYSTEMML-1244] Fix robustness csv text read (quoted recoded maps) Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c87da2ce Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c87da2ce Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c87da2ce Branch: refs/heads/master Commit: c87da2ce8ffe4ab6a03fce4cd548703147f12fca Parents: 1b4f1ec Author: Matthias Boehm <[email protected]> Authored: Sat Feb 11 00:51:00 2017 +0100 Committer: Matthias Boehm <[email protected]> Committed: Sat Feb 11 19:32:59 2017 +0100 ---------------------------------------------------------------------- .../sysml/runtime/io/IOUtilFunctions.java | 46 +++++-- .../transform/FrameCSVReadWriteTest.java | 119 +++++++++++++++++ .../TransformCSVFrameEncodeReadTest.java | 130 +++++++++++++++++++ .../functions/transform/FrameCSVReadWrite.dml | 27 ++++ .../transform/TransformCSVFrameEncodeRead.dml | 29 +++++ .../functions/misc/ZPackageSuite.java | 3 +- .../functions/transform/ZPackageSuite.java | 2 + 7 files changed, 343 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java b/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java index 492665d..3f0ea56 100644 --- a/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java @@ -117,13 +117,12 @@ public class IOUtilFunctions /** * Splits a string by a specified delimiter into all tokens, including empty - * while respecting the rules for quotes and escapes defined in RFC4180. - * - * NOTE: use StringEscapeUtils.unescapeCsv(tmp) if needed afterwards. + * while respecting the rules for quotes and escapes defined in RFC4180, + * with robustness for various special cases. * * @param str string to split * @param delim delimiter - * @return string array + * @return string array of tokens */ public static String[] splitCSV(String str, String delim) { @@ -135,6 +134,7 @@ public class IOUtilFunctions ArrayList<String> tokens = new ArrayList<String>(); int from = 0, to = 0; int len = str.length(); + int dlen = delim.length(); while( from < len ) { // for all tokens if( str.charAt(from) == CSV_QUOTE_CHAR && str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) { @@ -143,8 +143,11 @@ public class IOUtilFunctions while( to+1 < len && str.charAt(to+1)==CSV_QUOTE_CHAR ) to = str.indexOf(CSV_QUOTE_CHAR, to+2); // to + "" to += 1; // last " + // handle remaining non-quoted characters "aa"a + if( to<len-1 && !str.regionMatches(to, delim, 0, dlen) ) + to = str.indexOf(delim, to+1); } - else if(str.regionMatches(from, delim, 0, delim.length())) { + else if( str.regionMatches(from, delim, 0, dlen) ) { to = from; // empty string } else { // default: unquoted non-empty @@ -165,6 +168,16 @@ public class IOUtilFunctions return tokens.toArray(new String[0]); } + /** + * Splits a string by a specified delimiter into all tokens, including empty + * while respecting the rules for quotes and escapes defined in RFC4180, + * with robustness for various special cases. + * + * @param str string to split + * @param delim delimiter + * @param string array for tokens, length needs to match the number of tokens + * @return string array of tokens + */ public static String[] splitCSV(String str, String delim, String[] tokens) { // check for empty input @@ -174,6 +187,7 @@ public class IOUtilFunctions // scan string and create individual tokens int from = 0, to = 0; int len = str.length(); + int dlen = delim.length(); int pos = 0; while( from < len ) { // for all tokens if( str.charAt(from) == CSV_QUOTE_CHAR @@ -183,8 +197,11 @@ public class IOUtilFunctions while( to+1 < len && str.charAt(to+1)==CSV_QUOTE_CHAR ) to = str.indexOf(CSV_QUOTE_CHAR, to+2); // to + "" to += 1; // last " + // handle remaining non-quoted characters "aa"a + if( to<len-1 && !str.regionMatches(to, delim, 0, dlen) ) + to = str.indexOf(delim, to+1); } - else if(str.regionMatches(from, delim, 0, delim.length())) { + else if( str.regionMatches(from, delim, 0, dlen) ) { to = from; // empty string } else { // default: unquoted non-empty @@ -207,9 +224,10 @@ public class IOUtilFunctions /** * Counts the number of tokens defined by the given delimiter, respecting - * the rules for quotes and escapes defined in RFC4180. + * the rules for quotes and escapes defined in RFC4180, + * with robustness for various special cases. * - * @param str string + * @param str string to split * @param delim delimiter * @return number of tokens split by the given delimiter */ @@ -223,6 +241,7 @@ public class IOUtilFunctions int numTokens = 0; int from = 0, to = 0; int len = str.length(); + int dlen = delim.length(); while( from < len ) { // for all tokens if( str.charAt(from) == CSV_QUOTE_CHAR && str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) { @@ -231,8 +250,11 @@ public class IOUtilFunctions while( to+1 < len && str.charAt(to+1)==CSV_QUOTE_CHAR ) to = str.indexOf(CSV_QUOTE_CHAR, to+2); // to + "" to += 1; // last " + // handle remaining non-quoted characters "aa"a + if( to<len-1 && !str.regionMatches(to, delim, 0, dlen) ) + to = str.indexOf(delim, to+1); } - else if(str.regionMatches(from, delim, 0, delim.length())) { + else if( str.regionMatches(from, delim, 0, dlen) ) { to = from; // empty string } else { // default: unquoted non-empty @@ -366,11 +388,11 @@ public class IOUtilFunctions informat.getRecordReader(splits[i], job, Reporter.NULL); try { if( reader.next(key, value) ) { - String row = value.toString().trim(); - if( row.startsWith(TfUtils.TXMTD_MVPREFIX) ) + if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) ) reader.next(key, value); - if( row.startsWith(TfUtils.TXMTD_NDPREFIX) ) + if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) ) reader.next(key, value); + String row = value.toString().trim(); if( !row.isEmpty() ) ncol = IOUtilFunctions.countTokensCSV(row, delim); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/test/java/org/apache/sysml/test/integration/functions/transform/FrameCSVReadWriteTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/FrameCSVReadWriteTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/FrameCSVReadWriteTest.java new file mode 100644 index 0000000..35078f3 --- /dev/null +++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/FrameCSVReadWriteTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.test.integration.functions.transform; + +import org.junit.Test; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.io.FrameReader; +import org.apache.sysml.runtime.io.FrameReaderFactory; +import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties; +import org.apache.sysml.runtime.matrix.data.FrameBlock; +import org.apache.sysml.runtime.matrix.data.InputInfo; +import org.apache.sysml.runtime.util.DataConverter; +import org.apache.sysml.test.integration.AutomatedTestBase; +import org.apache.sysml.test.integration.TestConfiguration; +import org.apache.sysml.test.utils.TestUtils; + +public class FrameCSVReadWriteTest extends AutomatedTestBase +{ + private final static String TEST_NAME1 = "FrameCSVReadWrite"; + private final static String TEST_DIR = "functions/transform/"; + private final static String TEST_CLASS_DIR = TEST_DIR + FrameCSVReadWriteTest.class.getSimpleName() + "/"; + + //dataset and transform tasks without missing values + private final static String DATASET = "csv_mix/quotes1.csv"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, + new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "R" }) ); + } + + @Test + public void testCSVReadWriteSinglenode() { + runCSVQuotesReadWriteTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv"); + } + + @Test + public void testCSVReadWriteHybrid() { + runCSVQuotesReadWriteTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv"); + } + + @Test + public void testCSVReadWriteSpark() { + runCSVQuotesReadWriteTest(RUNTIME_PLATFORM.SPARK, "csv"); + } + + + /** + * + * @param rt + * @param ofmt + * @param dataset + */ + private void runCSVQuotesReadWriteTest( RUNTIME_PLATFORM rt, String ofmt ) + { + //set runtime platform + RUNTIME_PLATFORM rtold = rtplatform; + boolean csvReblockOld = OptimizerUtils.ALLOW_FRAME_CSV_REBLOCK; + rtplatform = rt; + + boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG; + if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) + DMLScript.USE_LOCAL_SPARK_CONFIG = true; + + if( !ofmt.equals("csv") ) + throw new RuntimeException("Unsupported test output format"); + + try + { + getAndLoadTestConfiguration(TEST_NAME1); + + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME1 + ".dml"; + programArgs = new String[]{"-explain","-args", + HOME + "input/" + DATASET, output("R") }; + + OptimizerUtils.ALLOW_FRAME_CSV_REBLOCK = true; + runTest(true, false, null, -1); + + //read input/output and compare + FrameReader reader1 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, + new CSVFileFormatProperties(false, ",", false)); + FrameBlock fb1 = reader1.readFrameFromHDFS(HOME + "input/" + DATASET, -1L, -1L); + FrameReader reader2 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo); + FrameBlock fb2 = reader2.readFrameFromHDFS(output("R"), -1L, -1L); + String[][] R1 = DataConverter.convertToStringFrame(fb1); + String[][] R2 = DataConverter.convertToStringFrame(fb2); + TestUtils.compareFrames(R1, R2, R1.length, R1[0].length); + } + catch(Exception ex) { + throw new RuntimeException(ex); + } + finally { + rtplatform = rtold; + DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; + OptimizerUtils.ALLOW_FRAME_CSV_REBLOCK = csvReblockOld; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformCSVFrameEncodeReadTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformCSVFrameEncodeReadTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformCSVFrameEncodeReadTest.java new file mode 100644 index 0000000..b28c2df --- /dev/null +++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformCSVFrameEncodeReadTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.test.integration.functions.transform; + +import org.junit.Test; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.io.FrameReader; +import org.apache.sysml.runtime.io.FrameReaderTextCSV; +import org.apache.sysml.runtime.io.FrameReaderTextCSVParallel; +import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties; +import org.apache.sysml.runtime.matrix.data.FrameBlock; +import org.apache.sysml.runtime.util.DataConverter; +import org.apache.sysml.test.integration.AutomatedTestBase; +import org.apache.sysml.test.integration.TestConfiguration; +import org.apache.sysml.test.utils.TestUtils; + +public class TransformCSVFrameEncodeReadTest extends AutomatedTestBase +{ + private final static String TEST_NAME1 = "TransformCSVFrameEncodeRead"; + private final static String TEST_DIR = "functions/transform/"; + private final static String TEST_CLASS_DIR = TEST_DIR + TransformCSVFrameEncodeReadTest.class.getSimpleName() + "/"; + + //dataset and transform tasks without missing values + private final static String DATASET = "csv_mix/quotes1.csv"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, + new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "R" }) ); + } + + @Test + public void testFrameReadMetaSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", false); + } + + @Test + public void testFrameReadMetaSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", false); + } + + @Test + public void testFrameReadMetaHybridCSV() { + runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", false); + } + + @Test + public void testFrameParReadMetaSingleNodeCSV() { + runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", true); + } + + @Test + public void testFrameParReadMetaSparkCSV() { + runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", true); + } + + @Test + public void testFrameParReadMetaHybridCSV() { + runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true); + } + + /** + * + * @param rt + * @param ofmt + * @param dataset + */ + private void runTransformTest( RUNTIME_PLATFORM rt, String ofmt, boolean parRead ) + { + //set runtime platform + RUNTIME_PLATFORM rtold = rtplatform; + boolean csvReblockOld = OptimizerUtils.ALLOW_FRAME_CSV_REBLOCK; + rtplatform = rt; + + boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG; + if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) + DMLScript.USE_LOCAL_SPARK_CONFIG = true; + + if( !ofmt.equals("csv") ) + throw new RuntimeException("Unsupported test output format"); + + try + { + getAndLoadTestConfiguration(TEST_NAME1); + + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME1 + ".dml"; + programArgs = new String[]{"-explain", "-stats","-args", + HOME + "input/" + DATASET, output("R") }; + + OptimizerUtils.ALLOW_FRAME_CSV_REBLOCK = true; + runTest(true, false, null, -1); + + //read input/output and compare + FrameReader reader2 = parRead ? + new FrameReaderTextCSVParallel( new CSVFileFormatProperties() ) : + new FrameReaderTextCSV( new CSVFileFormatProperties() ); + FrameBlock fb2 = reader2.readFrameFromHDFS(output("R"), -1L, -1L); + System.out.println(DataConverter.toString(fb2)); + } + catch(Exception ex) { + throw new RuntimeException(ex); + } + finally { + rtplatform = rtold; + DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; + OptimizerUtils.ALLOW_FRAME_CSV_REBLOCK = csvReblockOld; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/test/scripts/functions/transform/FrameCSVReadWrite.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/FrameCSVReadWrite.dml b/src/test/scripts/functions/transform/FrameCSVReadWrite.dml new file mode 100644 index 0000000..88f0cf5 --- /dev/null +++ b/src/test/scripts/functions/transform/FrameCSVReadWrite.dml @@ -0,0 +1,27 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($1, data_type="frame", format="csv"); +if(1==1){} + +print(toString(X)); +write(X, $2, format="csv"); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/test/scripts/functions/transform/TransformCSVFrameEncodeRead.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/TransformCSVFrameEncodeRead.dml b/src/test/scripts/functions/transform/TransformCSVFrameEncodeRead.dml new file mode 100644 index 0000000..9da935f --- /dev/null +++ b/src/test/scripts/functions/transform/TransformCSVFrameEncodeRead.dml @@ -0,0 +1,29 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +F1 = read($1, data_type="frame", format="csv"); +jspec = "{\"ids\": true, \"recode\": [1,2,3]}"; + +[X, M] = transformencode(target=F1, spec=jspec); + +print(toString(M)) +write(M, $2, format="csv"); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java ---------------------------------------------------------------------- diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java index 6c40dd7..32b5f7b 100644 --- a/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java +++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java @@ -47,9 +47,10 @@ import org.junit.runners.Suite; PrintMatrixTest.class, ReadAfterWriteTest.class, RewriteFusedRandTest.class, + RewriteLoopVectorization.class, + RewritePushdownSumBinaryMult.class, RewritePushdownSumOnBinaryTest.class, RewritePushdownUaggTest.class, - RewritePushdownSumBinaryMult.class, RewriteSimplifyRowColSumMVMultTest.class, RewriteSlicedMatrixMultTest.class, RewriteFuseBinaryOpChainTest.class, http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c87da2ce/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java ---------------------------------------------------------------------- diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java index 1996b01..e36d4a0 100644 --- a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java +++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java @@ -26,10 +26,12 @@ import org.junit.runners.Suite; * won't run two of them at once. */ @RunWith(Suite.class) @Suite.SuiteClasses({ + FrameCSVReadWriteTest.class, RunTest.class, ScalingTest.class, TransformAndApplyTest.class, TransformCSVFrameEncodeDecodeTest.class, + TransformCSVFrameEncodeReadTest.class, TransformEncodeDecodeTest.class, TransformFrameApplyTest.class, TransformFrameEncodeApplyTest.class,
