Repository: incubator-systemml Updated Branches: refs/heads/master 638d213a0 -> e77205d9c
[SYSTEMML-1022][SYSTEMML-1035] Update Spark version and unit tests. This patch updates the Spark build version to 1.6.0 and fixes MLContextFrameTest with a change to FrameRDDConverterUtils. It also adds a cast to double for the ID column in MLContextTest, DataFrameVectorFrameConversionTest, and DataFrameVectorScriptTest. Closes #269. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/e77205d9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/e77205d9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/e77205d9 Branch: refs/heads/master Commit: e77205d9c3eee943f32a34dadd8014696a40df63 Parents: 638d213 Author: Glenn Weidner <gweid...@us.ibm.com> Authored: Tue Nov 15 12:35:36 2016 -0800 Committer: Glenn Weidner <gweid...@us.ibm.com> Committed: Tue Nov 15 12:35:36 2016 -0800 ---------------------------------------------------------------------- pom.xml | 2 +- .../spark/utils/FrameRDDConverterUtils.java | 9 +++++ .../DataFrameVectorFrameConversionTest.java | 2 +- .../mlcontext/DataFrameVectorScriptTest.java | 2 +- .../mlcontext/MLContextFrameTest.java | 37 +++++++++++--------- .../integration/mlcontext/MLContextTest.java | 20 +++++------ 6 files changed, 42 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e77205d9/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 733b866..7844944 100644 --- a/pom.xml +++ b/pom.xml @@ -65,7 +65,7 @@ <properties> <hadoop.version>2.4.1</hadoop.version> <antlr.version>4.5.3</antlr.version> - <spark.version>1.4.1</spark.version> + <spark.version>1.6.0</spark.version> <scala.version>2.10.5</scala.version> <scala.binary.version>2.10</scala.binary.version> <scala.test.version>2.2.6</scala.test.version> 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e77205d9/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java index f89117b..b7d1c3d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java @@ -508,6 +508,15 @@ public class FrameRDDConverterUtils JavaRDD<String> dataRdd = sc.textFile(fnameIn); return dataRdd.map(new RowGenerator(schema, delim)); } + + /* + * It will return JavaRDD<Row> based on csv data. + */ + public static JavaRDD<Row> csvToRowRDD(JavaSparkContext sc, JavaRDD<String> dataRdd, String delim, ValueType[] schema) + { + // Convert each line to a java rdd. + return dataRdd.map(new RowGenerator(schema, delim)); + } /* * Row Generator class based on individual line in CSV file. 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e77205d9/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java index 20287c7..8e5933c 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java @@ -315,7 +315,7 @@ public class DataFrameVectorFrameConversionTest extends AutomatedTestBase for( int i=0; i<mb.getNumRows(); i++ ) { Object[] row = new Object[clen]; if( containsID ) - row[0] = i+1; + row[0] = (double)i+1; for( int j=0, j2=0; j<mb.getNumColumns(); j++, j2++ ) { if( schema[j2] != ValueType.OBJECT ) { row[j2+off] = UtilFunctions http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e77205d9/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java index 6720744..2a53fb4 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java @@ -336,7 +336,7 @@ public class DataFrameVectorScriptTest extends AutomatedTestBase for( int i=0; i<mb.getNumRows(); i++ ) { Object[] row = new Object[clen]; if( containsID ) - row[0] = i+1; + row[0] = (double)i+1; 
for( int j=0, j2=0; j<mb.getNumColumns(); j++, j2++ ) { if( schema[j2] != ValueType.OBJECT ) { row[j2+off] = UtilFunctions http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e77205d9/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextFrameTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextFrameTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextFrameTest.java index b536034..4dc2028 100644 --- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextFrameTest.java +++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextFrameTest.java @@ -52,7 +52,8 @@ import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils; import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils; import org.apache.sysml.test.integration.AutomatedTestBase; -import org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToRow; +import org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow; + import org.junit.After; import org.junit.AfterClass; import org.junit.Assert; @@ -76,6 +77,7 @@ public class MLContextFrameTest extends AutomatedTestBase { private static SparkConf conf; private static JavaSparkContext sc; private static MLContext ml; + private static String CSV_DELIM = ","; @BeforeClass public static void setUpClass() { @@ -233,8 +235,8 @@ public class MLContextFrameTest extends AutomatedTestBase { JavaRDD<String> javaRDDB = sc.parallelize(listB); if (inputType == IO_TYPE.DATAFRAME) { - JavaRDD<Row> javaRddRowA = javaRDDA.map(new MLContextTest.CommaSeparatedValueStringToRow()); - JavaRDD<Row> javaRddRowB = javaRDDB.map(new MLContextTest.CommaSeparatedValueStringToRow()); + JavaRDD<Row> javaRddRowA = 
FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDA, CSV_DELIM, schemaA); + JavaRDD<Row> javaRddRowB = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDB, CSV_DELIM, schemaB); // Create DataFrame SQLContext sqlContext = new SQLContext(sc); @@ -401,16 +403,16 @@ public class MLContextFrameTest extends AutomatedTestBase { List<Row> listAOut = dataFrameA.collectAsList(); Row row1 = listAOut.get(0); - Assert.assertEquals("Mistmatch with expected value", Long.valueOf(1), row1.get(0)); - Assert.assertEquals("Mistmatch with expected value", "Str2", row1.get(1)); - Assert.assertEquals("Mistmatch with expected value", 3.0, row1.get(2)); - Assert.assertEquals("Mistmatch with expected value", true, row1.get(3)); + Assert.assertEquals("Mismatch with expected value", Long.valueOf(1), row1.get(0)); + Assert.assertEquals("Mismatch with expected value", "Str2", row1.get(1)); + Assert.assertEquals("Mismatch with expected value", 3.0, row1.get(2)); + Assert.assertEquals("Mismatch with expected value", true, row1.get(3)); Row row2 = listAOut.get(1); - Assert.assertEquals("Mistmatch with expected value", Long.valueOf(4), row2.get(0)); - Assert.assertEquals("Mistmatch with expected value", "Str12", row2.get(1)); - Assert.assertEquals("Mistmatch with expected value", 13.0, row2.get(2)); - Assert.assertEquals("Mistmatch with expected value", true, row2.get(3)); + Assert.assertEquals("Mismatch with expected value", Long.valueOf(4), row2.get(0)); + Assert.assertEquals("Mismatch with expected value", "Str12", row2.get(1)); + Assert.assertEquals("Mismatch with expected value", 13.0, row2.get(2)); + Assert.assertEquals("Mismatch with expected value", true, row2.get(3)); DataFrame dataFrameC = mlResults.getDataFrame("C").drop(RDDConverterUtils.DF_ID_COLUMN); StructType dfschemaC = dataFrameC.schema(); @@ -422,12 +424,12 @@ public class MLContextFrameTest extends AutomatedTestBase { List<Row> listCOut = dataFrameC.collectAsList(); Row row3 = listCOut.get(0); - Assert.assertEquals("Mistmatch with 
expected value", "Str12", row3.get(0)); - Assert.assertEquals("Mistmatch with expected value", 13.0, row3.get(1)); + Assert.assertEquals("Mismatch with expected value", "Str12", row3.get(0)); + Assert.assertEquals("Mismatch with expected value", 13.0, row3.get(1)); Row row4 = listCOut.get(1); - Assert.assertEquals("Mistmatch with expected value", "Str25", row4.get(0)); - Assert.assertEquals("Mistmatch with expected value", 26.0, row4.get(1)); + Assert.assertEquals("Mismatch with expected value", "Str25", row4.get(0)); + Assert.assertEquals("Mismatch with expected value", 26.0, row4.get(1)); } else { String[][] frameA = mlResults.getFrameAs2DStringArray("A"); Assert.assertEquals("Str2", frameA[0][1]); @@ -481,14 +483,15 @@ public class MLContextFrameTest extends AutomatedTestBase { dataA.add("Test2,5.0"); dataA.add("Test3,6.0"); JavaRDD<String> javaRddStringA = sc.parallelize(dataA); + ValueType[] schema = { ValueType.STRING, ValueType.DOUBLE }; List<String> dataB = new ArrayList<String>(); dataB.add("1.0"); dataB.add("2.0"); JavaRDD<String> javaRddStringB = sc.parallelize(dataB); - JavaRDD<Row> javaRddRowA = javaRddStringA.map(new CommaSeparatedValueStringToRow()); - JavaRDD<Row> javaRddRowB = javaRddStringB.map(new CommaSeparatedValueStringToRow()); + JavaRDD<Row> javaRddRowA = FrameRDDConverterUtils.csvToRowRDD(sc, javaRddStringA, CSV_DELIM, schema); + JavaRDD<Row> javaRddRowB = javaRddStringB.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e77205d9/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java index acf888a..dd343d4 100644 --- 
a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java +++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java @@ -567,7 +567,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true)); @@ -594,7 +594,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true)); @@ -621,7 +621,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + 
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true)); @@ -648,7 +648,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true)); @@ -675,7 +675,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", new VectorUDT(), true)); StructType schema = DataTypes.createStructType(fields); DataFrame dataFrame = sqlContext.createDataFrame(javaRddRow, schema); @@ -700,7 +700,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, 
DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", new VectorUDT(), true)); StructType schema = DataTypes.createStructType(fields); DataFrame dataFrame = sqlContext.createDataFrame(javaRddRow, schema); @@ -2119,7 +2119,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true)); @@ -2144,7 +2144,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true)); @@ -2169,7 +2169,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - 
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", new VectorUDT(), true)); StructType schema = DataTypes.createStructType(fields); DataFrame dataFrame = sqlContext.createDataFrame(javaRddRow, schema); @@ -2192,7 +2192,7 @@ public class MLContextTest extends AutomatedTestBase { JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow()); SQLContext sqlContext = new SQLContext(sc); List<StructField> fields = new ArrayList<StructField>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); fields.add(DataTypes.createStructField("C1", new VectorUDT(), true)); StructType schema = DataTypes.createStructType(fields); DataFrame dataFrame = sqlContext.createDataFrame(javaRddRow, schema);