[jira] [Updated] (SPARK-19449) Inconsistent results between ml package RandomForestClassificationModel and mllib package RandomForestModel

2017-02-03 Thread Aseem Bansal (JIRA)

 [ https://issues.apache.org/jira/browse/SPARK-19449?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Aseem Bansal updated SPARK-19449:
---------------------------------
Description: 
I worked on some code to convert an ml package RandomForestClassificationModel 
into an mllib package RandomForestModel. This was needed because we have to make 
predictions on the order of milliseconds. I found that the results of the two 
models are inconsistent even though the underlying DecisionTreeModels are exactly 
the same, so the two implementations behave differently when they should not.
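
A plausible source of the discrepancy (an assumption on my part, not something 
confirmed against the Spark source) is that the two packages aggregate the 
per-tree predictions differently: mllib's RandomForestModel takes a majority 
vote over each tree's hard prediction, while ml's RandomForestClassificationModel 
sums each tree's normalized leaf class counts and takes the argmax. The 
standalone sketch below shows the two schemes; the names are illustrative, not 
Spark APIs:

{noformat}
// Illustrative sketch only: `treeVotes` holds each tree's hard prediction and
// `treeProbs` each tree's normalized leaf class counts for one input vector.
static double hardVote(double[] treeVotes, int numClasses) {
    double[] counts = new double[numClasses];
    for (double v : treeVotes) counts[(int) v]++;   // one vote per tree
    return argmax(counts);                          // mllib-style majority vote
}

static double softVote(double[][] treeProbs, int numClasses) {
    double[] sums = new double[numClasses];
    for (double[] p : treeProbs)                    // sum class distributions
        for (int c = 0; c < numClasses; c++) sums[c] += p[c];
    return argmax(sums);                            // ml-style probability averaging
}

static double argmax(double[] xs) {
    int best = 0;
    for (int i = 1; i < xs.length; i++) if (xs[i] > xs[best]) best = i;
    return best;
}
{noformat}

With identical trees the two schemes can still disagree whenever a minority of 
trees predict a class with high confidence.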

The code below reproduces the issue. It can be run as a simple Java app as long 
as the Spark dependencies are set up properly.

{noformat}
import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.classification.*;
import org.apache.spark.ml.linalg.*;
import org.apache.spark.ml.regression.RandomForestRegressionModel;
import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.tree.configuration.Algo;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Enumeration;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

abstract class Predictor {
    abstract double predict(Vector vector);
}

public class MainConvertModels {

    public static final int seed = 42;

    public static void main(String[] args) {

        int numRows = 1000;
        int numFeatures = 3;
        int numClasses = 2;

        double trainFraction = 0.8;
        double testFraction = 0.2;

        SparkSession spark = SparkSession.builder()
                .appName("conversion app")
                .master("local")
                .getOrCreate();

        //Dataset<Row> data = getData(spark, "libsvm", "/opt/spark2/data/mllib/sample_libsvm_data.txt");
        Dataset<Row> data = getDummyData(spark, numRows, numFeatures, numClasses);

        Dataset<Row>[] splits = data.randomSplit(new double[]{trainFraction, testFraction}, seed);
        Dataset<Row> trainingData = splits[0];
        Dataset<Row> testData = splits[1];
        testData.cache();

        List<Double> labels = getLabels(testData);
        List<Vector> features = getFeatures(testData);

        DecisionTreeClassifier classifier1 = new DecisionTreeClassifier();
        DecisionTreeClassificationModel model1 = classifier1.fit(trainingData);
        final DecisionTreeModel convertedModel1 = convertDecisionTreeModel(model1, Algo.Classification());

        RandomForestClassifier classifier = new RandomForestClassifier();
        RandomForestClassificationModel model2 = classifier.fit(trainingData);
        final RandomForestModel convertedModel2 = convertRandomForestModel(model2);

        LogisticRegression lr = new LogisticRegression();
        LogisticRegressionModel model3 = lr.fit(trainingData);
        final org.apache.spark.mllib.classification.LogisticRegressionModel convertedModel3 =
                convertLogisticRegressionModel(model3);

        System.out.println(
                "** DecisionTreeClassifier\n" +
                "** Original **" + getInfo(model1, testData) + "\n" +
                "** New  **" + getInfo(new Predictor() {
                    double predict(Vector vector) {return convertedModel1.predict(vector);}
                }, labels, features) + "\n" +

                "\n" +

                "** RandomForestClassifier\n" +
                "** Original **" + getInfo(model2, testData) + "\n" +
                "** New  **" + getInfo(new Predictor() {
                    double predict(Vector vector) {return convertedModel2.predict(vector);}
                }, labels, features) + "\n" +

                "\n" +

                "** LogisticRegression\n" +
                "** Original **" + getInfo(model3, testData) + "\n" +
                "** New  **" + getInfo(new Predictor() {
                    double predict(Vector vector) {return convertedModel3.predict(vector);}
                }, labels, features) + "\n" +

                "");
    }

    static Dataset<Row> getData(SparkSession spark, String format, String location) {
        return spark.read()
                .format(format)
                .load(location);
    }

    static Dataset<Row> getDummyData(SparkSession spark, int numberRows, int numberFeatures, int labelUpperBound) {

        StructType schema = new StructType(new StructField[]{
                new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                new StructField("features", new VectorUDT(), false, Metadata.empty())
        });

        double[][] vectors = prepareData(numberRows, numberFeatures);

        Random random = new Random(seed);
        List<Row> dataTest = new ArrayList<>();
        for (double[] vector : vectors) {
            double label = (double) random.nextInt(2);
            dataTest.add(RowFactory.create(label, Vectors.dense(vector)));
        }

        return spark.createDataFrame(dataTest, schema);
    }

    static double[][] prepareData(int numRows, int numFeatures) {

        Random random = new Random(seed);

        double[][] result = new double[numRows][numFeatures];

        for (int row = 0; row < numRows; row++) {
            for (int feature = 0; feature < numFeatures; feature++) {
                result[row][feature] = random.nextDouble();
            }
        }

        return result;
    }

    static
{noformat}
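The message is truncated here, before the helper methods the program calls 
(getLabels, getFeatures, getInfo, and the convert* helpers). As a hedged sketch 
only, assuming each tree is converted individually by the (truncated) 
convertDecisionTreeModel helper, the forest wrapper could be built with mllib's 
public RandomForestModel(algo, trees) constructor:

{noformat}
// Hedged sketch, not the reporter's original helper: assumes
// convertDecisionTreeModel maps one ml tree to an mllib DecisionTreeModel.
static RandomForestModel convertRandomForestModel(RandomForestClassificationModel model) {
    DecisionTreeModel[] trees = new DecisionTreeModel[model.trees().length];
    for (int i = 0; i < trees.length; i++) {
        // Cast is defensive in case trees() is typed as the generic ml tree model.
        trees[i] = convertDecisionTreeModel(
                (DecisionTreeClassificationModel) model.trees()[i], Algo.Classification());
    }
    // An mllib forest is just (algo, converted trees).
    return new RandomForestModel(Algo.Classification(), trees);
}
{noformat}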
