Both libraries are heavily parameterized. You should check what the defaults are for both.
Some ideas:

- What regularization is being used: L1 or L2?
- Does the regularization parameter have the same interpretation, e.g. 1/C = lambda? Some libraries use C, some use lambda, and the scaling can differ by a factor of the number of samples (mean loss vs. summed loss).
- Some libraries regularize the intercept (scikit-learn does), others do not. (It doesn't seem like a particularly good idea to regularize the intercept if your optimizer permits not doing it.)

A sketch of how these settings map between the two libraries follows; see also the pyspark sketch after the quoted message.
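To make that mapping concrete, here is a minimal sklearn-side sketch on the data from the quoted message. It assumes (worth verifying against the docs for your versions) that spark.ml minimizes the *mean* log-loss plus regParam * 0.5 * ||w||^2, while sklearn minimizes the *summed* log-loss plus (1/(2C)) * ||w||^2, so that C = 1/(n * regParam); by the same argument, Ridge's alpha corresponds to n * regParam for Spark's LinearRegression. It also assumes an unpenalized intercept on both sides and no feature standardization (Spark standardizes by default; see the second sketch below).

import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge

# Same 10 points as in the quoted code.
X = np.array([
    [-0.7306653538519616, 0.0],
    [0.6750417712898752, -0.4232874171873786],
    [0.1863463229359709, -0.8163423997075965],
    [-0.6719842051493347, 0.0],
    [0.9699938346531928, 0.0],
    [0.22759406190283604, 0.0],
    [0.9688721028330911, 0.0],
    [0.5993795346650845, 0.0],
    [0.9219423508390701, -0.8972778242305388],
    [0.7006904841584055, -0.5607635619919824]])
y = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0])

n = X.shape[0]     # 10 samples
reg_param = 0.3    # Spark's regParam, i.e. lambda on the *mean* loss

# Spark:   (1/n) * sum(logloss) + regParam * 0.5 * ||w||^2
# sklearn:         sum(logloss) + (1/(2C)) * ||w||^2   =>   C = 1 / (n * regParam)
logistic = LogisticRegression(
    fit_intercept=True,        # Spark fits an unpenalized intercept by default
    penalty='l2',
    solver='lbfgs',            # unlike liblinear, lbfgs does not penalize the intercept
    C=1.0 / (n * reg_param),
    tol=1e-11)
logistic.fit(X, y)
print(logistic.coef_, logistic.intercept_)

# Spark:  (1/(2n)) * sum(sq. err.) + regParam * 0.5 * ||w||^2
# Ridge:             sum(sq. err.) + alpha * ||w||^2   =>   alpha = n * regParam
ridge = Ridge(fit_intercept=True, alpha=n * reg_param, tol=1e-11)
ridge.fit(X, y)
print(ridge.coef_, ridge.intercept_)

With n = 10 and regParam = 0.3 this gives C = 1/3 and alpha = 3.0, quite different from the C = 1/0.3 and alpha = 0.3 used in the quoted script.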
On Sun, Mar 12, 2017 at 7:07 PM, Frank Astier via scikit-learn <[email protected]> wrote:

> (this was also posted to stackoverflow on 03/10)
>
> I am setting up a very simple logistic regression problem in scikit-learn
> and in spark.ml, and the results diverge: the models they learn are
> different, but I can't figure out why (data is the same, model type is the
> same, regularization is the same...).
>
> No doubt I am missing some setting on one side or the other. Which
> setting? How should I set up either scikit or spark.ml to find the same
> model as its counterpart?
>
> I give the sklearn code and spark.ml code below. Both should be ready to
> cut-and-paste and run.
>
> scikit-learn code:
> ----------------------
>
> import numpy as np
> from sklearn.linear_model import LogisticRegression, Ridge
>
> X = np.array([
>     [-0.7306653538519616, 0.0],
>     [0.6750417712898752, -0.4232874171873786],
>     [0.1863463229359709, -0.8163423997075965],
>     [-0.6719842051493347, 0.0],
>     [0.9699938346531928, 0.0],
>     [0.22759406190283604, 0.0],
>     [0.9688721028330911, 0.0],
>     [0.5993795346650845, 0.0],
>     [0.9219423508390701, -0.8972778242305388],
>     [0.7006904841584055, -0.5607635619919824]
> ])
>
> y = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
>
> m, n = X.shape
>
> # Add intercept term to simulate inputs to GameEstimator
> X_with_intercept = np.hstack((X, np.ones(m)[:,np.newaxis]))
>
> l = 0.3
> e = LogisticRegression(
>     fit_intercept=False,
>     penalty='l2',
>     C=1/l,
>     max_iter=100,
>     tol=1e-11)
>
> e.fit(X_with_intercept, y)
>
> print e.coef_
> # => [[ 0.98662189  0.45571052 -0.23467255]]
>
> # Linear regression is called Ridge in sklearn
> e = Ridge(
>     fit_intercept=False,
>     alpha=l,
>     max_iter=100,
>     tol=1e-11)
>
> e.fit(X_with_intercept, y)
>
> print e.coef_
> # => [ 0.32155545  0.17904355  0.41222418]
>
> spark.ml code:
> -------------------
>
> import org.apache.spark.{SparkConf, SparkContext}
> import org.apache.spark.ml.classification.LogisticRegression
> import org.apache.spark.ml.regression.LinearRegression
> import org.apache.spark.mllib.linalg.Vectors
> import org.apache.spark.mllib.regression.LabeledPoint
> import org.apache.spark.sql.SQLContext
>
> object TestSparkRegression {
>   def main(args: Array[String]): Unit = {
>     import org.apache.log4j.{Level, Logger}
>
>     Logger.getLogger("org").setLevel(Level.OFF)
>     Logger.getLogger("akka").setLevel(Level.OFF)
>
>     val conf = new SparkConf().setAppName("test").setMaster("local")
>     val sc = new SparkContext(conf)
>
>     val sparkTrainingData = new SQLContext(sc)
>       .createDataFrame(Seq(
>         LabeledPoint(0.0, Vectors.dense(-0.7306653538519616, 0.0)),
>         LabeledPoint(1.0, Vectors.dense(0.6750417712898752, -0.4232874171873786)),
>         LabeledPoint(1.0, Vectors.dense(0.1863463229359709, -0.8163423997075965)),
>         LabeledPoint(0.0, Vectors.dense(-0.6719842051493347, 0.0)),
>         LabeledPoint(1.0, Vectors.dense(0.9699938346531928, 0.0)),
>         LabeledPoint(1.0, Vectors.dense(0.22759406190283604, 0.0)),
>         LabeledPoint(1.0, Vectors.dense(0.9688721028330911, 0.0)),
>         LabeledPoint(0.0, Vectors.dense(0.5993795346650845, 0.0)),
>         LabeledPoint(0.0, Vectors.dense(0.9219423508390701, -0.8972778242305388)),
>         LabeledPoint(0.0, Vectors.dense(0.7006904841584055, -0.5607635619919824))))
>       .toDF("label", "features")
>
>     val logisticModel = new LogisticRegression()
>       .setRegParam(0.3)
>       .setLabelCol("label")
>       .setFeaturesCol("features")
>       .fit(sparkTrainingData)
>
>     println(s"Spark logistic model coefficients: ${logisticModel.coefficients} Intercept: ${logisticModel.intercept}")
>     // Spark logistic model coefficients: [0.5451588538376263,0.26740606573584713] Intercept: -0.13897955358689987
>
>     val linearModel = new LinearRegression()
>       .setRegParam(0.3)
>       .setLabelCol("label")
>       .setFeaturesCol("features")
>       .setSolver("l-bfgs")
>       .fit(sparkTrainingData)
>
>     println(s"Spark linear model coefficients: ${linearModel.coefficients} Intercept: ${linearModel.intercept}")
>     // Spark linear model coefficients: [0.19852664861346023,0.11501200541407802] Intercept: 0.45464906876832323
>
>     sc.stop()
>   }
> }
>
> Thanks,
>
> Frank
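The Spark side of the same mapping, sketched here with the pyspark API rather than the Scala API quoted above (assuming a Spark 2.x-style session; the corresponding setters exist on the Scala estimators too). The key change is turning off standardization: Spark's LogisticRegression and LinearRegression standardize features by default before fitting the regularized objective, which sklearn does not do.

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local").appName("test").getOrCreate()

# Same (label, features) rows as the quoted Scala code.
rows = [
    (0.0, Vectors.dense(-0.7306653538519616, 0.0)),
    (1.0, Vectors.dense(0.6750417712898752, -0.4232874171873786)),
    (1.0, Vectors.dense(0.1863463229359709, -0.8163423997075965)),
    (0.0, Vectors.dense(-0.6719842051493347, 0.0)),
    (1.0, Vectors.dense(0.9699938346531928, 0.0)),
    (1.0, Vectors.dense(0.22759406190283604, 0.0)),
    (1.0, Vectors.dense(0.9688721028330911, 0.0)),
    (0.0, Vectors.dense(0.5993795346650845, 0.0)),
    (0.0, Vectors.dense(0.9219423508390701, -0.8972778242305388)),
    (0.0, Vectors.dense(0.7006904841584055, -0.5607635619919824))]
df = spark.createDataFrame(rows, ["label", "features"])

lr = (LogisticRegression()
      .setRegParam(0.3)           # lambda on the mean loss, as above
      .setStandardization(False)  # defaults to True; sklearn does no scaling
      .setFitIntercept(True)      # the intercept is fit but never penalized
      .setTol(1e-11))
model = lr.fit(df)
print(model.coefficients, model.intercept)

spark.stop()

The same two settings apply to LinearRegression; with them, plus the C and alpha rescaling above, both libraries should be optimizing the same objective, so the fitted models should agree up to numerical tolerance.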
_______________________________________________ scikit-learn mailing list [email protected] https://mail.python.org/mailman/listinfo/scikit-learn
