Repository: incubator-systemml Updated Branches: refs/heads/master cf92e8417 -> 9d0087cbb
[SYSTEMML-1238] Updated the default parameters of mllearn to match that of scikit learn. - Also updated the test to compare our algorithm to scikit-learn. Closes #398. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/9d0087cb Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/9d0087cb Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/9d0087cb Branch: refs/heads/master Commit: 9d0087cbbd250c9b486923555b450602f816cf19 Parents: cf92e84 Author: Niketan Pansare <[email protected]> Authored: Fri Feb 17 14:54:23 2017 -0800 Committer: Niketan Pansare <[email protected]> Committed: Fri Feb 17 14:59:49 2017 -0800 ---------------------------------------------------------------------- docs/algorithms-regression.md | 8 +- docs/beginners-guide-python.md | 2 +- docs/python-reference.md | 6 +- .../spark/utils/RDDConverterUtilsExt.java | 2 +- src/main/python/systemml/mllearn/estimators.py | 34 ++++---- src/main/python/tests/test_mllearn_df.py | 56 +++++++------ src/main/python/tests/test_mllearn_numpy.py | 87 ++++++++++++++------ 7 files changed, 124 insertions(+), 71 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/docs/algorithms-regression.md ---------------------------------------------------------------------- diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 992862e..80b38a3 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -83,8 +83,8 @@ efficient when the number of features $m$ is relatively small <div data-lang="Python" markdown="1"> {% highlight python %} from systemml.mllearn import LinearRegression -# C = 1/reg -lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve') +# C = 1/reg (to disable regularization, use float("inf")) +lr = 
LinearRegression(sqlCtx, fit_intercept=True, normalize=False, C=float("inf"), solver='direct-solve') # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" @@ -125,8 +125,8 @@ y_test = lr.fit(df_train) <div data-lang="Python" markdown="1"> {% highlight python %} from systemml.mllearn import LinearRegression -# C = 1/reg -lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') +# C = 1/reg (to disable regularization, use float("inf")) +lr = LinearRegression(sqlCtx, fit_intercept=True, normalize=False, max_iter=100, tol=0.000001, C=float("inf"), solver='newton-cg') # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". 
df_test is a DataFrame that contains the column "features" http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/docs/beginners-guide-python.md ---------------------------------------------------------------------- diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md index 4d1b098..ffab09e 100644 --- a/docs/beginners-guide-python.md +++ b/docs/beginners-guide-python.md @@ -228,7 +228,7 @@ X_test = diabetes_X[-20:] y_train = diabetes.target[:-20] y_test = diabetes.target[-20:] # Create linear regression object -regr = LinearRegression(sqlCtx, fit_intercept=True, C=1, solver='direct-solve') +regr = LinearRegression(sqlCtx, fit_intercept=True, C=float("inf"), solver='direct-solve') # Train the model using the training sets regr.fit(X_train, y_train) y_predicted = regr.predict(X_test) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/docs/python-reference.md ---------------------------------------------------------------------- diff --git a/docs/python-reference.md b/docs/python-reference.md index 65dcb5c..8d38598 100644 --- a/docs/python-reference.md +++ b/docs/python-reference.md @@ -731,7 +731,7 @@ LogisticRegression score: 0.922222 ### Reference documentation - *class*`systemml.mllearn.estimators.LinearRegression`(*sqlCtx*, *fit\_intercept=True*, *max\_iter=100*, *tol=1e-06*, *C=1.0*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LinearRegression "Permalink to this definition") + *class*`systemml.mllearn.estimators.LinearRegression`(*sqlCtx*, *fit\_intercept=True*, *normalize=False*, *max\_iter=100*, *tol=1e-06*, *C=float("inf")*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LinearRegression "Permalink to this definition") : Bases: `systemml.mllearn.estimators.BaseSystemMLRegressor`{.xref .py .py-class .docutils .literal} @@ -760,7 +760,7 @@ LogisticRegression score: 0.922222 >>> # The mean square error >>> print("Residual sum of squares: 
%.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) - *class*`systemml.mllearn.estimators.LogisticRegression`(*sqlCtx*, *penalty='l2'*, *fit\_intercept=True*, *max\_iter=100*, *max\_inner\_iter=0*, *tol=1e-06*, *C=1.0*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LogisticRegression "Permalink to this definition") + *class*`systemml.mllearn.estimators.LogisticRegression`(*sqlCtx*, *penalty='l2'*, *fit\_intercept=True*, *normalize=False*, *max\_iter=100*, *max\_inner\_iter=0*, *tol=1e-06*, *C=1.0*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LogisticRegression "Permalink to this definition") : Bases: `systemml.mllearn.estimators.BaseSystemMLClassifier`{.xref .py .py-class .docutils .literal} @@ -817,7 +817,7 @@ LogisticRegression score: 0.922222 >>> prediction = model.transform(test) >>> prediction.show() - *class*`systemml.mllearn.estimators.SVM`(*sqlCtx*, *fit\_intercept=True*, *max\_iter=100*, *tol=1e-06*, *C=1.0*, *is\_multi\_class=False*, *transferUsingDF=False*)(#systemml.mllearn.estimators.SVM "Permalink to this definition") + *class*`systemml.mllearn.estimators.SVM`(*sqlCtx*, *fit\_intercept=True*, *normalize=False*, *max\_iter=100*, *tol=1e-06*, *C=1.0*, *is\_multi\_class=False*, *transferUsingDF=False*)(#systemml.mllearn.estimators.SVM "Permalink to this definition") : Bases: `systemml.mllearn.estimators.BaseSystemMLClassifier`{.xref .py .py-class .docutils .literal} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java index dea5601..cdf090d 100644 --- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java @@ -195,7 +195,7 @@ public class RDDConverterUtilsExt long limit = mb.getNumRows()*mb.getNumColumns(); int times = Double.SIZE / Byte.SIZE; - if( limit * times > Integer.MAX_VALUE ) + if( limit > Integer.MAX_VALUE / times ) throw new DMLRuntimeException("MatrixBlock of size " + limit + " cannot be converted to dense numpy array"); ret = new byte[(int) (limit * times)]; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/python/systemml/mllearn/estimators.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py index c4eaf3d..4188ade 100644 --- a/src/main/python/systemml/mllearn/estimators.py +++ b/src/main/python/systemml/mllearn/estimators.py @@ -294,7 +294,7 @@ class LogisticRegression(BaseSystemMLClassifier): """ - def __init__(self, sparkSession, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + def __init__(self, sparkSession, penalty='l2', fit_intercept=True, normalize=False, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): """ Performs both binomial and multinomial logistic regression. @@ -303,10 +303,11 @@ class LogisticRegression(BaseSystemMLClassifier): sparkSession: PySpark SparkSession penalty: Only 'l2' supported fit_intercept: Specifies whether to add intercept or not (default: True) + normalize: This parameter is ignored when fit_intercept is set to False. 
(default: False) max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100) max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0) tol: Tolerance used in the convergence criterion (default: 0.000001) - C: 1/regularization parameter (default: 1.0) + C: 1/regularization parameter (default: 1.0 similar to scikit-learn. To disable regularization, please use float("inf")) solver: Only 'newton-cg' solver supported """ self.sparkSession = sparkSession @@ -316,12 +317,11 @@ class LogisticRegression(BaseSystemMLClassifier): self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc()) self.estimator.setMaxOuterIter(max_iter) self.estimator.setMaxInnerIter(max_inner_iter) - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C + reg = 0.0 if C == float("inf") else 1.0 / C + icpt = 2 if fit_intercept == True and normalize == True else int(fit_intercept) self.estimator.setRegParam(reg) self.estimator.setTol(tol) - self.estimator.setIcpt(int(fit_intercept)) + self.estimator.setIcpt(icpt) self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = True if penalty != 'l2': @@ -361,7 +361,7 @@ class LinearRegression(BaseSystemMLRegressor): """ - def __init__(self, sparkSession, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + def __init__(self, sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=0.000001, C=float("inf"), solver='newton-cg', transferUsingDF=False): """ Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables. 
@@ -369,9 +369,10 @@ class LinearRegression(BaseSystemMLRegressor): ---------- sparkSession: PySpark SparkSession fit_intercept: Specifies whether to add intercept or not (default: True) + normalize: If True, the regressors X will be normalized before regression. This parameter is ignored when fit_intercept is set to False. (default: False) max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100) tol: Tolerance used in the convergence criterion (default: 0.000001) - C: 1/regularization parameter (default: 1.0) + C: 1/regularization parameter (default: float("inf") as scikit-learn does not support regularization by default) solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg'). Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient. 'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and @@ -386,12 +387,11 @@ class LinearRegression(BaseSystemMLRegressor): else: raise Exception('Only newton-cg solver supported') self.estimator.setMaxIter(max_iter) - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C + reg = 0.0 if C == float("inf") else 1.0 / C + icpt = 2 if fit_intercept == True and normalize == True else int(fit_intercept) self.estimator.setRegParam(reg) self.estimator.setTol(tol) - self.estimator.setIcpt(int(fit_intercept)) + self.estimator.setIcpt(icpt) self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False @@ -421,7 +421,7 @@ class SVM(BaseSystemMLClassifier): """ - def __init__(self, sparkSession, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): + def __init__(self, sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): """ Performs both binary-class and multiclass SVM (Support Vector Machines). 
@@ -429,9 +429,10 @@ class SVM(BaseSystemMLClassifier): ---------- sparkSession: PySpark SparkSession fit_intercept: Specifies whether to add intercept or not (default: True) + normalize: This parameter is ignored when fit_intercept is set to False. (default: False) max_iter: Maximum number iterations (default: 100) tol: Tolerance used in the convergence criterion (default: 0.000001) - C: 1/regularization parameter (default: 1.0) + C: 1/regularization parameter (default: 1.0 similar to scikit-learn. To disable regularization, please use float("inf")) is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False) """ self.sparkSession = sparkSession @@ -442,10 +443,11 @@ class SVM(BaseSystemMLClassifier): self.estimator.setMaxIter(max_iter) if C <= 0: raise Exception('C has to be positive') - reg = 1.0 / C + reg = 0.0 if C == float("inf") else 1.0 / C + icpt = 2 if fit_intercept == True and normalize == True else int(fit_intercept) self.estimator.setRegParam(reg) self.estimator.setTol(tol) - self.estimator.setIcpt(int(fit_intercept)) + self.estimator.setIcpt(icpt) self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/python/tests/test_mllearn_df.py ---------------------------------------------------------------------- diff --git a/src/main/python/tests/test_mllearn_df.py b/src/main/python/tests/test_mllearn_df.py index da49953..d949f4e 100644 --- a/src/main/python/tests/test_mllearn_df.py +++ b/src/main/python/tests/test_mllearn_df.py @@ -40,7 +40,8 @@ from pyspark.sql import SparkSession from sklearn import datasets, metrics, neighbors from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer - +from sklearn import linear_model +from sklearn.metrics import accuracy_score, r2_score from systemml.mllearn import LinearRegression, LogisticRegression, NaiveBayes, SVM sc = 
SparkContext() @@ -61,20 +62,40 @@ class TestMLLearn(unittest.TestCase): y_test = y_digits[int(.9 * n_samples):] # Convert to DataFrame for i/o: current way to transfer data logistic = LogisticRegression(sparkSession, transferUsingDF=True) - score = logistic.fit(X_train, y_train).score(X_test, y_test) - self.failUnless(score > 0.9) + logistic.fit(X_train, y_train) + mllearn_predicted = logistic.predict(X_test) + sklearn_logistic = linear_model.LogisticRegression() + sklearn_logistic.fit(X_train, y_train) + self.failUnless(accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn - def test_linear_regression_sk2(self): + def test_linear_regression(self): diabetes = datasets.load_diabetes() diabetes_X = diabetes.data[:, np.newaxis, 2] diabetes_X_train = diabetes_X[:-20] diabetes_X_test = diabetes_X[-20:] diabetes_y_train = diabetes.target[:-20] diabetes_y_test = diabetes.target[-20:] - regr = LinearRegression(sparkSession, transferUsingDF=True) + regr = LinearRegression(sparkSession, solver='direct-solve', transferUsingDF=True) regr.fit(diabetes_X_train, diabetes_y_train) - score = regr.score(diabetes_X_test, diabetes_y_test) - self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly) + mllearn_predicted = regr.predict(diabetes_X_test) + sklearn_regr = linear_model.LinearRegression() + sklearn_regr.fit(diabetes_X_train, diabetes_y_train) + self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn + + def test_linear_regression_cg(self): + diabetes = datasets.load_diabetes() + diabetes_X = diabetes.data[:, np.newaxis, 2] + diabetes_X_train = diabetes_X[:-20] + diabetes_X_test = diabetes_X[-20:] + diabetes_y_train = diabetes.target[:-20] + diabetes_y_test = diabetes.target[-20:] + regr = LinearRegression(sparkSession, solver='newton-cg', transferUsingDF=True) + 
regr.fit(diabetes_X_train, diabetes_y_train) + mllearn_predicted = regr.predict(diabetes_X_test) + sklearn_regr = linear_model.LinearRegression() + sklearn_regr.fit(diabetes_X_train, diabetes_y_train) + self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn + def test_svm_sk2(self): digits = datasets.load_digits() @@ -86,22 +107,11 @@ class TestMLLearn(unittest.TestCase): X_test = X_digits[int(.9 * n_samples):] y_test = y_digits[int(.9 * n_samples):] svm = SVM(sparkSession, is_multi_class=True, transferUsingDF=True) - score = svm.fit(X_train, y_train).score(X_test, y_test) - self.failUnless(score > 0.9) - - #def test_naive_bayes_sk2(self): - # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] - # newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) - # newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) - # vectorizer = TfidfVectorizer() - # # Both vectors and vectors_test are SciPy CSR matrix - # vectors = vectorizer.fit_transform(newsgroups_train.data) - # vectors_test = vectorizer.transform(newsgroups_test.data) - # nb = NaiveBayes(sparkSession) - # nb.fit(vectors, newsgroups_train.target) - # pred = nb.predict(vectors_test) - # score = metrics.f1_score(newsgroups_test.target, pred, average='weighted') - # self.failUnless(score > 0.8) + mllearn_predicted = svm.fit(X_train, y_train).predict(X_test) + from sklearn import linear_model, svm + clf = svm.LinearSVC() + sklearn_predicted = clf.fit(X_train, y_train).predict(X_test) + self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 ) if __name__ == '__main__': http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/python/tests/test_mllearn_numpy.py ---------------------------------------------------------------------- diff --git a/src/main/python/tests/test_mllearn_numpy.py 
b/src/main/python/tests/test_mllearn_numpy.py index 925554f..faa4d32 100644 --- a/src/main/python/tests/test_mllearn_numpy.py +++ b/src/main/python/tests/test_mllearn_numpy.py @@ -40,11 +40,26 @@ from pyspark.sql import SparkSession from sklearn import datasets, metrics, neighbors from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer - +from sklearn.metrics import accuracy_score, r2_score from systemml.mllearn import LinearRegression, LogisticRegression, NaiveBayes, SVM +from sklearn import linear_model sc = SparkContext() sparkSession = SparkSession.builder.getOrCreate() +import os + +def writeColVector(X, fileName): + fileName = os.path.join(os.getcwd(), fileName) + X.tofile(fileName, sep='\n') + metaDataFileContent = '{ "data_type": "matrix", "value_type": "double", "rows":' + str(len(X)) + ', "cols": 1, "nnz": -1, "format": "csv", "author": "systemml-tests", "created": "0000-00-00 00:00:00 PST" }' + with open(fileName+'.mtd', 'w') as text_file: + text_file.write(metaDataFileContent) + +def deleteIfExists(fileName): + try: + os.remove(fileName) + except OSError: + pass # Currently not integrated with JUnit test # ~/spark-1.6.1-scala-2.11/bin/spark-submit --master local[*] --driver-class-path SystemML.jar test.py @@ -59,8 +74,11 @@ class TestMLLearn(unittest.TestCase): X_test = X_digits[int(.9 * n_samples):] y_test = y_digits[int(.9 * n_samples):] logistic = LogisticRegression(sparkSession) - score = logistic.fit(X_train, y_train).score(X_test, y_test) - self.failUnless(score > 0.9) + logistic.fit(X_train, y_train) + mllearn_predicted = logistic.predict(X_test) + sklearn_logistic = linear_model.LogisticRegression() + sklearn_logistic.fit(X_train, y_train) + self.failUnless(accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn def test_logistic_mlpipeline(self): training = sparkSession.createDataFrame([ @@ -101,11 +119,27 @@ class 
TestMLLearn(unittest.TestCase): diabetes_X_test = diabetes_X[-20:] diabetes_y_train = diabetes.target[:-20] diabetes_y_test = diabetes.target[-20:] - regr = LinearRegression(sparkSession) + regr = LinearRegression(sparkSession, solver='direct-solve') regr.fit(diabetes_X_train, diabetes_y_train) - score = regr.score(diabetes_X_test, diabetes_y_test) - self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly) + mllearn_predicted = regr.predict(diabetes_X_test) + sklearn_regr = linear_model.LinearRegression() + sklearn_regr.fit(diabetes_X_train, diabetes_y_train) + self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn + def test_linear_regression_cg(self): + diabetes = datasets.load_diabetes() + diabetes_X = diabetes.data[:, np.newaxis, 2] + diabetes_X_train = diabetes_X[:-20] + diabetes_X_test = diabetes_X[-20:] + diabetes_y_train = diabetes.target[:-20] + diabetes_y_test = diabetes.target[-20:] + regr = LinearRegression(sparkSession, solver='newton-cg') + regr.fit(diabetes_X_train, diabetes_y_train) + mllearn_predicted = regr.predict(diabetes_X_test) + sklearn_regr = linear_model.LinearRegression() + sklearn_regr.fit(diabetes_X_train, diabetes_y_train) + self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn + def test_svm(self): digits = datasets.load_digits() X_digits = digits.data @@ -116,8 +150,11 @@ class TestMLLearn(unittest.TestCase): X_test = X_digits[int(.9 * n_samples):] y_test = y_digits[int(.9 * n_samples):] svm = SVM(sparkSession, is_multi_class=True) - score = svm.fit(X_train, y_train).score(X_test, y_test) - self.failUnless(score > 0.9) + mllearn_predicted = svm.fit(X_train, y_train).predict(X_test) + from sklearn import linear_model, svm + clf = svm.LinearSVC() + sklearn_predicted = clf.fit(X_train, y_train).predict(X_test) + 
self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 ) def test_naive_bayes(self): digits = datasets.load_digits() @@ -129,22 +166,26 @@ class TestMLLearn(unittest.TestCase): X_test = X_digits[int(.9 * n_samples):] y_test = y_digits[int(.9 * n_samples):] nb = NaiveBayes(sparkSession) - score = nb.fit(X_train, y_train).score(X_test, y_test) - self.failUnless(score > 0.8) + mllearn_predicted = nb.fit(X_train, y_train).predict(X_test) + from sklearn.naive_bayes import MultinomialNB + clf = MultinomialNB() + sklearn_predicted = clf.fit(X_train, y_train).predict(X_test) + self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 ) - #def test_naive_bayes1(self): - # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] - # newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) - # newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) - # vectorizer = TfidfVectorizer() - # # Both vectors and vectors_test are SciPy CSR matrix - # vectors = vectorizer.fit_transform(newsgroups_train.data) - # vectors_test = vectorizer.transform(newsgroups_test.data) - # nb = NaiveBayes(sparkSession) - # nb.fit(vectors, newsgroups_train.target) - # pred = nb.predict(vectors_test) - # score = metrics.f1_score(newsgroups_test.target, pred, average='weighted') - # self.failUnless(score > 0.8) + def test_naive_bayes1(self): + categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] + newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) + newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) + vectorizer = TfidfVectorizer() + # Both vectors and vectors_test are SciPy CSR matrix + vectors = vectorizer.fit_transform(newsgroups_train.data) + vectors_test = vectorizer.transform(newsgroups_test.data) + nb = NaiveBayes(sparkSession) + mllearn_predicted = nb.fit(vectors, newsgroups_train.target).predict(vectors_test) + from 
sklearn.naive_bayes import MultinomialNB + clf = MultinomialNB() + sklearn_predicted = clf.fit(vectors, newsgroups_train.target).predict(vectors_test) + self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 ) if __name__ == '__main__':
