Repository: spark Updated Branches: refs/heads/branch-2.0 af70ad028 -> b349237e4
[SPARK-16242][MLLIB][PYSPARK] Conversion between old/new matrix columns in a DataFrame (Python) ## What changes were proposed in this pull request? This PR implements python wrappers for #13888 to convert old/new matrix columns in a DataFrame. ## How was this patch tested? Doctest in python. Author: Yanbo Liang <yblia...@gmail.com> Closes #13935 from yanboliang/spark-16242. (cherry picked from commit e158478a9fff5e63ae0336a54b3f360d0cd38921) Signed-off-by: Yanbo Liang <yblia...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b349237e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b349237e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b349237e Branch: refs/heads/branch-2.0 Commit: b349237e4b4cf60fccf9bfcf76deca78f1224bf1 Parents: af70ad0 Author: Yanbo Liang <yblia...@gmail.com> Authored: Tue Jun 28 06:28:22 2016 -0700 Committer: Yanbo Liang <yblia...@gmail.com> Committed: Tue Jun 28 06:28:58 2016 -0700 ---------------------------------------------------------------------- .../spark/mllib/api/python/PythonMLLibAPI.scala | 14 ++++ python/pyspark/mllib/util.py | 80 ++++++++++++++++++++ 2 files changed, 94 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b349237e/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index f2c70ba..f4819f7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1215,6 +1215,20 @@ private[python] class PythonMLLibAPI extends Serializable { def convertVectorColumnsFromML(dataset: 
DataFrame, cols: JArrayList[String]): DataFrame = { MLUtils.convertVectorColumnsFromML(dataset, cols.asScala: _*) } + + /** + * Python-friendly version of [[MLUtils.convertMatrixColumnsToML()]]. + */ + def convertMatrixColumnsToML(dataset: DataFrame, cols: JArrayList[String]): DataFrame = { + MLUtils.convertMatrixColumnsToML(dataset, cols.asScala: _*) + } + + /** + * Python-friendly version of [[MLUtils.convertMatrixColumnsFromML()]]. + */ + def convertMatrixColumnsFromML(dataset: DataFrame, cols: JArrayList[String]): DataFrame = { + MLUtils.convertMatrixColumnsFromML(dataset, cols.asScala: _*) + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/b349237e/python/pyspark/mllib/util.py ---------------------------------------------------------------------- diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a7e6bcc..48867a0 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -281,6 +281,86 @@ class MLUtils(object): raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols)) + @staticmethod + @since("2.0.0") + def convertMatrixColumnsToML(dataset, *cols): + """ + Converts matrix columns in an input DataFrame from the + :py:class:`pyspark.mllib.linalg.Matrix` type to the new + :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` + package. + + :param dataset: + input dataset + :param cols: + a list of matrix columns to be converted. + New matrix columns will be ignored. If unspecified, all old + matrix columns will be converted except nested ones. + :return: + the input dataset with old matrix columns converted to the + new matrix type + + >>> import pyspark + >>> from pyspark.mllib.linalg import Matrices + >>> from pyspark.mllib.util import MLUtils + >>> df = spark.createDataFrame( + ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]), + ... 
Matrices.dense(2, 2, range(4)))], ["id", "x", "y"]) + >>> r1 = MLUtils.convertMatrixColumnsToML(df).first() + >>> isinstance(r1.x, pyspark.ml.linalg.SparseMatrix) + True + >>> isinstance(r1.y, pyspark.ml.linalg.DenseMatrix) + True + >>> r2 = MLUtils.convertMatrixColumnsToML(df, "x").first() + >>> isinstance(r2.x, pyspark.ml.linalg.SparseMatrix) + True + >>> isinstance(r2.y, pyspark.mllib.linalg.DenseMatrix) + True + """ + if not isinstance(dataset, DataFrame): + raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) + return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols)) + + @staticmethod + @since("2.0.0") + def convertMatrixColumnsFromML(dataset, *cols): + """ + Converts matrix columns in an input DataFrame to the + :py:class:`pyspark.mllib.linalg.Matrix` type from the new + :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` + package. + + :param dataset: + input dataset + :param cols: + a list of matrix columns to be converted. + Old matrix columns will be ignored. If unspecified, all new + matrix columns will be converted except nested ones. + :return: + the input dataset with new matrix columns converted to the + old matrix type + + >>> import pyspark + >>> from pyspark.ml.linalg import Matrices + >>> from pyspark.mllib.util import MLUtils + >>> df = spark.createDataFrame( + ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]), + ... 
Matrices.dense(2, 2, range(4)))], ["id", "x", "y"]) + >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first() + >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix) + True + >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix) + True + >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first() + >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix) + True + >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix) + True + """ + if not isinstance(dataset, DataFrame): + raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) + return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols)) + class Saveable(object): """ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org