Repository: spark
Updated Branches:
  refs/heads/branch-2.0 17c7522c8 -> d3027c45f


[SPARK-16328][ML][MLLIB][PYSPARK] Add 'asML' and 'fromML' conversion methods to 
PySpark linalg

The move to `ml.linalg` created `asML`/`fromML` utility methods in Scala/Java 
for converting between representations. These are missing in Python, this PR 
adds them.

## How was this patch tested?

New doctests.

Author: Nick Pentreath <ni...@za.ibm.com>

Closes #13997 from MLnick/SPARK-16328-python-linalg-convert.

(cherry picked from commit dab10516138867b7c4fc6d42168497e82853b539)
Signed-off-by: Joseph K. Bradley <jos...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d3027c45
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d3027c45
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d3027c45

Branch: refs/heads/branch-2.0
Commit: d3027c45fbe02752d260aefff9dae707ba5c5d4c
Parents: 17c7522
Author: Nick Pentreath <ni...@za.ibm.com>
Authored: Thu Jun 30 17:52:15 2016 -0700
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Thu Jun 30 17:52:29 2016 -0700

----------------------------------------------------------------------
 python/pyspark/mllib/linalg/__init__.py | 99 ++++++++++++++++++++++++++++
 python/pyspark/mllib/tests.py           | 69 +++++++++++++++++++
 2 files changed, 168 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d3027c45/python/pyspark/mllib/linalg/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/linalg/__init__.py 
b/python/pyspark/mllib/linalg/__init__.py
index 3a345b2..15dc53a 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -39,6 +39,7 @@ else:
 import numpy as np
 
 from pyspark import since
+from pyspark.ml import linalg as newlinalg
 from pyspark.sql.types import UserDefinedType, StructField, StructType, 
ArrayType, DoubleType, \
     IntegerType, ByteType, BooleanType
 
@@ -247,6 +248,15 @@ class Vector(object):
         """
         raise NotImplementedError
 
+    def asML(self):
+        """
+        Convert this vector to the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :return: :py:class:`pyspark.ml.linalg.Vector`
+        """
+        raise NotImplementedError
+
 
 class DenseVector(Vector):
     """
@@ -408,6 +418,17 @@ class DenseVector(Vector):
         """
         return self.array
 
+    def asML(self):
+        """
+        Convert this vector to the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :return: :py:class:`pyspark.ml.linalg.DenseVector`
+
+        .. versionadded:: 2.0.0
+        """
+        return newlinalg.DenseVector(self.array)
+
     @property
     def values(self):
         """
@@ -737,6 +758,17 @@ class SparseVector(Vector):
         arr[self.indices] = self.values
         return arr
 
+    def asML(self):
+        """
+        Convert this vector to the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :return: :py:class:`pyspark.ml.linalg.SparseVector`
+
+        .. versionadded:: 2.0.0
+        """
+        return newlinalg.SparseVector(self.size, self.indices, self.values)
+
     def __len__(self):
         return self.size
 
@@ -846,6 +878,24 @@ class Vectors(object):
         return DenseVector(elements)
 
     @staticmethod
+    def fromML(vec):
+        """
+        Convert a vector from the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :param vec: a :py:class:`pyspark.ml.linalg.Vector`
+        :return: a :py:class:`pyspark.mllib.linalg.Vector`
+
+        .. versionadded:: 2.0.0
+        """
+        if isinstance(vec, newlinalg.DenseVector):
+            return DenseVector(vec.array)
+        elif isinstance(vec, newlinalg.SparseVector):
+            return SparseVector(vec.size, vec.indices, vec.values)
+        else:
+            raise TypeError("Unsupported vector type %s" % type(vec))
+
+    @staticmethod
     def stringify(vector):
         """
         Converts a vector into a string, which can be recognized by
@@ -945,6 +995,13 @@ class Matrix(object):
         """
         raise NotImplementedError
 
+    def asML(self):
+        """
+        Convert this matrix to the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+        """
+        raise NotImplementedError
+
     @staticmethod
     def _convert_to_array(array_like, dtype):
         """
@@ -1044,6 +1101,17 @@ class DenseMatrix(Matrix):
 
         return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, 
values)
 
+    def asML(self):
+        """
+        Convert this matrix to the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :return: :py:class:`pyspark.ml.linalg.DenseMatrix`
+
+        .. versionadded:: 2.0.0
+        """
+        return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, 
self.isTransposed)
+
     def __getitem__(self, indices):
         i, j = indices
         if i < 0 or i >= self.numRows:
@@ -1216,6 +1284,18 @@ class SparseMatrix(Matrix):
         densevals = np.ravel(self.toArray(), order='F')
         return DenseMatrix(self.numRows, self.numCols, densevals)
 
+    def asML(self):
+        """
+        Convert this matrix to the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :return: :py:class:`pyspark.ml.linalg.SparseMatrix`
+
+        .. versionadded:: 2.0.0
+        """
+        return newlinalg.SparseMatrix(self.numRows, self.numCols, 
self.colPtrs, self.rowIndices,
+                                      self.values, self.isTransposed)
+
     # TODO: More efficient implementation:
     def __eq__(self, other):
         return np.all(self.toArray() == other.toArray())
@@ -1236,6 +1316,25 @@ class Matrices(object):
         """
         return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
 
+    @staticmethod
+    def fromML(mat):
+        """
+        Convert a matrix from the new mllib-local representation.
+        This does NOT copy the data; it copies references.
+
+        :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
+        :return: a :py:class:`pyspark.mllib.linalg.Matrix`
+
+        .. versionadded:: 2.0.0
+        """
+        if isinstance(mat, newlinalg.DenseMatrix):
+            return DenseMatrix(mat.numRows, mat.numCols, mat.values, 
mat.isTransposed)
+        elif isinstance(mat, newlinalg.SparseMatrix):
+            return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, 
mat.rowIndices,
+                                mat.values, mat.isTransposed)
+        else:
+            raise TypeError("Unsupported matrix type %s" % type(mat))
+
 
 class QRDecomposition(object):
     """

http://git-wip-us.apache.org/repos/asf/spark/blob/d3027c45/python/pyspark/mllib/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 74cf7bb..72fa8b5 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -49,6 +49,7 @@ else:
     import unittest
 
 from pyspark import SparkContext
+import pyspark.ml.linalg as newlinalg
 from pyspark.mllib.common import _to_java_object_rdd
 from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel
 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, 
_convert_to_vector,\
@@ -423,6 +424,74 @@ class VectorTests(MLlibTestCase):
         tmp = SparseVector(4, [0, 2], [3, 0])
         self.assertEqual(tmp.numNonzeros(), 1)
 
+    def test_ml_mllib_vector_conversion(self):
+        # to ml
+        # dense
+        mllibDV = Vectors.dense([1, 2, 3])
+        mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
+        mlDV2 = mllibDV.asML()
+        self.assertEqual(mlDV2, mlDV1)
+        # sparse
+        mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
+        mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
+        mlSV2 = mllibSV.asML()
+        self.assertEqual(mlSV2, mlSV1)
+        # from ml
+        # dense
+        mllibDV1 = Vectors.dense([1, 2, 3])
+        mlDV = newlinalg.Vectors.dense([1, 2, 3])
+        mllibDV2 = Vectors.fromML(mlDV)
+        self.assertEqual(mllibDV1, mllibDV2)
+        # sparse
+        mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
+        mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
+        mllibSV2 = Vectors.fromML(mlSV)
+        self.assertEqual(mllibSV1, mllibSV2)
+
+    def test_ml_mllib_matrix_conversion(self):
+        # to ml
+        # dense
+        mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
+        mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
+        mlDM2 = mllibDM.asML()
+        self.assertEqual(mlDM2, mlDM1)
+        # transposed
+        mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
+        mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
+        mlDMt2 = mllibDMt.asML()
+        self.assertEqual(mlDMt2, mlDMt1)
+        # sparse
+        mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+        mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 
4])
+        mlSM2 = mllibSM.asML()
+        self.assertEqual(mlSM2, mlSM1)
+        # transposed
+        mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+        mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], 
True)
+        mlSMt2 = mllibSMt.asML()
+        self.assertEqual(mlSMt2, mlSMt1)
+        # from ml
+        # dense
+        mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
+        mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
+        mllibDM2 = Matrices.fromML(mlDM)
+        self.assertEqual(mllibDM1, mllibDM2)
+        # transposed
+        mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
+        mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
+        mllibDMt2 = Matrices.fromML(mlDMt)
+        self.assertEqual(mllibDMt1, mllibDMt2)
+        # sparse
+        mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+        mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+        mllibSM2 = Matrices.fromML(mlSM)
+        self.assertEqual(mllibSM1, mllibSM2)
+        # transposed
+        mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+        mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], 
True)
+        mllibSMt2 = Matrices.fromML(mlSMt)
+        self.assertEqual(mllibSMt1, mllibSMt2)
+
 
 class ListTests(MLlibTestCase):
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to