git commit: [SPARK-3701][MLLIB] update python linalg api and small fixes

meng Tue, 30 Sep 2014 17:11:47 -0700

Repository: spark
Updated Branches:
  refs/heads/master 6c696d7da -> d75496b18



[SPARK-3701][MLLIB] update python linalg api and small fixes

1. doc updates
2. simple checks on vector dimensions
3. use column major for matrices

davies jkbradley

Author: Xiangrui Meng <[email protected]>

Closes #2548 from mengxr/mllib-py-clean and squashes the following commits:

6dce2df [Xiangrui Meng] address comments
116b5db [Xiangrui Meng] use np.dot instead of array.dot
75f2fcc [Xiangrui Meng] fix python style
fefce00 [Xiangrui Meng] better check of vector size with more tests
067ef71 [Xiangrui Meng] majored -> major
ef853f9 [Xiangrui Meng] update python linalg api and small fixes


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d75496b1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d75496b1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d75496b1

Branch: refs/heads/master
Commit: d75496b1898dace4da1cf95e53c38093f8f95221
Parents: 6c696d7
Author: Xiangrui Meng <[email protected]>
Authored: Tue Sep 30 17:10:36 2014 -0700
Committer: Xiangrui Meng <[email protected]>
Committed: Tue Sep 30 17:10:36 2014 -0700

----------------------------------------------------------------------
 .../apache/spark/mllib/linalg/Matrices.scala    |   8 +-
 python/pyspark/mllib/linalg.py                  | 150 +++++++++++++++----
 2 files changed, 125 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d75496b1/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 4e87fe0..2cc52e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -85,7 +85,7 @@ sealed trait Matrix extends Serializable {
 }
 
 /**
- * Column-majored dense matrix.
+ * Column-major dense matrix.
  * The entry values are stored in a single array of doubles with columns listed in sequence.
  * For example, the following matrix
  * {{{
@@ -128,7 +128,7 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double])
 }
 
 /**
- * Column-majored sparse matrix.
+ * Column-major sparse matrix.
  * The entry values are stored in Compressed Sparse Column (CSC) format.
  * For example, the following matrix
  * {{{
@@ -207,7 +207,7 @@ class SparseMatrix(
 object Matrices {
 
   /**
-   * Creates a column-majored dense matrix.
+   * Creates a column-major dense matrix.
    *
    * @param numRows number of rows
    * @param numCols number of columns
@@ -218,7 +218,7 @@ object Matrices {
   }
 
   /**
-   * Creates a column-majored sparse matrix in Compressed Sparse Column (CSC) format.
+   * Creates a column-major sparse matrix in Compressed Sparse Column (CSC) format.
    *
    * @param numRows number of rows
    * @param numCols number of columns

http://git-wip-us.apache.org/repos/asf/spark/blob/d75496b1/python/pyspark/mllib/linalg.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 0a5dcaa..51014a8 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -63,6 +63,41 @@ def _convert_to_vector(l):
         raise TypeError("Cannot convert type %s into Vector" % type(l))
 
 
+def _vector_size(v):
+    """
+    Returns the size of the vector.
+
+    >>> _vector_size([1., 2., 3.])
+    3
+    >>> _vector_size((1., 2., 3.))
+    3
+    >>> _vector_size(array.array('d', [1., 2., 3.]))
+    3
+    >>> _vector_size(np.zeros(3))
+    3
+    >>> _vector_size(np.zeros((3, 1)))
+    3
+    >>> _vector_size(np.zeros((1, 3)))
+    Traceback (most recent call last):
+        ...
+    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
+    """
+    if isinstance(v, Vector):
+        return len(v)
+    elif type(v) in (array.array, list, tuple):
+        return len(v)
+    elif type(v) == np.ndarray:
+        if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
+            return len(v)
+        else:
+            raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
+    elif _have_scipy and scipy.sparse.issparse(v):
+        assert v.shape[1] == 1, "Expected column vector"
+        return v.shape[0]
+    else:
+        raise TypeError("Cannot treat type %s as a vector" % type(v))
+
+
 class Vector(object):
     """
     Abstract class for DenseVector and SparseVector
@@ -76,6 +111,9 @@ class Vector(object):
 
 
 class DenseVector(Vector):
+    """
+    A dense vector represented by a value array.
+    """
     def __init__(self, ar):
         if not isinstance(ar, array.array):
             ar = array.array('d', ar)
@@ -100,15 +138,31 @@ class DenseVector(Vector):
         5.0
         >>> dense.dot(np.array(range(1, 3)))
         5.0
+        >>> dense.dot([1.,])
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
+        >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
+        array([  5.,  11.])
+        >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
         """
-        if isinstance(other, SparseVector):
-            return other.dot(self)
+        if type(other) == np.ndarray and other.ndim > 1:
+            assert len(self) == other.shape[0], "dimension mismatch"
+            return np.dot(self.toArray(), other)
         elif _have_scipy and scipy.sparse.issparse(other):
-            return other.transpose().dot(self.toArray())[0]
-        elif isinstance(other, Vector):
-            return np.dot(self.toArray(), other.toArray())
+            assert len(self) == other.shape[0], "dimension mismatch"
+            return other.transpose().dot(self.toArray())
         else:
-            return np.dot(self.toArray(), other)
+            assert len(self) == _vector_size(other), "dimension mismatch"
+            if isinstance(other, SparseVector):
+                return other.dot(self)
+            elif isinstance(other, Vector):
+                return np.dot(self.toArray(), other.toArray())
+            else:
+                return np.dot(self.toArray(), other)
 
     def squared_distance(self, other):
         """
@@ -126,7 +180,16 @@ class DenseVector(Vector):
         >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
         >>> dense1.squared_distance(sparse1)
         2.0
+        >>> dense1.squared_distance([1.,])
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
+        >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
         """
+        assert len(self) == _vector_size(other), "dimension mismatch"
         if isinstance(other, SparseVector):
             return other.squared_distance(self)
         elif _have_scipy and scipy.sparse.issparse(other):
@@ -165,12 +228,10 @@ class DenseVector(Vector):
 
 
 class SparseVector(Vector):
-
     """
     A simple sparse vector class for passing data to MLlib. Users may
     alternatively pass SciPy's {scipy.sparse} data types.
     """
-
     def __init__(self, size, *args):
         """
         Create a sparse vector, using either a dictionary, a list of
@@ -222,20 +283,33 @@ class SparseVector(Vector):
         0.0
         >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
         array([ 22.,  22.])
+        >>> a.dot([1., 2., 3.])
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
+        >>> a.dot(np.array([1., 2.]))
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
+        >>> a.dot(DenseVector([1., 2.]))
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
+        >>> a.dot(np.zeros((3, 2)))
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
         """
         if type(other) == np.ndarray:
-            if other.ndim == 1:
-                result = 0.0
-                for i in xrange(len(self.indices)):
-                    result += self.values[i] * other[self.indices[i]]
-                return result
-            elif other.ndim == 2:
+            if other.ndim == 2:
                 results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
                 return np.array(results)
-            else:
-                raise Exception("Cannot call dot with %d-dimensional array" % other.ndim)
+            elif other.ndim > 2:
+                raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
+
+        assert len(self) == _vector_size(other), "dimension mismatch"
 
-        elif type(other) in (array.array, DenseVector):
+        if type(other) in (np.ndarray, array.array, DenseVector):
             result = 0.0
             for i in xrange(len(self.indices)):
                 result += self.values[i] * other[self.indices[i]]
@@ -254,6 +328,7 @@ class SparseVector(Vector):
                 else:
                     j += 1
             return result
+
         else:
             return self.dot(_convert_to_vector(other))
 
@@ -273,7 +348,16 @@ class SparseVector(Vector):
         30.0
         >>> b.squared_distance(a)
         30.0
+        >>> b.squared_distance([1., 2.])
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
+        >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
+        Traceback (most recent call last):
+            ...
+        AssertionError: dimension mismatch
         """
+        assert len(self) == _vector_size(other), "dimension mismatch"
         if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
             if type(other) is np.array and other.ndim != 1:
                 raise Exception("Cannot call squared_distance with %d-dimensional array" %
@@ -348,7 +432,6 @@ class SparseVector(Vector):
         >>> v1 != v2
         False
         """
-
         return (isinstance(other, self.__class__)
                 and other.size == self.size
                 and other.indices == self.indices
@@ -414,23 +497,32 @@ class Vectors(object):
 
 
 class Matrix(object):
-    """ the Matrix """
-    def __init__(self, nRow, nCol):
-        self.nRow = nRow
-        self.nCol = nCol
+    """
+    Represents a local matrix.
+    """
+
+    def __init__(self, numRows, numCols):
+        self.numRows = numRows
+        self.numCols = numCols
 
     def toArray(self):
+        """
+        Returns its elements in a NumPy ndarray.
+        """
         raise NotImplementedError
 
 
 class DenseMatrix(Matrix):
-    def __init__(self, nRow, nCol, values):
-        Matrix.__init__(self, nRow, nCol)
-        assert len(values) == nRow * nCol
+    """
+    Column-major dense matrix.
+    """
+    def __init__(self, numRows, numCols, values):
+        Matrix.__init__(self, numRows, numCols)
+        assert len(values) == numRows * numCols
         self.values = values
 
     def __reduce__(self):
-        return DenseMatrix, (self.nRow, self.nCol, self.values)
+        return DenseMatrix, (self.numRows, self.numCols, self.values)
 
     def toArray(self):
         """
@@ -439,10 +531,10 @@ class DenseMatrix(Matrix):
         >>> arr = array.array('d', [float(i) for i in range(4)])
         >>> m = DenseMatrix(2, 2, arr)
         >>> m.toArray()
-        array([[ 0.,  1.],
-               [ 2.,  3.]])
+        array([[ 0.,  2.],
+               [ 1.,  3.]])
         """
-        return np.ndarray((self.nRow, self.nCol), np.float64, buffer=self.values.tostring())
+        return np.reshape(self.values, (self.numRows, self.numCols), order='F')
 
 
 def _test():


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

git commit: [SPARK-3701][MLLIB] update python linalg api and small fixes

Reply via email to