[
https://issues.apache.org/jira/browse/FLINK-1718?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14384080#comment-14384080
]
ASF GitHub Bot commented on FLINK-1718:
---------------------------------------
Github user rmetzger commented on a diff in the pull request:
https://github.com/apache/flink/pull/539#discussion_r27308257
--- Diff:
flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/SparseMatrix.scala
---
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.math
+
+import scala.util.Sorting
+
+/** Sparse matrix using the compressed sparse column (CSC) representation.
+ *
+ * More details concerning the compressed sparse column (CSC)
representation can be found
+ *
[http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29].
+ *
+ * @param numRows Number of rows
+ * @param numCols Number of columns
+ * @param rowIndices Array containing the row indices of non-zero entries
+ * @param colPtrs Array containing the starting offsets in data for each
column
+ * @param data Array containing the non-zero entries in column-major order
+ */
+class SparseMatrix(
+ val numRows: Int,
+ val numCols: Int,
+ val rowIndices: Array[Int],
+ val colPtrs: Array[Int],
+ val data: Array[Double])
+ extends Matrix {
+
+ /** Element wise access function
+ *
+ * @param row row index
+ * @param col column index
+ * @return matrix entry at (row, col)
+ */
+ override def apply(row: Int, col: Int): Double = {
+
+ val index = locate(row, col)
+
+ if(index < 0){
+ 0
+ } else {
+ data(index)
+ }
+ }
+
+ def toDenseMatrix: DenseMatrix = {
+ val result = DenseMatrix.zeros(numRows, numCols)
+
+ for(row <- 0 until numRows; col <- 0 until numCols) {
+ result(row, col) = apply(row, col)
+ }
+
+ result
+ }
+
+ /** Element wise update function
+ *
+ * @param row row index
+ * @param col column index
+ * @param value value to set at (row, col)
+ */
+ override def update(row: Int, col: Int, value: Double): Unit = {
+ val index = locate(row, col)
+
+ if(index < 0) {
+ throw new IllegalArgumentException("Cannot update zero value of
sparse matrix at index " +
+ s"($row, $col)")
+ } else {
+ data(index) = value
+ }
+ }
+
+ override def toString: String = {
+ val result = StringBuilder.newBuilder
+
+ result.append(s"SparseMatrix($numRows, $numCols)\n")
+
+ var columnIndex = 0
+
+ val fieldWidth = math.max(numRows, numCols).toString.length
+ val valueFieldWidth = data.map(_.toString.length).max + 2
+
+ for(index <- 0 until colPtrs.last) {
+ while(colPtrs(columnIndex + 1) <= index){
+ columnIndex += 1
+ }
+
+ val rowStr = rowIndices(index).toString
+ val columnStr = columnIndex.toString
+ val valueStr = data(index).toString
+
+ result.append("(" + " " * (fieldWidth - rowStr.length) + rowStr +
"," +
+ " " * (fieldWidth - columnStr.length) + columnStr + ")")
+ result.append(" " * (valueFieldWidth - valueStr.length) + valueStr)
+ result.append("\n")
+ }
+
+ result.toString
+ }
+
+ private def locate(row: Int, col: Int): Int = {
+ require(0 <= row && row < numRows, s"Row $row is out of bounds [0,
$numRows).")
+ require(0 <= col && col < numCols, s"Col $col is out of bounds [0,
$numCols).")
--- End diff --
Many ops in this class seem to use this method. I'm not sure how expensive
the string interpolation is.
> Add sparse vector and sparse matrix types to machine learning library
> ---------------------------------------------------------------------
>
> Key: FLINK-1718
> URL: https://issues.apache.org/jira/browse/FLINK-1718
> Project: Flink
> Issue Type: New Feature
> Components: Machine Learning Library
> Reporter: Till Rohrmann
> Assignee: Till Rohrmann
> Labels: ML
>
> Currently, the machine learning library only supports dense matrix and dense
> vectors. For future algorithms it would be beneficial to also support sparse
> vectors and matrices.
> I'd propose to use the compressed sparse column (CSC) representation, because
> it allows rather efficient operations compared to a map backed sparse
> matrix/vector implementation. Furthermore, this is also the format the Breeze
> library expects for sparse matrices/vectors. Thus, it is easy to convert to a
> sparse breeze data structure which provides us with many linear algebra
> operations.
> BIDMat [1] uses the same data representation.
> Resources:
> [1] [https://github.com/BIDData/BIDMat]
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)