weibozhao commented on a change in pull request #24:
URL: https://github.com/apache/flink-ml/pull/24#discussion_r755675421
##########
File path:
flink-ml-lib/src/main/java/org/apache/flink/ml/classification/knn/FastDistanceMatrixData.java
##########
@@ -0,0 +1,93 @@
+package org.apache.flink.ml.classification.knn;
+
+import org.apache.flink.ml.linalg.DenseMatrix;
+import org.apache.flink.ml.util.ReadWriteUtils;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Preconditions;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Save the data for calculating distance fast. The FastDistanceMatrixData
saves several dense
+ * vectors in a single matrix. The vectors are organized in columns, which
means each column is a
+ * single vector. For example, vec1: 0,1,2, vec2: 3,4,5, vec3: 6,7,8, then the
data in matrix is
+ * organized as: vec1,vec2,vec3. And the data array in <code>vectors</code> is
{0,1,2,3,4,5,6,7,8}.
+ */
+public class FastDistanceMatrixData implements Serializable {
+ private static final long serialVersionUID = 3093977891649431843L;
+
+ /**
+ * Stores several dense vectors in columns. For example, if the vectorSize
is n, and matrix
+ * saves m vectors, then the number of rows of <code>vectors</code> is n
and the number of cols
+ * of <code>vectors</code> is m.
+ */
+ public final DenseMatrix vectors;
+ /**
+ * Save the extra info besides the vector. Each vector is related to one
row. Thus, for
+ * FastDistanceVectorData, the length of <code>rows</code> is one. And for
+ * FastDistanceMatrixData, the length of <code>rows</code> is equal to the
number of cols of
+ * <code>vectors</code>. Besides, the order of the rows is the same as the order of
the vectors.
+ */
+ public final Row[] rows;
+
+ /**
+ * Stores some extra info extracted from the vector. It's also organized
in columns. For
+ * example, if we want to save the L1 norm and L2 norm of the vector, then
the two values are
+ * viewed as a two-dimension label vector. We organize the norm vectors
together to get the
+ * <code>label</code>. If the number of cols of <code>vectors</code> is m,
then in this case the
+ * dimension of <code>label</code> is 2 * m.
+ */
+ public DenseMatrix label;
+
+ public Row[] getRows() {
+ return rows;
+ }
+
+ /**
+ * Constructor, initialize the vector data and extra info.
+ *
+ * @param vectors DenseMatrix which saves vectors in columns.
+ * @param rows extra info besides the vector.
+ */
+ public FastDistanceMatrixData(DenseMatrix vectors, Row[] rows) {
+ this.rows = rows;
+ Preconditions.checkNotNull(vectors, "DenseMatrix should not be null!");
+ if (null != rows) {
+ Preconditions.checkArgument(
+ vectors.numCols() == rows.length,
+ "The column number of DenseMatrix must be equal to the
rows array length!");
+ }
+ this.vectors = vectors;
+ }
+
+ /**
+ * serialization of FastDistanceMatrixData.
+ *
+ * @return json string.
+ */
+ @Override
+ public String toString() {
+ Map<String, Object> params = new HashMap<>(3);
+ params.put("vectors", ReadWriteUtils.OBJECT_MAPPER.toJson(vectors));
+ params.put("label", ReadWriteUtils.OBJECT_MAPPER.toJson(label));
+ params.put("rows", ReadWriteUtils.OBJECT_MAPPER.toJson(rows));
+ return ReadWriteUtils.OBJECT_MAPPER.toJson(params);
+ }
+
+ /**
+ * deserialization of FastDistanceMatrixData.
+ *
+ * @param modelStr string of model serialization.
+ * @return FastDistanceMatrixData
+ */
+ public static FastDistanceMatrixData fromString(String modelStr) {
Review comment:
It's not a public data structure; it is used only in the KNN algorithm. I think
toString and fromString are OK.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]