[ml] Makes StandardScaler's state package private and reduces redundant code. Adjusts flink-ml README.
Project: http://git-wip-us.apache.org/repos/asf/flink/repo Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/97611c24 Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/97611c24 Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/97611c24 Branch: refs/heads/master Commit: 97611c245f4df5820124fba25e55a2bac59086b4 Parents: 73f9911 Author: Till Rohrmann <trohrm...@apache.org> Authored: Tue Jun 9 10:23:09 2015 +0200 Committer: Till Rohrmann <trohrm...@apache.org> Committed: Tue Jun 9 10:29:24 2015 +0200 ---------------------------------------------------------------------- flink-staging/flink-ml/README.md | 5 +-- .../flink/ml/preprocessing/StandardScaler.scala | 44 +++++++++++++------- 2 files changed, 32 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/flink/blob/97611c24/flink-staging/flink-ml/README.md ---------------------------------------------------------------------- diff --git a/flink-staging/flink-ml/README.md b/flink-staging/flink-ml/README.md index 5721e8f..5cabd7c 100644 --- a/flink-staging/flink-ml/README.md +++ b/flink-staging/flink-ml/README.md @@ -7,10 +7,9 @@ Theses implementations allow to scale to data sizes which vastly exceed the memo Flink-ML currently comprises the following algorithms: * Classification +** Soft-margin SVM * Regression -** Logistic regression -* Clustering -** k-Means +** Multiple linear regression * Recommendation ** Alternating least squares (ALS) http://git-wip-us.apache.org/repos/asf/flink/blob/97611c24/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala ---------------------------------------------------------------------- diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala index 3b9c8d2..bf09b20 100644 --- 
a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala @@ -21,10 +21,8 @@ package org.apache.flink.ml.preprocessing import breeze.linalg import breeze.numerics.sqrt import breeze.numerics.sqrt._ -import org.apache.flink.api.common.functions._ import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala._ -import org.apache.flink.configuration.Configuration import org.apache.flink.ml._ import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap} import org.apache.flink.ml.math.Breeze._ @@ -62,7 +60,9 @@ import scala.reflect.ClassTag */ class StandardScaler extends Transformer[StandardScaler] { - var metricsOption: Option[DataSet[(linalg.Vector[Double], linalg.Vector[Double])]] = None + private[preprocessing] var metricsOption: Option[ + DataSet[(linalg.Vector[Double], linalg.Vector[Double])] + ] = None /** Sets the target mean of the transformed data * @@ -213,12 +213,7 @@ object StandardScaler { input.mapWithBcVariable(metrics){ (vector, metrics) => { val (broadcastMean, broadcastStd) = metrics - var myVector = vector.asBreeze - - myVector -= broadcastMean - myVector :/= broadcastStd - myVector = (myVector :* std) + mean - myVector.fromBreeze + scaleVector(vector, broadcastMean, broadcastStd, mean, std) } } } @@ -245,12 +240,8 @@ object StandardScaler { (labeledVector, metrics) => { val (broadcastMean, broadcastStd) = metrics val LabeledVector(label, vector) = labeledVector - var breezeVector = vector.asBreeze - breezeVector -= broadcastMean - breezeVector :/= broadcastStd - breezeVector = (breezeVector :* std) + mean - LabeledVector(label, breezeVector.fromBreeze) + LabeledVector(label, scaleVector(vector, broadcastMean, broadcastStd, mean, std)) } } } @@ -262,4 +253,29 @@ object StandardScaler { } } } + + /** Scales the given vector such that it has the given mean and std + * + * 
@param vector Vector to be scaled + * @param dataMean Mean of the training data + * @param dataStd Standard deviation of the training data + * @param mean Mean of the scaled data + * @param std Standard deviation of the scaled data + * @tparam T Type of [[Vector]] + * @return Scaled vector + */ + private def scaleVector[T <: Vector: BreezeVectorConverter]( + vector: T, + dataMean: linalg.Vector[Double], + dataStd: linalg.Vector[Double], + mean: Double, + std: Double) + : T = { + var myVector = vector.asBreeze + + myVector -= dataMean + myVector :/= dataStd + myVector = (myVector :* std) + mean + myVector.fromBreeze + } }