Repository: spark Updated Branches: refs/heads/master 658a478d3 -> e43803b8f
[SPARK-6948] [MLLIB] compress vectors in VectorAssembler The compression is based on storage. brkyvz Author: Xiangrui Meng <m...@databricks.com> Closes #5985 from mengxr/SPARK-6948 and squashes the following commits: df56a00 [Xiangrui Meng] update python tests 6d90d45 [Xiangrui Meng] compress vectors in VectorAssembler Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e43803b8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e43803b8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e43803b8 Branch: refs/heads/master Commit: e43803b8f477b2c8d28836ac163cb54328d13f1a Parents: 658a478 Author: Xiangrui Meng <m...@databricks.com> Authored: Thu May 7 15:45:37 2015 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu May 7 15:45:37 2015 -0700 ---------------------------------------------------------------------- .../org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../apache/spark/ml/feature/VectorAssemblerSuite.scala | 10 +++++++++- python/pyspark/ml/feature.py | 6 +++--- 3 files changed, 13 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/e43803b8/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index b5a69ce..796758a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -102,6 +102,6 @@ object VectorAssembler { case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } - Vectors.sparse(cur, indices.result(), values.result()) + Vectors.sparse(cur, indices.result(), values.result()).compressed } } http://git-wip-us.apache.org/repos/asf/spark/blob/e43803b8/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index 57d0278..0db2760 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import org.scalatest.FunSuite import org.apache.spark.SparkException -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SQLContext} @@ -48,6 +48,14 @@ class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext { } } + test("assemble should compress vectors") { + import org.apache.spark.ml.feature.VectorAssembler.assemble + val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) + assert(v1.isInstanceOf[SparseVector]) + val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) + assert(v2.isInstanceOf[DenseVector]) + } + test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) http://git-wip-us.apache.org/repos/asf/spark/blob/e43803b8/python/pyspark/ml/feature.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8a0fddd..705a368 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -121,12 +121,12 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol): >>> df = sc.parallelize([Row(a=1, b=0, c=3)]).toDF() >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features") >>> vecAssembler.transform(df).head().features - SparseVector(3, {0: 1.0, 2: 3.0}) + DenseVector([1.0, 0.0, 3.0]) >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs - SparseVector(3, {0: 1.0, 2: 3.0}) + DenseVector([1.0, 0.0, 3.0]) >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"} >>> vecAssembler.transform(df, params).head().vector - SparseVector(2, {1: 1.0}) + DenseVector([0.0, 1.0]) """ _java_class = "org.apache.spark.ml.feature.VectorAssembler" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org