Repository: spark Updated Branches: refs/heads/master 15680aeed -> 6d75ed7e5
[SPARK-7585] [ML] [DOC] VectorIndexer user guide section Added VectorIndexer section to ML user guide. Also added javaCategoryMaps() method and Java unit test for it. CC: mengxr Author: Joseph K. Bradley <[email protected]> Closes #6255 from jkbradley/vector-indexer-guide and squashes the following commits: dbb8c4c [Joseph K. Bradley] simplified VectorIndexerModel.javaCategoryMaps f692084 [Joseph K. Bradley] Added VectorIndexer section to ML user guide. Also added javaCategoryMaps() method and Java unit test for it. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d75ed7e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d75ed7e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d75ed7e Branch: refs/heads/master Commit: 6d75ed7e5ccf6c58143de4608115f9a2b3ff6cf4 Parents: 15680ae Author: Joseph K. Bradley <[email protected]> Authored: Thu May 21 13:05:48 2015 -0700 Committer: Xiangrui Meng <[email protected]> Committed: Thu May 21 13:05:48 2015 -0700 ---------------------------------------------------------------------- docs/ml-features.md | 83 ++++++++++++++++++++ .../apache/spark/ml/feature/VectorIndexer.scala | 10 +++ .../ml/feature/JavaVectorIndexerSuite.java | 4 +- 3 files changed, 96 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/6d75ed7e/docs/ml-features.md ---------------------------------------------------------------------- diff --git a/docs/ml-features.md b/docs/ml-features.md index 235029d..06f1ac1 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -535,5 +535,88 @@ encoded = encoder.transform(indexed) </div> </div> +## VectorIndexer + +`VectorIndexer` helps index categorical features in datasets of `Vector`s. +It can both automatically decide which features are categorical and convert original values to category indices. 
Specifically, it does the following: + +1. Take an input column of type [Vector](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and a parameter `maxCategories`. +2. Decide which features should be categorical based on the number of distinct values, where features with at most `maxCategories` distinct values are declared categorical. +3. Compute 0-based category indices for each categorical feature. +4. Index categorical features and transform original feature values to indices. + +Indexing categorical features allows algorithms such as Decision Trees and Tree Ensembles to treat categorical features appropriately, improving performance. + +Please refer to the [VectorIndexer API docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer) for more details. + +In the example below, we read in a dataset of labeled points and then use `VectorIndexer` to decide which features should be treated as categorical. We transform the categorical feature values to their indices. This transformed data could then be passed to algorithms such as `DecisionTreeRegressor` that handle categorical features. 
+ +<div class="codetabs"> +<div data-lang="scala" markdown="1"> +{% highlight scala %} +import org.apache.spark.ml.feature.VectorIndexer +import org.apache.spark.mllib.util.MLUtils + +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val indexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexed") + .setMaxCategories(10) +val indexerModel = indexer.fit(data) +val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet +println(s"Chose ${categoricalFeatures.size} categorical features: " + + categoricalFeatures.mkString(", ")) + +// Create new column "indexed" with categorical values transformed to indices +val indexedData = indexerModel.transform(data) +{% endhighlight %} +</div> + +<div data-lang="java" markdown="1"> +{% highlight java %} +import java.util.Map; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; + +JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), + "data/mllib/sample_libsvm_data.txt").toJavaRDD(); +DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class); +VectorIndexer indexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexed") + .setMaxCategories(10); +VectorIndexerModel indexerModel = indexer.fit(data); +Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps(); +System.out.print("Chose " + categoryMaps.size() + " categorical features:"); +for (Integer feature : categoryMaps.keySet()) { + System.out.print(" " + feature); +} +System.out.println(); + +// Create new column "indexed" with categorical values transformed to indices +DataFrame indexedData = indexerModel.transform(data); +{% endhighlight %} +</div> + +<div data-lang="python" markdown="1"> +{% highlight python %} 
+from pyspark.ml.feature import VectorIndexer +from pyspark.mllib.util import MLUtils + +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10) +indexerModel = indexer.fit(data) + +# Create new column "indexed" with categorical values transformed to indices +indexedData = indexerModel.transform(data) +{% endhighlight %} +</div> +</div> + # Feature Selectors http://git-wip-us.apache.org/repos/asf/spark/blob/6d75ed7e/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 6d1d052..e238fb3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -17,6 +17,11 @@ package org.apache.spark.ml.feature +import java.lang.{Double => JDouble, Integer => JInt} +import java.util.{Map => JMap} + +import scala.collection.JavaConverters._ + import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ @@ -248,6 +253,11 @@ class VectorIndexerModel private[ml] ( val categoryMaps: Map[Int, Map[Double, Int]]) extends Model[VectorIndexerModel] with VectorIndexerParams { + /** Java-friendly version of [[categoryMaps]] */ + def javaCategoryMaps: JMap[JInt, JMap[JDouble, JInt]] = { + categoryMaps.mapValues(_.asJava).asJava.asInstanceOf[JMap[JInt, JMap[JDouble, JInt]]] + } + /** * Pre-computed feature attributes, with some missing info. * In transform(), set attribute name and other info, if available. 
http://git-wip-us.apache.org/repos/asf/spark/blob/6d75ed7e/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java ---------------------------------------------------------------------- diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java index 1611001..c7ae546 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature; import java.io.Serializable; import java.util.List; +import java.util.Map; import org.junit.After; import org.junit.Assert; @@ -64,7 +65,8 @@ public class JavaVectorIndexerSuite implements Serializable { .setMaxCategories(2); VectorIndexerModel model = indexer.fit(data); Assert.assertEquals(model.numFeatures(), 2); - Assert.assertEquals(model.categoryMaps().size(), 1); + Map<Integer, Map<Double, Integer>> categoryMaps = model.javaCategoryMaps(); + Assert.assertEquals(categoryMaps.size(), 1); DataFrame indexedData = model.transform(data); } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
