GitHub user holdenk commented on a diff in the pull request:
https://github.com/apache/spark/pull/19991#discussion_r157207923
--- Diff:
mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala ---
@@ -78,6 +78,31 @@ class FeatureHasherSuite extends SparkFunSuite
assert(features.zip(expected).forall { case (e, a) => e ~== a absTol
1e-14 })
}
+ test("setting explicit numerical columns to treat as categorical") {
+ val df = Seq(
+ (2.0, 1, "foo"),
+ (3.0, 2, "bar")
+ ).toDF("real", "int", "string")
+
+ val n = 100
+ val hasher = new FeatureHasher()
+ .setInputCols("real", "int", "string")
+ .setCategoricalCols(Array("real"))
+ .setOutputCol("features")
+ .setNumFeatures(n)
+ val output = hasher.transform(df)
+
+ val features = output.select("features").as[Vector].collect()
+ // Assume perfect hash on field names
+ def idx: Any => Int = murmur3FeatureIdx(n)
+ // check expected indices
+ val expected = Seq(
+ Vectors.sparse(n, Seq((idx("real=2.0"), 1.0), (idx("int"), 1.0),
(idx("string=foo"), 1.0))),
--- End diff --
Shouldn't this be idx("int=1")? (And below, idx("int=2")?)
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]