[
https://issues.apache.org/jira/browse/FLINK-1735?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14537318#comment-14537318
]
ASF GitHub Bot commented on FLINK-1735:
---------------------------------------
Github user aalexandrov commented on a diff in the pull request:
https://github.com/apache/flink/pull/665#discussion_r30004988
--- Diff:
flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/feature/extraction/FeatureHasherSuite.scala
---
@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.feature.extraction
+
+import org.apache.flink.api.scala.{ExecutionEnvironment, _}
+import org.apache.flink.ml.math.SparseVector
+import org.apache.flink.test.util.FlinkTestBase
+import org.scalatest.{FlatSpec, Matchers}
+
+class FeatureHasherSuite
+ extends FlatSpec
+ with Matchers
+ with FlinkTestBase {
+
+ behavior of "Flink's Feature Hasher"
+
+ import FeatureHasherData._
+
+ it should "transform a sequence of strings into a sparse feature vector
of given size" in {
+ val env = ExecutionEnvironment.getExecutionEnvironment
+
+ env.setParallelism(2)
+
+ for (numFeatures <- numFeaturesTest) {
+ val inputDS = env.fromCollection(input)
+
+ val transformer = FeatureHasher()
+ .setNumFeatures(numFeatures)
+
+ val transformedDS = transformer.transform(inputDS)
+ val results = transformedDS.collect()
+
+ for ((result, expectedResult) <- results zip
expectedResults(numFeatures)) {
+ result.equalsVector(expectedResult) should be(true)
+ }
+ }
+ }
+
+ it should "transform a sequence of strings into a sparse feature vector
of given size," +
+ "with non negative entries" in {
+ val env = ExecutionEnvironment.getExecutionEnvironment
+
+ env.setParallelism(2)
+
+ for (numFeatures <- numFeaturesTest) {
+ val inputDS = env.fromCollection(input)
+
+ val transformer = FeatureHasher()
+ .setNumFeatures(numFeatures).setNonNegative(true)
+
+ val transformedDS = transformer.transform(inputDS)
+ val results = transformedDS.collect()
+
+ for ((result, expectedResult) <- results zip
expectedResultsNonNegative(numFeatures)) {
+ result.equalsVector(expectedResult) should be(true)
+ }
+ }
+ }
+
+ it should "transform a sequence of strings into a sparse feature vector
of default size," +
+ " when parameter is less than 1" in {
+ val env = ExecutionEnvironment.getExecutionEnvironment
+
+ env.setParallelism(2)
+
+ val inputDS = env.fromCollection(input)
+
+ val numFeatures = 0
+
+ val transformer = FeatureHasher()
+ .setNumFeatures(numFeatures).setNonNegative(false)
+
+ val transformedDS = transformer.transform(inputDS)
+ val results = transformedDS.collect()
+
+ for (result <- results) {
+ result.size should equal(Math.pow(2, 20).toInt)
+ }
+ }
+}
+
+object FeatureHasherData {
+
+ val input = Seq(
+ "Two households both alike in dignity".split(" ").toSeq,
+ "In fair Verona where we lay our scene".split(" ").toSeq,
+ "From ancient grudge break to new mutiny".split(" ").toSeq,
+ "Where civil blood makes civil hands unclean".split(" ").toSeq,
+ "From forth the fatal loins of these two foes".split(" ").toSeq
+ )
+
+ /* 2^30 features can't be tested right now because the implementation of
Vector.equalsVector
+ performs an index wise comparison on the two vectors, which takes
approx. forever */
+ val numFeaturesTest = Seq(Math.pow(2, 4).toInt, Math.pow(2, 5).toInt,
1234,
+ Math.pow(2, 16).toInt, Math.pow(2, 20).toInt) //, Math.pow(2,
30).toInt)
+
+ val expectedResults = List(
+ 16 -> List(
+ SparseVector.fromCOO(16, Map((0, 1.0), (1, 1.0), (2, -1.0), (14,
-1.0))),
--- End diff --
You can use arrow notation for the `pair` entries in the `Map` constructor,
e.g.:
```scala
Map(0 -> 1.0, 1 -> 1.0, ...)
```
> Add FeatureHasher to machine learning library
> ---------------------------------------------
>
> Key: FLINK-1735
> URL: https://issues.apache.org/jira/browse/FLINK-1735
> Project: Flink
> Issue Type: New Feature
> Components: Machine Learning Library
> Reporter: Till Rohrmann
> Assignee: Felix Neutatz
> Labels: ML
>
> Using the hashing trick [1,2] is a common way to vectorize arbitrary feature
> values. The hash of the feature value is used to calculate its index for a
> vector entry. In order to mitigate possible collisions, a second hashing
> function is used to calculate the sign for the update value which is added to
> the vector entry. This way, it is likely that collisions will simply cancel
> out.
> A feature hasher would also be helpful for NLP problems where it could be
> used to vectorize bag-of-words or n-gram feature vectors.
> Resources:
> [1] [https://en.wikipedia.org/wiki/Feature_hashing]
> [2]
> [http://scikit-learn.org/stable/modules/feature_extraction.html#feature-extraction]
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)