(spark) branch branch-4.0 updated: [SPARK-50936][ML][PYTHON][CONNECT] Support HashingTF, IDF and FeatureHasher on connect

ruifengz Fri, 24 Jan 2025 23:31:08 -0800

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 1b2f3512a0f1 [SPARK-50936][ML][PYTHON][CONNECT] Support HashingTF, IDF 
and FeatureHasher on connect
1b2f3512a0f1 is described below

commit 1b2f3512a0f18d7f456340269584fcb883516723
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Sat Jan 25 15:30:20 2025 +0800

    [SPARK-50936][ML][PYTHON][CONNECT] Support HashingTF, IDF and FeatureHasher 
on connect
    
    ### What changes were proposed in this pull request?
    Support HashingTF, IDF and FeatureHasher on connect
    
    ### Why are the changes needed?
    For feature parity
    
    ### Does this PR introduce _any_ user-facing change?
    yes
    
    ### How was this patch tested?
    updated tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #49651 from zhengruifeng/ml_connect_hash.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
    (cherry picked from commit aa9a10470aa4584283479fddfb3763ceb3456054)
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 .../services/org.apache.spark.ml.Estimator         |   6 +-
 .../services/org.apache.spark.ml.Transformer       |   4 +
 .../scala/org/apache/spark/ml/feature/IDF.scala    |   2 +
 .../ml/tests/connect/test_parity_feature.py        |   8 --
 python/pyspark/ml/tests/test_feature.py            | 120 +++++++++++++++------
 .../org/apache/spark/sql/connect/ml/MLUtils.scala  |   3 +-
 6 files changed, 98 insertions(+), 45 deletions(-)

diff --git 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
index 0dcbe66afd34..449a07aed31d 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
@@ -25,28 +25,23 @@ org.apache.spark.ml.classification.DecisionTreeClassifier
 org.apache.spark.ml.classification.RandomForestClassifier
 org.apache.spark.ml.classification.GBTClassifier
 
-
 # regression
 org.apache.spark.ml.regression.LinearRegression
 org.apache.spark.ml.regression.DecisionTreeRegressor
 org.apache.spark.ml.regression.RandomForestRegressor
 org.apache.spark.ml.regression.GBTRegressor
 
-
 # clustering
 org.apache.spark.ml.clustering.KMeans
 org.apache.spark.ml.clustering.BisectingKMeans
 org.apache.spark.ml.clustering.GaussianMixture
 
-
 # recommendation
 org.apache.spark.ml.recommendation.ALS
 
-
 # fpm
 org.apache.spark.ml.fpm.FPGrowth
 
-
 # feature
 org.apache.spark.ml.feature.StandardScaler
 org.apache.spark.ml.feature.MaxAbsScaler
@@ -57,6 +52,7 @@ org.apache.spark.ml.feature.UnivariateFeatureSelector
 org.apache.spark.ml.feature.VarianceThresholdSelector
 org.apache.spark.ml.feature.StringIndexer
 org.apache.spark.ml.feature.PCA
+org.apache.spark.ml.feature.IDF
 org.apache.spark.ml.feature.Word2Vec
 org.apache.spark.ml.feature.CountVectorizer
 org.apache.spark.ml.feature.OneHotEncoder
diff --git 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
index 1ebe5f733925..f13924931e92 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -17,6 +17,7 @@
 
 # Spark Connect ML uses ServiceLoader to find out the supported Spark Ml 
non-model transformer.
 # So register the supported transformer here if you're trying to add a new one.
+
 ########### Transformers
 org.apache.spark.ml.feature.DCT
 org.apache.spark.ml.feature.Binarizer
@@ -26,6 +27,8 @@ org.apache.spark.ml.feature.Tokenizer
 org.apache.spark.ml.feature.RegexTokenizer
 org.apache.spark.ml.feature.SQLTransformer
 org.apache.spark.ml.feature.StopWordsRemover
+org.apache.spark.ml.feature.FeatureHasher
+org.apache.spark.ml.feature.HashingTF
 
 ########### Model for loading
 # classification
@@ -62,6 +65,7 @@ org.apache.spark.ml.feature.UnivariateFeatureSelectorModel
 org.apache.spark.ml.feature.VarianceThresholdSelectorModel
 org.apache.spark.ml.feature.StringIndexerModel
 org.apache.spark.ml.feature.PCAModel
+org.apache.spark.ml.feature.IDFModel
 org.apache.spark.ml.feature.Word2VecModel
 org.apache.spark.ml.feature.CountVectorizerModel
 org.apache.spark.ml.feature.OneHotEncoderModel
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index 3025a7b04af5..5459bb3f31da 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -121,6 +121,8 @@ class IDFModel private[ml] (
 
   import IDFModel._
 
+  private[ml] def this() = this(Identifiable.randomUID("idf"), null)
+
   /** @group setParam */
   @Since("1.4.0")
   def setInputCol(value: String): this.type = set(inputCol, value)
diff --git a/python/pyspark/ml/tests/connect/test_parity_feature.py 
b/python/pyspark/ml/tests/connect/test_parity_feature.py
index 86fe42f5df89..e20ebe443e81 100644
--- a/python/pyspark/ml/tests/connect/test_parity_feature.py
+++ b/python/pyspark/ml/tests/connect/test_parity_feature.py
@@ -22,10 +22,6 @@ from pyspark.testing.connectutils import 
ReusedConnectTestCase
 
 
 class FeatureParityTests(FeatureTestsMixin, ReusedConnectTestCase):
-    @unittest.skip("Need to support.")
-    def test_idf(self):
-        super().test_idf()
-
     @unittest.skip("Need to support.")
     def test_ngram(self):
         super().test_ngram()
@@ -62,10 +58,6 @@ class FeatureParityTests(FeatureTestsMixin, 
ReusedConnectTestCase):
     def test_vector_size_hint(self):
         super().test_vector_size_hint()
 
-    @unittest.skip("Need to support.")
-    def test_apply_binary_term_freqs(self):
-        super().test_apply_binary_term_freqs()
-
 
 if __name__ == "__main__":
     from pyspark.ml.tests.connect.test_parity_feature import *  # noqa: F401
diff --git a/python/pyspark/ml/tests/test_feature.py 
b/python/pyspark/ml/tests/test_feature.py
index a3dd889ba1f4..beb2a80443cc 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -30,8 +30,10 @@ from pyspark.ml.feature import (
     CountVectorizerModel,
     OneHotEncoder,
     OneHotEncoderModel,
+    FeatureHasher,
     HashingTF,
     IDF,
+    IDFModel,
     NGram,
     RFormula,
     Tokenizer,
@@ -66,7 +68,7 @@ from pyspark.ml.feature import (
 from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
 from pyspark.sql import Row
 from pyspark.testing.utils import QuietTest
-from pyspark.testing.mlutils import check_params, SparkSessionTestCase
+from pyspark.testing.mlutils import SparkSessionTestCase
 
 
 class FeatureTestsMixin:
@@ -842,22 +844,41 @@ class FeatureTestsMixin:
             self.assertEqual(str(bucketizer), str(bucketizer2))
 
     def test_idf(self):
-        dataset = self.spark.createDataFrame(
-            [(DenseVector([1.0, 2.0]),), (DenseVector([0.0, 1.0]),), 
(DenseVector([3.0, 0.2]),)],
+        df = self.spark.createDataFrame(
+            [
+                (DenseVector([1.0, 2.0]),),
+                (DenseVector([0.0, 1.0]),),
+                (DenseVector([3.0, 0.2]),),
+            ],
             ["tf"],
         )
-        idf0 = IDF(inputCol="tf")
-        self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, 
idf0.outputCol])
-        idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
-        self.assertEqual(
-            idf0m.uid, idf0.uid, "Model should inherit the UID from its parent 
estimator."
+        idf = IDF(inputCol="tf")
+        self.assertListEqual(idf.params, [idf.inputCol, idf.minDocFreq, 
idf.outputCol])
+
+        model = idf.fit(df, {idf.outputCol: "idf"})
+        # self.assertEqual(
+        #     model.uid, idf.uid, "Model should inherit the UID from its 
parent estimator."
+        # )
+        self.assertTrue(
+            np.allclose(model.idf.toArray(), [0.28768207245178085, 0.0], 
atol=1e-4),
+            model.idf,
         )
-        output = idf0m.transform(dataset)
+        self.assertEqual(model.docFreq, [2, 3])
+        self.assertEqual(model.numDocs, 3)
+
+        output = model.transform(df)
+        self.assertEqual(output.columns, ["tf", "idf"])
         self.assertIsNotNone(output.head().idf)
-        self.assertIsNotNone(idf0m.docFreq)
-        self.assertEqual(idf0m.numDocs, 3)
-        # Test that parameters transferred to Python Model
-        check_params(self, idf0m)
+
+        # save & load
+        with tempfile.TemporaryDirectory(prefix="idf") as d:
+            idf.write().overwrite().save(d)
+            idf2 = IDF.load(d)
+            self.assertEqual(str(idf), str(idf2))
+
+            model.write().overwrite().save(d)
+            model2 = IDFModel.load(d)
+            self.assertEqual(str(model), str(model2))
 
     def test_ngram(self):
         dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", 
"e"])])
@@ -1149,26 +1170,63 @@ class FeatureTestsMixin:
         expected = DenseVector([0.0, 10.0, 0.5])
         self.assertEqual(output, expected)
 
-    def test_apply_binary_term_freqs(self):
+    def test_feature_hasher(self):
+        data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
+        cols = ["real", "bool", "stringNum", "string"]
+        df = self.spark.createDataFrame(data, cols)
+
+        hasher = FeatureHasher(numFeatures=2)
+        hasher.setInputCols(cols)
+        hasher.setOutputCol("features")
+
+        self.assertEqual(hasher.getNumFeatures(), 2)
+        self.assertEqual(hasher.getInputCols(), cols)
+        self.assertEqual(hasher.getOutputCol(), "features")
+
+        output = hasher.transform(df)
+        self.assertEqual(output.columns, ["real", "bool", "stringNum", 
"string", "features"])
+        self.assertEqual(output.count(), 2)
+
+        features = output.head().features.toArray()
+        self.assertTrue(
+            np.allclose(features, [2.0, 3.0], atol=1e-4),
+            features,
+        )
+
+        # save & load
+        with tempfile.TemporaryDirectory(prefix="feature_hasher") as d:
+            hasher.write().overwrite().save(d)
+            hasher2 = FeatureHasher.load(d)
+            self.assertEqual(str(hasher), str(hasher2))
+
+    def test_hashing_tf(self):
         df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], 
["id", "words"])
-        n = 10
-        hashingTF = HashingTF()
-        
hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
-        output = hashingTF.transform(df)
+        tf = HashingTF()
+        
tf.setInputCol("words").setOutputCol("features").setNumFeatures(10).setBinary(True)
+        self.assertEqual(tf.getInputCol(), "words")
+        self.assertEqual(tf.getOutputCol(), "features")
+        self.assertEqual(tf.getNumFeatures(), 10)
+        self.assertTrue(tf.getBinary())
+
+        output = tf.transform(df)
+        self.assertEqual(output.columns, ["id", "words", "features"])
+        self.assertEqual(output.count(), 1)
+
         features = output.select("features").first().features.toArray()
-        expected = Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 
0.0]).toArray()
-        for i in range(0, n):
-            self.assertAlmostEqual(
-                features[i],
-                expected[i],
-                14,
-                "Error at "
-                + str(i)
-                + ": expected "
-                + str(expected[i])
-                + ", got "
-                + str(features[i]),
-            )
+        self.assertTrue(
+            np.allclose(
+                features,
+                [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0],
+                atol=1e-4,
+            ),
+            features,
+        )
+
+        # save & load
+        with tempfile.TemporaryDirectory(prefix="hashing_tf") as d:
+            tf.write().overwrite().save(d)
+            tf2 = HashingTF.load(d)
+            self.assertEqual(str(tf), str(tf2))
 
 
 class FeatureTests(FeatureTestsMixin, SparkSessionTestCase):
diff --git 
a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
 
b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
index c468c6e44333..833c00b1a5c3 100644
--- 
a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
+++ 
b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
@@ -592,7 +592,8 @@ private[ml] object MLUtils {
     (classOf[PCAModel], Set("pc", "explainedVariance")),
     (classOf[Word2VecModel], Set("getVectors", "findSynonyms", 
"findSynonymsArray")),
     (classOf[CountVectorizerModel], Set("vocabulary")),
-    (classOf[OneHotEncoderModel], Set("categorySizes")))
+    (classOf[OneHotEncoderModel], Set("categorySizes")),
+    (classOf[IDFModel], Set("idf", "docFreq", "numDocs")))
 
   private def validate(obj: Any, method: String): Unit = {
     assert(obj != null)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch branch-4.0 updated: [SPARK-50936][ML][PYTHON][CONNECT] Support HashingTF, IDF and FeatureHasher on connect

Reply via email to