This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 50d9dfed7183 [SPARK-50989][ML][PYTHON][CONNECT] Support NGram, Normalizer and Interaction on connect
50d9dfed7183 is described below
commit 50d9dfed7183df99992c82212fd44cc5ced916ad
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Sat Jan 25 19:53:27 2025 +0800
[SPARK-50989][ML][PYTHON][CONNECT] Support NGram, Normalizer and Interaction on connect
### What changes were proposed in this pull request?
Support NGram, Normalizer and Interaction on connect
### Why are the changes needed?
feature parity
### Does this PR introduce _any_ user-facing change?
yes
### How was this patch tested?
added tests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49668 from zhengruifeng/ml_connect_ngram.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
(cherry picked from commit acdda8a3eb159c18f87e15a4cda3050f15d8240b)
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../services/org.apache.spark.ml.Transformer | 3 +
.../ml/tests/connect/test_parity_feature.py | 4 --
python/pyspark/ml/tests/test_feature.py | 67 +++++++++++++++++++---
3 files changed, 63 insertions(+), 11 deletions(-)
diff --git
a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
index 74a2c960a98b..ce880bb2ef31 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -20,6 +20,9 @@
########### Transformers
org.apache.spark.ml.feature.DCT
+org.apache.spark.ml.feature.NGram
+org.apache.spark.ml.feature.Normalizer
+org.apache.spark.ml.feature.Interaction
org.apache.spark.ml.feature.Binarizer
org.apache.spark.ml.feature.Bucketizer
org.apache.spark.ml.feature.VectorAssembler
diff --git a/python/pyspark/ml/tests/connect/test_parity_feature.py
b/python/pyspark/ml/tests/connect/test_parity_feature.py
index e20ebe443e81..595e9ff7cd54 100644
--- a/python/pyspark/ml/tests/connect/test_parity_feature.py
+++ b/python/pyspark/ml/tests/connect/test_parity_feature.py
@@ -22,10 +22,6 @@ from pyspark.testing.connectutils import ReusedConnectTestCase
class FeatureParityTests(FeatureTestsMixin, ReusedConnectTestCase):
- @unittest.skip("Need to support.")
- def test_ngram(self):
- super().test_ngram()
-
@unittest.skip("Need to support.")
def test_count_vectorizer_with_binary(self):
super().test_count_vectorizer_with_binary()
diff --git a/python/pyspark/ml/tests/test_feature.py
b/python/pyspark/ml/tests/test_feature.py
index 8c201a01b338..1424ed4947e2 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -37,6 +37,8 @@ from pyspark.ml.feature import (
Imputer,
ImputerModel,
NGram,
+ Normalizer,
+ Interaction,
RFormula,
Tokenizer,
SQLTransformer,
@@ -923,13 +925,64 @@ class FeatureTestsMixin:
self.assertEqual(str(model), str(model2))
def test_ngram(self):
- dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
- ngram0 = NGram(n=4, inputCol="input", outputCol="output")
- self.assertEqual(ngram0.getN(), 4)
- self.assertEqual(ngram0.getInputCol(), "input")
- self.assertEqual(ngram0.getOutputCol(), "output")
- transformedDF = ngram0.transform(dataset)
- self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
+ spark = self.spark
+ df = spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
+
+ ngram = NGram(n=4, inputCol="input", outputCol="output")
+ self.assertEqual(ngram.getN(), 4)
+ self.assertEqual(ngram.getInputCol(), "input")
+ self.assertEqual(ngram.getOutputCol(), "output")
+
+ output = ngram.transform(df)
+ self.assertEqual(output.head().output, ["a b c d", "b c d e"])
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="ngram") as d:
+ ngram.write().overwrite().save(d)
+ ngram2 = NGram.load(d)
+ self.assertEqual(str(ngram), str(ngram2))
+
+ def test_normalizer(self):
+ spark = self.spark
+ df = spark.createDataFrame(
+ [(Vectors.dense([3.0, -4.0]),), (Vectors.sparse(4, {1: 4.0, 3: 3.0}),)],
+ ["input"],
+ )
+
+ normalizer = Normalizer(p=2.0, inputCol="input", outputCol="output")
+ self.assertEqual(normalizer.getP(), 2.0)
+ self.assertEqual(normalizer.getInputCol(), "input")
+ self.assertEqual(normalizer.getOutputCol(), "output")
+
+ output = normalizer.transform(df)
+ self.assertEqual(output.columns, ["input", "output"])
+ self.assertEqual(output.count(), 2)
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="normalizer") as d:
+ normalizer.write().overwrite().save(d)
+ normalizer2 = Normalizer.load(d)
+ self.assertEqual(str(normalizer), str(normalizer2))
+
+ def test_interaction(self):
+ spark = self.spark
+ df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
+
+ interaction = Interaction()
+ interaction.setInputCols(["a", "b"])
+ interaction.setOutputCol("ab")
+ self.assertEqual(interaction.getInputCols(), ["a", "b"])
+ self.assertEqual(interaction.getOutputCol(), "ab")
+
+ output = interaction.transform(df)
+ self.assertEqual(output.columns, ["a", "b", "ab"])
+ self.assertEqual(output.count(), 2)
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="interaction") as d:
+ interaction.write().overwrite().save(d)
+ interaction2 = Interaction.load(d)
+ self.assertEqual(str(interaction), str(interaction2))
def test_count_vectorizer_with_binary(self):
dataset = self.spark.createDataFrame(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]