This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 50d9dfed7183 [SPARK-50989][ML][PYTHON][CONNECT] Support NGram, Normalizer and Interaction on connect
50d9dfed7183 is described below
commit 50d9dfed7183df99992c82212fd44cc5ced916ad
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Sat Jan 25 19:53:27 2025 +0800
[SPARK-50989][ML][PYTHON][CONNECT] Support NGram, Normalizer and Interaction on connect
### What changes were proposed in this pull request?
Support NGram, Normalizer and Interaction on connect
### Why are the changes needed?
feature parity
### Does this PR introduce _any_ user-facing change?
yes
### How was this patch tested?
added tests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49668 from zhengruifeng/ml_connect_ngram.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
(cherry picked from commit acdda8a3eb159c18f87e15a4cda3050f15d8240b)
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../services/org.apache.spark.ml.Transformer | 3 +
.../ml/tests/connect/test_parity_feature.py | 4 --
python/pyspark/ml/tests/test_feature.py | 67 +++++++++++++++++++---
3 files changed, 63 insertions(+), 11 deletions(-)
diff --git
a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
index 74a2c960a98b..ce880bb2ef31 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -20,6 +20,9 @@
########### Transformers
org.apache.spark.ml.feature.DCT
+org.apache.spark.ml.feature.NGram
+org.apache.spark.ml.feature.Normalizer
+org.apache.spark.ml.feature.Interaction
org.apache.spark.ml.feature.Binarizer
org.apache.spark.ml.feature.Bucketizer
org.apache.spark.ml.feature.VectorAssembler
diff --git a/python/pyspark/ml/tests/connect/test_parity_feature.py
b/python/pyspark/ml/tests/connect/test_parity_feature.py
index e20ebe443e81..595e9ff7cd54 100644
--- a/python/pyspark/ml/tests/connect/test_parity_feature.py
+++ b/python/pyspark/ml/tests/connect/test_parity_feature.py
@@ -22,10 +22,6 @@ from pyspark.testing.connectutils import ReusedConnectTestCase
class FeatureParityTests(FeatureTestsMixin, ReusedConnectTestCase):
- @unittest.skip("Need to support.")
- def test_ngram(self):
- super().test_ngram()
-
@unittest.skip("Need to support.")
def test_count_vectorizer_with_binary(self):
super().test_count_vectorizer_with_binary()
diff --git a/python/pyspark/ml/tests/test_feature.py
b/python/pyspark/ml/tests/test_feature.py
index 8c201a01b338..1424ed4947e2 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -37,6 +37,8 @@ from pyspark.ml.feature import (
Imputer,
ImputerModel,
NGram,
+ Normalizer,
+ Interaction,
RFormula,
Tokenizer,
SQLTransformer,
@@ -923,13 +925,64 @@ class FeatureTestsMixin:
self.assertEqual(str(model), str(model2))
def test_ngram(self):
- dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
- ngram0 = NGram(n=4, inputCol="input", outputCol="output")
- self.assertEqual(ngram0.getN(), 4)
- self.assertEqual(ngram0.getInputCol(), "input")
- self.assertEqual(ngram0.getOutputCol(), "output")
- transformedDF = ngram0.transform(dataset)
- self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
+ spark = self.spark
+ df = spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
+
+ ngram = NGram(n=4, inputCol="input", outputCol="output")
+ self.assertEqual(ngram.getN(), 4)
+ self.assertEqual(ngram.getInputCol(), "input")
+ self.assertEqual(ngram.getOutputCol(), "output")
+
+ output = ngram.transform(df)
+ self.assertEqual(output.head().output, ["a b c d", "b c d e"])
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="ngram") as d:
+ ngram.write().overwrite().save(d)
+ ngram2 = NGram.load(d)
+ self.assertEqual(str(ngram), str(ngram2))
+
+ def test_normalizer(self):
+ spark = self.spark
+ df = spark.createDataFrame(
+ [(Vectors.dense([3.0, -4.0]),), (Vectors.sparse(4, {1: 4.0, 3: 3.0}),)],
+ ["input"],
+ )
+
+ normalizer = Normalizer(p=2.0, inputCol="input", outputCol="output")
+ self.assertEqual(normalizer.getP(), 2.0)
+ self.assertEqual(normalizer.getInputCol(), "input")
+ self.assertEqual(normalizer.getOutputCol(), "output")
+
+ output = normalizer.transform(df)
+ self.assertEqual(output.columns, ["input", "output"])
+ self.assertEqual(output.count(), 2)
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="normalizer") as d:
+ normalizer.write().overwrite().save(d)
+ normalizer2 = Normalizer.load(d)
+ self.assertEqual(str(normalizer), str(normalizer2))
+
+ def test_interaction(self):
+ spark = self.spark
+ df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
+
+ interaction = Interaction()
+ interaction.setInputCols(["a", "b"])
+ interaction.setOutputCol("ab")
+ self.assertEqual(interaction.getInputCols(), ["a", "b"])
+ self.assertEqual(interaction.getOutputCol(), "ab")
+
+ output = interaction.transform(df)
+ self.assertEqual(output.columns, ["a", "b", "ab"])
+ self.assertEqual(output.count(), 2)
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="interaction") as d:
+ interaction.write().overwrite().save(d)
+ interaction2 = Interaction.load(d)
+ self.assertEqual(str(interaction), str(interaction2))
def test_count_vectorizer_with_binary(self):
dataset = self.spark.createDataFrame(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]