This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 83d5d44185d8 [SPARK-50938][ML][PYTHON][CONNECT] Support VectorSizeHint
and VectorSlicer on Connect
83d5d44185d8 is described below
commit 83d5d44185d8e0f9d480aaad1031254ed08d47c1
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Jan 28 09:34:34 2025 +0900
[SPARK-50938][ML][PYTHON][CONNECT] Support VectorSizeHint and VectorSlicer
on Connect
### What changes were proposed in this pull request?
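Support `VectorSizeHint` and `VectorSlicer` on Connect by registering both transformers in the `org.apache.spark.ml.Transformer` service file, and cover them with tests in `python/pyspark/ml/tests/test_feature.py`.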
### Why are the changes needed?
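For feature parity: `VectorSizeHint` and `VectorSlicer` were not yet supported on Connect (the Connect parity test for `VectorSizeHint` was skipped with "Need to support.").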
### Does this PR introduce _any_ user-facing change?
Yes, `VectorSizeHint` and `VectorSlicer` are now supported on Connect.
### How was this patch tested?
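New tests: added `test_vector_slicer`, added save & load checks to `test_vector_size_hint`, and removed the skip of `test_vector_size_hint` in the Connect parity suite.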
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49692 from zhengruifeng/ml_connect_vs.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 704c901986b443240a0a2a1ffdc48abff7db1579)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../services/org.apache.spark.ml.Transformer | 2 +
.../ml/tests/connect/test_parity_feature.py | 4 --
python/pyspark/ml/tests/test_feature.py | 44 ++++++++++++++++++++--
3 files changed, 42 insertions(+), 8 deletions(-)
diff --git
a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
index c64f93866caf..8aa1b1a00bca 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -26,6 +26,8 @@ org.apache.spark.ml.feature.Interaction
org.apache.spark.ml.feature.Binarizer
org.apache.spark.ml.feature.Bucketizer
org.apache.spark.ml.feature.VectorAssembler
+org.apache.spark.ml.feature.VectorSlicer
+org.apache.spark.ml.feature.VectorSizeHint
org.apache.spark.ml.feature.Tokenizer
org.apache.spark.ml.feature.RegexTokenizer
org.apache.spark.ml.feature.SQLTransformer
diff --git a/python/pyspark/ml/tests/connect/test_parity_feature.py
b/python/pyspark/ml/tests/connect/test_parity_feature.py
index 595e9ff7cd54..908c89173737 100644
--- a/python/pyspark/ml/tests/connect/test_parity_feature.py
+++ b/python/pyspark/ml/tests/connect/test_parity_feature.py
@@ -50,10 +50,6 @@ class FeatureParityTests(FeatureTestsMixin,
ReusedConnectTestCase):
def test_string_indexer_from_labels(self):
super().test_string_indexer_from_labels()
- @unittest.skip("Need to support.")
- def test_vector_size_hint(self):
- super().test_vector_size_hint()
-
if __name__ == "__main__":
from pyspark.ml.tests.connect.test_parity_feature import * # noqa: F401
diff --git a/python/pyspark/ml/tests/test_feature.py
b/python/pyspark/ml/tests/test_feature.py
index 1c1c703bd221..aa5643d69911 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -66,6 +66,7 @@ from pyspark.ml.feature import (
TargetEncoder,
TargetEncoderModel,
VectorSizeHint,
+ VectorSlicer,
VectorAssembler,
PCA,
PCAModel,
@@ -1392,14 +1393,49 @@ class FeatureTestsMixin:
["id", "vector"],
)
- sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
- sizeHint.setSize(3)
- self.assertEqual(sizeHint.getSize(), 3)
+ sh = VectorSizeHint(inputCol="vector", handleInvalid="skip")
+ sh.setSize(3)
+ self.assertEqual(sh.getSize(), 3)
- output = sizeHint.transform(df).head().vector
+ output = sh.transform(df).head().vector
expected = DenseVector([0.0, 10.0, 0.5])
self.assertEqual(output, expected)
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="vector_size_hint") as d:
+ sh.write().overwrite().save(d)
+ sh2 = VectorSizeHint.load(d)
+ self.assertEqual(str(sh), str(sh2))
+
+ def test_vector_slicer(self):
+ spark = self.spark
+
+ df = spark.createDataFrame(
+ [
+ (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
+ (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
+ (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),),
+ ],
+ ["features"],
+ )
+
+ vs = VectorSlicer(outputCol="sliced", indices=[1, 4])
+ vs.setInputCol("features")
+ self.assertEqual(vs.getIndices(), [1, 4])
+ self.assertEqual(vs.getInputCol(), "features")
+ self.assertEqual(vs.getOutputCol(), "sliced")
+
+ output = vs.transform(df)
+ self.assertEqual(output.columns, ["features", "sliced"])
+ self.assertEqual(output.count(), 3)
+ self.assertEqual(output.head().sliced, Vectors.dense([2.3, 1.0]))
+
+ # save & load
+ with tempfile.TemporaryDirectory(prefix="vector_slicer") as d:
+ vs.write().overwrite().save(d)
+ vs2 = VectorSlicer.load(d)
+ self.assertEqual(str(vs), str(vs2))
+
def test_feature_hasher(self):
data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
cols = ["real", "bool", "stringNum", "string"]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]