This is an automated email from the ASF dual-hosted git repository.
cutlerb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5bf5d9d [SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark
5bf5d9d is described below
commit 5bf5d9d854db53541956dedb03e2de8eecf65b81
Author: Andrew-Crosby <[email protected]>
AuthorDate: Tue Apr 23 13:53:33 2019 -0700
[SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark
## What changes were proposed in this pull request?
Adds the Spark ML Interaction transformer to PySpark
## How was this patch tested?
- Added Python doctest
- Ran the newly added example code
- Manually confirmed that a PipelineModel that contains an Interaction
transformer can now be loaded in PySpark
Closes #24426 from Andrew-Crosby/pyspark-interaction-transformer.
Lead-authored-by: Andrew-Crosby <[email protected]>
Co-authored-by: Andrew-Crosby <[email protected]>
Signed-off-by: Bryan Cutler <[email protected]>
---
examples/src/main/python/ml/interaction_example.py | 56 ++++++++++++++++++++++
python/pyspark/ml/feature.py | 54 +++++++++++++++++++++
2 files changed, 110 insertions(+)
diff --git a/examples/src/main/python/ml/interaction_example.py b/examples/src/main/python/ml/interaction_example.py
new file mode 100644
index 0000000..4b63227
--- /dev/null
+++ b/examples/src/main/python/ml/interaction_example.py
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.feature import Interaction, VectorAssembler
+# $example off$
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+ spark = SparkSession\
+ .builder\
+ .appName("InteractionExample")\
+ .getOrCreate()
+
+ # $example on$
+ df = spark.createDataFrame(
+ [(1, 1, 2, 3, 8, 4, 5),
+ (2, 4, 3, 8, 7, 9, 8),
+ (3, 6, 1, 9, 2, 3, 6),
+ (4, 10, 8, 6, 9, 4, 5),
+ (5, 9, 2, 7, 10, 7, 3),
+ (6, 1, 1, 4, 2, 8, 4)],
+ ["id1", "id2", "id3", "id4", "id5", "id6", "id7"])
+
+ assembler1 = VectorAssembler(inputCols=["id2", "id3", "id4"], outputCol="vec1")
+
+ assembled1 = assembler1.transform(df)
+
+ assembler2 = VectorAssembler(inputCols=["id5", "id6", "id7"], outputCol="vec2")
+
+ assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2")
+
+ interaction = Interaction(inputCols=["id1", "vec1", "vec2"], outputCol="interactedCol")
+
+ interacted = interaction.transform(assembled2)
+
+ interacted.show(truncate=False)
+ # $example off$
+
+ spark.stop()
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 595ab18..4f5809c 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -39,6 +39,7 @@ __all__ = ['Binarizer',
'IDF', 'IDFModel',
'Imputer', 'ImputerModel',
'IndexToString',
+ 'Interaction',
'MaxAbsScaler', 'MaxAbsScalerModel',
'MinHashLSH', 'MinHashLSHModel',
'MinMaxScaler', 'MinMaxScalerModel',
@@ -1228,6 +1229,59 @@ class ImputerModel(JavaModel, JavaMLReadable, JavaMLWritable):
@inherit_doc
+class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable):
+ """
+ Implements the feature interaction transform. This transformer takes in Double and Vector type
+ columns and outputs a flattened vector of their feature interactions. To handle interaction,
+ we first one-hot encode any nominal features. Then, a vector of the feature cross-products is
+ produced.
+
+ For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be
+ `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal
+ with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.
+
+ >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
+ >>> interaction = Interaction(inputCols=["a", "b"], outputCol="ab")
+ >>> interaction.transform(df).show()
+ +---+---+-----+
+ | a| b| ab|
+ +---+---+-----+
+ |0.0|1.0|[0.0]|
+ |2.0|3.0|[6.0]|
+ +---+---+-----+
+ ...
+ >>> interactionPath = temp_path + "/interaction"
+ >>> interaction.save(interactionPath)
+ >>> loadedInteraction = Interaction.load(interactionPath)
+ >>> loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab
+ True
+
+ .. versionadded:: 3.0.0
+ """
+
+ @keyword_only
+ def __init__(self, inputCols=None, outputCol=None):
+ """
+ __init__(self, inputCols=None, outputCol=None)
+ """
+ super(Interaction, self).__init__()
+ self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Interaction", self.uid)
+ self._setDefault()
+ kwargs = self._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ @since("3.0.0")
+ def setParams(self, inputCols=None, outputCol=None):
+ """
+ setParams(self, inputCols=None, outputCol=None)
+ Sets params for this Interaction.
+ """
+ kwargs = self._input_kwargs
+ return self._set(**kwargs)
+
+
+@inherit_doc
class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
Rescale each feature individually to range [-1, 1] by dividing through the largest maximum
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]