Repository: spark
Updated Branches:
refs/heads/branch-2.0 729cadb6b -> 254e33f4b
[SPARK-18274][ML][PYSPARK] Memory leak in PySpark JavaWrapper
## What changes were proposed in this pull request?
In`JavaWrapper `'s destructor make Java Gateway dereference object in
destructor, using `SparkContext._active_spark_context._gateway.detach`
Fixing the copying parameter bug, by moving the `copy` method from `JavaModel`
to `JavaParams`
## How was this patch tested?
```scala
import random, string
from pyspark.ml.feature import StringIndexer
l = [(''.join(random.choice(string.ascii_uppercase) for _ in range(10)), ) for
_ in range(int(7e5))] # 700000 random strings of 10 characters
df = spark.createDataFrame(l, ['string'])
for i in range(50):
indexer = StringIndexer(inputCol='string', outputCol='index')
indexer.fit(df)
```
* Before: would keep StringIndexer strong reference, causing GC issues and is
halted midway
After: garbage collection works as the object is dereferenced, and computation
completes
* Mem footprint tested using profiler
* Added a parameter copy related test which was failing before.
Author: Sandeep Singh <[email protected]>
Author: jkbradley <[email protected]>
Closes #15843 from techaddict/SPARK-18274.
(cherry picked from commit 78bb7f8071379114314c394e0167c4c5fd8545c5)
Signed-off-by: Joseph K. Bradley <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/254e33f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/254e33f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/254e33f4
Branch: refs/heads/branch-2.0
Commit: 254e33f4b2db14fe9438b67023a0b721f9f61a3f
Parents: 729cadb
Author: Sandeep Singh <[email protected]>
Authored: Thu Dec 1 13:22:40 2016 -0800
Committer: Joseph K. Bradley <[email protected]>
Committed: Thu Dec 1 13:23:10 2016 -0800
----------------------------------------------------------------------
python/pyspark/ml/tests.py | 18 +++++++++++++++++
python/pyspark/ml/wrapper.py | 41 ++++++++++++++++++++++-----------------
2 files changed, 41 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/254e33f4/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index de95a47..ae95f17 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -379,6 +379,24 @@ class ParamTests(PySparkTestCase):
self.assertEqual(model.getWindowSize(), 6)
+class EvaluatorTests(SparkSessionTestCase):
+
+ def test_java_params(self):
+ """
+ This tests a bug fixed by SPARK-18274 which causes multiple copies
+ of a Params instance in Python to be linked to the same Java instance.
+ """
+ evaluator = RegressionEvaluator(metricName="r2")
+ df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
+ evaluator.evaluate(df)
+ self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
+ evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"})
+ evaluator.evaluate(df)
+ evaluatorCopy.evaluate(df)
+ self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
+ self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
+
+
class FeatureTests(SparkSessionTestCase):
def test_binarizer(self):
http://git-wip-us.apache.org/repos/asf/spark/blob/254e33f4/python/pyspark/ml/wrapper.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 25c44b7..13b75e9 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -71,6 +71,10 @@ class JavaParams(JavaWrapper, Params):
__metaclass__ = ABCMeta
+ def __del__(self):
+ if SparkContext._active_spark_context:
+ SparkContext._active_spark_context._gateway.detach(self._java_obj)
+
def _make_java_param_pair(self, param, value):
"""
Makes a Java parm pair.
@@ -180,6 +184,25 @@ class JavaParams(JavaWrapper, Params):
% stage_name)
return py_stage
+ def copy(self, extra=None):
+ """
+ Creates a copy of this instance with the same uid and some
+ extra params. This implementation first calls Params.copy and
+ then make a copy of the companion Java pipeline component with
+ extra params. So both the Python wrapper and the Java pipeline
+ component get copied.
+
+ :param extra: Extra parameters to copy to the new instance
+ :return: Copy of this instance
+ """
+ if extra is None:
+ extra = dict()
+ that = super(JavaParams, self).copy(extra)
+ if self._java_obj is not None:
+ that._java_obj = self._java_obj.copy(self._empty_java_param_map())
+ that._transfer_params_to_java()
+ return that
+
@inherit_doc
class JavaEstimator(JavaParams, Estimator):
@@ -256,21 +279,3 @@ class JavaModel(JavaTransformer, Model):
super(JavaModel, self).__init__(java_model)
if java_model is not None:
self._resetUid(java_model.uid())
-
- def copy(self, extra=None):
- """
- Creates a copy of this instance with the same uid and some
- extra params. This implementation first calls Params.copy and
- then make a copy of the companion Java model with extra params.
- So both the Python wrapper and the Java model get copied.
-
- :param extra: Extra parameters to copy to the new instance
- :return: Copy of this instance
- """
- if extra is None:
- extra = dict()
- that = super(JavaModel, self).copy(extra)
- if self._java_obj is not None:
- that._java_obj = self._java_obj.copy(self._empty_java_param_map())
- that._transfer_params_to_java()
- return that
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]