spark git commit: [SPARK-20606][ML] Revert "[] ML 2.2 QA: Remove deprecated methods for ML"
Repository: spark Updated Branches: refs/heads/branch-2.2 3eb0ee06a -> 80a57fa90 [SPARK-20606][ML] Revert "[] ML 2.2 QA: Remove deprecated methods for ML" This reverts commit b8733e0ad9f5a700f385e210450fd2c10137293e. Author: Yanbo Liang Closes #17944 from yanboliang/spark-20606-revert. (cherry picked from commit 0698e6c88ca11fdfd6e5498cab784cf6dbcdfacb) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80a57fa9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80a57fa9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80a57fa9 Branch: refs/heads/branch-2.2 Commit: 80a57fa90be8dca4340345c09b4ea28fbf11a516 Parents: 3eb0ee0 Author: Yanbo Liang Authored: Thu May 11 14:48:13 2017 +0800 Committer: Yanbo Liang Committed: Thu May 11 14:48:26 2017 +0800 -- .../classification/DecisionTreeClassifier.scala | 18 ++-- .../spark/ml/classification/GBTClassifier.scala | 24 ++--- .../classification/RandomForestClassifier.scala | 24 ++--- .../ml/regression/DecisionTreeRegressor.scala | 18 ++-- .../spark/ml/regression/GBTRegressor.scala | 24 ++--- .../ml/regression/RandomForestRegressor.scala | 24 ++--- .../org/apache/spark/ml/tree/treeParams.scala | 105 +++ .../org/apache/spark/ml/util/ReadWrite.scala| 16 +++ project/MimaExcludes.scala | 68 python/pyspark/ml/util.py | 32 ++ 10 files changed, 219 insertions(+), 134 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80a57fa9/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 5fb105c..9f60f08 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -54,27 +54,27 @@ class DecisionTreeClassifier @Since("1.4.0") ( /** @group setParam */ @Since("1.4.0") - def setMaxDepth(value: Int): this.type = set(maxDepth, value) + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) /** @group setParam */ @Since("1.4.0") - def setMaxBins(value: Int): this.type = set(maxBins, value) + override def setMaxBins(value: Int): this.type = set(maxBins, value) /** @group setParam */ @Since("1.4.0") - def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) /** @group setParam */ @Since("1.4.0") - def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) /** @group expertSetParam */ @Since("1.4.0") - def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) /** @group expertSetParam */ @Since("1.4.0") - def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) /** * Specifies how often to checkpoint the cached node IDs. 
@@ -86,15 +86,15 @@ class DecisionTreeClassifier @Since("1.4.0") ( * @group setParam */ @Since("1.4.0") - def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) /** @group setParam */ @Since("1.4.0") - def setImpurity(value: String): this.type = set(impurity, value) + override def setImpurity(value: String): this.type = set(impurity, value) /** @group setParam */ @Since("1.6.0") - def setSeed(value: Long): this.type = set(seed, value) + override def setSeed(value: Long): this.type = set(seed, value) override protected def train(dataset: Dataset[_]): DecisionTreeClassificationModel = { val categoricalFeatures: Map[Int, Int] = http://git-wip-us.apache.org/repos/asf/spark/blob/80a57fa9/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 263ed10..ade0960 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src
spark git commit: [SPARK-20606][ML] Revert "[] ML 2.2 QA: Remove deprecated methods for ML"
Repository: spark Updated Branches: refs/heads/master 8ddbc431d -> 0698e6c88 [SPARK-20606][ML] Revert "[] ML 2.2 QA: Remove deprecated methods for ML" This reverts commit b8733e0ad9f5a700f385e210450fd2c10137293e. Author: Yanbo Liang Closes #17944 from yanboliang/spark-20606-revert. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0698e6c8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0698e6c8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0698e6c8 Branch: refs/heads/master Commit: 0698e6c88ca11fdfd6e5498cab784cf6dbcdfacb Parents: 8ddbc43 Author: Yanbo Liang Authored: Thu May 11 14:48:13 2017 +0800 Committer: Yanbo Liang Committed: Thu May 11 14:48:13 2017 +0800 -- .../classification/DecisionTreeClassifier.scala | 18 ++-- .../spark/ml/classification/GBTClassifier.scala | 24 ++--- .../classification/RandomForestClassifier.scala | 24 ++--- .../ml/regression/DecisionTreeRegressor.scala | 18 ++-- .../spark/ml/regression/GBTRegressor.scala | 24 ++--- .../ml/regression/RandomForestRegressor.scala | 24 ++--- .../org/apache/spark/ml/tree/treeParams.scala | 105 +++ .../org/apache/spark/ml/util/ReadWrite.scala| 16 +++ project/MimaExcludes.scala | 68 python/pyspark/ml/util.py | 32 ++ 10 files changed, 219 insertions(+), 134 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0698e6c8/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 5fb105c..9f60f08 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -54,27 +54,27 @@ class DecisionTreeClassifier @Since("1.4.0") ( /** @group setParam */ @Since("1.4.0") - def setMaxDepth(value: Int): this.type = set(maxDepth, value) + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) /** @group setParam */ @Since("1.4.0") - def setMaxBins(value: Int): this.type = set(maxBins, value) + override def setMaxBins(value: Int): this.type = set(maxBins, value) /** @group setParam */ @Since("1.4.0") - def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) /** @group setParam */ @Since("1.4.0") - def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) /** @group expertSetParam */ @Since("1.4.0") - def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) /** @group expertSetParam */ @Since("1.4.0") - def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) /** * Specifies how often to checkpoint the cached node IDs. 
@@ -86,15 +86,15 @@ class DecisionTreeClassifier @Since("1.4.0") ( * @group setParam */ @Since("1.4.0") - def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) /** @group setParam */ @Since("1.4.0") - def setImpurity(value: String): this.type = set(impurity, value) + override def setImpurity(value: String): this.type = set(impurity, value) /** @group setParam */ @Since("1.6.0") - def setSeed(value: Long): this.type = set(seed, value) + override def setSeed(value: Long): this.type = set(seed, value) override protected def train(dataset: Dataset[_]): DecisionTreeClassificationModel = { val categoricalFeatures: Map[Int, Int] = http://git-wip-us.apache.org/repos/asf/spark/blob/0698e6c8/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 263ed10..ade0960 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -70,27 +70,27 @@ class GBTClassifier
spark git commit: [SPARK-20685] Fix BatchPythonEvaluation bug in case of single UDF w/ repeated arg.
Repository: spark Updated Branches: refs/heads/branch-2.1 bdc08ab64 -> 92a71a667 [SPARK-20685] Fix BatchPythonEvaluation bug in case of single UDF w/ repeated arg. ## What changes were proposed in this pull request? There's a latent corner-case bug in PySpark UDF evaluation where executing a `BatchPythonEvaluation` with a single multi-argument UDF where _at least one argument value is repeated_ will crash at execution with a confusing error. This problem was introduced in #12057: the code there has a fast path for handling a "batch UDF evaluation consisting of a single Python UDF", but that branch incorrectly assumes that a single UDF won't have repeated arguments and therefore skips the code for unpacking arguments from the input row (whose schema may not necessarily match the UDF inputs due to de-duplication of repeated arguments which occurred in the JVM before sending UDF inputs to Python). This fix here is simply to remove this special-casing: it turns out that the code in the "multiple UDFs" branch just so happens to work for the single-UDF case because Python treats `(x)` as equivalent to `x`, not as a single-argument tuple. ## How was this patch tested? New regression test in `pyspark.python.sql.tests` module (tested and confirmed that it fails before my fix). Author: Josh Rosen Closes #17927 from JoshRosen/SPARK-20685. (cherry picked from commit 8ddbc431d8b21d5ee57d3d209a4f25e301f15283) Signed-off-by: Xiao Li Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92a71a66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92a71a66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92a71a66 Branch: refs/heads/branch-2.1 Commit: 92a71a667dd3e13664015f2a9dd2a39e2c1514eb Parents: bdc08ab Author: Josh Rosen Authored: Wed May 10 16:50:57 2017 -0700 Committer: Xiao Li Committed: Wed May 10 16:51:16 2017 -0700 -- python/pyspark/sql/tests.py | 6 ++ python/pyspark/worker.py| 29 + 2 files changed, 19 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/92a71a66/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 22b1ffc..c3cc742 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -324,6 +324,12 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(double(1) + 1)").collect() self.assertEqual(row[0], 6) +def test_single_udf_with_repeated_argument(self): +# regression test for SPARK-20685 +self.spark.catalog.registerFunction("add", lambda x, y: x + y, IntegerType()) +row = self.spark.sql("SELECT add(1, 1)").first() +self.assertEqual(tuple(row), (2, )) + def test_multiple_udfs(self): self.spark.catalog.registerFunction("double", lambda x: x * 2, IntegerType()) [row] = self.spark.sql("SELECT double(1), double(2)").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/92a71a66/python/pyspark/worker.py -- diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 0918282..5eeaac7 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -86,22 +86,19 @@ def read_single_udf(pickleSer, infile): def read_udfs(pickleSer, infile): num_udfs = read_int(infile) -if num_udfs == 1: -# fast path for single UDF -_, udf = read_single_udf(pickleSer, infile) -mapper = lambda a: udf(*a) -else: -udfs = {} -call_udf = [] -for i in range(num_udfs): -arg_offsets, udf = read_single_udf(pickleSer, infile) -udfs['f%d' % i] = udf -args = ["a[%d]" % o for o in arg_offsets] 
-call_udf.append("f%d(%s)" % (i, ", ".join(args))) -# Create function like this: -# lambda a: (f0(a0), f1(a1, a2), f2(a3)) -mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) -mapper = eval(mapper_str, udfs) +udfs = {} +call_udf = [] +for i in range(num_udfs): +arg_offsets, udf = read_single_udf(pickleSer, infile) +udfs['f%d' % i] = udf +args = ["a[%d]" % o for o in arg_offsets] +call_udf.append("f%d(%s)" % (i, ", ".join(args))) +# Create function like this: +# lambda a: (f0(a0), f1(a1, a2), f2(a3)) +# In the special case of a single UDF this will return a single result rather +# than a tuple of results; this is the format that the JVM side expects. +mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) +mapper = eval(mapper_str, udfs) func = lambda _, it: map(mapper, it) ser = Bat
spark git commit: [SPARK-20685] Fix BatchPythonEvaluation bug in case of single UDF w/ repeated arg.
Repository: spark Updated Branches: refs/heads/master af8b6cc82 -> 8ddbc431d [SPARK-20685] Fix BatchPythonEvaluation bug in case of single UDF w/ repeated arg. ## What changes were proposed in this pull request? There's a latent corner-case bug in PySpark UDF evaluation where executing a `BatchPythonEvaluation` with a single multi-argument UDF where _at least one argument value is repeated_ will crash at execution with a confusing error. This problem was introduced in #12057: the code there has a fast path for handling a "batch UDF evaluation consisting of a single Python UDF", but that branch incorrectly assumes that a single UDF won't have repeated arguments and therefore skips the code for unpacking arguments from the input row (whose schema may not necessarily match the UDF inputs due to de-duplication of repeated arguments which occurred in the JVM before sending UDF inputs to Python). This fix here is simply to remove this special-casing: it turns out that the code in the "multiple UDFs" branch just so happens to work for the single-UDF case because Python treats `(x)` as equivalent to `x`, not as a single-argument tuple. ## How was this patch tested? New regression test in `pyspark.python.sql.tests` module (tested and confirmed that it fails before my fix). Author: Josh Rosen Closes #17927 from JoshRosen/SPARK-20685. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ddbc431 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ddbc431 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ddbc431 Branch: refs/heads/master Commit: 8ddbc431d8b21d5ee57d3d209a4f25e301f15283 Parents: af8b6cc Author: Josh Rosen Authored: Wed May 10 16:50:57 2017 -0700 Committer: Xiao Li Committed: Wed May 10 16:50:57 2017 -0700 -- python/pyspark/sql/tests.py | 6 ++ python/pyspark/worker.py| 29 + 2 files changed, 19 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8ddbc431/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index e3fe01e..8707500 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -330,6 +330,12 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(double(1) + 1)").collect() self.assertEqual(row[0], 6) +def test_single_udf_with_repeated_argument(self): +# regression test for SPARK-20685 +self.spark.catalog.registerFunction("add", lambda x, y: x + y, IntegerType()) +row = self.spark.sql("SELECT add(1, 1)").first() +self.assertEqual(tuple(row), (2, )) + def test_multiple_udfs(self): self.spark.catalog.registerFunction("double", lambda x: x * 2, IntegerType()) [row] = self.spark.sql("SELECT double(1), double(2)").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/8ddbc431/python/pyspark/worker.py -- diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 25ee475..baaa3fe 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -87,22 +87,19 @@ def read_single_udf(pickleSer, infile): def read_udfs(pickleSer, infile): num_udfs = read_int(infile) -if num_udfs == 1: -# fast path for single UDF -_, udf = read_single_udf(pickleSer, infile) -mapper = lambda a: udf(*a) -else: -udfs = {} -call_udf = [] -for i in range(num_udfs): -arg_offsets, udf = read_single_udf(pickleSer, infile) -udfs['f%d' % i] = udf -args = ["a[%d]" % o for o in arg_offsets] -call_udf.append("f%d(%s)" % (i, ", ".join(args))) -# Create function like this: -# lambda a: (f0(a0), f1(a1, 
a2), f2(a3)) -mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) -mapper = eval(mapper_str, udfs) +udfs = {} +call_udf = [] +for i in range(num_udfs): +arg_offsets, udf = read_single_udf(pickleSer, infile) +udfs['f%d' % i] = udf +args = ["a[%d]" % o for o in arg_offsets] +call_udf.append("f%d(%s)" % (i, ", ".join(args))) +# Create function like this: +# lambda a: (f0(a0), f1(a1, a2), f2(a3)) +# In the special case of a single UDF this will return a single result rather +# than a tuple of results; this is the format that the JVM side expects. +mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) +mapper = eval(mapper_str, udfs) func = lambda _, it: map(mapper, it) ser = BatchedSerializer(PickleSerializer(), 100)
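The reasoning above -- that the multi-UDF code path also covers the single-UDF case because `(x)` is just `x` in Python -- can be checked with a few lines of plain Python. The snippet below is an illustration only, not Spark code; it assumes, for the repeated-argument case `add(1, 1)`, that the JVM's de-duplication leaves a one-column input row with both argument offsets pointing at column 0.

```python
# Plain-Python sketch of the mapper built in python/pyspark/worker.py above.
# With a single UDF the generated "lambda a: (f0(a[0], a[0]))" returns a bare
# value, because (x) is just x, not a one-element tuple.
udfs = {'f0': lambda x, y: x + y}           # the registered add(x, y) UDF
arg_offsets = [0, 0]                        # assumed: repeated arg de-duplicated to column 0
args = ["a[%d]" % o for o in arg_offsets]
call_udf = ["f0(%s)" % ", ".join(args)]
mapper_str = "lambda a: (%s)" % ", ".join(call_udf)
mapper = eval(mapper_str, udfs)

print(mapper((1,)))      # 2 -- a single result, the shape the JVM side expects
print((2) == 2)          # True: parentheses alone do not create a tuple
print((2,) == 2)         # False: only the trailing comma makes it a tuple
```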
spark git commit: [SPARK-20685] Fix BatchPythonEvaluation bug in case of single UDF w/ repeated arg.
Repository: spark Updated Branches: refs/heads/branch-2.2 86cef4df5 -> 3eb0ee06a [SPARK-20685] Fix BatchPythonEvaluation bug in case of single UDF w/ repeated arg. ## What changes were proposed in this pull request? There's a latent corner-case bug in PySpark UDF evaluation where executing a `BatchPythonEvaluation` with a single multi-argument UDF where _at least one argument value is repeated_ will crash at execution with a confusing error. This problem was introduced in #12057: the code there has a fast path for handling a "batch UDF evaluation consisting of a single Python UDF", but that branch incorrectly assumes that a single UDF won't have repeated arguments and therefore skips the code for unpacking arguments from the input row (whose schema may not necessarily match the UDF inputs due to de-duplication of repeated arguments which occurred in the JVM before sending UDF inputs to Python). This fix here is simply to remove this special-casing: it turns out that the code in the "multiple UDFs" branch just so happens to work for the single-UDF case because Python treats `(x)` as equivalent to `x`, not as a single-argument tuple. ## How was this patch tested? New regression test in `pyspark.python.sql.tests` module (tested and confirmed that it fails before my fix). Author: Josh Rosen Closes #17927 from JoshRosen/SPARK-20685. (cherry picked from commit 8ddbc431d8b21d5ee57d3d209a4f25e301f15283) Signed-off-by: Xiao Li Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3eb0ee06 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3eb0ee06 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3eb0ee06 Branch: refs/heads/branch-2.2 Commit: 3eb0ee06a588da5b9c08a72d178835c6e8bad36b Parents: 86cef4d Author: Josh Rosen Authored: Wed May 10 16:50:57 2017 -0700 Committer: Xiao Li Committed: Wed May 10 16:51:05 2017 -0700 -- python/pyspark/sql/tests.py | 6 ++ python/pyspark/worker.py| 29 + 2 files changed, 19 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3eb0ee06/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 2aa2d23..e06f62b 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -324,6 +324,12 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(double(1) + 1)").collect() self.assertEqual(row[0], 6) +def test_single_udf_with_repeated_argument(self): +# regression test for SPARK-20685 +self.spark.catalog.registerFunction("add", lambda x, y: x + y, IntegerType()) +row = self.spark.sql("SELECT add(1, 1)").first() +self.assertEqual(tuple(row), (2, )) + def test_multiple_udfs(self): self.spark.catalog.registerFunction("double", lambda x: x * 2, IntegerType()) [row] = self.spark.sql("SELECT double(1), double(2)").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/3eb0ee06/python/pyspark/worker.py -- diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 25ee475..baaa3fe 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -87,22 +87,19 @@ def read_single_udf(pickleSer, infile): def read_udfs(pickleSer, infile): num_udfs = read_int(infile) -if num_udfs == 1: -# fast path for single UDF -_, udf = read_single_udf(pickleSer, infile) -mapper = lambda a: udf(*a) -else: -udfs = {} -call_udf = [] -for i in range(num_udfs): -arg_offsets, udf = read_single_udf(pickleSer, infile) -udfs['f%d' % i] = udf -args = ["a[%d]" % o for o in arg_offsets] 
-call_udf.append("f%d(%s)" % (i, ", ".join(args))) -# Create function like this: -# lambda a: (f0(a0), f1(a1, a2), f2(a3)) -mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) -mapper = eval(mapper_str, udfs) +udfs = {} +call_udf = [] +for i in range(num_udfs): +arg_offsets, udf = read_single_udf(pickleSer, infile) +udfs['f%d' % i] = udf +args = ["a[%d]" % o for o in arg_offsets] +call_udf.append("f%d(%s)" % (i, ", ".join(args))) +# Create function like this: +# lambda a: (f0(a0), f1(a1, a2), f2(a3)) +# In the special case of a single UDF this will return a single result rather +# than a tuple of results; this is the format that the JVM side expects. +mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) +mapper = eval(mapper_str, udfs) func = lambda _, it: map(mapper, it) ser = Bat
spark-website git commit: Trigger git sync
Repository: spark-website
Updated Branches:
  refs/heads/asf-site 01e0279a0 -> c2c0905b4

Trigger git sync

Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/c2c0905b
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/c2c0905b
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/c2c0905b

Branch: refs/heads/asf-site
Commit: c2c0905b446b7272dec0147cabb593814d486efa
Parents: 01e0279
Author: Sean Owen
Authored: Wed May 10 18:47:55 2017 +0100
Committer: Sean Owen
Committed: Wed May 10 18:47:55 2017 +0100
spark git commit: [SPARK-20689][PYSPARK] python doctest leaking bucketed table
Repository: spark Updated Branches: refs/heads/master 5c2c4dcce -> af8b6cc82 [SPARK-20689][PYSPARK] python doctest leaking bucketed table ## What changes were proposed in this pull request? It turns out pyspark doctest is calling saveAsTable without ever dropping them. Since we have separate python tests for bucketed table, and there is no checking of results, there is really no need to run the doctest, other than leaving it as an example in the generated doc ## How was this patch tested? Jenkins Author: Felix Cheung Closes #17932 from felixcheung/pytablecleanup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af8b6cc8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af8b6cc8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af8b6cc8 Branch: refs/heads/master Commit: af8b6cc82336437a55ff7578c6505d251dfa30a9 Parents: 5c2c4dc Author: Felix Cheung Authored: Wed May 10 09:33:49 2017 -0700 Committer: Xiao Li Committed: Wed May 10 09:33:49 2017 -0700 -- python/pyspark/sql/readwriter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af8b6cc8/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 90ce8f8..61a6b76 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -575,7 +575,7 @@ class DataFrameWriter(OptionUtils): .. note:: Applicable for file-based data sources in combination with :py:meth:`DataFrameWriter.saveAsTable`. ->>> (df.write.format('parquet') +>>> (df.write.format('parquet') # doctest: +SKIP ... .bucketBy(100, 'year', 'month') ... .mode("overwrite") ... .saveAsTable('bucketed_table')) @@ -602,7 +602,7 @@ class DataFrameWriter(OptionUtils): :param col: a name of a column, or a list of names. :param cols: additional names (optional). If `col` is a list it should be empty. ->>> (df.write.format('parquet') +>>> (df.write.format('parquet') # doctest: +SKIP ... .bucketBy(100, 'year', 'month') ... .sortBy('day') ... .mode("overwrite") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
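For readers unfamiliar with the directive added above: `# doctest: +SKIP` keeps an example in the generated documentation but tells the doctest runner not to execute it, which is what stops the leaked `bucketed_table` from being created during doc tests. A minimal, self-contained illustration follows (plain Python, not Spark code; `write_bucketed_table` is a made-up placeholder):

```python
def bucketing_example():
    """
    >>> 1 + 1
    2
    >>> write_bucketed_table()   # doctest: +SKIP
    'shown in the docs, never executed'
    """

if __name__ == "__main__":
    import doctest
    # Passes even though write_bucketed_table() is undefined: the +SKIP
    # example is rendered in documentation but skipped at test time.
    print(doctest.testmod())
```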
spark git commit: [SPARK-19447] Remove remaining references to generated rows metric
Repository: spark Updated Branches: refs/heads/branch-2.2 358516dcb -> 86cef4df5 [SPARK-19447] Remove remaining references to generated rows metric ## What changes were proposed in this pull request? https://github.com/apache/spark/commit/b486ffc86d8ad6c303321dcf8514afee723f61f8 left behind references to "number of generated rows" metrics, that should have been removed. ## How was this patch tested? Existing unit tests. Author: Ala Luszczak Closes #17939 from ala/SPARK-19447-fix. (cherry picked from commit 5c2c4dcce529d228a97ede0386b95213ea0e1da5) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/86cef4df Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/86cef4df Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/86cef4df Branch: refs/heads/branch-2.2 Commit: 86cef4df5fd9e28a8ece4ec33376d3622de2ef69 Parents: 358516d Author: Ala Luszczak Authored: Wed May 10 08:41:04 2017 -0700 Committer: Herman van Hovell Committed: Wed May 10 08:41:14 2017 -0700 -- .../org/apache/spark/sql/execution/basicPhysicalOperators.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/86cef4df/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 64698d5..85096dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -340,8 +340,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) override val output: Seq[Attribute] = range.output override lazy val metrics = Map( -"numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), -"numGeneratedRows" -> SQLMetrics.createMetric(sparkContext, "number of generated rows")) +"numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) override lazy val canonicalized: SparkPlan = { RangeExec(range.canonicalized.asInstanceOf[org.apache.spark.sql.catalyst.plans.logical.Range]) @@ -354,7 +353,6 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) protected override def doProduce(ctx: CodegenContext): String = { val numOutput = metricTerm(ctx, "numOutputRows") -val numGenerated = metricTerm(ctx, "numGeneratedRows") val initTerm = ctx.freshName("initRange") ctx.addMutableState("boolean", initTerm, s"$initTerm = false;") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19447] Remove remaining references to generated rows metric
Repository: spark Updated Branches: refs/heads/master fcb88f921 -> 5c2c4dcce [SPARK-19447] Remove remaining references to generated rows metric ## What changes were proposed in this pull request? https://github.com/apache/spark/commit/b486ffc86d8ad6c303321dcf8514afee723f61f8 left behind references to "number of generated rows" metrics, that should have been removed. ## How was this patch tested? Existing unit tests. Author: Ala Luszczak Closes #17939 from ala/SPARK-19447-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c2c4dcc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c2c4dcc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c2c4dcc Branch: refs/heads/master Commit: 5c2c4dcce529d228a97ede0386b95213ea0e1da5 Parents: fcb88f9 Author: Ala Luszczak Authored: Wed May 10 08:41:04 2017 -0700 Committer: Herman van Hovell Committed: Wed May 10 08:41:04 2017 -0700 -- .../org/apache/spark/sql/execution/basicPhysicalOperators.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c2c4dcc/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 64698d5..85096dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -340,8 +340,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) override val output: Seq[Attribute] = range.output override lazy val metrics = Map( -"numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), -"numGeneratedRows" -> SQLMetrics.createMetric(sparkContext, "number of generated rows")) +"numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) override lazy val canonicalized: SparkPlan = { RangeExec(range.canonicalized.asInstanceOf[org.apache.spark.sql.catalyst.plans.logical.Range]) @@ -354,7 +353,6 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) protected override def doProduce(ctx: CodegenContext): String = { val numOutput = metricTerm(ctx, "numOutputRows") -val numGenerated = metricTerm(ctx, "numGeneratedRows") val initTerm = ctx.freshName("initRange") ctx.addMutableState("boolean", initTerm, s"$initTerm = false;") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][BUILD] Fix lint-java breaks.
Repository: spark Updated Branches: refs/heads/branch-2.2 5f6029c75 -> 358516dcb [MINOR][BUILD] Fix lint-java breaks. ## What changes were proposed in this pull request? This PR proposes to fix the lint-breaks as below: ``` [ERROR] src/main/java/org/apache/spark/unsafe/Platform.java:[51] (regexp) RegexpSingleline: No trailing whitespace allowed. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[45,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[62,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[78,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[92,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[102,25] (naming) MethodName: Method name 'Once' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java:[28,8] (imports) UnusedImports: Unused import - org.apache.spark.streaming.api.java.JavaDStream. ``` after: ``` dev/lint-java Checkstyle checks passed. ``` [Test Result](https://travis-ci.org/ConeyLiu/spark/jobs/229666169) ## How was this patch tested? Travis CI Author: Xianyang Liu Closes #17890 from ConeyLiu/codestyle. (cherry picked from commit fcb88f9211e39c705073db5300c96ceeb3f227d7) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/358516dc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/358516dc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/358516dc Branch: refs/heads/branch-2.2 Commit: 358516dcbef5178cdc6cb4387d7f6837359946ba Parents: 5f6029c Author: Xianyang Liu Authored: Wed May 10 13:56:34 2017 +0100 Committer: Sean Owen Committed: Wed May 10 13:56:42 2017 +0100 -- common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java | 2 +- core/src/main/scala/org/apache/spark/storage/BlockManager.scala | 3 --- dev/checkstyle-suppressions.xml | 2 +- .../streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java| 1 - 4 files changed, 2 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/358516dc/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index 4ab5b68..aca6fca 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -48,7 +48,7 @@ public final class Platform { boolean _unaligned; String arch = System.getProperty("os.arch", ""); if (arch.equals("ppc64le") || arch.equals("ppc64")) { - // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but + // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but // ppc64 and ppc64le support it _unaligned = true; } else { http://git-wip-us.apache.org/repos/asf/spark/blob/358516dc/core/src/main/scala/org/apache/spark/storage/BlockManager.scala -- diff --git 
a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 3219969..ad0dc3c 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -29,8 +29,6 @@ import scala.reflect.ClassTag import scala.util.Random import scala.util.control.NonFatal -import com.google.common.io.ByteStreams - import org.apache.spark._ import org.apache.spark.executor.{DataReadMethod, ShuffleWriteMetrics} import org.apache.spark.internal.Logging @@ -41,7 +39,6 @@ import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.ExternalShuffleClient import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo import org.apache.spark.rpc.RpcEnv -import org.apache.spark.security.CryptoStreamUtils import org.apache.spark.serializer.{SerializerInstance, SerializerManager} import org.apache.spark.shuffle.ShuffleManager import org.apache.spark.storage.memory._ http://git-wip-us.apache
spark git commit: [MINOR][BUILD] Fix lint-java breaks.
Repository: spark Updated Branches: refs/heads/master 76e4a5566 -> fcb88f921 [MINOR][BUILD] Fix lint-java breaks. ## What changes were proposed in this pull request? This PR proposes to fix the lint-breaks as below: ``` [ERROR] src/main/java/org/apache/spark/unsafe/Platform.java:[51] (regexp) RegexpSingleline: No trailing whitespace allowed. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[45,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[62,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[78,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[92,25] (naming) MethodName: Method name 'ProcessingTime' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/main/scala/org/apache/spark/sql/streaming/Trigger.java:[102,25] (naming) MethodName: Method name 'Once' must match pattern '^[a-z][a-z0-9][a-zA-Z0-9_]*$'. [ERROR] src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java:[28,8] (imports) UnusedImports: Unused import - org.apache.spark.streaming.api.java.JavaDStream. ``` after: ``` dev/lint-java Checkstyle checks passed. ``` [Test Result](https://travis-ci.org/ConeyLiu/spark/jobs/229666169) ## How was this patch tested? Travis CI Author: Xianyang Liu Closes #17890 from ConeyLiu/codestyle. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fcb88f92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fcb88f92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fcb88f92 Branch: refs/heads/master Commit: fcb88f9211e39c705073db5300c96ceeb3f227d7 Parents: 76e4a55 Author: Xianyang Liu Authored: Wed May 10 13:56:34 2017 +0100 Committer: Sean Owen Committed: Wed May 10 13:56:34 2017 +0100 -- common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java | 2 +- core/src/main/scala/org/apache/spark/storage/BlockManager.scala | 3 --- dev/checkstyle-suppressions.xml | 2 +- .../streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java| 1 - 4 files changed, 2 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fcb88f92/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index 4ab5b68..aca6fca 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -48,7 +48,7 @@ public final class Platform { boolean _unaligned; String arch = System.getProperty("os.arch", ""); if (arch.equals("ppc64le") || arch.equals("ppc64")) { - // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but + // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but // ppc64 and ppc64le support it _unaligned = true; } else { http://git-wip-us.apache.org/repos/asf/spark/blob/fcb88f92/core/src/main/scala/org/apache/spark/storage/BlockManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 
33ce30c..b3e4584 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -29,8 +29,6 @@ import scala.reflect.ClassTag import scala.util.Random import scala.util.control.NonFatal -import com.google.common.io.ByteStreams - import org.apache.spark._ import org.apache.spark.executor.{DataReadMethod, ShuffleWriteMetrics} import org.apache.spark.internal.Logging @@ -41,7 +39,6 @@ import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.ExternalShuffleClient import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo import org.apache.spark.rpc.RpcEnv -import org.apache.spark.security.CryptoStreamUtils import org.apache.spark.serializer.{SerializerInstance, SerializerManager} import org.apache.spark.shuffle.ShuffleManager import org.apache.spark.storage.memory._ http://git-wip-us.apache.org/repos/asf/spark/blob/fcb88f92/dev/checkstyle-suppressions.xml -
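As a side note on the MethodName errors quoted above: the pattern `'^[a-z][a-z0-9][a-zA-Z0-9_]*$'` requires the first character to be a lowercase letter and the second a lowercase letter or digit, so factory methods named `ProcessingTime` and `Once` can never satisfy it. A quick check of the pattern itself (plain Python, purely illustrative and not part of the patch):

```python
import re

# The checkstyle MethodName pattern from the lint errors above.
method_name = re.compile(r'^[a-z][a-z0-9][a-zA-Z0-9_]*$')

for name in ("processingTime", "ProcessingTime", "Once"):
    print(name, bool(method_name.match(name)))
# processingTime True   -- ordinary camelCase passes
# ProcessingTime False  -- uppercase first letter is rejected
# Once False
```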
spark git commit: [SPARK-20678][SQL] Ndv for columns not in filter condition should also be updated
Repository: spark Updated Branches: refs/heads/branch-2.2 0851b6cfb -> 5f6029c75 [SPARK-20678][SQL] Ndv for columns not in filter condition should also be updated ## What changes were proposed in this pull request? In filter estimation, we update column stats for those columns in filter condition. However, if the number of rows decreases after the filter (i.e. the overall selectivity is less than 1), we need to update (scale down) the number of distinct values (NDV) for all columns, no matter they are in filter conditions or not. This pr also fixes the inconsistency of rounding mode for ndv and rowCount. ## How was this patch tested? Added new tests. Author: wangzhenhua Closes #17918 from wzhfy/scaleDownNdvAfterFilter. (cherry picked from commit 76e4a5566b1e9579632e03440cecd04dd142bc44) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5f6029c7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5f6029c7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5f6029c7 Branch: refs/heads/branch-2.2 Commit: 5f6029c7500b0c5a769c6b62879d8532a5692a50 Parents: 0851b6c Author: wangzhenhua Authored: Wed May 10 19:42:49 2017 +0800 Committer: Wenchen Fan Committed: Wed May 10 19:43:05 2017 +0800 -- .../statsEstimation/EstimationUtils.scala | 12 ++ .../statsEstimation/FilterEstimation.scala | 134 +++ .../statsEstimation/JoinEstimation.scala| 25 +--- .../statsEstimation/FilterEstimationSuite.scala | 63 + 4 files changed, 133 insertions(+), 101 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5f6029c7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index f1aff62..e5fcdf9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -43,6 +43,18 @@ object EstimationUtils { avgLen = dataType.defaultSize, maxLen = dataType.defaultSize) } + /** + * Updates (scales down) the number of distinct values if the number of rows decreases after + * some operation (such as filter, join). Otherwise keep it unchanged. + */ + def updateNdv(oldNumRows: BigInt, newNumRows: BigInt, oldNdv: BigInt): BigInt = { +if (newNumRows < oldNumRows) { + ceil(BigDecimal(oldNdv) * BigDecimal(newNumRows) / BigDecimal(oldNumRows)) +} else { + oldNdv +} + } + def ceil(bigDecimal: BigDecimal): BigInt = bigDecimal.setScale(0, RoundingMode.CEILING).toBigInt() /** Get column stats for output attributes. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/5f6029c7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index 4b6b3b1..df19086 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -19,12 +19,12 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import scala.collection.immutable.HashSet import scala.collection.mutable -import scala.math.BigDecimal.RoundingMode import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, LeafNode, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -32,14 +32,7 @@ case class FilterEstimation(plan: Filter, catalystConf: SQLConf) extends Logging private val childStats = plan.child.stats(catalystConf) - /** - * We will update the corresponding ColumnStats for a column after we apply a predicate condition. - * For example, column c has [min, max] value as [0, 100]. In a range condit
spark git commit: [SPARK-20678][SQL] Ndv for columns not in filter condition should also be updated
Repository: spark Updated Branches: refs/heads/master 789bdbe3d -> 76e4a5566 [SPARK-20678][SQL] Ndv for columns not in filter condition should also be updated ## What changes were proposed in this pull request? In filter estimation, we update column stats for those columns in filter condition. However, if the number of rows decreases after the filter (i.e. the overall selectivity is less than 1), we need to update (scale down) the number of distinct values (NDV) for all columns, no matter they are in filter conditions or not. This pr also fixes the inconsistency of rounding mode for ndv and rowCount. ## How was this patch tested? Added new tests. Author: wangzhenhua Closes #17918 from wzhfy/scaleDownNdvAfterFilter. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76e4a556 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76e4a556 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76e4a556 Branch: refs/heads/master Commit: 76e4a5566b1e9579632e03440cecd04dd142bc44 Parents: 789bdbe Author: wangzhenhua Authored: Wed May 10 19:42:49 2017 +0800 Committer: Wenchen Fan Committed: Wed May 10 19:42:49 2017 +0800 -- .../statsEstimation/EstimationUtils.scala | 12 ++ .../statsEstimation/FilterEstimation.scala | 134 +++ .../statsEstimation/JoinEstimation.scala| 25 +--- .../statsEstimation/FilterEstimationSuite.scala | 63 + 4 files changed, 133 insertions(+), 101 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/76e4a556/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index f1aff62..e5fcdf9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -43,6 +43,18 @@ object EstimationUtils { avgLen = dataType.defaultSize, maxLen = dataType.defaultSize) } + /** + * Updates (scales down) the number of distinct values if the number of rows decreases after + * some operation (such as filter, join). Otherwise keep it unchanged. + */ + def updateNdv(oldNumRows: BigInt, newNumRows: BigInt, oldNdv: BigInt): BigInt = { +if (newNumRows < oldNumRows) { + ceil(BigDecimal(oldNdv) * BigDecimal(newNumRows) / BigDecimal(oldNumRows)) +} else { + oldNdv +} + } + def ceil(bigDecimal: BigDecimal): BigInt = bigDecimal.setScale(0, RoundingMode.CEILING).toBigInt() /** Get column stats for output attributes. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/76e4a556/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index 4b6b3b1..df19086 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -19,12 +19,12 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import scala.collection.immutable.HashSet import scala.collection.mutable -import scala.math.BigDecimal.RoundingMode import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, LeafNode, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -32,14 +32,7 @@ case class FilterEstimation(plan: Filter, catalystConf: SQLConf) extends Logging private val childStats = plan.child.stats(catalystConf) - /** - * We will update the corresponding ColumnStats for a column after we apply a predicate condition. - * For example, column c has [min, max] value as [0, 100]. In a range condition such as - * (c > 40 AND c <= 50), we need to set the column's [min, max] value to [40, 100] after w
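The scaling rule described above (and implemented by the new `updateNdv` helper in EstimationUtils.scala) is easy to sanity-check: when an operator keeps only a fraction of the rows, each column's distinct-value count is shrunk by the same fraction and rounded up; if the row count did not decrease, the NDV is left unchanged. A small plain-Python sketch of that arithmetic (illustrative only, not the Spark implementation):

```python
def update_ndv(old_num_rows: int, new_num_rows: int, old_ndv: int) -> int:
    if new_num_rows < old_num_rows:
        # ceil(old_ndv * new_num_rows / old_num_rows), i.e. RoundingMode.CEILING
        return -(-old_ndv * new_num_rows // old_num_rows)
    return old_ndv

# A filter keeping 100 of 1,000 rows scales a column's NDV of 40 down to 4 ...
assert update_ndv(1000, 100, 40) == 4
# ... rounding up when the division is not exact ...
assert update_ndv(1000, 100, 45) == 5
# ... and leaving the NDV unchanged when no rows were removed.
assert update_ndv(1000, 1000, 40) == 40
```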
spark git commit: [SPARK-20688][SQL] correctly check analysis for scalar sub-queries
Repository: spark Updated Branches: refs/heads/branch-2.1 69786ea3a -> bdc08ab64 [SPARK-20688][SQL] correctly check analysis for scalar sub-queries In `CheckAnalysis`, we should call `checkAnalysis` for `ScalarSubquery` at the beginning, as later we will call `plan.output` which is invalid if `plan` is not resolved. new regression test Author: Wenchen Fan Closes #17930 from cloud-fan/tmp. (cherry picked from commit 789bdbe3d0d9558043872161bdfa148ec021a849) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdc08ab6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdc08ab6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdc08ab6 Branch: refs/heads/branch-2.1 Commit: bdc08ab644f8e0974681446d724600c5c2ac7a56 Parents: 69786ea Author: Wenchen Fan Authored: Wed May 10 19:30:00 2017 +0800 Committer: Wenchen Fan Committed: Wed May 10 19:32:44 2017 +0800 -- .../spark/sql/catalyst/analysis/CheckAnalysis.scala | 3 ++- .../test/scala/org/apache/spark/sql/SubquerySuite.scala | 10 +- 2 files changed, 11 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bdc08ab6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 06bbd39..e68bf81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -128,6 +128,8 @@ trait CheckAnalysis extends PredicateHelper { } case s @ ScalarSubquery(query, conditions, _) => +checkAnalysis(query) + // If no correlation, the output must be exactly one column if (conditions.isEmpty && query.output.size != 1) { failAnalysis( @@ -186,7 +188,6 @@ trait CheckAnalysis extends PredicateHelper { case fail => failAnalysis(s"Correlated scalar subqueries must be Aggregated: $fail") } } -checkAnalysis(query) s case s: SubqueryExpression => http://git-wip-us.apache.org/repos/asf/spark/blob/bdc08ab6/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index fb92f31..b0b2833 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -72,7 +72,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { } } - test("rdd deserialization does not crash [SPARK-15791]") { + test("SPARK-15791: rdd deserialization does not crash") { sql("select (select 1 as b) as b").rdd.count() } @@ -839,4 +839,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext { Row(0) :: Row(1) :: Nil) } } + + test("SPARK-20688: correctly check analysis for scalar sub-queries") { +withTempView("t") { + Seq(1 -> "a").toDF("i", "j").createTempView("t") + val e = intercept[AnalysisException](sql("SELECT (SELECT count(*) FROM t WHERE a = 1)")) + assert(e.message.contains("cannot resolve '`a`' given input columns: [i, j]")) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20688][SQL] correctly check analysis for scalar sub-queries
Repository: spark Updated Branches: refs/heads/branch-2.2 7597a522b -> 0851b6cfb [SPARK-20688][SQL] correctly check analysis for scalar sub-queries ## What changes were proposed in this pull request? In `CheckAnalysis`, we should call `checkAnalysis` for `ScalarSubquery` at the beginning, as later we will call `plan.output` which is invalid if `plan` is not resolved. ## How was this patch tested? new regression test Author: Wenchen Fan Closes #17930 from cloud-fan/tmp. (cherry picked from commit 789bdbe3d0d9558043872161bdfa148ec021a849) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0851b6cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0851b6cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0851b6cf Branch: refs/heads/branch-2.2 Commit: 0851b6cfb8980fa8816a96026fbf0498799e296b Parents: 7597a52 Author: Wenchen Fan Authored: Wed May 10 19:30:00 2017 +0800 Committer: Wenchen Fan Committed: Wed May 10 19:30:26 2017 +0800 -- .../spark/sql/catalyst/analysis/CheckAnalysis.scala | 6 +++--- .../test/scala/org/apache/spark/sql/SubquerySuite.scala | 10 +- 2 files changed, 12 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0851b6cf/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 61797bc..ea4560a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -130,12 +130,13 @@ trait CheckAnalysis extends PredicateHelper { } case s @ ScalarSubquery(query, conditions, _) => +checkAnalysis(query) + // If no correlation, the output must be exactly one column if (conditions.isEmpty && query.output.size != 1) { failAnalysis( s"Scalar subquery must return only one column, but got ${query.output.size}") -} -else if (conditions.nonEmpty) { +} else if (conditions.nonEmpty) { def checkAggregate(agg: Aggregate): Unit = { // Make sure correlated scalar subqueries contain one row for every outer row by // enforcing that they are aggregates containing exactly one aggregate expression. 
@@ -179,7 +180,6 @@ trait CheckAnalysis extends PredicateHelper { case fail => failAnalysis(s"Correlated scalar subqueries must be Aggregated: $fail") } } -checkAnalysis(query) s case s: SubqueryExpression => http://git-wip-us.apache.org/repos/asf/spark/blob/0851b6cf/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 0f0199c..2a3bdfb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -72,7 +72,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { } } - test("rdd deserialization does not crash [SPARK-15791]") { + test("SPARK-15791: rdd deserialization does not crash") { sql("select (select 1 as b) as b").rdd.count() } @@ -854,4 +854,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext { sql("select * from l, r where l.a = r.c + 1 AND (exists (select * from r) OR l.a = r.c)"), Row(3, 3.0, 2, 3.0) :: Row(3, 3.0, 2, 3.0) :: Nil) } + + test("SPARK-20688: correctly check analysis for scalar sub-queries") { +withTempView("t") { + Seq(1 -> "a").toDF("i", "j").createTempView("t") + val e = intercept[AnalysisException](sql("SELECT (SELECT count(*) FROM t WHERE a = 1)")) + assert(e.message.contains("cannot resolve '`a`' given input columns: [i, j]")) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20688][SQL] correctly check analysis for scalar sub-queries
Repository: spark
Updated Branches:
  refs/heads/master b512233a4 -> 789bdbe3d

[SPARK-20688][SQL] correctly check analysis for scalar sub-queries

## What changes were proposed in this pull request?

In `CheckAnalysis`, we should call `checkAnalysis` for `ScalarSubquery` at the beginning, as later we will call `plan.output` which is invalid if `plan` is not resolved.

## How was this patch tested?

new regression test

Author: Wenchen Fan

Closes #17930 from cloud-fan/tmp.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/789bdbe3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/789bdbe3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/789bdbe3

Branch: refs/heads/master
Commit: 789bdbe3d0d9558043872161bdfa148ec021a849
Parents: b512233
Author: Wenchen Fan
Authored: Wed May 10 19:30:00 2017 +0800
Committer: Wenchen Fan
Committed: Wed May 10 19:30:00 2017 +0800

--
 .../spark/sql/catalyst/analysis/CheckAnalysis.scala      |  6 +++---
 .../test/scala/org/apache/spark/sql/SubquerySuite.scala  | 10 +-
 2 files changed, 12 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/789bdbe3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 61797bc..ea4560a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -130,12 +130,13 @@ trait CheckAnalysis extends PredicateHelper {
           }
 
         case s @ ScalarSubquery(query, conditions, _) =>
+          checkAnalysis(query)
+
           // If no correlation, the output must be exactly one column
           if (conditions.isEmpty && query.output.size != 1) {
             failAnalysis(
               s"Scalar subquery must return only one column, but got ${query.output.size}")
-          }
-          else if (conditions.nonEmpty) {
+          } else if (conditions.nonEmpty) {
             def checkAggregate(agg: Aggregate): Unit = {
               // Make sure correlated scalar subqueries contain one row for every outer row by
               // enforcing that they are aggregates containing exactly one aggregate expression.
@@ -179,7 +180,6 @@ trait CheckAnalysis extends PredicateHelper {
               case fail => failAnalysis(s"Correlated scalar subqueries must be Aggregated: $fail")
             }
           }
-          checkAnalysis(query)
           s
 
         case s: SubqueryExpression =>

http://git-wip-us.apache.org/repos/asf/spark/blob/789bdbe3/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
index 131abf7..a01eb2a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
@@ -72,7 +72,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
     }
   }
 
-  test("rdd deserialization does not crash [SPARK-15791]") {
+  test("SPARK-15791: rdd deserialization does not crash") {
     sql("select (select 1 as b) as b").rdd.count()
   }
 
@@ -867,4 +867,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
       sql("select * from l, r where l.a = r.c + 1 AND (exists (select * from r) OR l.a = r.c)"),
       Row(3, 3.0, 2, 3.0) :: Row(3, 3.0, 2, 3.0) :: Nil)
   }
+
+  test("SPARK-20688: correctly check analysis for scalar sub-queries") {
+    withTempView("t") {
+      Seq(1 -> "a").toDF("i", "j").createTempView("t")
+      val e = intercept[AnalysisException](sql("SELECT (SELECT count(*) FROM t WHERE a = 1)"))
+      assert(e.message.contains("cannot resolve '`a`' given input columns: [i, j]"))
+    }
+  }
 }
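From the user's side, the regression test added in `SubquerySuite` corresponds to a spark-shell session along these lines (a sketch that assumes `spark` is the active `SparkSession`, as in a default shell):

```scala
import org.apache.spark.sql.AnalysisException
import spark.implicits._  // enables .toDF on local Seqs

Seq(1 -> "a").toDF("i", "j").createOrReplaceTempView("t")

// Column `a` does not exist in `t`, so analysis of the scalar subquery should fail
// with a clear message rather than an internal error about an unresolved plan.
try {
  spark.sql("SELECT (SELECT count(*) FROM t WHERE a = 1)").collect()
} catch {
  case e: AnalysisException =>
    // Expected after this fix, e.g. "cannot resolve '`a`' given input columns: [i, j]"
    println(e.getMessage)
}
```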
spark git commit: [SPARK-20393][WEBU UI] Strengthen Spark to prevent XSS vulnerabilities
Repository: spark Updated Branches: refs/heads/master a4cbf26bc -> b512233a4 [SPARK-20393][WEBU UI] Strengthen Spark to prevent XSS vulnerabilities ## What changes were proposed in this pull request? Add stripXSS and stripXSSMap to Spark Core's UIUtils. Calling these functions at any point that getParameter is called against a HttpServletRequest. ## How was this patch tested? Unit tests, IBM Security AppScan Standard no longer showing vulnerabilities, manual verification of WebUI pages. Author: NICHOLAS T. MARION Closes #17686 from n-marion/xss-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b512233a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b512233a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b512233a Branch: refs/heads/master Commit: b512233a457092b0e2a39d0b42cb021abc69d375 Parents: a4cbf26 Author: NICHOLAS T. MARION Authored: Wed May 10 10:59:57 2017 +0100 Committer: Sean Owen Committed: Wed May 10 10:59:57 2017 +0100 -- .../spark/deploy/history/HistoryPage.scala | 3 +- .../deploy/master/ui/ApplicationPage.scala | 3 +- .../spark/deploy/master/ui/MasterPage.scala | 6 ++- .../apache/spark/deploy/worker/ui/LogPage.scala | 30 +-- .../scala/org/apache/spark/ui/UIUtils.scala | 21 +++ .../spark/ui/exec/ExecutorThreadDumpPage.scala | 4 +- .../org/apache/spark/ui/jobs/AllJobsPage.scala | 14 --- .../org/apache/spark/ui/jobs/JobPage.scala | 3 +- .../org/apache/spark/ui/jobs/JobsTab.scala | 5 ++- .../org/apache/spark/ui/jobs/PoolPage.scala | 3 +- .../org/apache/spark/ui/jobs/StagePage.scala| 15 .../org/apache/spark/ui/jobs/StageTable.scala | 15 .../org/apache/spark/ui/jobs/StagesTab.scala| 5 ++- .../org/apache/spark/ui/storage/RDDPage.scala | 13 --- .../org/apache/spark/ui/UIUtilsSuite.scala | 39 .../spark/deploy/mesos/ui/DriverPage.scala | 3 +- .../spark/sql/execution/ui/ExecutionPage.scala | 3 +- .../ui/ThriftServerSessionPage.scala| 4 +- .../apache/spark/streaming/ui/BatchPage.scala | 5 ++- 19 files changed, 140 insertions(+), 54 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b512233a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 0e7a6c2..af14717 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -26,8 +26,9 @@ import org.apache.spark.ui.{UIUtils, WebUIPage} private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { def render(request: HttpServletRequest): Seq[Node] = { +// stripXSS is called first to remove suspicious characters used in XSS attacks val requestedIncomplete = - Option(request.getParameter("showIncomplete")).getOrElse("false").toBoolean + Option(UIUtils.stripXSS(request.getParameter("showIncomplete"))).getOrElse("false").toBoolean val allAppsSize = parent.getApplicationList().count(_.completed != requestedIncomplete) val eventLogsUnderProcessCount = parent.getEventLogsUnderProcess() http://git-wip-us.apache.org/repos/asf/spark/blob/b512233a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index a8d721f..94ff81c 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -33,7 +33,8 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") /** Executor details for a particular application */ def render(request: HttpServletRequest): Seq[Node] = { -val appId = request.getParameter("appId") +// stripXSS is called first to remove suspicious characters used in XSS attacks +val appId = UIUtils.stripXSS(request.getParameter("appId")) val state = master.askSync[MasterStateResponse](RequestMasterState) val app = state.activeApps.find(_.id == appId) .getOrElse(state.completedApps.find(_.id == appId).orNull) http://git-wip-us.apache.org/repos/asf/spark/blob/b512233a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala -
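The excerpt above cuts off before reaching the `UIUtils.scala` diff, so the body of the new `stripXSS` helper is not shown here. Purely as a hedged illustration of what such a sanitizer typically does (this is not the actual Spark implementation, and `ToyUIUtils` is an invented name), the idea is to strip characters commonly used to inject markup or script before a request parameter is echoed back into a rendered page:

```scala
// Illustrative sketch only -- NOT the actual UIUtils.stripXSS body.
object ToyUIUtils {
  def stripXSS(requestParameter: String): String =
    if (requestParameter == null) null
    // Drop markup-significant characters such as <, >, quotes, %, ; and parentheses.
    else requestParameter.replaceAll("""[<>"'%;()&+]""", "")
}

// Example: a malicious `appId` query parameter is reduced to harmless text.
val tainted = """<script>alert("x")</script>"""
println(ToyUIUtils.stripXSS(tainted))  // prints: scriptalertx/script
```

Wrapping every `request.getParameter` call in such a helper, as the diff does, keeps the sanitization in one place instead of relying on each page to escape its own inputs.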
spark git commit: [SPARK-20637][CORE] Remove mention of old RDD classes from comments
Repository: spark Updated Branches: refs/heads/master ca4625e0e -> a4cbf26bc [SPARK-20637][CORE] Remove mention of old RDD classes from comments ## What changes were proposed in this pull request? A few comments around the code mention RDD classes that do not exist anymore. I'm not sure of the best way to replace these, so I've just removed them here. ## How was this patch tested? Only changes code comments, no testing required Author: Michael Mior Closes #17900 from michaelmior/remove-old-rdds. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a4cbf26b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a4cbf26b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a4cbf26b Branch: refs/heads/master Commit: a4cbf26bca349a63586777ad3e398717eb94473e Parents: ca4625e Author: Michael Mior Authored: Wed May 10 10:21:43 2017 +0100 Committer: Sean Owen Committed: Wed May 10 10:21:43 2017 +0100 -- .../main/scala/org/apache/spark/scheduler/DAGScheduler.scala | 2 +- core/src/test/scala/org/apache/spark/CheckpointSuite.scala | 6 +++--- .../scala/org/apache/spark/scheduler/SparkListenerSuite.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a4cbf26b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index aab177f..68178c7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -58,7 +58,7 @@ import org.apache.spark.util._ * set of map output files, and another to read those files after a barrier). In the end, every * stage will have only shuffle dependencies on other stages, and may compute multiple operations * inside it. The actual pipelining of these operations happens in the RDD.compute() functions of - * various RDDs (MappedRDD, FilteredRDD, etc). + * various RDDs * * In addition to coming up with a DAG of stages, the DAGScheduler also determines the preferred * locations to run each task on, based on the current cache status, and passes these to the http://git-wip-us.apache.org/repos/asf/spark/blob/a4cbf26b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index ee70a33..48408cc 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -114,7 +114,7 @@ trait RDDCheckpointTester { self: SparkFunSuite => * RDDs partitions. So even if the parent RDD is checkpointed and its partitions changed, * the generated RDD will remember the partitions and therefore potentially the whole lineage. * This function should be called only those RDD whose partitions refer to parent RDD's - * partitions (i.e., do not call it on simple RDD like MappedRDD). + * partitions (i.e., do not call it on simple RDDs). * * @param op an operation to run on the RDD * @param reliableCheckpoint if true, use reliable checkpoints, otherwise use local checkpoints @@ -388,7 +388,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS // the parent RDD has been checkpointed and parent partitions have been changed. 
// Note that this test is very specific to the current implementation of CartesianRDD. val ones = sc.makeRDD(1 to 100, 10).map(x => x) -checkpoint(ones, reliableCheckpoint) // checkpoint that MappedRDD +checkpoint(ones, reliableCheckpoint) val cartesian = new CartesianRDD(sc, ones, ones) val splitBeforeCheckpoint = serializeDeserialize(cartesian.partitions.head.asInstanceOf[CartesianPartition]) @@ -411,7 +411,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS // Note that this test is very specific to the current implementation of // CoalescedRDDPartitions. val ones = sc.makeRDD(1 to 100, 10).map(x => x) -checkpoint(ones, reliableCheckpoint) // checkpoint that MappedRDD +checkpoint(ones, reliableCheckpoint) val coalesced = new CoalescedRDD(ones, 2) val splitBeforeCheckpoint = serializeDeserialize(coalesced.partitions.head.asInstanceOf[CoalescedRDDPartition]) http://git-wip-us.apache.org/repos/asf/spark/blob/a4cbf26b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala -
spark git commit: [SPARK-20630][WEB UI] Fixed column visibility in Executor Tab
Repository: spark Updated Branches: refs/heads/branch-2.2 3ed2f4d51 -> 7597a522b [SPARK-20630][WEB UI] Fixed column visibility in Executor Tab ## What changes were proposed in this pull request? #14617 added new columns to the executor table causing the visibility checks for the logs and threadDump columns to toggle the wrong columns since they used hard-coded column numbers. I've updated the checks to use column names instead of numbers so future updates don't accidentally break this again. Note: This will also need to be back ported into 2.2 since #14617 was merged there ## How was this patch tested? Manually tested Author: Alex Bozarth Closes #17904 from ajbozarth/spark20630. (cherry picked from commit ca4625e0e58df7f02346470d22a9478d9640709d) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7597a522 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7597a522 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7597a522 Branch: refs/heads/branch-2.2 Commit: 7597a522b7e5be43910e86cd6f805e7e9ee08ced Parents: 3ed2f4d Author: Alex Bozarth Authored: Wed May 10 10:20:10 2017 +0100 Committer: Sean Owen Committed: Wed May 10 10:20:19 2017 +0100 -- .../org/apache/spark/ui/static/executorspage.js | 12 1 file changed, 4 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7597a522/core/src/main/resources/org/apache/spark/ui/static/executorspage.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index cb9922d..6643a8f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -492,24 +492,20 @@ $(document).ready(function () { {data: 'totalInputBytes', render: formatBytes}, {data: 'totalShuffleRead', render: formatBytes}, {data: 'totalShuffleWrite', render: formatBytes}, -{data: 'executorLogs', render: formatLogsCells}, +{name: 'executorLogsCol', data: 'executorLogs', render: formatLogsCells}, { +name: 'threadDumpCol', data: 'id', render: function (data, type) { return type === 'display' ? ("Thread Dump" ) : data; } } ], -"columnDefs": [ -{ -"targets": [ 16 ], -"visible": getThreadDumpEnabled() -} -], "order": [[0, "asc"]] }; var dt = $(selector).DataTable(conf); -dt.column(15).visible(logsExist(response)); +dt.column('executorLogsCol:name').visible(logsExist(response)); + dt.column('threadDumpCol:name').visible(getThreadDumpEnabled()); $('#active-executors [data-toggle="tooltip"]').tooltip(); var sumSelector = "#summary-execs-table"; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20630][WEB UI] Fixed column visibility in Executor Tab
Repository: spark Updated Branches: refs/heads/master 804949c6b -> ca4625e0e [SPARK-20630][WEB UI] Fixed column visibility in Executor Tab ## What changes were proposed in this pull request? #14617 added new columns to the executor table causing the visibility checks for the logs and threadDump columns to toggle the wrong columns since they used hard-coded column numbers. I've updated the checks to use column names instead of numbers so future updates don't accidentally break this again. Note: This will also need to be back ported into 2.2 since #14617 was merged there ## How was this patch tested? Manually tested Author: Alex Bozarth Closes #17904 from ajbozarth/spark20630. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca4625e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca4625e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca4625e0 Branch: refs/heads/master Commit: ca4625e0e58df7f02346470d22a9478d9640709d Parents: 804949c Author: Alex Bozarth Authored: Wed May 10 10:20:10 2017 +0100 Committer: Sean Owen Committed: Wed May 10 10:20:10 2017 +0100 -- .../org/apache/spark/ui/static/executorspage.js | 12 1 file changed, 4 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ca4625e0/core/src/main/resources/org/apache/spark/ui/static/executorspage.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index cb9922d..6643a8f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -492,24 +492,20 @@ $(document).ready(function () { {data: 'totalInputBytes', render: formatBytes}, {data: 'totalShuffleRead', render: formatBytes}, {data: 'totalShuffleWrite', render: formatBytes}, -{data: 'executorLogs', render: formatLogsCells}, +{name: 'executorLogsCol', data: 'executorLogs', render: formatLogsCells}, { +name: 'threadDumpCol', data: 'id', render: function (data, type) { return type === 'display' ? ("Thread Dump" ) : data; } } ], -"columnDefs": [ -{ -"targets": [ 16 ], -"visible": getThreadDumpEnabled() -} -], "order": [[0, "asc"]] }; var dt = $(selector).DataTable(conf); -dt.column(15).visible(logsExist(response)); +dt.column('executorLogsCol:name').visible(logsExist(response)); + dt.column('threadDumpCol:name').visible(getThreadDumpEnabled()); $('#active-executors [data-toggle="tooltip"]').tooltip(); var sumSelector = "#summary-execs-table"; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params
Repository: spark Updated Branches: refs/heads/branch-2.0 46659974e -> d86dae8fe [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params ## What changes were proposed in this pull request? - Replace `getParam` calls with `getOrDefault` calls. - Fix exception message to avoid unintended `TypeError`. - Add unit tests ## How was this patch tested? New unit tests. Author: zero323 Closes #17891 from zero323/SPARK-20631. (cherry picked from commit 804949c6bf00b8e26c39d48bbcc4d0470ee84e47) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d86dae8f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d86dae8f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d86dae8f Branch: refs/heads/branch-2.0 Commit: d86dae8feec5e9bf77dd5ba0cf9caa1b955de020 Parents: 4665997 Author: zero323 Authored: Wed May 10 16:57:52 2017 +0800 Committer: Yanbo Liang Committed: Wed May 10 17:00:22 2017 +0800 -- python/pyspark/ml/classification.py | 6 +++--- python/pyspark/ml/tests.py | 12 2 files changed, 15 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d86dae8f/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index bfeda7c..0a30321 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -200,13 +200,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def _checkThresholdConsistency(self): if self.isSet(self.threshold) and self.isSet(self.thresholds): -ts = self.getParam(self.thresholds) +ts = self.getOrDefault(self.thresholds) if len(ts) != 2: raise ValueError("Logistic Regression getThreshold only applies to" + " binary classification, but thresholds has length != 2." + - " thresholds: " + ",".join(ts)) + " thresholds: {0}".format(str(ts))) t = 1.0/(1.0 + ts[0]/ts[1]) -t2 = self.getParam(self.threshold) +t2 = self.getOrDefault(self.threshold) if abs(t2 - t) >= 1E-5: raise ValueError("Logistic Regression getThreshold found inconsistent values for" + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) http://git-wip-us.apache.org/repos/asf/spark/blob/d86dae8f/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 3c346b9..87f0aff 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -765,6 +765,18 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def logistic_regression_check_thresholds(self): +self.assertIsInstance( +LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), +LogisticRegressionModel +) + +self.assertRaisesRegexp( +ValueError, +"Logistic Regression getThreshold found inconsistent.*$", +LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] +) + def _compare_params(self, m1, m2, param): """ Compare 2 ML Params instances for the given param, and assert both have the same param value - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
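Two things are going on in this fix: `getParam` returns the `Param` descriptor object rather than its value, which made the error path itself raise an unintended `TypeError`, while `getOrDefault` returns the actual setting. The consistency rule the check enforces can be summarized with a small helper (a Scala sketch for illustration; the real check is the Python code shown above in `python/pyspark/ml/classification.py`):

```scala
// For binary classification, a two-element `thresholds` vector is equivalent to a single
// `threshold` of 1 / (1 + thresholds(0) / thresholds(1)); the check fails when the two
// settings disagree by 1e-5 or more.
def equivalentThreshold(thresholds: Array[Double]): Double = {
  require(thresholds.length == 2, "only meaningful for binary classification")
  1.0 / (1.0 + thresholds(0) / thresholds(1))
}

val consistent   = math.abs(equivalentThreshold(Array(0.5, 0.5)) - 0.5)  < 1e-5  // true
val inconsistent = math.abs(equivalentThreshold(Array(0.5, 0.5)) - 0.42) < 1e-5  // false -> error raised
```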
spark git commit: [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params
Repository: spark Updated Branches: refs/heads/master 0ef16bd4b -> 804949c6b [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params ## What changes were proposed in this pull request? - Replace `getParam` calls with `getOrDefault` calls. - Fix exception message to avoid unintended `TypeError`. - Add unit tests ## How was this patch tested? New unit tests. Author: zero323 Closes #17891 from zero323/SPARK-20631. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/804949c6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/804949c6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/804949c6 Branch: refs/heads/master Commit: 804949c6bf00b8e26c39d48bbcc4d0470ee84e47 Parents: 0ef16bd Author: zero323 Authored: Wed May 10 16:57:52 2017 +0800 Committer: Yanbo Liang Committed: Wed May 10 16:57:52 2017 +0800 -- python/pyspark/ml/classification.py | 6 +++--- python/pyspark/ml/tests.py | 12 2 files changed, 15 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/804949c6/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index a9756ea..dcc12d9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -349,13 +349,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def _checkThresholdConsistency(self): if self.isSet(self.threshold) and self.isSet(self.thresholds): -ts = self.getParam(self.thresholds) +ts = self.getOrDefault(self.thresholds) if len(ts) != 2: raise ValueError("Logistic Regression getThreshold only applies to" + " binary classification, but thresholds has length != 2." + - " thresholds: " + ",".join(ts)) + " thresholds: {0}".format(str(ts))) t = 1.0/(1.0 + ts[0]/ts[1]) -t2 = self.getParam(self.threshold) +t2 = self.getOrDefault(self.threshold) if abs(t2 - t) >= 1E-5: raise ValueError("Logistic Regression getThreshold found inconsistent values for" + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) http://git-wip-us.apache.org/repos/asf/spark/blob/804949c6/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 571ac4b..51a3e8e 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -807,6 +807,18 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def logistic_regression_check_thresholds(self): +self.assertIsInstance( +LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), +LogisticRegressionModel +) + +self.assertRaisesRegexp( +ValueError, +"Logistic Regression getThreshold found inconsistent.*$", +LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] +) + def _compare_params(self, m1, m2, param): """ Compare 2 ML Params instances for the given param, and assert both have the same param value - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params
Repository: spark Updated Branches: refs/heads/branch-2.1 8e097890a -> 69786ea3a [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params ## What changes were proposed in this pull request? - Replace `getParam` calls with `getOrDefault` calls. - Fix exception message to avoid unintended `TypeError`. - Add unit tests ## How was this patch tested? New unit tests. Author: zero323 Closes #17891 from zero323/SPARK-20631. (cherry picked from commit 804949c6bf00b8e26c39d48bbcc4d0470ee84e47) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69786ea3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69786ea3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69786ea3 Branch: refs/heads/branch-2.1 Commit: 69786ea3a972af1b29a332dc11ac507ed4368cc6 Parents: 8e09789 Author: zero323 Authored: Wed May 10 16:57:52 2017 +0800 Committer: Yanbo Liang Committed: Wed May 10 16:58:34 2017 +0800 -- python/pyspark/ml/classification.py | 6 +++--- python/pyspark/ml/tests.py | 12 2 files changed, 15 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69786ea3/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 570a414..2b47c40 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -238,13 +238,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def _checkThresholdConsistency(self): if self.isSet(self.threshold) and self.isSet(self.thresholds): -ts = self.getParam(self.thresholds) +ts = self.getOrDefault(self.thresholds) if len(ts) != 2: raise ValueError("Logistic Regression getThreshold only applies to" + " binary classification, but thresholds has length != 2." + - " thresholds: " + ",".join(ts)) + " thresholds: {0}".format(str(ts))) t = 1.0/(1.0 + ts[0]/ts[1]) -t2 = self.getParam(self.threshold) +t2 = self.getOrDefault(self.threshold) if abs(t2 - t) >= 1E-5: raise ValueError("Logistic Regression getThreshold found inconsistent values for" + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) http://git-wip-us.apache.org/repos/asf/spark/blob/69786ea3/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 70e0c6d..7152036 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -808,6 +808,18 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def logistic_regression_check_thresholds(self): +self.assertIsInstance( +LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), +LogisticRegressionModel +) + +self.assertRaisesRegexp( +ValueError, +"Logistic Regression getThreshold found inconsistent.*$", +LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] +) + def _compare_params(self, m1, m2, param): """ Compare 2 ML Params instances for the given param, and assert both have the same param value - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params
Repository: spark Updated Branches: refs/heads/branch-2.2 ef50a9548 -> 3ed2f4d51 [SPARK-20631][PYTHON][ML] LogisticRegression._checkThresholdConsistency should use values not Params ## What changes were proposed in this pull request? - Replace `getParam` calls with `getOrDefault` calls. - Fix exception message to avoid unintended `TypeError`. - Add unit tests ## How was this patch tested? New unit tests. Author: zero323 Closes #17891 from zero323/SPARK-20631. (cherry picked from commit 804949c6bf00b8e26c39d48bbcc4d0470ee84e47) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3ed2f4d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3ed2f4d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3ed2f4d5 Branch: refs/heads/branch-2.2 Commit: 3ed2f4d516ce02dfef929195778f8214703913d8 Parents: ef50a95 Author: zero323 Authored: Wed May 10 16:57:52 2017 +0800 Committer: Yanbo Liang Committed: Wed May 10 16:58:08 2017 +0800 -- python/pyspark/ml/classification.py | 6 +++--- python/pyspark/ml/tests.py | 12 2 files changed, 15 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3ed2f4d5/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index a9756ea..dcc12d9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -349,13 +349,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def _checkThresholdConsistency(self): if self.isSet(self.threshold) and self.isSet(self.thresholds): -ts = self.getParam(self.thresholds) +ts = self.getOrDefault(self.thresholds) if len(ts) != 2: raise ValueError("Logistic Regression getThreshold only applies to" + " binary classification, but thresholds has length != 2." + - " thresholds: " + ",".join(ts)) + " thresholds: {0}".format(str(ts))) t = 1.0/(1.0 + ts[0]/ts[1]) -t2 = self.getParam(self.threshold) +t2 = self.getOrDefault(self.threshold) if abs(t2 - t) >= 1E-5: raise ValueError("Logistic Regression getThreshold found inconsistent values for" + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) http://git-wip-us.apache.org/repos/asf/spark/blob/3ed2f4d5/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 571ac4b..51a3e8e 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -807,6 +807,18 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def logistic_regression_check_thresholds(self): +self.assertIsInstance( +LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), +LogisticRegressionModel +) + +self.assertRaisesRegexp( +ValueError, +"Logistic Regression getThreshold found inconsistent.*$", +LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] +) + def _compare_params(self, m1, m2, param): """ Compare 2 ML Params instances for the given param, and assert both have the same param value - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org