spark git commit: Revert "[SPARK-18646][REPL] Set parent classloader as null for ExecutorClassLoader"
Repository: spark Updated Branches: refs/heads/branch-2.2 39eba3053 -> cf0719b5e Revert "[SPARK-18646][REPL] Set parent classloader as null for ExecutorClassLoader" This reverts commit 39eba3053ac99f03d9df56471bae5fc5cc9f4462. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf0719b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf0719b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf0719b5 Branch: refs/heads/branch-2.2 Commit: cf0719b5e99333b28bb4066b304dbcf8400c80ea Parents: 39eba30 Author: Wenchen Fan Authored: Thu Jul 13 08:34:42 2017 +0800 Committer: Wenchen Fan Committed: Thu Jul 13 08:34:42 2017 +0800 -- .../apache/spark/repl/ExecutorClassLoader.scala | 17 +++- .../spark/repl/ExecutorClassLoaderSuite.scala | 46 2 files changed, 6 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf0719b5/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala -- diff --git a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala index 127f673..df13b32 100644 --- a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala +++ b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala @@ -33,23 +33,18 @@ import org.apache.spark.internal.Logging import org.apache.spark.util.{ParentClassLoader, Utils} /** - * A ClassLoader that reads classes from a Hadoop FileSystem or HTTP URI, used to load classes - * defined by the interpreter when the REPL is used. Allows the user to specify if user class path - * should be first. This class loader delegates getting/finding resources to parent loader, which - * makes sense until REPL never provide resource dynamically. - * - * Note: [[ClassLoader]] will preferentially load class from parent. Only when parent is null or - * the load failed, that it will call the overridden `findClass` function. To avoid the potential - * issue caused by loading class using inappropriate class loader, we should set the parent of - * ClassLoader to null, so that we can fully control which class loader is used. For detailed - * discussion, see SPARK-18646. + * A ClassLoader that reads classes from a Hadoop FileSystem or HTTP URI, + * used to load classes defined by the interpreter when the REPL is used. + * Allows the user to specify if user class path should be first. + * This class loader delegates getting/finding resources to parent loader, + * which makes sense until REPL never provide resource dynamically. 
*/ class ExecutorClassLoader( conf: SparkConf, env: SparkEnv, classUri: String, parent: ClassLoader, -userClassPathFirst: Boolean) extends ClassLoader(null) with Logging { +userClassPathFirst: Boolean) extends ClassLoader with Logging { val uri = new URI(classUri) val directory = uri.getPath http://git-wip-us.apache.org/repos/asf/spark/blob/cf0719b5/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala -- diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index 092d3c2..6d274bd 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -23,8 +23,6 @@ import java.nio.channels.{FileChannel, ReadableByteChannel} import java.nio.charset.StandardCharsets import java.nio.file.{Paths, StandardOpenOption} import java.util -import java.util.Collections -import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} import scala.io.Source import scala.language.implicitConversions @@ -79,50 +77,6 @@ class ExecutorClassLoaderSuite } } - test("child over system classloader") { -// JavaFileObject for scala.Option class -val scalaOptionFile = new SimpleJavaFileObject( - URI.create(s"string:///scala/Option.java"), - JavaFileObject.Kind.SOURCE) { - - override def getCharContent(ignoreEncodingErrors: Boolean): CharSequence = { -"package scala; class Option {}" - } -} -// compile fake scala.Option class -ToolProvider - .getSystemJavaCompiler - .getTask(null, null, null, null, null, Collections.singletonList(scalaOptionFile)).call() - -// create 'scala' dir in tempDir1 -val scalaDir = new File(tempDir1, "scala") -assert(scalaDir.mkdir(), s"Failed to create 'scala' directory in $tempDir1") - -// move the generated class into scala dir -val filename = "Option.class" -val res
spark git commit: [SPARK-18646][REPL] Set parent classloader as null for ExecutorClassLoader
Repository: spark Updated Branches: refs/heads/master 780586a9f -> e08d06b37 [SPARK-18646][REPL] Set parent classloader as null for ExecutorClassLoader ## What changes were proposed in this pull request? `ClassLoader` will preferentially load class from `parent`. Only when `parent` is null or the load failed, that it will call the overridden `findClass` function. To avoid the potential issue caused by loading class using inappropriate class loader, we should set the `parent` of `ClassLoader` to null, so that we can fully control which class loader is used. This is take over of #17074, the primary author of this PR is taroplus . Should close #17074 after this PR get merged. ## How was this patch tested? Add test case in `ExecutorClassLoaderSuite`. Author: Kohki Nishio Author: Xingbo Jiang Closes #18614 from jiangxb1987/executor_classloader. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e08d06b3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e08d06b3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e08d06b3 Branch: refs/heads/master Commit: e08d06b37bc96cc48fec1c5e40f73e0bca09c616 Parents: 780586a Author: Kohki Nishio Authored: Thu Jul 13 08:22:40 2017 +0800 Committer: Wenchen Fan Committed: Thu Jul 13 08:22:40 2017 +0800 -- .../apache/spark/repl/ExecutorClassLoader.scala | 17 +--- .../spark/repl/ExecutorClassLoaderSuite.scala | 46 2 files changed, 57 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e08d06b3/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala -- diff --git a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala index df13b32..127f673 100644 --- a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala +++ b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala @@ -33,18 +33,23 @@ import org.apache.spark.internal.Logging import org.apache.spark.util.{ParentClassLoader, Utils} /** - * A ClassLoader that reads classes from a Hadoop FileSystem or HTTP URI, - * used to load classes defined by the interpreter when the REPL is used. - * Allows the user to specify if user class path should be first. - * This class loader delegates getting/finding resources to parent loader, - * which makes sense until REPL never provide resource dynamically. + * A ClassLoader that reads classes from a Hadoop FileSystem or HTTP URI, used to load classes + * defined by the interpreter when the REPL is used. Allows the user to specify if user class path + * should be first. This class loader delegates getting/finding resources to parent loader, which + * makes sense until REPL never provide resource dynamically. + * + * Note: [[ClassLoader]] will preferentially load class from parent. Only when parent is null or + * the load failed, that it will call the overridden `findClass` function. To avoid the potential + * issue caused by loading class using inappropriate class loader, we should set the parent of + * ClassLoader to null, so that we can fully control which class loader is used. For detailed + * discussion, see SPARK-18646. 
*/ class ExecutorClassLoader( conf: SparkConf, env: SparkEnv, classUri: String, parent: ClassLoader, -userClassPathFirst: Boolean) extends ClassLoader with Logging { +userClassPathFirst: Boolean) extends ClassLoader(null) with Logging { val uri = new URI(classUri) val directory = uri.getPath http://git-wip-us.apache.org/repos/asf/spark/blob/e08d06b3/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala -- diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index 6d274bd..092d3c2 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -23,6 +23,8 @@ import java.nio.channels.{FileChannel, ReadableByteChannel} import java.nio.charset.StandardCharsets import java.nio.file.{Paths, StandardOpenOption} import java.util +import java.util.Collections +import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} import scala.io.Source import scala.language.implicitConversions @@ -77,6 +79,50 @@ class ExecutorClassLoaderSuite } } + test("child over system classloader") { +// JavaFileObject for scala.Option class +val scalaOptionFile = new SimpleJavaFileObject( + URI.create(s"string:///scala/Option.java"), +
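The delegation rule quoted in the commit message above is standard java.lang.ClassLoader behavior: loadClass asks the parent first and falls back to the overridden findClass only when the parent is null or the parent lookup fails. A minimal standalone sketch of that behavior follows; ToyClassLoader and bytesFor are invented names for illustration, not Spark's ExecutorClassLoader, and with a null parent only the bootstrap loader (which knows nothing of scala.* or user classes) is consulted before findClass.

```
// Sketch under the assumptions above; not Spark's actual code.
class ToyClassLoader(bytesFor: String => Option[Array[Byte]])
  extends ClassLoader(null) { // null parent: only the bootstrap loader runs first

  // loadClass reaches this only after the (null) parent misses, so the
  // subclass fully controls resolution of every non-JDK class.
  override def findClass(name: String): Class[_] = bytesFor(name) match {
    case Some(b) => defineClass(name, b, 0, b.length)
    case None    => throw new ClassNotFoundException(name)
  }
}
```

This is why the accompanying test compiles a fake scala.Option: with a null parent the REPL-served bytes win, whereas a system-classloader parent would shadow them.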
spark git commit: [SPARK-18646][REPL] Set parent classloader as null for ExecutorClassLoader
Repository: spark Updated Branches: refs/heads/branch-2.2 cb6fc89ba -> 39eba3053 [SPARK-18646][REPL] Set parent classloader as null for ExecutorClassLoader ## What changes were proposed in this pull request? `ClassLoader` will preferentially load class from `parent`. Only when `parent` is null or the load failed, that it will call the overridden `findClass` function. To avoid the potential issue caused by loading class using inappropriate class loader, we should set the `parent` of `ClassLoader` to null, so that we can fully control which class loader is used. This is take over of #17074, the primary author of this PR is taroplus . Should close #17074 after this PR get merged. ## How was this patch tested? Add test case in `ExecutorClassLoaderSuite`. Author: Kohki Nishio Author: Xingbo Jiang Closes #18614 from jiangxb1987/executor_classloader. (cherry picked from commit e08d06b37bc96cc48fec1c5e40f73e0bca09c616) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39eba305 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39eba305 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39eba305 Branch: refs/heads/branch-2.2 Commit: 39eba3053ac99f03d9df56471bae5fc5cc9f4462 Parents: cb6fc89 Author: Kohki Nishio Authored: Thu Jul 13 08:22:40 2017 +0800 Committer: Wenchen Fan Committed: Thu Jul 13 08:22:53 2017 +0800 -- .../apache/spark/repl/ExecutorClassLoader.scala | 17 +--- .../spark/repl/ExecutorClassLoaderSuite.scala | 46 2 files changed, 57 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39eba305/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala -- diff --git a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala index df13b32..127f673 100644 --- a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala +++ b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala @@ -33,18 +33,23 @@ import org.apache.spark.internal.Logging import org.apache.spark.util.{ParentClassLoader, Utils} /** - * A ClassLoader that reads classes from a Hadoop FileSystem or HTTP URI, - * used to load classes defined by the interpreter when the REPL is used. - * Allows the user to specify if user class path should be first. - * This class loader delegates getting/finding resources to parent loader, - * which makes sense until REPL never provide resource dynamically. + * A ClassLoader that reads classes from a Hadoop FileSystem or HTTP URI, used to load classes + * defined by the interpreter when the REPL is used. Allows the user to specify if user class path + * should be first. This class loader delegates getting/finding resources to parent loader, which + * makes sense until REPL never provide resource dynamically. + * + * Note: [[ClassLoader]] will preferentially load class from parent. Only when parent is null or + * the load failed, that it will call the overridden `findClass` function. To avoid the potential + * issue caused by loading class using inappropriate class loader, we should set the parent of + * ClassLoader to null, so that we can fully control which class loader is used. For detailed + * discussion, see SPARK-18646. 
*/ class ExecutorClassLoader( conf: SparkConf, env: SparkEnv, classUri: String, parent: ClassLoader, -userClassPathFirst: Boolean) extends ClassLoader with Logging { +userClassPathFirst: Boolean) extends ClassLoader(null) with Logging { val uri = new URI(classUri) val directory = uri.getPath http://git-wip-us.apache.org/repos/asf/spark/blob/39eba305/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala -- diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index 6d274bd..092d3c2 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -23,6 +23,8 @@ import java.nio.channels.{FileChannel, ReadableByteChannel} import java.nio.charset.StandardCharsets import java.nio.file.{Paths, StandardOpenOption} import java.util +import java.util.Collections +import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} import scala.io.Source import scala.language.implicitConversions @@ -77,6 +79,50 @@ class ExecutorClassLoaderSuite } } + test("child over system classloader") { +// JavaFileObject for scala.Option class
spark-website git commit: More 2.2.0 Release Notes
Repository: spark-website Updated Branches: refs/heads/asf-site 40f588bb5 -> 869f8a6fb More 2.2.0 Release Notes Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/869f8a6f Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/869f8a6f Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/869f8a6f Branch: refs/heads/asf-site Commit: 869f8a6fb1548c773d2d75f63c55005417aeac35 Parents: 40f588b Author: Michael Armbrust Authored: Wed Jul 12 22:54:55 2017 +0000 Committer: Michael Armbrust Committed: Wed Jul 12 22:54:55 2017 +0000 -- releases/_posts/2017-07-11-spark-release-2-2-0.md | 4 +++- site/releases/spark-release-2-2-0.html| 7 ++- 2 files changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/869f8a6f/releases/_posts/2017-07-11-spark-release-2-2-0.md -- diff --git a/releases/_posts/2017-07-11-spark-release-2-2-0.md b/releases/_posts/2017-07-11-spark-release-2-2-0.md index 52ae28f..b630c75 100644 --- a/releases/_posts/2017-07-11-spark-release-2-2-0.md +++ b/releases/_posts/2017-07-11-spark-release-2-2-0.md @@ -128,6 +128,8 @@ The main focus of SparkR in the 2.2.0 release was adding extensive support for e ### Deprecations + - **Python** + - SPARK-12661: Drop support for Python 2.6 - **MLlib** - SPARK-18613: spark.ml LDA classes should not expose spark.mllib in APIs. In spark.ml.LDAModel, deprecated `oldLocalModel` and `getModel`. - **SparkR** @@ -143,7 +145,7 @@ The main focus of SparkR in the 2.2.0 release was adding extensive support for e ### Known Issues -- None +- SPARK-21093: Multiple gapply execution occasionally failed in SparkR ### Credits http://git-wip-us.apache.org/repos/asf/spark-website/blob/869f8a6f/site/releases/spark-release-2-2-0.html -- diff --git a/site/releases/spark-release-2-2-0.html b/site/releases/spark-release-2-2-0.html index 61504df..fc43088 100644 --- a/site/releases/spark-release-2-2-0.html +++ b/site/releases/spark-release-2-2-0.html @@ -371,6 +371,11 @@ Deprecations + Python + + SPARK-12661: Drop support for Python 2.6 + + MLlib SPARK-18613: spark.ml LDA classes should not expose spark.mllib in APIs. In spark.ml.LDAModel, deprecated oldLocalModel and getModel. @@ -401,7 +406,7 @@ Known Issues - None + SPARK-21093: Multiple gapply execution occasionally failed in SparkR Credits
spark-website git commit: Fix 2.2.0 contributor list
Repository: spark-website Updated Branches: refs/heads/asf-site 2fac17731 -> 40f588bb5 Fix 2.2.0 contributor list Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/40f588bb Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/40f588bb Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/40f588bb Branch: refs/heads/asf-site Commit: 40f588bb525e21e457c5d839937350a5c18172c4 Parents: 2fac177 Author: Michael Armbrust Authored: Wed Jul 12 22:46:30 2017 +0000 Committer: Michael Armbrust Committed: Wed Jul 12 15:48:01 2017 -0700 -- releases/_posts/2017-07-11-spark-release-2-2-0.md | 2 +- site/releases/spark-release-2-2-0.html| 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/40f588bb/releases/_posts/2017-07-11-spark-release-2-2-0.md -- diff --git a/releases/_posts/2017-07-11-spark-release-2-2-0.md b/releases/_posts/2017-07-11-spark-release-2-2-0.md index 37d3638..52ae28f 100644 --- a/releases/_posts/2017-07-11-spark-release-2-2-0.md +++ b/releases/_posts/2017-07-11-spark-release-2-2-0.md @@ -148,4 +148,4 @@ The main focus of SparkR in the 2.2.0 release was adding extensive support for e ### Credits Last but not least, this release would not have been possible without the following contributors: -ALeksander Eskilson, Aaditya Ramesh, Adam Roberts, Adrian Petrescu, Ahmed Mahran, Alex Bozarth, Alexander Shorin, Alexander Ulanov, Andrew Duffy, Andrew Mills, Andrew Ray, Angus Gerry, Anthony Truchet, Anton Okolnychyi, Artur Sukhenko, Bartek Wisniewski, Bijay Pathak, Bill Chambers, Bjarne Fruergaard, Brian Cho, Bryan Cutler, Burak Yavuz, Cen Yu Hai, Charles Allen, Cheng Lian, Chie Hayashida, Christian Kadner, Clark Fitzgerald, Cody Koeninger, Daniel Darabos, Daoyuan Wang, David Navas, Davies Liu, Denny Lee, Devaraj K, Dhruve Ashar, Dilip Biswal, Ding Ding, Dmitriy Sokolov, Dongjoon Hyun, Drew Robb, Ekasit Kijsipongse, Eren Avsarogullari, Ergin Seyfe, Eric Liang, Erik O'Shaughnessy, Eyal Farago, Felix Cheung, Ferdinand Xu, Fred Reiss, Fu Xing, Gabriel Huang, Gaetan Semet, Gang Wu, Gayathri Murali, Gu Huiqin Alice, Guoqiang Li, Gurvinder Singh, Hao Ren, Herman Van Hovell, Hiroshi Inoue, Holden Karau, Hossein Falaki, Huang Zhaowei, Huaxin Gao, Hyukjin Kwon, Imran Rashid, Jacek Laskowski, Jagadeesan A S, Jakob Odersky, Jason White, Jeff Zhang, Jianfei Wang, Jiang Xingbo, Jie Huang, Jie Xiong, Jisoo Kim, John Muller, Jose Hiram Soltren, Joseph K.
Bradley, Josh Rosen, Jun Kim, Junyang Qian, Justin Pihony, Kapil Singh, Kay Ousterhout, Kazuaki Ishizaki, Kevin Grealish, Kevin McHale, Kishor Patil, Koert Kuipers, Kousuke Saruta, Krishna Kalyan, Liang Ke, Liang-Chi Hsieh, Lianhui Wang, Linbo Jin, Liwei Lin, Luciano Resende, Maciej Brynski, Maciej Szymkiewicz, Mahmoud Rawas, Manoj Kumar, Marcelo Vanzin, Mariusz Strzelecki, Mark Grover, Maxime Rihouey, Miao Wang, Michael Allman, Michael Armbrust, Michael Gummelt, Michal Senkyr, Michal Wesolowski, Mikael Staldal, Mike Ihbe, Mitesh Patel, Nan Zhu, Nattavut Sutyanyong, Nic Eggert, Nicholas Chammas, Nick Lavers, Nick Pentreath, Nicolas Fraison, Noritaka Sekiyama, Peng Meng, Peng, Meng, Pete Robbins, Peter Ableda, Peter Lee, Philipp Hoffmann, Prashant Sharma, Prince J Wesley, Priyanka Garg, Qian Huang, Qifan Pu, Rajesh Balamohan, Reynold Xin, Robert Kruszewski, Russell Spitzer, Ryan Blue, Saisai Shao, Sameer Agarwal, Sami Jaktholm, Sandeep Purohit, Sandeep Singh, Satendra Kumar, Sean Owen, Sean Zhong, Seth Hendrickson, Sharkd Tu, Shen Hong, Shivansh Srivastava, Shivaram Venkataraman, Shixiong Zhu, Shuai Lin, Shubham Chopra, Sital Kedia, Song Jun, Srinath Shankar, Stavros Kontopoulos, Stefan Schulze, Steve Loughran, Suman Somasundar, Sun Dapeng, Sun Rui, Sunitha Kambhampati, Suresh Thalamati, Susan X. Huynh, Sylvain Zimmer, Takeshi YAMAMURO, Takuya UESHIN, Tao LI, Tao Lin, Tao Wang, Tarun Kumar, Tathagata Das, Tejas Patil, Thomas Graves, Timothy Chen, Timothy Hunter, Tom Graves, Tom Magrino, Tommy YU, Tyson Condie, Uncle Gen, Vinayak Joshi, Vincent Xie, Wang Fei, Wang Lei, Wang Tao, Wayne Zhang, Weichen Xu, Weiluo (David) Ren, Weiqing Yang, Wenchen Fan, Wesley Tang, William Benton, Wojciech Szymanski, Xiangrui Meng, Xianyang Liu, Xiao Li, Xin Ren, Xin Wu, Xing SHI, Xusen Yin, Yadong Qi, Yanbo Liang, Yang Wang, Yangyang Liu, Yin Huai, Yu Peng, Yucai Yu, Yuhao Yang, Yuming Wang, Yun Ni, Yves Raimond, Zhan Zhang, Zheng RuiFeng, Zhenhua Wang, pkch, tone-zhang, yimuxi \ No newline at end of file +ALeksander Eskilson, Aaditya Ramesh, Adam Budde, Adam Roberts, Adrian Ionescu, Ala Luszczak, Alex Bozarth, Andrew Ray, Anirudh Ramanathan, Anthony Truch
spark git commit: [SPARK-17701][SQL] Refactor RowDataSourceScanExec so its sameResult call does not compare strings
Repository: spark Updated Branches: refs/heads/master d2d2a5de1 -> 780586a9f [SPARK-17701][SQL] Refactor RowDataSourceScanExec so its sameResult call does not compare strings ## What changes were proposed in this pull request? Currently, `RowDataSourceScanExec` and `FileSourceScanExec` rely on a "metadata" string map to implement equality comparison, since the RDDs they depend on cannot be directly compared. This has resulted in a number of correctness bugs around exchange reuse, e.g. SPARK-17673 and SPARK-16818. To make these comparisons less brittle, we should refactor these classes to compare constructor parameters directly instead of relying on the metadata map. This PR refactors `RowDataSourceScanExec`, `FileSourceScanExec` will be fixed in the follow-up PR. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #18600 from cloud-fan/minor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/780586a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/780586a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/780586a9 Branch: refs/heads/master Commit: 780586a9f2400c3fdfdb9a6b954001a3c9663941 Parents: d2d2a5d Author: Wenchen Fan Authored: Wed Jul 12 09:23:54 2017 -0700 Committer: gatorsmile Committed: Wed Jul 12 09:23:54 2017 -0700 -- .../sql/execution/DataSourceScanExec.scala | 65 ++-- .../apache/spark/sql/execution/SparkPlan.scala | 5 -- .../spark/sql/execution/SparkPlanInfo.scala | 4 +- .../datasources/DataSourceStrategy.scala| 57 +++-- .../spark/sql/execution/ui/SparkPlanGraph.scala | 5 +- 5 files changed, 56 insertions(+), 80 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/780586a9/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index a0def68..588c937 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -33,21 +33,23 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partition import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.sources.{BaseRelation, Filter} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils trait DataSourceScanExec extends LeafExecNode with CodegenSupport { val relation: BaseRelation - val metastoreTableIdentifier: Option[TableIdentifier] + val tableIdentifier: Option[TableIdentifier] protected val nodeNamePrefix: String = "" override val nodeName: String = { -s"Scan $relation ${metastoreTableIdentifier.map(_.unquotedString).getOrElse("")}" +s"Scan $relation ${tableIdentifier.map(_.unquotedString).getOrElse("")}" } + // Metadata that describes more details of this scan. + protected def metadata: Map[String, String] + override def simpleString: String = { val metadataEntries = metadata.toSeq.sorted.map { case (key, value) => @@ -73,34 +75,25 @@ trait DataSourceScanExec extends LeafExecNode with CodegenSupport { /** Physical plan node for scanning data from a relation. 
*/ case class RowDataSourceScanExec( -output: Seq[Attribute], +fullOutput: Seq[Attribute], +requiredColumnsIndex: Seq[Int], +filters: Set[Filter], +handledFilters: Set[Filter], rdd: RDD[InternalRow], @transient relation: BaseRelation, -override val outputPartitioning: Partitioning, -override val metadata: Map[String, String], -override val metastoreTableIdentifier: Option[TableIdentifier]) +override val tableIdentifier: Option[TableIdentifier]) extends DataSourceScanExec { + def output: Seq[Attribute] = requiredColumnsIndex.map(fullOutput) + override lazy val metrics = Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) - val outputUnsafeRows = relation match { -case r: HadoopFsRelation if r.fileFormat.isInstanceOf[ParquetSource] => - !SparkSession.getActiveSession.get.sessionState.conf.getConf( -SQLConf.PARQUET_VECTORIZED_READER_ENABLED) -case _: HadoopFsRelation => true -case _ => false - } - protected override def doExecute(): RDD[InternalRow] = { -val unsafeRow = if (outputU
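The brittleness motivating this refactoring is that equality derived from a rendered metadata string breaks whenever the formatting or ordering of that string changes, which is exactly how the exchange-reuse bugs cited above arose. A toy illustration with invented names (not Spark's classes) of why structured constructor parameters compare more robustly:

```
// Toy illustration only; names are invented, not Spark's API.
case class StringKeyedScan(metadata: Map[String, String])          // before: compares rendered text
case class ParamKeyedScan(columns: Seq[Int], filters: Set[String]) // after: compares the real inputs

// The same logical scan rendered in two orders fails the string comparison:
val x = StringKeyedScan(Map("PushedFilters" -> "[a > 1, b < 2]"))
val y = StringKeyedScan(Map("PushedFilters" -> "[b < 2, a > 1]"))
assert(x != y) // a sameResult built on this misses the reuse opportunity

// Structured parameters are order-insensitive where the semantics are:
assert(ParamKeyedScan(Seq(0, 1), Set("a > 1", "b < 2")) ==
       ParamKeyedScan(Seq(0, 1), Set("b < 2", "a > 1")))
```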
spark git commit: [SPARK-18619][ML] Make QuantileDiscretizer/Bucketizer/StringIndexer/RFormula inherit from HasHandleInvalid
Repository: spark Updated Branches: refs/heads/master aaad34dc2 -> d2d2a5de1 [SPARK-18619][ML] Make QuantileDiscretizer/Bucketizer/StringIndexer/RFormula inherit from HasHandleInvalid ## What changes were proposed in this pull request? 1, HasHandleInvaild support override 2, Make QuantileDiscretizer/Bucketizer/StringIndexer/RFormula inherit from HasHandleInvalid ## How was this patch tested? existing tests [JIRA](https://issues.apache.org/jira/browse/SPARK-18619) Author: Zheng RuiFeng Closes #18582 from zhengruifeng/heritate_HasHandleInvalid. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2d2a5de Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2d2a5de Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2d2a5de Branch: refs/heads/master Commit: d2d2a5de186ddf381d0bdb353b23d64ff0224e7f Parents: aaad34d Author: Zheng RuiFeng Authored: Wed Jul 12 22:09:03 2017 +0800 Committer: Yanbo Liang Committed: Wed Jul 12 22:09:03 2017 +0800 -- .../apache/spark/ml/feature/Bucketizer.scala| 14 ++--- .../spark/ml/feature/QuantileDiscretizer.scala | 13 ++--- .../org/apache/spark/ml/feature/RFormula.scala | 13 ++--- .../apache/spark/ml/feature/StringIndexer.scala | 13 ++--- .../ml/param/shared/SharedParamsCodeGen.scala | 2 +- .../spark/ml/param/shared/sharedParams.scala| 2 +- .../GeneralizedLinearRegression.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 14 ++--- python/pyspark/ml/feature.py| 60 9 files changed, 53 insertions(+), 80 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d2d2a5de/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 46b512f..6a11a75 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.Since import org.apache.spark.ml.Model import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.expressions.UserDefinedFunction @@ -36,7 +36,8 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} */ @Since("1.4.0") final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) - extends Model[Bucketizer] with HasInputCol with HasOutputCol with DefaultParamsWritable { + extends Model[Bucketizer] with HasHandleInvalid with HasInputCol with HasOutputCol +with DefaultParamsWritable { @Since("1.4.0") def this() = this(Identifiable.randomUID("bucketizer")) @@ -84,17 +85,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String * Default: "error" * @group param */ - // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid. @Since("2.1.0") - val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " + -"invalid entries. Options are skip (filter out rows with invalid values), " + + override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", +"how to handle invalid entries. 
Options are skip (filter out rows with invalid values), " + "error (throw an error), or keep (keep invalid values in a special additional bucket).", ParamValidators.inArray(Bucketizer.supportedHandleInvalids)) - /** @group getParam */ - @Since("2.1.0") - def getHandleInvalid: String = $(handleInvalid) - /** @group setParam */ @Since("2.1.0") def setHandleInvalid(value: String): this.type = set(handleInvalid, value) http://git-wip-us.apache.org/repos/asf/spark/blob/d2d2a5de/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index feceeba..95e8830 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.pa
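The mechanical pattern in this diff (declare handleInvalid once in a shared trait, then let each algorithm override the val to refine its documentation and validation) can be reduced to a few lines. The following is a simplified stand-in, not Spark's real Param machinery:

```
// Simplified stand-in for the shared-param pattern; not Spark's classes.
case class Param[T](name: String, doc: String)

trait HasHandleInvalid {
  val handleInvalid: Param[String] =
    Param("handleInvalid", "how to handle invalid entries")
  def getHandleInvalid: String = "error" // one shared getter instead of one per class
}

class ToyBucketizer extends HasHandleInvalid {
  // Override the shared val to refine the documentation (and, in Spark, the
  // validator) instead of redeclaring both the param and its getter locally.
  override val handleInvalid: Param[String] =
    Param("handleInvalid",
      "skip (filter out rows), error (throw), or keep (special bucket)")
}
```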
spark-website git commit: Patch references to docs/programming-guide.html to docs/rdd-programming-guide.html
Repository: spark-website Updated Branches: refs/heads/asf-site 1c7fd01e9 -> 2fac17731 Patch references to docs/programming-guide.html to docs/rdd-programming-guide.html Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/2fac1773 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/2fac1773 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/2fac1773 Branch: refs/heads/asf-site Commit: 2fac17731bdaafc3ce47be5d0adad682487f983c Parents: 1c7fd01 Author: Sean Owen Authored: Wed Jul 12 12:20:26 2017 +0100 Committer: Sean Owen Committed: Wed Jul 12 12:20:26 2017 +0100 -- examples.md | 2 +- releases/_posts/2017-07-11-spark-release-2-2-0.md | 2 +- site/examples.html| 2 +- site/releases/spark-release-2-2-0.html| 2 +- site/sitemap.xml | 14 +++--- sitemap.xml | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/2fac1773/examples.md -- diff --git a/examples.md b/examples.md index fe9cc79..1bc45d0 100644 --- a/examples.md +++ b/examples.md @@ -11,7 +11,7 @@ navigation: These examples give a quick overview of the Spark API. Spark is built on the concept of distributed datasets, which contain arbitrary Java or Python objects. You create a dataset from external data, then apply parallel operations -to it. The building block of the Spark API is its [RDD API](https://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds). +to it. The building block of the Spark API is its [RDD API](https://spark.apache.org/docs/latest/rdd-programming-guide.html#resilient-distributed-datasets-rdds). In the RDD API, there are two types of operations: transformations, which define a new dataset based on previous ones, and actions, which kick off a job to execute on a cluster. http://git-wip-us.apache.org/repos/asf/spark-website/blob/2fac1773/releases/_posts/2017-07-11-spark-release-2-2-0.md -- diff --git a/releases/_posts/2017-07-11-spark-release-2-2-0.md b/releases/_posts/2017-07-11-spark-release-2-2-0.md index 8027d8a..37d3638 100644 --- a/releases/_posts/2017-07-11-spark-release-2-2-0.md +++ b/releases/_posts/2017-07-11-spark-release-2-2-0.md @@ -59,7 +59,7 @@ To download Apache Spark 2.2.0, visit the Spark Programming Guide and Spark SQL, DataFrames and Datasets Guide.* +*Programming guides: Spark RDD Programming Guide and Spark SQL, DataFrames and Datasets Guide.* ### Structured Streaming http://git-wip-us.apache.org/repos/asf/spark-website/blob/2fac1773/site/examples.html -- diff --git a/site/examples.html b/site/examples.html index 439a62b..a4cfeda 100644 --- a/site/examples.html +++ b/site/examples.html @@ -199,7 +199,7 @@ These examples give a quick overview of the Spark API. Spark is built on the concept of distributed datasets, which contain arbitrary Java or Python objects. You create a dataset from external data, then apply parallel operations -to it. The building block of the Spark API is its https://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds";>RDD API. +to it. The building block of the Spark API is its https://spark.apache.org/docs/latest/rdd-programming-guide.html#resilient-distributed-datasets-rdds";>RDD API. In the RDD API, there are two types of operations: transformations, which define a new dataset based on previous ones, and actions, which kick off a job to execute on a cluster. 
http://git-wip-us.apache.org/repos/asf/spark-website/blob/2fac1773/site/releases/spark-release-2-2-0.html -- diff --git a/site/releases/spark-release-2-2-0.html b/site/releases/spark-release-2-2-0.html index badc714..0460c7d 100644 --- a/site/releases/spark-release-2-2-0.html +++ b/site/releases/spark-release-2-2-0.html @@ -264,7 +264,7 @@ -Programming guides: Spark Programming Guide and Spark SQL, DataFrames and Datasets Guide. +Programming guides: Spark RDD Programming Guide and Spark SQL, DataFrames and Datasets Guide. Structured Streaming http://git-wip-us.apache.org/repos/asf/spark-website/blob/2fac1773/site/sitemap.xml -- diff --git a/site/sitemap.xml b/site/sitemap.xml index 591e871..0ce546f 100644 --- a/site/sitemap.xml +++ b/site/sitemap.xml @@ -22,7 +22,7 @@ 1.0 - https://spark.ap
spark git commit: [SPARK-21007][SQL] Add SQL function - RIGHT && LEFT
Repository: spark Updated Branches: refs/heads/master 5ed134ee2 -> aaad34dc2 [SPARK-21007][SQL] Add SQL function - RIGHT && LEFT ## What changes were proposed in this pull request? Add SQL function - RIGHT && LEFT, same as MySQL: https://dev.mysql.com/doc/refman/5.7/en/string-functions.html#function_left https://dev.mysql.com/doc/refman/5.7/en/string-functions.html#function_right ## How was this patch tested? unit test Author: liuxian Closes #18228 from 10110346/lx-wip-0607. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aaad34dc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aaad34dc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aaad34dc Branch: refs/heads/master Commit: aaad34dc2f537f7eef50fc5f72a7f178800e8d38 Parents: 5ed134e Author: liuxian Authored: Wed Jul 12 18:51:19 2017 +0800 Committer: Wenchen Fan Committed: Wed Jul 12 18:51:19 2017 +0800 -- .../catalyst/analysis/FunctionRegistry.scala| 2 + .../expressions/stringExpressions.scala | 43 .../sql-tests/inputs/string-functions.sql | 6 +++ .../sql-tests/results/string-functions.sql.out | 34 +++- 4 files changed, 84 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aaad34dc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index f4b3e86..10b22ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -346,6 +346,8 @@ object FunctionRegistry { expression[StringSplit]("split"), expression[Substring]("substr"), expression[Substring]("substring"), +expression[Left]("left"), +expression[Right]("right"), expression[SubstringIndex]("substring_index"), expression[StringTranslate]("translate"), expression[StringTrim]("trim"), http://git-wip-us.apache.org/repos/asf/spark/blob/aaad34dc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 83fdcfc..d75b9d6 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1199,6 +1199,49 @@ case class Substring(str: Expression, pos: Expression, len: Expression) } /** + * Returns the rightmost n characters from the string. 
+ */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(str, len) - Returns the rightmost `len`(`len` can be string type) characters from the string `str`,if `len` is less or equal than 0 the result is an empty string.", + extended = """ +Examples: + > SELECT _FUNC_('Spark SQL', 3); + SQL + """) +// scalastyle:on line.size.limit +case class Right(str: Expression, len: Expression, child: Expression) extends RuntimeReplaceable { + def this(str: Expression, len: Expression) = { +this(str, len, If(IsNull(str), Literal(null, StringType), If(LessThanOrEqual(len, Literal(0)), + Literal(UTF8String.EMPTY_UTF8, StringType), new Substring(str, UnaryMinus(len) + } + + override def flatArguments: Iterator[Any] = Iterator(str, len) + override def sql: String = s"$prettyName(${str.sql}, ${len.sql})" +} + +/** + * Returns the leftmost n characters from the string. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(str, len) - Returns the leftmost `len`(`len` can be string type) characters from the string `str`,if `len` is less or equal than 0 the result is an empty string.", + extended = """ +Examples: + > SELECT _FUNC_('Spark SQL', 3); + Spa + """) +// scalastyle:on line.size.limit +case class Left(str: Expression, len: Expression, child: Expression) extends RuntimeReplaceable { + def this(str: Expression, len: Expression) = { +this(str, len, Substring(str, Literal(1), len)) + } + + override def flatArguments: Iterator[Any] = Iterator(str, len) + override def sql: String = s"$prettyName(${str.sql}, ${len.sql})" +} + +/** * A function that returns the char length of the given string expression or * num
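Both expressions are RuntimeReplaceable, so at analysis time left(str, len) rewrites to substring(str, 1, len) and right(str, len) guards null and non-positive lengths before rewriting to substring(str, -len); no new physical operator is introduced. A usage sketch, assuming a SparkSession named spark on a build containing this patch:

```
// Assumes a SparkSession named `spark` with this patch applied.
spark.sql("SELECT left('Spark SQL', 3) AS l, right('Spark SQL', 3) AS r").show()
// +---+---+
// |  l|  r|
// +---+---+
// |Spa|SQL|
// +---+---+
```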
spark-website git commit: Add note about CVE-2017-7678
Repository: spark-website Updated Branches: refs/heads/asf-site f2d5d2a68 -> 1c7fd01e9 Add note about CVE-2017-7678 Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/1c7fd01e Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/1c7fd01e Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/1c7fd01e Branch: refs/heads/asf-site Commit: 1c7fd01e9eb258407da07dc765444e4cf8c28f7c Parents: f2d5d2a Author: Sean Owen Authored: Wed Jul 12 11:24:41 2017 +0100 Committer: Sean Owen Committed: Wed Jul 12 11:24:41 2017 +0100 -- security.md| 43 ++- site/security.html | 42 +- 2 files changed, 83 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/1c7fd01e/security.md -- diff --git a/security.md b/security.md index 505c225..a26f1d1 100644 --- a/security.md +++ b/security.md @@ -17,4 +17,45 @@ non-public list that will reach the Spark PMC. Messages to `secur...@apache.org` Known Security Issues -None yet. \ No newline at end of file +CVE-2017-7678 Apache Spark XSS web UI MHTML vulnerability + +Severity: Low + +Vendor: The Apache Software Foundation + +Versions Affected: +Versions of Apache Spark before 2.2.0 + +Description: +It is possible for an attacker to take advantage of a user's trust in the server to trick them into visiting a link that points to a shared Spark cluster and submits data including MHTML to the Spark master, or history server. This data, which could contain a script, would then be reflected back to the user and could be evaluated and executed by MS Windows-based clients. It is not an attack on Spark itself, but on the user, who may then execute the script inadvertently when viewing elements of the Spark web UIs. + +Mitigation: +Update to Apache Spark 2.2.0 or later. + +Example: +Request: +``` +GET /app/?appId=Content-Type:%20multipart/related;%20boundary=_AppScan%0d%0a-- +_AppScan%0d%0aContent-Location:foo%0d%0aContent-Transfer- +Encoding:base64%0d%0a%0d%0aPGh0bWw%2bPHNjcmlwdD5hbGVydCgiWFNTIik8L3NjcmlwdD48L2h0bWw%2b%0d%0a +HTTP/1.1 +``` + +Excerpt from response: +``` +No running application with ID Content-Type: multipart/related; +boundary=_AppScan +--_AppScan +Content-Location:foo +Content-Transfer-Encoding:base64 +PGh0bWw+PHNjcmlwdD5hbGVydCgiWFNTIik8L3NjcmlwdD48L2h0bWw+ + +``` +Result: In the above payload the BASE64 data decodes as: +``` +<html><script>alert("XSS")</script></html> +``` + +Credit: +- Mike Kasper, Nicholas Marion +- IBM z Systems Center for Secure Engineering \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark-website/blob/1c7fd01e/site/security.html -- diff --git a/site/security.html b/site/security.html index 83345d3..3ef0942 100644 --- a/site/security.html +++ b/site/security.html @@ -204,7 +204,47 @@ non-public list that will reach the Spark PMC. Messages to security@apache Known Security Issues -None yet. +CVE-2017-7678 Apache Spark XSS web UI MHTML vulnerability + +Severity: Low + +Vendor: The Apache Software Foundation + +Versions Affected: +Versions of Apache Spark before 2.2.0 + +Description: +It is possible for an attacker to take advantage of a user’s trust in the server to trick them into visiting a link that points to a shared Spark cluster and submits data including MHTML to the Spark master, or history server. This data, which could contain a script, would then be reflected back to the user and could be evaluated and executed by MS Windows-based clients.
It is not an attack on Spark itself, but on the user, who may then execute the script inadvertently when viewing elements of the Spark web UIs. + +Mitigation: +Update to Apache Spark 2.2.0 or later. + +Example: +Request: +GET /app/?appId=Content-Type:%20multipart/related;%20boundary=_AppScan%0d%0a-- +_AppScan%0d%0aContent-Location:foo%0d%0aContent-Transfer- +Encoding:base64%0d%0a%0d%0aPGh0bWw%2bPHNjcmlwdD5hbGVydCgiWFNTIik8L3NjcmlwdD48L2h0bWw%2b%0d%0a +HTTP/1.1 + + +Excerpt from response: +No running application with ID Content-Type: multipart/related; +boundary=_AppScan +--_AppScan +Content-Location:foo +Content-Transfer-Encoding:base64 +PGh0bWw+PHNjcmlwdD5hbGVydCgiWFNTIik8L3NjcmlwdD48L2h0bWw+ ++ +Result: In the above payload the BASE64 data decodes as: +<html><script>alert("XSS")</script></html> + +Credit: + + Mike Kasper, Nicholas Marion + IBM z Systems Center for Secure Engineering +
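The defect class behind this CVE is reflected XSS: a request parameter (here appId) is echoed into the page unescaped. As a generic illustration of the remedy (this is not Spark's actual patch, and escapeHtml is an invented helper), the reflected value must be HTML-escaped before rendering:

```
// Generic illustration of the fix class; not Spark's actual patch.
def escapeHtml(s: String): String = s.flatMap {
  case '<' => "&lt;"
  case '>' => "&gt;"
  case '&' => "&amp;"
  case '"' => "&quot;"
  case c   => c.toString
}

// A payload like the one above is rendered inert once escaped:
val page = s"No running application with ID ${escapeHtml("<script>alert(1)</script>")}"
// => No running application with ID &lt;script&gt;alert(1)&lt;/script&gt;
```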
spark git commit: [SPARK-21305][ML][MLLIB] Add options to disable multi-threading of native BLAS
Repository: spark Updated Branches: refs/heads/master f587d2e3f -> 5ed134ee2 [SPARK-21305][ML][MLLIB] Add options to disable multi-threading of native BLAS ## What changes were proposed in this pull request? Many ML/MLLIB algorithms use native BLAS (like Intel MKL, ATLAS, OpenBLAS) to improvement the performance. Many popular Native BLAS, like Intel MKL, OpenBLAS, use multi-threading technology, which will conflict with Spark. Spark should provide options to disable multi-threading of Native BLAS. https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded https://software.intel.com/en-us/articles/recommended-settings-for-calling-intel-mkl-routines-from-multi-threaded-applications ## How was this patch tested? The existing UT. Author: Peng Meng Closes #18551 from mpjlu/optimzeBLAS. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ed134ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ed134ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ed134ee Branch: refs/heads/master Commit: 5ed134ee213060882c6e3ed713473fa6cc158d36 Parents: f587d2e Author: Peng Meng Authored: Wed Jul 12 11:02:04 2017 +0100 Committer: Sean Owen Committed: Wed Jul 12 11:02:04 2017 +0100 -- conf/spark-env.sh.template | 4 docs/ml-guide.md | 6 ++ 2 files changed, 10 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5ed134ee/conf/spark-env.sh.template -- diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index b9aab5a..1663019 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -61,3 +61,7 @@ # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) # - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) # - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS http://git-wip-us.apache.org/repos/asf/spark/blob/5ed134ee/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index fb46213..adb1c9a 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -61,6 +61,12 @@ To configure `netlib-java` / Breeze to use system optimised binaries, include project and read the [netlib-java](https://github.com/fommil/netlib-java) documentation for your platform's additional installation instructions. +The most popular native BLAS such as [Intel MKL](https://software.intel.com/en-us/mkl), [OpenBLAS](http://www.openblas.net), can use multiple threads in a single operation, which can conflict with Spark's execution model. + +Configuring these BLAS implementations to use a single thread for operations may actually improve performance (see [SPARK-21305](https://issues.apache.org/jira/browse/SPARK-21305)). It is usually optimal to match this to the number of cores each Spark task is configured to use, which is 1 by default and typically left at 1. + +Please refer to resources like the following to understand how to configure the number of threads these BLAS implementations use: [Intel MKL](https://software.intel.com/en-us/articles/recommended-settings-for-calling-intel-mkl-routines-from-multi-threaded-applications) and [OpenBLAS](https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded). 
+ To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 or newer. [^1]: To learn more about the benefits and background of system optimised natives, you may wish to
spark git commit: [SPARK-20842][SQL] Upgrade to 1.2.2 for Hive Metastore Client 1.2
Repository: spark Updated Branches: refs/heads/master e0af76a36 -> f587d2e3f [SPARK-20842][SQL] Upgrade to 1.2.2 for Hive Metastore Client 1.2 ### What changes were proposed in this pull request? Hive 1.2.2 release is available. Below is the list of bugs fixed in 1.2.2 https://issues.apache.org/jira/secure/ReleaseNote.jspa?version=12332952&styleName=Text&projectId=12310843 ### How was this patch tested? N/A Author: Xiao Li Closes #18063 from gatorsmile/upgradeHiveClientTo1.2.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f587d2e3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f587d2e3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f587d2e3 Branch: refs/heads/master Commit: f587d2e3fa133051a64e4ec1aa788b554b552690 Parents: e0af76a Author: Xiao Li Authored: Wed Jul 12 15:48:44 2017 +0800 Committer: Wenchen Fan Committed: Wed Jul 12 15:48:44 2017 +0800 -- .../org/apache/spark/sql/hive/client/IsolatedClientLoader.scala| 2 +- .../src/main/scala/org/apache/spark/sql/hive/client/package.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f587d2e3/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index b8aa067..930f0dd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -92,7 +92,7 @@ private[hive] object IsolatedClientLoader extends Logging { case "14" | "0.14" | "0.14.0" => hive.v14 case "1.0" | "1.0.0" => hive.v1_0 case "1.1" | "1.1.0" => hive.v1_1 -case "1.2" | "1.2.0" | "1.2.1" => hive.v1_2 +case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2 case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0 case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1 } http://git-wip-us.apache.org/repos/asf/spark/blob/f587d2e3/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala index f9635e3..c14154a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala @@ -56,7 +56,7 @@ package object client { "net.hydromatic:linq4j", "net.hydromatic:quidem")) -case object v1_2 extends HiveVersion("1.2.1", +case object v1_2 extends HiveVersion("1.2.2", exclusions = Seq("eigenbase:eigenbase-properties", "org.apache.curator:*", "org.pentaho:pentaho-aggdesigner-algorithm",
spark git commit: [SPARK-21370][SS] Add test for state reliability when one read-only state store aborts after read-write state store commits
Repository: spark Updated Branches: refs/heads/master e16e8c7ad -> e0af76a36 [SPARK-21370][SS] Add test for state reliability when one read-only state store aborts after read-write state store commits ## What changes were proposed in this pull request? During Streaming Aggregation, we have two StateStores per task, one used as read-only in `StateStoreRestoreExec`, and one read-write used in `StateStoreSaveExec`. `StateStore.abort` will be called for these StateStores if they haven't committed their results. We need to make sure that `abort` in read-only store after a `commit` in the read-write store doesn't accidentally lead to the deletion of state. This PR adds a test for this condition. ## How was this patch tested? This PR adds a test. Author: Burak Yavuz Closes #18603 from brkyvz/ss-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0af76a3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0af76a3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0af76a3 Branch: refs/heads/master Commit: e0af76a36a67d409776bd379c6d6ef6d60356c06 Parents: e16e8c7 Author: Burak Yavuz Authored: Wed Jul 12 00:39:09 2017 -0700 Committer: Tathagata Das Committed: Wed Jul 12 00:39:09 2017 -0700 -- .../streaming/state/StateStoreSuite.scala | 31 1 file changed, 31 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e0af76a3/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index c2087ec..7cb86dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -665,6 +665,37 @@ abstract class StateStoreSuiteBase[ProviderClass <: StateStoreProvider] checkInvalidVersion(3) } + test("two concurrent StateStores - one for read-only and one for read-write") { +// During Streaming Aggregation, we have two StateStores per task, one used as read-only in +// `StateStoreRestoreExec`, and one read-write used in `StateStoreSaveExec`. `StateStore.abort` +// will be called for these StateStores if they haven't committed their results. We need to +// make sure that `abort` in read-only store after a `commit` in the read-write store doesn't +// accidentally lead to the deletion of state. +val dir = newDir() +val storeId = StateStoreId(dir, 0L, 1) +val provider0 = newStoreProvider(storeId) +// prime state +val store = provider0.getStore(0) +val key = "a" +put(store, key, 1) +store.commit() +assert(rowsToSet(store.iterator()) === Set(key -> 1)) + +// two state stores +val provider1 = newStoreProvider(storeId) +val restoreStore = provider1.getStore(1) +val saveStore = provider1.getStore(1) + +put(saveStore, key, get(restoreStore, key).get + 1) +saveStore.commit() +restoreStore.abort() + +// check that state is correct for next batch +val provider2 = newStoreProvider(storeId) +val finalStore = provider2.getStore(2) +assert(rowsToSet(finalStore.iterator()) === Set(key -> 2)) + } + /** Return a new provider with a random id */ def newStoreProvider(): ProviderClass
spark git commit: [SPARK-21146][CORE] Master/Worker should handle and shutdown when any thread gets UncaughtException
Repository: spark Updated Branches: refs/heads/master 24367f23f -> e16e8c7ad [SPARK-21146][CORE] Master/Worker should handle and shutdown when any thread gets UncaughtException ## What changes were proposed in this pull request? Adding the default UncaughtExceptionHandler to the Worker. ## How was this patch tested? I verified it manually, when any of the worker thread gets uncaught exceptions then the default UncaughtExceptionHandler will handle those exceptions. Author: Devaraj K Closes #18357 from devaraj-kavali/SPARK-21146. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e16e8c7a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e16e8c7a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e16e8c7a Branch: refs/heads/master Commit: e16e8c7ad31762aaca5e2bc874de1540af9cc4b7 Parents: 24367f2 Author: Devaraj K Authored: Wed Jul 12 00:14:58 2017 -0700 Committer: Shixiong Zhu Committed: Wed Jul 12 00:14:58 2017 -0700 -- .../scala/org/apache/spark/deploy/master/Master.scala| 4 +++- .../scala/org/apache/spark/deploy/worker/Worker.scala| 4 +++- .../main/scala/org/apache/spark/executor/Executor.scala | 2 +- .../spark/util/SparkUncaughtExceptionHandler.scala | 11 ++- core/src/main/scala/org/apache/spark/util/Utils.scala| 4 +++- .../spark/deploy/mesos/MesosClusterDispatcher.scala | 2 +- 6 files changed, 17 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e16e8c7a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 0dee25f..4cc580e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -36,7 +36,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.metrics.MetricsSystem import org.apache.spark.rpc._ import org.apache.spark.serializer.{JavaSerializer, Serializer} -import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.{SparkUncaughtExceptionHandler, ThreadUtils, Utils} private[deploy] class Master( override val rpcEnv: RpcEnv, @@ -1045,6 +1045,8 @@ private[deploy] object Master extends Logging { val ENDPOINT_NAME = "Master" def main(argStrings: Array[String]) { +Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler( + exitOnUncaughtException = false)) Utils.initDaemon(log) val conf = new SparkConf val args = new MasterArguments(argStrings, conf) http://git-wip-us.apache.org/repos/asf/spark/blob/e16e8c7a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index bed4745..f6d3876 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -38,7 +38,7 @@ import org.apache.spark.deploy.worker.ui.WorkerWebUI import org.apache.spark.internal.Logging import org.apache.spark.metrics.MetricsSystem import org.apache.spark.rpc._ -import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.{SparkUncaughtExceptionHandler, ThreadUtils, Utils} private[deploy] class Worker( override val rpcEnv: RpcEnv, @@ -737,6 +737,8 @@ private[deploy] object Worker extends Logging { val ENDPOINT_NAME = "Worker" def main(argStrings: 
Array[String]) { +Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler( + exitOnUncaughtException = false)) Utils.initDaemon(log) val conf = new SparkConf val args = new WorkerArguments(argStrings, conf) http://git-wip-us.apache.org/repos/asf/spark/blob/e16e8c7a/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 19e7eb0..21f0db1 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -56,7 +56,7 @@ private[spark] class Executor( env: SparkEnv, userClassPath: Seq[URL] = Nil, isLocal: Boolean = false, -uncaughtExceptionHandler: UncaughtExceptionHandler = SparkUncaughtExceptionHandler) +uncaughtExceptionHandler: UncaughtExceptionHandler = new SparkUncaug