spark git commit: [SPARK-17647][SQL][FOLLOWUP][MINOR] fix typo
Repository: spark Updated Branches: refs/heads/master 33ea908af -> b0a1e93e9 [SPARK-17647][SQL][FOLLOWUP][MINOR] fix typo ## What changes were proposed in this pull request? fix typo ## How was this patch tested? manual Author: Felix Cheung Closes #17663 from felixcheung/likedoctypo. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b0a1e93e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b0a1e93e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b0a1e93e Branch: refs/heads/master Commit: b0a1e93e93167b53058525a20a8b06f7df5f09a2 Parents: 33ea908 Author: Felix Cheung Authored: Mon Apr 17 23:55:40 2017 -0700 Committer: Felix Cheung Committed: Mon Apr 17 23:55:40 2017 -0700 -- .../apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b0a1e93e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index a36da8e..3fa8458 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -79,7 +79,7 @@ abstract class StringRegexExpression extends BinaryExpression _ matches any one character in the input (similar to . in posix regular expressions) - % matches zero ore more characters in the input (similar to .* in posix regular + % matches zero or more characters in the input (similar to .* in posix regular expressions) The escape character is '\'. If an escape character precedes a special symbol or another
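The doc string touched by this commit describes the LIKE wildcard semantics. As a minimal illustration of those rules — not part of the patch, and assuming only a local SparkSession (the `spark` value below is a hypothetical name):

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch of the documented LIKE semantics; `spark` is a hypothetical
// local session, not part of the patch above.
val spark = SparkSession.builder().master("local").appName("like-demo").getOrCreate()

// '_' matches exactly one character (like '.' in POSIX regexes).
spark.sql("SELECT 'abc' LIKE 'a_c'").show()  // true
// '%' matches zero or more characters (like '.*' in POSIX regexes).
spark.sql("SELECT 'abc' LIKE 'ab%'").show()  // true: '%' matches one char here
spark.sql("SELECT 'ab' LIKE 'ab%'").show()   // true: '%' matches zero chars

spark.stop()
```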
spark git commit: [TEST][MINOR] Replace repartitionBy with distribute in CollapseRepartitionSuite
Repository: spark Updated Branches: refs/heads/master 0075562dd -> 33ea908af [TEST][MINOR] Replace repartitionBy with distribute in CollapseRepartitionSuite ## What changes were proposed in this pull request? Replace non-existent `repartitionBy` with `distribute` in `CollapseRepartitionSuite`. ## How was this patch tested? local build and `catalyst/testOnly *CollapseRepartitionSuite` Author: Jacek Laskowski Closes #17657 from jaceklaskowski/CollapseRepartitionSuite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33ea908a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33ea908a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33ea908a Branch: refs/heads/master Commit: 33ea908af94152147e996a6dc8da41ada27d5af3 Parents: 0075562 Author: Jacek Laskowski Authored: Mon Apr 17 17:58:10 2017 -0700 Committer: Reynold Xin Committed: Mon Apr 17 17:58:10 2017 -0700 -- .../optimizer/CollapseRepartitionSuite.scala| 21 ++-- 1 file changed, 10 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/33ea908a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala index 59d2dc4..8cc8dec 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala @@ -106,8 +106,8 @@ class CollapseRepartitionSuite extends PlanTest { comparePlans(optimized2, correctAnswer) } - test("repartitionBy above repartition") { -// Always respects the top repartitionBy amd removes useless repartition + test("distribute above repartition") { +// Always respects the top distribute and removes useless repartition val query1 = testRelation .repartition(10) .distribute('a)(20) @@ -123,8 +123,8 @@ class CollapseRepartitionSuite extends PlanTest { comparePlans(optimized2, correctAnswer) } - test("repartitionBy above coalesce") { -// Always respects the top repartitionBy amd removes useless coalesce below repartition + test("distribute above coalesce") { +// Always respects the top distribute and removes useless coalesce below repartition val query1 = testRelation .coalesce(10) .distribute('a)(20) @@ -140,8 +140,8 @@ class CollapseRepartitionSuite extends PlanTest { comparePlans(optimized2, correctAnswer) } - test("repartition above repartitionBy") { -// Always respects the top repartition amd removes useless distribute below repartition + test("repartition above distribute") { +// Always respects the top repartition and removes useless distribute below repartition val query1 = testRelation .distribute('a)(10) .repartition(20) @@ -155,11 +155,10 @@ class CollapseRepartitionSuite extends PlanTest { comparePlans(optimized1, correctAnswer) comparePlans(optimized2, correctAnswer) - } - test("coalesce above repartitionBy") { -// Remove useless coalesce above repartition + test("coalesce above distribute") { +// Remove useless coalesce above distribute val query1 = testRelation .distribute('a)(10) .coalesce(20) @@ -180,8 +179,8 @@ class CollapseRepartitionSuite extends PlanTest { comparePlans(optimized2, correctAnswer2) } - test("collapse two adjacent repartitionBys into one") { -// Always respects the top repartitionBy + 
test("collapse two adjacent distributes into one") { +// Always respects the top distribute val query1 = testRelation .distribute('b)(10) .distribute('a)(20) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20349][SQL][REVERT-BRANCH2.1] ListFunctions returns duplicate functions after using persistent functions
Repository: spark Updated Branches: refs/heads/branch-2.1 622d7a8bf -> 3808b4728 [SPARK-20349][SQL][REVERT-BRANCH2.1] ListFunctions returns duplicate functions after using persistent functions Revert the changes of https://github.com/apache/spark/pull/17646 made in Branch 2.1, because it breaks the build. It needs the parser interface, but SessionCatalog in branch 2.1 does not have it. ### What changes were proposed in this pull request? The session catalog caches some persistent functions in the `FunctionRegistry`, so there can be duplicates. Our Catalog API `listFunctions` does not handle it. It would be better if the `SessionCatalog` API can de-duplicate the records, instead of doing it by each API caller. In `FunctionRegistry`, our functions are identified by the unquoted string. Thus, this PR tries to parse it using our parser interface and then de-duplicate the names. ### How was this patch tested? Added test cases. Author: Xiao Li Closes #17661 from gatorsmile/compilationFix17646. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3808b472 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3808b472 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3808b472 Branch: refs/heads/branch-2.1 Commit: 3808b472813a2cdf560107787f6971e5202044a8 Parents: 622d7a8 Author: Xiao Li Authored: Mon Apr 17 17:57:20 2017 -0700 Committer: Reynold Xin Committed: Mon Apr 17 17:57:20 2017 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 21 +--- .../spark/sql/execution/command/functions.scala | 4 +++- .../spark/sql/hive/execution/HiveUDFSuite.scala | 17 3 files changed, 8 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3808b472/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 6f302d3..a5cf719 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.catalog import javax.annotation.concurrent.GuardedBy import scala.collection.mutable -import scala.util.{Failure, Success, Try} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -1099,25 +1098,15 @@ class SessionCatalog( def listFunctions(db: String, pattern: String): Seq[(FunctionIdentifier, String)] = { val dbName = formatDatabaseName(db) requireDbExists(dbName) -val dbFunctions = externalCatalog.listFunctions(dbName, pattern).map { f => - FunctionIdentifier(f, Some(dbName)) } -val loadedFunctions = - StringUtils.filterPattern(functionRegistry.listFunction(), pattern).map { f => -// In functionRegistry, function names are stored as an unquoted format. 
-Try(parser.parseFunctionIdentifier(f)) match { - case Success(e) => e - case Failure(_) => -// The names of some built-in functions are not parsable by our parser, e.g., % -FunctionIdentifier(f) -} - } +val dbFunctions = externalCatalog.listFunctions(dbName, pattern) + .map { f => FunctionIdentifier(f, Some(dbName)) } +val loadedFunctions = StringUtils.filterPattern(functionRegistry.listFunction(), pattern) + .map { f => FunctionIdentifier(f) } val functions = dbFunctions ++ loadedFunctions -// The session catalog caches some persistent functions in the FunctionRegistry -// so there can be duplicates. functions.map { case f if FunctionRegistry.functionSet.contains(f.funcName) => (f, "SYSTEM") case f => (f, "USER") -}.distinct +} } http://git-wip-us.apache.org/repos/asf/spark/blob/3808b472/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index 75272d2..ea53987 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -208,6 +208,8 @@ case class ShowFunctionsCommand( case (f, "USER") if showUserFunctions => f.unquotedString case (f, "SYSTEM") if showSystemFunctions => f.unquotedString } -
spark git commit: Typo fix: distitrbuted -> distributed
Repository: spark Updated Branches: refs/heads/master e5fee3e4f -> 0075562dd Typo fix: distitrbuted -> distributed ## What changes were proposed in this pull request? Typo fix: distitrbuted -> distributed ## How was this patch tested? Existing tests Author: Andrew Ash Closes #17664 from ash211/patch-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0075562d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0075562d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0075562d Branch: refs/heads/master Commit: 0075562dd2551a31c35ca26922d6bd73cdb78ea4 Parents: e5fee3e Author: Andrew Ash Authored: Mon Apr 17 17:56:33 2017 -0700 Committer: Reynold Xin Committed: Mon Apr 17 17:56:33 2017 -0700 -- .../yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0075562d/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala -- diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 424bbca..b817570 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -577,7 +577,7 @@ private[spark] class Client( ).foreach { case (flist, resType, addToClasspath) => flist.foreach { file => val (_, localizedPath) = distribute(file, resType = resType) -// If addToClassPath, we ignore adding jar multiple times to distitrbuted cache. +// If addToClassPath, we ignore adding jar multiple times to distributed cache. if (addToClasspath) { if (localizedPath != null) { cachedSecondaryJarLinks += localizedPath
spark git commit: [HOTFIX] Fix compilation.
Repository: spark Updated Branches: refs/heads/branch-2.1 db9517c16 -> 622d7a8bf [HOTFIX] Fix compilation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/622d7a8b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/622d7a8b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/622d7a8b Branch: refs/heads/branch-2.1 Commit: 622d7a8bf6be22e30db7ff38604ed86b44fcc87e Parents: db9517c Author: Reynold Xin Authored: Mon Apr 17 12:57:58 2017 -0700 Committer: Reynold Xin Committed: Mon Apr 17 12:57:58 2017 -0700 -- .../apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/622d7a8b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index ad12177..0325d0e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -92,7 +92,8 @@ trait StringRegexExpression extends ImplicitCastInputTypes { See also: Use RLIKE to match with standard regular expressions. """) -case class Like(left: Expression, right: Expression) extends StringRegexExpression { +case class Like(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression { override def escape(v: String): String = StringUtils.escapeLikeRegex(v)
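For context, the breakage came from `StringRegexExpression` becoming a trait on branch-2.1 while `Like` still extended it as if it were a `BinaryExpression` subclass. A toy sketch of the pattern the one-line fix restores — the names below are simplified stand-ins, not the real Catalyst classes:

```scala
// Simplified stand-ins for the Catalyst types involved; not the real API.
abstract class BinaryExpr { def left: String; def right: String }

// Once this is a trait (rather than an abstract BinaryExpr subclass),
// concrete expressions must name the base class themselves.
trait RegexExpr { def escape(v: String): String }

// Mirrors the hotfix: mix the trait into the required base class explicitly.
case class LikeExpr(left: String, right: String) extends BinaryExpr with RegexExpr {
  override def escape(v: String): String = v.replace("\\", "\\\\")
}
```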
spark git commit: [SPARK-17647][SQL] Fix backslash escaping in 'LIKE' patterns.
Repository: spark Updated Branches: refs/heads/branch-2.1 7aad057b0 -> db9517c16 [SPARK-17647][SQL] Fix backslash escaping in 'LIKE' patterns. This patch fixes a bug in the way LIKE patterns are translated to Java regexes. The bug causes any character following an escaped backslash to be escaped, i.e. there is double-escaping. A concrete example is the following pattern: `'%\\%'`. The expected Java regex that this pattern should correspond to (according to the behavior described below) is `'.*\\.*'`; however, the current implementation produces `'.*\\%'` instead. --- Update: in light of the discussion that ensued, we should explicitly define the expected behaviour of LIKE expressions, especially in certain edge cases. With the help of gatorsmile, we put together a list of different RDBMS and their variations with respect to certain standard features.

| RDBMS\Features | Wildcards | Default escape [1] | Case sensitivity |
| --- | --- | --- | --- |
| [MS SQL Server](https://msdn.microsoft.com/en-us/library/ms179859.aspx) | _, %, [], [^] | none | no |
| [Oracle](https://docs.oracle.com/cd/B12037_01/server.101/b10759/conditions016.htm) | _, % | none | yes |
| [DB2 z/OS](http://www.ibm.com/support/knowledgecenter/SSEPEK_11.0.0/sqlref/src/tpc/db2z_likepredicate.html) | _, % | none | yes |
| [MySQL](http://dev.mysql.com/doc/refman/5.7/en/string-comparison-functions.html) | _, % | none | no |
| [PostgreSQL](https://www.postgresql.org/docs/9.0/static/functions-matching.html) | _, % | \ | yes |
| [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF) | _, % | none | yes |
| Current Spark | _, % | \ | yes |

[1] Default escape character: most systems do not have a default escape character; instead the user can specify one by calling a like expression with an escape argument [A] LIKE [B] ESCAPE [C]. This syntax is currently not supported by Spark, however I would volunteer to implement this feature in a separate ticket. The specifications are often quite terse and certain scenarios are undocumented, so here is a list of scenarios that I am uncertain about and would appreciate any input. Specifically I am looking for feedback on whether or not Spark's current behavior should be changed.

1. [x] Ending a pattern with the escape sequence, e.g. `like 'a\'`. PostgreSQL gives an error: 'LIKE pattern must not end with escape character', which I personally find logical. Currently, Spark allows "non-terminated" escapes and simply ignores them as part of the pattern. According to [DB2's documentation](http://www.ibm.com/support/knowledgecenter/SSEPGG_9.7.0/com.ibm.db2.luw.messages.sql.doc/doc/msql00130n.html), ending a pattern in an escape character is invalid. _Proposed new behaviour in Spark: throw AnalysisException_
2. [x] Empty input, e.g. `'' like ''`. Postgres and DB2 will match empty input only if the pattern is empty as well; any other combination of empty input will not match. Spark currently follows this rule.
3. [x] Escape before a non-special character, e.g. `'a' like '\a'`. Escaping a non-wildcard character is not really documented but PostgreSQL just treats it verbatim, which I also find the least surprising behavior. Spark does the same. According to [DB2's documentation](http://www.ibm.com/support/knowledgecenter/SSEPGG_9.7.0/com.ibm.db2.luw.messages.sql.doc/doc/msql00130n.html), it is invalid to follow an escape character with anything other than an escape character, an underscore or a percent sign. 
_Proposed new behaviour in Spark: throw AnalysisException_ The current specification is also described in the operator's source code in this patch. Extra case in regex unit tests. Author: Jakob Odersky This patch had conflicts when merged, resolved by Committer: Reynold Xin Closes #15398 from jodersky/SPARK-17647. (cherry picked from commit e5fee3e4f853f906f0b476bb04ee35a15f1ae650) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db9517c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db9517c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db9517c1 Branch: refs/heads/branch-2.1 Commit: db9517c1661935e88fe9c5d27874d718c928d5d6 Parents: 7aad057 Author: Jakob Odersky Authored: Mon Apr 17 11:17:57 2017 -0700 Committer: Reynold Xin Committed: Mon Apr 17 11:57:01 2017 -0700 -- .../expressions/regexpExpressions.scala | 28 +++- .../spark/sql/catalyst/util/StringUtils.scala | 50 +++--- .../expressions/RegexpExpressionsSuite.scala| 161 +++ .../sql/catalyst/util/StringUtilsSuite.scala| 4 +- 4 files changed, 154 insertions(+), 89 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db9517c1/sql/catalyst/src/main/scala/org/apache/spark/sql/cataly
spark git commit: [SPARK-17647][SQL] Fix backslash escaping in 'LIKE' patterns.
Repository: spark Updated Branches: refs/heads/master 01ff0350a -> e5fee3e4f [SPARK-17647][SQL] Fix backslash escaping in 'LIKE' patterns. ## What changes were proposed in this pull request? This patch fixes a bug in the way LIKE patterns are translated to Java regexes. The bug causes any character following an escaped backslash to be escaped, i.e. there is double-escaping. A concrete example is the following pattern: `'%\\%'`. The expected Java regex that this pattern should correspond to (according to the behavior described below) is `'.*\\.*'`; however, the current implementation produces `'.*\\%'` instead. --- Update: in light of the discussion that ensued, we should explicitly define the expected behaviour of LIKE expressions, especially in certain edge cases. With the help of gatorsmile, we put together a list of different RDBMS and their variations with respect to certain standard features.

| RDBMS\Features | Wildcards | Default escape [1] | Case sensitivity |
| --- | --- | --- | --- |
| [MS SQL Server](https://msdn.microsoft.com/en-us/library/ms179859.aspx) | _, %, [], [^] | none | no |
| [Oracle](https://docs.oracle.com/cd/B12037_01/server.101/b10759/conditions016.htm) | _, % | none | yes |
| [DB2 z/OS](http://www.ibm.com/support/knowledgecenter/SSEPEK_11.0.0/sqlref/src/tpc/db2z_likepredicate.html) | _, % | none | yes |
| [MySQL](http://dev.mysql.com/doc/refman/5.7/en/string-comparison-functions.html) | _, % | none | no |
| [PostgreSQL](https://www.postgresql.org/docs/9.0/static/functions-matching.html) | _, % | \ | yes |
| [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF) | _, % | none | yes |
| Current Spark | _, % | \ | yes |

[1] Default escape character: most systems do not have a default escape character; instead the user can specify one by calling a like expression with an escape argument [A] LIKE [B] ESCAPE [C]. This syntax is currently not supported by Spark, however I would volunteer to implement this feature in a separate ticket. The specifications are often quite terse and certain scenarios are undocumented, so here is a list of scenarios that I am uncertain about and would appreciate any input. Specifically I am looking for feedback on whether or not Spark's current behavior should be changed.

1. [x] Ending a pattern with the escape sequence, e.g. `like 'a\'`. PostgreSQL gives an error: 'LIKE pattern must not end with escape character', which I personally find logical. Currently, Spark allows "non-terminated" escapes and simply ignores them as part of the pattern. According to [DB2's documentation](http://www.ibm.com/support/knowledgecenter/SSEPGG_9.7.0/com.ibm.db2.luw.messages.sql.doc/doc/msql00130n.html), ending a pattern in an escape character is invalid. _Proposed new behaviour in Spark: throw AnalysisException_
2. [x] Empty input, e.g. `'' like ''`. Postgres and DB2 will match empty input only if the pattern is empty as well; any other combination of empty input will not match. Spark currently follows this rule.
3. [x] Escape before a non-special character, e.g. `'a' like '\a'`. Escaping a non-wildcard character is not really documented but PostgreSQL just treats it verbatim, which I also find the least surprising behavior. Spark does the same. According to [DB2's documentation](http://www.ibm.com/support/knowledgecenter/SSEPGG_9.7.0/com.ibm.db2.luw.messages.sql.doc/doc/msql00130n.html), it is invalid to follow an escape character with anything other than an escape character, an underscore or a percent sign. 
_Proposed new behaviour in Spark: throw AnalysisException_ The current specification is also described in the operator's source code in this patch. ## How was this patch tested? Extra case in regex unit tests. Author: Jakob Odersky This patch had conflicts when merged, resolved by Committer: Reynold Xin Closes #15398 from jodersky/SPARK-17647. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e5fee3e4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e5fee3e4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e5fee3e4 Branch: refs/heads/master Commit: e5fee3e4f853f906f0b476bb04ee35a15f1ae650 Parents: 01ff035 Author: Jakob Odersky Authored: Mon Apr 17 11:17:57 2017 -0700 Committer: Reynold Xin Committed: Mon Apr 17 11:17:57 2017 -0700 -- .../expressions/regexpExpressions.scala | 25 ++- .../spark/sql/catalyst/util/StringUtils.scala | 50 +++--- .../expressions/RegexpExpressionsSuite.scala| 161 +++ .../sql/catalyst/util/StringUtilsSuite.scala| 4 +- 4 files changed, 153 insertions(+), 87 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e5fee3e4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExp
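To make the translation concrete, here is a simplified sketch of the LIKE-to-Java-regex conversion described above. It is not the patch's actual `StringUtils.escapeLikeRegex`; it only illustrates the proposed rules (an escape applies to exactly one following character, and a dangling trailing escape is rejected):

```scala
import java.util.regex.Pattern

def likeToJavaRegex(pattern: String): String = {
  val out = new StringBuilder
  var i = 0
  while (i < pattern.length) {
    pattern.charAt(i) match {
      case '\\' if i + 1 < pattern.length =>
        // An escaped character matches itself, wildcard or not; no double-escaping.
        out.append(Pattern.quote(pattern.charAt(i + 1).toString))
        i += 2
      case '\\' =>
        // Proposed behaviour: a pattern must not end with the escape character.
        throw new IllegalArgumentException("LIKE pattern must not end with escape character")
      case '_' => out.append('.'); i += 1
      case '%' => out.append(".*"); i += 1
      case c   => out.append(Pattern.quote(c.toString)); i += 1
    }
  }
  out.toString
}

// '%\\%' (the chars % \ \ %) now translates to ".*", a literal backslash, ".*" —
// matching e.g. "a\b" — rather than the double-escaped ".*\\%" described as the bug.
println("a\\b".matches(likeToJavaRegex("%\\\\%")))  // true
```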
spark git commit: [SPARK-19019][PYTHON][BRANCH-2.0] Fix hijacked `collections.namedtuple` and port cloudpickle changes for PySpark to work with Python 3.6.0
Repository: spark Updated Branches: refs/heads/branch-2.0 24f6ef25a -> 84be4c8d6 [SPARK-19019][PYTHON][BRANCH-2.0] Fix hijacked `collections.namedtuple` and port cloudpickle changes for PySpark to work with Python 3.6.0 ## What changes were proposed in this pull request? This PR proposes to backport https://github.com/apache/spark/pull/16429 to branch-2.0 so that Python 3.6.0 works with Spark 2.0.x. ## How was this patch tested? Manually, via

```
./run-tests --python-executables=python3.6
```

```
Finished test(python3.6): pyspark.tests (124s)
Finished test(python3.6): pyspark.accumulators (4s)
Finished test(python3.6): pyspark.broadcast (4s)
Finished test(python3.6): pyspark.conf (3s)
Finished test(python3.6): pyspark.context (15s)
Finished test(python3.6): pyspark.ml.classification (24s)
Finished test(python3.6): pyspark.sql.tests (190s)
Finished test(python3.6): pyspark.mllib.tests (190s)
Finished test(python3.6): pyspark.ml.clustering (14s)
Finished test(python3.6): pyspark.ml.linalg.__init__ (0s)
Finished test(python3.6): pyspark.ml.recommendation (18s)
Finished test(python3.6): pyspark.ml.feature (28s)
Finished test(python3.6): pyspark.ml.evaluation (28s)
Finished test(python3.6): pyspark.ml.regression (21s)
Finished test(python3.6): pyspark.ml.tuning (17s)
Finished test(python3.6): pyspark.streaming.tests (239s)
Finished test(python3.6): pyspark.mllib.evaluation (15s)
Finished test(python3.6): pyspark.mllib.classification (24s)
Finished test(python3.6): pyspark.mllib.clustering (37s)
Finished test(python3.6): pyspark.mllib.linalg.__init__ (0s)
Finished test(python3.6): pyspark.mllib.fpm (19s)
Finished test(python3.6): pyspark.mllib.feature (19s)
Finished test(python3.6): pyspark.mllib.random (8s)
Finished test(python3.6): pyspark.ml.tests (76s)
Finished test(python3.6): pyspark.mllib.stat.KernelDensity (0s)
Finished test(python3.6): pyspark.mllib.recommendation (21s)
Finished test(python3.6): pyspark.mllib.linalg.distributed (27s)
Finished test(python3.6): pyspark.mllib.regression (22s)
Finished test(python3.6): pyspark.mllib.stat._statistics (11s)
Finished test(python3.6): pyspark.mllib.tree (16s)
Finished test(python3.6): pyspark.profiler (8s)
Finished test(python3.6): pyspark.shuffle (1s)
Finished test(python3.6): pyspark.mllib.util (17s)
Finished test(python3.6): pyspark.serializers (12s)
Finished test(python3.6): pyspark.rdd (18s)
Finished test(python3.6): pyspark.sql.conf (4s)
Finished test(python3.6): pyspark.sql.catalog (14s)
Finished test(python3.6): pyspark.sql.column (13s)
Finished test(python3.6): pyspark.sql.context (15s)
Finished test(python3.6): pyspark.sql.group (26s)
Finished test(python3.6): pyspark.sql.dataframe (31s)
Finished test(python3.6): pyspark.sql.functions (32s)
Finished test(python3.6): pyspark.sql.types (5s)
Finished test(python3.6): pyspark.sql.streaming (11s)
Finished test(python3.6): pyspark.sql.window (5s)
Finished test(python3.6): pyspark.streaming.util (0s)
Finished test(python3.6): pyspark.sql.session (15s)
Finished test(python3.6): pyspark.sql.readwriter (34s)
Tests passed in 376 seconds
```

Author: hyukjinkwon Closes #17374 from HyukjinKwon/SPARK-19019-backport. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84be4c8d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84be4c8d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84be4c8d Branch: refs/heads/branch-2.0 Commit: 84be4c8d6fd52774462762f1f5972f60d286c289 Parents: 24f6ef2 Author: hyukjinkwon Authored: Mon Apr 17 10:03:42 2017 -0700 Committer: Holden Karau Committed: Mon Apr 17 10:03:42 2017 -0700 -- python/pyspark/cloudpickle.py | 98 ++ python/pyspark/serializers.py | 20 2 files changed, 87 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/84be4c8d/python/pyspark/cloudpickle.py -- diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 822ae46..94168f0 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -43,6 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from __future__ import print_function import operator +import opcode import os import io import pickle @@ -53,6 +54,8 @@ from functools import partial import itertools import dis import traceback +import weakref + if sys.version < '3': from pickle import Pickler @@ -68,10 +71,10 @@ else: PY3 = True #relevant opcodes -STORE_GLOBAL = dis.opname.index('STORE_GLOBAL') -DELETE_GLOBAL = dis.opname.index('DELETE_GLOBAL') -LOAD_GLOBAL = dis.opname.index('LOAD_GLOBAL') -GLOBAL_OPS = [STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL] +STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] +DELETE
spark git commit: [SPARK-19019][PYTHON][BRANCH-1.6] Fix hijacked `collections.namedtuple` and port cloudpickle changes for PySpark to work with Python 3.6.0
Repository: spark Updated Branches: refs/heads/branch-1.6 23f9faa40 -> 6b315f3d5 [SPARK-19019][PYTHON][BRANCH-1.6] Fix hijacked `collections.namedtuple` and port cloudpickle changes for PySpark to work with Python 3.6.0 ## What changes were proposed in this pull request? This PR proposes to backport https://github.com/apache/spark/pull/16429 to branch-1.6 so that Python 3.6.0 works with Spark 1.6.x. ## How was this patch tested? Manually, via

```
./run-tests --python-executables=python3.6
```

```
Finished test(python3.6): pyspark.conf (5s)
Finished test(python3.6): pyspark.broadcast (7s)
Finished test(python3.6): pyspark.accumulators (9s)
Finished test(python3.6): pyspark.rdd (16s)
Finished test(python3.6): pyspark.shuffle (0s)
Finished test(python3.6): pyspark.serializers (11s)
Finished test(python3.6): pyspark.profiler (5s)
Finished test(python3.6): pyspark.context (21s)
Finished test(python3.6): pyspark.ml.clustering (12s)
Finished test(python3.6): pyspark.ml.feature (16s)
Finished test(python3.6): pyspark.ml.classification (16s)
Finished test(python3.6): pyspark.ml.recommendation (16s)
Finished test(python3.6): pyspark.ml.tuning (14s)
Finished test(python3.6): pyspark.ml.regression (16s)
Finished test(python3.6): pyspark.ml.evaluation (12s)
Finished test(python3.6): pyspark.ml.tests (17s)
Finished test(python3.6): pyspark.mllib.classification (18s)
Finished test(python3.6): pyspark.mllib.evaluation (12s)
Finished test(python3.6): pyspark.mllib.feature (19s)
Finished test(python3.6): pyspark.mllib.linalg.__init__ (0s)
Finished test(python3.6): pyspark.mllib.fpm (12s)
Finished test(python3.6): pyspark.mllib.clustering (31s)
Finished test(python3.6): pyspark.mllib.random (8s)
Finished test(python3.6): pyspark.mllib.linalg.distributed (17s)
Finished test(python3.6): pyspark.mllib.recommendation (23s)
Finished test(python3.6): pyspark.mllib.stat.KernelDensity (0s)
Finished test(python3.6): pyspark.mllib.stat._statistics (13s)
Finished test(python3.6): pyspark.mllib.regression (22s)
Finished test(python3.6): pyspark.mllib.util (9s)
Finished test(python3.6): pyspark.mllib.tree (14s)
Finished test(python3.6): pyspark.sql.types (9s)
Finished test(python3.6): pyspark.sql.context (16s)
Finished test(python3.6): pyspark.sql.column (14s)
Finished test(python3.6): pyspark.sql.group (16s)
Finished test(python3.6): pyspark.sql.dataframe (25s)
Finished test(python3.6): pyspark.tests (164s)
Finished test(python3.6): pyspark.sql.window (6s)
Finished test(python3.6): pyspark.sql.functions (19s)
Finished test(python3.6): pyspark.streaming.util (0s)
Finished test(python3.6): pyspark.sql.readwriter (24s)
Finished test(python3.6): pyspark.sql.tests (38s)
Finished test(python3.6): pyspark.mllib.tests (133s)
Finished test(python3.6): pyspark.streaming.tests (189s)
Tests passed in 380 seconds
```

Author: hyukjinkwon Closes #17375 from HyukjinKwon/SPARK-19019-backport-1.6. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b315f3d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b315f3d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b315f3d Branch: refs/heads/branch-1.6 Commit: 6b315f3d5fe0aea1fdd65f1d56571ead930a54c3 Parents: 23f9faa Author: hyukjinkwon Authored: Mon Apr 17 09:58:55 2017 -0700 Committer: Holden Karau Committed: Mon Apr 17 09:58:55 2017 -0700 -- python/pyspark/cloudpickle.py | 98 ++ python/pyspark/serializers.py | 20 2 files changed, 87 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6b315f3d/python/pyspark/cloudpickle.py -- diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 822ae46..94168f0 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -43,6 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from __future__ import print_function import operator +import opcode import os import io import pickle @@ -53,6 +54,8 @@ from functools import partial import itertools import dis import traceback +import weakref + if sys.version < '3': from pickle import Pickler @@ -68,10 +71,10 @@ else: PY3 = True #relevant opcodes -STORE_GLOBAL = dis.opname.index('STORE_GLOBAL') -DELETE_GLOBAL = dis.opname.index('DELETE_GLOBAL') -LOAD_GLOBAL = dis.opname.index('LOAD_GLOBAL') -GLOBAL_OPS = [STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL] +STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] +DELETE_GLOBAL = opcode.opmap['DELETE_GLOBAL'] +LOAD_GLOBAL = opcode.opmap['LOAD_GLOBAL'] +GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) HAVE_ARGUMENT = dis.HAVE_ARGUMENT EXTENDED_ARG = dis.EXTENDED_ARG @@ -90,6 +93,43 @@ def _builtin_type(name): ret
spark git commit: [SPARK-20349][SQL] ListFunctions returns duplicate functions after using persistent functions
Repository: spark Updated Branches: refs/heads/branch-2.1 efa11a42f -> 7aad057b0 [SPARK-20349][SQL] ListFunctions returns duplicate functions after using persistent functions ### What changes were proposed in this pull request? The session catalog caches some persistent functions in the `FunctionRegistry`, so there can be duplicates. Our Catalog API `listFunctions` does not handle it. It would be better if the `SessionCatalog` API can de-duplicate the records, instead of doing it by each API caller. In `FunctionRegistry`, our functions are identified by the unquoted string. Thus, this PR tries to parse it using our parser interface and then de-duplicate the names. ### How was this patch tested? Added test cases. Author: Xiao Li Closes #17646 from gatorsmile/showFunctions. (cherry picked from commit 01ff0350a85b179715946c3bd4f003db7c5e3641) Signed-off-by: Xiao Li Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7aad057b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7aad057b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7aad057b Branch: refs/heads/branch-2.1 Commit: 7aad057b00db240515692d5c07e67ee58f6b95d3 Parents: efa11a4 Author: Xiao Li Authored: Mon Apr 17 09:50:20 2017 -0700 Committer: Xiao Li Committed: Mon Apr 17 09:50:32 2017 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 21 +++- .../spark/sql/execution/command/functions.scala | 4 +--- .../spark/sql/hive/execution/HiveUDFSuite.scala | 17 3 files changed, 34 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7aad057b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index a5cf719..6f302d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.catalog import javax.annotation.concurrent.GuardedBy import scala.collection.mutable +import scala.util.{Failure, Success, Try} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -1098,15 +1099,25 @@ class SessionCatalog( def listFunctions(db: String, pattern: String): Seq[(FunctionIdentifier, String)] = { val dbName = formatDatabaseName(db) requireDbExists(dbName) -val dbFunctions = externalCatalog.listFunctions(dbName, pattern) - .map { f => FunctionIdentifier(f, Some(dbName)) } -val loadedFunctions = StringUtils.filterPattern(functionRegistry.listFunction(), pattern) - .map { f => FunctionIdentifier(f) } +val dbFunctions = externalCatalog.listFunctions(dbName, pattern).map { f => + FunctionIdentifier(f, Some(dbName)) } +val loadedFunctions = + StringUtils.filterPattern(functionRegistry.listFunction(), pattern).map { f => +// In functionRegistry, function names are stored as an unquoted format. +Try(parser.parseFunctionIdentifier(f)) match { + case Success(e) => e + case Failure(_) => +// The names of some built-in functions are not parsable by our parser, e.g., % +FunctionIdentifier(f) +} + } val functions = dbFunctions ++ loadedFunctions +// The session catalog caches some persistent functions in the FunctionRegistry +// so there can be duplicates. 
functions.map { case f if FunctionRegistry.functionSet.contains(f.funcName) => (f, "SYSTEM") case f => (f, "USER") -} +}.distinct } http://git-wip-us.apache.org/repos/asf/spark/blob/7aad057b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index ea53987..75272d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -208,8 +208,6 @@ case class ShowFunctionsCommand( case (f, "USER") if showUserFunctions => f.unquotedString case (f, "SYSTEM") if showSystemFunctions => f.unquotedString } -// The session catalog caches some persistent functions in the FunctionRegistry -// so there can be duplicates. -functionName
spark git commit: [SPARK-20349][SQL] ListFunctions returns duplicate functions after using persistent functions
Repository: spark Updated Branches: refs/heads/master 24f09b39c -> 01ff0350a [SPARK-20349][SQL] ListFunctions returns duplicate functions after using persistent functions ### What changes were proposed in this pull request? The session catalog caches some persistent functions in the `FunctionRegistry`, so there can be duplicates. Our Catalog API `listFunctions` does not handle it. It would be better if the `SessionCatalog` API can de-duplicate the records, instead of doing it by each API caller. In `FunctionRegistry`, our functions are identified by the unquoted string. Thus, this PR tries to parse it using our parser interface and then de-duplicate the names. ### How was this patch tested? Added test cases. Author: Xiao Li Closes #17646 from gatorsmile/showFunctions. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01ff0350 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01ff0350 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01ff0350 Branch: refs/heads/master Commit: 01ff0350a85b179715946c3bd4f003db7c5e3641 Parents: 24f09b3 Author: Xiao Li Authored: Mon Apr 17 09:50:20 2017 -0700 Committer: Xiao Li Committed: Mon Apr 17 09:50:20 2017 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 21 +++- .../spark/sql/execution/command/functions.scala | 4 +--- .../spark/sql/hive/execution/HiveUDFSuite.scala | 17 3 files changed, 34 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/01ff0350/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 1417bcc..3fbf83f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -22,6 +22,7 @@ import java.util.Locale import javax.annotation.concurrent.GuardedBy import scala.collection.mutable +import scala.util.{Failure, Success, Try} import com.google.common.cache.{Cache, CacheBuilder} import org.apache.hadoop.conf.Configuration @@ -1202,15 +1203,25 @@ class SessionCatalog( def listFunctions(db: String, pattern: String): Seq[(FunctionIdentifier, String)] = { val dbName = formatDatabaseName(db) requireDbExists(dbName) -val dbFunctions = externalCatalog.listFunctions(dbName, pattern) - .map { f => FunctionIdentifier(f, Some(dbName)) } -val loadedFunctions = StringUtils.filterPattern(functionRegistry.listFunction(), pattern) - .map { f => FunctionIdentifier(f) } +val dbFunctions = externalCatalog.listFunctions(dbName, pattern).map { f => + FunctionIdentifier(f, Some(dbName)) } +val loadedFunctions = + StringUtils.filterPattern(functionRegistry.listFunction(), pattern).map { f => +// In functionRegistry, function names are stored as an unquoted format. +Try(parser.parseFunctionIdentifier(f)) match { + case Success(e) => e + case Failure(_) => +// The names of some built-in functions are not parsable by our parser, e.g., % +FunctionIdentifier(f) +} + } val functions = dbFunctions ++ loadedFunctions +// The session catalog caches some persistent functions in the FunctionRegistry +// so there can be duplicates. 
functions.map { case f if FunctionRegistry.functionSet.contains(f.funcName) => (f, "SYSTEM") case f => (f, "USER") -} +}.distinct } http://git-wip-us.apache.org/repos/asf/spark/blob/01ff0350/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index e0d0029..5450823 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -207,8 +207,6 @@ case class ShowFunctionsCommand( case (f, "USER") if showUserFunctions => f.unquotedString case (f, "SYSTEM") if showSystemFunctions => f.unquotedString } -// The session catalog caches some persistent functions in the FunctionRegistry -// so there can be duplicates. -functionNames.distinct.sorted.map(Row(_)) +functionNames.sorted.map(Row(_)) } } http://git-wip-us.apache.or
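A compact sketch of the de-duplication idea in this patch: run each unquoted registry name through a parser so that equivalent spellings normalize to the same identifier, falling back to the raw name for built-ins such as `%` that the parser rejects. `parseIdentifier` below is a hypothetical stand-in for `parser.parseFunctionIdentifier`, not the real interface:

```scala
import scala.util.{Failure, Success, Try}

// Hypothetical stand-in for parser.parseFunctionIdentifier: accept simple
// identifiers and lower-case them; reject anything else (e.g. "%").
def parseIdentifier(name: String): String =
  if (name.nonEmpty && name.forall(c => c.isLetterOrDigit || c == '_')) name.toLowerCase
  else throw new IllegalArgumentException(s"cannot parse: $name")

def listFunctions(persistent: Seq[String], registry: Seq[String]): Seq[String] = {
  val loaded = registry.map { f =>
    Try(parseIdentifier(f)) match {
      case Success(normalized) => normalized
      case Failure(_)          => f // built-ins like "%" are kept verbatim
    }
  }
  // The de-duplication now lives in one place instead of in each API caller.
  (persistent ++ loaded).distinct
}

println(listFunctions(Seq("myupper"), Seq("myUpper", "%")))  // List(myupper, %)
```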
spark git commit: [SPARK-19828][R][FOLLOWUP] Rename asJsonArray to as.json.array in from_json function in R
Repository: spark Updated Branches: refs/heads/master 86d251c58 -> 24f09b39c [SPARK-19828][R][FOLLOWUP] Rename asJsonArray to as.json.array in from_json function in R ## What changes were proposed in this pull request? This was suggested to be `as.json.array` in the first place in the PR to SPARK-19828, but we could not do this as the lint check emits an error for multiple dots in the variable names. After SPARK-20278, now we are able to use `multiple.dots.in.names`. `asJsonArray` in the `from_json` function can still be changed as 2.2 is not released yet. So, this PR proposes to rename `asJsonArray` to `as.json.array`. ## How was this patch tested? Jenkins tests, local tests with `./R/run-tests.sh` and manual `./dev/lint-r`. Existing tests should cover this. Author: hyukjinkwon Closes #17653 from HyukjinKwon/SPARK-19828-followup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/24f09b39 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/24f09b39 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/24f09b39 Branch: refs/heads/master Commit: 24f09b39c7b947e52fda952676d5114c2540e732 Parents: 86d251c Author: hyukjinkwon Authored: Mon Apr 17 09:04:24 2017 -0700 Committer: Felix Cheung Committed: Mon Apr 17 09:04:24 2017 -0700 -- R/pkg/R/functions.R | 8 R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/24f09b39/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 449476d..c311921 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2438,12 +2438,12 @@ setMethod("date_format", signature(y = "Column", x = "character"), #' from_json #' #' Parses a column containing a JSON string into a Column of \code{structType} with the specified -#' \code{schema} or array of \code{structType} if \code{asJsonArray} is set to \code{TRUE}. +#' \code{schema} or array of \code{structType} if \code{as.json.array} is set to \code{TRUE}. #' If the string is unparseable, the Column will contains the value NA. #' #' @param x Column containing the JSON string. #' @param schema a structType object to use as the schema to use when parsing the JSON string. -#' @param asJsonArray indicating if input string is JSON array of objects or a single object. +#' @param as.json.array indicating if input string is JSON array of objects or a single object. #' @param ... additional named properties to control how the json is parsed, accepts the same #'options as the JSON data source. #' @@ -2459,8 +2459,8 @@ setMethod("date_format", signature(y = "Column", x = "character"), #'} #' @note from_json since 2.2.0 setMethod("from_json", signature(x = "Column", schema = "structType"), - function(x, schema, asJsonArray = FALSE, ...) { -if (asJsonArray) { + function(x, schema, as.json.array = FALSE, ...) 
{ +if (as.json.array) { jschema <- callJStatic("org.apache.spark.sql.types.DataTypes", "createArrayType", schema$jobj) http://git-wip-us.apache.org/repos/asf/spark/blob/24f09b39/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3fbb618..6a6c9a8 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1454,7 +1454,7 @@ test_that("column functions", { jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]" df <- as.DataFrame(list(list("people" = jsonArr))) schema <- structType(structField("name", "string")) - arr <- collect(select(df, alias(from_json(df$people, schema, asJsonArray = TRUE), "arrcol"))) + arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol"))) expect_equal(ncol(arr), 1) expect_equal(nrow(arr), 1) expect_is(arr[[1]][[1]], "list")
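For comparison with the renamed R flag, the Scala API expresses the same array-of-objects case through the schema type instead of a boolean — a minimal sketch, assuming the 2.2 `from_json` overload that accepts a `DataType` and a local session named `spark`:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

val spark = SparkSession.builder().master("local").appName("from-json-demo").getOrCreate()
import spark.implicits._

val df = Seq("""[{"name":"Bob"}, {"name":"Alice"}]""").toDF("people")

// Wrapping the struct in ArrayType plays the role of as.json.array = TRUE.
val schema = ArrayType(new StructType().add("name", StringType))
df.select(from_json($"people", schema).alias("arrcol")).show(truncate = false)
```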
spark git commit: [SPARK-20335][SQL][BACKPORT-2.1] Children expressions of Hive UDF impacts the determinism of Hive UDF
Repository: spark Updated Branches: refs/heads/branch-2.1 2a3e50e24 -> efa11a42f [SPARK-20335][SQL][BACKPORT-2.1] Children expressions of Hive UDF impacts the determinism of Hive UDF ### What changes were proposed in this pull request? This PR is to backport https://github.com/apache/spark/pull/17635 to Spark 2.1 ---

```JAVA
/**
 * Certain optimizations should not be applied if UDF is not deterministic.
 * Deterministic UDF returns same result each time it is invoked with a
 * particular input. This determinism just needs to hold within the context of
 * a query.
 *
 * return true if the UDF is deterministic
 */
boolean deterministic() default true;
```

Based on the definition of [UDFType](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFType.java#L42-L50), when a Hive UDF's children are non-deterministic, the Hive UDF is also non-deterministic. ### How was this patch tested? Added test cases. Author: Xiao Li Closes #17652 from gatorsmile/backport-17635. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/efa11a42 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/efa11a42 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/efa11a42 Branch: refs/heads/branch-2.1 Commit: efa11a42f0c34dcfaf4a1bf17055539c43c8e4f9 Parents: 2a3e50e Author: Xiao Li Authored: Mon Apr 17 15:59:55 2017 +0800 Committer: Wenchen Fan Committed: Mon Apr 17 15:59:55 2017 +0800 -- .../org/apache/spark/sql/hive/hiveUDFs.scala| 4 +-- .../hive/execution/AggregationQuerySuite.scala | 13 + .../spark/sql/hive/execution/HiveUDFSuite.scala | 30 3 files changed, 45 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/efa11a42/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index 37414ad..3e46b74 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -42,7 +42,7 @@ private[hive] case class HiveSimpleUDF( name: String, funcWrapper: HiveFunctionWrapper, children: Seq[Expression]) extends Expression with HiveInspectors with CodegenFallback with Logging { - override def deterministic: Boolean = isUDFDeterministic + override def deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic) override def nullable: Boolean = true @@ -120,7 +120,7 @@ private[hive] case class HiveGenericUDF( override def nullable: Boolean = true - override def deterministic: Boolean = isUDFDeterministic + override def deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic) override def foldable: Boolean = isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector] http://git-wip-us.apache.org/repos/asf/spark/blob/efa11a42/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 4a8086d..84f9159 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -509,6 +509,19 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te 
Row(null, null, 110.0, null, null, 10.0) :: Nil) } + test("non-deterministic children expressions of UDAF") { +val e = intercept[AnalysisException] { + spark.sql( +""" + |SELECT mydoublesum(value + 1.5 * key + rand()) + |FROM agg1 + |GROUP BY key +""".stripMargin) +}.getMessage +assert(Seq("nondeterministic expression", + "should not appear in the arguments of an aggregate function").forall(e.contains)) + } + test("interpreted aggregate function") { checkAnswer( spark.sql( http://git-wip-us.apache.org/repos/asf/spark/blob/efa11a42/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 4098bb5..78c80da 100644 --- a/sql/hive/src/test/scala/org/apache/spark
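The one-line change above can be read as a propagation rule: a Hive UDF call is deterministic only if the UDF itself is declared deterministic and every child expression is too. A toy model of that rule, using simplified stand-ins rather than the real Catalyst `Expression` API:

```scala
// Simplified stand-ins for Catalyst expressions; not the real API.
trait Expr { def deterministic: Boolean }

case class Literal(value: Any) extends Expr { val deterministic = true }
case object Rand extends Expr { val deterministic = false }

case class HiveUdfCall(isUDFDeterministic: Boolean, children: Seq[Expr]) extends Expr {
  // Mirrors the patch: isUDFDeterministic && children.forall(_.deterministic)
  def deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic)
}

println(HiveUdfCall(true, Seq(Literal(1))).deterministic)        // true
println(HiveUdfCall(true, Seq(Literal(1), Rand)).deterministic)  // false: rand() child
```

This is exactly the situation the new `AggregationQuerySuite` test exercises: a `rand()` child makes the whole call non-deterministic, so the analyzer rejects it inside an aggregate.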