spark git commit: [SPARK-23319][TESTS][BRANCH-2.3] Explicitly specify Pandas and PyArrow versions in PySpark tests (to skip or test)
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 db59e5542 -> 053830256

[SPARK-23319][TESTS][BRANCH-2.3] Explicitly specify Pandas and PyArrow versions in PySpark tests (to skip or test)

This PR backports https://github.com/apache/spark/pull/20487 to branch-2.3.

Author: hyukjinkwon
Author: Takuya UESHIN

Closes #20534 from HyukjinKwon/PR_TOOL_PICK_PR_20487_BRANCH-2.3.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/05383025
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/05383025
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/05383025

Branch: refs/heads/branch-2.3
Commit: 0538302561c4d77b2856b1ce73b3ccbcb6688ac6
Parents: db59e55
Author: hyukjinkwon
Authored: Thu Feb 8 16:47:12 2018 +0900
Committer: hyukjinkwon
Committed: Thu Feb 8 16:47:12 2018 +0900

--
 pom.xml                         |  4 ++
 python/pyspark/sql/dataframe.py |  3 ++
 python/pyspark/sql/session.py   |  3 ++
 python/pyspark/sql/tests.py     | 83 +++-
 python/pyspark/sql/utils.py     | 30 +
 python/setup.py                 | 10 -
 6 files changed, 86 insertions(+), 47 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/05383025/pom.xml
--
diff --git a/pom.xml b/pom.xml
index a8e448a..9aa531e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -185,6 +185,10 @@
     <paranamer.version>2.8</paranamer.version>
     <maven-antrun.version>1.8</maven-antrun.version>
     <commons-crypto.version>1.0.0</commons-crypto.version>
+    <arrow.version>0.8.0</arrow.version>
     <test.java.home>${java.home}</test.java.home>

http://git-wip-us.apache.org/repos/asf/spark/blob/05383025/python/pyspark/sql/dataframe.py
--
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 59a4170..8ec24db 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1913,6 +1913,9 @@ class DataFrame(object):
         0    2  Alice
         1    5    Bob
         """
+        from pyspark.sql.utils import require_minimum_pandas_version
+        require_minimum_pandas_version()
+
         import pandas as pd

         if self.sql_ctx.getConf("spark.sql.execution.pandas.respectSessionTimeZone").lower() \

http://git-wip-us.apache.org/repos/asf/spark/blob/05383025/python/pyspark/sql/session.py
--
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 6c84023..2ac2ec2 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -640,6 +640,9 @@ class SparkSession(object):
         except Exception:
             has_pandas = False
         if has_pandas and isinstance(data, pandas.DataFrame):
+            from pyspark.sql.utils import require_minimum_pandas_version
+            require_minimum_pandas_version()
+
             if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \
                     == "true":
                 timezone = self.conf.get("spark.sql.session.timeZone")

http://git-wip-us.apache.org/repos/asf/spark/blob/05383025/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 878d402..0e1b2ec 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -48,19 +48,26 @@ if sys.version_info[:2] <= (2, 6):
 else:
     import unittest

-_have_pandas = False
-_have_old_pandas = False
+_pandas_requirement_message = None
 try:
-    import pandas
-    try:
-        from pyspark.sql.utils import require_minimum_pandas_version
-        require_minimum_pandas_version()
-        _have_pandas = True
-    except:
-        _have_old_pandas = True
-except:
-    # No Pandas, but that's okay, we'll skip those tests
-    pass
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+except ImportError as e:
+    from pyspark.util import _exception_message
+    # If Pandas version requirement is not satisfied, skip related tests.
+    _pandas_requirement_message = _exception_message(e)
+
+_pyarrow_requirement_message = None
+try:
+    from pyspark.sql.utils import require_minimum_pyarrow_version
+    require_minimum_pyarrow_version()
+except ImportError as e:
+    from pyspark.util import _exception_message
+    # If Arrow version requirement is not satisfied, skip related tests.
+    _pyarrow_requirement_message = _exception_message(e)
+
+_have_pandas = _pandas_requirement_message is None
+_have_pyarrow = _pyarrow_requirement_message is None

 from pyspark import SparkContext
 from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row
@@ -75,15 +82,6 @@ from pyspark.sql.window import Window
 from pyspark.sql.utils import AnalysisException, ParseException
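For context, the gate the tests now import is a small version check. A minimal sketch of such a helper, assuming the same minimum version this patch uses elsewhere (the real body in python/pyspark/sql/utils.py may differ in wording):

```python
from distutils.version import LooseVersion


def require_minimum_pandas_version():
    """Raise ImportError if a sufficiently new Pandas is not installed."""
    minimum_pandas_version = "0.19.2"
    # A missing Pandas raises ImportError here too, which is exactly what
    # the try/except at the top of tests.py catches.
    import pandas
    if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
        raise ImportError(
            "Pandas >= %s must be installed; however, your version was %s."
            % (minimum_pandas_version, pandas.__version__))
```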
svn commit: r24817 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_07_22_01-db59e55-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Thu Feb 8 06:15:34 2018
New Revision: 24817

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_02_07_22_01-db59e55 docs

[This commit notification would consist of 1442 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert [SPARK-22279][SQL] Turn on spark.sql.hive.convertMetastoreOrc by default
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 2ba07d5b1 -> db59e5542

Revert [SPARK-22279][SQL] Turn on spark.sql.hive.convertMetastoreOrc by default

## What changes were proposed in this pull request?

This is to revert the changes made in https://github.com/apache/spark/pull/19499, because this causes a regression. We should not ignore the table-specific compression conf when the Hive serde tables are converted to the data source tables.

## How was this patch tested?

The existing tests.

Author: gatorsmile

Closes #20536 from gatorsmile/revert22279.

(cherry picked from commit 3473fda6dc77bdfd84b3de95d2082856ad4f8626)
Signed-off-by: Wenchen Fan

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db59e554
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db59e554
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db59e554

Branch: refs/heads/branch-2.3
Commit: db59e554273fe0a54a3223079ff39106fdd1442e
Parents: 2ba07d5
Author: gatorsmile
Authored: Thu Feb 8 12:21:18 2018 +0800
Committer: Wenchen Fan
Committed: Thu Feb 8 12:21:31 2018 +0800

--
 sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/db59e554/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
index d9627eb..93f3f38 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
@@ -109,7 +109,7 @@ private[spark] object HiveUtils extends Logging {
       .doc("When set to true, the built-in ORC reader and writer are used to process " +
         "ORC tables created by using the HiveQL syntax, instead of Hive serde.")
       .booleanConf
-      .createWithDefault(true)
+      .createWithDefault(false)

   val HIVE_METASTORE_SHARED_PREFIXES = buildConf("spark.sql.hive.metastore.sharedPrefixes")
     .doc("A comma separated list of class prefixes that should be loaded using the classloader " +

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert [SPARK-22279][SQL] Turn on spark.sql.hive.convertMetastoreOrc by default
Repository: spark
Updated Branches:
  refs/heads/master a62f30d3f -> 3473fda6d

Revert [SPARK-22279][SQL] Turn on spark.sql.hive.convertMetastoreOrc by default

## What changes were proposed in this pull request?

This is to revert the changes made in https://github.com/apache/spark/pull/19499, because this causes a regression. We should not ignore the table-specific compression conf when the Hive serde tables are converted to the data source tables.

## How was this patch tested?

The existing tests.

Author: gatorsmile

Closes #20536 from gatorsmile/revert22279.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3473fda6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3473fda6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3473fda6

Branch: refs/heads/master
Commit: 3473fda6dc77bdfd84b3de95d2082856ad4f8626
Parents: a62f30d
Author: gatorsmile
Authored: Thu Feb 8 12:21:18 2018 +0800
Committer: Wenchen Fan
Committed: Thu Feb 8 12:21:18 2018 +0800

--
 sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/3473fda6/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
index d9627eb..93f3f38 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala
@@ -109,7 +109,7 @@ private[spark] object HiveUtils extends Logging {
       .doc("When set to true, the built-in ORC reader and writer are used to process " +
         "ORC tables created by using the HiveQL syntax, instead of Hive serde.")
       .booleanConf
-      .createWithDefault(true)
+      .createWithDefault(false)

   val HIVE_METASTORE_SHARED_PREFIXES = buildConf("spark.sql.hive.metastore.sharedPrefixes")
     .doc("A comma separated list of class prefixes that should be loaded using the classloader " +

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
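With the default back to false, Hive serde ORC tables take the Hive serde path again. A hedged PySpark sketch of opting back in per session — the config key comes from the patch above; the session setup is illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# Opt back in to the built-in ORC reader/writer for Hive serde ORC tables;
# after this revert the default is false.
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "true")
```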
svn commit: r24815 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_07_20_01-a62f30d-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Thu Feb 8 04:15:19 2018
New Revision: 24815

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_07_20_01-a62f30d docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23319][TESTS][FOLLOWUP] Fix a test for Python 3 without pandas.
Repository: spark
Updated Branches:
  refs/heads/master 30295bf5a -> a62f30d3f

[SPARK-23319][TESTS][FOLLOWUP] Fix a test for Python 3 without pandas.

## What changes were proposed in this pull request?

This is a followup pr of #20487.

When a module being imported doesn't exist, the error message differs slightly between Python 2 and 3.

E.g., in Python 2:

```
No module named pandas
```

in Python 3:

```
No module named 'pandas'
```

So, one test that checks for an import error fails in Python 3 without pandas. This pr fixes it.

## How was this patch tested?

Tested manually in my local environment.

Author: Takuya UESHIN

Closes #20538 from ueshin/issues/SPARK-23319/fup1.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a62f30d3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a62f30d3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a62f30d3

Branch: refs/heads/master
Commit: a62f30d3fa032ff75bc2b7bebbd0813e67ea5fd5
Parents: 30295bf
Author: Takuya UESHIN
Authored: Thu Feb 8 12:46:10 2018 +0900
Committer: hyukjinkwon
Committed: Thu Feb 8 12:46:10 2018 +0900

--
 python/pyspark/sql/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a62f30d3/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 58359b6..90ff084 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2860,7 +2860,7 @@ class SQLTests(ReusedSQLTestCase):
         with QuietTest(self.sc):
             with self.assertRaisesRegexp(
                     ImportError,
-                    '(Pandas >= .* must be installed|No module named pandas)'):
+                    "(Pandas >= .* must be installed|No module named '?pandas'?)"):
                 import pandas as pd
                 from datetime import datetime
                 pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)],

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
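A quick standalone check of the widened pattern against both phrasings (a sketch, not part of the patch):

```python
import re

pattern = "(Pandas >= .* must be installed|No module named '?pandas'?)"
# The optional quotes ('?) make the regex accept both interpreters' wording.
assert re.search(pattern, "No module named pandas")    # Python 2 wording
assert re.search(pattern, "No module named 'pandas'")  # Python 3 wording
```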
svn commit: r24813 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_07_18_01-2ba07d5-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Thu Feb 8 02:15:09 2018
New Revision: 24813

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_02_07_18_01-2ba07d5 docs

[This commit notification would consist of 1442 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23300][TESTS][BRANCH-2.3] Prints out if Pandas and PyArrow are installed or not in PySpark SQL tests
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 05239afc9 -> 2ba07d5b1

[SPARK-23300][TESTS][BRANCH-2.3] Prints out if Pandas and PyArrow are installed or not in PySpark SQL tests

This PR backports https://github.com/apache/spark/pull/20473 to branch-2.3.

Author: hyukjinkwon

Closes #20533 from HyukjinKwon/backport-20473.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2ba07d5b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2ba07d5b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2ba07d5b

Branch: refs/heads/branch-2.3
Commit: 2ba07d5b101c44382e0db6d660da756c2f5ce627
Parents: 05239af
Author: hyukjinkwon
Authored: Thu Feb 8 09:29:31 2018 +0900
Committer: hyukjinkwon
Committed: Thu Feb 8 09:29:31 2018 +0900

--
 python/run-tests.py | 56 +++-
 1 file changed, 55 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/2ba07d5b/python/run-tests.py
--
diff --git a/python/run-tests.py b/python/run-tests.py
index 1341086..3539c76 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -31,6 +31,7 @@ if sys.version < '3':
     import Queue
 else:
     import queue as Queue
+from distutils.version import LooseVersion


 # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
@@ -39,7 +40,7 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../de
 from sparktestsupport import SPARK_HOME  # noqa (suppress pep8 warnings)
 from sparktestsupport.shellutils import which, subprocess_check_output  # noqa
-from sparktestsupport.modules import all_modules  # noqa
+from sparktestsupport.modules import all_modules, pyspark_sql  # noqa

 python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root')
@@ -151,6 +152,55 @@ def parse_opts():
     return opts


+def _check_dependencies(python_exec, modules_to_test):
+    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
+    # explicitly prints out. See SPARK-23300.
+    if pyspark_sql in modules_to_test:
+        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
+        minimum_pyarrow_version = '0.8.0'
+        minimum_pandas_version = '0.19.2'
+
+        try:
+            pyarrow_version = subprocess_check_output(
+                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
+                LOGGER.info("Will test PyArrow related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip PyArrow related features against Python executable "
+                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
+        except:
+            LOGGER.warning(
+                "Will skip PyArrow related features against Python executable "
+                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))
+
+        try:
+            pandas_version = subprocess_check_output(
+                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
+                LOGGER.info("Will test Pandas related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip Pandas related features against Python executable "
+                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
+        except:
+            LOGGER.warning(
+                "Will skip Pandas related features against Python executable "
+                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
+
+
 def main():
     opts = parse_opts()
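The core pattern above is to interrogate the target interpreter rather than the one driving the tests. A distilled sketch using plain subprocess in place of sparktestsupport's wrapper (function name and interpreter path are illustrative):

```python
import os
import subprocess
from distutils.version import LooseVersion


def probe_version(python_exec, package):
    # Run the *target* interpreter so we see the packages it would import,
    # not those of the interpreter running this script. Returns None when
    # the import fails, mirroring the "was not found" branch above.
    try:
        return subprocess.check_output(
            [python_exec, "-c", "import %s; print(%s.__version__)" % (package, package)],
            universal_newlines=True, stderr=open(os.devnull, 'w')).strip()
    except (subprocess.CalledProcessError, OSError):
        return None


version = probe_version("python3", "pandas")
if version is not None and LooseVersion(version) >= LooseVersion("0.19.2"):
    print("Pandas %s is new enough" % version)
```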
svn commit: r24808 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_07_16_01-30295bf-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Thu Feb 8 00:14:48 2018
New Revision: 24808

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_07_16_01-30295bf docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23092][SQL] Migrate MemoryStream to DataSourceV2 APIs
Repository: spark
Updated Branches:
  refs/heads/master 9841ae031 -> 30295bf5a

[SPARK-23092][SQL] Migrate MemoryStream to DataSourceV2 APIs

## What changes were proposed in this pull request?

This PR migrates the MemoryStream to DataSourceV2 APIs.

One additional change is in the reported keys in StreamingQueryProgress.durationMs: "getOffset" and "getBatch" are replaced with "setOffsetRange" and "getEndOffset", as tracking these makes more sense. Unit tests changed accordingly.

## How was this patch tested?

Existing unit tests, few updated unit tests.

Author: Tathagata Das
Author: Burak Yavuz

Closes #20445 from tdas/SPARK-23092.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30295bf5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30295bf5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30295bf5

Branch: refs/heads/master
Commit: 30295bf5a6754d0ae43334f7bf00e7a29ed0f1af
Parents: 9841ae0
Author: Tathagata Das
Authored: Wed Feb 7 15:22:53 2018 -0800
Committer: Tathagata Das
Committed: Wed Feb 7 15:22:53 2018 -0800

--
 .../sql/execution/streaming/LongOffset.scala    |   4 +-
 .../streaming/MicroBatchExecution.scala         |  27 ++--
 .../spark/sql/execution/streaming/memory.scala  | 132 +++
 .../streaming/sources/RateStreamSourceV2.scala  |   2 +-
 .../execution/streaming/ForeachSinkSuite.scala  |  55 +++-
 .../spark/sql/streaming/StreamSuite.scala       |   8 +-
 .../apache/spark/sql/streaming/StreamTest.scala |   2 +-
 .../streaming/StreamingQueryListenerSuite.scala |   5 +-
 .../sql/streaming/StreamingQuerySuite.scala     |  70 ++
 9 files changed, 171 insertions(+), 134 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/30295bf5/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala
index 5f0b195..3ff5b86 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala
@@ -17,10 +17,12 @@

 package org.apache.spark.sql.execution.streaming

+import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2}
+
 /**
  * A simple offset for sources that produce a single linear stream of data.
  */
-case class LongOffset(offset: Long) extends Offset {
+case class LongOffset(offset: Long) extends OffsetV2 {

   override val json = offset.toString

http://git-wip-us.apache.org/repos/asf/spark/blob/30295bf5/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
index d9aa857..045d2b4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
@@ -270,16 +270,17 @@ class MicroBatchExecution(
         }
       case s: MicroBatchReader =>
         updateStatusMessage(s"Getting offsets from $s")
-        reportTimeTaken("getOffset") {
-          // Once v1 streaming source execution is gone, we can refactor this away.
-          // For now, we set the range here to get the source to infer the available end offset,
-          // get that offset, and then set the range again when we later execute.
-          s.setOffsetRange(
-            toJava(availableOffsets.get(s).map(off => s.deserializeOffset(off.json))),
-            Optional.empty())
-
-          (s, Some(s.getEndOffset))
+        reportTimeTaken("setOffsetRange") {
+          // Once v1 streaming source execution is gone, we can refactor this away.
+          // For now, we set the range here to get the source to infer the available end offset,
+          // get that offset, and then set the range again when we later execute.
+          s.setOffsetRange(
+            toJava(availableOffsets.get(s).map(off => s.deserializeOffset(off.json))),
+            Optional.empty())
         }
+
+        val currentOffset = reportTimeTaken("getEndOffset") { s.getEndOffset() }
+        (s, Option(currentOffset))
     }.toMap
     availableOffsets ++= latestOffsets.filter { case (_, o) => o.nonEmpty }.mapValues(_.get)
@@ -401,10 +402,14 @@ class MicroBatchExecution(
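The renamed keys are visible from any client via StreamingQueryProgress. A hedged PySpark sketch of where they show up — the built-in 'rate' source keeps it self-contained, though whether both keys appear depends on the source's reader implementation:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
query = (spark.readStream.format("rate").load()
         .writeStream.format("console").start())
query.processAllAvailable()

durations = query.lastProgress["durationMs"]
# After this change, v2 sources report these instead of "getOffset"/"getBatch".
for key in ("setOffsetRange", "getEndOffset"):
    print(key, durations.get(key))
query.stop()
spark.stop()
```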
svn commit: r24795 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_07_12_01-9841ae0-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Wed Feb 7 20:20:36 2018
New Revision: 24795

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_07_12_01-9841ae0 docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r24793 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_07_10_01-05239af-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Wed Feb 7 18:17:45 2018
New Revision: 24793

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_02_07_10_01-05239af docs

[This commit notification would consist of 1442 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23345][SQL] Remove open stream record even closing it fails
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 cb22e830b -> 05239afc9

[SPARK-23345][SQL] Remove open stream record even closing it fails

## What changes were proposed in this pull request?

When `DebugFilesystem` closes an opened stream, if any exception occurs, we still need to remove the open stream record from `DebugFilesystem`. Otherwise, it goes on to report a leaked filesystem connection.

## How was this patch tested?

Existing tests.

Author: Liang-Chi Hsieh

Closes #20524 from viirya/SPARK-23345.

(cherry picked from commit 9841ae0313cbee1f083f131f9446808c90ed5a7b)
Signed-off-by: gatorsmile

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/05239afc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/05239afc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/05239afc

Branch: refs/heads/branch-2.3
Commit: 05239afc9e62ef4c71c9f22a930e73888985510a
Parents: cb22e83
Author: Liang-Chi Hsieh
Authored: Wed Feb 7 09:48:49 2018 -0800
Committer: gatorsmile
Committed: Wed Feb 7 09:48:57 2018 -0800

--
 core/src/test/scala/org/apache/spark/DebugFilesystem.scala    | 7 +--
 .../scala/org/apache/spark/sql/test/SharedSparkSession.scala  | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/05239afc/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
--
diff --git a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
index 91355f7..a5bdc95 100644
--- a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
+++ b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
@@ -103,8 +103,11 @@ class DebugFilesystem extends LocalFileSystem {
     override def markSupported(): Boolean = wrapped.markSupported()

     override def close(): Unit = {
-      wrapped.close()
-      removeOpenStream(wrapped)
+      try {
+        wrapped.close()
+      } finally {
+        removeOpenStream(wrapped)
+      }
     }

     override def read(): Int = wrapped.read()

http://git-wip-us.apache.org/repos/asf/spark/blob/05239afc/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
index 0b4629a..e758c86 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
@@ -111,7 +111,7 @@ trait SharedSparkSession
     spark.sharedState.cacheManager.clearCache()
     // files can be closed from other threads, so wait a bit
     // normally this doesn't take more than 1s
-    eventually(timeout(10.seconds)) {
+    eventually(timeout(10.seconds), interval(2.seconds)) {
       DebugFilesystem.assertNoOpenStreams()
     }
   }

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23345][SQL] Remove open stream record even closing it fails
Repository: spark
Updated Branches:
  refs/heads/master 71cfba04a -> 9841ae031

[SPARK-23345][SQL] Remove open stream record even closing it fails

## What changes were proposed in this pull request?

When `DebugFilesystem` closes an opened stream, if any exception occurs, we still need to remove the open stream record from `DebugFilesystem`. Otherwise, it goes on to report a leaked filesystem connection.

## How was this patch tested?

Existing tests.

Author: Liang-Chi Hsieh

Closes #20524 from viirya/SPARK-23345.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9841ae03
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9841ae03
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9841ae03

Branch: refs/heads/master
Commit: 9841ae0313cbee1f083f131f9446808c90ed5a7b
Parents: 71cfba0
Author: Liang-Chi Hsieh
Authored: Wed Feb 7 09:48:49 2018 -0800
Committer: gatorsmile
Committed: Wed Feb 7 09:48:49 2018 -0800

--
 core/src/test/scala/org/apache/spark/DebugFilesystem.scala    | 7 +--
 .../scala/org/apache/spark/sql/test/SharedSparkSession.scala  | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/9841ae03/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
--
diff --git a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
index 91355f7..a5bdc95 100644
--- a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
+++ b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala
@@ -103,8 +103,11 @@ class DebugFilesystem extends LocalFileSystem {
     override def markSupported(): Boolean = wrapped.markSupported()

     override def close(): Unit = {
-      wrapped.close()
-      removeOpenStream(wrapped)
+      try {
+        wrapped.close()
+      } finally {
+        removeOpenStream(wrapped)
+      }
     }

     override def read(): Int = wrapped.read()

http://git-wip-us.apache.org/repos/asf/spark/blob/9841ae03/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
index 0b4629a..e758c86 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala
@@ -111,7 +111,7 @@ trait SharedSparkSession
     spark.sharedState.cacheManager.clearCache()
     // files can be closed from other threads, so wait a bit
     // normally this doesn't take more than 1s
-    eventually(timeout(10.seconds)) {
+    eventually(timeout(10.seconds), interval(2.seconds)) {
       DebugFilesystem.assertNoOpenStreams()
     }
   }

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
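The fix is the standard close-then-deregister idiom. A minimal Python analogue of the same idea — the tracking set and wrapper class are illustrative, not Spark API:

```python
_open_streams = set()


class TrackedStream(object):
    def __init__(self, wrapped):
        self.wrapped = wrapped
        _open_streams.add(wrapped)

    def close(self):
        # Deregister even when close() raises, so a failed close is not
        # later misreported as a leaked stream.
        try:
            self.wrapped.close()
        finally:
            _open_streams.discard(self.wrapped)
```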
svn commit: r24788 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_07_08_01-71cfba0-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Wed Feb 7 16:19:52 2018
New Revision: 24788

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_07_08_01-71cfba0 docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to the summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23319][TESTS] Explicitly specify Pandas and PyArrow versions in PySpark tests (to skip or test)
Repository: spark
Updated Branches:
  refs/heads/master 9775df67f -> 71cfba04a

[SPARK-23319][TESTS] Explicitly specify Pandas and PyArrow versions in PySpark tests (to skip or test)

## What changes were proposed in this pull request?

This PR proposes to explicitly specify Pandas and PyArrow versions in PySpark tests to skip or test. We declared the extra dependencies:

https://github.com/apache/spark/blob/b8bfce51abf28c66ba1fc67b0f25fe1617c81025/python/setup.py#L204

In case of PyArrow:

Currently we only check whether PyArrow is installed, without checking its version, and tests already fail to run. For example, if PyArrow 0.7.0 is installed:

```
==
ERROR: test_vectorized_udf_wrong_return_type (pyspark.sql.tests.ScalarPandasUDF)
--
Traceback (most recent call last):
  File "/.../spark/python/pyspark/sql/tests.py", line 4019, in test_vectorized_udf_wrong_return_type
    f = pandas_udf(lambda x: x * 1.0, MapType(LongType(), LongType()))
  File "/.../spark/python/pyspark/sql/functions.py", line 2309, in pandas_udf
    return _create_udf(f=f, returnType=return_type, evalType=eval_type)
  File "/.../spark/python/pyspark/sql/udf.py", line 47, in _create_udf
    require_minimum_pyarrow_version()
  File "/.../spark/python/pyspark/sql/utils.py", line 132, in require_minimum_pyarrow_version
    "however, your version was %s." % pyarrow.__version__)
ImportError: pyarrow >= 0.8.0 must be installed on calling Python process; however, your version was 0.7.0.
--
Ran 33 tests in 8.098s

FAILED (errors=33)
```

In case of Pandas:

There are a few tests for old Pandas which were tested only when the Pandas version was lower, and I rewrote them to be tested both when the Pandas version is lower and when Pandas is missing.

## How was this patch tested?

Manually tested by modifying the condition:

```
test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 1.19.2 must be installed; however, your version was 0.19.2.'
test_createDataFrame_does_not_modify_input (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 1.19.2 must be installed; however, your version was 0.19.2.'
test_createDataFrame_respect_session_timezone (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 1.19.2 must be installed; however, your version was 0.19.2.'
```

```
test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 0.19.2 must be installed; however, it was not found.'
test_createDataFrame_does_not_modify_input (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 0.19.2 must be installed; however, it was not found.'
test_createDataFrame_respect_session_timezone (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 0.19.2 must be installed; however, it was not found.'
```

```
test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 1.8.0 must be installed; however, your version was 0.8.0.'
test_createDataFrame_does_not_modify_input (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 1.8.0 must be installed; however, your version was 0.8.0.'
test_createDataFrame_respect_session_timezone (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 1.8.0 must be installed; however, your version was 0.8.0.'
```

```
test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 0.8.0 must be installed; however, it was not found.'
test_createDataFrame_does_not_modify_input (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 0.8.0 must be installed; however, it was not found.'
test_createDataFrame_respect_session_timezone (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 0.8.0 must be installed; however, it was not found.'
```

Author: hyukjinkwon

Closes #20487 from HyukjinKwon/pyarrow-pandas-skip.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71cfba04
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71cfba04
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71cfba04

Branch: refs/heads/master
Commit: 71cfba04aeec5ae9b85a507b13996e80f8750edc
Parents: 9775df6
Author: hyukjinkwon
Authored: Wed Feb 7 23:28:10 2018 +0900
Committer: hyukjinkwon
Committed: Wed Feb 7 23:28:10 2018 +0900

--
 pom.xml                         |  4 ++
 python/pyspark/sql/dataframe.py |  3 ++
 python/pyspark/sql/session.py   |  3 ++
 python/pyspark/sql/tests.py     | 87 
 python/pyspark/sql/utils.py     | 30 +
 python/setup.py                 | 10 -
 6 files changed, 89 insertions(+), 48 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/
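The flags and messages computed in tests.py feed straight into unittest's skip machinery. A sketch of the gating pattern — the flag values are stubbed here so the snippet runs standalone; in tests.py they come from the version checks quoted earlier in this digest:

```python
import unittest

# Stubbed: in tests.py these are derived from require_minimum_pandas_version()
# and require_minimum_pyarrow_version() at import time.
_have_pandas = False
_pandas_requirement_message = "Pandas >= 0.19.2 must be installed; however, it was not found."
_have_pyarrow = False
_pyarrow_requirement_message = "PyArrow >= 0.8.0 must be installed; however, it was not found."


@unittest.skipIf(
    not _have_pandas or not _have_pyarrow,
    _pandas_requirement_message or _pyarrow_requirement_message)
class ArrowTests(unittest.TestCase):
    def test_gated_feature(self):
        self.assertTrue(True)  # runs only when both requirements are met


if __name__ == "__main__":
    unittest.main()
```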
spark git commit: [SPARK-23122][PYSPARK][FOLLOWUP] Replace registerTempTable by createOrReplaceTempView
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 874d3f89f -> cb22e830b

[SPARK-23122][PYSPARK][FOLLOWUP] Replace registerTempTable by createOrReplaceTempView

## What changes were proposed in this pull request?

Replace `registerTempTable` by `createOrReplaceTempView`.

## How was this patch tested?

N/A

Author: gatorsmile

Closes #20523 from gatorsmile/updateExamples.

(cherry picked from commit 9775df67f924663598d51723a878557ddafb8cfd)
Signed-off-by: hyukjinkwon

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cb22e830
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cb22e830
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cb22e830

Branch: refs/heads/branch-2.3
Commit: cb22e830b0af3f2d760beffea9a79a6d349e4661
Parents: 874d3f8
Author: gatorsmile
Authored: Wed Feb 7 23:24:16 2018 +0900
Committer: hyukjinkwon
Committed: Wed Feb 7 23:24:30 2018 +0900

--
 python/pyspark/sql/udf.py                                       | 2 +-
 .../src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/cb22e830/python/pyspark/sql/udf.py
--
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py
index 82a28c8..5a848c2 100644
--- a/python/pyspark/sql/udf.py
+++ b/python/pyspark/sql/udf.py
@@ -348,7 +348,7 @@ class UDFRegistration(object):
         >>> spark.udf.registerJavaUDAF("javaUDAF", "test.org.apache.spark.sql.MyDoubleAvg")
         >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"])
-        >>> df.registerTempTable("df")
+        >>> df.createOrReplaceTempView("df")
         >>> spark.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect()
         [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)]
         """

http://git-wip-us.apache.org/repos/asf/spark/blob/cb22e830/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
--
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
index ddbaa45..08dc129 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
@@ -46,7 +46,7 @@ public class JavaUDAFSuite {
   @SuppressWarnings("unchecked")
   @Test
   public void udf1Test() {
-    spark.range(1, 10).toDF("value").registerTempTable("df");
+    spark.range(1, 10).toDF("value").createOrReplaceTempView("df");
     spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName());
     Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head();
     Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6);

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23122][PYSPARK][FOLLOWUP] Replace registerTempTable by createOrReplaceTempView
Repository: spark
Updated Branches:
  refs/heads/master c36fecc3b -> 9775df67f

[SPARK-23122][PYSPARK][FOLLOWUP] Replace registerTempTable by createOrReplaceTempView

## What changes were proposed in this pull request?

Replace `registerTempTable` by `createOrReplaceTempView`.

## How was this patch tested?

N/A

Author: gatorsmile

Closes #20523 from gatorsmile/updateExamples.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9775df67
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9775df67
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9775df67

Branch: refs/heads/master
Commit: 9775df67f924663598d51723a878557ddafb8cfd
Parents: c36fecc
Author: gatorsmile
Authored: Wed Feb 7 23:24:16 2018 +0900
Committer: hyukjinkwon
Committed: Wed Feb 7 23:24:16 2018 +0900

--
 python/pyspark/sql/udf.py                                       | 2 +-
 .../src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/9775df67/python/pyspark/sql/udf.py
--
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py
index 0f759c4..08c6b9e 100644
--- a/python/pyspark/sql/udf.py
+++ b/python/pyspark/sql/udf.py
@@ -356,7 +356,7 @@ class UDFRegistration(object):
         >>> spark.udf.registerJavaUDAF("javaUDAF", "test.org.apache.spark.sql.MyDoubleAvg")
         >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"])
-        >>> df.registerTempTable("df")
+        >>> df.createOrReplaceTempView("df")
         >>> spark.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect()
         [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)]
         """

http://git-wip-us.apache.org/repos/asf/spark/blob/9775df67/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
--
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
index ddbaa45..08dc129 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java
@@ -46,7 +46,7 @@ public class JavaUDAFSuite {
   @SuppressWarnings("unchecked")
   @Test
   public void udf1Test() {
-    spark.range(1, 10).toDF("value").registerTempTable("df");
+    spark.range(1, 10).toDF("value").createOrReplaceTempView("df");
     spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName());
     Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head();
     Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6);

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org