[spark] branch master updated (ad454f46d85 -> 9c19528e337)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from ad454f46d85 [SPARK-42992][PYTHON] Introduce PySparkRuntimeError add 9c19528e337 [SPARK-43076][PS][CONNECT] Removing the dependency on `grpcio` when remote session is not used No new revisions were added by this update. Summary of changes: python/pyspark/pandas/_typing.py | 9 - python/pyspark/pandas/accessors.py | 2 +- python/pyspark/pandas/base.py | 16 +- python/pyspark/pandas/data_type_ops/base.py| 23 ++- python/pyspark/pandas/data_type_ops/binary_ops.py | 16 +- python/pyspark/pandas/data_type_ops/boolean_ops.py | 129 +++- python/pyspark/pandas/data_type_ops/date_ops.py| 51 +++-- python/pyspark/pandas/data_type_ops/num_ops.py | 218 + python/pyspark/pandas/frame.py | 147 +++--- python/pyspark/pandas/indexes/base.py | 45 ++--- python/pyspark/pandas/indexes/multi.py | 8 +- python/pyspark/pandas/indexing.py | 180 ++--- python/pyspark/pandas/internal.py | 176 + python/pyspark/pandas/namespace.py | 31 +-- python/pyspark/pandas/numpy_compat.py | 9 +- python/pyspark/pandas/series.py| 26 +-- python/pyspark/pandas/spark/accessors.py | 49 +++-- python/pyspark/pandas/utils.py | 74 --- python/pyspark/sql/connect/dataframe.py| 4 +- 19 files changed, 699 insertions(+), 514 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
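For context, the thrust of the change above is that pandas-on-Spark should not pull in `grpcio` unless a Connect (remote) session is actually in use. Below is a minimal sketch of that conditional-import pattern, assuming the `is_remote()` helper from `pyspark.sql.utils`; the wrapper function is illustrative, not the actual patch:

```python
# Illustrative sketch only: keep grpc-dependent imports out of the
# classic code path so plain pandas-on-Spark works without grpcio.
from pyspark.sql.utils import is_remote


def default_session():
    if is_remote():
        # Only this branch needs grpcio; importing it lazily avoids the
        # dependency when no remote session is requested.
        from pyspark.sql.connect.session import SparkSession as ConnectSession

        return ConnectSession.builder.getOrCreate()

    from pyspark.sql import SparkSession

    return SparkSession.builder.getOrCreate()
```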
[spark] branch master updated: [SPARK-42992][PYTHON] Introduce PySparkRuntimeError
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ad454f46d85 [SPARK-42992][PYTHON] Introduce PySparkRuntimeError ad454f46d85 is described below commit ad454f46d85e17dc59991ecd6c3326a8c6de8ea3 Author: itholic AuthorDate: Wed Apr 26 13:51:52 2023 +0800 [SPARK-42992][PYTHON] Introduce PySparkRuntimeError ### What changes were proposed in this pull request? This PR proposes to introduce new error for PySpark, and also applied it to existing RuntimeError under `python/pyspark/*.py`. ### Why are the changes needed? To cover the built-in RuntimeError by PySpark error framework. ### Does this PR introduce _any_ user-facing change? No, it's internal error framework improvement. ### How was this patch tested? The existing CI should pass. Closes #40617 from itholic/pyspark_runtime_error. Authored-by: itholic Signed-off-by: Ruifeng Zheng --- python/pyspark/accumulators.py | 15 ++- python/pyspark/broadcast.py| 23 +++- python/pyspark/conf.py | 6 +- python/pyspark/context.py | 40 +++--- python/pyspark/errors/__init__.py | 2 + python/pyspark/errors/error_classes.py | 137 + python/pyspark/errors/exceptions/base.py | 6 + python/pyspark/errors/utils.py | 4 +- python/pyspark/java_gateway.py | 18 ++- python/pyspark/profiler.py | 6 +- python/pyspark/rdd.py | 33 +++-- .../sql/tests/pandas/test_pandas_grouped_map.py| 2 +- python/pyspark/taskcontext.py | 30 +++-- python/pyspark/tests/test_context.py | 2 +- python/pyspark/tests/test_profiler.py | 27 ++-- python/pyspark/util.py | 13 +- python/pyspark/worker.py | 75 ++- 17 files changed, 346 insertions(+), 93 deletions(-) diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index ce4bb561814..dc8520a844d 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -23,6 +23,7 @@ import threading from typing import Callable, Dict, Generic, Tuple, Type, TYPE_CHECKING, TypeVar, Union from pyspark.serializers import read_int, CPickleSerializer +from pyspark.errors import PySparkRuntimeError if TYPE_CHECKING: from pyspark._typing import SupportsIAdd # noqa: F401 @@ -140,14 +141,24 @@ class Accumulator(Generic[T]): def value(self) -> T: """Get the accumulator's value; only usable in driver program""" if self._deserialized: -raise RuntimeError("Accumulator.value cannot be accessed inside tasks") +raise PySparkRuntimeError( +error_class="VALUE_NOT_ACCESSIBLE", +message_parameters={ +"value": "Accumulator.value", +}, +) return self._value @value.setter def value(self, value: T) -> None: """Sets the accumulator's value; only usable in driver program""" if self._deserialized: -raise RuntimeError("Accumulator.value cannot be accessed inside tasks") +raise PySparkRuntimeError( +error_class="VALUE_NOT_ACCESSIBLE", +message_parameters={ +"value": "Accumulator.value", +}, +) self._value = value def add(self, term: T) -> None: diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index b1a9b790af5..a72bf1e059b 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -40,6 +40,7 @@ from typing.io import BinaryIO # type: ignore[import] from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import ChunkedStream, pickle_protocol from pyspark.util import print_exec +from pyspark.errors import PySparkRuntimeError if TYPE_CHECKING: from pyspark import SparkContext @@ -58,7 +59,12 @@ def 
_from_id(bid: int) -> "Broadcast[Any]": from pyspark.broadcast import _broadcastRegistry if bid not in _broadcastRegistry: -raise RuntimeError("Broadcast variable '%s' not loaded!" % bid) +raise PySparkRuntimeError( +error_class="BROADCAST_VARIABLE_NOT_LOADED", +message_parameters={ +"variable": str(bid), +}, +) return _broadcastRegistry[bid] @@ -293,7 +299,10 @@ class Broadcast(Generic[T]): >>> b.unpersist() """ if self._jbroadcast is None: -raise RuntimeError("Broadcast can
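As a quick illustration of what callers gain from the new error type (not part of the patch; the accessor names come from the PySpark error framework's base exception class, and the error class mirrors the accumulator hunk above):

```python
from pyspark.errors import PySparkRuntimeError

try:
    # Same error class and parameters as the accumulator change above.
    raise PySparkRuntimeError(
        error_class="VALUE_NOT_ACCESSIBLE",
        message_parameters={"value": "Accumulator.value"},
    )
except PySparkRuntimeError as e:
    # Structured handling instead of matching on message strings.
    print(e.getErrorClass())         # VALUE_NOT_ACCESSIBLE
    print(e.getMessageParameters())  # {'value': 'Accumulator.value'}
```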
[spark] branch master updated (b461cdea92e -> eae91ee3c96)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from b461cdea92e [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery count bug test with decorrelateInnerQuery disabled add eae91ee3c96 [SPARK-43274][SPARK-43275][PYTHON][CONNECT] Introduce `PySparkNotImplementedError` No new revisions were added by this update. Summary of changes: python/pyspark/errors/__init__.py | 2 ++ python/pyspark/errors/error_classes.py | 17 +- python/pyspark/errors/exceptions/base.py | 6 python/pyspark/sql/connect/group.py| 37 ++ python/pyspark/sql/connect/udf.py | 5 ++- .../sql/tests/connect/test_connect_basic.py| 14 +--- .../sql/tests/pandas/test_pandas_grouped_map.py| 5 ++- python/pyspark/sql/udf.py | 5 ++- 8 files changed, 70 insertions(+), 21 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
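A hedged sketch of the pattern this change applies: a bare `NotImplementedError` becomes a parameterized PySpark error. The error class name and feature string below are illustrative, not necessarily the ones used in the patch:

```python
from pyspark.errors import PySparkNotImplementedError


def apply_in_pandas_with_state(*args, **kwargs):
    # Illustrative: report an unsupported Connect feature through the
    # error framework instead of a plain NotImplementedError.
    raise PySparkNotImplementedError(
        error_class="NOT_IMPLEMENTED",
        message_parameters={"feature": "applyInPandasWithState()"},
    )
```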
[spark] branch master updated: [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery count bug test with decorrelateInnerQuery disabled
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b461cdea92e [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery count bug test with decorrelateInnerQuery disabled b461cdea92e is described below commit b461cdea92ea08ce39bb3c9d733f0af7c56abf8d Author: Jack Chen AuthorDate: Wed Apr 26 12:14:24 2023 +0800 [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery count bug test with decorrelateInnerQuery disabled ### What changes were proposed in this pull request? Extend a test to test with both the DecorrelateInnerQuery framework on and off, since this came up in both https://github.com/apache/spark/pull/40865 and https://github.com/apache/spark/pull/40811. (The default is on.) ### Why are the changes needed? Improve test coverage ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Test itself Closes #40946 from jchen5/subq-count-test. Authored-by: Jack Chen Signed-off-by: Wenchen Fan --- .../inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql index 0ca2f07b301..df88b8eb74d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql @@ -1,3 +1,6 @@ +--CONFIG_DIM1 spark.sql.optimizer.decorrelateInnerQuery.enabled=true +--CONFIG_DIM1 spark.sql.optimizer.decorrelateInnerQuery.enabled=false + create temp view l (a, b) as values (1, 2.0), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
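The `--CONFIG_DIM1` header makes the golden-file runner execute the whole test file once per listed value. A rough interactive equivalent is sketched below, assuming the config is settable at runtime; the second view and the query are illustrative, not taken from the test file:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# The left-hand view mirrors the one created at the top of the test file.
spark.sql("CREATE OR REPLACE TEMP VIEW l (a, b) AS VALUES (1, 2.0), (2, 1.0)")
spark.sql("CREATE OR REPLACE TEMP VIEW r (c, d) AS VALUES (2, 3.0)")

for enabled in ("true", "false"):
    spark.conf.set("spark.sql.optimizer.decorrelateInnerQuery.enabled", enabled)
    # A correlated scalar subquery with COUNT, the shape affected by the "count bug".
    spark.sql(
        "SELECT a, (SELECT COUNT(*) FROM r WHERE r.c = l.a) AS cnt FROM l"
    ).show()
```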
[spark] branch master updated (014685c41e4 -> 55865bd1756)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 014685c41e4 [SPARK-43277][YARN] Clean up deprecation hadoop api usage in `yarn` module add 55865bd1756 [SPARK-43276][CONNECT][PYTHON] Migrate Spark Connect Window errors into error class No new revisions were added by this update. Summary of changes: python/pyspark/errors/error_classes.py | 5 + python/pyspark/sql/connect/window.py | 21 + 2 files changed, 18 insertions(+), 8 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow
Yikun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522735459

> As you know, the Apache Spark dev mailing list is the only official way to build a community consensus; none of the other channels (GitHub or even the user mailing list) can do that. Please shoot the email. :)

Sure! I will send the mail this week!
[spark] branch master updated: [SPARK-43277][YARN] Clean up deprecation hadoop api usage in `yarn` module
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 014685c41e4 [SPARK-43277][YARN] Clean up deprecation hadoop api usage in `yarn` module 014685c41e4 is described below commit 014685c41e4741f83570d8a2a6a253e48967919a Author: yangjie01 AuthorDate: Tue Apr 25 22:12:35 2023 -0500 [SPARK-43277][YARN] Clean up deprecation hadoop api usage in `yarn` module ### What changes were proposed in this pull request? `yarn` module has the following compilation warnings related to the Hadoop API: ``` [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala:157: [deprecation org.apache.spark.deploy.yarn.ApplicationMaster.prepareLocalResources.setupDistributedCache | origin=org.apache.hadoop.yarn.util.ConverterUtils.getYarnUrlFromURI | version=] method getYarnUrlFromURI in class ConverterUtils is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala:292: [deprecation org.apache.spark.deploy.yarn.Client.createApplicationSubmissionContext | origin=org.apache.hadoop.yarn.api.records.Resource.setMemory | version=] method setMemory in class Resource is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala:307: [deprecation org.apache.spark.deploy.yarn.Client.createApplicationSubmissionContext | origin=org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext.setAMContainerResourceRequest | version=] method setAMContainerResourceRequest in class ApplicationSubmissionContext is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala:392: [deprecation org.apache.spark.deploy.yarn.Client.verifyClusterResources.maxMem | origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method getMemory in class Resource is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala:76: [deprecation org.apache.spark.deploy.yarn.ClientDistributedCacheManager.addResource | origin=org.apache.hadoop.yarn.util.ConverterUtils.getYarnUrlFromPath | version=] method getYarnUrlFromPath in class ConverterUtils is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala:510: [deprecation org.apache.spark.deploy.yarn.YarnAllocator.updateResourceRequests.$anonfun.requestContainerMessage | origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method getMemory in class Resource is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala:737: [deprecation org.apache.spark.deploy.yarn.YarnAllocator.runAllocatedContainers.$anonfun | origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method getMemory in class Resource is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala:737: [deprecation org.apache.spark.deploy.yarn.YarnAllocator.runAllocatedContainers.$anonfun | origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method getMemory in class Resource is 
deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala:202: [deprecation org.apache.spark.deploy.yarn.YarnSparkHadoopUtil.getContainerId | origin=org.apache.hadoop.yarn.util.ConverterUtils.toContainerId | version=] method toContainerId in class ConverterUtils is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/util/YarnContainerInfoHelper.scala:75: [deprecation org.apache.spark.util.YarnContainerInfoHelper.getAttributes | origin=org.apache.hadoop.yarn.util.ConverterUtils.toString | version=] method toString in class ConverterUtils is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala:83: [deprecation org.apache.spark.deploy.yarn.ClientDistributedCacheManagerSuite..$org_scalatest_assert_macro_expr.$org_scalatest_assert_macro_left | origin=org.apache.hadoop.yarn.util.ConverterUtils.getPathFromYarnURL | version=] method getPathFromYarnURL in class ConverterUtils is deprecated [WARNING] [Warn] /${spark-source-dir}/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala:105: [deprecation
[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow
dongjoon-hyun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522655970

@Yikun. We cannot simply remove things the way this PR does. The official way to adopt a new standard is to send an email to the dev mailing list and build a community consensus.

> The reason we removed v is to follow the [DOI rules](https://github.com/docker-library/official-images#tags-and-aliases). If we still think v should be kept in future, we can keep both (pointing to the new images).

As you know, the Apache Spark dev mailing list is the only official way to build a community consensus; none of the other channels (GitHub or even the user mailing list) can do that. Please shoot the email. :)
[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow
Yikun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522651856

> I believe having both 3.4.0 and v3.4.0 only causes confusion for users. In the Apache Spark community, that's why I propose keeping only the v3.4.0 tag format (whichever process we use; manual or GitHub).

The reason we removed `v` is to follow the [DOI rules](https://github.com/docker-library/official-images#tags-and-aliases). If we still think `v` should be kept **in future**, we can keep both (pointing to the new images).

> Given (1) and (2), I don't think this PR claims to maintain the existing manual publish process. In other words, this PR has already burned the bridge.

At this point, the options below might be ways to continue the work:

Solution 1:
1. Switch Java 17 to the default in spark-docker: https://github.com/apache/spark-docker/pull/35
2. Keep v3.4.0 and 3.4.0 pointing to the GA-published image.
3. Keep this PR to replace the manual process.

Solution 2:
1. Restore `latest` to the manually published image for the 3.4.0 images.
2. Revert this PR (restore the manual process and also keep the GA publish steps in this PR's doc).
3. `v3.4.0` points to the manual image, `3.4.0` points to the GA-published image.
4. Keep both the manual image and the GA-published image in the 3.4.0 series, and unify all images onto the GA-published image in 3.5.0.

@dongjoon-hyun @HyukjinKwon @zhengruifeng WDYT?
[spark] branch master updated (b26844ce879 -> a2a5299cb2d)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from b26844ce879 [SPARK-43136][CONNECT] Adding groupByKey + mapGroup + coGroup functions add a2a5299cb2d [MINOR][BUILD] Correct the error message in `dev/connect-check-protos.py` No new revisions were added by this update. Summary of changes: dev/connect-check-protos.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow
dongjoon-hyun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522621830

1. If we are going to maintain the manual process, why do you remove the old description in this PR?

> For the tag, it's intentional. In 3.4.0, the v3.4.0 tag and the 3.4.0-series tags will co-exist.

2. To be clear, I really appreciate your contribution because this is good enough to reduce the Apache Spark release managers' burden.

Given (1) and (2), I don't think this PR claims to maintain the existing manual publish process. In other words, this PR has already burned the bridge.

I believe having both `3.4.0` and `v3.4.0` only causes confusion for users. In the Apache Spark community, that's why I propose keeping only the `v3.4.0` tag format (whichever process we use; manual or GitHub).
[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow
Yikun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522546716

Yes, but I meant we still need to update it in apache/spark-docker.
[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow
dongjoon-hyun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522543950

It's already Java 17 by default.

> For Java 17, I think we can switch to Java 17 in the docker image.
[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow
Yikun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522543145

Thanks for catching this and reminding us, @dongjoon-hyun!

For Java 17, I think we can switch to Java 17 in the docker image.

For the tag, it's intentional. In 3.4.0, the `v3.4.0` tag and the `3.4.0`-series tags will co-exist. It should be a new feature, not a breaking change. (But where the `latest` tag points is a breaking change.) So do you have any suggestions on how we could migrate to the new images? Maybe:
1. 3.4.0 keeps both the previous and new tags, with `latest` pointing to the previous tag.
2. 3.5.0 keeps both the previous and new tags, marks the previous tags as deprecated, and points `latest` to the new tag.
[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow
dongjoon-hyun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522426433 cc @sunchao , too -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow
dongjoon-hyun commented on PR #458: URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522426236

Hi, @Yikun, @HyukjinKwon, @yaooqinn, @zhengruifeng. It seems that this PR introduced two user-facing breaking changes:

- Downgrading Java from 17 to 11

```
$ docker run -it --rm apache/spark:v3.4.0 java -version | tail -n1
OpenJDK 64-Bit Server VM Temurin-17.0.6+10 (build 17.0.6+10, mixed mode, sharing)
```

```
$ docker run -it --rm apache/spark:3.4.0 java -version | tail -n1
OpenJDK 64-Bit Server VM Temurin-11.0.18+10 (build 11.0.18+10, mixed mode, sharing)
```

- Dropping the `v` prefix, `v3.4.0` -> `3.4.0`

If these breaking changes are intentional, I'm wondering if we could discuss the rationale on the `dev` mailing list. Otherwise, I'd like to recommend reverting those breaking changes to avoid any confusion.
[spark] branch master updated (0e6446ac62d -> b26844ce879)
This is an automated email from the ASF dual-hosted git repository. hvanhovell pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 0e6446ac62d [SPARK-43285] Fix ReplE2ESuite consistently failing with JDK 17 add b26844ce879 [SPARK-43136][CONNECT] Adding groupByKey + mapGroup + coGroup functions No new revisions were added by this update. Summary of changes: .../main/scala/org/apache/spark/sql/Dataset.scala | 29 ++ .../apache/spark/sql/KeyValueGroupedDataset.scala | 416 + .../apache/spark/sql/connect/client/UdfUtils.scala | 43 ++- .../sql/KeyValueGroupedDatasetE2ETestSuite.scala | 218 +++ .../CheckConnectJvmClientCompatibility.scala | 19 +- .../main/protobuf/spark/connect/relations.proto| 9 + .../sql/connect/planner/SparkConnectPlanner.scala | 253 ++--- python/pyspark/sql/connect/proto/relations_pb2.py | 24 +- python/pyspark/sql/connect/proto/relations_pb2.pyi | 49 ++- .../spark/sql/catalyst/plans/logical/object.scala | 7 + .../apache/spark/sql/KeyValueGroupedDataset.scala | 7 +- 11 files changed, 996 insertions(+), 78 deletions(-) create mode 100644 connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala create mode 100644 connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-43285] Fix ReplE2ESuite consistently failing with JDK 17
This is an automated email from the ASF dual-hosted git repository. hvanhovell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0e6446ac62d [SPARK-43285] Fix ReplE2ESuite consistently failing with JDK 17 0e6446ac62d is described below commit 0e6446ac62d6f3f7568e6e35b42f8e395bf490dc Author: vicennial AuthorDate: Tue Apr 25 15:10:57 2023 -0400 [SPARK-43285] Fix ReplE2ESuite consistently failing with JDK 17 ### What changes were proposed in this pull request? The timeout duration for the REPL has been increased from 10 -> 30 seconds (to address slow start on JDK 17 tests) and the semaphore permits are drained after each test (to avoid cascading failures, [context](https://github.com/apache/spark/pull/40675#discussion_r1174917132)). ### Why are the changes needed? The GA JDK 17 tests consistently fails as described in the [jira issue](https://issues.apache.org/jira/browse/SPARK-43285). ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Locally verified by installing and running tests with JDK 17 (both the failure and the subsequent fix). Closes #40948 from vicennial/SPARK-43285. Authored-by: vicennial Signed-off-by: Herman van Hovell --- .../scala/org/apache/spark/sql/application/ReplE2ESuite.scala| 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala index f0ec28a5a87..af920f8c314 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala @@ -20,13 +20,14 @@ import java.io.{PipedInputStream, PipedOutputStream} import java.util.concurrent.{Executors, Semaphore, TimeUnit} import org.apache.commons.io.output.ByteArrayOutputStream +import org.scalatest.BeforeAndAfterEach import org.apache.spark.sql.connect.client.util.RemoteSparkSession -class ReplE2ESuite extends RemoteSparkSession { +class ReplE2ESuite extends RemoteSparkSession with BeforeAndAfterEach { private val executorService = Executors.newSingleThreadExecutor() - private val TIMEOUT_SECONDS = 10 + private val TIMEOUT_SECONDS = 30 private var testSuiteOut: PipedOutputStream = _ private var ammoniteOut: ByteArrayOutputStream = _ @@ -68,6 +69,10 @@ class ReplE2ESuite extends RemoteSparkSession { super.afterAll() } + override def afterEach(): Unit = { +semaphore.drainPermits() + } + def runCommandsInShell(input: String): String = { require(input.nonEmpty) // Pad the input with a semaphore release so that we know when the execution of the provided - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
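The Scala fix drains the semaphore between tests so a release left over from one test cannot satisfy the next test's wait. Python's `threading.Semaphore` has no `drainPermits()`, but purely for illustration (not related to the patch itself) the same teardown idea looks like this:

```python
import threading


def drain_permits(sem: threading.Semaphore) -> int:
    """Acquire until the semaphore is exhausted; return how many permits were drained."""
    drained = 0
    while sem.acquire(blocking=False):
        drained += 1
    return drained


semaphore = threading.Semaphore(0)
semaphore.release()                  # e.g. a stray release from a timed-out run
assert drain_permits(semaphore) == 1
```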
[spark] branch master updated (516d7b3d483 -> 9c237d7bc7b)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 516d7b3d483 [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3 add 9c237d7bc7b [SPARK-43225][BUILD][SQL] Remove jackson-core-asl and jackson-mapper-asl from pre-built distribution No new revisions were added by this update. Summary of changes: core/pom.xml | 8 dev/deps/spark-deps-hadoop-3-hive-2.3 | 2 -- pom.xml| 23 ++ .../org/apache/hive/service/cli/CLIService.java| 2 +- .../hive/service/cli/session/HiveSessionImpl.java | 4 ++-- .../cli/session/HiveSessionImplwithUGI.java| 4 ++-- 6 files changed, 7 insertions(+), 36 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 516d7b3d483 [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3 516d7b3d483 is described below commit 516d7b3d483687018fc457ab2fec82c05b169057 Author: YangJie AuthorDate: Tue Apr 25 08:52:54 2023 -0500 [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3 ### What changes were proposed in this pull request? This pr aims upgrade protobuf-java from 3.22.0 to 3.22.3. ### Why are the changes needed? The new version fixed the issue of `NoSuchMethodError` thrown when using Java 8 to run proto compiled with Java 9+ (even if --target 1.8): - https://github.com/protocolbuffers/protobuf/issues/11393 / https://github.com/protocolbuffers/protobuf/pull/12035 The full release notes as follows: - https://github.com/protocolbuffers/protobuf/releases/tag/v22.1 - https://github.com/protocolbuffers/protobuf/releases/tag/v22.2 - https://github.com/protocolbuffers/protobuf/releases/tag/v22.3 - https://github.com/protocolbuffers/protobuf/compare/v3.22.0...v3.22.3 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions Closes #40430 from LuciferYang/SPARK-42798. Lead-authored-by: YangJie Co-authored-by: yangjie01 Signed-off-by: Sean Owen --- pom.xml | 2 +- project/SparkBuild.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index a3b7f57ace9..2cb6ff86bb5 100644 --- a/pom.xml +++ b/pom.xml @@ -124,7 +124,7 @@ 2.5.0 -3.22.0 +3.22.3 3.11.4 ${hadoop.version} 3.6.3 diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index d8a6972bae1..25f5ea4fa85 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -88,7 +88,7 @@ object BuildCommons { // Google Protobuf version used for generating the protobuf. // SPARK-41247: needs to be consistent with `protobuf.version` in `pom.xml`. - val protoVersion = "3.22.0" + val protoVersion = "3.22.3" // GRPC version used for Spark Connect. val gprcVersion = "1.47.0" } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-42419][FOLLOWUP][CONNECT][PYTHON] Remove unused exception
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b59d3af4ca2 [SPARK-42419][FOLLOWUP][CONNECT][PYTHON] Remove unused exception b59d3af4ca2 is described below commit b59d3af4ca244aa9837d0d862012e3bfd380bc05 Author: itholic AuthorDate: Tue Apr 25 19:09:08 2023 +0800 [SPARK-42419][FOLLOWUP][CONNECT][PYTHON] Remove unused exception ### What changes were proposed in this pull request? This is follow-up for https://github.com/apache/spark/pull/39991 to remove unused exception. ### Why are the changes needed? `PySparkTypeError` never raises because we checked the type already in the codes above `if type(startPos) != type(length):` and raise `PySparkTypeError` for unsupported type for `length`. ### Does this PR introduce _any_ user-facing change? No. it's minor code cleanup ### How was this patch tested? The existing CI should pass. Closes #40927 from itholic/connect_col_followup. Authored-by: itholic Signed-off-by: Ruifeng Zheng --- python/pyspark/sql/connect/column.py | 13 ++--- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py index 26cb1477cee..7b60adb1629 100644 --- a/python/pyspark/sql/connect/column.py +++ b/python/pyspark/sql/connect/column.py @@ -239,24 +239,15 @@ class Column: if isinstance(length, Column): length_expr = length._expr +start_expr = startPos._expr # type: ignore[union-attr] elif isinstance(length, int): length_expr = LiteralExpression._from_value(length) -else: -raise PySparkTypeError( -error_class="NOT_COLUMN_OR_INT", -message_parameters={"arg_name": "length", "arg_type": type(length).__name__}, -) - -if isinstance(startPos, Column): -start_expr = startPos._expr -elif isinstance(startPos, int): start_expr = LiteralExpression._from_value(startPos) else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_INT", -message_parameters={"arg_name": "startPos", "arg_type": type(startPos).__name__}, +message_parameters={"arg_name": "length", "arg_type": type(length).__name__}, ) - return Column(UnresolvedFunction("substring", [self._expr, start_expr, length_expr])) substr.__doc__ = PySparkColumn.substr.__doc__ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
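For context, here is a hedged example of the error path that remains after this cleanup; it assumes a Spark Connect session so that `Column.substr` resolves to the Connect implementation shown above:

```python
from pyspark.sql.functions import col
from pyspark.errors import PySparkTypeError

try:
    # Both arguments are strings: neither a Column nor an int, so the
    # remaining else-branch reports NOT_COLUMN_OR_INT for `length`.
    col("name").substr("1", "3")
except PySparkTypeError as e:
    print(e.getErrorClass())  # NOT_COLUMN_OR_INT
```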
[spark] branch master updated: [SPARK-43142] Fix DSL expressions on attributes with special characters
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 98ae33b7acb [SPARK-43142] Fix DSL expressions on attributes with special characters 98ae33b7acb is described below commit 98ae33b7acbd932714301c83d71d42ef318dda9b Author: Willi Raschkowski AuthorDate: Tue Apr 25 18:23:36 2023 +0800 [SPARK-43142] Fix DSL expressions on attributes with special characters Re-attempting #40794. #40794 tried to more safely create `AttributeReference` objects from multi-part attributes in `ImplicitAttribute`. But that broke things and we had to revert. This PR is limiting the fix to the `UnresolvedAttribute` object returned by `DslAttr.attr`, which is enough to fix the issue here. ### What changes were proposed in this pull request? This PR fixes DSL expressions on attributes with special characters by making `DslAttr.attr` and `DslAttr.expr` return the implicitly wrapped attribute instead of creating a new one. ### Why are the changes needed? SPARK-43142: DSL expressions on attributes with special characters don't work even if the attribute names are quoted: ```scala scala> "`slashed/col`".attr res0: org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute = 'slashed/col scala> "`slashed/col`".attr.asc org.apache.spark.sql.catalyst.parser.ParseException: mismatched input '/' expecting {, '.', '-'}(line 1, pos 7) == SQL == slashed/col ---^^^ ``` DSL expressions rely on a call to `expr` to get child of the new expression [(e.g.)](https://github.com/apache/spark/blob/87a5442f7ed96b11051d8a9333476d080054e5a0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala#L149). `expr` here is a call on implicit class `DslAttr` that's wrapping the `UnresolvedAttribute` returned by `"...".attr` is wrapped by the implicit class `DslAttr`. `DslAttr` and its super class implement `DslAttr.expr` such that a new `UnresolvedAttribute` is created from `UnresolvedAttribute.name` of the wrapped attribute [(here)](https://github.com/apache/spark/blob/87a5442f7ed96b11051d8a9333476d080054e5a0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala#L273-L280). But `UnresolvedAttribute.name` drops the quotes and thus the newly created `UnresolvedAttribute` parses an identifier that should be quoted but isn't: ```scala scala> "`col/slash`".attr.name res5: String = col/slash ``` ### Does this PR introduce _any_ user-facing change? DSL expressions on attributes with special characters no longer fail. ### How was this patch tested? I couldn't find a suite testing the implicit classes in the DSL package, but the DSL package seems used widely enough that I'm confident this doesn't break existing behavior. Locally, I was able to reproduce with this test; it was failing before and passes now: ```scala test("chained DSL expressions on attributes with special characters") { $"`slashed/col`".asc } ``` Closes #40902 from rshkv/wr/spark-43142-v2. 
Authored-by: Willi Raschkowski Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/dsl/package.scala| 73 ++ .../expressions/ExpressionSQLBuilderSuite.scala| 2 +- .../datasources/DataSourceStrategySuite.scala | 12 ++-- .../datasources/v2/DataSourceV2StrategySuite.scala | 12 ++-- 4 files changed, 46 insertions(+), 53 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index ac439203cb7..27d05f3bac7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -271,8 +271,8 @@ package object dsl { override def expr: Expression = Literal(s) def attr: UnresolvedAttribute = analysis.UnresolvedAttribute(s) } -implicit class DslAttr(attr: UnresolvedAttribute) extends ImplicitAttribute { - def s: String = attr.name +implicit class DslAttr(override val attr: UnresolvedAttribute) extends ImplicitAttribute { + def s: String = attr.sql } abstract class ImplicitAttribute extends ImplicitOperators { @@ -280,90 +280,83 @@ package object dsl { def expr: UnresolvedAttribute = attr def attr: UnresolvedAttribute = analysis.UnresolvedAttribute(s) + private def attrRef(dataType: DataType): AttributeReference = +AttributeReference(attr.nameParts.last, dataType)(qualifier = attr.nameParts.init) + /** Creates a new AttributeReference of type boolean */ - def boolean: AttributeReference
[spark] branch master updated (50b652e241f -> 331e5bb25f8)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 50b652e241f [SPARK-43226] Define extractors for file-constant metadata add 331e5bb25f8 [SPARK-43204][SQL] Align MERGE assignments with table attributes No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/analysis/Analyzer.scala | 5 +- .../sql/catalyst/analysis/AssignmentUtils.scala| 61 +- .../ResolveRowLevelCommandAssignments.scala| 45 +- .../sql/catalyst/plans/logical/v2Commands.scala| 15 + .../execution/command/AlignAssignmentsSuite.scala | 211 + .../command/AlignMergeAssignmentsSuite.scala | 937 + .../command/AlignUpdateAssignmentsSuite.scala | 169 +--- 7 files changed, 1266 insertions(+), 177 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlignAssignmentsSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlignMergeAssignmentsSuite.scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-43226] Define extractors for file-constant metadata
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 50b652e241f [SPARK-43226] Define extractors for file-constant metadata 50b652e241f is described below commit 50b652e241f7e31b99303359ec53e26a8989a4f0 Author: Ryan Johnson AuthorDate: Tue Apr 25 16:11:21 2023 +0800 [SPARK-43226] Define extractors for file-constant metadata ### What changes were proposed in this pull request? File-source constant metadata columns are often derived indirectly from file-level metadata values rather than exposing those values directly. Add support for metadata extractors, so that we can express such columns in a generic way. ### Why are the changes needed? Allows to express the existing file-source metadata columns in a generic way (previously hard-wired), and also allows to lazily derive expensive metadata values only if the query actually selects them. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit test. Plus, existing file-source metadata unit tests pass. Closes #40885 from ryan-johnson-databricks/file-constant-metadata-extractors. Authored-by: Ryan Johnson Signed-off-by: Wenchen Fan --- .../spark/sql/execution/DataSourceScanExec.scala | 33 +++--- .../sql/execution/datasources/FileFormat.scala | 124 ++--- .../sql/execution/datasources/FileScanRDD.scala| 70 +++- .../datasources/PartitioningAwareFileIndex.scala | 10 +- .../FileSourceCustomMetadataStructSuite.scala | 35 +- 5 files changed, 156 insertions(+), 116 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 0400a2b6abc..0d5091f4a97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -548,10 +548,9 @@ case class FileSourceScanExec( hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options)) val readRDD = if (bucketedScan) { - createBucketedReadRDD(relation.bucketSpec.get, readFile, dynamicallySelectedPartitions, -relation) + createBucketedReadRDD(relation.bucketSpec.get, readFile, dynamicallySelectedPartitions) } else { - createReadRDD(readFile, dynamicallySelectedPartitions, relation) + createReadRDD(readFile, dynamicallySelectedPartitions) } sendDriverMetrics() readRDD @@ -617,13 +616,11 @@ case class FileSourceScanExec( * @param bucketSpec the bucketing spec. * @param readFile a function to read each (part of a) file. * @param selectedPartitions Hive-style partition that are part of the read. - * @param fsRelation [[HadoopFsRelation]] associated with the read. 
*/ private def createBucketedReadRDD( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Array[PartitionDirectory], - fsRelation: HadoopFsRelation): RDD[InternalRow] = { + selectedPartitions: Array[PartitionDirectory]): RDD[InternalRow] = { logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") val filesGroupedToBuckets = selectedPartitions.flatMap { p => @@ -660,9 +657,10 @@ case class FileSourceScanExec( } } -new FileScanRDD(fsRelation.sparkSession, readFile, filePartitions, - new StructType(requiredSchema.fields ++ fsRelation.partitionSchema.fields), - fileConstantMetadataColumns, new FileSourceOptions(CaseInsensitiveMap(relation.options))) +new FileScanRDD(relation.sparkSession, readFile, filePartitions, + new StructType(requiredSchema.fields ++ relation.partitionSchema.fields), + fileConstantMetadataColumns, relation.fileFormat.fileConstantMetadataExtractors, + new FileSourceOptions(CaseInsensitiveMap(relation.options))) } /** @@ -671,20 +669,18 @@ case class FileSourceScanExec( * * @param readFile a function to read each (part of a) file. * @param selectedPartitions Hive-style partition that are part of the read. - * @param fsRelation [[HadoopFsRelation]] associated with the read. */ private def createReadRDD( readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Array[PartitionDirectory], - fsRelation: HadoopFsRelation): RDD[InternalRow] = { -val openCostInBytes = fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes + selectedPartitions: Array[PartitionDirectory]): RDD[InternalRow] = { +val openCostInBytes = relation.sparkSession.sessionState.conf.filesOpenCostInBytes
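For orientation, the user-facing surface these extractors feed is the hidden `_metadata` struct on file-based sources. A small example of selecting a few of its constant fields follows; the parquet path is illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Path is illustrative; any file-based source exposes the hidden struct.
df = spark.read.parquet("/tmp/example-data")

# file_path, file_name and file_size are constant per file; extractors let
# a format derive such fields lazily from file-level metadata.
df.select(
    "*",
    "_metadata.file_path",
    "_metadata.file_name",
    "_metadata.file_size",
).show(truncate=False)
```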
[spark] branch master updated (71969fb9958 -> 4b60de63770)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 71969fb9958 [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the memory requirement in torch-related tests add 4b60de63770 [SPARK-43243][PYTHON][CONNECT] Add level param to printSchema for Python No new revisions were added by this update. Summary of changes: .../src/main/protobuf/spark/connect/base.proto | 3 + .../service/SparkConnectAnalyzeHandler.scala | 8 +- python/pyspark/sql/connect/client.py | 3 + python/pyspark/sql/connect/dataframe.py| 10 +- python/pyspark/sql/connect/proto/base_pb2.py | 214 ++--- python/pyspark/sql/connect/proto/base_pb2.pyi | 19 +- python/pyspark/sql/dataframe.py| 29 ++- python/pyspark/sql/tests/test_dataframe.py | 17 ++ 8 files changed, 186 insertions(+), 117 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
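A brief example of the new parameter, assuming the Python signature mirrors the Scala `printSchema(level)`; the nested schema here is purely illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.sql("SELECT named_struct('inner', named_struct('x', 1)) AS s")

df.printSchema()   # full depth, unchanged behavior
df.printSchema(1)  # new: print only the first level of nesting
```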
[spark] branch master updated: [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the memory requirement in torch-related tests
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 71969fb9958 [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the memory requirement in torch-related tests 71969fb9958 is described below commit 71969fb9958f0022aabcad36c89a461029cb3b8c Author: Ruifeng Zheng AuthorDate: Tue Apr 25 15:28:05 2023 +0800 [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the memory requirement in torch-related tests ### What changes were proposed in this pull request? Reduce the memory requirement in torch-related tests ### Why are the changes needed? The computation in torch distributor actually happens in the external torch processes, and the Github Action resources is very limited, this PR tries to make related tests more stable ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? CI, let me keep merging oncoming commits from master to see whether this change is stable enough Closes #40874 from zhengruifeng/torch_reduce_memory. Lead-authored-by: Ruifeng Zheng Co-authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- python/pyspark/ml/tests/connect/test_parity_torch_distributor.py | 4 ++-- python/pyspark/ml/torch/tests/test_distributor.py| 8 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py b/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py index 55ea99a6540..b855332f96c 100644 --- a/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py +++ b/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py @@ -64,7 +64,7 @@ class TorchDistributorLocalUnitTestsOnConnect( builder = builder.config( "spark.driver.resource.gpu.discoveryScript", cls.gpu_discovery_script_file_name ) -cls.spark = builder.remote("local-cluster[2,2,1024]").getOrCreate() +cls.spark = builder.remote("local-cluster[2,2,512]").getOrCreate() @classmethod def tearDownClass(cls): @@ -126,7 +126,7 @@ class TorchDistributorDistributedUnitTestsOnConnect( builder = builder.config( "spark.worker.resource.gpu.discoveryScript", cls.gpu_discovery_script_file_name ) -cls.spark = builder.remote("local-cluster[2,2,1024]").getOrCreate() +cls.spark = builder.remote("local-cluster[2,2,512]").getOrCreate() @classmethod def tearDownClass(cls): diff --git a/python/pyspark/ml/torch/tests/test_distributor.py b/python/pyspark/ml/torch/tests/test_distributor.py index ebd859031bd..9fd0b4cba94 100644 --- a/python/pyspark/ml/torch/tests/test_distributor.py +++ b/python/pyspark/ml/torch/tests/test_distributor.py @@ -148,6 +148,8 @@ def get_local_mode_conf(): return { "spark.test.home": SPARK_HOME, "spark.driver.resource.gpu.amount": "3", +"spark.driver.memory": "512M", +"spark.executor.memory": "512M", } @@ -158,6 +160,8 @@ def get_distributed_mode_conf(): "spark.task.cpus": "2", "spark.task.resource.gpu.amount": "1", "spark.executor.resource.gpu.amount": "1", +"spark.driver.memory": "512M", +"spark.executor.memory": "512M", } @@ -412,7 +416,7 @@ class TorchDistributorLocalUnitTests(TorchDistributorLocalUnitTestsMixin, unitte "spark.driver.resource.gpu.discoveryScript", cls.gpu_discovery_script_file_name ) -sc = SparkContext("local-cluster[2,2,1024]", cls.__name__, conf=conf) +sc = SparkContext("local-cluster[2,2,512]", cls.__name__, conf=conf) cls.spark = SparkSession(sc) @classmethod @@ -502,7 +506,7 @@ class 
TorchDistributorDistributedUnitTests( "spark.worker.resource.gpu.discoveryScript", cls.gpu_discovery_script_file_name ) -sc = SparkContext("local-cluster[2,2,1024]", cls.__name__, conf=conf) +sc = SparkContext("local-cluster[2,2,512]", cls.__name__, conf=conf) cls.spark = SparkSession(sc) @classmethod - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
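Outside the test harness, the reduced footprint corresponds to a session built roughly like this. This is a sketch: the master string matches the tests, the app name is made up, and `local-cluster` still needs a Spark distribution reachable via SPARK_HOME:

```python
from pyspark.sql import SparkSession

# 2 workers with 2 cores and 512 MB each, plus 512M driver/executor memory,
# mirroring the settings the torch distributor tests now request.
spark = (
    SparkSession.builder
    .master("local-cluster[2,2,512]")
    .config("spark.driver.memory", "512M")
    .config("spark.executor.memory", "512M")
    .appName("torch-distributor-smoke")
    .getOrCreate()
)
print(spark.sparkContext.defaultParallelism)
spark.stop()
```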