[spark] branch master updated (ad454f46d85 -> 9c19528e337)

2023-04-25 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from ad454f46d85 [SPARK-42992][PYTHON] Introduce PySparkRuntimeError
 add 9c19528e337 [SPARK-43076][PS][CONNECT] Removing the dependency on 
`grpcio` when remote session is not used

No new revisions were added by this update.

Summary of changes:
 python/pyspark/pandas/_typing.py   |   9 -
 python/pyspark/pandas/accessors.py |   2 +-
 python/pyspark/pandas/base.py  |  16 +-
 python/pyspark/pandas/data_type_ops/base.py|  23 ++-
 python/pyspark/pandas/data_type_ops/binary_ops.py  |  16 +-
 python/pyspark/pandas/data_type_ops/boolean_ops.py | 129 +++-
 python/pyspark/pandas/data_type_ops/date_ops.py|  51 +++--
 python/pyspark/pandas/data_type_ops/num_ops.py | 218 +
 python/pyspark/pandas/frame.py | 147 +++---
 python/pyspark/pandas/indexes/base.py  |  45 ++---
 python/pyspark/pandas/indexes/multi.py |   8 +-
 python/pyspark/pandas/indexing.py  | 180 ++---
 python/pyspark/pandas/internal.py  | 176 +
 python/pyspark/pandas/namespace.py |  31 +--
 python/pyspark/pandas/numpy_compat.py  |   9 +-
 python/pyspark/pandas/series.py|  26 +--
 python/pyspark/pandas/spark/accessors.py   |  49 +++--
 python/pyspark/pandas/utils.py |  74 ---
 python/pyspark/sql/connect/dataframe.py|   4 +-
 19 files changed, 699 insertions(+), 514 deletions(-)
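
For context, the usual way to drop a hard `grpcio` requirement is to import the Spark Connect machinery lazily, only on the remote-session code path. A minimal sketch of that pattern follows; the `_use_remote()` helper and its environment check are assumptions for illustration, not the exact code from this change.

```python
import os

def _use_remote() -> bool:
    # Hypothetical check; the real code derives this from the active session.
    return "SPARK_REMOTE" in os.environ

def default_session():
    # The classic entry point does not require grpcio.
    from pyspark.sql import SparkSession

    if _use_remote():
        # Only this branch pulls in the Connect (grpcio-backed) machinery.
        return SparkSession.builder.remote(os.environ["SPARK_REMOTE"]).getOrCreate()
    return SparkSession.builder.getOrCreate()
```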


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-42992][PYTHON] Introduce PySparkRuntimeError

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new ad454f46d85 [SPARK-42992][PYTHON] Introduce PySparkRuntimeError
ad454f46d85 is described below

commit ad454f46d85e17dc59991ecd6c3326a8c6de8ea3
Author: itholic 
AuthorDate: Wed Apr 26 13:51:52 2023 +0800

[SPARK-42992][PYTHON] Introduce PySparkRuntimeError

### What changes were proposed in this pull request?

This PR proposes to introduce a new error, `PySparkRuntimeError`, for PySpark, and applies it to the existing `RuntimeError` usages under `python/pyspark/*.py`.

### Why are the changes needed?

To cover the built-in `RuntimeError` with the PySpark error framework.

### Does this PR introduce _any_ user-facing change?

No, it's an internal error framework improvement.

### How was this patch tested?

The existing CI should pass.
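
For context, a minimal sketch of the pattern this PR applies, mirroring the `Accumulator.value` hunk in the diff below. The error class `VALUE_NOT_ACCESSIBLE` and the `getErrorClass()` accessor come from the PySpark error framework; the toy class exists only for illustration.

```python
from pyspark.errors import PySparkRuntimeError

class _ToyAccumulator:
    """Toy stand-in for Accumulator, only to show the error pattern."""
    def __init__(self) -> None:
        self._deserialized = True  # pretend we are inside a task
        self._value = 0

    @property
    def value(self):
        if self._deserialized:
            raise PySparkRuntimeError(
                error_class="VALUE_NOT_ACCESSIBLE",
                message_parameters={"value": "Accumulator.value"},
            )
        return self._value

try:
    _ToyAccumulator().value
except PySparkRuntimeError as e:
    # Callers can branch on the error class instead of parsing message text.
    print(e.getErrorClass())  # VALUE_NOT_ACCESSIBLE
```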

Closes #40617 from itholic/pyspark_runtime_error.

Authored-by: itholic 
Signed-off-by: Ruifeng Zheng 
---
 python/pyspark/accumulators.py |  15 ++-
 python/pyspark/broadcast.py|  23 +++-
 python/pyspark/conf.py |   6 +-
 python/pyspark/context.py  |  40 +++---
 python/pyspark/errors/__init__.py  |   2 +
 python/pyspark/errors/error_classes.py | 137 +
 python/pyspark/errors/exceptions/base.py   |   6 +
 python/pyspark/errors/utils.py |   4 +-
 python/pyspark/java_gateway.py |  18 ++-
 python/pyspark/profiler.py |   6 +-
 python/pyspark/rdd.py  |  33 +++--
 .../sql/tests/pandas/test_pandas_grouped_map.py|   2 +-
 python/pyspark/taskcontext.py  |  30 +++--
 python/pyspark/tests/test_context.py   |   2 +-
 python/pyspark/tests/test_profiler.py  |  27 ++--
 python/pyspark/util.py |  13 +-
 python/pyspark/worker.py   |  75 ++-
 17 files changed, 346 insertions(+), 93 deletions(-)

diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py
index ce4bb561814..dc8520a844d 100644
--- a/python/pyspark/accumulators.py
+++ b/python/pyspark/accumulators.py
@@ -23,6 +23,7 @@ import threading
 from typing import Callable, Dict, Generic, Tuple, Type, TYPE_CHECKING, 
TypeVar, Union
 
 from pyspark.serializers import read_int, CPickleSerializer
+from pyspark.errors import PySparkRuntimeError
 
 if TYPE_CHECKING:
 from pyspark._typing import SupportsIAdd  # noqa: F401
@@ -140,14 +141,24 @@ class Accumulator(Generic[T]):
 def value(self) -> T:
 """Get the accumulator's value; only usable in driver program"""
 if self._deserialized:
-raise RuntimeError("Accumulator.value cannot be accessed inside 
tasks")
+raise PySparkRuntimeError(
+error_class="VALUE_NOT_ACCESSIBLE",
+message_parameters={
+"value": "Accumulator.value",
+},
+)
 return self._value
 
 @value.setter
 def value(self, value: T) -> None:
 """Sets the accumulator's value; only usable in driver program"""
 if self._deserialized:
-raise RuntimeError("Accumulator.value cannot be accessed inside 
tasks")
+raise PySparkRuntimeError(
+error_class="VALUE_NOT_ACCESSIBLE",
+message_parameters={
+"value": "Accumulator.value",
+},
+)
 self._value = value
 
 def add(self, term: T) -> None:
diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py
index b1a9b790af5..a72bf1e059b 100644
--- a/python/pyspark/broadcast.py
+++ b/python/pyspark/broadcast.py
@@ -40,6 +40,7 @@ from typing.io import BinaryIO  # type: ignore[import]
 from pyspark.java_gateway import local_connect_and_auth
 from pyspark.serializers import ChunkedStream, pickle_protocol
 from pyspark.util import print_exec
+from pyspark.errors import PySparkRuntimeError
 
 if TYPE_CHECKING:
 from pyspark import SparkContext
@@ -58,7 +59,12 @@ def _from_id(bid: int) -> "Broadcast[Any]":
 from pyspark.broadcast import _broadcastRegistry
 
 if bid not in _broadcastRegistry:
-raise RuntimeError("Broadcast variable '%s' not loaded!" % bid)
+raise PySparkRuntimeError(
+error_class="BROADCAST_VARIABLE_NOT_LOADED",
+message_parameters={
+"variable": str(bid),
+},
+)
 return _broadcastRegistry[bid]
 
 
@@ -293,7 +299,10 @@ class Broadcast(Generic[T]):
 >>> b.unpersist()
 """
 if self._jbroadcast is None:
-raise RuntimeError("Broadcast can 

[spark] branch master updated (b461cdea92e -> eae91ee3c96)

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from b461cdea92e [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery 
count bug test with decorrelateInnerQuery disabled
 add eae91ee3c96 [SPARK-43274][SPARK-43275][PYTHON][CONNECT] Introduce 
`PySparkNotImplementedError`

No new revisions were added by this update.

Summary of changes:
 python/pyspark/errors/__init__.py  |  2 ++
 python/pyspark/errors/error_classes.py | 17 +-
 python/pyspark/errors/exceptions/base.py   |  6 
 python/pyspark/sql/connect/group.py| 37 ++
 python/pyspark/sql/connect/udf.py  |  5 ++-
 .../sql/tests/connect/test_connect_basic.py| 14 +---
 .../sql/tests/pandas/test_pandas_grouped_map.py|  5 ++-
 python/pyspark/sql/udf.py  |  5 ++-
 8 files changed, 70 insertions(+), 21 deletions(-)
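
As a rough illustration of the new exception in use (the error class name and parameters below are assumptions based on the PR title, not copied from the diff):

```python
from pyspark.errors import PySparkNotImplementedError

def apply_in_pandas_with_state(*args, **kwargs):
    # Hypothetical placeholder for a Connect API that is not implemented yet.
    raise PySparkNotImplementedError(
        error_class="NOT_IMPLEMENTED",
        message_parameters={"feature": "applyInPandasWithState()"},
    )
```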


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery count bug test with decorrelateInnerQuery disabled

2023-04-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b461cdea92e [SPARK-43156][SPARK-43098][SQL] Extend scalar subquery 
count bug test with decorrelateInnerQuery disabled
b461cdea92e is described below

commit b461cdea92ea08ce39bb3c9d733f0af7c56abf8d
Author: Jack Chen 
AuthorDate: Wed Apr 26 12:14:24 2023 +0800

[SPARK-43156][SPARK-43098][SQL] Extend scalar subquery count bug test with 
decorrelateInnerQuery disabled

### What changes were proposed in this pull request?
Extend a test to run with the DecorrelateInnerQuery framework both on and off, since this came up in both https://github.com/apache/spark/pull/40865 and https://github.com/apache/spark/pull/40811. (The default is on.)

### Why are the changes needed?
Improve test coverage

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Test itself

Closes #40946 from jchen5/subq-count-test.

Authored-by: Jack Chen 
Signed-off-by: Wenchen Fan 
---
 .../inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql  | 3 +++
 1 file changed, 3 insertions(+)

diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql
 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql
index 0ca2f07b301..df88b8eb74d 100644
--- 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql
+++ 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-count-bug.sql
@@ -1,3 +1,6 @@
+--CONFIG_DIM1 spark.sql.optimizer.decorrelateInnerQuery.enabled=true
+--CONFIG_DIM1 spark.sql.optimizer.decorrelateInnerQuery.enabled=false
+
 create temp view l (a, b)
 as values
 (1, 2.0),


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (014685c41e4 -> 55865bd1756)

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 014685c41e4 [SPARK-43277][YARN] Clean up deprecation hadoop api usage 
in `yarn` module
 add 55865bd1756 [SPARK-43276][CONNECT][PYTHON] Migrate Spark Connect 
Window errors into error class

No new revisions were added by this update.

Summary of changes:
 python/pyspark/errors/error_classes.py |  5 +
 python/pyspark/sql/connect/window.py   | 21 +
 2 files changed, 18 insertions(+), 8 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


Yikun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522735459

   > As you know, the Apache Spark dev mailing list is the only official way to build a community consensus; the other channels (GitHub or even the user mailing list) are not. Please shoot the email. :)
   
   Sure! I will send the mail this week!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-43277][YARN] Clean up deprecation hadoop api usage in `yarn` module

2023-04-25 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 014685c41e4 [SPARK-43277][YARN] Clean up deprecation hadoop api usage 
in `yarn` module
014685c41e4 is described below

commit 014685c41e4741f83570d8a2a6a253e48967919a
Author: yangjie01 
AuthorDate: Tue Apr 25 22:12:35 2023 -0500

[SPARK-43277][YARN] Clean up deprecation hadoop api usage in `yarn` module

### What changes were proposed in this pull request?
The `yarn` module has the following compilation warnings related to the Hadoop API:

```
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala:157:
 [deprecation  
org.apache.spark.deploy.yarn.ApplicationMaster.prepareLocalResources.setupDistributedCache
 | origin=org.apache.hadoop.yarn.util.ConverterUtils.getYarnUrlFromURI | 
version=] method getYarnUrlFromURI in class ConverterUtils is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala:292:
 [deprecation  
org.apache.spark.deploy.yarn.Client.createApplicationSubmissionContext | 
origin=org.apache.hadoop.yarn.api.records.Resource.setMemory | version=] method 
setMemory in class Resource is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala:307:
 [deprecation  
org.apache.spark.deploy.yarn.Client.createApplicationSubmissionContext | 
origin=org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext.setAMContainerResourceRequest
 | version=] method setAMContainerResourceRequest in class 
ApplicationSubmissionContext is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala:392:
 [deprecation  
org.apache.spark.deploy.yarn.Client.verifyClusterResources.maxMem | 
origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method 
getMemory in class Resource is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala:76:
 [deprecation  
org.apache.spark.deploy.yarn.ClientDistributedCacheManager.addResource | 
origin=org.apache.hadoop.yarn.util.ConverterUtils.getYarnUrlFromPath | 
version=] method getYarnUrlFromPath in class ConverterUtils is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala:510:
 [deprecation  
org.apache.spark.deploy.yarn.YarnAllocator.updateResourceRequests.$anonfun.requestContainerMessage
 | origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] 
method getMemory in class Resource is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala:737:
 [deprecation  
org.apache.spark.deploy.yarn.YarnAllocator.runAllocatedContainers.$anonfun | 
origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method 
getMemory in class Resource is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala:737:
 [deprecation  
org.apache.spark.deploy.yarn.YarnAllocator.runAllocatedContainers.$anonfun | 
origin=org.apache.hadoop.yarn.api.records.Resource.getMemory | version=] method 
getMemory in class Resource is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala:202:
 [deprecation  org.apache.spark.deploy.yarn.YarnSparkHadoopUtil.getContainerId 
| origin=org.apache.hadoop.yarn.util.ConverterUtils.toContainerId | version=] 
method toContainerId in class ConverterUtils is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/main/scala/org/apache/spark/util/YarnContainerInfoHelper.scala:75:
 [deprecation  org.apache.spark.util.YarnContainerInfoHelper.getAttributes | 
origin=org.apache.hadoop.yarn.util.ConverterUtils.toString | version=] method 
toString in class ConverterUtils is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala:83:
 [deprecation  
org.apache.spark.deploy.yarn.ClientDistributedCacheManagerSuite..$org_scalatest_assert_macro_expr.$org_scalatest_assert_macro_left
 | origin=org.apache.hadoop.yarn.util.ConverterUtils.getPathFromYarnURL | 
version=] method getPathFromYarnURL in class ConverterUtils is deprecated
[WARNING] [Warn] 
/${spark-source-dir}/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala:105:
 [deprecation 

[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


dongjoon-hyun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522655970

   @Yikun, we cannot simply remove things the way this PR does. The official way to adopt a new standard is to send an email to the dev mailing list and build a community consensus.
   
   > The reason we remove v is to follow [DOI rules](https://github.com/docker-library/official-images#tags-and-aliases). If we still think v should be kept in future, we can keep both (pointing to the new images).
   
   As you know, the Apache Spark dev mailing list is the only official way to build a community consensus; the other channels (GitHub or even the user mailing list) are not. Please shoot the email. :)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


Yikun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522651856

   > I believe having both 3.4.0 and v3.4.0 only causes confusion for users. That's why, in the Apache Spark community, I propose to keep only the v3.4.0 tag format (whatever we use; manual or GitHub).
   
   The reason we remove `v` is to follow [DOI rules](https://github.com/docker-library/official-images#tags-and-aliases). If we still think `v` should be kept **in future**, we can keep both (pointing to the new images).
   
   > Given (1) and (2), I don't think this PR claims to maintain the existing manual publish process. In other words, this PR already burns the bridge.
   
   At this point, the solutions below might be ways to continue the work:
   
   Solution 1:
   1. Switch to Java 17 as the default in spark-docker: https://github.com/apache/spark-docker/pull/35
   2. Keep v3.4.0 and 3.4.0 pointing to the GA-published image.
   3. Keep this PR in place of the manual process.
   
   
   Solution 2:
   1. Point `latest` back to the manually published image for the 3.4.0 images.
   2. Revert this PR (recover the manual process and also keep the GA publish steps from this PR's doc).
   3. Have `v3.4.0` point to the manual image and `3.4.0` point to the GA-published image.
   4. Keep both the manual image and the GA-published image in the 3.4.0 series, and unify all images into the GA-published image in 3.5.0.
   
   @dongjoon-hyun @HyukjinKwon @zhengruifeng WDYT?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (b26844ce879 -> a2a5299cb2d)

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from b26844ce879 [SPARK-43136][CONNECT] Adding groupByKey + mapGroup + 
coGroup functions
 add a2a5299cb2d [MINOR][BUILD] Correct the error message in 
`dev/connect-check-protos.py`

No new revisions were added by this update.

Summary of changes:
 dev/connect-check-protos.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


dongjoon-hyun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522621830

   1. If we are going to maintain the manual process, why do you remove the old description in this PR?
   > For the tag, it's intentional: in 3.4.0, the v3.4.0 tag and the 3.4.0-series tags will co-exist.
   
   2. To be clear, I really appreciate your contribution because this is good 
enough to mitigate Apache Spark release managers' burden.
   
   Given (1) and (2), I don't think this PR claims to maintain the existing manual publish process. In other words, this PR already burns the bridge.
   
   I believe having both `3.4.0` and `v3.4.0` only causes confusion for users. That's why, in the Apache Spark community, I propose to keep only the `v3.4.0` tag format (whatever we use; manual or GitHub).
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


Yikun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522546716

   Yes, but I meant we still need to update it in apache/spark-docker.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


dongjoon-hyun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522543950

   It's already Java 17 by default.
   
   > For Java 17, I think we can switch to Java 17 in the docker image.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] Yikun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


Yikun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522543145

   Thanks for catching this and for the reminder, @dongjoon-hyun!
   
   For Java 17, I think we can switch to Java 17 in the docker image.
   
   For the tag, it's intentional: in 3.4.0, the `v3.4.0` tag and the `3.4.0`-series tags will co-exist.
   
   It should be a new feature rather than a breaking change. (But where the `latest` tag points is a breaking change.)
   
   So do you have any suggestions on how we could migrate to the new images?
   
   Maybe:
   1. 3.4.0 keeps the previous and new tags; `latest` points to the previous tag.
   2. 3.5.0 keeps the previous and new tags, marks the previous tag as deprecated, and `latest` points to the new tag.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


dongjoon-hyun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522426433

   cc @sunchao , too


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on pull request #458: Update Spark Docker Images publish workflow

2023-04-25 Thread via GitHub


dongjoon-hyun commented on PR #458:
URL: https://github.com/apache/spark-website/pull/458#issuecomment-1522426236

   Hi, @Yikun , @HyukjinKwon , @yaooqinn , @zhengruifeng . 
   
   It seems that this PR introduced two user-facing breaking changes:
   - Downgrade Java from 17 to 11
   ```
   $ docker run -it --rm apache/spark:v3.4.0 java -version | tail -n1
   OpenJDK 64-Bit Server VM Temurin-17.0.6+10 (build 17.0.6+10, mixed mode, 
sharing)
   ```
   
   ```
   $ docker run -it --rm apache/spark:3.4.0 java -version | tail -n1
   OpenJDK 64-Bit Server VM Temurin-11.0.18+10 (build 11.0.18+10, mixed mode, 
sharing)
   ```
   - Drop `v` prefix, `v3.4.0` -> `3.4.0`
   
   If these breaking changes are intentional, I'm wondering if we can discuss the rationale on the `dev` mailing list. Otherwise, I'd like to recommend reverting those breaking changes to avoid any confusion.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (0e6446ac62d -> b26844ce879)

2023-04-25 Thread hvanhovell
This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 0e6446ac62d [SPARK-43285] Fix ReplE2ESuite consistently failing with 
JDK 17
 add b26844ce879 [SPARK-43136][CONNECT] Adding groupByKey + mapGroup + 
coGroup functions

No new revisions were added by this update.

Summary of changes:
 .../main/scala/org/apache/spark/sql/Dataset.scala  |  29 ++
 .../apache/spark/sql/KeyValueGroupedDataset.scala  | 416 +
 .../apache/spark/sql/connect/client/UdfUtils.scala |  43 ++-
 .../sql/KeyValueGroupedDatasetE2ETestSuite.scala   | 218 +++
 .../CheckConnectJvmClientCompatibility.scala   |  19 +-
 .../main/protobuf/spark/connect/relations.proto|   9 +
 .../sql/connect/planner/SparkConnectPlanner.scala  | 253 ++---
 python/pyspark/sql/connect/proto/relations_pb2.py  |  24 +-
 python/pyspark/sql/connect/proto/relations_pb2.pyi |  49 ++-
 .../spark/sql/catalyst/plans/logical/object.scala  |   7 +
 .../apache/spark/sql/KeyValueGroupedDataset.scala  |   7 +-
 11 files changed, 996 insertions(+), 78 deletions(-)
 create mode 100644 
connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala
 create mode 100644 
connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-43285] Fix ReplE2ESuite consistently failing with JDK 17

2023-04-25 Thread hvanhovell
This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 0e6446ac62d [SPARK-43285] Fix ReplE2ESuite consistently failing with 
JDK 17
0e6446ac62d is described below

commit 0e6446ac62d6f3f7568e6e35b42f8e395bf490dc
Author: vicennial 
AuthorDate: Tue Apr 25 15:10:57 2023 -0400

[SPARK-43285] Fix ReplE2ESuite consistently failing with JDK 17

### What changes were proposed in this pull request?

The timeout duration for the REPL has been increased from 10 to 30 seconds (to address slow startup in the JDK 17 tests), and the semaphore permits are drained after each test (to avoid cascading failures, [context](https://github.com/apache/spark/pull/40675#discussion_r1174917132)).
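
The actual change is in the Scala `ReplE2ESuite` and uses `java.util.concurrent.Semaphore.drainPermits()`; as a rough Python analogue of the drain-permits-between-tests idea (Python's `threading.Semaphore` has no `drainPermits`, so we loop):

```python
import threading

def drain_permits(sem: threading.Semaphore) -> int:
    """Take every currently available permit without blocking."""
    drained = 0
    while sem.acquire(blocking=False):
        drained += 1
    return drained

# e.g. in a test tearDown(): draining ensures a failed test cannot leak
# permits that would let the next test's wait return spuriously.
```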

### Why are the changes needed?

The GA JDK 17 tests consistently fail, as described in the [jira issue](https://issues.apache.org/jira/browse/SPARK-43285).

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Locally verified by installing and running tests with JDK 17 (both the 
failure and the subsequent fix).

Closes #40948 from vicennial/SPARK-43285.

Authored-by: vicennial 
Signed-off-by: Herman van Hovell 
---
 .../scala/org/apache/spark/sql/application/ReplE2ESuite.scala| 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
index f0ec28a5a87..af920f8c314 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
@@ -20,13 +20,14 @@ import java.io.{PipedInputStream, PipedOutputStream}
 import java.util.concurrent.{Executors, Semaphore, TimeUnit}
 
 import org.apache.commons.io.output.ByteArrayOutputStream
+import org.scalatest.BeforeAndAfterEach
 
 import org.apache.spark.sql.connect.client.util.RemoteSparkSession
 
-class ReplE2ESuite extends RemoteSparkSession {
+class ReplE2ESuite extends RemoteSparkSession with BeforeAndAfterEach {
 
   private val executorService = Executors.newSingleThreadExecutor()
-  private val TIMEOUT_SECONDS = 10
+  private val TIMEOUT_SECONDS = 30
 
   private var testSuiteOut: PipedOutputStream = _
   private var ammoniteOut: ByteArrayOutputStream = _
@@ -68,6 +69,10 @@ class ReplE2ESuite extends RemoteSparkSession {
 super.afterAll()
   }
 
+  override def afterEach(): Unit = {
+semaphore.drainPermits()
+  }
+
   def runCommandsInShell(input: String): String = {
 require(input.nonEmpty)
 // Pad the input with a semaphore release so that we know when the 
execution of the provided


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (516d7b3d483 -> 9c237d7bc7b)

2023-04-25 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 516d7b3d483 [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3
 add 9c237d7bc7b [SPARK-43225][BUILD][SQL] Remove jackson-core-asl and 
jackson-mapper-asl from pre-built distribution

No new revisions were added by this update.

Summary of changes:
 core/pom.xml   |  8 
 dev/deps/spark-deps-hadoop-3-hive-2.3  |  2 --
 pom.xml| 23 ++
 .../org/apache/hive/service/cli/CLIService.java|  2 +-
 .../hive/service/cli/session/HiveSessionImpl.java  |  4 ++--
 .../cli/session/HiveSessionImplwithUGI.java|  4 ++--
 6 files changed, 7 insertions(+), 36 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3

2023-04-25 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 516d7b3d483 [SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3
516d7b3d483 is described below

commit 516d7b3d483687018fc457ab2fec82c05b169057
Author: YangJie 
AuthorDate: Tue Apr 25 08:52:54 2023 -0500

[SPARK-42798][BUILD] Upgrade protobuf-java to 3.22.3

### What changes were proposed in this pull request?
This PR aims to upgrade protobuf-java from 3.22.0 to 3.22.3.

### Why are the changes needed?
The new version fixes a `NoSuchMethodError` thrown when using Java 8 to run protos compiled with Java 9+ (even with `--target 1.8`):

- https://github.com/protocolbuffers/protobuf/issues/11393 / 
https://github.com/protocolbuffers/protobuf/pull/12035

The full release notes are as follows:

- https://github.com/protocolbuffers/protobuf/releases/tag/v22.1
- https://github.com/protocolbuffers/protobuf/releases/tag/v22.2
- https://github.com/protocolbuffers/protobuf/releases/tag/v22.3
- https://github.com/protocolbuffers/protobuf/compare/v3.22.0...v3.22.3

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Pass GitHub Actions

Closes #40430 from LuciferYang/SPARK-42798.

Lead-authored-by: YangJie 
Co-authored-by: yangjie01 
Signed-off-by: Sean Owen 
---
 pom.xml  | 2 +-
 project/SparkBuild.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index a3b7f57ace9..2cb6ff86bb5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -124,7 +124,7 @@
 
2.5.0
 
 
-3.22.0
+3.22.3
 3.11.4
 ${hadoop.version}
 3.6.3
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index d8a6972bae1..25f5ea4fa85 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -88,7 +88,7 @@ object BuildCommons {
 
   // Google Protobuf version used for generating the protobuf.
   // SPARK-41247: needs to be consistent with `protobuf.version` in `pom.xml`.
-  val protoVersion = "3.22.0"
+  val protoVersion = "3.22.3"
   // GRPC version used for Spark Connect.
   val gprcVersion = "1.47.0"
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-42419][FOLLOWUP][CONNECT][PYTHON] Remove unused exception

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b59d3af4ca2 [SPARK-42419][FOLLOWUP][CONNECT][PYTHON] Remove unused 
exception
b59d3af4ca2 is described below

commit b59d3af4ca244aa9837d0d862012e3bfd380bc05
Author: itholic 
AuthorDate: Tue Apr 25 19:09:08 2023 +0800

[SPARK-42419][FOLLOWUP][CONNECT][PYTHON] Remove unused exception

### What changes were proposed in this pull request?

This is a follow-up to https://github.com/apache/spark/pull/39991 to remove an unused exception.

### Why are the changes needed?

This `PySparkTypeError` is never raised because the type is already checked above with `if type(startPos) != type(length):`, and a `PySparkTypeError` is raised for an unsupported `length` type.

### Does this PR introduce _any_ user-facing change?

No, it's a minor code cleanup.

### How was this patch tested?

The existing CI should pass.
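
To make the reasoning concrete, a small usage sketch (assuming a local Spark session; the data is made up): `Column.substr` requires `startPos` and `length` to be the same type, so once the mixed-type check has raised, the removed `startPos` branch could never be reached.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Alice",)], ["name"])

df.select(df.name.substr(1, 3)).show()                 # OK: both ints
df.select(df.name.substr(F.lit(1), F.lit(3))).show()   # OK: both Columns
# df.name.substr(1, F.lit(3))  -> raises: startPos and length must be the same type
```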

Closes #40927 from itholic/connect_col_followup.

Authored-by: itholic 
Signed-off-by: Ruifeng Zheng 
---
 python/pyspark/sql/connect/column.py | 13 ++---
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/sql/connect/column.py 
b/python/pyspark/sql/connect/column.py
index 26cb1477cee..7b60adb1629 100644
--- a/python/pyspark/sql/connect/column.py
+++ b/python/pyspark/sql/connect/column.py
@@ -239,24 +239,15 @@ class Column:
 
 if isinstance(length, Column):
 length_expr = length._expr
+start_expr = startPos._expr  # type: ignore[union-attr]
 elif isinstance(length, int):
 length_expr = LiteralExpression._from_value(length)
-else:
-raise PySparkTypeError(
-error_class="NOT_COLUMN_OR_INT",
-message_parameters={"arg_name": "length", "arg_type": 
type(length).__name__},
-)
-
-if isinstance(startPos, Column):
-start_expr = startPos._expr
-elif isinstance(startPos, int):
 start_expr = LiteralExpression._from_value(startPos)
 else:
 raise PySparkTypeError(
 error_class="NOT_COLUMN_OR_INT",
-message_parameters={"arg_name": "startPos", "arg_type": 
type(startPos).__name__},
+message_parameters={"arg_name": "length", "arg_type": 
type(length).__name__},
 )
-
 return Column(UnresolvedFunction("substring", [self._expr, start_expr, 
length_expr]))
 
 substr.__doc__ = PySparkColumn.substr.__doc__


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-43142] Fix DSL expressions on attributes with special characters

2023-04-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 98ae33b7acb [SPARK-43142] Fix DSL expressions on attributes with 
special characters
98ae33b7acb is described below

commit 98ae33b7acbd932714301c83d71d42ef318dda9b
Author: Willi Raschkowski 
AuthorDate: Tue Apr 25 18:23:36 2023 +0800

[SPARK-43142] Fix DSL expressions on attributes with special characters

Re-attempting #40794. #40794 tried to more safely create 
`AttributeReference` objects from multi-part attributes in `ImplicitAttribute`. 
But that broke things and we had to revert. This PR is limiting the fix to the 
`UnresolvedAttribute` object returned by `DslAttr.attr`, which is enough to fix 
the issue here.

### What changes were proposed in this pull request?
This PR fixes DSL expressions on attributes with special characters by 
making `DslAttr.attr` and `DslAttr.expr` return the implicitly wrapped 
attribute instead of creating a new one.

### Why are the changes needed?
SPARK-43142: DSL expressions on attributes with special characters don't 
work even if the attribute names are quoted:

```scala
scala> "`slashed/col`".attr
res0: org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute = 
'slashed/col

scala> "`slashed/col`".attr.asc
org.apache.spark.sql.catalyst.parser.ParseException:
mismatched input '/' expecting {, '.', '-'}(line 1, pos 7)

== SQL ==
slashed/col
---^^^
```

DSL expressions rely on a call to `expr` to get the child of the new expression [(e.g.)](https://github.com/apache/spark/blob/87a5442f7ed96b11051d8a9333476d080054e5a0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala#L149).

`expr` here is a call on the implicit class `DslAttr`, which wraps the `UnresolvedAttribute` returned by `"...".attr`.

`DslAttr` and its super class implement `DslAttr.expr` such that a new 
`UnresolvedAttribute` is created from `UnresolvedAttribute.name` of the wrapped 
attribute 
[(here)](https://github.com/apache/spark/blob/87a5442f7ed96b11051d8a9333476d080054e5a0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala#L273-L280).

But `UnresolvedAttribute.name` drops the quotes and thus the newly created 
`UnresolvedAttribute` parses an identifier that should be quoted but isn't:
```scala
scala> "`col/slash`".attr.name
res5: String = col/slash
```

### Does this PR introduce _any_ user-facing change?
DSL expressions on attributes with special characters no longer fail.

### How was this patch tested?
I couldn't find a suite testing the implicit classes in the DSL package, 
but the DSL package seems used widely enough that I'm confident this doesn't 
break existing behavior.

Locally, I was able to reproduce with this test; it was failing before and 
passes now:
```scala
test("chained DSL expressions on attributes with special characters") {
  $"`slashed/col`".asc
}
```

Closes #40902 from rshkv/wr/spark-43142-v2.

Authored-by: Willi Raschkowski 
Signed-off-by: Wenchen Fan 
---
 .../apache/spark/sql/catalyst/dsl/package.scala| 73 ++
 .../expressions/ExpressionSQLBuilderSuite.scala|  2 +-
 .../datasources/DataSourceStrategySuite.scala  | 12 ++--
 .../datasources/v2/DataSourceV2StrategySuite.scala | 12 ++--
 4 files changed, 46 insertions(+), 53 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index ac439203cb7..27d05f3bac7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -271,8 +271,8 @@ package object dsl {
   override def expr: Expression = Literal(s)
   def attr: UnresolvedAttribute = analysis.UnresolvedAttribute(s)
 }
-implicit class DslAttr(attr: UnresolvedAttribute) extends 
ImplicitAttribute {
-  def s: String = attr.name
+implicit class DslAttr(override val attr: UnresolvedAttribute) extends 
ImplicitAttribute {
+  def s: String = attr.sql
 }
 
 abstract class ImplicitAttribute extends ImplicitOperators {
@@ -280,90 +280,83 @@ package object dsl {
   def expr: UnresolvedAttribute = attr
   def attr: UnresolvedAttribute = analysis.UnresolvedAttribute(s)
 
+  private def attrRef(dataType: DataType): AttributeReference =
+AttributeReference(attr.nameParts.last, dataType)(qualifier = 
attr.nameParts.init)
+
   /** Creates a new AttributeReference of type boolean */
-  def boolean: AttributeReference 

[spark] branch master updated (50b652e241f -> 331e5bb25f8)

2023-04-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 50b652e241f [SPARK-43226] Define extractors for file-constant metadata
 add 331e5bb25f8 [SPARK-43204][SQL] Align MERGE assignments with table 
attributes

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/analysis/Analyzer.scala |   5 +-
 .../sql/catalyst/analysis/AssignmentUtils.scala|  61 +-
 .../ResolveRowLevelCommandAssignments.scala|  45 +-
 .../sql/catalyst/plans/logical/v2Commands.scala|  15 +
 .../execution/command/AlignAssignmentsSuite.scala  | 211 +
 .../command/AlignMergeAssignmentsSuite.scala   | 937 +
 .../command/AlignUpdateAssignmentsSuite.scala  | 169 +---
 7 files changed, 1266 insertions(+), 177 deletions(-)
 create mode 100644 
sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlignAssignmentsSuite.scala
 create mode 100644 
sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlignMergeAssignmentsSuite.scala


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-43226] Define extractors for file-constant metadata

2023-04-25 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 50b652e241f [SPARK-43226] Define extractors for file-constant metadata
50b652e241f is described below

commit 50b652e241f7e31b99303359ec53e26a8989a4f0
Author: Ryan Johnson 
AuthorDate: Tue Apr 25 16:11:21 2023 +0800

[SPARK-43226] Define extractors for file-constant metadata

### What changes were proposed in this pull request?

File-source constant metadata columns are often derived indirectly from 
file-level metadata values rather than exposing those values directly. Add 
support for metadata extractors, so that we can express such columns in a 
generic way.

### Why are the changes needed?

Allows expressing the existing file-source metadata columns in a generic way (previously they were hard-wired), and also allows lazily deriving expensive metadata values only when the query actually selects them.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test. Plus, existing file-source metadata unit tests pass.
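
For context, the user-facing surface this generalizes is the hidden `_metadata` struct that file sources already expose; a short sketch (the path is a placeholder):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/tmp/some_table")  # placeholder path
# Fields like file_path and file_size are constant per file; they are the kind
# of values the new extractors produce, only when the query selects them.
df.select("*", "_metadata.file_path", "_metadata.file_size").show()
```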

Closes #40885 from 
ryan-johnson-databricks/file-constant-metadata-extractors.

Authored-by: Ryan Johnson 
Signed-off-by: Wenchen Fan 
---
 .../spark/sql/execution/DataSourceScanExec.scala   |  33 +++---
 .../sql/execution/datasources/FileFormat.scala | 124 ++---
 .../sql/execution/datasources/FileScanRDD.scala|  70 +++-
 .../datasources/PartitioningAwareFileIndex.scala   |  10 +-
 .../FileSourceCustomMetadataStructSuite.scala  |  35 +-
 5 files changed, 156 insertions(+), 116 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 0400a2b6abc..0d5091f4a97 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -548,10 +548,9 @@ case class FileSourceScanExec(
 hadoopConf = 
relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options))
 
 val readRDD = if (bucketedScan) {
-  createBucketedReadRDD(relation.bucketSpec.get, readFile, 
dynamicallySelectedPartitions,
-relation)
+  createBucketedReadRDD(relation.bucketSpec.get, readFile, 
dynamicallySelectedPartitions)
 } else {
-  createReadRDD(readFile, dynamicallySelectedPartitions, relation)
+  createReadRDD(readFile, dynamicallySelectedPartitions)
 }
 sendDriverMetrics()
 readRDD
@@ -617,13 +616,11 @@ case class FileSourceScanExec(
* @param bucketSpec the bucketing spec.
* @param readFile a function to read each (part of a) file.
* @param selectedPartitions Hive-style partition that are part of the read.
-   * @param fsRelation [[HadoopFsRelation]] associated with the read.
*/
   private def createBucketedReadRDD(
   bucketSpec: BucketSpec,
   readFile: (PartitionedFile) => Iterator[InternalRow],
-  selectedPartitions: Array[PartitionDirectory],
-  fsRelation: HadoopFsRelation): RDD[InternalRow] = {
+  selectedPartitions: Array[PartitionDirectory]): RDD[InternalRow] = {
 logInfo(s"Planning with ${bucketSpec.numBuckets} buckets")
 val filesGroupedToBuckets =
   selectedPartitions.flatMap { p =>
@@ -660,9 +657,10 @@ case class FileSourceScanExec(
   }
 }
 
-new FileScanRDD(fsRelation.sparkSession, readFile, filePartitions,
-  new StructType(requiredSchema.fields ++ 
fsRelation.partitionSchema.fields),
-  fileConstantMetadataColumns, new 
FileSourceOptions(CaseInsensitiveMap(relation.options)))
+new FileScanRDD(relation.sparkSession, readFile, filePartitions,
+  new StructType(requiredSchema.fields ++ relation.partitionSchema.fields),
+  fileConstantMetadataColumns, 
relation.fileFormat.fileConstantMetadataExtractors,
+  new FileSourceOptions(CaseInsensitiveMap(relation.options)))
   }
 
   /**
@@ -671,20 +669,18 @@ case class FileSourceScanExec(
*
* @param readFile a function to read each (part of a) file.
* @param selectedPartitions Hive-style partition that are part of the read.
-   * @param fsRelation [[HadoopFsRelation]] associated with the read.
*/
   private def createReadRDD(
   readFile: (PartitionedFile) => Iterator[InternalRow],
-  selectedPartitions: Array[PartitionDirectory],
-  fsRelation: HadoopFsRelation): RDD[InternalRow] = {
-val openCostInBytes = 
fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes
+  selectedPartitions: Array[PartitionDirectory]): RDD[InternalRow] = {
+val openCostInBytes = 
relation.sparkSession.sessionState.conf.filesOpenCostInBytes
 

[spark] branch master updated (71969fb9958 -> 4b60de63770)

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 71969fb9958 [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the 
memory requirement in torch-related tests
 add 4b60de63770 [SPARK-43243][PYTHON][CONNECT] Add level param to 
printSchema for Python

No new revisions were added by this update.

Summary of changes:
 .../src/main/protobuf/spark/connect/base.proto |   3 +
 .../service/SparkConnectAnalyzeHandler.scala   |   8 +-
 python/pyspark/sql/connect/client.py   |   3 +
 python/pyspark/sql/connect/dataframe.py|  10 +-
 python/pyspark/sql/connect/proto/base_pb2.py   | 214 ++---
 python/pyspark/sql/connect/proto/base_pb2.pyi  |  19 +-
 python/pyspark/sql/dataframe.py|  29 ++-
 python/pyspark/sql/tests/test_dataframe.py |  17 ++
 8 files changed, 186 insertions(+), 117 deletions(-)
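
A quick usage sketch of the new parameter (assuming it is exposed as `level`, matching the Scala `printSchema(level)` API; the toy schema is made up):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(1).selectExpr("named_struct('inner', named_struct('leaf', id)) AS s")

df.printSchema()         # full nested schema
df.printSchema(level=1)  # only the top level: `s` is shown without its nested fields
```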


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the memory requirement in torch-related tests

2023-04-25 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 71969fb9958 [SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the 
memory requirement in torch-related tests
71969fb9958 is described below

commit 71969fb9958f0022aabcad36c89a461029cb3b8c
Author: Ruifeng Zheng 
AuthorDate: Tue Apr 25 15:28:05 2023 +0800

[SPARK-43231][ML][PYTHON][CONNECT][TESTS] Reduce the memory requirement in 
torch-related tests

### What changes were proposed in this pull request?
Reduce the memory requirement in torch-related tests

### Why are the changes needed?
The computation in the torch distributor actually happens in external torch processes, and GitHub Actions resources are very limited, so this PR tries to make the related tests more stable.

### Does this PR introduce _any_ user-facing change?
no, test-only

### How was this patch tested?
CI; let me keep merging incoming commits from master to see whether this change is stable enough.
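
For context, a minimal sketch of the kind of session configuration the diff below tightens: a small `local-cluster` master (2 workers, 2 cores, 512 MB each) plus explicit driver/executor memory caps so the external torch processes fit within the GitHub Actions resources.

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local-cluster[2,2,512]")        # 2 workers, 2 cores, 512 MB each
    .config("spark.driver.memory", "512M")
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)
```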

Closes #40874 from zhengruifeng/torch_reduce_memory.

Lead-authored-by: Ruifeng Zheng 
Co-authored-by: Ruifeng Zheng 
Signed-off-by: Ruifeng Zheng 
---
 python/pyspark/ml/tests/connect/test_parity_torch_distributor.py | 4 ++--
 python/pyspark/ml/torch/tests/test_distributor.py| 8 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py 
b/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py
index 55ea99a6540..b855332f96c 100644
--- a/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py
+++ b/python/pyspark/ml/tests/connect/test_parity_torch_distributor.py
@@ -64,7 +64,7 @@ class TorchDistributorLocalUnitTestsOnConnect(
 builder = builder.config(
 "spark.driver.resource.gpu.discoveryScript", 
cls.gpu_discovery_script_file_name
 )
-cls.spark = builder.remote("local-cluster[2,2,1024]").getOrCreate()
+cls.spark = builder.remote("local-cluster[2,2,512]").getOrCreate()
 
 @classmethod
 def tearDownClass(cls):
@@ -126,7 +126,7 @@ class TorchDistributorDistributedUnitTestsOnConnect(
 builder = builder.config(
 "spark.worker.resource.gpu.discoveryScript", 
cls.gpu_discovery_script_file_name
 )
-cls.spark = builder.remote("local-cluster[2,2,1024]").getOrCreate()
+cls.spark = builder.remote("local-cluster[2,2,512]").getOrCreate()
 
 @classmethod
 def tearDownClass(cls):
diff --git a/python/pyspark/ml/torch/tests/test_distributor.py 
b/python/pyspark/ml/torch/tests/test_distributor.py
index ebd859031bd..9fd0b4cba94 100644
--- a/python/pyspark/ml/torch/tests/test_distributor.py
+++ b/python/pyspark/ml/torch/tests/test_distributor.py
@@ -148,6 +148,8 @@ def get_local_mode_conf():
 return {
 "spark.test.home": SPARK_HOME,
 "spark.driver.resource.gpu.amount": "3",
+"spark.driver.memory": "512M",
+"spark.executor.memory": "512M",
 }
 
 
@@ -158,6 +160,8 @@ def get_distributed_mode_conf():
 "spark.task.cpus": "2",
 "spark.task.resource.gpu.amount": "1",
 "spark.executor.resource.gpu.amount": "1",
+"spark.driver.memory": "512M",
+"spark.executor.memory": "512M",
 }
 
 
@@ -412,7 +416,7 @@ class 
TorchDistributorLocalUnitTests(TorchDistributorLocalUnitTestsMixin, unitte
 "spark.driver.resource.gpu.discoveryScript", 
cls.gpu_discovery_script_file_name
 )
 
-sc = SparkContext("local-cluster[2,2,1024]", cls.__name__, conf=conf)
+sc = SparkContext("local-cluster[2,2,512]", cls.__name__, conf=conf)
 cls.spark = SparkSession(sc)
 
 @classmethod
@@ -502,7 +506,7 @@ class TorchDistributorDistributedUnitTests(
 "spark.worker.resource.gpu.discoveryScript", 
cls.gpu_discovery_script_file_name
 )
 
-sc = SparkContext("local-cluster[2,2,1024]", cls.__name__, conf=conf)
+sc = SparkContext("local-cluster[2,2,512]", cls.__name__, conf=conf)
 cls.spark = SparkSession(sc)
 
 @classmethod


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org