This is an automated email from the ASF dual-hosted git repository.
mgrund pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark-connect-go.git
The following commit(s) were added to refs/heads/master by this push:
new d24eec4 Supporting Spark-Submit for Spark Connect Go
d24eec4 is described below
commit d24eec4473d734c06495684a386c061b930d7889
Author: Martin Grund <[email protected]>
AuthorDate: Tue Dec 31 13:08:11 2024 +0100
Supporting Spark-Submit for Spark Connect Go
### What changes were proposed in this pull request?
This patch adds an example runner script and wrapping Java application to
submit an application written and compiled using Spark Connect Go to a Spark
cluster using Spark Connect.
The easiest way to test this approach is as follows:
```bash
export SPARK_HOME=<path_to_spark>
cd java
sbt publishLocal
./run.sh
../cmd/spark-connect-example-spark-session/spark-connect-example-spark-session
```
This `run.sh` wrapper script is primarily meant for illustrative purposes
and can be extended to work with any Spark distribution. Under the hood it
essentially calls `spark-submit` as follows.
```
$SPARK_HOME/bin/spark-submit \
--master local[4] \
--files $BINARY_PATH \
--conf spark.golang.binary=$BINARY_NAME \
--class org.apache.spark.golang.Runner \
--packages org.apache.spark:spark-connect_$SCALA_VERSION:$SPARK_VERSION \
target/scala-$SCALA_VERSION/sparkconnectgorunner_$SCALA_VERSION-0.1.0-SNAPSHOT.jar
```

### Why are the changes needed?
Compatibility.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
* Manual testing
* Added the example application to the GitHub CI action so that it's run on
every build.
Closes #98 from grundprinzip/spark_submit.
Authored-by: Martin Grund <[email protected]>
Signed-off-by: Martin Grund <[email protected]>
---
.github/workflows/build.yml | 10 ++++
.gitignore | 2 +
Makefile | 10 ++--
README.md | 6 +++
dev/.rat-excludes | 3 +-
java/.gitignore | 2 +
java/README.md | 29 +++++++++++
java/build.sbt | 30 ++++++++++++
java/project/build.properties | 1 +
java/run.sh | 51 ++++++++++++++++++++
.../scala/org/apache/spark/golang/Runner.scala | 56 ++++++++++++++++++++++
spark/client/conf.go | 15 ++++++
12 files changed, 209 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index fdb8d6f..03a5f1a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -74,6 +74,9 @@ jobs:
path: |
/home/runner/deps/spark-${{ env.SPARK_VERSION }}-bin-hadoop${{
env.HADOOP_VERSION }}
+ - name: Setup SBT
+ uses: sbt/setup-sbt@v1
+
- name: Setup Apache Spark
if: steps.cache.outputs.cache-hit != 'true'
run: |
@@ -98,6 +101,13 @@ jobs:
make gen
make
make test
+ - name: Run Example Spark Submit Application
+ run: |
+ export SPARK_HOME=/home/runner/deps/spark-${{ env.SPARK_VERSION
}}-bin-hadoop${{ env.HADOOP_VERSION }}
+ make
+ cd java
+ sbt publishLocal
+ ./run.sh
../cmd/spark-connect-example-spark-session/spark-connect-example-spark-session
- name: Run Integration Test
run: |
export SPARK_HOME=/home/runner/deps/spark-${{ env.SPARK_VERSION
}}-bin-hadoop${{ env.HADOOP_VERSION }}
diff --git a/.gitignore b/.gitignore
index 3f791f9..f51c9fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,5 @@ target
lib
deps
+
+.DS_Store
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 999fea5..105bac8 100644
--- a/Makefile
+++ b/Makefile
@@ -40,7 +40,7 @@ GOFUMPT_SPLIT_LONG_LINES := on
## Build tools
BUF := $(GO) run github.com/bufbuild/buf/cmd/[email protected]
-BINARIES :=
cmd/spark-connect-example-spark-session
cmd/spark-connect-example-raw-grpc-client
+BINARIES :=
cmd/spark-connect-example-spark-session/spark-connect-example-spark-session
cmd/spark-connect-example-raw-grpc-client/spark-connect-example-raw-grpc-client
# Define the location of SPARK_HOME because we need that to depend on the
build paths
MAKEFILE_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
@@ -54,14 +54,14 @@ all: build
build: $(BUILD_OUTPUT) $(BINARIES) internal/generated.out
-cmd/spark-connect-example-raw-grpc-client: $(GOFILES_BUILD)
+cmd/spark-connect-example-raw-grpc-client/spark-connect-example-raw-grpc-client:
$(GOFILES_BUILD)
@echo ">> BUILD, output = $@"
- @cd $@ && $(GO) build -o $(notdir $@) $(BUILDFLAGS)
+ @cd $(dir $@) && $(GO) build -o $(notdir $@) $(BUILDFLAGS)
@printf '%s\n' '$(OK)'
-cmd/spark-connect-example-spark-session: $(GOFILES_BUILD)
+cmd/spark-connect-example-spark-session/spark-connect-example-spark-session:
$(GOFILES_BUILD)
@echo ">> BUILD, output = $@"
- @cd $@ && $(GO) build -o $(notdir $@) $(BUILDFLAGS)
+ @cd $(dir $@) && $(GO) build -o $(notdir $@) $(BUILDFLAGS)
@printf '%s\n' '$(OK)'
internal/generated.out:
diff --git a/README.md b/README.md
index 1bfaffe..9c4dd32 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,12 @@ Step 5: Run the example Go application.
go run cmd/spark-connect-example-spark-session/main.go
```
+## Running Spark Connect Go Application in a Spark Cluster
+
+To run the Spark Connect Go application in a Spark Cluster, you need to build
the Go application and submit it to the Spark Cluster. You can find a more
detailed example runner and wrapper script in the `java` directory.
+
+See the guide here: [Sample Spark-Submit Wrapper](java/README.md).
+
## How to write Spark Connect Go Application in your own project
See [Quick Start Guide](quick-start.md)
diff --git a/dev/.rat-excludes b/dev/.rat-excludes
index d21505b..d0fea56 100644
--- a/dev/.rat-excludes
+++ b/dev/.rat-excludes
@@ -14,4 +14,5 @@ target
generated.out
go.sum
deps
-cov.report
\ No newline at end of file
+cov.report
+build.properties
\ No newline at end of file
diff --git a/java/.gitignore b/java/.gitignore
new file mode 100644
index 0000000..a6d7d2c
--- /dev/null
+++ b/java/.gitignore
@@ -0,0 +1,2 @@
+project
+target
\ No newline at end of file
diff --git a/java/README.md b/java/README.md
new file mode 100644
index 0000000..20237ad
--- /dev/null
+++ b/java/README.md
@@ -0,0 +1,29 @@
+# Sample Spark-Submit Wrapper
+
+This directory provides a simple wrapper library that can be used to submit a
Spark Connect Go application to a Spark Cluster.
+
+## Wrapper Library
+
+The wrapper library expects two variable input values:
+
+1. The path to the binary file that contains the Spark Connect Go application.
This path is specified via the Spark conf property `spark.golang.binary`.
+2. The actual binary has to be submitted as part of the application using the
`--files` parameter to the `spark-submit` script.
+
+Building the library can be done using:
+
+```bash
+sbt package
+```
+
+## Run Script
+
+The `run.sh` script is a simple script that can be used to submit a Spark
Connect Go application to a Spark Cluster. The script can be called as follows:
+
+```bash
+export SPARK_HOME=/path/to/spark
+./run.sh
../cmd/spark-connect-example-spark-session/spark-connect-example-spark-session
+```
+
+When this is called from the current directory and with the Spark Connect
Golang client built, it will submit the example application to the Spark
Cluster.
+
+The `run.sh` script can be modified according to your needs.
\ No newline at end of file
diff --git a/java/build.sbt b/java/build.sbt
new file mode 100644
index 0000000..49d1484
--- /dev/null
+++ b/java/build.sbt
@@ -0,0 +1,30 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+ThisBuild / version := "0.1.0-SNAPSHOT"
+
+ThisBuild / scalaVersion := "2.12.19"
+
+lazy val root = (project in file("."))
+ .settings(
+ name := "SparkConnectGoRunner"
+ )
+
+libraryDependencies += "org.apache.spark" %% "spark-sql-api" % "3.5.4"
+libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.4"
+libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.4"
+libraryDependencies += "org.apache.spark" %% "spark-connect-common" % "3.5.4"
+libraryDependencies += "org.apache.spark" %% "spark-connect" % "3.5.4"
\ No newline at end of file
diff --git a/java/project/build.properties b/java/project/build.properties
new file mode 100644
index 0000000..73df629
--- /dev/null
+++ b/java/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.10.7
diff --git a/java/run.sh b/java/run.sh
new file mode 100755
index 0000000..ed7cb97
--- /dev/null
+++ b/java/run.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+SCALA_VERSION=2.12
+SPARK_VERSION=3.5.4
+
+if [ -z "$SPARK_HOME" ]; then
+ echo "SPARK_HOME must be set to run this script."
+ exit 1
+fi
+
+BINARY_PATH=$1
+
+if [ -z "$BINARY_PATH" ]; then
+ echo "Usage: $0 <path-to-binary>"
+ exit 1
+fi
+
+# Check if the binary exists.
+if [ ! -f "$BINARY_PATH" ]; then
+ echo "Binary not found: $BINARY_PATH, make sure the path is valid."
+ exit 1
+fi
+
+# Get the absolute path of the binary.
+BINARY_PATH=$(realpath $BINARY_PATH)
+BINARY_NAME=$(basename $BINARY_PATH)
+
+# Call the spark-submit script.
+$SPARK_HOME/bin/spark-submit \
+ --master local[4] \
+ --files $BINARY_PATH \
+ --conf spark.golang.binary=$BINARY_NAME \
+ --class org.apache.spark.golang.Runner \
+ --packages org.apache.spark:spark-connect_$SCALA_VERSION:$SPARK_VERSION \
+
target/scala-$SCALA_VERSION/sparkconnectgorunner_$SCALA_VERSION-0.1.0-SNAPSHOT.jar
\ No newline at end of file
diff --git a/java/src/main/scala/org/apache/spark/golang/Runner.scala
b/java/src/main/scala/org/apache/spark/golang/Runner.scala
new file mode 100644
index 0000000..ee469ff
--- /dev/null
+++ b/java/src/main/scala/org/apache/spark/golang/Runner.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.golang
+
+import scala.sys.process._
+
+import org.apache.spark.{SparkContext, SparkFiles}
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.connect.service.SparkConnectService
+
+/**
+ * This is the main entry point for the Spark Connect Go runner.
+ *
+ * To run any Go code on your Spark cluster using spark-submit, you can use
+ * this very simple wrapper to do so.
+ */
+object Runner extends Logging {
+ def main(args: Array[String]): Unit = {
+ // Instantiate a new Spark Context.
+ val ctx = SparkContext.getOrCreate()
+ // Start the SparkConnect service which will listen for incoming requests.
+ SparkConnectService.start(ctx)
+
+ // Create a new Spark Session to fetch the port configuration that the
service
+ // listens on.
+ val spark = SparkSession.builder().getOrCreate()
+ val port = spark.conf.get("spark.connect.grpc.binding.port").toInt
+
+ // Fetch the binary of the program to be executed.
+ val bin = spark.conf.get("spark.golang.binary")
+
+ // Fetch the local path of the binary.
+ val path = SparkFiles.get(bin)
+ val process = Process(path, None, "SPARK_REMOTE" ->
s"sc://localhost:$port")
+ process.!
+ logWarning("Stopping Spark Connect service")
+ SparkConnectService.stop()
+ ctx.stop()
+ }
+}
\ No newline at end of file
diff --git a/spark/client/conf.go b/spark/client/conf.go
index ddfe8fc..61c1c5c 100644
--- a/spark/client/conf.go
+++ b/spark/client/conf.go
@@ -1,3 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
package client
import (
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]