This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 2de9160a1 [CI] Docker build improvements (#1425)
2de9160a1 is described below
commit 2de9160a14f03407463126f5277b144599cfa910
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Fri May 24 14:38:09 2024 +0800
[CI] Docker build improvements (#1425)
* Improve docker file
* Make it work for both latest version and released version
* Fix build for macos
* Fix buildx for macos
* Install docker-buildx for macos-12
* Change matrix definition for docker build
* Check total physical memory and DRIVER_MEM, EXECUTOR_MEM configurations
before launching
* Fix gresearch spark-extension version and improve code for detecting
spark versions
* Use array form of CMD to allow control signals to be passed to jupyter lab
---
.github/workflows/docker-build.yml | 23 +++++---
docker/sedona-spark-jupyterlab/.dockerignore | 3 -
docker/sedona-spark-jupyterlab/build.sh | 64 +++++++++++++++++++--
.../sedona-jupyterlab.dockerfile | 11 +---
.../sedona-jupyterlab.dockerfile.dockerignore | 8 +++
docker/sedona-spark-jupyterlab/start.sh | 66 ++++++++++++++++++++++
docker/sedona.sh | 16 +++---
7 files changed, 157 insertions(+), 34 deletions(-)
diff --git a/.github/workflows/docker-build.yml
b/.github/workflows/docker-build.yml
index d02282e60..4aa8108d4 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -25,13 +25,17 @@ jobs:
fail-fast: true
matrix:
os: ['ubuntu-latest', 'macos-12']
+ spark: ['3.5.1', '3.4.3', '3.3.4']
include:
- - spark: 3.4.1
- sedona: 1.4.1
- - spark: 3.4.1
- sedona: latest
- - spark: 3.3.2
- sedona: latest
+ - spark: 3.5.1
+ sedona: "latest"
+ geotools: "auto"
+ - spark: 3.4.3
+ sedona: 1.6.0
+ geotools: 28.2
+ - spark: 3.3.4
+ sedona: 1.6.0
+ geotools: 28.2
runs-on: ${{ matrix.os }}
defaults:
run:
@@ -54,7 +58,12 @@ jobs:
run: |
brew install docker
colima start
+ DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
+ mkdir -p $DOCKER_CONFIG/cli-plugins
+ curl -SL
https://github.com/docker/buildx/releases/download/v0.14.1/buildx-v0.14.1.darwin-amd64
-o $DOCKER_CONFIG/cli-plugins/docker-buildx
+ chmod +x $DOCKER_CONFIG/cli-plugins/docker-buildx
- env:
SPARK_VERSION: ${{ matrix.spark }}
SEDONA_VERSION: ${{ matrix.sedona }}
- run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION}
${SEDONA_VERSION}
+ GEOTOOLS_VERSION: ${{ matrix.geotools }}
+ run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION}
${SEDONA_VERSION} local ${GEOTOOLS_VERSION}
diff --git a/docker/sedona-spark-jupyterlab/.dockerignore
b/docker/sedona-spark-jupyterlab/.dockerignore
deleted file mode 100644
index 4b703cfc8..000000000
--- a/docker/sedona-spark-jupyterlab/.dockerignore
+++ /dev/null
@@ -1,3 +0,0 @@
-Dockerfile
-compose.yml
-README.md
diff --git a/docker/sedona-spark-jupyterlab/build.sh
b/docker/sedona-spark-jupyterlab/build.sh
index 215188aba..c13c91368 100755
--- a/docker/sedona-spark-jupyterlab/build.sh
+++ b/docker/sedona-spark-jupyterlab/build.sh
@@ -20,26 +20,77 @@
SPARK_VERSION=$1
SEDONA_VERSION=$2
BUILD_MODE=$3
+GEOTOOLS_VERSION=${4:-auto}
-lower_version=$(echo -e ${SPARK_VERSION}"\n3.4" | sort -V | head -n1)
-if [ $lower_version = "3.4" ]; then
- SEDONA_SPARK_VERSION=3.4
-else
+SEDONA_SPARK_VERSION=${SPARK_VERSION:0:3}
+if [ ${SPARK_VERSION:0:1} -eq "3" ] && [ ${SPARK_VERSION:2:1} -le "3" ]; then
+ # 3.0, 3.1, 3.2, 3.3
SEDONA_SPARK_VERSION=3.0
fi
+# Function to compare two version numbers
+version_gt() {
+ # Compare two version numbers
+ # Returns 0 if the first version is greater, 1 otherwise
+ [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" != "$1" ]
+}
+
+# Function to get the latest version of a Maven package
+get_latest_version_with_suffix() {
+ BASE_URL=$1
+ SUFFIX=$2
+
+ # Fetch the maven-metadata.xml file
+ METADATA_URL="${BASE_URL}maven-metadata.xml"
+ METADATA_XML=$(curl -s $METADATA_URL)
+
+ # Extract versions from the XML
+ VERSIONS=$(echo "$METADATA_XML" | grep -o '<version>[^<]*</version>' | awk
-F'[<>]' '{print $3}')
+
+ LATEST_VERSION=""
+
+ # Filter versions that end with the specified suffix and find the largest one
+ for VERSION in $VERSIONS; do
+ if [[ $VERSION == *$SUFFIX ]]; then
+ if [[ -z $LATEST_VERSION ]] || version_gt $VERSION $LATEST_VERSION; then
+ LATEST_VERSION=$VERSION
+ fi
+ fi
+ done
+
+ if [[ -z $LATEST_VERSION ]]; then
+ exit 1
+ else
+ echo $LATEST_VERSION
+ fi
+}
+
+if [ "$GEOTOOLS_VERSION" = "auto" ]; then
+ GEOTOOLS_VERSION=$(mvn help:evaluate -Dexpression=geotools.version -q
-DforceStdout)
+ echo "GeoTools version inferred from pom.xml: $GEOTOOLS_VERSION"
+fi
+
+GEOTOOLS_WRAPPER_VERSION="${SEDONA_VERSION}-${GEOTOOLS_VERSION}"
if [ "$SEDONA_VERSION" = "latest" ]; then
+ GEOTOOLS_WRAPPER_VERSION=$(get_latest_version_with_suffix
"https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"
"$GEOTOOLS_VERSION")
+ if [ -z "$GEOTOOLS_WRAPPER_VERSION" ]; then
+ echo "No geotools-wrapper version with suffix $GEOTOOLS_VERSION"
+ exit 1
+ fi
+ echo "Using latest geotools-wrapper version: $GEOTOOLS_WRAPPER_VERSION"
+
# The compilation must take place outside Docker to avoid unnecessary
maven packages
- mvn clean install -DskipTests -Dspark=${SEDONA_SPARK_VERSION} -Dgeotools
-Dscala=2.12
+ mvn clean install -DskipTests -Dspark=${SEDONA_SPARK_VERSION} -Dscala=2.12
fi
# -- Building the image
if [ -z "$BUILD_MODE" ] || [ "$BUILD_MODE" = "local" ]; then
# If local, build the image for the local environment
- docker build \
+ docker buildx build \
--build-arg spark_version="${SPARK_VERSION}" \
--build-arg sedona_version="${SEDONA_VERSION}" \
+ --build-arg geotools_wrapper_version="${GEOTOOLS_WRAPPER_VERSION}" \
-f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
-t apache/sedona:${SEDONA_VERSION} .
else
@@ -50,6 +101,7 @@ else
--output type=registry \
--build-arg spark_version="${SPARK_VERSION}" \
--build-arg sedona_version="${SEDONA_VERSION}" \
+ --build-arg geotools_wrapper_version="${GEOTOOLS_WRAPPER_VERSION}" \
-f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
-t apache/sedona:${SEDONA_VERSION} .
fi
diff --git a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
index 6eefd3664..1be8b8f67 100644
--- a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
+++ b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
@@ -73,12 +73,5 @@ EXPOSE 4040
WORKDIR ${SHARED_WORKSPACE}
-
-
-CMD DRIVER_MEM=${DRIVER_MEM:-4g} && \
- EXECUTOR_MEM=${EXECUTOR_MEM:-4g} && \
- echo "spark.driver.memory $DRIVER_MEM" >>
${SPARK_HOME}/conf/spark-defaults.conf && \
- echo "spark.executor.memory $EXECUTOR_MEM" >>
${SPARK_HOME}/conf/spark-defaults.conf && \
- service ssh start && \
- ${SPARK_HOME}/sbin/start-all.sh && \
- jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root
--NotebookApp.token=
+COPY docker/sedona-spark-jupyterlab/start.sh /opt/
+CMD ["/bin/bash", "/opt/start.sh"]
diff --git
a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile.dockerignore
b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile.dockerignore
new file mode 100644
index 000000000..e12f07ae2
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile.dockerignore
@@ -0,0 +1,8 @@
+# Ignore everything
+*
+
+# Allow files and folders with a pattern starting with !
+!docker/**
+!docs/usecases/**
+!python/**
+!spark-shaded/target/**
diff --git a/docker/sedona-spark-jupyterlab/start.sh
b/docker/sedona-spark-jupyterlab/start.sh
new file mode 100755
index 000000000..840d80c6b
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/start.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+DRIVER_MEM=${DRIVER_MEM:-4g}
+EXECUTOR_MEM=${EXECUTOR_MEM:-4g}
+
+# Function to convert memory string to megabytes
+convert_to_mb() {
+ local mem_str=$1
+ local mem_value=${mem_str%[gGmM]}
+ local mem_unit=${mem_str: -1}
+
+ case $mem_unit in
+ [gG])
+ echo $(($mem_value * 1024))
+ ;;
+ [mM])
+ echo $mem_value
+ ;;
+ *)
+ echo "Invalid memory unit: $mem_str" >&2
+ return 1
+ ;;
+ esac
+}
+
+# Convert DRIVER_MEM and EXECUTOR_MEM to megabytes
+DRIVER_MEM_MB=$(convert_to_mb $DRIVER_MEM)
+if [ $? -ne 0 ]; then
+ echo "Error converting DRIVER_MEM to megabytes." >&2
+ exit 1
+fi
+
+EXECUTOR_MEM_MB=$(convert_to_mb $EXECUTOR_MEM)
+if [ $? -ne 0 ]; then
+ echo "Error converting EXECUTOR_MEM to megabytes." >&2
+ exit 1
+fi
+
+# Get total physical memory in megabytes
+TOTAL_PHYSICAL_MEM_MB=$(free -m | awk '/^Mem:/{print $2}')
+
+# Calculate the total required memory
+TOTAL_REQUIRED_MEM_MB=$(($DRIVER_MEM_MB + $EXECUTOR_MEM_MB))
+
+# Compare total required memory with total physical memory
+if [ $TOTAL_REQUIRED_MEM_MB -gt $TOTAL_PHYSICAL_MEM_MB ]; then
+ echo "Error: Insufficient memory" >&2
+ echo " total: $TOTAL_PHYSICAL_MEM_MB MB" >&2
+ echo " required: $TOTAL_REQUIRED_MEM_MB MB (driver: $DRIVER_MEM_MB MB,
executor: $EXECUTOR_MEM_MB MB)" >&2
+ echo "Please tune DRIVER_MEM and EXECUTOR_MEM to smaller values." >&2
+ echo "e.g: docker run -e DRIVER_MEM=2g -e EXECUTOR_MEM=2g ..." >&2
+ exit 1
+fi
+
+# Configure spark
+cp ${SPARK_HOME}/conf/spark-env.sh.template ${SPARK_HOME}/conf/spark-env.sh
+echo "SPARK_WORKER_MEMORY=${EXECUTOR_MEM}" >> ${SPARK_HOME}/conf/spark-env.sh
+echo "spark.driver.memory $DRIVER_MEM" >>
${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.executor.memory $EXECUTOR_MEM" >>
${SPARK_HOME}/conf/spark-defaults.conf
+
+# Start spark standalone cluster
+service ssh start
+${SPARK_HOME}/sbin/start-all.sh
+
+# Start jupyter lab
+exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root
--NotebookApp.token=
diff --git a/docker/sedona.sh b/docker/sedona.sh
index 1a7a0d0ef..6392c5b8a 100755
--- a/docker/sedona.sh
+++ b/docker/sedona.sh
@@ -23,33 +23,31 @@ geotools_wrapper_version=$2
spark_version=$3
spark_extension_version=$4
-lower_version=$(echo -e $spark_version"\n3.4" | sort -V | head -n1)
-if [ $lower_version = "3.4" ]; then
- sedona_spark_version=3.4
-else
+spark_compat_version=${spark_version:0:3}
+sedona_spark_version=${spark_compat_version}
+if [ ${spark_version:0:1} -eq "3" ] && [ ${spark_version:2:1} -le "3" ]; then
+ # 3.0, 3.1, 3.2, 3.3
sedona_spark_version=3.0
fi
if [ $sedona_version = "latest" ]; then
# Code to execute when SEDONA_VERSION is "latest"
cp ${SEDONA_HOME}/spark-shaded/target/sedona-spark-shaded-*.jar
${SPARK_HOME}/jars/
- cd ${SEDONA_HOME}/python;pip3 install shapely==1.8.4;pip3 install .
+ cd ${SEDONA_HOME}/python;pip3 install .
else
# Code to execute when SEDONA_VERSION is not "latest"
# Download Sedona
curl
https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-${sedona_spark_version}_2.12/${sedona_version}/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
-o
$SPARK_HOME/jars/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
# Install Sedona Python
- pip3 install shapely==1.8.4
pip3 install apache-sedona==${sedona_version}
-
fi
# Download gresearch spark extension
-curl
https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.12/${spark_extension_version}-${sedona_spark_version}/spark-extension_2.12-${spark_extension_version}-${sedona_spark_version}.jar
-o
$SPARK_HOME/jars/spark-extension_2.12-${spark_extension_version}-${sedona_spark_version}.jar
+curl
https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.12/${spark_extension_version}-${spark_compat_version}/spark-extension_2.12-${spark_extension_version}-${spark_compat_version}.jar
-o
$SPARK_HOME/jars/spark-extension_2.12-${spark_extension_version}-${spark_compat_version}.jar
# Install Spark extension Python
-pip3 install
pyspark-extension==${spark_extension_version}.${sedona_spark_version}
+pip3 install
pyspark-extension==${spark_extension_version}.${spark_compat_version}
# Download GeoTools jar
curl
https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/${geotools_wrapper_version}/geotools-wrapper-${geotools_wrapper_version}.jar
-o $SPARK_HOME/jars/geotools-wrapper-${geotools_wrapper_version}.jar