This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new 2de9160a1 [CI] Docker build improvements (#1425)
2de9160a1 is described below

commit 2de9160a14f03407463126f5277b144599cfa910
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Fri May 24 14:38:09 2024 +0800

    [CI] Docker build improvements (#1425)
    
    * Improve docker file
    
    * Make it work for both latest version and released version
    
    * Fix build for macos
    
    * Fix buildx for macos
    
    * Install docker-buildx for macos-12
    
    * Change matrix definition for docker build
    
    * Check total physical memory and DRIVER_MEM, EXECUTOR_MEM configurations 
before launching
    
    * Fix gresearch spark-extension version and improve code for detecting 
spark versions
    
    * Use array form of CMD to allow control signals to be passed to jupyter lab
---
 .github/workflows/docker-build.yml                 | 23 +++++---
 docker/sedona-spark-jupyterlab/.dockerignore       |  3 -
 docker/sedona-spark-jupyterlab/build.sh            | 64 +++++++++++++++++++--
 .../sedona-jupyterlab.dockerfile                   | 11 +---
 .../sedona-jupyterlab.dockerfile.dockerignore      |  8 +++
 docker/sedona-spark-jupyterlab/start.sh            | 66 ++++++++++++++++++++++
 docker/sedona.sh                                   | 16 +++---
 7 files changed, 157 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/docker-build.yml 
b/.github/workflows/docker-build.yml
index d02282e60..4aa8108d4 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -25,13 +25,17 @@ jobs:
       fail-fast: true
       matrix:
         os: ['ubuntu-latest', 'macos-12']
+        spark: ['3.5.1', '3.4.3', '3.3.4']
         include:
-          - spark: 3.4.1
-            sedona: 1.4.1
-          - spark: 3.4.1
-            sedona: latest
-          - spark: 3.3.2
-            sedona: latest
+          - spark: 3.5.1
+            sedona: "latest"
+            geotools: "auto"
+          - spark: 3.4.3
+            sedona: 1.6.0
+            geotools: 28.2
+          - spark: 3.3.4
+            sedona: 1.6.0
+            geotools: 28.2
     runs-on: ${{ matrix.os }}
     defaults:
       run:
@@ -54,7 +58,12 @@ jobs:
       run: |
         brew install docker
         colima start
+        DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
+        mkdir -p $DOCKER_CONFIG/cli-plugins
+        curl -SL 
https://github.com/docker/buildx/releases/download/v0.14.1/buildx-v0.14.1.darwin-amd64
 -o $DOCKER_CONFIG/cli-plugins/docker-buildx
+        chmod +x $DOCKER_CONFIG/cli-plugins/docker-buildx
     - env:
         SPARK_VERSION: ${{ matrix.spark }}
         SEDONA_VERSION: ${{ matrix.sedona }}
-      run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} 
${SEDONA_VERSION}
+        GEOTOOLS_VERSION: ${{ matrix.geotools }}
+      run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} 
${SEDONA_VERSION} local ${GEOTOOLS_VERSION}
diff --git a/docker/sedona-spark-jupyterlab/.dockerignore 
b/docker/sedona-spark-jupyterlab/.dockerignore
deleted file mode 100644
index 4b703cfc8..000000000
--- a/docker/sedona-spark-jupyterlab/.dockerignore
+++ /dev/null
@@ -1,3 +0,0 @@
-Dockerfile
-compose.yml
-README.md
diff --git a/docker/sedona-spark-jupyterlab/build.sh 
b/docker/sedona-spark-jupyterlab/build.sh
index 215188aba..c13c91368 100755
--- a/docker/sedona-spark-jupyterlab/build.sh
+++ b/docker/sedona-spark-jupyterlab/build.sh
@@ -20,26 +20,77 @@
 SPARK_VERSION=$1
 SEDONA_VERSION=$2
 BUILD_MODE=$3
+GEOTOOLS_VERSION=${4:-auto}
 
-lower_version=$(echo -e ${SPARK_VERSION}"\n3.4" | sort -V | head -n1)
-if [ $lower_version = "3.4" ]; then
-    SEDONA_SPARK_VERSION=3.4
-else
+SEDONA_SPARK_VERSION=${SPARK_VERSION:0:3}
+if [ ${SPARK_VERSION:0:1} -eq "3" ] && [ ${SPARK_VERSION:2:1} -le "3" ]; then
+    # 3.0, 3.1, 3.2, 3.3
     SEDONA_SPARK_VERSION=3.0
 fi
 
+# Function to compare two version numbers
+version_gt() {
+  # Compare two version numbers
+  # Returns 0 if the first version is greater, 1 otherwise
+  [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" != "$1" ]
+}
+
+# Function to get the latest version of a Maven package
+get_latest_version_with_suffix() {
+  BASE_URL=$1
+  SUFFIX=$2
+
+  # Fetch the maven-metadata.xml file
+  METADATA_URL="${BASE_URL}maven-metadata.xml"
+  METADATA_XML=$(curl -s $METADATA_URL)
+
+  # Extract versions from the XML
+  VERSIONS=$(echo "$METADATA_XML" | grep -o '<version>[^<]*</version>' | awk 
-F'[<>]' '{print $3}')
+
+  LATEST_VERSION=""
+
+  # Filter versions that end with the specified suffix and find the largest one
+  for VERSION in $VERSIONS; do
+    if [[ $VERSION == *$SUFFIX ]]; then
+      if [[ -z $LATEST_VERSION ]] || version_gt $VERSION $LATEST_VERSION; then
+        LATEST_VERSION=$VERSION
+      fi
+    fi
+  done
+
+  if [[ -z $LATEST_VERSION ]]; then
+    exit 1
+  else
+    echo $LATEST_VERSION
+  fi
+}
+
+if [ "$GEOTOOLS_VERSION" = "auto" ]; then
+    GEOTOOLS_VERSION=$(mvn help:evaluate -Dexpression=geotools.version -q 
-DforceStdout)
+    echo "GeoTools version inferred from pom.xml: $GEOTOOLS_VERSION"
+fi
+
+GEOTOOLS_WRAPPER_VERSION="${SEDONA_VERSION}-${GEOTOOLS_VERSION}"
 if [ "$SEDONA_VERSION" = "latest" ]; then
+    GEOTOOLS_WRAPPER_VERSION=$(get_latest_version_with_suffix 
"https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"; 
"$GEOTOOLS_VERSION")
+    if [ -z "$GEOTOOLS_WRAPPER_VERSION" ]; then
+        echo "No geotools-wrapper version with suffix $GEOTOOLS_VERSION"
+        exit 1
+    fi
+    echo "Using latest geotools-wrapper version: $GEOTOOLS_WRAPPER_VERSION"
+
     # The compilation must take place outside Docker to avoid unnecessary 
maven packages
-    mvn clean install -DskipTests  -Dspark=${SEDONA_SPARK_VERSION} -Dgeotools 
-Dscala=2.12
+    mvn clean install -DskipTests -Dspark=${SEDONA_SPARK_VERSION} -Dscala=2.12
 fi
 
 # -- Building the image
 
 if [ -z "$BUILD_MODE" ] || [ "$BUILD_MODE" = "local" ]; then
     # If local, build the image for the local environment
-    docker build \
+    docker buildx build \
     --build-arg spark_version="${SPARK_VERSION}" \
     --build-arg sedona_version="${SEDONA_VERSION}" \
+    --build-arg geotools_wrapper_version="${GEOTOOLS_WRAPPER_VERSION}" \
     -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
     -t apache/sedona:${SEDONA_VERSION} .
 else
@@ -50,6 +101,7 @@ else
     --output type=registry \
     --build-arg spark_version="${SPARK_VERSION}" \
     --build-arg sedona_version="${SEDONA_VERSION}" \
+    --build-arg geotools_wrapper_version="${GEOTOOLS_WRAPPER_VERSION}" \
     -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
     -t apache/sedona:${SEDONA_VERSION} .
 fi
diff --git a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile 
b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
index 6eefd3664..1be8b8f67 100644
--- a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
+++ b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
@@ -73,12 +73,5 @@ EXPOSE 4040
 
 WORKDIR ${SHARED_WORKSPACE}
 
-
-
-CMD DRIVER_MEM=${DRIVER_MEM:-4g} && \
-    EXECUTOR_MEM=${EXECUTOR_MEM:-4g} && \
-    echo "spark.driver.memory $DRIVER_MEM" >> 
${SPARK_HOME}/conf/spark-defaults.conf && \
-    echo "spark.executor.memory $EXECUTOR_MEM" >> 
${SPARK_HOME}/conf/spark-defaults.conf && \
-    service ssh start && \
-    ${SPARK_HOME}/sbin/start-all.sh && \
-    jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root 
--NotebookApp.token=
+COPY docker/sedona-spark-jupyterlab/start.sh /opt/
+CMD ["/bin/bash", "/opt/start.sh"]
diff --git 
a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile.dockerignore 
b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile.dockerignore
new file mode 100644
index 000000000..e12f07ae2
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile.dockerignore
@@ -0,0 +1,8 @@
+# Ignore everything
+*
+
+# Allow files and folders with a pattern starting with !
+!docker/**
+!docs/usecases/**
+!python/**
+!spark-shaded/target/**
diff --git a/docker/sedona-spark-jupyterlab/start.sh 
b/docker/sedona-spark-jupyterlab/start.sh
new file mode 100755
index 000000000..840d80c6b
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/start.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+DRIVER_MEM=${DRIVER_MEM:-4g}
+EXECUTOR_MEM=${EXECUTOR_MEM:-4g}
+
+# Function to convert memory string to megabytes
+convert_to_mb() {
+  local mem_str=$1
+  local mem_value=${mem_str%[gGmM]}
+  local mem_unit=${mem_str: -1}
+
+  case $mem_unit in
+    [gG])
+      echo $(($mem_value * 1024))
+      ;;
+    [mM])
+      echo $mem_value
+      ;;
+    *)
+      echo "Invalid memory unit: $mem_str" >&2
+      return 1
+      ;;
+  esac
+}
+
+# Convert DRIVER_MEM and EXECUTOR_MEM to megabytes
+DRIVER_MEM_MB=$(convert_to_mb $DRIVER_MEM)
+if [ $? -ne 0 ]; then
+  echo "Error converting DRIVER_MEM to megabytes." >&2
+  exit 1
+fi
+
+EXECUTOR_MEM_MB=$(convert_to_mb $EXECUTOR_MEM)
+if [ $? -ne 0 ]; then
+  echo "Error converting EXECUTOR_MEM to megabytes." >&2
+  exit 1
+fi
+
+# Get total physical memory in megabytes
+TOTAL_PHYSICAL_MEM_MB=$(free -m | awk '/^Mem:/{print $2}')
+
+# Calculate the total required memory
+TOTAL_REQUIRED_MEM_MB=$(($DRIVER_MEM_MB + $EXECUTOR_MEM_MB))
+
+# Compare total required memory with total physical memory
+if [ $TOTAL_REQUIRED_MEM_MB -gt $TOTAL_PHYSICAL_MEM_MB ]; then
+    echo "Error: Insufficient memory" >&2
+    echo "  total:    $TOTAL_PHYSICAL_MEM_MB MB" >&2
+    echo "  required: $TOTAL_REQUIRED_MEM_MB MB (driver: $DRIVER_MEM_MB MB, 
executor: $EXECUTOR_MEM_MB MB)" >&2
+    echo "Please tune DRIVER_MEM and EXECUTOR_MEM to smaller values." >&2
+    echo "e.g: docker run -e DRIVER_MEM=2g -e EXECUTOR_MEM=2g ..." >&2
+    exit 1
+fi
+
+# Configure spark
+cp ${SPARK_HOME}/conf/spark-env.sh.template ${SPARK_HOME}/conf/spark-env.sh
+echo "SPARK_WORKER_MEMORY=${EXECUTOR_MEM}" >> ${SPARK_HOME}/conf/spark-env.sh
+echo "spark.driver.memory $DRIVER_MEM" >> 
${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.executor.memory $EXECUTOR_MEM" >> 
${SPARK_HOME}/conf/spark-defaults.conf
+
+# Start spark standalone cluster
+service ssh start
+${SPARK_HOME}/sbin/start-all.sh
+
+# Start jupyter lab
+exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root 
--NotebookApp.token=
diff --git a/docker/sedona.sh b/docker/sedona.sh
index 1a7a0d0ef..6392c5b8a 100755
--- a/docker/sedona.sh
+++ b/docker/sedona.sh
@@ -23,33 +23,31 @@ geotools_wrapper_version=$2
 spark_version=$3
 spark_extension_version=$4
 
-lower_version=$(echo -e $spark_version"\n3.4" | sort -V | head -n1)
-if [ $lower_version = "3.4" ]; then
-    sedona_spark_version=3.4
-else
+spark_compat_version=${spark_version:0:3}
+sedona_spark_version=${spark_compat_version}
+if [ ${spark_version:0:1} -eq "3" ] && [ ${spark_version:2:1} -le "3" ]; then
+    # 3.0, 3.1, 3.2, 3.3
     sedona_spark_version=3.0
 fi
 
 if [ $sedona_version = "latest" ]; then
     # Code to execute when SEDONA_VERSION is "latest"
     cp ${SEDONA_HOME}/spark-shaded/target/sedona-spark-shaded-*.jar 
${SPARK_HOME}/jars/
-       cd ${SEDONA_HOME}/python;pip3 install shapely==1.8.4;pip3 install .
+       cd ${SEDONA_HOME}/python;pip3 install .
 else
     # Code to execute when SEDONA_VERSION is not "latest"
     # Download Sedona
        curl 
https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-${sedona_spark_version}_2.12/${sedona_version}/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
 -o 
$SPARK_HOME/jars/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
 
        # Install Sedona Python
-       pip3 install shapely==1.8.4
        pip3 install apache-sedona==${sedona_version}
-
 fi
 
 # Download gresearch spark extension
-curl 
https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.12/${spark_extension_version}-${sedona_spark_version}/spark-extension_2.12-${spark_extension_version}-${sedona_spark_version}.jar
 -o 
$SPARK_HOME/jars/spark-extension_2.12-${spark_extension_version}-${sedona_spark_version}.jar
+curl 
https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.12/${spark_extension_version}-${spark_compat_version}/spark-extension_2.12-${spark_extension_version}-${spark_compat_version}.jar
 -o 
$SPARK_HOME/jars/spark-extension_2.12-${spark_extension_version}-${spark_compat_version}.jar
 
 # Install Spark extension Python
-pip3 install 
pyspark-extension==${spark_extension_version}.${sedona_spark_version}
+pip3 install 
pyspark-extension==${spark_extension_version}.${spark_compat_version}
 
 # Download GeoTools jar
 curl 
https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/${geotools_wrapper_version}/geotools-wrapper-${geotools_wrapper_version}.jar
 -o $SPARK_HOME/jars/geotools-wrapper-${geotools_wrapper_version}.jar

Reply via email to