This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new bb872004 [SEDONA-285] Sedona Spark Jupyterlab docker image (#939)
bb872004 is described below
commit bb872004448faa65bdcfa03657c171480703c929
Author: Jia Yu <[email protected]>
AuthorDate: Wed Aug 2 11:53:20 2023 -0700
[SEDONA-285] Sedona Spark Jupyterlab docker image (#939)
Co-authored-by: Hadiya Kartikey <[email protected]>
Co-authored-by: Kartikey <[email protected]>
Co-authored-by: Kartikey <[email protected]>
Co-authored-by: yyy1000 <[email protected]>
---
.github/workflows/docker-build.yml | 53 +++++++++
binder/Pipfile | 2 +-
docker/sedona-spark-jupyterlab/.dockerignore | 3 +
docker/sedona-spark-jupyterlab/.gitignore | 6 +
docker/sedona-spark-jupyterlab/build.sh | 47 ++++++++
docker/sedona-spark-jupyterlab/requirements.txt | 12 ++
.../sedona-jupyterlab.dockerfile | 73 ++++++++++++
docker/sedona.sh | 45 +++++++
docker/spark.sh | 60 ++++++++++
docs/setup/docker.md | 130 +++++++++++++++++++++
mkdocs.yml | 1 +
11 files changed, 431 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/docker-build.yml
b/.github/workflows/docker-build.yml
new file mode 100644
index 00000000..e03e5bc8
--- /dev/null
+++ b/.github/workflows/docker-build.yml
@@ -0,0 +1,53 @@
+name: Docker build
+
+# Builds the Sedona JupyterLab Docker image on pushes to master that touch
+# docker/** and on pull requests whose base branch matches '*'.
+on:
+  push:
+    branches:
+      - master
+    paths:
+      - 'docker/**'
+  pull_request:
+    branches:
+      - '*'
+
+env:
+  MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=60
+
+jobs:
+  build:
+    strategy:
+      fail-fast: true
+      matrix:
+        os: ['ubuntu-latest', 'macos-latest']
+        # Each Spark/Sedona pair is one matrix value so that it is crossed
+        # with every OS. Listing the pairs under `include` without an `os`
+        # key makes every entry overwrite the values added by the previous
+        # one, so only the last pair would actually be built.
+        versions:
+          - spark: '3.4.1'
+            sedona: '1.4.1'
+          - spark: '3.4.1'
+            sedona: 'latest'
+          - spark: '3.3.2'
+            sedona: 'latest'
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-java@v1
+        with:
+          java-version: 11
+      - name: Cache Maven packages
+        uses: actions/cache@v2
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2
+      - name: Setup docker (missing on MacOS)
+        if: runner.os == 'macOS'
+        run: |
+          brew install docker
+          colima start
+      - name: Build the Docker image
+        env:
+          SPARK_VERSION: ${{ matrix.versions.spark }}
+          SEDONA_VERSION: ${{ matrix.versions.sedona }}
+        run: ./docker/sedona-spark-jupyterlab/build.sh "${SPARK_VERSION}" "${SEDONA_VERSION}"
\ No newline at end of file
diff --git a/binder/Pipfile b/binder/Pipfile
index 47631489..080c3087 100644
--- a/binder/Pipfile
+++ b/binder/Pipfile
@@ -11,7 +11,7 @@ mkdocs="*"
pytest-cov = "*"
[packages]
-pandas="*"
+pandas="1.3.5"
shapely="==1.8.4"
geopandas="==0.11.1"
pyspark="==3.3.2"
diff --git a/docker/sedona-spark-jupyterlab/.dockerignore
b/docker/sedona-spark-jupyterlab/.dockerignore
new file mode 100644
index 00000000..90b05048
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/.dockerignore
@@ -0,0 +1,3 @@
+Dockerfile
+compose.yml
+README.md
\ No newline at end of file
diff --git a/docker/sedona-spark-jupyterlab/.gitignore
b/docker/sedona-spark-jupyterlab/.gitignore
new file mode 100644
index 00000000..66f91c7a
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/.gitignore
@@ -0,0 +1,6 @@
+commands.txt
+docker-compose-orig.yml
+Dockerfile.bak
+log.txt
+examples
+.ipynb_checkpoints
\ No newline at end of file
diff --git a/docker/sedona-spark-jupyterlab/build.sh
b/docker/sedona-spark-jupyterlab/build.sh
new file mode 100755
index 00000000..e3bc7af9
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/build.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+SPARK_VERSION=$1
+SEDONA_VERSION=$2
+BUILD_MODE=$3
+
+if [ "$SEDONA_VERSION" = "latest" ]; then
+ # The compilation must take place outside Docker to avoid unnecessary
maven packages
+ mvn clean install -DskipTests -Dspark=${SEDONA_SPARK_VERSION} -Dgeotools
-Dscala=2.12
+fi
+
+# -- Building the image
+
+if [ -z "$BUILD_MODE" ] || [ "$BUILD_MODE" = "local" ]; then
+ # If local, build the image for the local environment
+ docker build \
+ --build-arg spark_version="${SPARK_VERSION}" \
+ --build-arg sedona_version="${SEDONA_VERSION}" \
+ -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
+ -t sedona/sedona-jupyterlab:${SEDONA_VERSION} .
+else
+ # If release, build the image for cross-platform
+ docker buildx build --platform linux/amd64,linux/arm64 \
+ --progress=plain \
+ --output type=registry \
+ --build-arg spark_version="${SPARK_VERSION}" \
+ --build-arg sedona_version="${SEDONA_VERSION}" \
+ -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
+ -t drjiayu/sedona-jupyterlab:${SEDONA_VERSION} .
+fi
\ No newline at end of file
diff --git a/docker/sedona-spark-jupyterlab/requirements.txt
b/docker/sedona-spark-jupyterlab/requirements.txt
new file mode 100644
index 00000000..9ffcc64f
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/requirements.txt
@@ -0,0 +1,12 @@
+pandas==1.3.5
+fiona==1.8.22
+geopandas==0.10.2
+keplergl==0.3.2
+pydeck==0.8.0
+attrs
+matplotlib
+descartes
+ipywidgets
+jupyterlab-widgets
+ipykernel
+jupyterlab==3.6.4
\ No newline at end of file
diff --git a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
new file mode 100644
index 00000000..af1146ac
--- /dev/null
+++ b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
@@ -0,0 +1,73 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Image that installs Spark and Sedona through docker/spark.sh and
+# docker/sedona.sh, adds the Python requirements and the binder examples,
+# and starts sshd, the Spark daemons and JupyterLab when the container runs.
+FROM ubuntu:22.04
+
+# Build arguments. build.sh overrides spark_version and sedona_version
+# through --build-arg; the rest keep the defaults below unless overridden.
+ARG shared_workspace=/opt/workspace
+ARG spark_version=3.3.2
+ARG hadoop_version=3
+ARG hadoop_s3_version=3.3.4
+ARG aws_sdk_version=1.12.402
+ARG spark_xml_version=0.16.0
+ARG sedona_version=1.4.1
+ARG geotools_wrapper_version=1.4.0-28.2
+
+# Set up envs
+ENV SHARED_WORKSPACE=${shared_workspace}
+ENV SPARK_HOME=/opt/spark
+RUN mkdir ${SPARK_HOME}
+ENV SEDONA_HOME=/opt/sedona
+RUN mkdir ${SEDONA_HOME}
+
+ENV SPARK_MASTER_HOST=localhost
+ENV SPARK_MASTER_PORT=7077
+ENV PYTHONPATH=$SPARK_HOME/python
+ENV PYSPARK_PYTHON=python3
+ENV PYSPARK_DRIVER_PYTHON=jupyter
+
+# Copy the whole build context so the install scripts are available and,
+# when sedona_version is "latest", so are the locally built jar and the
+# python package that sedona.sh installs from ${SEDONA_HOME}.
+COPY ./ ${SEDONA_HOME}/
+
+RUN chmod +x ${SEDONA_HOME}/docker/spark.sh
+RUN chmod +x ${SEDONA_HOME}/docker/sedona.sh
+RUN ${SEDONA_HOME}/docker/spark.sh ${spark_version} ${hadoop_version} ${hadoop_s3_version} ${aws_sdk_version} ${spark_xml_version}
+RUN ${SEDONA_HOME}/docker/sedona.sh ${sedona_version} ${geotools_wrapper_version} ${spark_version}
+
+# Install Python dependencies
+COPY docker/sedona-spark-jupyterlab/requirements.txt /opt/requirements.txt
+RUN pip3 install -r /opt/requirements.txt
+
+COPY binder/*.ipynb /opt/workspace/examples/
+COPY binder/*.py /opt/workspace/examples/
+COPY binder/data /opt/workspace/examples/data
+
+# Add the master IP address to all notebooks
+RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i 's/config = SedonaContext.builder()/config = SedonaContext.builder().master(\\"spark:\/\/localhost:7077\\")/' {} +
+# Delete packages configured by the notebooks
+RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i '/spark\.jars\.packages/d' {} +
+RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i '/org\.apache\.sedona:sedona-spark-shaded-/d' {} +
+RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i '/org\.datasyslab:geotools-wrapper:/d' {} +
+
+# Remove the copied source tree now that everything has been installed.
+# NOTE(review): the earlier COPY layer presumably still carries these files,
+# so this likely does not shrink the final image - confirm.
+RUN rm -rf ${SEDONA_HOME}
+
+# 8888: JupyterLab, 8080: Spark master web UI, 8081: Spark worker web UI,
+# 4040: Spark job UI
+EXPOSE 8888
+EXPOSE 8080
+EXPOSE 8081
+EXPOSE 4040
+
+WORKDIR ${SHARED_WORKSPACE}
+
+# Start sshd (start-all.sh launches the worker over ssh), then the Spark
+# master and worker, then JupyterLab in the foreground.
+# NOTE(review): the empty --NotebookApp.token= looks like it disables token
+# authentication while listening on 0.0.0.0 - confirm this is intended.
+CMD service ssh start && ${SPARK_HOME}/sbin/start-all.sh && jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=
\ No newline at end of file
diff --git a/docker/sedona.sh b/docker/sedona.sh
new file mode 100644
index 00000000..58f63c5f
--- /dev/null
+++ b/docker/sedona.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Define variables
+sedona_version=$1
+geotools_wrapper_version=$2
+spark_version=$3
+
+lower_version=$(echo -e $spark_version"\n3.4" | sort -V | head -n1)
+if [ $lower_version = "3.4" ]; then
+ sedona_spark_version=3.4
+else
+ sedona_spark_version=3.0
+fi
+
+if [ $sedona_version = "latest" ]; then
+ # Code to execute when SEDONA_VERSION is "latest"
+ cp ${SEDONA_HOME}/spark-shaded/target/sedona-spark-shaded-*.jar
${SPARK_HOME}/jars/
+ cd ${SEDONA_HOME}/python;pip3 install shapely==1.8.4;pip3 install .
+else
+ # Code to execute when SEDONA_VERSION is not "latest"
+ # Download Sedona
+ curl
https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-${sedona_spark_version}_2.12/${sedona_version}/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
-o
$SPARK_HOME/jars/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
+ curl
https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/${geotools_wrapper_version}/geotools-wrapper-${geotools_wrapper_version}.jar
-o $SPARK_HOME/jars/geotools-wrapper-${geotools_wrapper_version}.jar
+
+ # Install Sedona Python
+ pip3 install shapely==1.8.4
+ pip3 install apache-sedona==${sedona_version}
+fi
\ No newline at end of file
diff --git a/docker/spark.sh b/docker/spark.sh
new file mode 100644
index 00000000..652205cb
--- /dev/null
+++ b/docker/spark.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Define variables
+spark_version=$1
+hadoop_version=$2
+hadoop_s3_version=$3
+aws_sdk_version=$4
+spark_xml_version=$5
+
+# Set up OS libraries
+apt-get update
+apt-get install -y openjdk-19-jdk-headless curl python3-pip maven
+pip3 install --upgrade pip && pip3 install pipenv
+
+# Download Spark jar and set up PySpark
+curl
https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz
-o spark.tgz
+tar -xf spark.tgz && mv spark-${spark_version}-bin-hadoop${hadoop_version}/*
${SPARK_HOME}/
+rm spark.tgz && rm -rf spark-${spark_version}-bin-hadoop${hadoop_version}
+pip3 install pyspark==${spark_version}
+
+# Add S3 jars
+curl
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_s3_version}/hadoop-aws-${hadoop_s3_version}.jar
-o ${SPARK_HOME}/jars/hadoop-aws-${hadoop_s3_version}.jar
+curl
https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_sdk_version}/aws-java-sdk-bundle-${aws_sdk_version}.jar
-o ${SPARK_HOME}/jars/aws-java-sdk-bundle-${aws_sdk_version}.jar
+
+# Add spark-xml jar
+curl
https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/${spark_xml_version}/spark-xml_2.12-${spark_xml_version}.jar
-o ${SPARK_HOME}/jars/spark-xml_2.12-${spark_xml_version}.jar
+
+# Set up master IP address and executor memory
+cp ${SPARK_HOME}/conf/spark-defaults.conf.template
${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.driver.memory 4g" >> ${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.executor.memory 4g" >> ${SPARK_HOME}/conf/spark-defaults.conf
+
+# Install required libraries for GeoPandas on Apple chip mac
+apt-get install -y gdal-bin libgdal-dev
+
+# Install OpenSSH for cluster mode
+apt-get install -y openssh-client openssh-server
+systemctl enable ssh
+
+# Enable nopassword ssh
+ssh-keygen -t rsa -f ~/.ssh/id_rsa -N ""
+cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+chmod 600 ~/.ssh/authorized_keys
diff --git a/docs/setup/docker.md b/docs/setup/docker.md
new file mode 100644
index 00000000..21fb87aa
--- /dev/null
+++ b/docs/setup/docker.md
@@ -0,0 +1,130 @@
+# Sedona JupyterLab Docker Image
+
+Dockerfiles for Apache Sedona with JupyterLab and 1 master node and 1 worker
node
+
+## How to use
+
+### Pull the image from DockerHub
+
+Format:
+
+```bash
+docker pull drjiayu/sedona-jupyterlab:<sedona_version>
+```
+
+Example 1: Pull the latest image of Sedona master branch
+
+```bash
+docker pull drjiayu/sedona-jupyterlab:latest
+```
+
+Example 2: Pull the image of a specific Sedona release
+
+```bash
+docker pull drjiayu/sedona-jupyterlab:1.4.1
+```
+
+### Start the container
+
+Format:
+
+```bash
+docker run -p 8888:8888 -p 8080:8080 -p 8081:8081 -p 4040:4040
drjiayu/sedona-jupyterlab:<sedona_version>
+```
+
+Example 1:
+
+```bash
+docker run -p 8888:8888 -p 8080:8080 -p 8081:8081 -p 4040:4040
drjiayu/sedona-jupyterlab:latest
+```
+
+Example 2:
+
+```bash
+docker run -p 8888:8888 -p 8080:8080 -p 8081:8081 -p 4040:4040
drjiayu/sedona-jupyterlab:1.4.1
+```
+
+This command will bind the container's ports 8888, 8080, 8081, 4040 to the
host's ports 8888, 8080, 8081, 4040 respectively.
+
+### Start coding
+
+Open your browser and go to [http://localhost:8888/](http://localhost:8888/)
to start coding with Sedona.
+
+### Notes
+
+* This container assumes you have at least 8GB RAM and takes all your CPU
cores and 8GB RAM.
+* Sedona in this container runs in the cluster mode. Only 1 notebook can be
run at a time. If you want to run another notebook, please shut down the kernel
of the current notebook first
([How?](https://jupyterlab.readthedocs.io/en/stable/user/running.html)).
+
+## How to build
+
+Clone the Sedona GitHub repository
+
+### Build the image against a Sedona release
+
+Requirements: docker ([How?](https://docs.docker.com/engine/install/))
+
+Format:
+
+```bash
+./docker/sedona-spark-jupyterlab/build.sh <spark_version> <sedona_version>
<build_mode>
+```
+
+Example:
+
+```bash
+./docker/sedona-spark-jupyterlab/build.sh 3.4.1 1.4.1
+```
+
+`build_mode` is optional. If its value is not given or is `local`, the script
will build the image locally. Otherwise, it will start a cross-platform
compilation and push images directly to DockerHub.
+
+### Build the image against the latest Sedona master
+
+Requirements: docker ([How?](https://docs.docker.com/engine/install/)), JDK <=
19, maven3
+
+Format:
+
+```bash
+./docker/sedona-spark-jupyterlab/build.sh <spark_version> latest <build_mode>
+```
+
+Example:
+
+```bash
+./docker/sedona-spark-jupyterlab/build.sh 3.4.1 latest
+```
+
+`build_mode` is optional. If its value is not given or is `local`, the script
will build the image locally. Otherwise, it will start a cross-platform
compilation and push images directly to DockerHub.
+
+### Notes
+
+This docker image can only be built against Sedona 1.4.1+ and Spark 3.0+
+
+## Cluster Configuration
+
+### Software
+* OS: Ubuntu 22.04
+* JDK: openjdk-19
+* Python: 3.10
+
+### Web UI
+* JupyterLab: http://localhost:8888/
+* Spark master URL: spark://localhost:7077
+* Spark job UI: http://localhost:4040
+* Spark master web UI: http://localhost:8080/
+* Spark worker web UI: http://localhost:8081/
+
+## How to push to DockerHub
+
+Format:
+
+```bash
+docker login
+./docker/sedona-spark-jupyterlab/build.sh <spark_version> <sedona_version>
release
+```
+
+Example:
+
+```bash
+docker login
+./docker/sedona-spark-jupyterlab/build.sh 3.4.1 1.4.1 release
+```
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index fb802329..44e04ef7 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -17,6 +17,7 @@ nav:
- Install Sedona Python: setup/install-python.md
- Install Sedona R: api/rdocs
- Install Sedona-Zeppelin: setup/zeppelin.md
+ - Play Sedona in Docker: setup/docker.md
- Install on Databricks: setup/databricks.md
- Install on AWS EMR: setup/emr.md
- Set up Spark cluster: setup/cluster.md