This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new aa1dff4d84 [HUDI-4982] Add Utilities and Utilities Slim + Spark Bundle
testing to GH Actions (#7005)
aa1dff4d84 is described below
commit aa1dff4d84aa51f01a500cda9d79d5a16d89c58f
Author: Jon Vexler <[email protected]>
AuthorDate: Tue Oct 25 22:45:12 2022 -0700
[HUDI-4982] Add Utilities and Utilities Slim + Spark Bundle testing to GH
Actions (#7005)
Co-authored-by: Raymond Xu <[email protected]>
---
.github/workflows/bot.yml | 2 +-
.../spark-defaults.conf => Dockerfile} | 15 ++-
.../Dockerfile => Dockerfile-base} | 11 +-
.../{spark-write-hive-sync => }/ci_run.sh | 39 ++++--
.../{spark-write-hive-sync => conf}/hive-site.xml | 0
.../hudi-defaults.conf} | 8 +-
.../spark-defaults.conf | 8 +-
.../spark-write-hive-sync/validate.sh | 30 -----
.../validate.scala | 0
.../utilities/hoodieapp.properties | 23 ++++
.../bundle-validation/utilities/validate.scala | 25 ++++
packaging/bundle-validation/validate.sh | 137 +++++++++++++++++++++
12 files changed, 240 insertions(+), 58 deletions(-)
diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml
index e2491b67db..0a61fa2544 100644
--- a/.github/workflows/bot.yml
+++ b/.github/workflows/bot.yml
@@ -72,7 +72,7 @@ jobs:
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4
as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q
-DforceStdout)
- ./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
$HUDI_VERSION
+ ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
diff --git
a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
b/packaging/bundle-validation/Dockerfile
similarity index 57%
copy from packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
copy to packaging/bundle-validation/Dockerfile
index 136d9d5ddc..3c5500940c 100644
--- a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
+++ b/packaging/bundle-validation/Dockerfile
@@ -15,6 +15,15 @@
# limitations under the License.
#
-spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
-spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
+# Tag of the base image to build on; ci_run.sh passes it via --build-arg IMAGE_TAG.
+ARG IMAGE_TAG=spark313hive313
+FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG
+
+# Configure the stack. WORKDIR and SPARK_HOME are exported by Dockerfile-base;
+# HIVE_HOME and DERBY_HOME are presumably exported there as well -- TODO confirm.
+# COPY rather than ADD: only plain local files are copied, so ADD's archive
+# extraction / URL handling is not wanted.
+COPY . .
+ENV HUDI_CONF_DIR=$WORKDIR/conf
+RUN cp conf/hive-site.xml $HIVE_HOME/conf/
+RUN cp conf/hive-site.xml $SPARK_HOME/conf/
+RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
+RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
+# Spark 3.2 and 3.3 additionally get HoodieCatalog registered as spark_catalog.
+# `case` is POSIX, so this works under the default /bin/sh of a shell-form RUN
+# without relying on bash-only [[ ... ]] pattern matching.
+RUN case "$SPARK_HOME" in \
+      *spark-3.2*|*spark-3.3*) \
+        printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> "$SPARK_HOME/conf/spark-defaults.conf" ;; \
+    esac
diff --git a/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
b/packaging/bundle-validation/Dockerfile-base
similarity index 88%
rename from packaging/bundle-validation/spark-write-hive-sync/Dockerfile
rename to packaging/bundle-validation/Dockerfile-base
index bc9656ef3f..1e782e08d5 100644
--- a/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
+++ b/packaging/bundle-validation/Dockerfile-base
@@ -18,8 +18,8 @@ FROM adoptopenjdk/openjdk8:alpine
RUN apk add --no-cache --upgrade bash
-RUN mkdir /opt/hudi-bundles
-ENV WORKDIR=/opt/hudi-bundles
+RUN mkdir /opt/bundle-validation
+ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR
ARG HADOOP_VERSION=2.7.7
@@ -47,10 +47,3 @@ RUN wget
https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK
&& tar -xf
$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
-
-RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
-COPY hive-site.xml $HIVE_HOME/conf/
-RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
-COPY spark-defaults.conf $SPARK_HOME/conf/
-COPY validate.scala .
-COPY validate.sh .
diff --git a/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
b/packaging/bundle-validation/ci_run.sh
similarity index 58%
rename from packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
rename to packaging/bundle-validation/ci_run.sh
index a1e3832105..c2582d4452 100755
--- a/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
+++ b/packaging/bundle-validation/ci_run.sh
@@ -18,15 +18,16 @@
# under the License.
# Note:
-# this script is to run by GitHub Actions CI tasks from the project root
directory
-# and contains environment-specific variables
+#
+# This script
+# - sets the corresponding variables based on the CI job's build profiles
+# - prepares Hudi bundle jars for mounting into the Docker container for validation
+# - prepares test datasets for mounting into the Docker container for validation
+#
+# It is meant to be run by GitHub Actions CI tasks from the project root directory,
+# and it contains CI environment-specific variables.
HUDI_VERSION=$1
-# to store bundle jars for validation
-mkdir ${GITHUB_WORKSPACE}/jars
-cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar
${GITHUB_WORKSPACE}/jars
-echo 'Validating jars below:'
-ls -l ${GITHUB_WORKSPACE}/jars
# choose versions based on build profiles
if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
@@ -59,13 +60,33 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
IMAGE_TAG=spark330hive313
fi
-cd packaging/bundle-validation/spark-write-hive-sync || exit 1
+# Stage the bundle jars in a temp dir that is later bind-mounted into the container.
+# The directory name is the current epoch second.
+TMP_JARS_DIR=/tmp/jars/$(date +%s)
+mkdir -p "$TMP_JARS_DIR"
+# The jar-name glob must stay outside the quotes so the shell still expands it.
+for bundle in hudi-spark-bundle hudi-utilities-bundle hudi-utilities-slim-bundle; do
+  cp "${GITHUB_WORKSPACE}/packaging/${bundle}/target/"hudi-*-"${HUDI_VERSION}".jar "$TMP_JARS_DIR/"
+done
+echo 'Validating jars below:'
+ls -l "$TMP_JARS_DIR"
+
+# Stage the test dataset: stocks/schema.avsc plus stocks/data/<json files>.
+TMP_DATA_DIR=/tmp/data/$(date +%s)
+mkdir -p "$TMP_DATA_DIR/stocks/data"
+cp "${GITHUB_WORKSPACE}"/docker/demo/data/*.json "$TMP_DATA_DIR/stocks/data/"
+cp "${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc" "$TMP_DATA_DIR/stocks/"
+
+# build docker image
+cd "${GITHUB_WORKSPACE}/packaging/bundle-validation" || exit 1
docker build \
--build-arg HADOOP_VERSION=$HADOOP_VERSION \
--build-arg HIVE_VERSION=$HIVE_VERSION \
--build-arg DERBY_VERSION=$DERBY_VERSION \
--build-arg SPARK_VERSION=$SPARK_VERSION \
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
+--build-arg IMAGE_TAG=$IMAGE_TAG \
-t hudi-ci-bundle-validation:$IMAGE_TAG \
.
-docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i
hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
+
+# Run the validation script inside the container. The staged jars and test data
+# are bind-mounted under /opt/bundle-validation, where validate.sh reads them.
+docker run \
+  -v "${TMP_JARS_DIR}:/opt/bundle-validation/jars" \
+  -v "${TMP_DATA_DIR}:/opt/bundle-validation/data" \
+  -i "hudi-ci-bundle-validation:${IMAGE_TAG}" bash validate.sh
diff --git a/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
b/packaging/bundle-validation/conf/hive-site.xml
similarity index 100%
rename from packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
rename to packaging/bundle-validation/conf/hive-site.xml
diff --git
a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
b/packaging/bundle-validation/conf/hudi-defaults.conf
similarity index 78%
copy from packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
copy to packaging/bundle-validation/conf/hudi-defaults.conf
index 136d9d5ddc..b83aab20b7 100644
--- a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
+++ b/packaging/bundle-validation/conf/hudi-defaults.conf
@@ -15,6 +15,8 @@
# limitations under the License.
#
-spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
-spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
+hoodie.upsert.shuffle.parallelism 8
+hoodie.insert.shuffle.parallelism 8
+hoodie.delete.shuffle.parallelism 8
+hoodie.bulkinsert.shuffle.parallelism 8
+hoodie.finalize.write.parallelism 8
diff --git
a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
b/packaging/bundle-validation/conf/spark-defaults.conf
similarity index 71%
rename from
packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
rename to packaging/bundle-validation/conf/spark-defaults.conf
index 136d9d5ddc..07575134c7 100644
--- a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
+++ b/packaging/bundle-validation/conf/spark-defaults.conf
@@ -15,6 +15,8 @@
# limitations under the License.
#
-spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
-spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
+spark.default.parallelism 8
+spark.sql.shuffle.partitions 8
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.sh
b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
deleted file mode 100755
index d8526a4815..0000000000
--- a/packaging/bundle-validation/spark-write-hive-sync/validate.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# NOTE: this script runs inside hudi-ci-bundle-validation container
-# $WORKDIR/jars/ is supposed to be mounted to a host directory where bundle
jars are placed
-# TODO: $JAR_COMBINATIONS should have different orders for different jars to
detect class loading issues
-
-$DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
-$HIVE_HOME/bin/hiveserver2 &
-WORKDIR=/opt/hudi-bundles
-JAR_COMBINATIONS=$(echo $WORKDIR/jars/*.jar | tr ' ' ',')
-$SPARK_HOME/bin/spark-shell --jars $JAR_COMBINATIONS < $WORKDIR/validate.scala
-
-exit $?
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.scala
b/packaging/bundle-validation/spark/validate.scala
similarity index 100%
rename from packaging/bundle-validation/spark-write-hive-sync/validate.scala
rename to packaging/bundle-validation/spark/validate.scala
diff --git a/packaging/bundle-validation/utilities/hoodieapp.properties
b/packaging/bundle-validation/utilities/hoodieapp.properties
new file mode 100644
index 0000000000..6d2382fc89
--- /dev/null
+++ b/packaging/bundle-validation/utilities/hoodieapp.properties
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+hoodie.datasource.write.recordkey.field=key
+hoodie.datasource.write.partitionpath.field=date
+hoodie.datasource.write.precombine.field=ts
+hoodie.metadata.enable=true
+hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/data/stocks/data
+hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
+hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
diff --git a/packaging/bundle-validation/utilities/validate.scala
b/packaging/bundle-validation/utilities/validate.scala
new file mode 100644
index 0000000000..027364205f
--- /dev/null
+++ b/packaging/bundle-validation/utilities/validate.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Compare the number of distinct (date, key) pairs in the Hudi table against the
+// number in the source JSON files. Exit 0 when they are equal, otherwise print
+// both counts and exit 1.
+val tableDf = spark.read.format("hudi").load("/tmp/hudi-utilities-test/")
+val sourceDf = spark.read.format("json").load("/opt/bundle-validation/data/stocks/data")
+val hudiCount = tableDf.select("date", "key").distinct.count
+val srcCount = sourceDf.select("date", "key").distinct.count
+if (hudiCount != srcCount) {
+  println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
+  System.exit(1)
+}
+System.exit(0)
diff --git a/packaging/bundle-validation/validate.sh
b/packaging/bundle-validation/validate.sh
new file mode 100755
index 0000000000..ee5255cf0d
--- /dev/null
+++ b/packaging/bundle-validation/validate.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#################################################################################################
+# NOTE: this script runs inside the hudi-ci-bundle-validation container.
+# $WORKDIR/jars/ is the mount point for a host directory holding the bundle jars.
+# $WORKDIR/data/ is the mount point for a host directory holding the test data, laid out as
+#   - <dataset name>/schema.avsc
+#   - <dataset name>/data/<data files>
+#################################################################################################
+
+WORKDIR=/opt/bundle-validation
+JARS_DIR="$WORKDIR/jars"
+# Give each bundle jar a short, fixed alias (symlink) so the rest of the script
+# does not need to know the full jar file name. The globs stay unquoted so the
+# shell expands them.
+ln -sf "$JARS_DIR"/hudi-spark*.jar "$JARS_DIR/spark.jar"
+ln -sf "$JARS_DIR"/hudi-utilities-bundle*.jar "$JARS_DIR/utilities.jar"
+ln -sf "$JARS_DIR"/hudi-utilities-slim*.jar "$JARS_DIR/utilities-slim.jar"
+
+
+##
+# Validates the spark bundle together with hive sync.
+#
+# Launches the Derby network server and HiveServer2 in the background, then
+# feeds spark/validate.scala on stdin to a spark-shell started with the spark
+# bundle on --jars. Terminates the whole script with status 1 when spark-shell
+# exits non-zero.
+#
+# Variables read:
+#   HIVE_HOME:  path to the hive directory  (env var defined in container)
+#   DERBY_HOME: path to the derby directory (env var defined in container)
+#   SPARK_HOME: path to the spark directory (env var defined in container)
+#   JARS_DIR, WORKDIR: set at the top of this script
+##
+test_spark_bundle () {
+    echo "::warning::validate.sh setting up hive metastore for spark bundle validation"
+
+    # Both services are backgrounded; nothing here waits for them to come up.
+    $DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
+    $HIVE_HOME/bin/hiveserver2 &
+    echo "::warning::validate.sh hive metastore setup complete. Testing"
+    if ! $SPARK_HOME/bin/spark-shell --jars $JARS_DIR/spark.jar < $WORKDIR/spark/validate.scala; then
+        echo "::error::validate.sh failed hive testing"
+        exit 1
+    fi
+    echo "::warning::validate.sh spark bundle validation successful"
+}
+
+
+##
+# Function to test the utilities bundle and the utilities slim bundle + spark bundle.
+# It runs deltastreamer and then verifies that deltastreamer worked correctly:
+#   1. spark-submit HoodieDeltaStreamer (UPSERT, MERGE_ON_READ) into /tmp/hudi-utilities-test/
+#   2. check that `du -s` reports at least 580 for the output folder
+#   3. run utilities/validate.scala in spark-shell, logging to $WORKDIR/test_utilities_bundle.log
+# Any failing step terminates the whole script with exit status 1.
+#
+# 1st arg: main jar to run with spark-submit, usually it's the utilities(-slim) bundle
+# 2nd arg and beyond: any additional jars to pass to --jars option
+#
+# env vars (defined in container):
+#   SPARK_HOME: path to the spark directory
+##
+test_utilities_bundle () {
+    MAIN_JAR=$1
+    # Join args 2..N with commas; with no extra jars the result is an empty string.
+    printf -v EXTRA_JARS '%s,' "${@:2}"
+    EXTRA_JARS="${EXTRA_JARS%,}"
+    OPT_JARS=""
+    if [[ -n $EXTRA_JARS ]]; then
+        OPT_JARS="--jars $EXTRA_JARS"
+    fi
+    OUTPUT_DIR=/tmp/hudi-utilities-test/
+    # -f: the directory may not exist yet (e.g. first call in a fresh container);
+    # that is not an error worth reporting.
+    rm -rf $OUTPUT_DIR
+    echo "::warning::validate.sh running deltastreamer"
+    # $OPT_JARS is left unquoted on purpose: it must split into "--jars <list>"
+    # or disappear entirely when empty.
+    $SPARK_HOME/bin/spark-submit \
+        --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
+        $OPT_JARS $MAIN_JAR \
+        --props $WORKDIR/utilities/hoodieapp.properties \
+        --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+        --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
+        --source-ordering-field ts --table-type MERGE_ON_READ \
+        --target-base-path ${OUTPUT_DIR} \
+        --target-table utilities_tbl --op UPSERT
+    # Save the status immediately: $? is overwritten by the next command, including `[`,
+    # so reading it inside the `then` branch would always report 0.
+    local deltastreamer_status=$?
+    if [ "$deltastreamer_status" -ne 0 ]; then
+        echo "::error::validate.sh deltastreamer failed with exit code $deltastreamer_status"
+        exit 1
+    fi
+    echo "::warning::validate.sh done with deltastreamer"
+
+    # Sanity check on the size of what deltastreamer wrote.
+    OUTPUT_SIZE=$(du -s ${OUTPUT_DIR} | awk '{print $1}')
+    if [[ -z $OUTPUT_SIZE || "$OUTPUT_SIZE" -lt "580" ]]; then
+        echo "::error::validate.sh deltastreamer output folder ($OUTPUT_SIZE) is smaller than minimum expected (580)"
+        exit 1
+    fi
+
+    echo "::warning::validate.sh validating deltastreamer in spark shell"
+    SHELL_COMMAND="$SPARK_HOME/bin/spark-shell $OPT_JARS $MAIN_JAR -i $WORKDIR/utilities/validate.scala"
+    echo "::debug::this is the shell command: $SHELL_COMMAND"
+    LOGFILE="$WORKDIR/${FUNCNAME[0]}.log"
+    $SHELL_COMMAND >> $LOGFILE
+    if [ "$?" -ne 0 ]; then
+        # Surface the mismatch line that validate.scala prints before exiting 1.
+        SHELL_RESULT=$(grep "Counts don't match" $LOGFILE)
+        echo "::error::validate.sh $SHELL_RESULT"
+        exit 1
+    fi
+    echo "::warning::validate.sh done validating deltastreamer in spark shell"
+}
+
+
+# Driver: spark bundle first, then the utilities bundle (Spark 2.4 / 3.1 images
+# only), then the utilities slim bundle combined with the spark bundle.
+# The test functions already `exit 1` on failure; the `|| exit 1` guards also
+# stop the script on any other non-zero return.
+test_spark_bundle || exit 1
+
+case "$SPARK_HOME" in
+    *spark-2.4*|*spark-3.1*)
+        echo "::warning::validate.sh testing utilities bundle"
+        test_utilities_bundle $JARS_DIR/utilities.jar || exit 1
+        echo "::warning::validate.sh done testing utilities bundle"
+        ;;
+    *)
+        echo "::warning::validate.sh skip testing utilities bundle for non-spark2.4 & non-spark3.1 build"
+        ;;
+esac
+
+echo "::warning::validate.sh testing utilities slim bundle"
+test_utilities_bundle $JARS_DIR/utilities-slim.jar $JARS_DIR/spark.jar || exit 1
+echo "::warning::validate.sh done testing utilities slim bundle"