This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 3c8988c08e [HUDI-4982] Add validation job for spark bundles in GitHub
Actions (#6954)
3c8988c08e is described below
commit 3c8988c08eb9b9c511509fbb227e1fc9c48eeaaf
Author: Shiyan Xu <[email protected]>
AuthorDate: Wed Oct 19 21:24:41 2022 +0800
[HUDI-4982] Add validation job for spark bundles in GitHub Actions (#6954)
---
.github/workflows/bot.yml | 9 +++
.../spark-write-hive-sync/Dockerfile | 56 +++++++++++++++++
.../spark-write-hive-sync/ci_run.sh | 71 ++++++++++++++++++++++
.../spark-write-hive-sync/hive-site.xml | 53 ++++++++++++++++
.../spark-write-hive-sync/spark-defaults.conf | 20 ++++++
.../spark-write-hive-sync/validate.scala | 57 +++++++++++++++++
.../spark-write-hive-sync/validate.sh | 30 +++++++++
7 files changed, 296 insertions(+)
diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml
index 6ad5a83514..e2491b67db 100644
--- a/.github/workflows/bot.yml
+++ b/.github/workflows/bot.yml
@@ -64,6 +64,15 @@ jobs:
FLINK_PROFILE: ${{ matrix.flinkProfile }}
run:
mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE"
-D"$FLINK_PROFILE" -DfailIfNoTests=false -pl
hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark
$MVN_ARGS
+ - name: Bundle Validation
+ env:
+ SCALA_PROFILE: ${{ matrix.scalaProfile }}
+ SPARK_PROFILE: ${{ matrix.sparkProfile }}
+ FLINK_PROFILE: ${{ matrix.flinkProfile }}
+ if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4
as it's covered by Azure CI
+ run: |
+ HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q
-DforceStdout)
+ ./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
$HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
diff --git a/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
b/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
new file mode 100644
index 0000000000..bc9656ef3f
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+FROM adoptopenjdk/openjdk8:alpine
+
+RUN apk add --no-cache --upgrade bash
+
+RUN mkdir /opt/hudi-bundles
+ENV WORKDIR=/opt/hudi-bundles
+WORKDIR $WORKDIR
+
+ARG HADOOP_VERSION=2.7.7
+ARG HIVE_VERSION=3.1.3
+ARG DERBY_VERSION=10.14.1.0
+ARG SPARK_VERSION=3.1.3
+ARG SPARK_HADOOP_VERSION=2.7
+
+RUN wget
https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
-P "$WORKDIR" \
+ && tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
+ && rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
+ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION
+
+RUN wget
https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz
-P "$WORKDIR" \
+ && tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
+ && rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
+ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin
+
+RUN wget
https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz
-P "$WORKDIR" \
+ && tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
+ && rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
+ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin
+
+RUN wget
https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
-P "$WORKDIR" \
+ && tar -xf
$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
+ && rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
+ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
+
+RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
+COPY hive-site.xml $HIVE_HOME/conf/
+RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
+COPY spark-defaults.conf $SPARK_HOME/conf/
+COPY validate.scala .
+COPY validate.sh .
diff --git a/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
b/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
new file mode 100755
index 0000000000..a1e3832105
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note:
+# this script is to run by GitHub Actions CI tasks from the project root
directory
+# and contains environment-specific variables
+
+HUDI_VERSION=$1
+# to store bundle jars for validation
+mkdir ${GITHUB_WORKSPACE}/jars
+cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar
${GITHUB_WORKSPACE}/jars
+echo 'Validating jars below:'
+ls -l ${GITHUB_WORKSPACE}/jars
+
+# choose versions based on build profiles
+if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
+ HADOOP_VERSION=2.7.7
+ HIVE_VERSION=2.3.9
+ DERBY_VERSION=10.10.2.0
+ SPARK_VERSION=2.4.8
+ SPARK_HADOOP_VERSION=2.7
+ IMAGE_TAG=spark248hive239
+elif [[ ${SPARK_PROFILE} == 'spark3.1' ]]; then
+ HADOOP_VERSION=2.7.7
+ HIVE_VERSION=3.1.3
+ DERBY_VERSION=10.14.1.0
+ SPARK_VERSION=3.1.3
+ SPARK_HADOOP_VERSION=2.7
+ IMAGE_TAG=spark313hive313
+elif [[ ${SPARK_PROFILE} == 'spark3.2' ]]; then
+ HADOOP_VERSION=2.7.7
+ HIVE_VERSION=3.1.3
+ DERBY_VERSION=10.14.1.0
+ SPARK_VERSION=3.2.2
+ SPARK_HADOOP_VERSION=2.7
+ IMAGE_TAG=spark322hive313
+elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
+ HADOOP_VERSION=2.7.7
+ HIVE_VERSION=3.1.3
+ DERBY_VERSION=10.14.1.0
+ SPARK_VERSION=3.3.0
+ SPARK_HADOOP_VERSION=2
+ IMAGE_TAG=spark330hive313
+fi
+
+cd packaging/bundle-validation/spark-write-hive-sync || exit 1
+docker build \
+--build-arg HADOOP_VERSION=$HADOOP_VERSION \
+--build-arg HIVE_VERSION=$HIVE_VERSION \
+--build-arg DERBY_VERSION=$DERBY_VERSION \
+--build-arg SPARK_VERSION=$SPARK_VERSION \
+--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
+-t hudi-ci-bundle-validation:$IMAGE_TAG \
+.
+docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i
hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
diff --git a/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
b/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
new file mode 100644
index 0000000000..810cd695f2
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<configuration>
+ <property>
+ <name>system:user.name</name>
+ <value>${user.name}</value>
+ </property>
+ <property>
+ <name>system:java.io.tmpdir</name>
+ <value>file:///tmp/hudi-bundles/hive/java</value>
+ </property>
+ <property>
+ <name>hive.exec.scratchdir</name>
+ <value>file:///tmp/hudi-bundles/hive/exec</value>
+ </property>
+ <property>
+ <name>hive.metastore.warehouse.dir</name>
+ <value>file:///tmp/hudi-bundles/hive/warehouse</value>
+ </property>
+ <property>
+ <name>hive.metastore.schema.verification</name>
+ <value>false</value>
+ </property>
+ <!-- TODO: use autoCreateAll = false for hive 2.x -->
+ <property>
+ <name>datanucleus.schema.autoCreateAll</name>
+ <value>true</value>
+ </property>
+ <property>
+ <name>javax.jdo.option.ConnectionDriverName</name>
+ <value>org.apache.derby.jdbc.ClientDriver</value>
+ </property>
+ <property>
+ <name>javax.jdo.option.ConnectionURL</name>
+ <value>jdbc:derby://localhost:1527/default;create=true</value>
+ </property>
+</configuration>
diff --git
a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
b/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
new file mode 100644
index 0000000000..136d9d5ddc
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.scala
b/packaging/bundle-validation/spark-write-hive-sync/validate.scala
new file mode 100644
index 0000000000..01faa38509
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/validate.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hudi.QuickstartUtils._
+import scala.collection.JavaConversions._
+import org.apache.spark.sql.SaveMode._
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+import org.apache.hudi.common.model.HoodieRecord
+
+val expected = 10
+val database = "default"
+val tableName = "trips"
+val basePath = "file:///tmp/hudi-bundles/tests/" + tableName
+val dataGen = new DataGenerator
+val inserts = convertToStringList(dataGen.generateInserts(expected))
+val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+df.write.format("hudi").
+ options(getQuickstartWriteConfigs).
+ option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+ option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+ option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+ option(TABLE_NAME, tableName).
+ option("hoodie.datasource.meta.sync.enable", "true").
+ option("hoodie.datasource.hive_sync.database", database).
+ option("hoodie.datasource.hive_sync.table", tableName).
+ option("hoodie.datasource.hive_sync.partition_extractor_class",
"org.apache.hudi.hive.SinglePartPartitionValueExtractor").
+ option("hoodie.datasource.hive_sync.mode", "hms").
+ option("hoodie.datasource.hive_sync.metastore.uris",
"thrift://localhost:9083/").
+ mode(Overwrite).
+ save(basePath)
+
+spark.sql("desc " + tableName).show
+val actual = spark.sql("select * from " + tableName).count
+if (expected == actual) {
+ System.out.println("bundle combination passed sanity run.")
+ System.exit(0)
+} else {
+ System.err.println(s"bundle combination failed sanity run:\n\tshould have
written $expected records in $database.$tableName")
+ System.exit(1)
+}
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.sh
b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
new file mode 100755
index 0000000000..d8526a4815
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: this script runs inside hudi-ci-bundle-validation container
+# $WORKDIR/jars/ is supposed to be mounted to a host directory where bundle
jars are placed
+# TODO: $JAR_COMBINATIONS should have different orders for different jars to
detect class loading issues
+
+$DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
+$HIVE_HOME/bin/hiveserver2 &
+WORKDIR=/opt/hudi-bundles
+JAR_COMBINATIONS=$(echo $WORKDIR/jars/*.jar | tr ' ' ',')
+$SPARK_HOME/bin/spark-shell --jars $JAR_COMBINATIONS < $WORKDIR/validate.scala
+
+exit $?