This is an automated email from the ASF dual-hosted git repository.

xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 3c8988c08e [HUDI-4982] Add validation job for spark bundles in GitHub Actions (#6954)
3c8988c08e is described below

commit 3c8988c08eb9b9c511509fbb227e1fc9c48eeaaf
Author: Shiyan Xu <[email protected]>
AuthorDate: Wed Oct 19 21:24:41 2022 +0800

    [HUDI-4982] Add validation job for spark bundles in GitHub Actions (#6954)
---
 .github/workflows/bot.yml                          |  9 +++
 .../spark-write-hive-sync/Dockerfile               | 56 +++++++++++++++++
 .../spark-write-hive-sync/ci_run.sh                | 71 ++++++++++++++++++++++
 .../spark-write-hive-sync/hive-site.xml            | 53 ++++++++++++++++
 .../spark-write-hive-sync/spark-defaults.conf      | 20 ++++++
 .../spark-write-hive-sync/validate.scala           | 57 +++++++++++++++++
 .../spark-write-hive-sync/validate.sh              | 30 +++++++++
 7 files changed, 296 insertions(+)

diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml
index 6ad5a83514..e2491b67db 100644
--- a/.github/workflows/bot.yml
+++ b/.github/workflows/bot.yml
@@ -64,6 +64,15 @@ jobs:
           FLINK_PROFILE: ${{ matrix.flinkProfile }}
         run:
          mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark $MVN_ARGS
+      - name: Bundle Validation
+        env:
+          SCALA_PROFILE: ${{ matrix.scalaProfile }}
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          FLINK_PROFILE: ${{ matrix.flinkProfile }}
+        if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
+        run: |
+          HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
+          ./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
       - name: Spark SQL Test
         env:
           SCALA_PROFILE: ${{ matrix.scalaProfile }}
diff --git a/packaging/bundle-validation/spark-write-hive-sync/Dockerfile b/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
new file mode 100644
index 0000000000..bc9656ef3f
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+FROM adoptopenjdk/openjdk8:alpine
+
+RUN apk add --no-cache --upgrade bash
+
+RUN mkdir /opt/hudi-bundles
+ENV WORKDIR=/opt/hudi-bundles
+WORKDIR $WORKDIR
+
+ARG HADOOP_VERSION=2.7.7
+ARG HIVE_VERSION=3.1.3
+ARG DERBY_VERSION=10.14.1.0
+ARG SPARK_VERSION=3.1.3
+ARG SPARK_HADOOP_VERSION=2.7
+
+RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
+    && rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
+ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION
+
+RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
+    && rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
+ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin
+
+RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
+    && rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
+ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin
+
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
+    && rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
+ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
+
+RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
+COPY hive-site.xml $HIVE_HOME/conf/
+RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
+COPY spark-defaults.conf $SPARK_HOME/conf/
+COPY validate.scala .
+COPY validate.sh .
diff --git a/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh b/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
new file mode 100755
index 0000000000..a1e3832105
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note:
+# this script is to be run by GitHub Actions CI tasks from the project root directory
+# and contains environment-specific variables
+
+HUDI_VERSION=$1
+# to store bundle jars for validation
+mkdir ${GITHUB_WORKSPACE}/jars
+cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar ${GITHUB_WORKSPACE}/jars
+echo 'Validating jars below:'
+ls -l ${GITHUB_WORKSPACE}/jars
+
+# choose versions based on build profiles
+if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=2.3.9
+  DERBY_VERSION=10.10.2.0
+  SPARK_VERSION=2.4.8
+  SPARK_HADOOP_VERSION=2.7
+  IMAGE_TAG=spark248hive239
+elif [[ ${SPARK_PROFILE} == 'spark3.1' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=3.1.3
+  DERBY_VERSION=10.14.1.0
+  SPARK_VERSION=3.1.3
+  SPARK_HADOOP_VERSION=2.7
+  IMAGE_TAG=spark313hive313
+elif [[ ${SPARK_PROFILE} == 'spark3.2' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=3.1.3
+  DERBY_VERSION=10.14.1.0
+  SPARK_VERSION=3.2.2
+  SPARK_HADOOP_VERSION=2.7
+  IMAGE_TAG=spark322hive313
+elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=3.1.3
+  DERBY_VERSION=10.14.1.0
+  SPARK_VERSION=3.3.0
+  SPARK_HADOOP_VERSION=2
+  IMAGE_TAG=spark330hive313
+fi
+
+cd packaging/bundle-validation/spark-write-hive-sync || exit 1
+docker build \
+--build-arg HADOOP_VERSION=$HADOOP_VERSION \
+--build-arg HIVE_VERSION=$HIVE_VERSION \
+--build-arg DERBY_VERSION=$DERBY_VERSION \
+--build-arg SPARK_VERSION=$SPARK_VERSION \
+--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
+-t hudi-ci-bundle-validation:$IMAGE_TAG \
+.
+docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
diff --git a/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml b/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
new file mode 100644
index 0000000000..810cd695f2
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<configuration>
+  <property>
+    <name>system:user.name</name>
+    <value>${user.name}</value>
+  </property>
+  <property>
+    <name>system:java.io.tmpdir</name>
+    <value>file:///tmp/hudi-bundles/hive/java</value>
+  </property>
+  <property>
+    <name>hive.exec.scratchdir</name>
+    <value>file:///tmp/hudi-bundles/hive/exec</value>
+  </property>
+  <property>
+    <name>hive.metastore.warehouse.dir</name>
+    <value>file:///tmp/hudi-bundles/hive/warehouse</value>
+  </property>
+  <property>
+    <name>hive.metastore.schema.verification</name>
+    <value>false</value>
+  </property>
+  <!-- TODO: use autoCreateAll = false for hive 2.x -->
+  <property>
+    <name>datanucleus.schema.autoCreateAll</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>org.apache.derby.jdbc.ClientDriver</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:derby://localhost:1527/default;create=true</value>
+  </property>
+</configuration>
diff --git a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf b/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
new file mode 100644
index 0000000000..136d9d5ddc
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.scala b/packaging/bundle-validation/spark-write-hive-sync/validate.scala
new file mode 100644
index 0000000000..01faa38509
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/validate.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hudi.QuickstartUtils._
+import scala.collection.JavaConversions._
+import org.apache.spark.sql.SaveMode._
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+import org.apache.hudi.common.model.HoodieRecord
+
+val expected = 10
+val database = "default"
+val tableName = "trips"
+val basePath = "file:///tmp/hudi-bundles/tests/" + tableName
+val dataGen = new DataGenerator
+val inserts = convertToStringList(dataGen.generateInserts(expected))
+val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  option("hoodie.datasource.meta.sync.enable", "true").
+  option("hoodie.datasource.hive_sync.database", database).
+  option("hoodie.datasource.hive_sync.table", tableName).
+  option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.SinglePartPartitionValueExtractor").
+  option("hoodie.datasource.hive_sync.mode", "hms").
+  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083/").
+  mode(Overwrite).
+  save(basePath)
+
+spark.sql("desc " + tableName).show
+val actual = spark.sql("select * from " + tableName).count
+if (expected == actual) {
+  System.out.println(s"bundle combination passed sanity run.")
+  System.exit(0)
+} else {
+  System.err.println(s"bundle combination failed sanity run:\n\tshould have written $expected records in $database.$tableName")
+  System.exit(1)
+}
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.sh b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
new file mode 100755
index 0000000000..d8526a4815
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: this script runs inside hudi-ci-bundle-validation container
+# $WORKDIR/jars/ is supposed to be mounted to a host directory where bundle jars are placed
+# TODO: $JAR_COMBINATIONS should have different orders for different jars to detect class loading issues
+
+$DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
+$HIVE_HOME/bin/hiveserver2 &
+WORKDIR=/opt/hudi-bundles
+JAR_COMBINATIONS=$(echo $WORKDIR/jars/*.jar | tr ' ' ',')
+$SPARK_HOME/bin/spark-shell --jars $JAR_COMBINATIONS < $WORKDIR/validate.scala
+
+exit $?

Reply via email to