This is an automated email from the ASF dual-hosted git repository.
codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 31e13db1f0 [HUDI-4023] Decouple hudi-spark from
hudi-utilities-slim-bundle (#5641)
31e13db1f0 is described below
commit 31e13db1f0e12e107cc02c60dec3e52a8914a5b2
Author: Sagar Sumit <[email protected]>
AuthorDate: Thu May 26 11:28:49 2022 +0530
[HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle (#5641)
---
.../hudi/utilities/deltastreamer/DeltaSync.java | 2 -
packaging/hudi-utilities-slim-bundle/README.md | 89 ++++++++++++-
packaging/hudi-utilities-slim-bundle/pom.xml | 143 +++------------------
pom.xml | 7 +
4 files changed, 109 insertions(+), 132 deletions(-)
diff --git
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
index a4a7e10abc..0ae72f94b8 100644
---
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
+++
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
@@ -605,8 +605,6 @@ public class DeltaSync implements Serializable {
long totalErrorRecords =
writeStatusRDD.mapToDouble(WriteStatus::getTotalErrorRecords).sum().longValue();
long totalRecords =
writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
boolean hasErrors = totalErrorRecords > 0;
- long hiveSyncTimeMs = 0;
- long metaSyncTimeMs = 0;
if (!hasErrors || cfg.commitOnErrors) {
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
if (checkpointStr != null) {
diff --git a/packaging/hudi-utilities-slim-bundle/README.md
b/packaging/hudi-utilities-slim-bundle/README.md
index 58353c403d..60ee739153 100644
--- a/packaging/hudi-utilities-slim-bundle/README.md
+++ b/packaging/hudi-utilities-slim-bundle/README.md
@@ -17,6 +17,89 @@
# Usage of hudi-utilities-slim-bundle
-Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which
excludes hudi-spark-datasource modules.
-This new bundle is intended to be used with Hudi Spark bundle together, if
using hudi-utilities-bundle solely
-introduces problems for a specific Spark version.
\ No newline at end of file
+Starting from version 0.11, Hudi provides hudi-utilities-slim-bundle, which
excludes the hudi-spark-datasource modules. This new bundle is intended to be
used together with the Hudi Spark bundle, if using
+hudi-utilities-bundle alone introduces problems for a specific Spark version.
+
+## Example with Spark 2.4.7
+
+* Build Hudi: `mvn clean install -DskipTests`
+* Run deltastreamer
+
+```
+bin/spark-submit \
+ --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1
\
+ --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+ --conf spark.sql.catalogImplementation=hive \
+ --conf spark.driver.maxResultSize=1g \
+ --conf spark.ui.port=6679 \
+ --packages org.apache.spark:spark-avro_2.11:2.4.7 \
+ --jars
/path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar
\
+ --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls
/path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar`
\
+ --props `ls /path/to/hudi/dfs-source.properties` \
+ --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+ --schemaprovider-class
org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+ --source-ordering-field tpep_dropoff_datetime \
+ --table-type COPY_ON_WRITE \
+ --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark24/ \
+ --target-table ny_hudi_tbl \
+ --op UPSERT \
+ --continuous \
+ --source-limit 5000000 \
+ --min-sync-interval-seconds 60
+```
+
+## Example with Spark 3.1.2
+
+* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12`
+* Run deltastreamer
+
+```
+bin/spark-submit \
+ --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1
\
+ --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+ --conf spark.sql.catalogImplementation=hive \
+ --conf spark.driver.maxResultSize=1g \
+ --conf spark.ui.port=6679 \
+ --packages org.apache.spark:spark-avro_2.12:3.1.2 \
+ --jars
/path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar
\
+ --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls
/path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar`
\
+ --props `ls /path/to/hudi/dfs-source.properties` \
+ --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+ --schemaprovider-class
org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+ --source-ordering-field tpep_dropoff_datetime \
+ --table-type COPY_ON_WRITE \
+ --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark31/ \
+ --target-table ny_hudi_tbl \
+ --op UPSERT \
+ --continuous \
+ --source-limit 5000000 \
+ --min-sync-interval-seconds 60
+```
+
+## Example with Spark 3.2.0
+
+* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12`
+* Run deltastreamer
+
+```
+bin/spark-submit \
+ --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1
\
+ --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+ --conf spark.sql.catalogImplementation=hive \
+ --conf spark.driver.maxResultSize=1g \
+ --conf spark.ui.port=6679 \
+ --packages org.apache.spark:spark-avro_2.12:3.2.0 \
+ --jars
/path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar
\
+ --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls
/path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar`
\
+ --props `ls /path/to/hudi/dfs-source.properties` \
+ --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+ --schemaprovider-class
org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+ --source-ordering-field tpep_dropoff_datetime \
+ --table-type COPY_ON_WRITE \
+ --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark32/ \
+ --target-table ny_hudi_tbl \
+ --op UPSERT \
+ --continuous \
+ --source-limit 5000000 \
+ --min-sync-interval-seconds 60
+```
diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml
b/packaging/hudi-utilities-slim-bundle/pom.xml
index 60f0af9d64..993e2ad7fd 100644
--- a/packaging/hudi-utilities-slim-bundle/pom.xml
+++ b/packaging/hudi-utilities-slim-bundle/pom.xml
@@ -77,7 +77,7 @@
<transformer
implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
- <addHeader>true</addHeader>
+ <addHeader>true</addHeader>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
@@ -92,10 +92,7 @@
<includes>
<include>org.apache.hudi:hudi-common</include>
<include>org.apache.hudi:hudi-client-common</include>
- <include>org.apache.hudi:hudi-spark-client</include>
<include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
- <include>org.apache.hudi:hudi-hive-sync</include>
- <include>org.apache.hudi:hudi-sync-common</include>
<include>org.apache.hudi:hudi-hadoop-mr</include>
<include>org.apache.hudi:hudi-timeline-service</include>
<include>org.apache.hudi:hudi-aws</include>
@@ -136,13 +133,6 @@
<include>org.apache.kafka:kafka_${scala.binary.version}</include>
<include>com.101tec:zkclient</include>
<include>org.apache.kafka:kafka-clients</include>
-
- <include>org.apache.hive:hive-common</include>
- <include>org.apache.hive:hive-service</include>
- <include>org.apache.hive:hive-service-rpc</include>
- <include>org.apache.hive:hive-metastore</include>
- <include>org.apache.hive:hive-jdbc</include>
-
<include>org.apache.hbase:hbase-client</include>
<include>org.apache.hbase:hbase-common</include>
<include>org.apache.hbase:hbase-hadoop-compat</include>
@@ -178,10 +168,6 @@
<pattern>com.beust.jcommander.</pattern>
<shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
</relocation>
- <relocation>
- <pattern>org.apache.hive.jdbc.</pattern>
-
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc.</shadedPattern>
- </relocation>
<relocation>
<pattern>org.apache.commons.io.</pattern>
<shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
@@ -205,10 +191,6 @@
<pattern>org.apache.hadoop.hive.metastore.</pattern>
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore.</shadedPattern>
</relocation>
- <relocation>
- <pattern>org.apache.hive.common.</pattern>
-
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.common.</shadedPattern>
- </relocation>
<relocation>
<pattern>org.apache.hadoop.hive.common.</pattern>
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common.</shadedPattern>
@@ -217,10 +199,6 @@
<pattern>org.apache.hadoop.hive.conf.</pattern>
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf.</shadedPattern>
</relocation>
- <relocation>
- <pattern>org.apache.hive.service.</pattern>
-
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.service.</shadedPattern>
- </relocation>
<relocation>
<pattern>org.apache.hadoop.hive.service.</pattern>
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service.</shadedPattern>
@@ -344,116 +322,27 @@
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
- <artifactId>hudi-client-common</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>hudi-spark-client</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>hudi-hive-sync</artifactId>
+ <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>servlet-api</artifactId>
+ <groupId>org.apache.hudi</groupId>
+ <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.hudi</groupId>
+ <artifactId>hudi-spark_${scala.binary.version}</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.hudi</groupId>
+ <artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.hudi</groupId>
+ <artifactId>${hudi.spark.common.module}</artifactId>
</exclusion>
</exclusions>
</dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>hudi-spark_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>${hudi.spark.common.module}</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
- <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <!-- Hive -->
- <dependency>
- <groupId>${hive.groupid}</groupId>
- <artifactId>hive-service</artifactId>
- <version>${hive.version}</version>
- <scope>${utilities.bundle.hive.scope}</scope>
- </dependency>
-
- <dependency>
- <groupId>${hive.groupid}</groupId>
- <artifactId>hive-service-rpc</artifactId>
- <version>${hive.version}</version>
- <scope>${utilities.bundle.hive.scope}</scope>
- </dependency>
-
- <dependency>
- <groupId>${hive.groupid}</groupId>
- <artifactId>hive-jdbc</artifactId>
- <version>${hive.version}</version>
- <scope>${utilities.bundle.hive.scope}</scope>
- </dependency>
-
- <dependency>
- <groupId>${hive.groupid}</groupId>
- <artifactId>hive-metastore</artifactId>
- <version>${hive.version}</version>
- <scope>${utilities.bundle.hive.scope}</scope>
- </dependency>
-
- <dependency>
- <groupId>${hive.groupid}</groupId>
- <artifactId>hive-common</artifactId>
- <version>${hive.version}</version>
- <scope>${utilities.bundle.hive.scope}</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.htrace</groupId>
- <artifactId>htrace-core</artifactId>
- <version>${htrace.version}</version>
- <scope>compile</scope>
- </dependency>
-
- <!-- zookeeper -->
- <dependency>
- <groupId>org.apache.curator</groupId>
- <artifactId>curator-framework</artifactId>
- <version>${zk-curator.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.curator</groupId>
- <artifactId>curator-client</artifactId>
- <version>${zk-curator.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.curator</groupId>
- <artifactId>curator-recipes</artifactId>
- <version>${zk-curator.version}</version>
- </dependency>
</dependencies>
<profiles>
diff --git a/pom.xml b/pom.xml
index d898d34d35..1188ec620a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -99,6 +99,7 @@
<pulsar.version>2.8.1</pulsar.version>
<confluent.version>5.3.4</confluent.version>
<glassfish.version>2.17</glassfish.version>
+ <glassfish.el.version>3.0.1-b12</glassfish.el.version>
<parquet.version>1.10.1</parquet.version>
<junit.jupiter.version>5.7.0-M1</junit.jupiter.version>
<junit.vintage.version>5.7.0-M1</junit.vintage.version>
@@ -556,6 +557,12 @@
<artifactId>jersey-container-servlet-core</artifactId>
<version>${glassfish.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.glassfish</groupId>
+ <artifactId>javax.el</artifactId>
+ <version>${glassfish.el.version}</version>
+ <scope>provided</scope>
+ </dependency>
<!-- Avro -->
<dependency>