This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 6748111d6ad [HUDI-5616][DOCS] Update spark quickstart with notes on
HoodieKryoRegistrar (#11184)
6748111d6ad is described below
commit 6748111d6ad85017fca15d2df74981944857460b
Author: Shiyan Xu <[email protected]>
AuthorDate: Thu May 9 16:39:12 2024 -0500
[HUDI-5616][DOCS] Update spark quickstart with notes on HoodieKryoRegistrar
(#11184)
---
website/docs/quick-start-guide.md | 60 +++++++++++++++++++++++++++++----------
1 file changed, 45 insertions(+), 15 deletions(-)
diff --git a/website/docs/quick-start-guide.md
b/website/docs/quick-start-guide.md
index 1359c5c2957..35e57dd9e01 100644
--- a/website/docs/quick-start-guide.md
+++ b/website/docs/quick-start-guide.md
@@ -53,17 +53,27 @@ From the extracted directory run spark-shell with Hudi:
```shell
# For Spark versions: 3.2 - 3.4
-export SPARK_VERSION=3.4
-spark-shell --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.4 # or 3.3 or 3.2
+spark-shell --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
\
+--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
```shell
# For Spark versions: 3.0 - 3.1
-export SPARK_VERSION=3.1
-spark-shell --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.1 # or 3.0
+spark-shell --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
```shell
# For Spark version: 2.4
-spark-shell --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+spark-shell --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
</TabItem>
@@ -74,13 +84,13 @@ From the extracted directory run pyspark with Hudi:
```shell
# For Spark versions: 3.2 - 3.4
export PYSPARK_PYTHON=$(which python3)
-export SPARK_VERSION=3.4
+export SPARK_VERSION=3.4 # or 3.3 or 3.2
pyspark --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
```shell
# For Spark versions: 3.0 - 3.1
export PYSPARK_PYTHON=$(which python3)
-export SPARK_VERSION=3.1
+export SPARK_VERSION=3.1 # or 3.0
pyspark --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
```shell
@@ -97,26 +107,46 @@ From the extracted directory run Spark SQL with Hudi:
```shell
# For Spark versions: 3.2 - 3.4
-export SPARK_VERSION=3.4
-spark-sql --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.4 # or 3.3 or 3.2
+spark-sql --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
\
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
```shell
# For Spark versions: 3.0 - 3.1
-export SPARK_VERSION=3.1
-spark-sql --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.1 # or 3.0
+spark-sql --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
```shell
# For Spark version: 2.4
-spark-sql --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+spark-sql --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
</TabItem>
+</Tabs>
-</Tabs
->
+:::note on Kryo serialization
+It is recommended to set the following config to reduce Kryo serialization overhead:
+
+```
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieKryoRegistrar'
+```
+:::
:::note for Spark 3.2 and higher versions
-Use scala 2.12 builds with an additional config: --conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
+Use scala 2.12 builds with an additional config:
+
+```
+--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
+```
:::
### Setup project