This is an automated email from the ASF dual-hosted git repository.

xushiyan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 6748111d6ad [HUDI-5616][DOCS] Update spark quickstart with notes on 
HoodieKryoRegistrar (#11184)
6748111d6ad is described below

commit 6748111d6ad85017fca15d2df74981944857460b
Author: Shiyan Xu <[email protected]>
AuthorDate: Thu May 9 16:39:12 2024 -0500

    [HUDI-5616][DOCS] Update spark quickstart with notes on HoodieKryoRegistrar 
(#11184)
---
 website/docs/quick-start-guide.md | 60 +++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/website/docs/quick-start-guide.md 
b/website/docs/quick-start-guide.md
index 1359c5c2957..35e57dd9e01 100644
--- a/website/docs/quick-start-guide.md
+++ b/website/docs/quick-start-guide.md
@@ -53,17 +53,27 @@ From the extracted directory run spark-shell with Hudi:
 
 ```shell
 # For Spark versions: 3.2 - 3.4
-export SPARK_VERSION=3.4
-spark-shell --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf 
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.4 # or 3.3 or 3.2
+spark-shell --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 \
+--conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 ```shell
 # For Spark versions: 3.0 - 3.1
-export SPARK_VERSION=3.1
-spark-shell --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf 
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.1 # or 3.0
+spark-shell --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 ```shell
 # For Spark version: 2.4
-spark-shell --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 --conf 
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+spark-shell --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 </TabItem>
 
@@ -74,13 +84,13 @@ From the extracted directory run pyspark with Hudi:
 ```shell
 # For Spark versions: 3.2 - 3.4
 export PYSPARK_PYTHON=$(which python3)
-export SPARK_VERSION=3.4
+export SPARK_VERSION=3.4 # or 3.3 or 3.2
 pyspark --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 ```shell
 # For Spark versions: 3.0 - 3.1
 export PYSPARK_PYTHON=$(which python3)
-export SPARK_VERSION=3.1
+export SPARK_VERSION=3.1 # or 3.0
 pyspark --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 ```shell
@@ -97,26 +107,46 @@ From the extracted directory run Spark SQL with Hudi:
 
 ```shell
 # For Spark versions: 3.2 - 3.4
-export SPARK_VERSION=3.4
-spark-sql --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf 
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 --conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.4 # or 3.3 or 3.2
+spark-sql --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 ```shell
 # For Spark versions: 3.0 - 3.1
-export SPARK_VERSION=3.1
-spark-sql --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf 
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+export SPARK_VERSION=3.1 # or 3.0
+spark-sql --packages 
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 ```shell
 # For Spark version: 2.4
-spark-sql --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 --conf 
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' 
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
+spark-sql --packages org.apache.hudi:hudi-spark2.4-bundle_2.11:0.14.1 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
 ```
 
 </TabItem>
+</Tabs>
 
-</Tabs
->
+:::note on Kryo serialization
+It is recommended to set this config to reduce Kryo serialization overhead.
+
+```
+--conf 'spark.kryo.registrator=org.apache.spark.HoodieKryoRegistrar'
+```
+:::
 
 :::note for Spark 3.2 and higher versions
-Use scala 2.12 builds with an additional config: --conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
+Use scala 2.12 builds with an additional config: 
+
+```
+--conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
+```
 :::
 
 ### Setup project

Reply via email to