This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 5cbcf13 Travis CI build asf-site
5cbcf13 is described below
commit 5cbcf131a82f00b1cb72872fa1302efff4a09646
Author: CI <[email protected]>
AuthorDate: Tue Jul 27 07:50:55 2021 +0000
Travis CI build asf-site
---
content/404.html | 2 +-
content/activity.html | 38 +-
content/assets/css/main.css | 2 +-
.../Hudi_design_diagram_-_Page_2_1.png | Bin 0 -> 52035 bytes
.../Screen_Shot_2021-07-20_at_5.35.47_PM.png | Bin 0 -> 163959 bytes
.../images/blog/datalake-platform/hudi-comic.png | Bin 0 -> 93630 bytes
.../datalake-platform/hudi-data-lake-platform.png | Bin 0 -> 128340 bytes
.../hudi-data-lake-platform_-_Copy_of_Page_1_3.png | Bin 0 -> 130359 bytes
.../hudi-data-lake-platform_-_Page_2_4.png | Bin 0 -> 282177 bytes
.../hudi-design-diagram_-incr-read.png | Bin 0 -> 57567 bytes
.../hudi-design-diagrams-table-format.png | Bin 0 -> 42148 bytes
.../hudi-design-diagrams_-_Page_2_1.png | Bin 0 -> 70552 bytes
.../hudi-design-diagrams_-_Page_4.png | Bin 0 -> 81348 bytes
.../hudi-design-diagrams_-_Page_5.png | Bin 0 -> 123834 bytes
.../hudi-design-diagrams_-_Page_6.png | Bin 0 -> 74018 bytes
.../hudi-design-diagrams_-_Page_7.png | Bin 0 -> 95019 bytes
.../hudi-design-diagrams_-_Page_8.png | Bin 0 -> 39783 bytes
content/assets/js/lunr/lunr-store.js | 5 +
content/blog.html | 38 +-
content/blog/apache-hudi-apache-zepplin/index.html | 2 +-
.../blog/apache-hudi-meets-apache-flink/index.html | 2 +-
content/blog/asf-incubation/index.html | 2 +-
.../async-compaction-deployment-model/index.html | 2 +-
content/blog/batch-vs-incremental/index.html | 4 +-
.../cdc-solution-using-hudi-by-nclouds/index.html | 2 +-
content/blog/change-capture-using-aws/index.html | 4 +-
content/blog/delete-support-in-hudi/index.html | 2 +-
.../index.html | 2 +-
.../index.html | 2 +-
content/blog/exporting-hudi-datasets/index.html | 2 +-
.../index.html | 2 +-
content/blog/hudi-clustering-intro/index.html | 2 +-
content/blog/hudi-file-sizing/index.html | 2 +-
.../index.html | 2 +-
content/blog/hudi-indexing-mechanisms/index.html | 4 +-
content/blog/hudi-key-generators/index.html | 2 +-
.../blog/hudi-meets-aws-emr-and-aws-dms/index.html | 2 +-
.../ingest-multiple-tables-using-hudi/index.html | 2 +-
content/blog/ingesting-database-changes/index.html | 4 +-
.../index.html | 2 +-
.../blog/registering-dataset-to-hive/index.html | 4 +-
content/blog/strata-talk-2017/index.html | 2 +-
.../blog/streaming-data-lake-platform/index.html | 396 +++++++++++++++++++++
content/cn/activity.html | 38 +-
content/cn/community.html | 4 +-
content/cn/contributing.html | 4 +-
content/cn/download.html | 4 +-
content/cn/older-releases.html | 4 +-
content/cn/releases.html | 4 +-
content/cn/security.html | 4 +-
content/community.html | 4 +-
content/contributing.html | 4 +-
content/download.html | 4 +-
content/index.html | 6 +-
content/older-releases.html | 4 +-
content/releases.html | 4 +-
content/roadmap.html | 4 +-
content/security.html | 4 +-
content/sitemap.xml | 4 +
59 files changed, 557 insertions(+), 80 deletions(-)
diff --git a/content/404.html b/content/404.html
index 5913096..09954e4 100644
--- a/content/404.html
+++ b/content/404.html
@@ -130,7 +130,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/activity.html b/content/activity.html
index 9171a39..7731933 100644
--- a/content/activity.html
+++ b/content/activity.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Activities - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -195,6 +195,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/streaming-data-lake-platform/" rel="permalink">Apache
Hudi - The Streaming Data Lake Platform
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2021-07-21">July 21, 2021</time></div>
+
+ <p class="archive__item-excerpt" itemprop="description">It’s been called
many things. But, we have always been building a data lake platform
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/employing-right-configurations-for-hudi-cleaner/"
rel="permalink">Employing correct configurations for Hudi’s cleaner table
service
</a>
@@ -329,7 +353,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-11-11">November 11, 2020</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-11-11">November 11, 2020</time></div>
<p class="archive__item-excerpt" itemprop="description">Detailing
different indexing mechanisms in Hudi and when to use each of them
</p>
@@ -593,7 +617,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-01-20">January 20, 2020</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-01-20">January 20, 2020</time></div>
<p class="archive__item-excerpt" itemprop="description">In this blog, we
will build an end-end solution for capturing changes from a MySQL instance
running on AWS RDS to a Hudi table on S3, using capabilities in the Hudi 0.5.1
release.
</p>
@@ -648,7 +672,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-09-09">September 9, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-09-09">September 9, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">Learn how to
ingesting changes from a HUDI dataset using Sqoop/Hudi
</p>
@@ -672,7 +696,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-05-14">May 14, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-05-14">May 14, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">How to manually
register HUDI dataset into Hive using beeline
</p>
@@ -696,7 +720,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-03-07">March 7, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-03-07">March 7, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">
</p>
diff --git a/content/assets/css/main.css b/content/assets/css/main.css
index 11fe1ed..8c7ad06 100644
--- a/content/assets/css/main.css
+++ b/content/assets/css/main.css
@@ -1 +1 @@
-table{border-color:#1ab7ea !important}.page a{color:#3b9cba
!important}.page__content{font-size:17px}.page__content.releases{font-size:17px}.page__footer{font-size:15px
!important}.page__footer a{color:#3b9cba !important}.page__content
.notice,.page__content .notice--primary,.page__content
.notice--info,.page__content .notice--warning,.page__content
.notice--success,.page__content .notice--danger{font-size:0.8em
!important}.page__content table{font-size:0.8em !important}.page__content ta
[...]
+table{border-color:#1ab7ea !important}.page a{color:#3b9cba
!important}.page__content{font-size:17px}.page__content.releases{font-size:17px}.page__footer{font-size:15px
!important}.page__footer a{color:#3b9cba !important}.page__content
.notice,.page__content .notice--primary,.page__content
.notice--info,.page__content .notice--warning,.page__content
.notice--success,.page__content .notice--danger{font-size:0.8em
!important}.page__content table{font-size:0.8em !important}.page__content ta
[...]
diff --git
a/content/assets/images/blog/datalake-platform/Hudi_design_diagram_-_Page_2_1.png
b/content/assets/images/blog/datalake-platform/Hudi_design_diagram_-_Page_2_1.png
new file mode 100644
index 0000000..9d4a923
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/Hudi_design_diagram_-_Page_2_1.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png
b/content/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png
new file mode 100644
index 0000000..69272ca
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png
differ
diff --git a/content/assets/images/blog/datalake-platform/hudi-comic.png
b/content/assets/images/blog/datalake-platform/hudi-comic.png
new file mode 100644
index 0000000..7b5521f
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-comic.png differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-data-lake-platform.png
b/content/assets/images/blog/datalake-platform/hudi-data-lake-platform.png
new file mode 100644
index 0000000..b279316
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-data-lake-platform.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Copy_of_Page_1_3.png
b/content/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Copy_of_Page_1_3.png
new file mode 100644
index 0000000..a291ecd
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Copy_of_Page_1_3.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Page_2_4.png
b/content/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Page_2_4.png
new file mode 100644
index 0000000..8cd3e7d
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Page_2_4.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagram_-incr-read.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagram_-incr-read.png
new file mode 100644
index 0000000..87ea844
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagram_-incr-read.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams-table-format.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams-table-format.png
new file mode 100644
index 0000000..e00bcec
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams-table-format.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_2_1.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_2_1.png
new file mode 100644
index 0000000..a684c5e
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_2_1.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png
new file mode 100644
index 0000000..8747577
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png
new file mode 100644
index 0000000..6dbd70e
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png
new file mode 100644
index 0000000..e1decf0
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png
new file mode 100644
index 0000000..a0f8c25
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png
differ
diff --git
a/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png
new file mode 100644
index 0000000..8b1e55e
Binary files /dev/null and
b/content/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png
differ
diff --git a/content/assets/js/lunr/lunr-store.js
b/content/assets/js/lunr/lunr-store.js
index ec83ab8..2eea083 100644
--- a/content/assets/js/lunr/lunr-store.js
+++ b/content/assets/js/lunr/lunr-store.js
@@ -1703,4 +1703,9 @@ var store = [{
"excerpt":"Apache Hudi provides snapshot isolation between writers and
readers. This is made possible by Hudi’s MVCC concurrency model. In this blog,
we will explain how to employ the right configurations to manage multiple file
versions. Furthermore, we will discuss mechanisms available to users on how to
maintain just the required...","categories": ["blog"],
"tags": [],
"url":
"https://hudi.apache.org/blog/employing-right-configurations-for-hudi-cleaner/",
+ "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
+ "title": "Apache Hudi - The Streaming Data Lake Platform",
+ "excerpt":"As early as 2016, we set out a bold, new vision reimagining
batch data processing through a new “incremental” data processing stack -
alongside the existing batch and streaming stacks. While a stream processing
pipeline does row-oriented processing, delivering a few seconds of processing
latency, an incremental pipeline would apply...","categories": ["blog"],
+ "tags": [],
+ "url": "https://hudi.apache.org/blog/streaming-data-lake-platform/",
"teaser":"https://hudi.apache.org/assets/images/500x300.png"},]
diff --git a/content/blog.html b/content/blog.html
index 3b4b8fe..a060d9b 100644
--- a/content/blog.html
+++ b/content/blog.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Blog - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -193,6 +193,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/streaming-data-lake-platform/" rel="permalink">Apache
Hudi - The Streaming Data Lake Platform
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2021-07-21">July 21, 2021</time></div>
+
+ <p class="archive__item-excerpt" itemprop="description">It’s been called
many things. But, we have always been building a data lake platform
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/employing-right-configurations-for-hudi-cleaner/"
rel="permalink">Employing correct configurations for Hudi’s cleaner table
service
</a>
@@ -327,7 +351,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-11-11">November 11, 2020</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-11-11">November 11, 2020</time></div>
<p class="archive__item-excerpt" itemprop="description">Detailing
different indexing mechanisms in Hudi and when to use each of them
</p>
@@ -591,7 +615,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-01-20">January 20, 2020</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-01-20">January 20, 2020</time></div>
<p class="archive__item-excerpt" itemprop="description">In this blog, we
will build an end-end solution for capturing changes from a MySQL instance
running on AWS RDS to a Hudi table on S3, using capabilities in the Hudi 0.5.1
release.
</p>
@@ -646,7 +670,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-09-09">September 9, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-09-09">September 9, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">Learn how to
ingesting changes from a HUDI dataset using Sqoop/Hudi
</p>
@@ -670,7 +694,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-05-14">May 14, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-05-14">May 14, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">How to manually
register HUDI dataset into Hive using beeline
</p>
@@ -694,7 +718,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-03-07">March 7, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-03-07">March 7, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">
</p>
diff --git a/content/blog/apache-hudi-apache-zepplin/index.html
b/content/blog/apache-hudi-apache-zepplin/index.html
index fe633ab..8c2bbdb 100644
--- a/content/blog/apache-hudi-apache-zepplin/index.html
+++ b/content/blog/apache-hudi-apache-zepplin/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/apache-hudi-meets-apache-flink/index.html
b/content/blog/apache-hudi-meets-apache-flink/index.html
index a27ee5f..8102a6d 100644
--- a/content/blog/apache-hudi-meets-apache-flink/index.html
+++ b/content/blog/apache-hudi-meets-apache-flink/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/asf-incubation/index.html
b/content/blog/asf-incubation/index.html
index 5d99608..c0d3b8b 100644
--- a/content/blog/asf-incubation/index.html
+++ b/content/blog/asf-incubation/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/async-compaction-deployment-model/index.html
b/content/blog/async-compaction-deployment-model/index.html
index 3f0d2e2..2a322c4 100644
--- a/content/blog/async-compaction-deployment-model/index.html
+++ b/content/blog/async-compaction-deployment-model/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/batch-vs-incremental/index.html
b/content/blog/batch-vs-incremental/index.html
index bbf65f8..ac4aa1a 100644
--- a/content/blog/batch-vs-incremental/index.html
+++ b/content/blog/batch-vs-incremental/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -187,7 +187,7 @@
<h1 id="page-title" class="page__title" itemprop="headline">Big
Batch vs Incremental Processing
</h1>
<!-- Output author details if some exist. -->
- <div class="page__author"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-03-07">March 7, 2019</time></span>
+ <div class="page__author"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-03-07">March 7, 2019</time></span>
</header>
diff --git a/content/blog/cdc-solution-using-hudi-by-nclouds/index.html
b/content/blog/cdc-solution-using-hudi-by-nclouds/index.html
index 2b883bc..013627e 100644
--- a/content/blog/cdc-solution-using-hudi-by-nclouds/index.html
+++ b/content/blog/cdc-solution-using-hudi-by-nclouds/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/change-capture-using-aws/index.html
b/content/blog/change-capture-using-aws/index.html
index 3bb299c..4abe208 100644
--- a/content/blog/change-capture-using-aws/index.html
+++ b/content/blog/change-capture-using-aws/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -187,7 +187,7 @@
<h1 id="page-title" class="page__title" itemprop="headline">Change
Capture Using AWS Database Migration Service and Hudi
</h1>
<!-- Output author details if some exist. -->
- <div class="page__author"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-01-20">January 20, 2020</time></span>
+ <div class="page__author"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-01-20">January 20, 2020</time></span>
</header>
diff --git a/content/blog/delete-support-in-hudi/index.html
b/content/blog/delete-support-in-hudi/index.html
index 12df6ac..b814973 100644
--- a/content/blog/delete-support-in-hudi/index.html
+++ b/content/blog/delete-support-in-hudi/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git
a/content/blog/efficient-migration-of-large-parquet-tables/index.html
b/content/blog/efficient-migration-of-large-parquet-tables/index.html
index b6e668d..54d2ab9 100644
--- a/content/blog/efficient-migration-of-large-parquet-tables/index.html
+++ b/content/blog/efficient-migration-of-large-parquet-tables/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git
a/content/blog/employing-right-configurations-for-hudi-cleaner/index.html
b/content/blog/employing-right-configurations-for-hudi-cleaner/index.html
index 1482db9..fe2ab7d 100644
--- a/content/blog/employing-right-configurations-for-hudi-cleaner/index.html
+++ b/content/blog/employing-right-configurations-for-hudi-cleaner/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/exporting-hudi-datasets/index.html
b/content/blog/exporting-hudi-datasets/index.html
index 207da52..7d43924 100644
--- a/content/blog/exporting-hudi-datasets/index.html
+++ b/content/blog/exporting-hudi-datasets/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git
a/content/blog/high-perf-data-lake-with-hudi-and-alluxio-t3go/index.html
b/content/blog/high-perf-data-lake-with-hudi-and-alluxio-t3go/index.html
index e9fb1b9..a932cde 100644
--- a/content/blog/high-perf-data-lake-with-hudi-and-alluxio-t3go/index.html
+++ b/content/blog/high-perf-data-lake-with-hudi-and-alluxio-t3go/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/hudi-clustering-intro/index.html
b/content/blog/hudi-clustering-intro/index.html
index 1e15398..86f0e21 100644
--- a/content/blog/hudi-clustering-intro/index.html
+++ b/content/blog/hudi-clustering-intro/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/hudi-file-sizing/index.html
b/content/blog/hudi-file-sizing/index.html
index e900624..22cf3ba 100644
--- a/content/blog/hudi-file-sizing/index.html
+++ b/content/blog/hudi-file-sizing/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/hudi-incremental-processing-on-data-lakes/index.html
b/content/blog/hudi-incremental-processing-on-data-lakes/index.html
index 1c7a0f5..88a204f 100644
--- a/content/blog/hudi-incremental-processing-on-data-lakes/index.html
+++ b/content/blog/hudi-incremental-processing-on-data-lakes/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/hudi-indexing-mechanisms/index.html
b/content/blog/hudi-indexing-mechanisms/index.html
index e6115fa..6c58b04 100644
--- a/content/blog/hudi-indexing-mechanisms/index.html
+++ b/content/blog/hudi-indexing-mechanisms/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -187,7 +187,7 @@
<h1 id="page-title" class="page__title"
itemprop="headline">Employing the right indexes for fast updates, deletes in
Apache Hudi
</h1>
<!-- Output author details if some exist. -->
- <div class="page__author"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-11-11">November 11, 2020</time></span>
+ <div class="page__author"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-11-11">November 11, 2020</time></span>
</header>
diff --git a/content/blog/hudi-key-generators/index.html
b/content/blog/hudi-key-generators/index.html
index 568f79e..c34cead 100644
--- a/content/blog/hudi-key-generators/index.html
+++ b/content/blog/hudi-key-generators/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/hudi-meets-aws-emr-and-aws-dms/index.html
b/content/blog/hudi-meets-aws-emr-and-aws-dms/index.html
index c37ed8b..156e5af 100644
--- a/content/blog/hudi-meets-aws-emr-and-aws-dms/index.html
+++ b/content/blog/hudi-meets-aws-emr-and-aws-dms/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/ingest-multiple-tables-using-hudi/index.html
b/content/blog/ingest-multiple-tables-using-hudi/index.html
index b6f5fab..6f3132a 100644
--- a/content/blog/ingest-multiple-tables-using-hudi/index.html
+++ b/content/blog/ingest-multiple-tables-using-hudi/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/ingesting-database-changes/index.html
b/content/blog/ingesting-database-changes/index.html
index ea21b20..fa53a84 100644
--- a/content/blog/ingesting-database-changes/index.html
+++ b/content/blog/ingesting-database-changes/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -187,7 +187,7 @@
<h1 id="page-title" class="page__title"
itemprop="headline">Ingesting Database changes via Sqoop/Hudi
</h1>
<!-- Output author details if some exist. -->
- <div class="page__author"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-09-09">September 9, 2019</time></span>
+ <div class="page__author"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-09-09">September 9, 2019</time></span>
</header>
diff --git a/content/blog/monitoring-hudi-metrics-with-datadog/index.html
b/content/blog/monitoring-hudi-metrics-with-datadog/index.html
index 2ef7eda..9cfe44b 100644
--- a/content/blog/monitoring-hudi-metrics-with-datadog/index.html
+++ b/content/blog/monitoring-hudi-metrics-with-datadog/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/registering-dataset-to-hive/index.html
b/content/blog/registering-dataset-to-hive/index.html
index fd8b6ee..647147f 100644
--- a/content/blog/registering-dataset-to-hive/index.html
+++ b/content/blog/registering-dataset-to-hive/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -187,7 +187,7 @@
<h1 id="page-title" class="page__title"
itemprop="headline">Registering sample dataset to Hive via beeline
</h1>
<!-- Output author details if some exist. -->
- <div class="page__author"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-05-14">May 14, 2019</time></span>
+ <div class="page__author"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-05-14">May 14, 2019</time></span>
</header>
diff --git a/content/blog/strata-talk-2017/index.html
b/content/blog/strata-talk-2017/index.html
index b2165c9..4e65e21 100644
--- a/content/blog/strata-talk-2017/index.html
+++ b/content/blog/strata-talk-2017/index.html
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/blog/streaming-data-lake-platform/index.html
b/content/blog/streaming-data-lake-platform/index.html
new file mode 100644
index 0000000..b4b6135
--- /dev/null
+++ b/content/blog/streaming-data-lake-platform/index.html
@@ -0,0 +1,396 @@
+<!doctype html>
+<html lang="en" class="no-js">
+ <head>
+ <meta charset="utf-8">
+
+<!-- begin _includes/seo.html --><title>Apache Hudi - The Streaming Data Lake
Platform - Apache Hudi</title>
+<meta name="description" content="It’s been called many things. But, we have
always been building a data lake platform">
+
+<meta property="og:type" content="article">
+<meta property="og:locale" content="en_US">
+<meta property="og:site_name" content="">
+<meta property="og:title" content="Apache Hudi - The Streaming Data Lake
Platform">
+<meta property="og:url"
content="https://hudi.apache.org/blog/streaming-data-lake-platform/">
+
+
+ <meta property="og:description" content="It’s been called many things. But,
we have always been building a data lake platform">
+
+
+
+
+
+
+
+
+
+
+
+<!-- end _includes/seo.html -->
+
+
+<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title="
Feed">-->
+
+<!-- https://t.co/dKP3o1e -->
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<script>
+ document.documentElement.className =
document.documentElement.className.replace(/\bno-js\b/g, '') + ' js ';
+</script>
+
+<!-- For all browsers -->
+<link rel="stylesheet" href="/assets/css/main.css">
+
+<!--[if IE]>
+ <style>
+ /* old IE unsupported flexbox fixes */
+ .greedy-nav .site-title {
+ padding-right: 3em;
+ }
+ .greedy-nav button {
+ position: absolute;
+ top: 0;
+ right: 0;
+ height: 100%;
+ }
+ </style>
+<![endif]-->
+
+
+
+<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico">
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+<script src="/assets/js/jquery.min.js"></script>
+
+
+<script src="/assets/js/main.min.js"></script>
+
+ </head>
+
+ <body class="layout--single">
+ <!--[if lt IE 9]>
+<div class="notice--danger align-center" style="margin: 0;">You are using an
<strong>outdated</strong> browser. Please <a
href="https://browsehappy.com/">upgrade your browser</a> to improve your
experience.</div>
+<![endif]-->
+
+ <div class="masthead">
+ <div class="masthead__inner-wrap" id="masthead__inner-wrap">
+ <div class="masthead__menu">
+ <nav id="site-nav" class="greedy-nav">
+
+ <a class="site-logo" href="/">
+ <div style="width: 150px; height: 40px">
+ </div>
+ </a>
+
+ <a class="site-title" href="/">
+
+ </a>
+ <ul class="visible-links"><li class="masthead__menu-item">
+ <a href="/docs/spark_quick-start-guide.html" target="_self"
>Documentation</a>
+ </li><li class="masthead__menu-item">
+ <a href="/community.html" target="_self" >Community</a>
+ </li><li class="masthead__menu-item">
+ <a href="/blog.html" target="_self" >Blog</a>
+ </li><li class="masthead__menu-item">
+ <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ"
target="_blank" >FAQ</a>
+ </li><li class="masthead__menu-item">
+ <a href="/docs/powered_by.html" target="_self" >Powered By</a>
+ </li><li class="masthead__menu-item">
+ <a href="/releases.html" target="_self" >Releases</a>
+ </li><li class="masthead__menu-item">
+ <a href="/download.html" target="_self" >Download</a>
+ </li></ul>
+ <button class="greedy-nav__toggle hidden" type="button">
+ <span class="visually-hidden">Toggle menu</span>
+ <div class="navicon"></div>
+ </button>
+ <ul class="hidden-links hidden"></ul>
+ </nav>
+ </div>
+ </div>
+</div>
+<!--
+<p class="notice--warning" style="margin: 0 !important; text-align: center
!important;"><strong>Note:</strong> This site is work in progress, if you
notice any issues, please <a target="_blank"
href="https://github.com/apache/hudi/issues">Report on Issue</a>.
+ Click <a href="/"> here</a> back to old site.</p>
+-->
+
+ <div class="initial-content">
+ <div id="main" role="main">
+
+
+ <div class="sidebar sticky">
+
+
+ <div itemscope itemtype="https://schema.org/Person">
+
+ <div class="author__content">
+
+ <h3 class="author__name" itemprop="name">Quick Links</h3>
+
+
+ <div class="author__bio" itemprop="description">
+ <p>Hudi is the Streaming Data Lake Platform.</p>
+
+ </div>
+
+ </div>
+
+ <div class="author__urls-wrapper">
+ <ul class="author__urls social-icons">
+
+
+ <li><a href="/docs/spark_quick-start-guide" target="_self"
rel="nofollow noopener noreferrer"><i class="fa fa-book"
aria-hidden="true"></i> Documentation</a></li>
+
+
+
+ <li><a href="https://cwiki.apache.org/confluence/display/HUDI"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-wikipedia-w"
aria-hidden="true"></i> Technical Wiki</a></li>
+
+
+
+ <li><a href="/contributing" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-thumbs-o-up" aria-hidden="true"></i> Contribution
Guide</a></li>
+
+
+
+ <li><a
href="https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-slack"
aria-hidden="true"></i> Join on Slack</a></li>
+
+
+
+ <li><a href="https://github.com/apache/hudi" target="_blank"
rel="nofollow noopener noreferrer"><i class="fa fa-github"
aria-hidden="true"></i> Fork on GitHub</a></li>
+
+
+
+ <li><a href="https://issues.apache.org/jira/projects/HUDI/summary"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-navicon"
aria-hidden="true"></i> Report Issues</a></li>
+
+
+
+ <li><a href="/security" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Security
Issues</a></li>
+
+
+
+
+ </ul>
+ </div>
+</div>
+
+
+
+
+ </div>
+
+
+ <article class="page" itemscope itemtype="https://schema.org/CreativeWork">
+ <!-- Look the author details up from the site config. -->
+
+
+ <div class="page__inner-wrap">
+
+ <header>
+ <h1 id="page-title" class="page__title" itemprop="headline">Apache
Hudi - The Streaming Data Lake Platform
+</h1>
+ <!-- Output author details if some exist. -->
+ <div class="page__author"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2021-07-21">July 21, 2021</time></div>
+ </header>
+
+
+ <section class="page__content" itemprop="text">
+
+ <style>
+ .page {
+ padding-right: 0 !important;
+ }
+ </style>
+
+ <p>As early as 2016, we set out a <a
href="https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/">bold,
new vision</a> reimagining batch data processing through a new
“<strong>incremental</strong>” data processing stack - alongside the existing
batch and streaming stacks.
+While a stream processing pipeline does row-oriented processing, delivering a
few seconds of processing latency, an incremental pipeline would apply the same
principles to <em>columnar</em> data in the data lake,
+delivering orders of magnitude improvements in processing efficiency within
a few minutes, on extremely scalable batch storage/compute infrastructure. This
new stack would be able to effortlessly support regular batch processing for
bulk reprocessing/backfilling as well.
+Hudi was built as the manifestation of this vision, rooted in real, hard
problems faced at <a
href="https://eng.uber.com/uber-big-data-platform/">Uber</a> and later took a
life of its own in the open source community. Together, we have been able to
+usher in fully incremental data ingestion and moderately complex ETLs on data
lakes already.</p>
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Page_2_4.png"
alt="the different components that make up the stream and batch processing
stack today, showing how an incremental stack blends the best of both the
worlds." /></p>
+
+<p>Today, this grand vision of being able to express almost any batch pipeline
incrementally is more attainable than it ever was. Stream processing is <a
href="https://flink.apache.org/blog/">maturing rapidly</a> and gaining <a
href="https://www.confluent.io/blog/every-company-is-becoming-software/">tremendous
momentum</a>,
+with <a
href="https://flink.apache.org/2021/03/11/batch-execution-mode.html">generalization</a>
of stream processing APIs to work over a batch execution model. Hudi completes
the missing pieces of the puzzle by providing streaming optimized lake storage,
+much like how Kafka/Pulsar enable efficient storage for event streaming. <a
href="https://hudi.apache.org/docs/powered_by.html">Many organizations</a> have
already reaped real benefits of adopting a streaming model for their data
lakes, in terms of fresh data, simplified architecture and great cost
reductions.</p>
+
+<p>But first, we needed to tackle the basics - transactions and mutability -
on the data lake. In many ways, Apache Hudi pioneered the transactional data
lake movement as we know it today. Specifically, during a time when more
special-purpose systems were being born, Hudi introduced a server-less,
transaction layer, which worked over the general-purpose Hadoop FileSystem
abstraction on Cloud Stores/HDFS. This model helped Hudi to scale
writers/readers to 1000s of cores on day one, compar [...]
+
+<p>This is going to be a rather long post, but we will do our best to make it
worth your time. Let’s roll.</p>
+
+<h2 id="data-lake-platform">Data Lake Platform</h2>
+
+<p>We have noticed that Hudi is sometimes positioned as a “<a
href="https://cloud.google.com/blog/products/data-analytics/getting-started-with-new-table-formats-on-dataproc">table
format</a>” or “transactional layer”. While this is not incorrect, this does
not do full justice to all that Hudi has to offer.</p>
+
+<h3 id="is-hudi-a-format">Is Hudi a “format”?</h3>
+
+<p>Hudi was not designed as a general purpose table format, tracking
files/folders for batch processing. Rather, the functionality provided by a
table format is merely one layer in the Hudi software stack. Hudi was designed
to play well with the Hive format (if you will), given how popular and
widespread it is. Over time, to solve scaling challenges or bring in additional
functionality, we have invested in our own native table format with an eye for
incremental processing vision. for e.g [...]
+
+<h3 id="is-hudi-a-transactional-layer">Is Hudi a transactional layer?</h3>
+
+<p>Of course, Hudi had to provide transactions for implementing
deletes/updates, but Hudi’s transactional layer is designed around an <a
href="https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying">event
log</a> that is also well-integrated with an entire set of built-in table/data
services. For e.g compaction is aware of clustering actions already scheduled
and optimizes by skipping over the files being clustered [...]
+
+<p>Thus, the best way to describe Apache Hudi is as a <strong>Streaming Data
Lake Platform</strong> built around a <em>database kernel</em>. The words carry
significant meaning.</p>
+
+<p><img
src="/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png"
alt="/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png"
/></p>
+
+<p><strong>Streaming</strong>: At its core, by optimizing for fast upserts
& change streams, Hudi provides the primitives to data lake workloads that
are comparable to what <a href="https://kafka.apache.org/">Apache Kafka</a>
does for event-streaming (namely, incremental produce/consume of events and a
state-store for interactive querying).</p>
+
+<p><strong>Data Lake</strong>: Nonetheless, Hudi provides an optimized,
self-managing data plane for large scale data processing on the lake (adhoc
queries, ML pipelines, batch pipelines), powering arguably the <a
href="https://eng.uber.com/apache-hudi-graduation/">largest transactional
lake</a> in the world. While Hudi can be used to build a <a
href="https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html">lakehouse</a>,
given its transactional capabilities, Hudi goes beyon [...]
+
+<p><strong>Platform</strong>: Oftentimes in open source, there is great tech,
but there are just too many of them - all differing ever so slightly in their
opinionated ways, ultimately making the integration task onerous on the end
user. Lake users deserve the same great usability that cloud warehouses
provide, with the additional freedom and transparency of a true open source
community. Hudi’s data and table services, tightly integrated with the Hudi
“kernel”, give us the ability to del [...]
+
+<h2 id="hudi-stack">Hudi Stack</h2>
+
+<p>The following stack captures layers of software components that make up
Hudi, with each layer depending on and drawing strength from the layer below.
Typically, data lake users write data out once using an open file format like
Apache <a href="http://parquet.apache.org/">Parquet</a>/<a
href="https://orc.apache.org/">ORC</a> stored on top of extremely scalable
cloud storage or distributed file systems. Hudi provides a self-managing data
plane to ingest, transform and manage this data, [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Copy_of_Page_1_3.png"
alt="Figure showing the Hudi stack" /></p>
+
+<p>Furthermore, Hudi either already provides or plans to add components that
make this data universally accessible to all the different query engines out
there. The features annotated with <code class="highlighter-rouge">*</code>
represent work in progress and dotted boxes represent planned future work, to
complete our vision for the project.
+While we have strawman designs outlined for the newer components in the blog,
we welcome with open arms fresh perspectives from the community.
+The rest of the blog will delve into each layer in our stack - explaining what it
does, how it’s designed for incremental processing and how it will evolve in
the future.</p>
+
+<h2 id="lake-storage">Lake Storage</h2>
+
+<p>Hudi interacts with lake storage using the <a
href="https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html">Hadoop
FileSystem API</a>, which makes it compatible with all of its implementations
ranging from HDFS to Cloud Stores to even in-memory filesystems like <a
href="https://www.alluxio.io/blog/building-high-performance-data-lake-using-apache-hudi-and-alluxio-at-t3go/">Alluxio</a>/Ignite.
Hudi internally implements its own <a href="https://github.com/apache/ [...]
+
+<h2 id="file-format">File Format</h2>
+
+<p>Hudi is designed around the notion of base file and delta log files that
store updates/deltas to a given base file (called a file slice). Their formats
are pluggable, with Parquet (columnar access) and HFile (indexed access) being
the supported base file formats today. The delta logs encode data in <a
href="http://avro.apache.org/">Avro</a> (row oriented) format for speedier
logging (just like Kafka topics for e.g). Going forward, we plan to <a
href="https://github.com/apache/hudi/pul [...]
+
+<p>Zooming one level up, Hudi’s unique file layout scheme encodes all changes
to a given base file, as a sequence of blocks (data blocks, delete blocks,
rollback blocks) that are merged in order to derive newer base files. In
essence, this makes up a self contained redo log that lets us implement
interesting features on top. For e.g, most of today’s data privacy enforcement
happens by masking data read off the lake storage on-the-fly, invoking
hashing/encryption algorithms over and o [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_2_1.png"
alt="Hudi base and delta logs" /></p>
+
+<h2 id="table-format">Table Format</h2>
+
+<p>The term “table format” is new and still means many things to many people.
Drawing an analogy to file formats, a table format simply consists of : the
file layout of the table, table’s schema and metadata tracking changes to the
table. Hudi is not a table format, it implements one internally. Hudi uses Avro
schemas to store, manage and evolve a table’s schema. Currently, Hudi enforces
schema-on-write, which although stricter than schema-on-read, is adopted <a
href="https://docs.conflu [...]
+
+<p>Hudi consciously lays out files within a table/partition into groups and
maintains a mapping between an incoming record’s key to an existing file group.
All updates are recorded into delta log files specific to a given file group
and this design ensures low merge overhead compared to approaches like Hive
ACID, which have to merge all delta records against all base files to satisfy
queries. For e.g, with uuid keys (used very widely) all base files are very
likely to overlap with all de [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams-table-format.png"
alt="Shows the Hudi table format components" /></p>
+
+<p>The <em>timeline</em> is the source-of-truth event log for all Hudi’s table
metadata, stored under the <code class="highlighter-rouge">.hoodie</code>
folder, that provides an ordered log of all actions performed on the table.
Events are retained on the timeline up to a configured interval of
time/activity. Each file group is also designed as its own self-contained log,
which means that even if an action that affected a file group is archived from
the timeline, the right state of the [...]
+
+<p>Lastly, new events on the timeline are then consumed and reflected onto an
internal metadata table, implemented as another merge-on-read table offering
low write amplification. Hudi is able to absorb quick/rapid changes to table’s
metadata, unlike table formats designed for slow-moving data. Additionally, the
metadata table uses the <a
href="https://hbase.apache.org/2.0/devapidocs/org/apache/hadoop/hbase/io/hfile/HFile.html">HFile</a>
base file format, which provides indexed lookups o [...]
+
+<p>A key challenge faced by all the table formats out there today, is the need
for expiring snapshots/controlling retention for time travel queries such that
it does not interfere with query planning/performance. In the future, we plan
to build an indexed timeline in Hudi, which can span the entire history of the
table, supporting a time travel look back window of several months/years.</p>
+
+<h2 id="indexes">Indexes</h2>
+
+<p>Indexes help databases plan better queries, that reduce the overall amount
of I/O and deliver faster response times. Table metadata about file listings
and column statistics are often enough for lake query engines to generate
optimized, engine specific query plans quickly. This is however not sufficient
for Hudi to realize fast upserts. Hudi already supports different key based
indexing schemes to quickly map incoming record keys into the file group they
reside in. For this purpose, H [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png"
alt="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png"
/></p>
+
+<p>In the future, we intend to add additional forms of indexing as new
partitions on the metadata table. Let’s discuss the role each one has to play
briefly. Query engines typically rely on partitioning to cut down the number of
files read for a given query. In database terms, a Hive partition is nothing
but a coarse range index, that maps a set of columns to a list of files. Table
formats born in the cloud like Iceberg/Delta Lake, have built-in tracking of
column ranges per file in a s [...]
+
+<p>While Hudi already supports external indexes for random write workloads, we
would like to support <a
href="https://github.com/apache/hudi/pull/2487">point-lookup-ish queries</a>
right on top of lake storage, which helps avoid the overhead of an additional
database for many classes of data applications. We also anticipate that
uuid/key based joins will be sped up a lot, by leveraging record level indexing
schemes, we build out for fast upsert performance. We also plan to move our
track [...]
+
+<h2 id="concurrency-control">Concurrency Control</h2>
+
+<p>Concurrency control defines how different writers/readers coordinate access
to the table. Hudi ensures atomic writes, by way of publishing commits
atomically to the timeline, stamped with an instant time that denotes the time
at which the action is deemed to have occurred. Unlike general purpose file
version control, Hudi draws clear distinction between writer processes (that
issue user’s upserts/deletes), table services (that write data/metadata to
optimize/perform bookkeeping) and r [...]
+
+<p>Projects that solely rely on OCC deal with competing operations, by either
implementing a lock or relying on atomic renames. Such approaches are
optimistic that real contention never happens and resort to failing one of the
writer operations if conflicts occur, which can cause significant resource
wastage or operational overhead. Imagine a scenario of two writer processes :
an ingest writer job producing new data every 30 minutes and a deletion writer
job that is enforcing GDPR taking [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/Hudi_design_diagram_-_Page_2_1.png"
alt="Figure showing competing transactions leading to starvation with just OCC"
/></p>
+
+<p>We are hard at work, improving our OCC based implementation around early
detection of conflicts for concurrent writers and terminate early without
burning up CPU resources. We are also working on <a
href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+22+%3A+Snapshot+Isolation+using+Optimistic+Concurrency+Control+for+multi-writers#RFC22:SnapshotIsolationusingOptimisticConcurrencyControlformultiwriters-FutureWork(LockFree-ishConcurrencyControl)">adding
fully log based</a>, non- [...]
+
+<h2 id="writers">Writers</h2>
+
+<p>Hudi tables can be used as sinks for Spark/Flink pipelines and the Hudi
writing path provides several enhanced capabilities over file writing done by
vanilla parquet/avro sinks. Hudi classifies write operations carefully into
incremental (<code class="highlighter-rouge">insert</code>, <code
class="highlighter-rouge">upsert</code>, <code
class="highlighter-rouge">delete</code>) and batch/bulk operations (<code
class="highlighter-rouge">insert_overwrite</code>, <code class="highlighter-
[...]
+
+<p>Keys are first class citizens inside Hudi and the pre-combining/index
lookups done before upsert/deletes ensure a key is unique across partitions or
within partitions, as desired. In contrast with other approaches where this is
left to data engineer to co-ordinate using <code
class="highlighter-rouge">MERGE INTO</code> statements, this approach ensures
quality data especially for critical use-cases. Hudi also ships with several <a
href="http://hudi.apache.org/blog/hudi-key-generators/ [...]
+
+<p>Hudi writers add metadata to each record, that codify the commit time and a
sequence number for each record within that commit (comparable to a Kafka
offset), which make it possible to derive record level change streams. Hudi
also provides users the ability to specify event time fields in incoming data
streams and track them in the timeline. Mapping these to stream processing
concepts, Hudi contains both <a
href="https://www.oreilly.com/radar/the-world-beyond-batch-streaming-101/">arri
[...]
+
+<h2 id="readers">Readers</h2>
+
+<p>Hudi provides snapshot isolation between writers and readers and allows for
any table snapshot to be queried consistently from all major lake query engines
(Spark, Hive, Flink, Presto, Trino, Impala) and even cloud warehouses like
Redshift. In fact, we would love to bring Hudi tables as external tables with
BigQuery/Snowflake as well, once they also embrace the lake table formats more
natively. Our design philosophy around query performance has been to make Hudi
as lightweight as poss [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagram_-incr-read.png"
alt="Log merging done for incremental queries" /></p>
+
+<p>True to its design goals, Hudi provides some very powerful incremental
querying capabilities that tie together the meta fields added during writing
and the file group based storage layout. While table formats that merely track
files, are only able to provide information about files that changed during
each snapshot or commits, Hudi generates the exact set of records that changed
given a point in the timeline, due to tracking of record level event and
arrival times. Furthermore, this [...]
+
+<h2 id="table-services">Table Services</h2>
+
+<p>What defines and sustains a project’s value over years are its fundamental
design principles and the subtle trade offs. Databases often consist of several
internal components, working in tandem to deliver efficiency, performance and
great operability to its users. True to intent to act as state store for
incremental data pipelines, we designed Hudi with built-in table services and
self-managing runtime that can orchestrate/trigger these services to optimize
everything internally. In f [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png"
alt="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png"
/></p>
+
+<p>There are several built-in table services, all with the goal of ensuring
performant table storage layout and metadata management, which are
automatically invoked either synchronously after each write operation, or
asynchronously as a separate background job. Furthermore, Spark (and Flink)
streaming writers can run in continuous mode, and invoke table services
asynchronously sharing the underlying executors intelligently with writers.
Archival service ensures that the timeline holds su [...]
+
+<p>We are always looking for ways to improve and enhance our table services in
meaningful ways. In the coming releases, we are working towards a much more <a
href="https://github.com/apache/hudi/pull/3233">scalable model</a> of cleaning
up partial writes, by consolidating marker file creation using our timeline
metaserver, which avoids expensive full table scans to seek out and remove
uncommitted files. We also have <a
href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageI [...]
+
+<h2 id="data-services">Data Services</h2>
+
+<p>As noted at the start, we wanted to make Hudi immediately usable for common
end-end use-cases and thus invested deeply into a set of data services, that
provide functionality that is data/workload specific, sitting on top of the
table services, writers/readers directly. Foremost in that list, is the Hudi
DeltaStreamer utility, which has been an extremely popular choice for
painlessly building a data lake out of Kafka streams and files landing in
different formats on top of lake stora [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png"
alt="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png"
/></p>
+
+<p>Going forward, we would love contributions to enhance our <a
href="http://hudi.apache.org/blog/ingest-multiple-tables-using-hudi/">multi
delta streamer utility</a>, which can ingest entire Kafka clusters in a single
large Spark application, to be on par and hardened. To further our progress
towards end-end complex incremental pipelines, we plan to work towards
enhancing the delta streamer utility and its SQL transformers to be triggered
by multiple source streams (as opposed to just t [...]
+
+<h2 id="timeline-metaserver">Timeline Metaserver</h2>
+
+<p>Storing and serving table metadata right on the lake storage is scalable,
but can be much less performant compared to RPCs against a scalable meta
server. Most cloud warehouses internally are built on a metadata layer that
leverages an external database (e.g <a
href="https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/">Snowflake
uses foundationDB</a>). Hudi also provides a metadata server, called the
“Timeline server”, which offers an alternative backing [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png"
alt="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png"
/></p>
+
+<h2 id="lake-cache">Lake Cache</h2>
+
+<p>There is a fundamental tradeoff today in data lakes between faster writing
and great query performance. Faster writing typically involves writing smaller
files (and later clustering them) or logging deltas (and later merging on
read). While this provides good performance already, the pursuit of great query
performance often warrants opening fewer number of files/objects on lake
storage and may be pre-materializing the merges between base and delta logs.
After all, most databases emplo [...]
+
+<p><img
src="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png"
alt="/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png"
/></p>
+
+<h2 id="onwards">Onwards</h2>
+
+<p>We hope that this blog painted a complete picture of Apache Hudi, staying
true to its founding principles. Interested users and readers can expect blogs
delving into each layer of the stack and an overhaul of our docs along these
lines in the coming weeks/months. We view the current efforts around table
formats as merely removing decade-old bottlenecks in data lake storage/query
planes, problems which have been already solved very well in cloud warehouses
like Big Query/Snowflake. We [...]
+
+ </section>
+
+ <a href="#masthead__inner-wrap" class="back-to-top">Back to top
↑</a>
+
+
+
+
+ </div>
+
+ </article>
+
+</div>
+
+ </div>
+
+ <div class="page__footer">
+ <footer>
+
+<div class="row">
+ <div class="col-lg-12 footer">
+ <p>
+ <table class="table-apache-info">
+ <tr>
+ <td>
+ <a class="footer-link-img" href="https://apache.org">
+ <img width="250px" src="/assets/images/asf_logo.svg" alt="The
Apache Software Foundation">
+ </a>
+ </td>
+ <td>
+ <a style="float: right"
href="https://www.apache.org/events/current-event.html">
+ <img
src="https://www.apache.org/events/current-event-234x60.png" />
+ </a>
+ </td>
+ </tr>
+ </table>
+ </p>
+ <p>
+ <a href="https://www.apache.org/licenses/">License</a> | <a
href="https://www.apache.org/security/">Security</a> | <a
href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
+ </p>
+ <p>
+ Copyright © <span id="copyright-year">2019</span> <a
href="https://apache.org">The Apache Software Foundation</a>, Licensed under
the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License,
Version 2.0</a>.
+ Hudi, Apache and the Apache feather logo are trademarks of The Apache
Software Foundation. <a href="/docs/privacy">Privacy Policy</a>
+ </p>
+ </div>
+</div>
+ </footer>
+ </div>
+
+
+ </body>
+</html>
\ No newline at end of file
diff --git a/content/cn/activity.html b/content/cn/activity.html
index dce906a..f95788c 100644
--- a/content/cn/activity.html
+++ b/content/cn/activity.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Activities - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
@@ -193,6 +193,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/streaming-data-lake-platform/" rel="permalink">Apache
Hudi - The Streaming Data Lake Platform
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2021-07-21">July 21, 2021</time></div>
+
+ <p class="archive__item-excerpt" itemprop="description">It’s been called
many things. But, we have always been building a data lake platform
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/employing-right-configurations-for-hudi-cleaner/"
rel="permalink">Employing correct configurations for Hudi’s cleaner table
service
</a>
@@ -327,7 +351,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-11-11">November 11, 2020</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-11-11">November 11, 2020</time></div>
<p class="archive__item-excerpt" itemprop="description">Detailing
different indexing mechanisms in Hudi and when to use each of them
</p>
@@ -591,7 +615,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2020-01-20">January 20, 2020</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2020-01-20">January 20, 2020</time></div>
<p class="archive__item-excerpt" itemprop="description">In this blog, we
will build an end-end solution for capturing changes from a MySQL instance
running on AWS RDS to a Hudi table on S3, using capabilities in the Hudi 0.5.1
release.
</p>
@@ -646,7 +670,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-09-09">September 9, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-09-09">September 9, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">Learn how to
ingesting changes from a HUDI dataset using Sqoop/Hudi
</p>
@@ -670,7 +694,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-05-14">May 14, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-05-14">May 14, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">How to manually
register HUDI dataset into Hive using beeline
</p>
@@ -694,7 +718,7 @@
<!-- Look the author details up from the site config. -->
<!-- Output author details if some exist. -->
- <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoth">Vinoth Chandar</a>
posted on <time datetime="2019-03-07">March 7, 2019</time></div>
+ <div class="archive__item-meta"><a
href="https://twitter.com/byte_array">Vinoth Chandar</a> posted on <time
datetime="2019-03-07">March 7, 2019</time></div>
<p class="archive__item-excerpt" itemprop="description">
</p>
diff --git a/content/cn/community.html b/content/cn/community.html
index 30f8a53..c989e87 100644
--- a/content/cn/community.html
+++ b/content/cn/community.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Community - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/cn/contributing.html b/content/cn/contributing.html
index 6ca346e..385de22 100644
--- a/content/cn/contributing.html
+++ b/content/cn/contributing.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Developer Setup - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/cn/download.html b/content/cn/download.html
index fb751f8..4b20130 100644
--- a/content/cn/download.html
+++ b/content/cn/download.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Download - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/cn/older-releases.html b/content/cn/older-releases.html
index d955abc..85e5bbb 100644
--- a/content/cn/older-releases.html
+++ b/content/cn/older-releases.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Older Releases - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/cn/releases.html b/content/cn/releases.html
index 1228ce7..1e9ee30 100644
--- a/content/cn/releases.html
+++ b/content/cn/releases.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Releases - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/cn/security.html b/content/cn/security.html
index 9e54068..b11d69f 100644
--- a/content/cn/security.html
+++ b/content/cn/security.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Security - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -126,7 +126,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/community.html b/content/community.html
index 07b8494..631d0c3 100644
--- a/content/community.html
+++ b/content/community.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Community - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/contributing.html b/content/contributing.html
index db19b97..9174881 100644
--- a/content/contributing.html
+++ b/content/contributing.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Developer Setup - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/download.html b/content/download.html
index f94dc1c..7c54bb7 100644
--- a/content/download.html
+++ b/content/download.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Download - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/index.html b/content/index.html
index 749448f..40e42dd 100644
--- a/content/index.html
+++ b/content/index.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Welcome to Apache Hudi ! - Apache
Hudi</title>
-<meta name="description" content="Apache Hudi ingests & manages storage of
large analytical datasets over DFS (hdfs or cloud stores). Latest release
0.8.0">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform. Latest release 0.8.0">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -13,7 +13,7 @@
<meta property="og:url" content="https://hudi.apache.org/">
- <meta property="og:description" content="Apache Hudi ingests & manages
storage of large analytical datasets over DFS (hdfs or cloud stores). Latest
release 0.8.0">
+ <meta property="og:description" content="Apache Hudi is the Streaming Data
Lake Platform. Latest release 0.8.0">
@@ -116,7 +116,7 @@
Welcome to Apache Hudi !
</h1>
- <p class="page__lead">Apache Hudi ingests & manages storage of large
analytical datasets over DFS (hdfs or cloud stores).<br /> <small><a
href="https://github.com/apache/hudi/releases/tag/release-0.8.0"
target="_blank">Latest release 0.8.0</a></small>
+ <p class="page__lead">Apache Hudi is the Streaming Data Lake
Platform.<br /> <small><a
href="https://github.com/apache/hudi/releases/tag/release-0.8.0"
target="_blank">Latest release 0.8.0</a></small>
</p>
<p>
<a href="/docs/spark_quick-start-guide.html" class="btn
btn--light-outline btn--large"><i class="fa fa-paper-plane"></i> Get Started</a>
diff --git a/content/older-releases.html b/content/older-releases.html
index 9f28729..9dd1195 100644
--- a/content/older-releases.html
+++ b/content/older-releases.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Older Releases - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/releases.html b/content/releases.html
index c616bb2..6d8efb6 100644
--- a/content/releases.html
+++ b/content/releases.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Releases - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/roadmap.html b/content/roadmap.html
index 78827fb..cfad109 100644
--- a/content/roadmap.html
+++ b/content/roadmap.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Roadmap - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/security.html b/content/security.html
index 9d3753b..a5ee070 100644
--- a/content/security.html
+++ b/content/security.html
@@ -4,7 +4,7 @@
<meta charset="utf-8">
<!-- begin _includes/seo.html --><title>Security - Apache Hudi</title>
-<meta name="description" content="Apache Hudi brings upserts, deletes and
stream processing to data lakes built on HDFS or cloud storage.">
+<meta name="description" content="Apache Hudi is the Streaming Data Lake
Platform.">
<meta property="og:type" content="website">
<meta property="og:locale" content="en_US">
@@ -128,7 +128,7 @@
<div class="author__bio" itemprop="description">
- <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+ <p>Hudi is the Streaming Data Lake Platform.</p>
</div>
diff --git a/content/sitemap.xml b/content/sitemap.xml
index c7388f0..fc47e3b 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -1365,6 +1365,10 @@
<lastmod>2021-06-10T00:00:00-04:00</lastmod>
</url>
<url>
+<loc>https://hudi.apache.org/blog/streaming-data-lake-platform/</loc>
+<lastmod>2021-07-21T00:00:00-04:00</lastmod>
+</url>
+<url>
<loc>https://hudi.apache.org/cn/activity</loc>
<lastmod>2019-12-30T14:59:57-05:00</lastmod>
</url>