This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new b9f5826 Travis CI build asf-site
b9f5826 is described below
commit b9f5826062eb3612eca04e176d1fce7d99da9044
Author: CI <[email protected]>
AuthorDate: Mon Aug 31 16:59:10 2020 +0000
Travis CI build asf-site
---
content/activity.html | 24 +
.../assets/images/blog/incr-processing/image1.png | Bin 0 -> 59805 bytes
.../assets/images/blog/incr-processing/image2.png | Bin 0 -> 385336 bytes
.../assets/images/blog/incr-processing/image3.png | Bin 0 -> 167680 bytes
.../assets/images/blog/incr-processing/image4.jpg | Bin 0 -> 19807 bytes
.../assets/images/blog/incr-processing/image5.png | Bin 0 -> 225670 bytes
.../assets/images/blog/incr-processing/image6.png | Bin 0 -> 67083 bytes
.../assets/images/blog/incr-processing/image7.png | Bin 0 -> 44297 bytes
.../assets/images/blog/incr-processing/image8.png | Bin 0 -> 209792 bytes
content/assets/js/lunr/lunr-store.js | 5 +
content/blog.html | 24 +
.../index.html | 524 +++++++++++++++++++++
content/cn/activity.html | 24 +
content/sitemap.xml | 4 +
14 files changed, 605 insertions(+)
diff --git a/content/activity.html b/content/activity.html
index 44b6969..46e0160 100644
--- a/content/activity.html
+++ b/content/activity.html
@@ -191,6 +191,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/hudi-incremental-processing-on-data-lakes/"
rel="permalink">Incremental Processing on the Data Lake
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+ <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoyang">Vino Yang</a>
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+
+ <p class="archive__item-excerpt" itemprop="description">How Apache Hudi
provides ability for incremental data processing.
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/monitoring-hudi-metrics-with-datadog/"
rel="permalink">Monitor Hudi metrics with Datadog
</a>
diff --git a/content/assets/images/blog/incr-processing/image1.png
b/content/assets/images/blog/incr-processing/image1.png
new file mode 100644
index 0000000..b744803
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image1.png differ
diff --git a/content/assets/images/blog/incr-processing/image2.png
b/content/assets/images/blog/incr-processing/image2.png
new file mode 100644
index 0000000..becc5aa
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image2.png differ
diff --git a/content/assets/images/blog/incr-processing/image3.png
b/content/assets/images/blog/incr-processing/image3.png
new file mode 100644
index 0000000..d570455
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image3.png differ
diff --git a/content/assets/images/blog/incr-processing/image4.jpg
b/content/assets/images/blog/incr-processing/image4.jpg
new file mode 100644
index 0000000..dbacbf2
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image4.jpg differ
diff --git a/content/assets/images/blog/incr-processing/image5.png
b/content/assets/images/blog/incr-processing/image5.png
new file mode 100644
index 0000000..50b01bf
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image5.png differ
diff --git a/content/assets/images/blog/incr-processing/image6.png
b/content/assets/images/blog/incr-processing/image6.png
new file mode 100644
index 0000000..9f07ad9
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image6.png differ
diff --git a/content/assets/images/blog/incr-processing/image7.png
b/content/assets/images/blog/incr-processing/image7.png
new file mode 100644
index 0000000..909d6f6
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image7.png differ
diff --git a/content/assets/images/blog/incr-processing/image8.png
b/content/assets/images/blog/incr-processing/image8.png
new file mode 100644
index 0000000..2260886
Binary files /dev/null and
b/content/assets/images/blog/incr-processing/image8.png differ
diff --git a/content/assets/js/lunr/lunr-store.js
b/content/assets/js/lunr/lunr-store.js
index 10258db..0a94f11 100644
--- a/content/assets/js/lunr/lunr-store.js
+++ b/content/assets/js/lunr/lunr-store.js
@@ -1158,4 +1158,9 @@ var store = [{
"excerpt":"Availability 0.6.0 (unreleased) Introduction Datadog is a
popular monitoring service. In the upcoming 0.6.0 release of Apache Hudi, we
will introduce the feature of reporting Hudi metrics via Datadog HTTP API, in
addition to the current reporter types: Graphite and JMX. Configurations
Similar to other supported reporters, turning on Datadog...","categories":
["blog"],
"tags": [],
"url":
"https://hudi.apache.org/blog/monitoring-hudi-metrics-with-datadog/",
+ "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
+ "title": "Incremental Processing on the Data Lake",
+ "excerpt":"NOTE: This article is a translation of the infoq.cn
article, found here, with minor edits Apache Hudi is a data lake framework
which provides the ability to ingest, manage and query large analytical data
sets on a distributed file system/cloud stores. Hudi joined the Apache
incubator for incubation in January...","categories": ["blog"],
+ "tags": [],
+ "url":
"https://hudi.apache.org/blog/hudi-incremental-processing-on-data-lakes/",
"teaser":"https://hudi.apache.org/assets/images/500x300.png"},]
diff --git a/content/blog.html b/content/blog.html
index fbe3148..db59c19 100644
--- a/content/blog.html
+++ b/content/blog.html
@@ -189,6 +189,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/hudi-incremental-processing-on-data-lakes/"
rel="permalink">Incremental Processing on the Data Lake
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+ <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoyang">Vino Yang</a>
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+
+ <p class="archive__item-excerpt" itemprop="description">How Apache Hudi
provides ability for incremental data processing.
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/monitoring-hudi-metrics-with-datadog/"
rel="permalink">Monitor Hudi metrics with Datadog
</a>
diff --git a/content/blog/hudi-incremental-processing-on-data-lakes/index.html
b/content/blog/hudi-incremental-processing-on-data-lakes/index.html
new file mode 100644
index 0000000..1704322
--- /dev/null
+++ b/content/blog/hudi-incremental-processing-on-data-lakes/index.html
@@ -0,0 +1,524 @@
+<!doctype html>
+<html lang="en" class="no-js">
+ <head>
+ <meta charset="utf-8">
+
+<!-- begin _includes/seo.html --><title>Incremental Processing on the Data
Lake - Apache Hudi</title>
+<meta name="description" content="How Apache Hudi provides ability for
incremental data processing.">
+
+<meta property="og:type" content="article">
+<meta property="og:locale" content="en_US">
+<meta property="og:site_name" content="">
+<meta property="og:title" content="Incremental Processing on the Data Lake">
+<meta property="og:url"
content="https://hudi.apache.org/blog/hudi-incremental-processing-on-data-lakes/">
+
+
+ <meta property="og:description" content="How Apache Hudi provides ability
for incremental data processing.">
+
+
+
+
+
+
+
+
+
+
+
+<!-- end _includes/seo.html -->
+
+
+<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title="
Feed">-->
+
+<!-- https://t.co/dKP3o1e -->
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<script>
+ document.documentElement.className =
document.documentElement.className.replace(/\bno-js\b/g, '') + ' js ';
+</script>
+
+<!-- For all browsers -->
+<link rel="stylesheet" href="/assets/css/main.css">
+
+<!--[if IE]>
+ <style>
+ /* old IE unsupported flexbox fixes */
+ .greedy-nav .site-title {
+ padding-right: 3em;
+ }
+ .greedy-nav button {
+ position: absolute;
+ top: 0;
+ right: 0;
+ height: 100%;
+ }
+ </style>
+<![endif]-->
+
+
+
+<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico">
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+<script src="/assets/js/jquery.min.js"></script>
+
+
+<script src="/assets/js/main.min.js"></script>
+
+ </head>
+
+ <body class="layout--single">
+ <!--[if lt IE 9]>
+<div class="notice--danger align-center" style="margin: 0;">You are using an
<strong>outdated</strong> browser. Please <a
href="https://browsehappy.com/">upgrade your browser</a> to improve your
experience.</div>
+<![endif]-->
+
+ <div class="masthead">
+ <div class="masthead__inner-wrap" id="masthead__inner-wrap">
+ <div class="masthead__menu">
+ <nav id="site-nav" class="greedy-nav">
+
+ <a class="site-logo" href="/">
+ <div style="width: 150px; height: 40px">
+ </div>
+ </a>
+
+ <a class="site-title" href="/">
+
+ </a>
+ <ul class="visible-links"><li class="masthead__menu-item">
+ <a href="/docs/quick-start-guide.html" target="_self"
>Documentation</a>
+ </li><li class="masthead__menu-item">
+ <a href="/community.html" target="_self" >Community</a>
+ </li><li class="masthead__menu-item">
+ <a href="/blog.html" target="_self" >Blog</a>
+ </li><li class="masthead__menu-item">
+ <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ"
target="_blank" >FAQ</a>
+ </li><li class="masthead__menu-item">
+ <a href="/releases.html" target="_self" >Releases</a>
+ </li></ul>
+ <button class="greedy-nav__toggle hidden" type="button">
+ <span class="visually-hidden">Toggle menu</span>
+ <div class="navicon"></div>
+ </button>
+ <ul class="hidden-links hidden"></ul>
+ </nav>
+ </div>
+ </div>
+</div>
+<!--
+<p class="notice--warning" style="margin: 0 !important; text-align: center
!important;"><strong>Note:</strong> This site is work in progress, if you
notice any issues, please <a target="_blank"
href="https://github.com/apache/hudi/issues">Report on Issue</a>.
+ Click <a href="/"> here</a> back to old site.</p>
+-->
+
+ <div class="initial-content">
+ <div id="main" role="main">
+
+
+ <div class="sidebar sticky">
+
+
+ <div itemscope itemtype="https://schema.org/Person">
+
+ <div class="author__content">
+
+ <h3 class="author__name" itemprop="name">Quick Links</h3>
+
+
+ <div class="author__bio" itemprop="description">
+ <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+
+ </div>
+
+ </div>
+
+ <div class="author__urls-wrapper">
+ <ul class="author__urls social-icons">
+
+
+ <li><a href="/docs/quick-start-guide" target="_self" rel="nofollow
noopener noreferrer"><i class="fa fa-book" aria-hidden="true"></i>
Documentation</a></li>
+
+
+
+ <li><a href="https://cwiki.apache.org/confluence/display/HUDI"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-wikipedia-w"
aria-hidden="true"></i> Technical Wiki</a></li>
+
+
+
+ <li><a href="/contributing" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-thumbs-o-up" aria-hidden="true"></i> Contribution
Guide</a></li>
+
+
+
+ <li><a
href="https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-slack"
aria-hidden="true"></i> Join on Slack</a></li>
+
+
+
+ <li><a href="https://github.com/apache/hudi" target="_blank"
rel="nofollow noopener noreferrer"><i class="fa fa-github"
aria-hidden="true"></i> Fork on GitHub</a></li>
+
+
+
+ <li><a href="https://issues.apache.org/jira/projects/HUDI/summary"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-navicon"
aria-hidden="true"></i> Report Issues</a></li>
+
+
+
+ <li><a href="/security" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Security
Issues</a></li>
+
+
+
+
+ </ul>
+ </div>
+</div>
+
+
+
+
+ </div>
+
+
+ <article class="page" itemscope itemtype="https://schema.org/CreativeWork">
+ <!-- Look the author details up from the site config. -->
+
+
+ <div class="page__inner-wrap">
+
+ <header>
+ <h1 id="page-title" class="page__title"
itemprop="headline">Incremental Processing on the Data Lake
+</h1>
+ <!-- Output author details if some exist. -->
+ <div class="page__author"><a
href="https://cwiki.apache.org/confluence/display/~vinoyang">Vino Yang</a>
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+ </header>
+
+
+ <section class="page__content" itemprop="text">
+
+ <style>
+ .page {
+ padding-right: 0 !important;
+ }
+ </style>
+
+ <h3
id="note-this-article-is-a-translation-of-the-infoqcn-article-found-here-with-minor-edits">NOTE:
This article is a translation of the infoq.cn article, found <a
href="https://www.infoq.cn/article/CAgIDpfJBVcJHKJLSbhe">here</a>, with minor
edits</h3>
+
+<p>Apache Hudi is a data lake framework which provides the ability to ingest,
manage and query large analytical data sets on a distributed file system/cloud
stores.
+Hudi joined the Apache incubator for incubation in January 2019, and was
promoted to a top-level Apache project in May 2020. This article mainly discusses
the importance
+of Hudi to the data lake from the perspective of “incremental processing”.
More information about Apache Hudi’s framework functions, features, usage
scenarios, and
+latest developments can be found at <a
href="https://qconplus.infoq.cn/2020/shanghai/presentation/2646">QCon Global
Software Development Conference (Shanghai Station) 2020</a>.</p>
+
+<p>Throughout the development of big data technology, Hadoop has steadily
seized the opportunities of this era and has become the de-facto standard for
enterprises to build big data infrastructure.
+Among them, the distributed file system HDFS that supports the Hadoop
ecosystem almost naturally has become the standard interface for big data
storage systems. In recent years, with the rise of
+cloud-native architectures, we have seen a wave of newer models embracing
low-cost cloud storage emerging, and a number of data lake frameworks compatible
with HDFS interfaces
+embracing cloud vendor storage have emerged in the industry as well.</p>
+
+<p>However, we are still processing data pretty much in the same way we did 10
years ago. This article will try to talk about its importance to the data lake
from the perspective of “incremental processing”.</p>
+
+<h2
id="traditional-data-lakes-lack-the-primitives-for-incremental-processing">Traditional
data lakes lack the primitives for incremental processing</h2>
+
+<p>In the era of mobile Internet and Internet of Things, delayed arrival of
data is very common.
+Here we need to distinguish between two time semantics: <a
href="https://www.oreilly.com/radar/the-world-beyond-batch-streaming-101/">event
time and processing time</a>.</p>
+
+<p>As the name suggests:</p>
+
+<ul>
+ <li><strong>Event time:</strong> the time when the event actually
occurred;</li>
+ <li><strong>Processing time:</strong> the time when an event is observed
(processed) in the system;</li>
+</ul>
+
+<p>Ideally, the event time and the processing time are the same, but in
reality, they may have more or less deviation, which we often call “Time Skew”.
+Whether for low-latency stream computing or common batch processing, the
processing of event time and processing time and late data is a common and
difficult problem.
+In general, in order to ensure correctness, when we strictly follow the “event
time” semantics, late data will trigger the
+<a
href="https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/stream/operators/windows.html#late-elements-considerations">recalculation
of the time window</a>
+(usually Hive partitions for batch processing), although the results of these
“windows” may have been calculated or even interacted with the end user.
+For recalculation, the extensible key-value storage structure is usually used
in streaming processing, which is processed incrementally at the record/event
level and optimized
+based on point queries and updates. However, in data lakes, recalculating
usually means rewriting the entire (immutable) Hive partition (or simply a
folder in DFS), and
+re-triggering the recalculation of cascading tasks that have consumed that
Hive partition.</p>
+
+<p>With data lakes supporting massive amounts of data, many long-tail
businesses still have a strong demand for updating cold data. However, for a
long time,
+the data in a single partition in the data lake was designed to be
non-updatable. If it needs to be updated, the entire partition needs to be
rewritten.
+This will seriously damage the efficiency of the entire ecosystem. From the
perspective of latency and resource utilization, these operations on Hadoop
will incur expensive overhead.
+Besides, this overhead is usually also cascaded to the entire Hadoop data
processing pipeline, which ultimately leads to an increase in latency by
several hours.</p>
+
+<p>In response to the two problems mentioned above, if the data lake supports
fine-grained incremental processing, we can incorporate changes into existing
Hive partitions
+more effectively, and provide a way for downstream table consumers to obtain
only the changed data. For effectively supporting incremental processing, we
can decompose it into the
+following two primitive operations:</p>
+
+<ul>
+ <li>
+ <p><strong>Update insert (upsert):</strong> Conceptually, rewriting the
entire partition can be regarded as a very inefficient upsert operation, which
will eventually write much more data than the
+original data itself. Therefore, support for (bulk) upsert is considered a
very important feature. <a
href="https://research.google/pubs/pub42851/">Google’s Mesa</a> (Google’s data
warehouse system) also
+talks about several techniques that can be applied to rapid data ingestion
scenarios.</p>
+ </li>
+ <li>
+ <p><strong>Incremental consumption:</strong> Although upsert can solve the
problem of quickly releasing new data to a partition, downstream data consumers
do not know
+ which data has been changed from which time in the past. Usually, consumers
can only know the changed data by scanning the entire partition/data table and
+ recalculating all the data, which requires considerable time and resources.
Therefore, we also need a mechanism to more efficiently obtain data records
that
+ have changed since the last time the partition was consumed.</p>
+ </li>
+</ul>
+
+<p>With the above two primitive operations, you can upsert a data set, and
then incrementally consume from it, and create another (also incremental) data
set to solve the two problems
+we mentioned above and support many common cases, so as to support end-to-end
incremental processing and reduce end-to-end latency. These two primitives
combine with each other,
+unlocking the ability of stream/incremental processing based on DFS
abstraction.</p>
+
+<p>The storage scale of the data lake far exceeds that of the data warehouse.
Although the two have different focuses on the definition of functions,
+there is still a considerable intersection (of course, there are still
disputes and deviations from definition and implementation.
+This is not the topic this article tries to discuss). In any case, the data
lake will support larger analytical data sets with cheaper storage,
+so incremental processing is also very important for it. Next let’s discuss
the significance of incremental processing for the data lake.</p>
+
+<h2 id="the-significance-of-incremental-processing-for-the-data-lake">The
significance of incremental processing for the data lake</h2>
+
+<h3 id="streaming-semantics">Streaming Semantics</h3>
+
+<p>It has long been stated that there is a “<a
href="https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying">dualism</a>”
+between the change log (that is, the “flow” in the conventional sense we
understand) and the table.</p>
+
+<p><img src="/assets/images/blog/incr-processing/image4.jpg" alt="dualism"
/></p>
+
+<p>The core of this discussion is: if there is a change log, you can use these
changes to generate a data table and get the current status. If you update a
table,
+you can record these changes and publish all “change logs” to the table’s
status information. This interchangeable nature is called “stream table
duality” for short.</p>
+
+<p>A more general understanding of “stream table duality”: when the business
system is modifying the data in the MySQL table, MySQL will reflect these
changes as Binlog,
+if we publish these continuous Binlog (stream) to Kafka, and then let the
downstream processing system subscribe to the Kafka, and use the state store to
gradually
+accumulate the intermediate results. Then the current state of this
intermediate result can reflect the current snapshot of the table.</p>
+
+<p>If the two primitives mentioned above that support incremental processing
can be introduced to the data lake, the above pipeline, which can reflect the
+“stream table duality”, is also applicable on the data lake. Based on the
first primitive, the data lake can also ingest the Binlog log streams in Kafka,
+and then store these Binlog log streams into “tables” on the data lake. Based
on the second primitive, these tables recognize the changed records as “Binlog”
+streams to support the incremental consumption of subsequent cascading
tasks.</p>
+
+<p>Of course, as the data in the data lake needs to be landed on the final
file/object storage, considering the trade-off between throughput and write
performance,
+Binlog on the data lake reacts to a small batch of change logs over a period
of time on the stream. For example, the Apache Hudi community is further trying
to
+provide an incremental view similar to Binlog for different Commits (a Commit
refers to a batch of data write commit),
+as shown in the following figure:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image1.png" alt="idu" /></p>
+
+<p>Remarks in the “Flag” column:</p>
+
+<p>I: Insert;
+D: Delete;
+U: After image of Update;
+X: Before image of Update;</p>
+
+<p>Based on the above discussion, we can think that incremental processing and
stream are naturally compatible, and we can naturally connect them on the data
lake.</p>
+
+<h3 id="warehousing-needs-incremental-processing">Warehousing needs
Incremental Processing</h3>
+
+<p>In the data warehouse, whether it is dimensional modeling or relational
modeling theory, it is usually constructed based on the <a
href="https://en.wikipedia.org/wiki/Data_warehouse#Design_methods">layered
design ideas</a>.
+In terms of technical implementation, multiple stages (steps) of a long
pipeline are formed by connecting multiple levels of ETL tasks through a
workflow scheduling engine,
+as shown in the following figure:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image2.png" alt="image2"
/></p>
+
+<p>As the main application of the data warehouse, in the OLAP field, for the
conventional business scenarios (with no or few changes), there are already some
frameworks in the industry
+that focus on the scenarios where they are good at providing efficient
analysis capabilities. However, in the Hadoop data warehouse/data lake
ecosystem,
+there is still no good solution for the analysis scenario of frequent changes
of business data.</p>
+
+<p>For example, let’s consider the scenario of updating the order status of a
travel business. This scenario has a typical long-tail effect:
+you cannot know whether an order will be billed tomorrow, one month later, or
one year later. In this scenario, the order table is the main data table,
+but usually we will derive other derived tables based on this table to support
the modeling of various business scenarios.
+The initial update takes place in the order table at the ODS level, but the
derived tables need to be updated in cascade.</p>
+
+<p>For this scenario, in the past, once there is a change, people usually need
to find the partition where the data to be updated is located in the Hive order
+table of the ODS layer, and update the entire partition. Besides, the
partition of the relevant data of the derived table needs to be updated in
cascade.</p>
+
+<p>Yes, someone will definitely think that Kudu’s support for Upsert can
solve the problem of the old version of Hive missing the first incremental
primitive.
+But the Kudu storage engine has its own limitations:</p>
+
+<ol>
+ <li>Performance: additional requirements for the hardware itself;</li>
+ <li>Ecosystem: In terms of adapting to mainstream big data computing
frameworks and machine learning frameworks, it is far less advantageous than
Hive;</li>
+ <li>Cost: requires special maintenance costs and expenses;</li>
+ <li>Did not solve the second primitive of incremental processing mentioned
above: the problem of incremental consumption.</li>
+</ol>
+
+<p>In summary, incremental processing has the following advantages on the data
lake:</p>
+
+<p><strong>Performance improvement:</strong> Ingesting data usually needs to
handle updates, deletes, and enforce unique key constraints. Since incremental
primitives support record-level updates,
+it can bring orders of magnitude performance improvements to these
operations.</p>
+
+<p><strong>Faster ETL/derived Pipelines:</strong> A ubiquitous next step,
once the data has been ingested from external sources is to build derived data
pipelines using
+Apache Spark/Apache Hive or any other data processing framework to ETL the
ingested data for a variety of use-cases like data warehouse,
+machine learning, or even just analytics. Typically, such processes again rely
on batch processing jobs expressed in code or SQL. Such data pipelines can be
sped up dramatically,
+by querying one or more input tables using an incremental query instead of a
regular snapshot query, resulting in only processing the incremental changes
from upstream tables and
+then upsert or delete the target derived table. Similar to raw data ingestion,
in order to reduce the data delay of the modelled table, the ETL job only needs
to gradually extract the
+changed data from the original table and update the previously derived output
table instead of rebuilding the entire output table every few hours.</p>
+
+<p><strong>Unified storage:</strong> Based on the above two advantages, faster
and lighter processing on the existing data lake means that only for the
purpose of accessing near real-time data,
+no special storage or data mart is needed.</p>
+
+<p>Next, we use two simple examples to illustrate how <a
href="https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/">incremental
processing</a> can speed up the processing
+of pipelines in analytical scenarios. First of all, data projection is the
most common and easy to understand case:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image7.png" alt="image7"
/></p>
+
+<p>This simple example shows that: by upserting new changes into table_1 and
establishing a simple projected table (projected_table) through incremental
consumption, we can
+perform the projection more simply and efficiently, with lower latency.</p>
+
+<p>Next, for a more complex scenario, we can use incremental processing to
support the stream and batch connections supported by the stream computing
framework,
+and stream-stream connections (we just need to add some additional logic to align
windows):</p>
+
+<p><img src="/assets/images/blog/incr-processing/image6.png" alt="image6"
/></p>
+
+<p>The example in the figure above connects a fact table to multiple dimension
tables to create a connected table. This case is one of the rare scenarios
where we can save hardware
+costs while significantly reducing latency.</p>
+
+<h3
id="quasi-real-time-scenarios-resourceefficiency-trade-offs">Quasi-real-time
scenarios, resource/efficiency trade-offs</h3>
+
+<p>Incremental processing of new data in mini batches can use resources more
efficiently. Let’s refer to a specific example. We have a Kafka event stream
that is pouring in
+at a rate of 10,000 per second. We want to count the number of messages in
some dimensions over the past 15 minutes. Many stream processing pipelines use
an external/internal
+result state store (such as RocksDB, Cassandra, ElasticSearch) to save the
aggregated count results, and run the containers in resource managers such as
YARN/Mesos continuously,
+which is very reasonable for scenarios with a delay window of under five minutes. In
fact, the YARN container itself has some startup overhead. In addition, in
order to improve the
+performance of writing to result storage system, we usually cache the results
before performing batch updates. This kind of protocol requires the container
to run continuously.</p>
+
+<p>However, in quasi-real-time processing scenarios, these options may not be
optimal. To achieve the same effect, you can use short-life containers and
optimize overall
+resource utilization. For example, a streaming processor may need to perform
six million updates to the result storage system in 15 minutes. However, in the
incremental
+batch mode, we only need to perform an in-memory merge on the accumulated data
and update the result storage system only once, then only use the resource
container for
+five minutes. Compared with the pure stream processing mode, the incremental
batch processing mode has several times the CPU efficiency improvement, and
there are several
+orders of magnitude efficiency improvement in updating to the result storage.
Basically, this processing method obtains resources on demand, instead of
swallowing CPU and
+memory while waiting for data to be calculated in real time.</p>
+
+<h3
id="incremental-processing-facilitates-unified-data-lake-architecture">Incremental
processing facilitates unified data lake architecture</h3>
+
+<p>Whether in the data warehouse or in the data lake, data processing is an
unavoidable problem. Data processing involves the selection of computing
engines and
+the design of architectures. There are currently two mainstream architectures
in the industry: Lambda and Kappa architectures. Each architecture has its own
+characteristics and existing problems. Derivative versions of these
architectures are also <a
href="https://www.infoq.cn/article/Uo4pFswlMzBVhq*Y2tB9">emerging
endlessly</a>.</p>
+
+<p>In reality, many enterprises still maintain the implementation of the <a
href="https://en.wikipedia.org/wiki/Lambda_architecture">Lambda
architecture</a>.
+The typical Lambda architecture has two modules for the data processing part:
the speed layer and the batch layer.</p>
+
+<p><img src="/assets/images/blog/incr-processing/image5.png" alt="image5"
/></p>
+
+<p>They are usually two independent implementations (from code to
infrastructure). For example, Flink (formerly Storm) is a popular option on the
speed layer,
+while MapReduce/Spark can serve as a batch layer. In fact, people often rely
on the speed layer to provide updated results (which may not be accurate), and
+once the data is considered complete, the results of the speed layer are
corrected at a later time through the batch layer. With incremental processing,
+we have the opportunity to implement the Lambda architecture for batch
processing and quasi-real-time processing at the code level and infrastructure
level in
+a unified manner. It typically looks like below:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image3.png" alt="image3"
/></p>
+
+<p>As we said, you can use SQL or a batch processing framework like Spark to
consistently implement your processing logic. The result table is built
incrementally,
+and SQL is executed on “new data” like streaming to produce a quick view of
the results. The same SQL can be executed periodically on the full amount of
data to
+correct any inaccurate results (remember, join operations are always tricky!)
and produce a more “complete” view of the results. In both cases, we will use
the
+same infrastructure to perform calculations, which can reduce overall
operating costs and complexity.</p>
+
+<p>Setting aside the Lambda architecture, even in the Kappa architecture, the
first primitive of incremental processing (upsert) also plays an important
role.
+Uber <a
href="https://www.slideshare.net/FlinkForward/flink-forward-san-francisco-2019-moving-from-lambda-and-kappa-architectures-to-kappa-at-uber-roshan-naik">proposed</a>
the Kappa + architecture
+based on this. The Kappa architecture advocates a single stream computing
layer sufficient to become a general solution
+for data processing. Although the batch layer is removed in this model, there
are still two problems in the service layer:</p>
+
+<p>Nowadays many stream processing engines support row-level data processing,
which requires that our service layer should also support row-level updates;
+The trade-offs between data ingestion delay, scanning performance and
computing resources and operational complexity are unavoidable.</p>
+
+<p><img src="/assets/images/blog/incr-processing/image8.png" alt="image8"
/></p>
+
+<p>However, if our business scenarios have relaxed latency requirements (for
example, we can accept a delay of about 10 minutes), and if we can quickly
ingest and prepare data on DFS,
+effectively connect and propagate updates to the upper-level modeling data
set, Speed Serving in the service layer is unnecessary. Then the service layer
can be unified,
+greatly reducing the overall complexity and resource consumption of the
system.</p>
+
+<p>Above, we introduced the significance of incremental processing for the
data lake. Next, we introduce the implementation and support of incremental
processing.
+Among the three open source data lake frameworks (Apache Hudi/Iceberg, Delta
Lake), only Apache Hudi provides good support for incremental processing.
+This is because Hudi is rooted in a framework that Uber developed when it
encountered the pain points of data analysis on the Hadoop data lake.
+So, next, let’s introduce how Hudi supports incremental processing.</p>
+
+<h2 id="hudis-support-for-incremental-processing">Hudi’s support for
incremental processing</h2>
+
+<p>Apache Hudi (Hadoop Upserts Deletes and Incrementals) is a top-level
project of the Apache Software Foundation. It allows you to process very large-scale
data on
+top of Hadoop-compatible storage, and it also provides two primitives that
enable stream processing on the data lake in addition to classic batch
processing.</p>
+
+<p>From the letter “I” in its name, which denotes “Incremental Processing”, we can
see that it supports incremental processing as a first-class citizen.
+The two primitives we mentioned at the beginning of this article that support
incremental processing are reflected in the following two aspects in Apache
Hudi:</p>
+
+<p>Update/Delete operation: Hudi provides support for updating/deleting
records, using fine-grained file/record level indexes while providing
transactional guarantees
+for the write operation. Queries process the last such committed snapshot, to
produce results.</p>
+
+<p>Change stream: Hudi also provides first-class support for obtaining an
incremental stream of all the records that were updated/inserted/deleted in a
given table, from a given point-in-time.</p>
+
+<p>The specific implementation of the change stream is the “incremental view”. Hudi
is the only one of the three open source data lake frameworks that supports
+the incremental query feature, with support for record level change streams.
The following sample code snippet shows us how to query the incremental
view:</p>
+
+<div class="language-java highlighter-rouge"><div class="highlight"><pre
class="highlight"><code><span class="c1">// spark-shell</span>
+<span class="c1">// reload data</span>
+<span class="n">spark</span><span class="o">.</span>
+ <span class="n">read</span><span class="o">.</span>
+ <span class="nf">format</span><span class="o">(</span><span
class="s">"hudi"</span><span class="o">).</span>
+ <span class="n">load</span><span class="o">(</span><span
class="n">basePath</span> <span class="o">+</span> <span
class="s">"/*/*/*/*"</span><span class="o">).</span>
+ <span class="n">createOrReplaceTempView</span><span class="o">(</span><span
class="s">"hudi_trips_snapshot"</span><span class="o">)</span>
+
+<span class="n">val</span> <span class="n">commits</span> <span
class="o">=</span> <span class="n">spark</span><span class="o">.</span><span
class="na">sql</span><span class="o">(</span><span class="s">"select
distinct(_hoodie_commit_time) as commitTime from hudi_trips_snapshot order by
commitTime"</span><span class="o">).</span><span class="na">map</span><span
class="o">(</span><span class="n">k</span> <span class="o">=></span> <span
class="n">k</span><span class="o">.</span><span c [...]
+<span class="n">val</span> <span class="n">beginTime</span> <span
class="o">=</span> <span class="n">commits</span><span class="o">(</span><span
class="n">commits</span><span class="o">.</span><span class="na">length</span>
<span class="o">-</span> <span class="mi">2</span><span class="o">)</span>
<span class="c1">// commit time we are interested in</span>
+
+<span class="c1">// incrementally query data</span>
+<span class="n">val</span> <span class="n">tripsIncrementalDF</span> <span
class="o">=</span> <span class="n">spark</span><span class="o">.</span><span
class="na">read</span><span class="o">.</span><span
class="na">format</span><span class="o">(</span><span
class="s">"hudi"</span><span class="o">).</span>
+ <span class="n">option</span><span class="o">(</span><span
class="no">QUERY_TYPE_OPT_KEY</span><span class="o">,</span> <span
class="no">QUERY_TYPE_INCREMENTAL_OPT_VAL</span><span class="o">).</span>
+ <span class="n">option</span><span class="o">(</span><span
class="no">BEGIN_INSTANTTIME_OPT_KEY</span><span class="o">,</span> <span
class="n">beginTime</span><span class="o">).</span>
+ <span class="n">load</span><span class="o">(</span><span
class="n">basePath</span><span class="o">)</span>
+<span class="n">tripsIncrementalDF</span><span class="o">.</span><span
class="na">createOrReplaceTempView</span><span class="o">(</span><span
class="s">"hudi_trips_incremental"</span><span class="o">)</span>
+
+<span class="n">spark</span><span class="o">.</span><span
class="na">sql</span><span class="o">(</span><span class="s">"select
`_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from
hudi_trips_incremental where fare > 20.0"</span><span
class="o">).</span><span class="na">show</span><span class="o">()</span>
+
+</code></pre></div></div>
+
+<p>The code snippet above creates a Hudi trip increment table
(hudi_trips_incremental), and then queries all the change records in the
increment table after the “beginTime” commit time
+whose “fare” is greater than 20.0. Based on this query, you can create
incremental data pipelines on batch data.</p>
+
+<h2 id="summary">Summary</h2>
+
+<p>In this article, we first elaborated on many problems caused by the lack of
incremental processing primitives in the traditional Hadoop data warehouse due
to the trade-off between data integrity
+and latency, and some long-tail applications that rely heavily on updates.
Next, we argued that to support incremental processing, we must have at least
two primitives: upsert and
+incremental consumption, and explained why these two primitives can solve the
problems explained above.</p>
+
+<p>Then, we introduced why incremental processing is also important to the
data lake. There are many common parts in data processing between the data lake
and the data warehouse.
+In the data warehouse, some “pain points” caused by the lack of incremental
processing also exist in the data lake. We elaborated its significance to the
data lake from four
+aspects: the natural fit between incremental processing and streaming semantics, the needs of
analytical scenarios, resource/efficiency trade-offs in quasi-real-time scenarios, and
unified data lake architecture.</p>
+
+<p>Finally, we introduced the open source data lake storage framework Apache
Hudi’s support for incremental processing and simple cases.</p>
+
+ </section>
+
+ <a href="#masthead__inner-wrap" class="back-to-top">Back to top
↑</a>
+
+
+
+
+ </div>
+
+ </article>
+
+</div>
+
+ </div>
+
+ <div class="page__footer">
+ <footer>
+
+<div class="row">
+ <div class="col-lg-12 footer">
+ <p>
+ <table class="table-apache-info">
+ <tr>
+ <td>
+ <a class="footer-link-img" href="https://apache.org">
+ <img width="250px" src="/assets/images/asf_logo.svg" alt="The
Apache Software Foundation">
+ </a>
+ </td>
+ <td>
+ <a style="float: right"
href="https://www.apache.org/events/current-event.html">
+ <img
src="https://www.apache.org/events/current-event-234x60.png" />
+ </a>
+ </td>
+ </tr>
+ </table>
+ </p>
+ <p>
+ <a href="https://www.apache.org/licenses/">License</a> | <a
href="https://www.apache.org/security/">Security</a> | <a
href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
+ </p>
+ <p>
+ Copyright © <span id="copyright-year">2019</span> <a
href="https://apache.org">The Apache Software Foundation</a>, Licensed under
the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License,
Version 2.0</a>.
+ Hudi, Apache and the Apache feather logo are trademarks of The Apache
Software Foundation. <a href="/docs/privacy">Privacy Policy</a>
+ </p>
+ </div>
+</div>
+ </footer>
+ </div>
+
+
+ </body>
+</html>
\ No newline at end of file
diff --git a/content/cn/activity.html b/content/cn/activity.html
index a834d14..69cfba7 100644
--- a/content/cn/activity.html
+++ b/content/cn/activity.html
@@ -191,6 +191,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/hudi-incremental-processing-on-data-lakes/"
rel="permalink">Incremental Processing on the Data Lake
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+ <div class="archive__item-meta"><a
href="https://cwiki.apache.org/confluence/display/~vinoyang">Vino Yang</a>
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+
+ <p class="archive__item-excerpt" itemprop="description">How Apache Hudi
provides ability for incremental data processing.
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/monitoring-hudi-metrics-with-datadog/"
rel="permalink">Monitor Hudi metrics with Datadog
</a>
diff --git a/content/sitemap.xml b/content/sitemap.xml
index e344588..b3e6d0e 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -929,6 +929,10 @@
<lastmod>2020-05-28T00:00:00-04:00</lastmod>
</url>
<url>
+<loc>https://hudi.apache.org/blog/hudi-incremental-processing-on-data-lakes/</loc>
+<lastmod>2020-08-18T00:00:00-04:00</lastmod>
+</url>
+<url>
<loc>https://hudi.apache.org/cn/activity</loc>
<lastmod>2019-12-30T14:59:57-05:00</lastmod>
</url>