[hudi] branch asf-site updated: Travis CI build asf-site

vinoth Mon, 31 Aug 2020 09:59:29 -0700

This is an automated email from the ASF dual-hosted git repository.

vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git



The following commit(s) were added to refs/heads/asf-site by this push:
     new b9f5826  Travis CI build asf-site
b9f5826 is described below

commit b9f5826062eb3612eca04e176d1fce7d99da9044
Author: CI <[email protected]>
AuthorDate: Mon Aug 31 16:59:10 2020 +0000

    Travis CI build asf-site
---
 content/activity.html                              |  24 +
 .../assets/images/blog/incr-processing/image1.png  | Bin 0 -> 59805 bytes
 .../assets/images/blog/incr-processing/image2.png  | Bin 0 -> 385336 bytes
 .../assets/images/blog/incr-processing/image3.png  | Bin 0 -> 167680 bytes
 .../assets/images/blog/incr-processing/image4.jpg  | Bin 0 -> 19807 bytes
 .../assets/images/blog/incr-processing/image5.png  | Bin 0 -> 225670 bytes
 .../assets/images/blog/incr-processing/image6.png  | Bin 0 -> 67083 bytes
 .../assets/images/blog/incr-processing/image7.png  | Bin 0 -> 44297 bytes
 .../assets/images/blog/incr-processing/image8.png  | Bin 0 -> 209792 bytes
 content/assets/js/lunr/lunr-store.js               |   5 +
 content/blog.html                                  |  24 +
 .../index.html                                     | 524 +++++++++++++++++++++
 content/cn/activity.html                           |  24 +
 content/sitemap.xml                                |   4 +
 14 files changed, 605 insertions(+)

diff --git a/content/activity.html b/content/activity.html
index 44b6969..46e0160 100644
--- a/content/activity.html
+++ b/content/activity.html
@@ -191,6 +191,30 @@
     
     <h2 class="archive__item-title" itemprop="headline">
       
+        <a href="/blog/hudi-incremental-processing-on-data-lakes/" 
rel="permalink">Incremental Processing on the Data Lake
+</a>
+      
+    </h2>
+    <!-- Look the author details up from the site config. -->
+    
+    <!-- Output author details if some exist. -->
+    <div class="archive__item-meta"><a 
href="https://cwiki.apache.org/confluence/display/~vinoyang";>Vino Yang</a> 
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+ 
+    <p class="archive__item-excerpt" itemprop="description">How Apache Hudi 
provides ability for incremental data processing.
+</p>
+  </article>
+</div>
+
+        
+        
+
+
+
+<div class="list__item">
+  <article class="archive__item" itemscope 
itemtype="https://schema.org/CreativeWork";>
+    
+    <h2 class="archive__item-title" itemprop="headline">
+      
         <a href="/blog/monitoring-hudi-metrics-with-datadog/" 
rel="permalink">Monitor Hudi metrics with Datadog
 </a>
       
diff --git a/content/assets/images/blog/incr-processing/image1.png 
b/content/assets/images/blog/incr-processing/image1.png
new file mode 100644
index 0000000..b744803
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image1.png differ
diff --git a/content/assets/images/blog/incr-processing/image2.png 
b/content/assets/images/blog/incr-processing/image2.png
new file mode 100644
index 0000000..becc5aa
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image2.png differ
diff --git a/content/assets/images/blog/incr-processing/image3.png 
b/content/assets/images/blog/incr-processing/image3.png
new file mode 100644
index 0000000..d570455
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image3.png differ
diff --git a/content/assets/images/blog/incr-processing/image4.jpg 
b/content/assets/images/blog/incr-processing/image4.jpg
new file mode 100644
index 0000000..dbacbf2
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image4.jpg differ
diff --git a/content/assets/images/blog/incr-processing/image5.png 
b/content/assets/images/blog/incr-processing/image5.png
new file mode 100644
index 0000000..50b01bf
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image5.png differ
diff --git a/content/assets/images/blog/incr-processing/image6.png 
b/content/assets/images/blog/incr-processing/image6.png
new file mode 100644
index 0000000..9f07ad9
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image6.png differ
diff --git a/content/assets/images/blog/incr-processing/image7.png 
b/content/assets/images/blog/incr-processing/image7.png
new file mode 100644
index 0000000..909d6f6
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image7.png differ
diff --git a/content/assets/images/blog/incr-processing/image8.png 
b/content/assets/images/blog/incr-processing/image8.png
new file mode 100644
index 0000000..2260886
Binary files /dev/null and 
b/content/assets/images/blog/incr-processing/image8.png differ
diff --git a/content/assets/js/lunr/lunr-store.js 
b/content/assets/js/lunr/lunr-store.js
index 10258db..0a94f11 100644
--- a/content/assets/js/lunr/lunr-store.js
+++ b/content/assets/js/lunr/lunr-store.js
@@ -1158,4 +1158,9 @@ var store = [{
         "excerpt":"Availability 0.6.0 (unreleased) Introduction Datadog is a 
popular monitoring service. In the upcoming 0.6.0 release of Apache Hudi, we 
will introduce the feature of reporting Hudi metrics via Datadog HTTP API, in 
addition to the current reporter types: Graphite and JMX. Configurations 
Similar to other supported reporters, turning on Datadog...","categories": 
["blog"],
         "tags": [],
         "url": 
"https://hudi.apache.org/blog/monitoring-hudi-metrics-with-datadog/";,
+        "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
+        "title": "Incremental Processing on the Data Lake",
+        "excerpt":"NOTE: This article is a translation of the infoq.cn 
article, found here, with minor edits Apache Hudi is a data lake framework 
which provides the ability to ingest, manage and query large analytical data 
sets on a distributed file system/cloud stores. Hudi joined the Apache 
incubator for incubation in January...","categories": ["blog"],
+        "tags": [],
+        "url": 
"https://hudi.apache.org/blog/hudi-incremental-processing-on-data-lakes/";,
         "teaser":"https://hudi.apache.org/assets/images/500x300.png"},]
diff --git a/content/blog.html b/content/blog.html
index fbe3148..db59c19 100644
--- a/content/blog.html
+++ b/content/blog.html
@@ -189,6 +189,30 @@
     
     <h2 class="archive__item-title" itemprop="headline">
       
+        <a href="/blog/hudi-incremental-processing-on-data-lakes/" 
rel="permalink">Incremental Processing on the Data Lake
+</a>
+      
+    </h2>
+    <!-- Look the author details up from the site config. -->
+    
+    <!-- Output author details if some exist. -->
+    <div class="archive__item-meta"><a 
href="https://cwiki.apache.org/confluence/display/~vinoyang";>Vino Yang</a> 
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+ 
+    <p class="archive__item-excerpt" itemprop="description">How Apache Hudi 
provides ability for incremental data processing.
+</p>
+  </article>
+</div>
+
+        
+        
+
+
+
+<div class="list__item">
+  <article class="archive__item" itemscope 
itemtype="https://schema.org/CreativeWork";>
+    
+    <h2 class="archive__item-title" itemprop="headline">
+      
         <a href="/blog/monitoring-hudi-metrics-with-datadog/" 
rel="permalink">Monitor Hudi metrics with Datadog
 </a>
       
diff --git a/content/blog/hudi-incremental-processing-on-data-lakes/index.html 
b/content/blog/hudi-incremental-processing-on-data-lakes/index.html
new file mode 100644
index 0000000..1704322
--- /dev/null
+++ b/content/blog/hudi-incremental-processing-on-data-lakes/index.html
@@ -0,0 +1,524 @@
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    <meta charset="utf-8">
+
+<!-- begin _includes/seo.html --><title>Incremental Processing on the Data 
Lake - Apache Hudi</title>
+<meta name="description" content="How Apache Hudi provides ability for 
incremental data processing.">
+
+<meta property="og:type" content="article">
+<meta property="og:locale" content="en_US">
+<meta property="og:site_name" content="">
+<meta property="og:title" content="Incremental Processing on the Data Lake">
+<meta property="og:url" 
content="https://hudi.apache.org/blog/hudi-incremental-processing-on-data-lakes/";>
+
+
+  <meta property="og:description" content="How Apache Hudi provides ability 
for incremental data processing.">
+
+
+
+
+
+
+
+
+
+
+
+<!-- end _includes/seo.html -->
+
+
+<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title=" 
Feed">-->
+
+<!-- https://t.co/dKP3o1e -->
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<script>
+  document.documentElement.className = 
document.documentElement.className.replace(/\bno-js\b/g, '') + ' js ';
+</script>
+
+<!-- For all browsers -->
+<link rel="stylesheet" href="/assets/css/main.css">
+
+<!--[if IE]>
+  <style>
+    /* old IE unsupported flexbox fixes */
+    .greedy-nav .site-title {
+      padding-right: 3em;
+    }
+    .greedy-nav button {
+      position: absolute;
+      top: 0;
+      right: 0;
+      height: 100%;
+    }
+  </style>
+<![endif]-->
+
+
+
+<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico">
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+<script src="/assets/js/jquery.min.js"></script>
+
+    
+<script src="/assets/js/main.min.js"></script>
+
+  </head>
+
+  <body class="layout--single">
+    <!--[if lt IE 9]>
+<div class="notice--danger align-center" style="margin: 0;">You are using an 
<strong>outdated</strong> browser. Please <a 
href="https://browsehappy.com/";>upgrade your browser</a> to improve your 
experience.</div>
+<![endif]-->
+
+    <div class="masthead">
+  <div class="masthead__inner-wrap" id="masthead__inner-wrap">
+    <div class="masthead__menu">
+      <nav id="site-nav" class="greedy-nav">
+        
+          <a class="site-logo" href="/">
+              <div style="width: 150px; height: 40px">
+              </div>
+          </a>
+        
+        <a class="site-title" href="/">
+          
+        </a>
+        <ul class="visible-links"><li class="masthead__menu-item">
+              <a href="/docs/quick-start-guide.html" target="_self" 
>Documentation</a>
+            </li><li class="masthead__menu-item">
+              <a href="/community.html" target="_self" >Community</a>
+            </li><li class="masthead__menu-item">
+              <a href="/blog.html" target="_self" >Blog</a>
+            </li><li class="masthead__menu-item">
+              <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ"; 
target="_blank" >FAQ</a>
+            </li><li class="masthead__menu-item">
+              <a href="/releases.html" target="_self" >Releases</a>
+            </li></ul>
+        <button class="greedy-nav__toggle hidden" type="button">
+          <span class="visually-hidden">Toggle menu</span>
+          <div class="navicon"></div>
+        </button>
+        <ul class="hidden-links hidden"></ul>
+      </nav>
+    </div>
+  </div>
+</div>
+<!--
+<p class="notice--warning" style="margin: 0 !important; text-align: center 
!important;"><strong>Note:</strong> This site is work in progress, if you 
notice any issues, please <a target="_blank" 
href="https://github.com/apache/hudi/issues";>Report on Issue</a>.
+  Click <a href="/"> here</a> back to old site.</p>
+-->
+
+    <div class="initial-content">
+      <div id="main" role="main">
+  
+
+  <div class="sidebar sticky">
+
+  
+    <div itemscope itemtype="https://schema.org/Person";>
+
+  <div class="author__content">
+    
+      <h3 class="author__name" itemprop="name">Quick Links</h3>
+    
+    
+      <div class="author__bio" itemprop="description">
+        <p>Hudi <em>ingests</em> &amp; <em>manages</em> storage of large 
analytical datasets over DFS.</p>
+
+      </div>
+    
+  </div>
+
+  <div class="author__urls-wrapper">
+    <ul class="author__urls social-icons">
+      
+        
+          <li><a href="/docs/quick-start-guide" target="_self" rel="nofollow 
noopener noreferrer"><i class="fa fa-book" aria-hidden="true"></i> 
Documentation</a></li>
+
+          
+        
+          <li><a href="https://cwiki.apache.org/confluence/display/HUDI"; 
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-wikipedia-w" 
aria-hidden="true"></i> Technical Wiki</a></li>
+
+          
+        
+          <li><a href="/contributing" target="_self" rel="nofollow noopener 
noreferrer"><i class="fa fa-thumbs-o-up" aria-hidden="true"></i> Contribution 
Guide</a></li>
+
+          
+        
+          <li><a 
href="https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE";
 target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-slack" 
aria-hidden="true"></i> Join on Slack</a></li>
+
+          
+        
+          <li><a href="https://github.com/apache/hudi"; target="_blank" 
rel="nofollow noopener noreferrer"><i class="fa fa-github" 
aria-hidden="true"></i> Fork on GitHub</a></li>
+
+          
+        
+          <li><a href="https://issues.apache.org/jira/projects/HUDI/summary"; 
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-navicon" 
aria-hidden="true"></i> Report Issues</a></li>
+
+          
+        
+          <li><a href="/security" target="_self" rel="nofollow noopener 
noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Security 
Issues</a></li>
+
+          
+        
+      
+    </ul>
+  </div>
+</div>
+
+  
+
+  
+  </div>
+
+
+  <article class="page" itemscope itemtype="https://schema.org/CreativeWork";>
+    <!-- Look the author details up from the site config. -->
+    
+
+    <div class="page__inner-wrap">
+      
+        <header>
+          <h1 id="page-title" class="page__title" 
itemprop="headline">Incremental Processing on the Data Lake
+</h1>
+          <!-- Output author details if some exist. -->
+          <div class="page__author"><a 
href="https://cwiki.apache.org/confluence/display/~vinoyang";>Vino Yang</a> 
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+        </header>
+      
+
+      <section class="page__content" itemprop="text">
+        
+          <style>
+            .page {
+              padding-right: 0 !important;
+            }
+          </style>
+        
+        <h3 
id="note-this-article-is-a-translation-of-the-infoqcn-article-found-here-with-minor-edits">NOTE:
 This article is a translation of the infoq.cn article, found <a 
href="https://www.infoq.cn/article/CAgIDpfJBVcJHKJLSbhe";>here</a>, with minor 
edits</h3>
+
+<p>Apache Hudi is a data lake framework which provides the ability to ingest, 
manage and query large analytical data sets on a distributed file system/cloud 
stores. 
+Hudi joined the Apache incubator for incubation in January 2019, and was 
promoted to the top Apache project in May 2020. This article mainly discusses 
the importance 
+of Hudi to the data lake from the perspective of “incremental processing”. 
More information about Apache Hudi’s framework functions, features, usage 
scenarios, and 
+latest developments can be found at <a 
href="https://qconplus.infoq.cn/2020/shanghai/presentation/2646";>QCon Global 
Software Development Conference (Shanghai Station) 2020</a>.</p>
+
+<p>Throughout the development of big data technology, Hadoop has steadily 
seized the opportunities of this era and has become the de-facto standard for 
enterprises to build big data infrastructure. 
+Among them, the distributed file system HDFS that supports the Hadoop 
ecosystem almost naturally has become the standard interface for big data 
storage systems. In recent years, with the rise of 
+cloud-native architectures, we have seen a wave of newer models embracing 
low-cost cloud storage emerging, a number of data lake frameworks compatible 
with HDFS interfaces 
+embracing cloud vendor storage have emerged in the industry as well.</p>
+
+<p>However, we are still processing data pretty much in the same way we did 10 
years ago. This article will try to talk about its importance to the data lake 
from the perspective of “incremental processing”.</p>
+
+<h2 
id="traditional-data-lakes-lack-the-primitives-for-incremental-processing">Traditional
 data lakes lack the primitives for incremental processing</h2>
+
+<p>In the era of mobile Internet and Internet of Things, delayed arrival of 
data is very common. 
+Here we are involved in the definition of two time semantics: <a 
href="https://www.oreilly.com/radar/the-world-beyond-batch-streaming-101/";>event
 time and processing time</a>.</p>
+
+<p>As the name suggests:</p>
+
+<ul>
+  <li><strong>Event time:</strong> the time when the event actually 
occurred;</li>
+  <li><strong>Processing time:</strong> the time when an event is observed 
(processed) in the system;</li>
+</ul>
+
+<p>Ideally, the event time and the processing time are the same, but in 
reality, they may have more or less deviation, which we often call “Time Skew”. 
+Whether for low-latency stream computing or common batch processing, the 
processing of event time and processing time and late data is a common and 
difficult problem. 
+In general, in order to ensure correctness, when we strictly follow the “event 
time” semantics, late data will trigger the 
+<a 
href="https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/stream/operators/windows.html#late-elements-considerations";>recalculation
 of the time window</a> 
+(usually Hive partitions for batch processing), although the results of these 
“windows” may have been calculated or even interacted with the end user. 
+For recalculation, the extensible key-value storage structure is usually used 
in streaming processing, which is processed incrementally at the record/event 
level and optimized 
+based on point queries and updates. However, in data lakes, recalculating 
usually means rewriting the entire (immutable) Hive partition (or simply a 
folder in DFS), and 
+re-triggering the recalculation of cascading tasks that have consumed that 
Hive partition.</p>
+
+<p>With data lakes supporting massive amounts of data, many long-tail 
businesses still have a strong demand for updating cold data. However, for a 
long time, 
+the data in a single partition in the data lake was designed to be 
non-updatable. If it needs to be updated, the entire partition needs to be 
rewritten. 
+This will seriously damage the efficiency of the entire ecosystem. From the 
perspective of latency and resource utilization, these operations on Hadoop 
will incur expensive overhead.
+Besides, this overhead is usually also cascaded to the entire Hadoop data 
processing pipeline, which ultimately leads to an increase in latency by 
several hours.</p>
+
+<p>In response to the two problems mentioned above, if the data lake supports 
fine-grained incremental processing, we can incorporate changes into existing 
Hive partitions 
+more effectively, and provide a way for downstream table consumers to obtain 
only the changed data. For effectively supporting incremental processing, we 
can decompose it into the 
+following two primitive operations:</p>
+
+<ul>
+  <li>
+    <p><strong>Update insert (upsert):</strong> Conceptually, rewriting the 
entire partition can be regarded as a very inefficient upsert operation, which 
will eventually write much more data than the 
+original data itself. Therefore, support for (bulk) upsert is considered a 
very important feature. <a 
href="https://research.google/pubs/pub42851/";>Google’s Mesa</a> (Google’s data 
warehouse system) also 
+talks about several techniques that can be applied to rapid data ingestion 
scenarios.</p>
+  </li>
+  <li>
+    <p><strong>Incremental consumption:</strong> Although upsert can solve the 
problem of quickly releasing new data to a partition, downstream data consumers 
do not know 
+ which data has been changed from which time in the past. Usually, consumers 
can only know the changed data by scanning the entire partition/data table and 
+ recalculating all the data, which requires considerable time and resources. 
Therefore, we also need a mechanism to more efficiently obtain data records 
that 
+ have changed since the last time the partition was consumed.</p>
+  </li>
+</ul>
+
+<p>With the above two primitive operations, you can upsert a data set, and 
then incrementally consume from it, and create another (also incremental) data 
set to solve the two problems 
+we mentioned above and support many common cases, so as to support end-to-end 
incremental processing and reduce end-to-end latency. These two primitives 
combine with each other, 
+unlocking the ability of stream/incremental processing based on DFS 
abstraction.</p>
+
+<p>The storage scale of the data lake far exceeds that of the data warehouse. 
Although the two have different focuses on the definition of functions, 
+there is still a considerable intersection (of course, there are still 
disputes and deviations from definition and implementation. 
+This is not the topic this article tries to discuss). In any case, the data 
lake will support larger analytical data sets with cheaper storage, 
+so incremental processing is also very important for it. Next let’s discuss 
the significance of incremental processing for the data lake.</p>
+
+<h2 id="the-significance-of-incremental-processing-for-the-data-lake">The 
significance of incremental processing for the data lake</h2>
+
+<h3 id="streaming-semantics">Streaming Semantics</h3>
+
+<p>It has long been stated that there is a “<a 
href="https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying";>dualism</a>”
 
+between the change log (that is, the “flow” in the conventional sense we 
understand) and the table.</p>
+
+<p><img src="/assets/images/blog/incr-processing/image4.jpg" alt="dualism" 
/></p>
+
+<p>The core of this discussion is: if there is a change log, you can use these 
changes to generate a data table and get the current status. If you update a 
table, 
+you can record these changes and publish all “change logs” to the table’s 
status information. This interchangeable nature is called “stream table 
duality” for short.</p>
+
+<p>A more general understanding of “stream table duality”: when the business 
system is modifying the data in the MySQL table, MySQL will reflect these 
changes as Binlog. 
+If we publish this continuous Binlog (stream) to Kafka, and then let the 
downstream processing system subscribe to Kafka and use the state store to 
gradually 
+accumulate the intermediate results, then the current state of this 
intermediate result can reflect the current snapshot of the table.</p>
+
+<p>If the two primitives mentioned above that support incremental processing 
can be introduced to the data lake, the above pipeline, which can reflect the 
+“stream table duality”, is also applicable on the data lake. Based on the 
first primitive, the data lake can also ingest the Binlog log streams in Kafka, 
+and then store these Binlog log streams into “tables” on the data lake. Based 
on the second primitive, these tables recognize the changed records as “Binlog” 
+streams to support the incremental consumption of subsequent cascading 
tasks.</p>
+
+<p>Of course, as the data in the data lake needs to be landed on the final 
file/object storage, considering the trade-off between throughput and write 
performance, 
+Binlog on the data lake reflects a small batch of change logs over a period 
of time on the stream. For example, the Apache Hudi community is further trying 
to 
+provide an incremental view similar to Binlog for different Commits (a Commit 
refers to a batch of data write commit), 
+as shown in the following figure:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image1.png" alt="idu" /></p>
+
+<p>Remarks in the “Flag” column:</p>
+
+<p>I: Insert;
+D: Delete;
+U: After image of Update;
+X: Before image of Update;</p>
+
+<p>Based on the above discussion, we can think that incremental processing and 
stream are naturally compatible, and we can naturally connect them on the data 
lake.</p>
+
+<h3 id="warehousing-needs-incremental-processing">Warehousing needs 
Incremental Processing</h3>
+
+<p>In the data warehouse, whether it is dimensional modeling or relational 
modeling theory, it is usually constructed based on the <a 
href="https://en.wikipedia.org/wiki/Data_warehouse#Design_methods";>layered 
design ideas</a>. 
+In terms of technical implementation, multiple stages (steps) of a long 
pipeline are formed by connecting multiple levels of ETL tasks through a 
workflow scheduling engine, 
+as shown in the following figure:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image2.png" alt="image2" 
/></p>
+
+<p>As the main application of the data warehouse, in the OLAP field, for the 
conventional business scenarios (for no or few changes), there are already some 
frameworks in the industry 
+that focus on the scenarios where they are good at providing efficient 
analysis capabilities. However, in the Hadoop data warehouse/data lake 
ecosystem, 
+there is still no good solution for the analysis scenario of frequent changes 
of business data.</p>
+
+<p>For example, let’s consider the scenario of updating the order status of a 
travel business. This scenario has a typical long-tail effect: 
+you cannot know whether an order will be billed tomorrow, one month later, or 
one year later. In this scenario, the order table is the main data table, 
+but usually we will derive other derived tables based on this table to support 
the modeling of various business scenarios. 
+The initial update takes place in the order table at the ODS level, but the 
derived tables need to be updated in cascade.</p>
+
+<p>For this scenario, in the past, once there is a change, people usually need 
to find the partition where the data to be updated is located in the Hive order 
+table of the ODS layer, and update the entire partition, besides, the 
partition of the relevant data of the derived table needs to be updated in 
cascade.</p>
+
+<p>Yes, someone will definitely point out that Kudu’s support for Upsert can 
solve the problem of the old version of Hive missing the first incremental 
primitive. 
+But the Kudu storage engine has its own limitations:</p>
+
+<ol>
+  <li>Performance: additional requirements for the hardware itself;</li>
+  <li>Ecologically: In terms of adapting to mainstream big data computing 
frameworks and machine learning frameworks, it is far less advantageous than 
Hive;</li>
+  <li>Cost: requires special maintenance costs and expenses;</li>
+  <li>Did not solve the second primitive of incremental processing mentioned 
above: the problem of incremental consumption.</li>
+</ol>
+
+<p>In summary, incremental processing has the following advantages on the data 
lake:</p>
+
+<p><strong>Performance improvement:</strong> Ingesting data usually needs to 
handle updates, deletes, and enforce unique key constraints. Since incremental 
primitives support record-level updates, 
+it can bring orders of magnitude performance improvements to these 
operations.</p>
+
+<p><strong>Faster ETL/derived Pipelines:</strong> A ubiquitous next step, 
once the data has been ingested from external sources, is to build derived data 
pipelines using 
+Apache Spark/Apache Hive or any other data processing framework to ETL the 
ingested data for a variety of use-cases like data warehouse, 
+machine learning, or even just analytics. Typically, such processes again rely 
on batch processing jobs expressed in code or SQL. Such data pipelines can be 
sped up dramatically, 
+by querying one or more input tables using an incremental query instead of a 
regular snapshot query, resulting in only processing the incremental changes 
from upstream tables and 
+then upserting into or deleting from the target derived table. Similar to raw data ingestion, 
in order to reduce the data delay of the modelled table, the ETL job only needs 
to gradually extract the 
+changed data from the original table and update the previously derived output 
table instead of rebuilding the entire output table every few hours.</p>
+
+<p><strong>Unified storage:</strong> Based on the above two advantages, faster 
and lighter processing on the existing data lake means that only for the 
purpose of accessing near real-time data, 
+no special storage or data mart is needed.</p>
+
+<p>Next, we use two simple examples to illustrate how <a 
href="https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/">incremental
 processing</a> can speed up the processing 
+of pipelines in analytical scenarios. First of all, data projection is the 
most common and easy-to-understand case:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image7.png" alt="image7" 
/></p>
+
+<p>This simple example shows that by upserting new changes into table_1 and 
establishing a simple projected table (projected_table) through incremental 
consumption, we can 
+perform projection more simply and efficiently, with lower latency.</p>
+
+<p>Next, for a more complex scenario, we can use incremental processing to 
support the stream and batch connections supported by the stream computing 
framework, 
+and stream-stream connections (we just need to add some additional logic to align 
windows):</p>
+
+<p><img src="/assets/images/blog/incr-processing/image6.png" alt="image6" 
/></p>
+
+<p>The example in the figure above connects a fact table to multiple dimension 
tables to create a connected table. This case is one of the rare scenarios 
where we can save hardware 
+costs while significantly reducing latency.</p>
+
+<h3 
id="quasi-real-time-scenarios-resourceefficiency-trade-offs">Quasi-real-time 
scenarios, resource/efficiency trade-offs</h3>
+
+<p>Incremental processing of new data in mini batches can use resources more 
efficiently. Let’s refer to a specific example. We have a Kafka event stream 
that is pouring in 
+at a rate of 10,000 per second. We want to count the number of messages in 
some dimensions over the past 15 minutes. Many stream processing pipelines use 
an external/internal 
+result state store (such as RocksDB, Cassandra, ElasticSearch) to save the 
aggregated count results, and run the containers in resource managers such as 
YARN/Mesos continuously, 
+which is very reasonable in scenarios with a delay window of less than five minutes. In 
fact, the YARN container itself has some startup overhead. In addition, in 
order to improve the 
+performance of writing to result storage system, we usually cache the results 
before performing batch updates. This kind of protocol requires the container 
to run continuously.</p>
+
+<p>However, in quasi-real-time processing scenarios, these options may not be 
optimal. To achieve the same effect, you can use short-life containers and 
optimize overall 
+resource utilization. For example, a streaming processor may need to perform 
six million updates to the result storage system in 15 minutes. However, in the 
incremental 
+batch mode, we only need to perform an in-memory merge on the accumulated data 
and update the result storage system only once, then only use the resource 
container for 
+five minutes. Compared with the pure stream processing mode, the incremental 
batch processing mode has several times the CPU efficiency improvement, and 
there are several 
+orders of magnitude efficiency improvement in updating to the result storage. 
Basically, this processing method obtains resources on demand, instead of 
swallowing CPU and 
+memory while waiting for data to be calculated in real time.</p>
+
+<h3 
id="incremental-processing-facilitates-unified-data-lake-architecture">Incremental
 processing facilitates unified data lake architecture</h3>
+
+<p>Whether in the data warehouse or in the data lake, data processing is an 
unavoidable problem. Data processing involves the selection of computing 
engines and 
+the design of architectures. There are currently two mainstream architectures 
in the industry: Lambda and Kappa architectures. Each architecture has its own 
+characteristics and existing problems. Derivative versions of these 
architectures are also <a 
href="https://www.infoq.cn/article/Uo4pFswlMzBVhq*Y2tB9">emerging 
endlessly</a>.</p>
+
+<p>In reality, many enterprises still maintain the implementation of the <a 
href="https://en.wikipedia.org/wiki/Lambda_architecture">Lambda 
architecture</a>. 
+The typical Lambda architecture has two modules for the data processing part: 
the speed layer and the batch layer.</p>
+
+<p><img src="/assets/images/blog/incr-processing/image5.png" alt="image5" 
/></p>
+
+<p>They are usually two independent implementations (from code to 
infrastructure). For example, Flink (formerly Storm) is a popular option on the 
speed layer, 
+while MapReduce/Spark can serve as a batch layer. In fact, people often rely 
on the speed layer to provide updated results (which may not be accurate), and 
+once the data is considered complete, the results of the speed layer are 
corrected at a later time through the batch layer. With incremental processing, 
+we have the opportunity to implement the Lambda architecture for batch 
processing and quasi-real-time processing at the code level and infrastructure 
level in 
+a unified manner. It typically looks like below:</p>
+
+<p><img src="/assets/images/blog/incr-processing/image3.png" alt="image3" 
/></p>
+
+<p>As we said, you can use SQL or a batch processing framework like Spark to 
consistently implement your processing logic. The result table is built 
incrementally, 
+and SQL is executed on “new data” like streaming to produce a quick view of 
the results. The same SQL can be executed periodically on the full amount of 
data to 
+correct any inaccurate results (remember, join operations are always tricky!) 
and produce a more “complete” view of the results. In both cases, we will use 
the 
+same infrastructure to perform calculations, which can reduce overall 
operating costs and complexity.</p>
+
+<p>Setting aside the Lambda architecture, even in the Kappa architecture, the 
first primitive of incremental processing (upsert) also plays an important 
role. 
+Uber <a 
href="https://www.slideshare.net/FlinkForward/flink-forward-san-francisco-2019-moving-from-lambda-and-kappa-architectures-to-kappa-at-uber-roshan-naik">proposed</a>
 the Kappa+ architecture 
+based on this. The Kappa architecture advocates a single stream computing 
layer sufficient to become a general solution 
+for data processing. Although the batch layer is removed in this model, there 
are still two problems in the service layer:</p>
+
+<p>Nowadays many stream processing engines support row-level data processing, 
which requires that our service layer should also support row-level updates;
+The trade-offs between data ingestion delay, scanning performance and 
computing resources and operational complexity are unavoidable.</p>
+
+<p><img src="/assets/images/blog/incr-processing/image8.png" alt="image8" 
/></p>
+
+<p>However, if our business scenarios have low latency requirements, for 
example, we can accept a delay of about 10 minutes. And if we can quickly 
ingest and prepare data on DFS, 
+effectively connect and propagate updates to the upper-level modeling data 
set, Speed Serving in the service layer is unnecessary. Then the service layer 
can be unified, 
+greatly reducing the overall complexity and resource consumption of the 
system.</p>
+
+<p>Above, we introduced the significance of incremental processing for the 
data lake. Next, we introduce the implementation and support of incremental 
processing. 
+Among the three open source data lake frameworks (Apache Hudi/Iceberg, Delta 
Lake), only Apache Hudi provides good support for incremental processing. 
+This is completely rooted in a framework developed by Uber at the time when it 
encountered the pain points of data analysis on the Hadoop data lake. 
+So, next, let’s introduce how Hudi supports incremental processing.</p>
+
+<h2 id="hudis-support-for-incremental-processing">Hudi’s support for 
incremental processing</h2>
+
+<p>Apache Hudi (Hadoop Upserts Deletes and Incrementals) is a top-level 
project of the Apache Foundation. It allows you to process very large-scale 
data on 
+top of Hadoop-compatible storage, and it also provides two primitives that 
enable stream processing on the data lake in addition to classic batch 
processing.</p>
+
+<p>The letter “I” in the name denotes “Incremental Processing”, from which we can 
see that Hudi supports incremental processing as a first-class citizen. 
+The two primitives we mentioned at the beginning of this article that support 
incremental processing are reflected in the following two aspects in Apache 
Hudi:</p>
+
+<p>Update/Delete operation: Hudi provides support for updating/deleting 
records, using fine-grained file/record level indexes while providing 
transactional guarantees 
+for the write operation. Queries process the last such committed snapshot, to 
produce results.</p>
+
+<p>Change stream: Hudi also provides first-class support for obtaining an 
incremental stream of all the records that were updated/inserted/deleted in a 
given table, from a given point-in-time.</p>
+
+<p>The specific implementation of the change stream is the “incremental view”. Hudi 
is the only one of the three open source data lake frameworks that supports 
+the incremental query feature, with support for record level change streams. 
The following sample code snippet shows us how to query the incremental 
view:</p>
+
+<div class="language-java highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code><span class="c1">// spark-shell</span>
+<span class="c1">// reload data</span>
+<span class="n">spark</span><span class="o">.</span>
+  <span class="n">read</span><span class="o">.</span>
+  <span class="nf">format</span><span class="o">(</span><span 
class="s">"hudi"</span><span class="o">).</span>
+  <span class="n">load</span><span class="o">(</span><span 
class="n">basePath</span> <span class="o">+</span> <span 
class="s">"/*/*/*/*"</span><span class="o">).</span>
+  <span class="n">createOrReplaceTempView</span><span class="o">(</span><span 
class="s">"hudi_trips_snapshot"</span><span class="o">)</span>
+
+<span class="n">val</span> <span class="n">commits</span> <span 
class="o">=</span> <span class="n">spark</span><span class="o">.</span><span 
class="na">sql</span><span class="o">(</span><span class="s">"select 
distinct(_hoodie_commit_time) as commitTime from  hudi_trips_snapshot order by 
commitTime"</span><span class="o">).</span><span class="na">map</span><span 
class="o">(</span><span class="n">k</span> <span class="o">=&gt;</span> <span 
class="n">k</span><span class="o">.</span><span c [...]
+<span class="n">val</span> <span class="n">beginTime</span> <span 
class="o">=</span> <span class="n">commits</span><span class="o">(</span><span 
class="n">commits</span><span class="o">.</span><span class="na">length</span> 
<span class="o">-</span> <span class="mi">2</span><span class="o">)</span> 
<span class="c1">// commit time we are interested in</span>
+
+<span class="c1">// incrementally query data</span>
+<span class="n">val</span> <span class="n">tripsIncrementalDF</span> <span 
class="o">=</span> <span class="n">spark</span><span class="o">.</span><span 
class="na">read</span><span class="o">.</span><span 
class="na">format</span><span class="o">(</span><span 
class="s">"hudi"</span><span class="o">).</span>
+  <span class="n">option</span><span class="o">(</span><span 
class="no">QUERY_TYPE_OPT_KEY</span><span class="o">,</span> <span 
class="no">QUERY_TYPE_INCREMENTAL_OPT_VAL</span><span class="o">).</span>
+  <span class="n">option</span><span class="o">(</span><span 
class="no">BEGIN_INSTANTTIME_OPT_KEY</span><span class="o">,</span> <span 
class="n">beginTime</span><span class="o">).</span>
+  <span class="n">load</span><span class="o">(</span><span 
class="n">basePath</span><span class="o">)</span>
+<span class="n">tripsIncrementalDF</span><span class="o">.</span><span 
class="na">createOrReplaceTempView</span><span class="o">(</span><span 
class="s">"hudi_trips_incremental"</span><span class="o">)</span>
+
+<span class="n">spark</span><span class="o">.</span><span 
class="na">sql</span><span class="o">(</span><span class="s">"select 
`_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  
hudi_trips_incremental where fare &gt; 20.0"</span><span 
class="o">).</span><span class="na">show</span><span class="o">()</span>
+
+</code></pre></div></div>
+
+<p>The code snippet above creates a Hudi trip incremental table 
(hudi_trips_incremental), and then queries all the change records in that 
table that were committed after the “beginTime” commit time 
+and whose “fare” is greater than 20.0. Based on this query, you can create 
incremental data pipelines on batch data.</p>
+
+<h2 id="summary">Summary</h2>
+
+<p>In this article, we first elaborated on many problems caused by the lack of 
incremental processing primitives in the traditional Hadoop data warehouse due 
to the trade-off between data integrity 
+and latency, and some long-tail applications that rely heavily on updates. 
Next, we argued that to support incremental processing, we must have at least 
two primitives: upsert and 
+incremental consumption, and explained why these two primitives can solve the 
problems explained above.</p>
+
+<p>Then, we introduced why incremental processing is also important to the 
data lake. There are many common parts in data processing between the data lake 
and the data warehouse. 
+In the data warehouse, some “pain points” caused by the lack of incremental 
processing also exist in the data lake. We elaborated its significance to the 
data lake from four 
+aspects: incremental processing of semantics of natural fit flow, the need for 
analytical scenarios, quasi-real-time scene resource/efficiency trade-offs, and 
unified lake architecture.</p>
+
+<p>Finally, we introduced the open source data lake storage framework Apache 
Hudi’s support for incremental processing and simple cases.</p>
+
+      </section>
+
+      <a href="#masthead__inner-wrap" class="back-to-top">Back to top 
&uarr;</a>
+
+
+      
+
+    </div>
+
+  </article>
+
+</div>
+
+    </div>
+
+    <div class="page__footer">
+      <footer>
+        
+<div class="row">
+  <div class="col-lg-12 footer">
+    <p>
+      <table class="table-apache-info">
+        <tr>
+          <td>
+            <a class="footer-link-img" href="https://apache.org">
+              <img width="250px" src="/assets/images/asf_logo.svg" alt="The 
Apache Software Foundation">
+            </a>
+          </td>
+          <td>
+            <a style="float: right" 
href="https://www.apache.org/events/current-event.html">
+              <img 
src="https://www.apache.org/events/current-event-234x60.png" />
+            </a>
+          </td>
+        </tr>
+      </table>
+    </p>
+    <p>
+      <a href="https://www.apache.org/licenses/">License</a> | <a 
href="https://www.apache.org/security/">Security</a> | <a 
href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a 
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
+    </p>
+    <p>
+      Copyright &copy; <span id="copyright-year">2019</span> <a 
href="https://apache.org">The Apache Software Foundation</a>, Licensed under 
the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, 
Version 2.0</a>.
+      Hudi, Apache and the Apache feather logo are trademarks of The Apache 
Software Foundation. <a href="/docs/privacy">Privacy Policy</a>
+    </p>
+  </div>
+</div>
+      </footer>
+    </div>
+
+
+  </body>
+</html>
\ No newline at end of file
diff --git a/content/cn/activity.html b/content/cn/activity.html
index a834d14..69cfba7 100644
--- a/content/cn/activity.html
+++ b/content/cn/activity.html
@@ -191,6 +191,30 @@
     
     <h2 class="archive__item-title" itemprop="headline">
       
+        <a href="/blog/hudi-incremental-processing-on-data-lakes/" 
rel="permalink">Incremental Processing on the Data Lake
+</a>
+      
+    </h2>
+    <!-- Look the author details up from the site config. -->
+    
+    <!-- Output author details if some exist. -->
+    <div class="archive__item-meta"><a 
href="https://cwiki.apache.org/confluence/display/~vinoyang">Vino Yang</a> 
posted on <time datetime="2020-08-18">August 18, 2020</time></div>
+ 
+    <p class="archive__item-excerpt" itemprop="description">How Apache Hudi 
provides ability for incremental data processing.
+</p>
+  </article>
+</div>
+
+        
+        
+
+
+
+<div class="list__item">
+  <article class="archive__item" itemscope 
itemtype="https://schema.org/CreativeWork">
+    
+    <h2 class="archive__item-title" itemprop="headline">
+      
         <a href="/blog/monitoring-hudi-metrics-with-datadog/" 
rel="permalink">Monitor Hudi metrics with Datadog
 </a>
       
diff --git a/content/sitemap.xml b/content/sitemap.xml
index e344588..b3e6d0e 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -929,6 +929,10 @@
 <lastmod>2020-05-28T00:00:00-04:00</lastmod>
 </url>
 <url>
+<loc>https://hudi.apache.org/blog/hudi-incremental-processing-on-data-lakes/</loc>
+<lastmod>2020-08-18T00:00:00-04:00</lastmod>
+</url>
+<url>
 <loc>https://hudi.apache.org/cn/activity</loc>
 <lastmod>2019-12-30T14:59:57-05:00</lastmod>
 </url>

[hudi] branch asf-site updated: Travis CI build asf-site

Reply via email to