This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new a49b4a2 Travis CI build asf-site
a49b4a2 is described below
commit a49b4a2994456b01afd0a405cba3bc6da7b81b22
Author: CI <[email protected]>
AuthorDate: Thu Jan 28 05:20:51 2021 +0000
Travis CI build asf-site
---
content/activity.html | 31 ++
.../clustering/Query_Plan_After_Clustering.png | Bin 0 -> 97289 bytes
.../clustering/Query_Plan_Before_Clustering.png | Bin 0 -> 96605 bytes
.../blog/clustering/example_perf_improvement.png | Bin 0 -> 416119 bytes
content/assets/js/lunr/lunr-store.js | 5 +
content/blog.html | 31 ++
content/blog/hudi-clustering-intro/index.html | 396 +++++++++++++++++++++
content/cn/activity.html | 31 ++
content/sitemap.xml | 4 +
9 files changed, 498 insertions(+)
diff --git a/content/activity.html b/content/activity.html
index bb904d2..7ec278e 100644
--- a/content/activity.html
+++ b/content/activity.html
@@ -180,6 +180,37 @@
+ <section id="2021" class="taxonomy__section">
+ <h2 class="archive__subtitle">2021</h2>
+ <div class="entries-list">
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
+ <a href="/blog/hudi-clustering-intro/" rel="permalink">Optimize Data
lake layout using Clustering in Apache Hudi
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+
+
+ <p class="archive__item-excerpt" itemprop="description">Introduce
clustering feature to change data layout
+</p>
+ </article>
+</div>
+
+
+ </div>
+ </section>
+
<section id="2020" class="taxonomy__section">
<h2 class="archive__subtitle">2020</h2>
<div class="entries-list">
diff --git
a/content/assets/images/blog/clustering/Query_Plan_After_Clustering.png
b/content/assets/images/blog/clustering/Query_Plan_After_Clustering.png
new file mode 100644
index 0000000..4c2b216
Binary files /dev/null and
b/content/assets/images/blog/clustering/Query_Plan_After_Clustering.png differ
diff --git
a/content/assets/images/blog/clustering/Query_Plan_Before_Clustering.png
b/content/assets/images/blog/clustering/Query_Plan_Before_Clustering.png
new file mode 100644
index 0000000..8801028
Binary files /dev/null and
b/content/assets/images/blog/clustering/Query_Plan_Before_Clustering.png differ
diff --git a/content/assets/images/blog/clustering/example_perf_improvement.png
b/content/assets/images/blog/clustering/example_perf_improvement.png
new file mode 100644
index 0000000..15205c5
Binary files /dev/null and
b/content/assets/images/blog/clustering/example_perf_improvement.png differ
diff --git a/content/assets/js/lunr/lunr-store.js
b/content/assets/js/lunr/lunr-store.js
index 9d0ac8b..bf8a81a 100644
--- a/content/assets/js/lunr/lunr-store.js
+++ b/content/assets/js/lunr/lunr-store.js
@@ -1433,4 +1433,9 @@ var store = [{
"excerpt":"Building High-Performance Data Lake Using Apache Hudi and
Alluxio at T3Go T3Go is China’s first platform for smart travel based on the
Internet of Vehicles. In this article, Trevor Zhang and Vino Yang from T3Go
describe the evolution of their data lake architecture, built on cloud-native
or open-source technologies including...","categories": ["blog"],
"tags": [],
"url":
"https://hudi.apache.org/blog/high-perf-data-lake-with-hudi-and-alluxio-t3go/",
+ "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
+ "title": "Optimize Data lake layout using Clustering in Apache Hudi",
+ "excerpt":"Background Apache Hudi brings stream processing to big
data, providing fresh data while being an order of magnitude more efficient than
traditional batch processing. In a data lake/warehouse, one of the key
trade-offs is between ingestion speed and query performance. Data ingestion
typically prefers small files to improve parallelism and make...","categories":
["blog"],
+ "tags": [],
+ "url": "https://hudi.apache.org/blog/hudi-clustering-intro/",
"teaser":"https://hudi.apache.org/assets/images/500x300.png"},]
diff --git a/content/blog.html b/content/blog.html
index 31e7ec9..004a368 100644
--- a/content/blog.html
+++ b/content/blog.html
@@ -178,6 +178,37 @@
+ <section id="2021" class="taxonomy__section">
+ <h2 class="archive__subtitle">2021</h2>
+ <div class="entries-list">
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
+ <a href="/blog/hudi-clustering-intro/" rel="permalink">Optimize Data
lake layout using Clustering in Apache Hudi
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+
+
+ <p class="archive__item-excerpt" itemprop="description">Introduce
clustering feature to change data layout
+</p>
+ </article>
+</div>
+
+
+ </div>
+ </section>
+
<section id="2020" class="taxonomy__section">
<h2 class="archive__subtitle">2020</h2>
<div class="entries-list">
diff --git a/content/blog/hudi-clustering-intro/index.html
b/content/blog/hudi-clustering-intro/index.html
new file mode 100644
index 0000000..d16b0dc
--- /dev/null
+++ b/content/blog/hudi-clustering-intro/index.html
@@ -0,0 +1,396 @@
+<!doctype html>
+<html lang="en" class="no-js">
+ <head>
+ <meta charset="utf-8">
+
+<!-- begin _includes/seo.html --><title>Optimize Data lake layout using
Clustering in Apache Hudi - Apache Hudi</title>
+<meta name="description" content="Introduce clustering feature to change data
layout">
+
+<meta property="og:type" content="article">
+<meta property="og:locale" content="en_US">
+<meta property="og:site_name" content="">
+<meta property="og:title" content="Optimize Data lake layout using Clustering
in Apache Hudi">
+<meta property="og:url"
content="https://hudi.apache.org/blog/hudi-clustering-intro/">
+
+
+ <meta property="og:description" content="Introduce clustering feature to
change data layout">
+
+
+
+
+
+
+
+
+
+
+
+<!-- end _includes/seo.html -->
+
+
+<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title="
Feed">-->
+
+<!-- https://t.co/dKP3o1e -->
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<script>
+ document.documentElement.className =
document.documentElement.className.replace(/\bno-js\b/g, '') + ' js ';
+</script>
+
+<!-- For all browsers -->
+<link rel="stylesheet" href="/assets/css/main.css">
+
+<!--[if IE]>
+ <style>
+ /* old IE unsupported flexbox fixes */
+ .greedy-nav .site-title {
+ padding-right: 3em;
+ }
+ .greedy-nav button {
+ position: absolute;
+ top: 0;
+ right: 0;
+ height: 100%;
+ }
+ </style>
+<![endif]-->
+
+
+
+<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico">
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+<script src="/assets/js/jquery.min.js"></script>
+
+
+<script src="/assets/js/main.min.js"></script>
+
+ </head>
+
+ <body class="layout--single">
+ <!--[if lt IE 9]>
+<div class="notice--danger align-center" style="margin: 0;">You are using an
<strong>outdated</strong> browser. Please <a
href="https://browsehappy.com/">upgrade your browser</a> to improve your
experience.</div>
+<![endif]-->
+
+ <div class="masthead">
+ <div class="masthead__inner-wrap" id="masthead__inner-wrap">
+ <div class="masthead__menu">
+ <nav id="site-nav" class="greedy-nav">
+
+ <a class="site-logo" href="/">
+ <div style="width: 150px; height: 40px">
+ </div>
+ </a>
+
+ <a class="site-title" href="/">
+
+ </a>
+ <ul class="visible-links"><li class="masthead__menu-item">
+ <a href="/docs/quick-start-guide.html" target="_self"
>Documentation</a>
+ </li><li class="masthead__menu-item">
+ <a href="/community.html" target="_self" >Community</a>
+ </li><li class="masthead__menu-item">
+ <a href="/blog.html" target="_self" >Blog</a>
+ </li><li class="masthead__menu-item">
+ <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ"
target="_blank" >FAQ</a>
+ </li><li class="masthead__menu-item">
+ <a href="/docs/powered_by.html" target="_self" >Powered By</a>
+ </li><li class="masthead__menu-item">
+ <a href="/releases.html" target="_self" >Releases</a>
+ </li></ul>
+ <button class="greedy-nav__toggle hidden" type="button">
+ <span class="visually-hidden">Toggle menu</span>
+ <div class="navicon"></div>
+ </button>
+ <ul class="hidden-links hidden"></ul>
+ </nav>
+ </div>
+ </div>
+</div>
+<!--
+<p class="notice--warning" style="margin: 0 !important; text-align: center
!important;"><strong>Note:</strong> This site is work in progress, if you
notice any issues, please <a target="_blank"
href="https://github.com/apache/hudi/issues">Report on Issue</a>.
+ Click <a href="/"> here</a> back to old site.</p>
+-->
+
+ <div class="initial-content">
+ <div id="main" role="main">
+
+
+ <div class="sidebar sticky">
+
+
+ <div itemscope itemtype="https://schema.org/Person">
+
+ <div class="author__content">
+
+ <h3 class="author__name" itemprop="name">Quick Links</h3>
+
+
+ <div class="author__bio" itemprop="description">
+ <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+
+ </div>
+
+ </div>
+
+ <div class="author__urls-wrapper">
+ <ul class="author__urls social-icons">
+
+
+ <li><a href="/docs/quick-start-guide" target="_self" rel="nofollow
noopener noreferrer"><i class="fa fa-book" aria-hidden="true"></i>
Documentation</a></li>
+
+
+
+ <li><a href="https://cwiki.apache.org/confluence/display/HUDI"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-wikipedia-w"
aria-hidden="true"></i> Technical Wiki</a></li>
+
+
+
+ <li><a href="/contributing" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-thumbs-o-up" aria-hidden="true"></i> Contribution
Guide</a></li>
+
+
+
+ <li><a
href="https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-slack"
aria-hidden="true"></i> Join on Slack</a></li>
+
+
+
+ <li><a href="https://github.com/apache/hudi" target="_blank"
rel="nofollow noopener noreferrer"><i class="fa fa-github"
aria-hidden="true"></i> Fork on GitHub</a></li>
+
+
+
+ <li><a href="https://issues.apache.org/jira/projects/HUDI/summary"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-navicon"
aria-hidden="true"></i> Report Issues</a></li>
+
+
+
+ <li><a href="/security" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Security
Issues</a></li>
+
+
+
+
+ </ul>
+ </div>
+</div>
+
+
+
+
+ </div>
+
+
+ <article class="page" itemscope itemtype="https://schema.org/CreativeWork">
+ <!-- Look the author details up from the site config. -->
+
+
+ <div class="page__inner-wrap">
+
+ <header>
+ <h1 id="page-title" class="page__title" itemprop="headline">Optimize
Data lake layout using Clustering in Apache Hudi
+</h1>
+ <!-- Output author details if some exist. -->
+
+ </header>
+
+
+ <section class="page__content" itemprop="text">
+
+ <style>
+ .page {
+ padding-right: 0 !important;
+ }
+ </style>
+
+ <h1 id="background">Background</h1>
+
+<p>Apache Hudi brings stream processing to big data, providing fresh data
while being an order of magnitude more efficient than traditional batch processing.
In a data lake/warehouse, one of the key trade-offs is between ingestion speed
and query performance. Data ingestion typically prefers small files to improve
parallelism and make data available to queries as soon as possible. However,
query performance degrades significantly with a lot of small files. Also, during
ingestion, data is typically c [...]
+
+<h1 id="clustering-architecture">Clustering Architecture</h1>
+
+<p>At a high level, Hudi provides different operations such as
insert/upsert/bulk_insert through its write client API to be able to write
data to a Hudi table. To be able to choose a trade-off between file size and
ingestion speed, Hudi provides a knob <code
class="highlighter-rouge">hoodie.parquet.small.file.limit</code> to be able to
configure the smallest allowable file size. Users are able to configure the
small file <a href="https://hudi.apache.org/docs/configurations.html#compacti
[...]
+
+<p>To be able to support an architecture that allows for fast ingestion
without compromising query performance, we have introduced a ‘clustering’
service to rewrite the data to optimize Hudi data lake file layout.</p>
+
+<p>Clustering table service can run asynchronously or synchronously adding a
new action type called “REPLACE”, that will mark the clustering action in the
Hudi metadata timeline.</p>
+
+<h3 id="overall-there-are-2-parts-to-clustering">Overall, there are 2 parts to
clustering</h3>
+
+<ol>
+ <li>Scheduling clustering: Create a clustering plan using a pluggable
clustering strategy.</li>
+ <li>Execute clustering: Process the plan using an execution strategy to
create new files and replace old files.</li>
+</ol>
+
+<h3 id="scheduling-clustering">Scheduling clustering</h3>
+
+<p>The following steps are performed to schedule clustering.</p>
+
+<ol>
+ <li>Identify files that are eligible for clustering: Depending on the
clustering strategy chosen, the scheduling logic will identify the files
eligible for clustering.</li>
+ <li>Group files that are eligible for clustering based on specific criteria.
Each group is expected to have data size in multiples of ‘targetFileSize’.
Grouping is done as part of ‘strategy’ defined in the plan. Additionally, there
is an option to put a cap on group size to improve parallelism and avoid
shuffling large amounts of data.</li>
+ <li>Finally, the clustering plan is saved to the timeline in an avro <a
href="https://github.com/apache/hudi/blob/master/hudi-common/src/main/avro/HoodieClusteringPlan.avsc">metadata
format</a>.</li>
+</ol>
+
+<h3 id="running-clustering">Running clustering</h3>
+
+<ol>
+ <li>Read the clustering plan and get the ‘clusteringGroups’ that mark the
file groups that need to be clustered.</li>
+ <li>For each group, we instantiate the appropriate strategy class with
strategyParams (example: sortColumns) and apply that strategy to rewrite the
data.</li>
+ <li>Create a “REPLACE” commit and update the metadata in <a
href="https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java">HoodieReplaceCommitMetadata</a>.</li>
+</ol>
+
+<p>Clustering Service builds on Hudi’s MVCC based design to allow for writers
to continue to insert new data while clustering action runs in the background
to reformat data layout, ensuring snapshot isolation between concurrent readers
and writers.</p>
+
+<p>NOTE: Clustering can only be scheduled for tables / partitions not
receiving any concurrent updates. In the future, concurrent updates use-case
will be supported as well.</p>
+
+<p><img src="/assets/images/blog/clustering/example_perf_improvement.png"
alt="Clustering example" />
+<em>Figure: Illustrating query performance improvements by clustering</em></p>
+
+<h3 id="setting-up-clustering">Setting up clustering</h3>
+<p>Inline clustering can be set up easily using Spark DataFrame options. See
the sample below.</p>
+
+<div class="language-scala highlighter-rouge"><div class="highlight"><pre
class="highlight"><code><span class="k">import</span> <span
class="nn">org.apache.hudi.QuickstartUtils._</span>
+<span class="k">import</span> <span
class="nn">scala.collection.JavaConversions._</span>
+<span class="k">import</span> <span
class="nn">org.apache.spark.sql.SaveMode._</span>
+<span class="k">import</span> <span
class="nn">org.apache.hudi.DataSourceReadOptions._</span>
+<span class="k">import</span> <span
class="nn">org.apache.hudi.DataSourceWriteOptions._</span>
+<span class="k">import</span> <span
class="nn">org.apache.hudi.config.HoodieWriteConfig._</span>
+
+
+<span class="k">val</span> <span class="nv">df</span> <span class="k">=</span>
<span class="c1">//generate data frame
+</span><span class="nv">df</span><span class="o">.</span><span
class="py">write</span><span class="o">.</span><span
class="py">format</span><span class="o">(</span><span
class="s">"org.apache.hudi"</span><span class="o">).</span>
+ <span class="nf">options</span><span class="o">(</span><span
class="n">getQuickstartWriteConfigs</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="nc">PRECOMBINE_FIELD_OPT_KEY</span><span class="o">,</span> <span
class="s">"ts"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="nc">RECORDKEY_FIELD_OPT_KEY</span><span class="o">,</span> <span
class="s">"uuid"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="nc">PARTITIONPATH_FIELD_OPT_KEY</span><span class="o">,</span> <span
class="s">"partitionpath"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="nc">TABLE_NAME</span><span class="o">,</span> <span
class="s">"tableName"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="s">"hoodie.parquet.small.file.limit"</span><span class="o">,</span>
<span class="s">"0"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="s">"hoodie.clustering.inline"</span><span class="o">,</span> <span
class="s">"true"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="s">"hoodie.clustering.inline.max.commits"</span><span class="o">,</span>
<span class="s">"4"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="s">"hoodie.clustering.plan.strategy.target.file.max.bytes"</span><span
class="o">,</span> <span class="s">"1073741824"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="s">"hoodie.clustering.plan.strategy.small.file.limit"</span><span
class="o">,</span> <span class="s">"629145600"</span><span class="o">).</span>
+ <span class="nf">option</span><span class="o">(</span><span
class="s">"hoodie.clustering.plan.strategy.sort.columns"</span><span
class="o">,</span> <span class="s">"column1,column2"</span><span
class="o">).</span> <span class="c1">//optional, if sorting is needed as part
of rewriting data
+</span> <span class="nf">mode</span><span class="o">(</span><span
class="nc">Append</span><span class="o">).</span>
+ <span class="nf">save</span><span class="o">(</span><span
class="s">"dfs://location"</span><span class="o">);</span>
+</code></pre></div></div>
+
+<p>For more advanced use cases, an async clustering pipeline can also be set up.
See an example <a
href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-SetupforAsyncclusteringJob">here</a>.</p>
+
+<h1 id="table-query-performance">Table Query Performance</h1>
+
+<p>We created a dataset from one partition of a known production style table
with ~20M records and on-disk size of ~200GB. The dataset has rows for multiple
“sessions”. Users always query this data using a predicate on session. Data for
a single session is spread across multiple data files because ingestion groups
data based on arrival time. The below experiment shows that by clustering on
session, we are able to improve the data locality and reduce query execution
time by more than 50%.</p>
+
+<p>Query:</p>
+<div class="language-scala highlighter-rouge"><div class="highlight"><pre
class="highlight"><code><span class="nv">spark</span><span
class="o">.</span><span class="py">sql</span><span class="o">(</span><span
class="s">"select * from table where session_id=123"</span><span
class="o">)</span>
+</code></pre></div></div>
+
+<h2 id="before-clustering">Before Clustering</h2>
+
+<p>Query took 2.2 minutes to complete. Note that the number of output rows in
the “scan parquet” part of the query plan includes all 20M rows in the
table.</p>
+
+<p><img src="/assets/images/blog/clustering/Query_Plan_Before_Clustering.png"
alt="Query Plan Before Clustering" />
+<em>Figure: Spark SQL query details before clustering</em></p>
+
+<h2 id="after-clustering">After Clustering</h2>
+
+<p>The query plan is similar to above. But, because of improved data locality
and predicate push down, spark is able to prune a lot of rows. After
clustering, the same query only outputs 110K rows (out of 20M rows) while
scanning parquet files. This cuts query time to less than a minute from 2.2
minutes.</p>
+
+<p><img src="/assets/images/blog/clustering/Query_Plan_After_Clustering.png"
alt="Query Plan After Clustering" />
+<em>Figure: Spark SQL query details after clustering</em></p>
+
+<p>The table below summarizes query performance improvements from experiments
run using Spark3</p>
+
+<table>
+ <thead>
+ <tr>
+ <th>Table State</th>
+ <th>Query runtime</th>
+ <th>Num Records Processed</th>
+ <th>Num files on disk</th>
+ <th>Size of each file</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td><strong>Unclustered</strong></td>
+ <td>130,673 ms</td>
+ <td>~20M</td>
+ <td>13642</td>
+ <td>~150 MB</td>
+ </tr>
+ <tr>
+ <td><strong>Clustered</strong></td>
+ <td>55,963 ms</td>
+ <td>~110K</td>
+ <td>294</td>
+ <td>~600 MB</td>
+ </tr>
+ </tbody>
+</table>
+
+<p>Query runtime is reduced by 60% after clustering. Similar results were
observed on other sample datasets. See example query plans and more details at
the <a
href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-PerformanceEvaluation">RFC-19
performance evaluation</a>.</p>
+
+<p>We expect dramatic speedup for large tables, where the query runtime is
almost entirely dominated by actual I/O and not query planning, unlike the
example above.</p>
+
+<h1 id="summary">Summary</h1>
+
+<p>Using clustering, we can improve query performance by</p>
+<ol>
+ <li>Leveraging concepts such as <a
href="https://en.wikipedia.org/wiki/Z-order_curve">space filling curves</a> to
adapt data lake layout and reduce the amount of data read during queries.</li>
+ <li>Stitching small files into larger ones and reducing the total number of files
that need to be scanned by the query engine.</li>
+</ol>
+
+<p>Clustering also enables stream processing over big data. Ingestion can
write small files to satisfy latency requirements of stream processing.
Clustering can be used in the background to stitch these small files into
larger files and reduce file count.</p>
+
+<p>Besides this, the clustering framework also provides the flexibility to
asynchronously rewrite data based on specific requirements. We foresee many
other use-cases adopting clustering framework with custom pluggable strategies
to satisfy on-demand data lake management activities. Some such notable
use-cases that are actively being solved using clustering:</p>
+<ol>
+ <li>Rewrite data and encrypt data at rest.</li>
+ <li>Prune unused columns from tables and reduce storage footprint.</li>
+</ol>
+
+ </section>
+
+ <a href="#masthead__inner-wrap" class="back-to-top">Back to top
↑</a>
+
+
+
+
+ </div>
+
+ </article>
+
+</div>
+
+ </div>
+
+ <div class="page__footer">
+ <footer>
+
+<div class="row">
+ <div class="col-lg-12 footer">
+ <p>
+ <table class="table-apache-info">
+ <tr>
+ <td>
+ <a class="footer-link-img" href="https://apache.org">
+ <img width="250px" src="/assets/images/asf_logo.svg" alt="The
Apache Software Foundation">
+ </a>
+ </td>
+ <td>
+ <a style="float: right"
href="https://www.apache.org/events/current-event.html">
+ <img
src="https://www.apache.org/events/current-event-234x60.png" />
+ </a>
+ </td>
+ </tr>
+ </table>
+ </p>
+ <p>
+ <a href="https://www.apache.org/licenses/">License</a> | <a
href="https://www.apache.org/security/">Security</a> | <a
href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
+ </p>
+ <p>
+ Copyright © <span id="copyright-year">2019</span> <a
href="https://apache.org">The Apache Software Foundation</a>, Licensed under
the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License,
Version 2.0</a>.
+ Hudi, Apache and the Apache feather logo are trademarks of The Apache
Software Foundation. <a href="/docs/privacy">Privacy Policy</a>
+ </p>
+ </div>
+</div>
+ </footer>
+ </div>
+
+
+ </body>
+</html>
\ No newline at end of file
diff --git a/content/cn/activity.html b/content/cn/activity.html
index f1473db..eb734ec 100644
--- a/content/cn/activity.html
+++ b/content/cn/activity.html
@@ -178,6 +178,37 @@
+ <section id="2021" class="taxonomy__section">
+ <h2 class="archive__subtitle">2021</h2>
+ <div class="entries-list">
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
+ <a href="/blog/hudi-clustering-intro/" rel="permalink">Optimize Data
lake layout using Clustering in Apache Hudi
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+
+
+ <p class="archive__item-excerpt" itemprop="description">Introduce
clustering feature to change data layout
+</p>
+ </article>
+</div>
+
+
+ </div>
+ </section>
+
<section id="2020" class="taxonomy__section">
<h2 class="archive__subtitle">2020</h2>
<div class="entries-list">
diff --git a/content/sitemap.xml b/content/sitemap.xml
index 4ad54a9..a6de7ea 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -1149,6 +1149,10 @@
<lastmod>2020-12-01T00:00:00-05:00</lastmod>
</url>
<url>
+<loc>https://hudi.apache.org/blog/hudi-clustering-intro/</loc>
+<lastmod>2021-01-27T00:00:00-05:00</lastmod>
+</url>
+<url>
<loc>https://hudi.apache.org/cn/activity</loc>
<lastmod>2019-12-30T14:59:57-05:00</lastmod>
</url>