This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new a73ce5d Travis CI build asf-site
a73ce5d is described below
commit a73ce5dbc557cae4a2b28a7da8ffdba7b17d84d3
Author: CI <[email protected]>
AuthorDate: Sat Dec 19 09:07:21 2020 +0000
Travis CI build asf-site
---
content/activity.html | 24 ++
.../images/blog/hudi-indexes/Dimension20tables.gif | Bin 0 -> 577717 bytes
.../images/blog/hudi-indexes/Event20tables.gif | Bin 0 -> 558858 bytes
.../images/blog/hudi-indexes/Fact20tables.gif | Bin 0 -> 595395 bytes
.../hudi-indexes/Hudi_Index_Blog_Event_table.png | Bin 0 -> 20851 bytes
.../hudi-indexes/Hudi_Index_Blog_Fact_table.png | Bin 0 -> 20910 bytes
.../Hudi_Index_Blog_dimensions_table.png | Bin 0 -> 21011 bytes
.../blog/hudi-indexes/with-and-without-index.png | Bin 0 -> 139025 bytes
content/assets/js/lunr/lunr-store.js | 5 +
content/blog.html | 24 ++
content/blog/hudi-indexing-mechanisms/index.html | 373 +++++++++++++++++++++
content/cn/activity.html | 24 ++
content/sitemap.xml | 4 +
13 files changed, 454 insertions(+)
diff --git a/content/activity.html b/content/activity.html
index f2783f2..bda641f 100644
--- a/content/activity.html
+++ b/content/activity.html
@@ -215,6 +215,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/hudi-indexing-mechanisms/" rel="permalink">Employing
the right indexes for fast updates, deletes
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+
+
+ <p class="archive__item-excerpt" itemprop="description">Detailing
different indexing mechanisms in Hudi and when to use each of them
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/hudi-meets-aws-emr-and-aws-dms/" rel="permalink">Apply
record level changes from relational databases to Amazon S3 data lake using
Apache Hudi on Amazon EMR and AWS Database Migration Service
</a>
diff --git a/content/assets/images/blog/hudi-indexes/Dimension20tables.gif
b/content/assets/images/blog/hudi-indexes/Dimension20tables.gif
new file mode 100644
index 0000000..83ac5cc
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/Dimension20tables.gif differ
diff --git a/content/assets/images/blog/hudi-indexes/Event20tables.gif
b/content/assets/images/blog/hudi-indexes/Event20tables.gif
new file mode 100644
index 0000000..7942bc8
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/Event20tables.gif differ
diff --git a/content/assets/images/blog/hudi-indexes/Fact20tables.gif
b/content/assets/images/blog/hudi-indexes/Fact20tables.gif
new file mode 100644
index 0000000..2db8260
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/Fact20tables.gif differ
diff --git
a/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_Event_table.png
b/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_Event_table.png
new file mode 100644
index 0000000..42da838
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_Event_table.png differ
diff --git
a/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_Fact_table.png
b/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_Fact_table.png
new file mode 100644
index 0000000..1b4837b
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_Fact_table.png differ
diff --git
a/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_dimensions_table.png
b/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_dimensions_table.png
new file mode 100644
index 0000000..550d6ab
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/Hudi_Index_Blog_dimensions_table.png
differ
diff --git a/content/assets/images/blog/hudi-indexes/with-and-without-index.png
b/content/assets/images/blog/hudi-indexes/with-and-without-index.png
new file mode 100644
index 0000000..f1daac6
Binary files /dev/null and
b/content/assets/images/blog/hudi-indexes/with-and-without-index.png differ
diff --git a/content/assets/js/lunr/lunr-store.js
b/content/assets/js/lunr/lunr-store.js
index b1db17b..511e4bf 100644
--- a/content/assets/js/lunr/lunr-store.js
+++ b/content/assets/js/lunr/lunr-store.js
@@ -1204,6 +1204,11 @@ var store = [{
"tags": [],
"url": "https://hudi.apache.org/blog/hudi-meets-aws-emr-and-aws-dms/",
"teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
+ "title": "Employing the right indexes for fast updates, deletes",
+ "excerpt":"Apache Hudi employs an index to locate the file group, that
an update/delete belong to. For Copy-On-Write tables, this enables fast
upsert/delete operations, by avoiding the need to join against the entire
dataset to determine which files to rewrite. For Merge-On-Read tables, this
design allows Hudi to bound the amount...","categories": ["blog"],
+ "tags": [],
+ "url": "https://hudi.apache.org/blog/hudi-indexing-mechanisms/",
+ "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
"title": "Building High-Performance Data Lake Using Apache Hudi and
Alluxio at T3Go",
"excerpt":"Building High-Performance Data Lake Using Apache Hudi and
Alluxio at T3Go T3Go is China’s first platform for smart travel based on the
Internet of Vehicles. In this article, Trevor Zhang and Vino Yang from T3Go
describe the evolution of their data lake architecture, built on cloud-native
or open-source technologies including...","categories": ["blog"],
"tags": [],
diff --git a/content/blog.html b/content/blog.html
index 3253983..943c988 100644
--- a/content/blog.html
+++ b/content/blog.html
@@ -213,6 +213,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/hudi-indexing-mechanisms/" rel="permalink">Employing
the right indexes for fast updates, deletes
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+
+
+ <p class="archive__item-excerpt" itemprop="description">Detailing
different indexing mechanisms in Hudi and when to use each of them
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/hudi-meets-aws-emr-and-aws-dms/" rel="permalink">Apply
record level changes from relational databases to Amazon S3 data lake using
Apache Hudi on Amazon EMR and AWS Database Migration Service
</a>
diff --git a/content/blog/hudi-indexing-mechanisms/index.html
b/content/blog/hudi-indexing-mechanisms/index.html
new file mode 100644
index 0000000..fa47738
--- /dev/null
+++ b/content/blog/hudi-indexing-mechanisms/index.html
@@ -0,0 +1,373 @@
+<!doctype html>
+<html lang="en" class="no-js">
+ <head>
+ <meta charset="utf-8">
+
+<!-- begin _includes/seo.html --><title>Employing the right indexes for fast
updates, deletes - Apache Hudi</title>
+<meta name="description" content="Detailing different indexing mechanisms in
Hudi and when to use each of them">
+
+<meta property="og:type" content="article">
+<meta property="og:locale" content="en_US">
+<meta property="og:site_name" content="">
+<meta property="og:title" content="Employing the right indexes for fast
updates, deletes">
+<meta property="og:url"
content="https://hudi.apache.org/blog/hudi-indexing-mechanisms/">
+
+
+ <meta property="og:description" content="Detailing different indexing
mechanisms in Hudi and when to use each of them">
+
+
+
+
+
+
+
+
+
+
+
+<!-- end _includes/seo.html -->
+
+
+<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title="
Feed">-->
+
+<!-- https://t.co/dKP3o1e -->
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<script>
+ document.documentElement.className =
document.documentElement.className.replace(/\bno-js\b/g, '') + ' js ';
+</script>
+
+<!-- For all browsers -->
+<link rel="stylesheet" href="/assets/css/main.css">
+
+<!--[if IE]>
+ <style>
+ /* old IE unsupported flexbox fixes */
+ .greedy-nav .site-title {
+ padding-right: 3em;
+ }
+ .greedy-nav button {
+ position: absolute;
+ top: 0;
+ right: 0;
+ height: 100%;
+ }
+ </style>
+<![endif]-->
+
+
+
+<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico">
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+<script src="/assets/js/jquery.min.js"></script>
+
+
+<script src="/assets/js/main.min.js"></script>
+
+ </head>
+
+ <body class="layout--single">
+ <!--[if lt IE 9]>
+<div class="notice--danger align-center" style="margin: 0;">You are using an
<strong>outdated</strong> browser. Please <a
href="https://browsehappy.com/">upgrade your browser</a> to improve your
experience.</div>
+<![endif]-->
+
+ <div class="masthead">
+ <div class="masthead__inner-wrap" id="masthead__inner-wrap">
+ <div class="masthead__menu">
+ <nav id="site-nav" class="greedy-nav">
+
+ <a class="site-logo" href="/">
+ <div style="width: 150px; height: 40px">
+ </div>
+ </a>
+
+ <a class="site-title" href="/">
+
+ </a>
+ <ul class="visible-links"><li class="masthead__menu-item">
+ <a href="/docs/quick-start-guide.html" target="_self"
>Documentation</a>
+ </li><li class="masthead__menu-item">
+ <a href="/community.html" target="_self" >Community</a>
+ </li><li class="masthead__menu-item">
+ <a href="/blog.html" target="_self" >Blog</a>
+ </li><li class="masthead__menu-item">
+ <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ"
target="_blank" >FAQ</a>
+ </li><li class="masthead__menu-item">
+ <a href="/releases.html" target="_self" >Releases</a>
+ </li></ul>
+ <button class="greedy-nav__toggle hidden" type="button">
+ <span class="visually-hidden">Toggle menu</span>
+ <div class="navicon"></div>
+ </button>
+ <ul class="hidden-links hidden"></ul>
+ </nav>
+ </div>
+ </div>
+</div>
+<!--
+<p class="notice--warning" style="margin: 0 !important; text-align: center
!important;"><strong>Note:</strong> This site is work in progress, if you
notice any issues, please <a target="_blank"
href="https://github.com/apache/hudi/issues">Report on Issue</a>.
+ Click <a href="/"> here</a> back to old site.</p>
+-->
+
+ <div class="initial-content">
+ <div id="main" role="main">
+
+
+ <div class="sidebar sticky">
+
+
+ <div itemscope itemtype="https://schema.org/Person">
+
+ <div class="author__content">
+
+ <h3 class="author__name" itemprop="name">Quick Links</h3>
+
+
+ <div class="author__bio" itemprop="description">
+ <p>Hudi <em>ingests</em> & <em>manages</em> storage of large
analytical datasets over DFS.</p>
+
+ </div>
+
+ </div>
+
+ <div class="author__urls-wrapper">
+ <ul class="author__urls social-icons">
+
+
+ <li><a href="/docs/quick-start-guide" target="_self" rel="nofollow
noopener noreferrer"><i class="fa fa-book" aria-hidden="true"></i>
Documentation</a></li>
+
+
+
+ <li><a href="https://cwiki.apache.org/confluence/display/HUDI"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-wikipedia-w"
aria-hidden="true"></i> Technical Wiki</a></li>
+
+
+
+ <li><a href="/contributing" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-thumbs-o-up" aria-hidden="true"></i> Contribution
Guide</a></li>
+
+
+
+ <li><a
href="https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-slack"
aria-hidden="true"></i> Join on Slack</a></li>
+
+
+
+ <li><a href="https://github.com/apache/hudi" target="_blank"
rel="nofollow noopener noreferrer"><i class="fa fa-github"
aria-hidden="true"></i> Fork on GitHub</a></li>
+
+
+
+ <li><a href="https://issues.apache.org/jira/projects/HUDI/summary"
target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-navicon"
aria-hidden="true"></i> Report Issues</a></li>
+
+
+
+ <li><a href="/security" target="_self" rel="nofollow noopener
noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Security
Issues</a></li>
+
+
+
+
+ </ul>
+ </div>
+</div>
+
+
+
+
+ </div>
+
+
+ <article class="page" itemscope itemtype="https://schema.org/CreativeWork">
+ <!-- Look the author details up from the site config. -->
+
+
+ <div class="page__inner-wrap">
+
+ <header>
+ <h1 id="page-title" class="page__title"
itemprop="headline">Employing the right indexes for fast updates, deletes
+</h1>
+ <!-- Output author details if some exist. -->
+
+ </header>
+
+
+ <section class="page__content" itemprop="text">
+
+ <style>
+ .page {
+ padding-right: 0 !important;
+ }
+ </style>
+
+ <p>Apache Hudi employs an index to locate the file group that an
update/delete belongs to. For Copy-On-Write tables, this enables
+fast upsert/delete operations, by avoiding the need to join against the entire
dataset to determine which files to rewrite.
+For Merge-On-Read tables, this design allows Hudi to bound the amount of
records any given base file needs to be merged against.
+Specifically, a given base file needs to be merged only against updates for
records that are part of that base file. In contrast,
+designs without an indexing component like <a
href="https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions">Apache
Hive ACID</a>,
+could end up having to merge all the base files against all incoming
updates/delete records.</p>
+
+<p>At a high level, an index maps a record key + an optional partition path to
a file group ID on storage (explained
+more in detail <a href="/docs/concepts.html">here</a>) and during write
operations, we lookup this mapping to route an incoming update/delete
+to a log file attached to the base file (MOR) or to the latest base file that
now needs to be merged against (COW). The index also enables
+Hudi to enforce unique constraints based on the record keys.</p>
+
+<p><img src="/assets/images/blog/hudi-indexes/with-and-without-index.png"
alt="Merge cost of updates against base files, with and without an index" />
+<em>Figure: Comparison of merge cost for updates (yellow blocks) against base
files (white blocks)</em></p>
+
+<p>Given that Hudi already supports a few different indexing techniques and is
also continuously improving/adding more to its toolkit, the rest of the blog
+attempts to explain different categories of workloads, from our experience and
suggests what index types to use for each. We will also interlace
+commentary on existing limitations, upcoming work and optimizations/tradeoffs
along the way.</p>
+
+<h2 id="index-types-in-hudi">Index Types in Hudi</h2>
+
+<p>Currently, Hudi supports the following indexing options.</p>
+
+<ul>
+ <li><strong>Bloom Index (default):</strong> Employs bloom filters built out
of the record keys, optionally also pruning candidate files using record key
ranges.</li>
+ <li><strong>Simple Index:</strong> Performs a lean join of the incoming
update/delete records against keys extracted from the table on storage.</li>
+ <li><strong>HBase Index:</strong> Manages the index mapping in an external
Apache HBase table.</li>
+</ul>
+
+<p>Writers can pick one of these options using <code
class="highlighter-rouge">hoodie.index.type</code> config option. Additionally,
a custom index implementation can also be employed
+using <code class="highlighter-rouge">hoodie.index.class</code> and supplying
a subclass of <code class="highlighter-rouge">SparkHoodieIndex</code> (for
Apache Spark writers)</p>
+
+<p>Another key aspect worth understanding is the difference between global and
non-global indexes. Both bloom and simple index have
+global options - <code
class="highlighter-rouge">hoodie.index.type=GLOBAL_BLOOM</code> and <code
class="highlighter-rouge">hoodie.index.type=GLOBAL_SIMPLE</code> -
respectively. HBase index is by nature a global index.</p>
+
+<ul>
+ <li>
+ <p><strong>Global index:</strong> Global indexes enforce uniqueness of
keys across all partitions of a table, i.e. they guarantee that exactly
+one record exists in the table for a given record key. Global indexes offer
stronger guarantees, but the update/delete cost grows
+with size of the table <code class="highlighter-rouge">O(size of
table)</code>, which might still be acceptable for smaller tables.</p>
+ </li>
+ <li>
+ <p><strong>Non Global index:</strong> On the other hand, the default index
implementations enforce this constraint only within a specific partition.
+As one might imagine, non-global indexes depend on the writer to provide the
same consistent partition path for a given record key during update/delete,
+but can deliver much better performance since the index lookup operation
becomes <code class="highlighter-rouge">O(number of records
updated/deleted)</code> and
+scales well with write volume.</p>
+ </li>
+</ul>
+
+<p>Since data comes in at different volumes, velocity and has different access
patterns, different indices could be used for different workloads.
+Next, let’s walk through some typical workloads and see how to leverage the
right Hudi index for such use-cases.</p>
+
+<h2 id="workload-late-arriving-updates-to-fact-tables">Workload: Late arriving
updates to fact tables</h2>
+
+<p>Many companies store large volumes of transactional data in NoSQL data
stores. For example, trip tables in case of ride-sharing, buying and selling of
shares,
+orders in an e-commerce site. These tables are usually ever growing with
random updates on most recent data with long tail updates going to older data,
either
+due to transactions settling at a later date/data corrections. In other words,
most updates go into the latest partitions with few updates going to older
ones.</p>
+
+<p><img src="/assets/images/blog/hudi-indexes/Fact20tables.gif" alt="Fact
table" />
+<em>Figure: Typical update pattern for Fact tables</em></p>
+
+<p>For such workloads, the <code class="highlighter-rouge">BLOOM</code> index
performs well, since index look-up will prune a lot of data files based on a
well-sized bloom filter.
+Additionally, if the keys can be constructed such that they have a certain
ordering, the number of files to be compared is further reduced by range
pruning.
+Hudi constructs an interval tree with all the file key ranges and efficiently
filters out the files that don’t match any key ranges in the updated/deleted
records.</p>
+
+<p>In order to efficiently compare incoming record keys against bloom filters
i.e. with a minimal number of bloom filter reads and uniform distribution of work
across
+the executors, Hudi leverages caching of input records and employs a custom
partitioner that can iron out data skews using statistics. At times, if the
bloom filter
+false positive ratio is high, it could increase the amount of data shuffled to
perform the lookup. Hudi supports dynamic bloom filters
+(enabled using <code
class="highlighter-rouge">hoodie.bloom.index.filter.type=DYNAMIC_V0</code>),
which adjusts its size based on the number of records stored in a given file to
deliver the
+configured false positive ratio.</p>
+
+<p>In the near future, we plan to introduce a much speedier version of the
BLOOM index that tracks bloom filters/ranges in an internal Hudi metadata
table, indexed for fast
+point lookups. This would avoid any current limitations around reading bloom
filters/ranges from the base files themselves, to perform the lookup. (see
+<a
href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+15%3A+HUDI+File+Listing+and+Query+Planning+Improvements?src=contextnavpagetreemode">RFC-15</a>
for the general design)</p>
+
+<h2 id="workload-duplicated-records-in-event-tables">Workload: Duplicated
records in event tables</h2>
+
+<p>Event Streaming is everywhere. Events coming from Apache Kafka or similar
message bus are typically 10-100x the size of fact tables and often treat
“time” (event’s arrival time/processing
+time) as a first class citizen. For example, IoT event streams, click stream data,
ad impressions etc. Inserts and updates only span the last few partitions as
these are mostly append only data.
+Given duplicate events can be introduced anywhere in the end-end pipeline,
de-duplication before storing on the data lake is a common requirement.</p>
+
+<p><img src="/assets/images/blog/hudi-indexes/Event20tables.gif" alt="Event
table" />
+<em>Figure showing the spread of updates for Event table.</em></p>
+
+<p>In general, this is a very challenging problem to solve at lower cost.
Although we could even employ a key-value store to perform this de-duplication
à la the HBASE index, the index storage
+costs would grow linearly with the number of events and thus can be prohibitively
expensive. In fact, <code class="highlighter-rouge">BLOOM</code> index with
range pruning is the optimal solution here. One can leverage the fact
+that time is often a first class citizen and construct a key such as <code
class="highlighter-rouge">event_ts + event_id</code> such that the inserted
records have monotonically increasing keys. This yields great returns
+by pruning large amounts of files even within the latest table partitions.</p>
+
+<h2
id="workload-completely-random-updatesdeletes-to-a-dimension-table">Workload:
Completely random updates/deletes to a dimension table</h2>
+
+<p>These types of tables usually contain high dimensional data and hold
reference data e.g user profile, merchant information. These are high fidelity
tables where the updates are often small but also spread
+across a lot of partitions and data files ranging across the dataset from old
to new. Often times, these tables are also un-partitioned, since there is also
not a good way to partition these tables.</p>
+
+<p><img src="/assets/images/blog/hudi-indexes/Dimension20tables.gif"
alt="Dimensions table" />
+<em>Figure showing the spread of updates for Dimensions table.</em></p>
+
+<p>As discussed before, the <code class="highlighter-rouge">BLOOM</code> index
may not yield benefits if a good number of files cannot be pruned out by
comparing ranges/filters. In such a random write workload, updates end up
touching
+most files within the table and thus bloom filters will typically indicate
a true positive for all files based on some incoming update. Consequently, we
would end up comparing ranges/filter, only
+to finally check the incoming updates against all files. The <code
class="highlighter-rouge">SIMPLE</code> Index will be a better fit as it does
not do any upfront pruning, but directly joins with the fields of interest
from every data file.
+<code class="highlighter-rouge">HBASE</code> index can be employed, if the
operational overhead is acceptable and would provide much better lookup times
for these tables.</p>
+
+<p>When using a global index, users should also consider setting <code
class="highlighter-rouge">hoodie.bloom.index.update.partition.path=true</code>
or <code
class="highlighter-rouge">hoodie.simple.index.update.partition.path=true</code>
to deal with cases where the
+partition path value could change due to an update e.g users table partitioned
by home city; user relocates to a different city. These tables are also
excellent candidates for the Merge-On-Read table type.</p>
+
+<p>Going forward, we plan to build <a
href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+08+%3A+Record+level+indexing+mechanisms+for+Hudi+datasets?src=contextnavpagetreemode">record
level indexing</a>
+right within Hudi, which will improve the index look-up time and will also
avoid additional overhead of maintaining an external system like hbase.</p>
+
+<h2 id="summary">Summary</h2>
+
+<p>Without the indexing capabilities in Hudi, it would not have been possible to
make upserts/deletes happen at <a
href="https://eng.uber.com/apache-hudi-graduation/">very large scales</a>.
+Hopefully this post gave you good enough context on the indexing mechanisms
today and how different tradeoffs play out.</p>
+
+<p>Some interesting work underway in this area:</p>
+
+<ul>
+ <li>Apache Flink based writing with a RocksDB state store backed indexing
mechanism, unlocking true streaming upserts on data lakes.</li>
+ <li>A brand new MetadataIndex, which reimagines the bloom index today on top
of the metadata table in Hudi.</li>
+ <li>Record level index implementation, as a secondary index using another
Hudi table.</li>
+</ul>
+
+<p>Going forward, this will remain an area of active investment for the
project. We are always looking for contributors who can drive these roadmap
items forward.
+Please <a href="/community.html">engage</a> with our community if you want to
get involved.</p>
+
+
+ </section>
+
+ <a href="#masthead__inner-wrap" class="back-to-top">Back to top
↑</a>
+
+
+
+
+ </div>
+
+ </article>
+
+</div>
+
+ </div>
+
+ <div class="page__footer">
+ <footer>
+
+<div class="row">
+ <div class="col-lg-12 footer">
+ <p>
+ <table class="table-apache-info">
+ <tr>
+ <td>
+ <a class="footer-link-img" href="https://apache.org">
+ <img width="250px" src="/assets/images/asf_logo.svg" alt="The
Apache Software Foundation">
+ </a>
+ </td>
+ <td>
+ <a style="float: right"
href="https://www.apache.org/events/current-event.html">
+ <img
src="https://www.apache.org/events/current-event-234x60.png" />
+ </a>
+ </td>
+ </tr>
+ </table>
+ </p>
+ <p>
+ <a href="https://www.apache.org/licenses/">License</a> | <a
href="https://www.apache.org/security/">Security</a> | <a
href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
+ </p>
+ <p>
+ Copyright © <span id="copyright-year">2019</span> <a
href="https://apache.org">The Apache Software Foundation</a>, Licensed under
the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License,
Version 2.0</a>.
+ Hudi, Apache and the Apache feather logo are trademarks of The Apache
Software Foundation. <a href="/docs/privacy">Privacy Policy</a>
+ </p>
+ </div>
+</div>
+ </footer>
+ </div>
+
+
+ </body>
+</html>
\ No newline at end of file
diff --git a/content/cn/activity.html b/content/cn/activity.html
index c87c2fe..b46a04b 100644
--- a/content/cn/activity.html
+++ b/content/cn/activity.html
@@ -215,6 +215,30 @@
<h2 class="archive__item-title" itemprop="headline">
+ <a href="/blog/hudi-indexing-mechanisms/" rel="permalink">Employing
the right indexes for fast updates, deletes
+</a>
+
+ </h2>
+ <!-- Look the author details up from the site config. -->
+
+ <!-- Output author details if some exist. -->
+
+
+ <p class="archive__item-excerpt" itemprop="description">Detailing
different indexing mechanisms in Hudi and when to use each of them
+</p>
+ </article>
+</div>
+
+
+
+
+
+
+<div class="list__item">
+ <article class="archive__item" itemscope
itemtype="https://schema.org/CreativeWork">
+
+ <h2 class="archive__item-title" itemprop="headline">
+
<a href="/blog/hudi-meets-aws-emr-and-aws-dms/" rel="permalink">Apply
record level changes from relational databases to Amazon S3 data lake using
Apache Hudi on Amazon EMR and AWS Database Migration Service
</a>
diff --git a/content/sitemap.xml b/content/sitemap.xml
index 432cf35..d2b7c44 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -965,6 +965,10 @@
<lastmod>2020-10-19T00:00:00-04:00</lastmod>
</url>
<url>
+<loc>https://hudi.apache.org/blog/hudi-indexing-mechanisms/</loc>
+<lastmod>2020-11-11T00:00:00-05:00</lastmod>
+</url>
+<url>
<loc>https://hudi.apache.org/blog/high-perf-data-lake-with-hudi-and-alluxio-t3go/</loc>
<lastmod>2020-12-01T00:00:00-05:00</lastmod>
</url>