This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 26ef788 Travis CI build asf-site
26ef788 is described below
commit 26ef788715ac2a7024413c50b8a043695282478c
Author: CI <[email protected]>
AuthorDate: Tue Mar 23 22:41:07 2021 +0000
Travis CI build asf-site
---
content/assets/js/lunr/lunr-store.js | 5 +
content/docs/0.5.3-cloud.html | 11 +
content/docs/azure_hoodie.html | 11 +
content/docs/cloud.html | 11 +
content/docs/comparison.html | 11 +
content/docs/concurrency_control.html | 568 ++++++++++++++++++++++++++++++
content/docs/configurations.html | 73 ++++
content/docs/cos_hoodie.html | 11 +
content/docs/deployment.html | 11 +
content/docs/docker_demo.html | 11 +
content/docs/docs-versions.html | 11 +
content/docs/flink-quick-start-guide.html | 11 +
content/docs/gcs_hoodie.html | 11 +
content/docs/ibm_cos_hoodie.html | 11 +
content/docs/metrics.html | 11 +
content/docs/migration_guide.html | 11 +
content/docs/oss_hoodie.html | 11 +
content/docs/overview.html | 11 +
content/docs/performance.html | 11 +
content/docs/powered_by.html | 11 +
content/docs/privacy.html | 11 +
content/docs/querying_data.html | 11 +
content/docs/s3_hoodie.html | 11 +
content/docs/spark_quick-start-guide.html | 11 +
content/docs/structure.html | 11 +
content/docs/use_cases.html | 11 +
content/docs/writing_data.html | 11 +
content/sitemap.xml | 4 +
28 files changed, 914 insertions(+)
diff --git a/content/assets/js/lunr/lunr-store.js
b/content/assets/js/lunr/lunr-store.js
index f13c4df..4fa69d9 100644
--- a/content/assets/js/lunr/lunr-store.js
+++ b/content/assets/js/lunr/lunr-store.js
@@ -1324,6 +1324,11 @@ var store = [{
"tags": [],
"url": "https://hudi.apache.org/docs/metrics.html",
"teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
+ "title": "Concurrent Writes to Hudi Tables",
+ "excerpt":"In this section, we will cover Hudi’s concurrency model and
describe ways to ingest data into a Hudi Table from multiple writers; using the
DeltaStreamer tool as well as using the Hudi datasource. Supported Concurrency
Controls MVCC : Hudi table services such as compaction, cleaning, clustering
leverage Multi Version Concurrency...","categories": [],
+ "tags": [],
+ "url": "https://hudi.apache.org/docs/concurrency_control.html",
+ "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{
"title": "Privacy Policy",
"excerpt":"Information about your use of this website is collected
using server access logs and a tracking cookie. The collected information
consists of the following: The IP address from which you access the website;
The type of browser and operating system you use to access our site; The date
and time...","categories": [],
"tags": [],
diff --git a/content/docs/0.5.3-cloud.html b/content/docs/0.5.3-cloud.html
index b045e52..405c630 100644
--- a/content/docs/0.5.3-cloud.html
+++ b/content/docs/0.5.3-cloud.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/azure_hoodie.html b/content/docs/azure_hoodie.html
index a781bde..4427c6e 100644
--- a/content/docs/azure_hoodie.html
+++ b/content/docs/azure_hoodie.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/cloud.html b/content/docs/cloud.html
index 99a378f..2a5b12a 100644
--- a/content/docs/cloud.html
+++ b/content/docs/cloud.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/comparison.html b/content/docs/comparison.html
index 7dcc624..99651ea 100644
--- a/content/docs/comparison.html
+++ b/content/docs/comparison.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/concurrency_control.html
b/content/docs/concurrency_control.html
new file mode 100644
index 0000000..af97c0f
--- /dev/null
+++ b/content/docs/concurrency_control.html
@@ -0,0 +1,568 @@
+<!doctype html>
+<html lang="en" class="no-js">
+ <head>
+ <meta charset="utf-8">
+
+<!-- begin _includes/seo.html --><title>Concurrent Writes to Hudi Tables -
Apache Hudi</title>
+<meta name="description" content="In this section, we will cover Hudi’s
concurrency model and describe ways to ingest data into a Hudi Table from
multiple writers; using the DeltaStreamer tool as well as using the Hudi
datasource.">
+
+<meta property="og:type" content="article">
+<meta property="og:locale" content="en_US">
+<meta property="og:site_name" content="">
+<meta property="og:title" content="Concurrent Writes to Hudi Tables">
+<meta property="og:url"
content="https://hudi.apache.org/docs/concurrency_control.html">
+
+
+ <meta property="og:description" content="In this section, we will cover
Hudi’s concurrency model and describe ways to ingest data into a Hudi Table
from multiple writers; using the DeltaStreamer tool as well as using the Hudi
datasource.">
+
+
+
+
+
+ <meta property="article:modified_time" content="2021-03-19T15:59:57-04:00">
+
+
+
+
+
+
+
+<!-- end _includes/seo.html -->
+
+
+<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title="
Feed">-->
+
+<!-- https://t.co/dKP3o1e -->
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<script>
+ document.documentElement.className =
document.documentElement.className.replace(/\bno-js\b/g, '') + ' js ';
+</script>
+
+<!-- For all browsers -->
+<link rel="stylesheet" href="/assets/css/main.css">
+
+<!--[if IE]>
+ <style>
+ /* old IE unsupported flexbox fixes */
+ .greedy-nav .site-title {
+ padding-right: 3em;
+ }
+ .greedy-nav button {
+ position: absolute;
+ top: 0;
+ right: 0;
+ height: 100%;
+ }
+ </style>
+<![endif]-->
+
+
+
+<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico">
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+<script src="/assets/js/jquery.min.js"></script>
+
+
+<script src="/assets/js/main.min.js"></script>
+
+ </head>
+
+ <body class="layout--single">
+ <!--[if lt IE 9]>
+<div class="notice--danger align-center" style="margin: 0;">You are using an
<strong>outdated</strong> browser. Please <a
href="https://browsehappy.com/">upgrade your browser</a> to improve your
experience.</div>
+<![endif]-->
+
+ <div class="masthead">
+ <div class="masthead__inner-wrap" id="masthead__inner-wrap">
+ <div class="masthead__menu">
+ <nav id="site-nav" class="greedy-nav">
+
+ <a class="site-logo" href="/">
+ <div style="width: 150px; height: 40px">
+ </div>
+ </a>
+
+ <a class="site-title" href="/">
+
+ </a>
+ <ul class="visible-links"><li class="masthead__menu-item">
+ <a href="/docs/spark_quick-start-guide.html" target="_self"
>Documentation</a>
+ </li><li class="masthead__menu-item">
+ <a href="/community.html" target="_self" >Community</a>
+ </li><li class="masthead__menu-item">
+ <a href="/blog.html" target="_self" >Blog</a>
+ </li><li class="masthead__menu-item">
+ <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ"
target="_blank" >FAQ</a>
+ </li><li class="masthead__menu-item">
+ <a href="/docs/powered_by.html" target="_self" >Powered By</a>
+ </li><li class="masthead__menu-item">
+ <a href="/releases.html" target="_self" >Releases</a>
+ </li></ul>
+ <button class="greedy-nav__toggle hidden" type="button">
+ <span class="visually-hidden">Toggle menu</span>
+ <div class="navicon"></div>
+ </button>
+ <ul class="hidden-links hidden"></ul>
+ </nav>
+ </div>
+ </div>
+</div>
+<!--
+<p class="notice--warning" style="margin: 0 !important; text-align: center
!important;"><strong>Note:</strong> This site is work in progress, if you
notice any issues, please <a target="_blank"
href="https://github.com/apache/hudi/issues">Report on Issue</a>.
+ Click <a href="/"> here</a> back to old site.</p>
+-->
+
+ <div class="initial-content">
+ <div id="main" role="main">
+
+
+ <div class="sidebar sticky">
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<nav class="nav__list">
+
+ <input id="ac-toc" name="accordion-toc" type="checkbox" />
+ <label for="ac-toc">Toggle Menu</label>
+ <ul class="nav__items">
+
+ <li>
+
+ <span class="nav__sub-title">Documentation</span>
+
+
+
+ <ul>
+
+
+
+
+
+
+
+
+ <li><a href="/docs/overview.html" class="">Overview</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/spark_quick-start-guide.html" class="">Quick
Start(Spark)</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/flink-quick-start-guide.html" class="">Quick
Start(Flink)</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/use_cases.html" class="">Use Cases</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/writing_data.html" class="">Writing
Data</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/concurrency_control.html"
class="active">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/configurations.html"
class="">Configuration</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/performance.html"
class="">Performance</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/deployment.html" class="">Deployment</a></li>
+
+
+
+ </ul>
+
+ </li>
+
+ <li>
+
+ <span class="nav__sub-title">Resources</span>
+
+
+
+ <ul>
+
+
+
+
+
+
+
+
+ <li><a href="/docs/docker_demo.html" class="">Dockerized
Demo</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/cloud.html" class="">Storage
Configuration</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/metrics.html" class="">Metrics</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/docs-versions.html" class="">Docs
Versions</a></li>
+
+
+
+
+
+
+
+
+
+
+ <li><a href="/docs/privacy.html" class="">Privacy Policy</a></li>
+
+
+
+ </ul>
+
+ </li>
+
+ </ul>
+</nav>
+
+
+
+
+ </div>
+
+
+ <article class="page" itemscope itemtype="https://schema.org/CreativeWork">
+ <!-- Look the author details up from the site config. -->
+
+
+ <div class="page__inner-wrap">
+
+ <header>
+ <h1 id="page-title" class="page__title"
itemprop="headline">Concurrent Writes to Hudi Tables
+</h1>
+ <!-- Output author details if some exist. -->
+
+ </header>
+
+
+ <section class="page__content" itemprop="text">
+
+ <aside class="sidebar__right sticky">
+ <nav class="toc">
+ <header><h4 class="nav__title"><i class="fas fa-file-alt"></i> IN
THIS PAGE</h4></header>
+ <ul class="toc__menu">
+ <li><a href="#supported-concurrency-controls">Supported Concurrency
Controls</a></li>
+ <li><a href="#single-writer-guarantees">Single Writer Guarantees</a></li>
+ <li><a href="#multi-writer-guarantees">Multi Writer Guarantees</a></li>
+ <li><a href="#enabling-multi-writing">Enabling Multi Writing</a></li>
+ <li><a href="#datasource-writer">Datasource Writer</a></li>
+ <li><a href="#deltastreamer">DeltaStreamer</a></li>
+ <li><a href="#best-practices-when-using-optimistic-concurrency-control">Best
Practices when using Optimistic Concurrency Control</a></li>
+ <li><a href="#disabling-multi-writing">Disabling Multi Writing</a></li>
+</ul>
+ </nav>
+ </aside>
+
+ <p>In this section, we will cover Hudi’s concurrency model and
describe ways to ingest data into a Hudi Table from multiple writers; using the
<a href="#deltastreamer">DeltaStreamer</a> tool as well as
+using the <a href="#datasource-writer">Hudi datasource</a>.</p>
+
+<h2 id="supported-concurrency-controls">Supported Concurrency Controls</h2>
+
+<ul>
+ <li>
+ <p><strong>MVCC</strong> : Hudi table services such as compaction,
cleaning, clustering leverage Multi Version Concurrency Control to provide
snapshot isolation
+between multiple table service writers and readers. Additionally, using MVCC,
Hudi provides snapshot isolation between an ingestion writer and multiple
concurrent readers.
+With this model, Hudi supports running any number of table service jobs
concurrently, without any concurrency conflict.
+This is made possible by ensuring that scheduling plans of such table services
always happen in single-writer mode, which ensures no conflicts and avoids race
conditions.</p>
+ </li>
+ <li>
+ <p><strong>[NEW] OPTIMISTIC CONCURRENCY</strong> : Write operations such
as the ones described above (UPSERT, INSERT, etc.) leverage optimistic
concurrency control to enable multiple ingestion writers to
+the same Hudi Table. Hudi supports <code class="highlighter-rouge">file level
OCC</code>, i.e., for any 2 commits (or writers) happening to the same table,
if they do not have writes to overlapping files being changed, both writers are
allowed to succeed.
+This feature is currently <em>experimental</em> and requires either Zookeeper
or HiveMetastore to acquire locks.</p>
+ </li>
+</ul>
+
+<p>It may be helpful to understand the different guarantees provided by <a
href="/docs/writing_data.html#write-operations">write operations</a> via Hudi
datasource or the delta streamer.</p>
+
+<h2 id="single-writer-guarantees">Single Writer Guarantees</h2>
+
+<ul>
+ <li><em>UPSERT Guarantee</em>: The target table will NEVER show
duplicates.</li>
+ <li><em>INSERT Guarantee</em>: The target table will NEVER have duplicates
if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is
enabled.</li>
+ <li><em>BULK_INSERT Guarantee</em>: The target table will NEVER have
duplicates if <a
href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is
enabled.</li>
+ <li><em>INCREMENTAL PULL Guarantee</em>: Data consumption and checkpoints
are NEVER out of order.</li>
+</ul>
+
+<h2 id="multi-writer-guarantees">Multi Writer Guarantees</h2>
+
+<p>With multiple writers using OCC, some of the above guarantees change as
follows</p>
+
+<ul>
+ <li><em>UPSERT Guarantee</em>: The target table will NEVER show
duplicates.</li>
+ <li><em>INSERT Guarantee</em>: The target table MIGHT have duplicates even
if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is
enabled.</li>
+ <li><em>BULK_INSERT Guarantee</em>: The target table MIGHT have duplicates
even if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a>
is enabled.</li>
+ <li><em>INCREMENTAL PULL Guarantee</em>: Data consumption and checkpoints
MIGHT be out of order due to multiple writer jobs finishing at different
times.</li>
+</ul>
+
+<h2 id="enabling-multi-writing">Enabling Multi Writing</h2>
+
+<p>The following properties must be set to turn on
optimistic concurrency control.</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>hoodie.write.concurrency.mode=optimistic_concurrency_control
+hoodie.failed.writes.cleaner.policy=LAZY
+hoodie.writer.lock.provider=&lt;lock-provider-classname&gt;
+</code></pre></div></div>
+
+<p>There are 2 different server based lock providers that require different
configuration to be set.</p>
+
+<p><strong><code class="highlighter-rouge">Zookeeper</code></strong> based
lock provider</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>hoodie.writer.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
+hoodie.writer.lock.zookeeper.url
+hoodie.writer.lock.zookeeper.port
+hoodie.writer.lock.wait_time_ms
+hoodie.writer.lock.num_retries
+hoodie.writer.lock.lock_key
+hoodie.writer.lock.zookeeper.zk_base_path
+</code></pre></div></div>
+
+<p><strong><code class="highlighter-rouge">HiveMetastore</code></strong> based
lock provider</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>hoodie.writer.lock.provider=org.apache.hudi.hive.HiveMetastoreBasedLockProvider
+hoodie.writer.lock.hivemetastore.database
+hoodie.writer.lock.hivemetastore.table
+hoodie.writer.lock.wait_time_ms
+hoodie.writer.lock.num_retries
+</code></pre></div></div>
+
+<p><code class="highlighter-rouge">The HiveMetastore URIs are picked up from
the hadoop configuration file loaded during runtime.</code></p>
+
+<h2 id="datasource-writer">Datasource Writer</h2>
+
+<p>The <code class="highlighter-rouge">hudi-spark</code> module offers the
DataSource API to write (and read) a Spark DataFrame into a Hudi table.</p>
+
+<p>Following is an example of how to use optimistic_concurrency_control via
spark datasource</p>
+
+<div class="language-java highlighter-rouge"><div class="highlight"><pre
class="highlight"><code><span class="n">inputDF</span><span
class="o">.</span><span class="na">write</span><span class="o">.</span><span
class="na">format</span><span class="o">(</span><span
class="s">"hudi"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">options</span><span
class="o">(</span><span class="n">getQuickstartWriteConfigs</span><span
class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="no">PRECOMBINE_FIELD_OPT_KEY</span><span
class="o">,</span> <span class="s">"ts"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span
class="s">"hoodie.failed.writes.cleaner.policy"</span><span class="o">,</span>
<span class="s">"LAZY"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="s">"hoodie.write.concurrency.mode"</span><span
class="o">,</span> <span class="s">"optimistic_concurrency_control"</span><span
class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span
class="s">"hoodie.writer.lock.zookeeper.url"</span><span class="o">,</span>
<span class="s">"zookeeper"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span
class="s">"hoodie.writer.lock.zookeeper.port"</span><span class="o">,</span>
<span class="s">"2181"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="s">"hoodie.writer.lock.wait_time_ms"</span><span
class="o">,</span> <span class="s">"12000"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="s">"hoodie.writer.lock.num_retries"</span><span
class="o">,</span> <span class="s">"2"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="s">"hoodie.writer.lock.lock_key"</span><span
class="o">,</span> <span class="s">"test_table"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span
class="s">"hoodie.writer.lock.zookeeper.zk_base_path"</span><span
class="o">,</span> <span class="s">"/test"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="no">RECORDKEY_FIELD_OPT_KEY</span><span
class="o">,</span> <span class="s">"uuid"</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="no">PARTITIONPATH_FIELD_OPT_KEY</span><span
class="o">,</span> <span class="s">"partitionpath"</span><span
class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span
class="o">(</span><span class="no">TABLE_NAME</span><span class="o">,</span>
<span class="n">tableName</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">mode</span><span
class="o">(</span><span class="nc">Overwrite</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">save</span><span
class="o">(</span><span class="n">basePath</span><span class="o">)</span>
+</code></pre></div></div>
+
+<h2 id="deltastreamer">DeltaStreamer</h2>
+
+<p>The <code class="highlighter-rouge">HoodieDeltaStreamer</code> utility
(part of hudi-utilities-bundle) provides ways to ingest from different sources
such as DFS or Kafka, with the following capabilities.</p>
+
+<p>Using optimistic_concurrency_control via delta streamer requires adding the
above configs to the properties file that is passed to the
+job. For example, adding the configs to the kafka-source.properties file and
passing them to deltastreamer will enable optimistic concurrency.
+A deltastreamer job can then be triggered as follows:</p>
+
+<div class="language-java highlighter-rouge"><div class="highlight"><pre
class="highlight"><code><span class="o">[</span><span
class="n">hoodie</span><span class="o">]</span><span class="err">$</span> <span
class="n">spark</span><span class="o">-</span><span class="n">submit</span>
<span class="o">--</span><span class="kd">class</span> <span
class="nc">org</span><span class="o">.</span><span
class="na">apache</span><span class="o">.</span><span
class="na">hudi</span><span class="o">.</sp [...]
+ <span class="o">--</span><span class="n">props</span> <span
class="nl">file:</span><span
class="c1">//${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties
\</span>
+ <span class="o">--</span><span class="n">schemaprovider</span><span
class="o">-</span><span class="kd">class</span> <span
class="nc">org</span><span class="o">.</span><span
class="na">apache</span><span class="o">.</span><span
class="na">hudi</span><span class="o">.</span><span
class="na">utilities</span><span class="o">.</span><span
class="na">schema</span><span class="o">.</span><span
class="na">SchemaRegistryProvider</span> <span class="err">\</span>
+ <span class="o">--</span><span class="n">source</span><span
class="o">-</span><span class="kd">class</span> <span
class="nc">org</span><span class="o">.</span><span
class="na">apache</span><span class="o">.</span><span
class="na">hudi</span><span class="o">.</span><span
class="na">utilities</span><span class="o">.</span><span
class="na">sources</span><span class="o">.</span><span
class="na">AvroKafkaSource</span> <span class="err">\</span>
+ <span class="o">--</span><span class="n">source</span><span
class="o">-</span><span class="n">ordering</span><span class="o">-</span><span
class="n">field</span> <span class="n">impresssiontime</span> <span
class="err">\</span>
+ <span class="o">--</span><span class="n">target</span><span
class="o">-</span><span class="n">base</span><span class="o">-</span><span
class="n">path</span> <span class="nl">file:</span><span
class="err">\</span><span class="o">/</span><span class="err">\</span><span
class="o">/</span><span class="err">\</span><span class="o">/</span><span
class="n">tmp</span><span class="o">/</span><span class="n">hudi</span><span
class="o">-</span><span class="n">deltastreamer</span><span class="o">- [...]
+ <span class="o">--</span><span class="n">target</span><span
class="o">-</span><span class="n">table</span> <span class="n">uber</span><span
class="o">.</span><span class="na">impressions</span> <span class="err">\</span>
+ <span class="o">--</span><span class="n">op</span> <span
class="no">BULK_INSERT</span>
+</code></pre></div></div>
+
+<h2 id="best-practices-when-using-optimistic-concurrency-control">Best
Practices when using Optimistic Concurrency Control</h2>
+
+<p>Concurrent writing to Hudi tables requires acquiring a lock with either
Zookeeper or HiveMetastore. For several reasons, you might want to configure
retries to allow your application to acquire the lock:</p>
+<ol>
+ <li>Network connectivity or excessive load on servers increasing time for
lock acquisition resulting in timeouts</li>
+ <li>Running a large number of concurrent jobs that are writing to the same
Hudi table can result in contention during lock acquisition, which can cause
timeouts</li>
+ <li>In some scenarios of conflict resolution, Hudi commit operations might
take up to tens of seconds while the lock is being held. This can result in
timeouts for other jobs waiting to acquire a lock.</li>
+</ol>
+
+<p>Set the correct native lock provider client retries. NOTE that sometimes
these settings are set on the server once and all clients inherit the same
configs. Please check your settings before enabling optimistic concurrency.</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>hoodie.writer.lock.wait_time_ms
+hoodie.writer.lock.num_retries
+</code></pre></div></div>
+
+<p>Set the correct hudi client retries for Zookeeper &amp; HiveMetastore. This
is useful in cases when native client retry settings cannot be changed. Please
note that these retries will happen in addition to any native client retries
that you may have set.</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>hoodie.writer.lock.client.wait_time_ms
+hoodie.writer.lock.client.num_retries
+</code></pre></div></div>
+
+<p><em>The right values for these settings depend on the use case;
some defaults have been provided for general cases.</em></p>
+
+<h2 id="disabling-multi-writing">Disabling Multi Writing</h2>
+
+<p>Remove the following settings that were used to enable multi-writer, or
override them with their default values.</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>hoodie.write.concurrency.mode=single_writer
+hoodie.failed.writes.cleaner.policy=EAGER
+</code></pre></div></div>
+
+ </section>
+
+ <a href="#masthead__inner-wrap" class="back-to-top">Back to top
↑</a>
+
+
+
+
+ </div>
+
+ </article>
+
+</div>
+
+ </div>
+
+ <div class="page__footer">
+ <footer>
+
+<div class="row">
+ <div class="col-lg-12 footer">
+ <p>
+ <table class="table-apache-info">
+ <tr>
+ <td>
+ <a class="footer-link-img" href="https://apache.org">
+ <img width="250px" src="/assets/images/asf_logo.svg" alt="The
Apache Software Foundation">
+ </a>
+ </td>
+ <td>
+ <a style="float: right"
href="https://www.apache.org/events/current-event.html">
+ <img
src="https://www.apache.org/events/current-event-234x60.png" />
+ </a>
+ </td>
+ </tr>
+ </table>
+ </p>
+ <p>
+ <a href="https://www.apache.org/licenses/">License</a> | <a
href="https://www.apache.org/security/">Security</a> | <a
href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
+ </p>
+ <p>
+ Copyright © <span id="copyright-year">2019</span> <a
href="https://apache.org">The Apache Software Foundation</a>, Licensed under
the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License,
Version 2.0</a>.
+ Hudi, Apache and the Apache feather logo are trademarks of The Apache
Software Foundation. <a href="/docs/privacy">Privacy Policy</a>
+ </p>
+ </div>
+</div>
+ </footer>
+ </div>
+
+
+ </body>
+</html>
\ No newline at end of file
diff --git a/content/docs/configurations.html b/content/docs/configurations.html
index e4f1f61..99bd6f3 100644
--- a/content/docs/configurations.html
+++ b/content/docs/configurations.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
@@ -369,6 +380,7 @@
<li><a href="#metrics-configs">Metrics configs</a></li>
<li><a href="#memory-configs">Memory configs</a></li>
<li><a href="#write-commit-callback-configs">Write commit callback
configs</a></li>
+ <li><a href="#locking-configs">Locking configs</a></li>
</ul>
</li>
</ul>
@@ -1365,6 +1377,67 @@ Each clustering operation can create multiple groups.
Total amount of data proce
<p>Property: <code
class="highlighter-rouge">hoodie.write.commit.callback.kafka.retries</code> <br
/>
<span style="color:grey">Times to retry. 3 by default</span></p>
+<h3 id="locking-configs">Locking configs</h3>
+<p>Configs that control locking mechanisms if <a
href="#WriteConcurrencyMode">WriteConcurrencyMode=optimistic_concurrency_control</a>
is enabled
+<a href="#withLockConfig">withLockConfig</a> (HoodieLockConfig) <br /></p>
+
+<h4 id="withLockProvider">withLockProvider(lockProvider =
org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.provider</code> <br />
+<span style="color:grey">Lock provider class name. Users can provide their own
implementation of LockProvider, which should be a subclass of
org.apache.hudi.common.lock.LockProvider</span></p>
+
+<h4 id="withZkQuorum">withZkQuorum(zkQuorum)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.zookeeper.url</code> <br />
+<span style="color:grey">Set the list of comma separated servers to connect
to</span></p>
+
+<h4 id="withZkBasePath">withZkBasePath(zkBasePath)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.zookeeper.base_path</code>
[Required] <br />
+<span style="color:grey">The base path on Zookeeper under which to create a
ZNode to acquire the lock. This should be common for all jobs writing to the
same table</span></p>
+
+<h4 id="withZkPort">withZkPort(zkPort)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.zookeeper.port</code> [Required]
<br />
+<span style="color:grey">The connection port to be used for
Zookeeper</span></p>
+
+<h4 id="withZkLockKey">withZkLockKey(zkLockKey)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.zookeeper.lock_key</code>
[Required] <br />
+<span style="color:grey">Key name under base_path at which to create a ZNode
and acquire lock. Final path on zk will look like base_path/lock_key. We
recommend setting this to the table name</span></p>
+
+<h4
id="withZkConnectionTimeoutInMs">withZkConnectionTimeoutInMs(connectionTimeoutInMs
= 15000)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.zookeeper.connection_timeout_ms</code>
<br />
+<span style="color:grey">How long to wait when connecting to ZooKeeper before
considering the connection a failure</span></p>
+
+<h4 id="withZkSessionTimeoutInMs">withZkSessionTimeoutInMs(sessionTimeoutInMs
= 60000)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.zookeeper.session_timeout_ms</code>
<br />
+<span style="color:grey">How long to wait after losing a connection to
ZooKeeper before the session is expired</span></p>
+
+<h4 id="withNumRetries">withNumRetries(num_retries = 3)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.num_retries</code> <br />
+<span style="color:grey">Maximum number of times to retry by lock provider
client</span></p>
+
+<h4
id="withRetryWaitTimeInMillis">withRetryWaitTimeInMillis(retryWaitTimeInMillis
= 5000)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.wait_time_ms_between_retry</code>
<br />
+<span style="color:grey">Initial amount of time to wait between retries by
lock provider client</span></p>
+
+<h4 id="withHiveDatabaseName">withHiveDatabaseName(hiveDatabaseName)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.hivemetastore.database</code>
[Required] <br />
+<span style="color:grey">The Hive database to acquire lock against</span></p>
+
+<h4 id="withHiveTableName">withHiveTableName(hiveTableName)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.hivemetastore.table</code>
[Required] <br />
+<span style="color:grey">The Hive table under the hive database to acquire
lock against</span></p>
+
+<h4 id="withClientNumRetries">withClientNumRetries(clientNumRetries = 0)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.client.num_retries</code> <br />
+<span style="color:grey">Maximum number of times to retry to acquire lock
additionally from the hudi client</span></p>
+
+<h4
id="withClientRetryWaitTimeInMillis">withClientRetryWaitTimeInMillis(retryWaitTimeInMillis
= 10000)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.client.wait_time_ms_between_retry</code>
<br />
+<span style="color:grey">Amount of time to wait between retries from the hudi
client</span></p>
+
+<h4
id="withConflictResolutionStrategy">withConflictResolutionStrategy(lockProvider
=
org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy)</h4>
+<p>Property: <code
class="highlighter-rouge">hoodie.writer.lock.conflict.resolution.strategy</code>
<br />
+<span style="color:grey">Conflict resolution strategy class name; this should be a subclass of
org.apache.hudi.client.transaction.ConflictResolutionStrategy</span></p>
+
+
</section>
<a href="#masthead__inner-wrap" class="back-to-top">Back to top
↑</a>
diff --git a/content/docs/cos_hoodie.html b/content/docs/cos_hoodie.html
index 2a9d9df..5f7977f 100644
--- a/content/docs/cos_hoodie.html
+++ b/content/docs/cos_hoodie.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/deployment.html b/content/docs/deployment.html
index 21ef6d0..30d144b 100644
--- a/content/docs/deployment.html
+++ b/content/docs/deployment.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/docker_demo.html b/content/docs/docker_demo.html
index f7baaf9..e24699f 100644
--- a/content/docs/docker_demo.html
+++ b/content/docs/docker_demo.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/docs-versions.html b/content/docs/docs-versions.html
index 6d47eeb..6db93ed 100644
--- a/content/docs/docs-versions.html
+++ b/content/docs/docs-versions.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/flink-quick-start-guide.html
b/content/docs/flink-quick-start-guide.html
index 6e5d4ee..f4f48e7 100644
--- a/content/docs/flink-quick-start-guide.html
+++ b/content/docs/flink-quick-start-guide.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/gcs_hoodie.html b/content/docs/gcs_hoodie.html
index 138cb1b..d71d866 100644
--- a/content/docs/gcs_hoodie.html
+++ b/content/docs/gcs_hoodie.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/ibm_cos_hoodie.html b/content/docs/ibm_cos_hoodie.html
index 68c65ce..9307778 100644
--- a/content/docs/ibm_cos_hoodie.html
+++ b/content/docs/ibm_cos_hoodie.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/metrics.html b/content/docs/metrics.html
index 9eb712b..65ca70e 100644
--- a/content/docs/metrics.html
+++ b/content/docs/metrics.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/migration_guide.html
b/content/docs/migration_guide.html
index d671c72..3ef7f99 100644
--- a/content/docs/migration_guide.html
+++ b/content/docs/migration_guide.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/oss_hoodie.html b/content/docs/oss_hoodie.html
index ec429f3..a1a643b 100644
--- a/content/docs/oss_hoodie.html
+++ b/content/docs/oss_hoodie.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/overview.html b/content/docs/overview.html
index d6a78d2..04de878 100644
--- a/content/docs/overview.html
+++ b/content/docs/overview.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/performance.html b/content/docs/performance.html
index b64c5c0..48b1183 100644
--- a/content/docs/performance.html
+++ b/content/docs/performance.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/powered_by.html b/content/docs/powered_by.html
index 8761748..e604d51 100644
--- a/content/docs/powered_by.html
+++ b/content/docs/powered_by.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/privacy.html b/content/docs/privacy.html
index 22e16b7..5b03c8e 100644
--- a/content/docs/privacy.html
+++ b/content/docs/privacy.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/querying_data.html b/content/docs/querying_data.html
index fb0bfc9..18f5c59 100644
--- a/content/docs/querying_data.html
+++ b/content/docs/querying_data.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="active">Querying
Data</a></li>
diff --git a/content/docs/s3_hoodie.html b/content/docs/s3_hoodie.html
index e00a6de..b86d9f3 100644
--- a/content/docs/s3_hoodie.html
+++ b/content/docs/s3_hoodie.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/spark_quick-start-guide.html
b/content/docs/spark_quick-start-guide.html
index 8c36992..ad47de5 100644
--- a/content/docs/spark_quick-start-guide.html
+++ b/content/docs/spark_quick-start-guide.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/structure.html b/content/docs/structure.html
index 2adc401..061aa5e 100644
--- a/content/docs/structure.html
+++ b/content/docs/structure.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/use_cases.html b/content/docs/use_cases.html
index d563210..bd823fb 100644
--- a/content/docs/use_cases.html
+++ b/content/docs/use_cases.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/docs/writing_data.html b/content/docs/writing_data.html
index 72e2a67..1fb6536 100644
--- a/content/docs/writing_data.html
+++ b/content/docs/writing_data.html
@@ -208,6 +208,17 @@
+ <li><a href="/docs/concurrency_control.html"
class="">Concurrency Control</a></li>
+
+
+
+
+
+
+
+
+
+
<li><a href="/docs/querying_data.html" class="">Querying
Data</a></li>
diff --git a/content/sitemap.xml b/content/sitemap.xml
index 229a39d..125ecc7 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -1061,6 +1061,10 @@
<lastmod>2020-06-20T15:59:57-04:00</lastmod>
</url>
<url>
+<loc>https://hudi.apache.org/docs/concurrency_control.html</loc>
+<lastmod>2021-03-19T15:59:57-04:00</lastmod>
+</url>
+<url>
<loc>https://hudi.apache.org/cn/docs/privacy.html</loc>
<lastmod>2019-12-30T14:59:57-05:00</lastmod>
</url>