Added: zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.html URL: http://svn.apache.org/viewvc/zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.html?rev=1753714&view=auto ============================================================================== --- zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.html (added) +++ zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.html Thu Jul 21 17:02:42 2016 @@ -0,0 +1,1024 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> +<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<meta content="Apache Forrest" name="Generator"> +<meta name="Forrest-version" content="0.9"> +<meta name="Forrest-skin-name" content="pelt"> +<title>ZooKeeper Recipes and Solutions</title> +<link type="text/css" href="skin/basic.css" rel="stylesheet"> +<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"> +<link media="print" type="text/css" href="skin/print.css" rel="stylesheet"> +<link type="text/css" href="skin/profile.css" rel="stylesheet"> +<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script> +<link rel="shortcut icon" href="images/favicon.ico"> +</head> +<body onload="init()"> +<script type="text/javascript">ndeSetTextSize();</script> +<div id="top"> +<!--+ + |breadtrail + +--> +<div class="breadtrail"> +<a href="http://www.apache.org/">Apache</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script> +</div> +<!--+ + |header + +--> +<div class="header"> +<!--+ + |start group logo + +--> +<div class="grouplogo"> +<a href="http://hadoop.apache.org/"><img class="logoImage" 
alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a> +</div> +<!--+ + |end group logo + +--> +<!--+ + |start Project Logo + +--> +<div class="projectlogo"> +<a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a> +</div> +<!--+ + |end Project Logo + +--> +<!--+ + |start Search + +--> +<div class="searchbox"> +<form action="http://www.google.com/search" method="get" class="roundtopsmall"> +<input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google"> + <input name="Search" value="Search" type="submit"> +</form> +</div> +<!--+ + |end search + +--> +<!--+ + |start Tabs + +--> +<ul id="tabs"> +<li> +<a class="unselected" href="http://zookeeper.apache.org/">Project</a> +</li> +<li> +<a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a> +</li> +<li class="current"> +<a class="selected" href="index.html">ZooKeeper 3.5 Documentation</a> +</li> +</ul> +<!--+ + |end Tabs + +--> +</div> +</div> +<div id="main"> +<div id="publishedStrip"> +<!--+ + |start Subtabs + +--> +<div id="level2tabs"></div> +<!--+ + |end Endtabs + +--> +<script type="text/javascript"><!-- +document.write("Last Published: " + document.lastModified); +// --></script> +</div> +<!--+ + |breadtrail + +--> +<div class="breadtrail"> + + + </div> +<!--+ + |start Menu, mainarea + +--> +<!--+ + |start Menu + +--> +<div id="menu"> +<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div> +<div id="menu_1.1" class="menuitemgroup"> +<div class="menuitem"> +<a href="index.html">Welcome</a> +</div> +<div class="menuitem"> +<a href="zookeeperOver.html">Overview</a> +</div> +<div class="menuitem"> +<a href="zookeeperStarted.html">Getting Started</a> +</div> +<div 
class="menuitem"> +<a href="releasenotes.html">Release Notes</a> +</div> +</div> +<div onclick="SwitchMenu('menu_selected_1.2', 'skin/')" id="menu_selected_1.2Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Developer</div> +<div id="menu_selected_1.2" class="selectedmenuitemgroup" style="display: block;"> +<div class="menuitem"> +<a href="api/index.html">API Docs</a> +</div> +<div class="menuitem"> +<a href="zookeeperProgrammers.html">Programmer's Guide</a> +</div> +<div class="menuitem"> +<a href="javaExample.html">Java Example</a> +</div> +<div class="menuitem"> +<a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a> +</div> +<div class="menupage"> +<div class="menupagetitle">Recipes</div> +</div> +</div> +<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Admin &amp; Ops</div> +<div id="menu_1.3" class="menuitemgroup"> +<div class="menuitem"> +<a href="zookeeperAdmin.html">Administrator's Guide</a> +</div> +<div class="menuitem"> +<a href="zookeeperQuotas.html">Quota Guide</a> +</div> +<div class="menuitem"> +<a href="zookeeperJMX.html">JMX</a> +</div> +<div class="menuitem"> +<a href="zookeeperObservers.html">Observers Guide</a> +</div> +<div class="menuitem"> +<a href="zookeeperReconfig.html">Dynamic Reconfiguration</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Contributor</div> +<div id="menu_1.4" class="menuitemgroup"> +<div class="menuitem"> +<a href="zookeeperInternals.html">ZooKeeper Internals</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div> +<div id="menu_1.5" class="menuitemgroup"> +<div class="menuitem"> +<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a> +</div> +<div class="menuitem"> +<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a> +</div> +<div class="menuitem"> +<a 
href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a> +</div> +</div> +<div id="credit"></div> +<div id="roundbottom"> +<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div> +<!--+ + |alternative credits + +--> +<div id="credit2"></div> +</div> +<!--+ + |end Menu + +--> +<!--+ + |start content + +--> +<div id="content"> +<div title="Portable Document Format" class="pdflink"> +<a class="dida" href="recipes.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br> + PDF</a> +</div> +<h1>ZooKeeper Recipes and Solutions</h1> +<div id="front-matter"> +<div id="minitoc-area"> +<ul class="minitoc"> +<li> +<a href="#ch_recipes">A Guide to Creating Higher-level Constructs with ZooKeeper</a> +<ul class="minitoc"> +<li> +<a href="#sc_recipes_errorHandlingNote">Important Note About Error Handling</a> +</li> +<li> +<a href="#sc_outOfTheBox">Out of the Box Applications: Name Service, Configuration, Group + Membership</a> +</li> +<li> +<a href="#sc_recipes_eventHandles">Barriers</a> +<ul class="minitoc"> +<li> +<a href="#sc_doubleBarriers">Double Barriers</a> +</li> +</ul> +</li> +<li> +<a href="#sc_recipes_Queues">Queues</a> +<ul class="minitoc"> +<li> +<a href="#sc_recipes_priorityQueues">Priority Queues</a> +</li> +</ul> +</li> +<li> +<a href="#sc_recipes_Locks">Locks</a> +<ul class="minitoc"> +<li> +<a href="#sc_recipes_GuidNote">Recoverable Errors and the GUID</a> +</li> +<li> +<a href="#Shared+Locks">Shared Locks</a> +</li> +<li> +<a href="#sc_revocableSharedLocks">Revocable Shared Locks</a> +</li> +</ul> +</li> +<li> +<a href="#sc_recipes_twoPhasedCommit">Two-phased Commit</a> +</li> +<li> +<a href="#sc_leaderElection">Leader Election</a> +</li> +</ul> +</li> +</ul> +</div> +</div> + + + + + +<a name="ch_recipes"></a> +<h2 class="h3">A Guide to Creating Higher-level Constructs with ZooKeeper</h2> +<div class="section"> +<p>In this article, you'll find guidelines for 
using + ZooKeeper to implement higher order functions. All of them are conventions + implemented at the client and do not require special support from + ZooKeeper. Hopefully the community will capture these conventions in client-side libraries + to ease their use and to encourage standardization.</p> +<p>One of the most interesting things about ZooKeeper is that even + though ZooKeeper uses <em>asynchronous</em> notifications, you + can use it to build <em>synchronous</em> consistency + primitives, such as queues and locks. As you will see, this is possible + because ZooKeeper imposes an overall order on updates, and has mechanisms + to expose this ordering.</p> +<p>Note that the recipes below attempt to employ best practices. In + particular, they avoid polling, timers or anything else that would result + in a "herd effect", causing bursts of traffic and limiting + scalability.</p> +<p>There are many useful functions that can be imagined that aren't + included here - revocable read-write priority locks, as just one example. + And some of the constructs mentioned here - locks, in particular - + illustrate certain points, even though you may find other constructs, such + as event handles or queues, a more practical means of performing the same + function. In general, the examples in this section are designed to + stimulate thought.</p> +<a name="sc_recipes_errorHandlingNote"></a> +<h3 class="h4">Important Note About Error Handling</h3> +<p>When implementing the recipes you must handle recoverable exceptions + (see the <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>). In + particular, several of the recipes employ sequential ephemeral + nodes. When creating a sequential ephemeral node there is an error case in + which the create() succeeds on the server but the server crashes before + returning the name of the node to the client. When the client reconnects, its + session is still valid and, thus, the node is not removed. 
The implication is + that it is difficult for the client to know if its node was created or not. The + recipes below include measures to handle this.</p> +<a name="sc_outOfTheBox"></a> +<h3 class="h4">Out of the Box Applications: Name Service, Configuration, Group + Membership</h3> +<p>Name service and configuration are two of the primary applications + of ZooKeeper. These two functions are provided directly by the ZooKeeper + API.</p> +<p>Another function directly provided by ZooKeeper is <em>group + membership</em>. The group is represented by a node. Members of the + group create ephemeral nodes under the group node. Nodes of the members + that fail abnormally will be removed automatically when ZooKeeper detects + the failure.</p> +<a name="sc_recipes_eventHandles"></a> +<h3 class="h4">Barriers</h3> +<p>Distributed systems use <em>barriers</em> + to block processing of a set of nodes until a condition is met + at which time all the nodes are allowed to proceed. Barriers are + implemented in ZooKeeper by designating a barrier node. The + barrier is in place if the barrier node exists. Here's the + pseudo code:</p> +<ol> + +<li> + +<p>Client calls the ZooKeeper API's <strong>exists()</strong> function on the barrier node, with + <em>watch</em> set to true.</p> + +</li> + + +<li> + +<p>If <strong>exists()</strong> returns false, the + barrier is gone and the client proceeds</p> + +</li> + + +<li> + +<p>Else, if <strong>exists()</strong> returns true, + the clients wait for a watch event from ZooKeeper for the barrier + node.</p> + +</li> + + +<li> + +<p>When the watch event is triggered, the client reissues the + <strong>exists( )</strong> call, again waiting until + the barrier node is removed.</p> + +</li> + +</ol> +<a name="sc_doubleBarriers"></a> +<h4>Double Barriers</h4> +<p>Double barriers enable clients to synchronize the beginning and + the end of a computation. 
When enough processes have joined the barrier, + processes start their computation and leave the barrier once they have + finished. This recipe shows how to use a ZooKeeper node as a + barrier.</p> +<p>The pseudo code in this recipe represents the barrier node as + <em>b</em>. Every client process <em>p</em> + registers with the barrier node on entry and unregisters when it is + ready to leave. A client registers with the barrier node via the <strong>Enter</strong> procedure below; it waits until + <em>x</em> client processes register before proceeding with + the computation. (The <em>x</em> here is up to you to + determine for your system.)</p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> + + +<tr> + +<td><strong>Enter</strong></td> + + <td><strong>Leave</strong></td> + +</tr> + + +<tr> + +<td> +<ol> + +<li> + +<p>Create a name <em><em>n</em> = + <em>b</em>+“/”+<em>p</em></em> +</p> + +</li> + + +<li> + +<p>Set watch: <strong>exists(<em>b</em> + ‘‘/ready’’, + true)</strong> +</p> + +</li> + + +<li> + +<p>Create child: <strong>create( + <em>n</em>, EPHEMERAL)</strong> +</p> + +</li> + + +<li> + +<p> +<strong>L = getChildren(b, + false)</strong> +</p> + +</li> + + +<li> + +<p>if fewer children in L than<em> + x</em>, wait for watch event</p> + +</li> + + +<li> + +<p>else <strong>create(b + ‘‘/ready’’, + REGULAR)</strong> +</p> + +</li> + +</ol> +</td> + + <td> +<ol> + +<li> + +<p> +<strong>L = getChildren(b, + false)</strong> +</p> + +</li> + + +<li> + +<p>if no children, exit</p> + +</li> + + +<li> + +<p>if <em>p</em> is only process node in + L, delete(n) and exit</p> + +</li> + + +<li> + +<p>if <em>p</em> is the lowest process + node in L, wait on highest process node in L</p> + +</li> + + +<li> + +<p>else <strong>delete(<em>n</em>) </strong>if + still exists and wait on lowest process node in L</p> + +</li> + + +<li> + +<p>goto 1</p> + +</li> + +</ol> +</td> + +</tr> + + +</table> +<p>On entering, all processes watch on a ready node and + create an 
ephemeral node as a child of the barrier node. Each process + but the last enters the barrier and waits for the ready node to appear + at line 5. The process that creates the xth node, the last process, will + see x nodes in the list of children and create the ready node, waking up + the other processes. Note that waiting processes wake up only when it is + time to exit, so waiting is efficient. + </p> +<p>On exit, you can't use a flag such as <em>ready</em> + because you are watching for process nodes to go away. By using + ephemeral nodes, processes that fail after the barrier has been entered + do not prevent correct processes from finishing. When processes are + ready to leave, they need to delete their process nodes and wait for all + other processes to do the same.</p> +<p>Processes exit when there are no process nodes left as children of + <em>b</em>. However, as an efficiency, you can use the + lowest process node as the ready flag. All other processes that are + ready to exit watch for the lowest existing process node to go away, and + the owner of the lowest process watches for any other process node + (picking the highest for simplicity) to go away. This means that only a + single process wakes up on each node deletion except for the last node, + which wakes up everyone when it is removed.</p> +<a name="sc_recipes_Queues"></a> +<h3 class="h4">Queues</h3> +<p>Distributed queues are a common data structure. To implement a + distributed queue in ZooKeeper, first designate a znode to hold the queue, + the queue node. The distributed clients put something into the queue by + calling create() with a pathname ending in "queue-", with the + <em>sequence</em> and <em>ephemeral</em> flags in + the create() call set to true. Because the <em>sequence</em> + flag is set, the new pathnames will have the form + _path-to-queue-node_/queue-X, where X is a monotonic increasing number. 
A + client that wants to remove an element from the queue calls ZooKeeper's <strong>getChildren( )</strong> function, with + <em>watch</em> set to true on the queue node, and begins + processing nodes with the lowest number. The client does not need to issue + another <strong>getChildren( )</strong> until it exhausts + the list obtained from the first <strong>getChildren( + )</strong> call. If there are no children in the queue node, the + reader waits for a watch notification to check the queue again.</p> +<div class="note"> +<div class="label">Note</div> +<div class="content"> + +<p>There now exists a Queue implementation in ZooKeeper + recipes directory. This is distributed with the release -- + src/recipes/queue directory of the release artifact. + </p> + +</div> +</div> +<a name="sc_recipes_priorityQueues"></a> +<h4>Priority Queues</h4> +<p>To implement a priority queue, you need only make two simple + changes to the generic <a href="#sc_recipes_Queues">queue + recipe</a> . First, to add to a queue, the pathname ends with + "queue-YY" where YY is the priority of the element with lower numbers + representing higher priority (just like UNIX). Second, when removing + from the queue, a client uses an up-to-date children list meaning that + the client will invalidate previously obtained children lists if a watch + notification triggers for the queue node.</p> +<a name="sc_recipes_Locks"></a> +<h3 class="h4">Locks</h3> +<p>Fully distributed locks that are globally synchronous, meaning at + any snapshot in time no two clients think they hold the same lock. These + can be implemented using ZooKeeper. As with priority queues, first define + a lock node.</p> +<div class="note"> +<div class="label">Note</div> +<div class="content"> + +<p>There now exists a Lock implementation in ZooKeeper + recipes directory. This is distributed with the release -- + src/recipes/lock directory of the release artifact. 
+ </p> + +</div> +</div> +<p>Clients wishing to obtain a lock do the following:</p> +<ol> + +<li> + +<p>Call <strong>create( )</strong> with a pathname + of "_locknode_/guid-lock-" and the <em>sequence</em> and + <em>ephemeral</em> flags set. The <em>guid</em> + is needed in case the create() result is missed. See the note below.</p> + +</li> + + +<li> + +<p>Call <strong>getChildren( )</strong> on the lock + node <em>without</em> setting the watch flag (this is + important to avoid the herd effect).</p> + +</li> + + +<li> + +<p>If the pathname created in step <strong>1</strong> has the lowest sequence number suffix, the + client has the lock and the client exits the protocol.</p> + +</li> + + +<li> + +<p>The client calls <strong>exists( )</strong> with + the watch flag set on the path in the lock directory with the next + lowest sequence number.</p> + +</li> + + +<li> + +<p>if <strong>exists( )</strong> returns false, go + to step <strong>2</strong>. Otherwise, wait for a + notification for the pathname from the previous step before going to + step <strong>2</strong>.</p> + +</li> + +</ol> +<p>The unlock protocol is very simple: clients wishing to release a + lock simply delete the node they created in step 1.</p> +<p>Here are a few things to notice:</p> +<ul> + +<li> + +<p>The removal of a node will only cause one client to wake up + since each node is watched by exactly one client. 
In this way, you + avoid the herd effect.</p> + +</li> + +</ul> +<ul> + +<li> + +<p>There is no polling or timeouts.</p> + +</li> + +</ul> +<ul> + +<li> + +<p>Because of the way you implement locking, it is easy to see the + amount of lock contention, break locks, debug locking problems, + etc.</p> + +</li> + +</ul> +<a name="sc_recipes_GuidNote"></a> +<h4>Recoverable Errors and the GUID</h4> +<ul> + +<li> + +<p>If a recoverable error occurs calling <strong>create()</strong> the + client should call <strong>getChildren()</strong> and check for a node + containing the <em>guid</em> used in the path name. + This handles the case (noted <a href="#sc_recipes_errorHandlingNote">above</a>) of + the create() succeeding on the server but the server crashing before returning the name + of the new node.</p> + +</li> + +</ul> +<a name="Shared+Locks"></a> +<h4>Shared Locks</h4> +<p>You can implement shared locks with a few changes to the lock + protocol:</p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> + + +<tr> + +<td><strong>Obtaining a read + lock:</strong></td> + + <td><strong>Obtaining a write + lock:</strong></td> + +</tr> + + +<tr> + +<td> +<ol> + +<li> + +<p>Call <strong>create( )</strong> to + create a node with pathname + "<span class="codefrag filename">guid-/read-</span>". This is the + lock node used later in the protocol. Make sure to set both + the <em>sequence</em> and + <em>ephemeral</em> flags.</p> + +</li> + + +<li> + +<p>Call <strong>getChildren( )</strong> + on the lock node <em>without</em> setting the + <em>watch</em> flag - this is important, as it + avoids the herd effect.</p> + +</li> + + +<li> + +<p>If there are no children with a pathname starting + with "<span class="codefrag filename">write-</span>" and having a lower + sequence number than the node created in step <strong>1</strong>, the client has the lock and can + exit the protocol. 
</p> + +</li> + + +<li> + +<p>Otherwise, call <strong>exists( + )</strong>, with <em>watch</em> flag set, on + the node in lock directory with pathname starting with + "<span class="codefrag filename">write-</span>" having the next lowest + sequence number.</p> + +</li> + + +<li> + +<p>If <strong>exists( )</strong> + returns <em>false</em>, goto step <strong>2</strong>.</p> + +</li> + + +<li> + +<p>Otherwise, wait for a notification for the pathname + from the previous step before going to step <strong>2</strong> +</p> + +</li> + +</ol> +</td> + + <td> +<ol> + +<li> + +<p>Call <strong>create( )</strong> to + create a node with pathname + "<span class="codefrag filename">guid-/write-</span>". This is the + lock node spoken of later in the protocol. Make sure to + set both <em>sequence</em> and + <em>ephemeral</em> flags.</p> + +</li> + + +<li> + +<p>Call <strong>getChildren( ) + </strong> on the lock node <em>without</em> + setting the <em>watch</em> flag - this is + important, as it avoids the herd effect.</p> + +</li> + + +<li> + +<p>If there are no children with a lower sequence + number than the node created in step <strong>1</strong>, the client has the lock and the + client exits the protocol.</p> + +</li> + + +<li> + +<p>Call <strong>exists( ),</strong> + with <em>watch</em> flag set, on the node with + the pathname that has the next lowest sequence + number.</p> + +</li> + + +<li> + +<p>If <strong>exists( )</strong> + returns <em>false</em>, goto step <strong>2</strong>. Otherwise, wait for a + notification for the pathname from the previous step + before going to step <strong>2</strong>.</p> + +</li> + +</ol> +</td> + +</tr> + + +</table> +<p>Notes:</p> +<ul> + +<li> + +<p>It might appear that this recipe creates a herd effect: + when there is a large group of clients waiting for a read + lock, and all getting notified more or less simultaneously + when the "<span class="codefrag filename">write-</span>" node with the lowest + sequence number is deleted. 
In fact, that's valid behavior, + as all those waiting reader clients should be released since + they have the lock. The herd effect refers to releasing a + "herd" when in fact only a single or a small number of + machines can proceed.</p> + +</li> + +</ul> +<ul> + +<li> + +<p>See the <a href="#sc_recipes_GuidNote">note for Locks</a> on how to use the guid in the node.</p> + +</li> + +</ul> +<a name="sc_revocableSharedLocks"></a> +<h4>Revocable Shared Locks</h4> +<p>With minor modifications to the Shared Lock protocol, you can make + shared locks revocable:</p> +<p>In step <strong>1</strong> of both the reader + and writer lock protocols, call <strong>getData( + )</strong> with <em>watch</em> set, immediately after the + call to <strong>create( )</strong>. If the client + subsequently receives notification for the node it created in step + <strong>1</strong>, it does another <strong>getData( )</strong> on that node, with + <em>watch</em> set and looks for the string "unlock", which + signals to the client that it must release the lock. This is because, + according to this shared lock protocol, you can request the client with + the lock give up the lock by calling <strong>setData() + </strong> on the lock node, writing "unlock" to that node.</p> +<p>Note that this protocol requires the lock holder to consent to + releasing the lock. Such consent is important, especially if the lock + holder needs to do some processing before releasing the lock. 
Of course + you can always implement <em>Revocable Shared Locks with Freaking + Laser Beams</em> by stipulating in your protocol that the revoker + is allowed to delete the lock node if after some length of time the lock + isn't deleted by the lock holder.</p> +<a name="sc_recipes_twoPhasedCommit"></a> +<h3 class="h4">Two-phased Commit</h3> +<p>A two-phase commit protocol is an algorithm that lets all clients in + a distributed system agree either to commit a transaction or abort.</p> +<p>In ZooKeeper, you can implement a two-phased commit by having a + coordinator create a transaction node, say "/app/Tx", and one child node + per participating site, say "/app/Tx/s_i". When coordinator creates the + child node, it leaves the content undefined. Once each site involved in + the transaction receives the transaction from the coordinator, the site + reads each child node and sets a watch. Each site then processes the query + and votes "commit" or "abort" by writing to its respective node. Once the + write completes, the other sites are notified, and as soon as all sites + have all votes, they can decide either "abort" or "commit". Note that a + node can decide "abort" earlier if some site votes for "abort".</p> +<p>An interesting aspect of this implementation is that the only role + of the coordinator is to decide upon the group of sites, to create the + ZooKeeper nodes, and to propagate the transaction to the corresponding + sites. In fact, even propagating the transaction can be done through + ZooKeeper by writing it in the transaction node.</p> +<p>There are two important drawbacks of the approach described above. + One is the message complexity, which is O(n²). The second is the + impossibility of detecting failures of sites through ephemeral nodes. 
To + detect the failure of a site using ephemeral nodes, it is necessary that + the site create the node.</p> +<p>To solve the first problem, you can have only the coordinator + notified of changes to the transaction nodes, and then notify the sites + once the coordinator reaches a decision. Note that this approach is scalable, + but it is slower too, as it requires all communication to go through the + coordinator.</p> +<p>To address the second problem, you can have the coordinator + propagate the transaction to the sites, and have each site create its + own ephemeral node.</p> +<a name="sc_leaderElection"></a> +<h3 class="h4">Leader Election</h3> +<p>A simple way of doing leader election with ZooKeeper is to use the + <strong>SEQUENCE|EPHEMERAL</strong> flags when creating + znodes that represent "proposals" of clients. The idea is to have a znode, + say "/election", such that each client creates a child znode "/election/guid-n_" + with both flags SEQUENCE|EPHEMERAL. With the sequence flag, ZooKeeper + automatically appends a sequence number that is greater than any one + previously appended to a child of "/election". The process that created + the znode with the smallest appended sequence number is the leader. + </p> +<p>That's not all, though. It is important to watch for failures of the + leader, so that a new client arises as the new leader in the case the + current leader fails. A trivial solution is to have all application + processes watching upon the current smallest znode, and checking if they + are the new leader when the smallest znode goes away (note that the + smallest znode will go away if the leader fails because the node is + ephemeral). But this causes a herd effect: upon a failure of the current + leader, all other processes receive a notification, and execute + getChildren on "/election" to obtain the current list of children of + "/election". 
If the number of clients is large, it causes a spike in the + number of operations that ZooKeeper servers have to process. To avoid the + herd effect, it is sufficient to watch for the next znode down in the + sequence of znodes. If a client receives a notification that the znode it + is watching is gone, then it becomes the new leader in the case that there + is no smaller znode. Note that this avoids the herd effect by not having + all clients watching the same znode. </p> +<p>Here's the pseudo code:</p> +<p>Let ELECTION be a path of choice of the application. To volunteer to + be a leader: </p> +<ol> + +<li> + +<p>Create znode z with path "ELECTION/guid-n_" with both SEQUENCE and + EPHEMERAL flags;</p> + +</li> + + +<li> + +<p>Let C be the children of "ELECTION", and i be the sequence + number of z;</p> + +</li> + + +<li> + +<p>Watch for changes on "ELECTION/guid-n_j", where j is the largest + sequence number such that j &lt; i and n_j is a znode in C;</p> + +</li> + +</ol> +<p>Upon receiving a notification of znode deletion: </p> +<ol> + +<li> + +<p>Let C be the new set of children of ELECTION; </p> + +</li> + + +<li> + +<p>If z is the smallest node in C, then execute leader + procedure;</p> + +</li> + + +<li> + +<p>Otherwise, watch for changes on "ELECTION/guid-n_j", where j is the + largest sequence number such that j &lt; i and n_j is a znode in C; + </p> + +</li> + +</ol> +<p>Notes:</p> +<ul> + +<li> + +<p>Note that the znode having no preceding znode on the list of + children does not imply that the creator of this znode is aware that it is + the current leader. Applications may consider creating a separate znode + to acknowledge that the leader has executed the leader procedure. 
</p> + +</li> + +</ul> +<ul> + +<li> + +<p>See the <a href="#sc_recipes_GuidNote">note for Locks</a> on how to use the guid in the node.</p> + +</li> + +</ul> +</div> + +<p align="right"> +<font size="-2"></font> +</p> +</div> +<!--+ + |end content + +--> +<div class="clearboth"> </div> +</div> +<div id="footer"> +<!--+ + |start bottomstrip + +--> +<div class="lastmodified"> +<script type="text/javascript"><!-- +document.write("Last Published: " + document.lastModified); +// --></script> +</div> +<div class="copyright"> + Copyright © + 2008-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a> +</div> +<!--+ + |end bottomstrip + +--> +</div> +</body> +</html>
Added: zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.pdf URL: http://svn.apache.org/viewvc/zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.pdf?rev=1753714&view=auto ============================================================================== Binary file - no diff available. Propchange: zookeeper/site/trunk/content/doc/r3.5.2-alpha/recipes.pdf ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream
