Author: junrao
Date: Tue Dec 15 22:30:06 2009
New Revision: 891040
URL: http://svn.apache.org/viewvc?rev=891040&view=rev
Log:
use xor for hash in inner nodes in Merkle tree; patched by Stu Hood, reviewed
by junrao for CASSANDRA-629
Modified:
incubator/cassandra/trunk/src/java/org/apache/cassandra/service/AntiEntropyService.java
incubator/cassandra/trunk/src/java/org/apache/cassandra/utils/MerkleTree.java
incubator/cassandra/trunk/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
Modified:
incubator/cassandra/trunk/src/java/org/apache/cassandra/service/AntiEntropyService.java
URL:
http://svn.apache.org/viewvc/incubator/cassandra/trunk/src/java/org/apache/cassandra/service/AntiEntropyService.java?rev=891040&r1=891039&r2=891040&view=diff
==============================================================================
---
incubator/cassandra/trunk/src/java/org/apache/cassandra/service/AntiEntropyService.java
(original)
+++
incubator/cassandra/trunk/src/java/org/apache/cassandra/service/AntiEntropyService.java
Tue Dec 15 22:30:06 2009
@@ -311,15 +311,16 @@
public final InetAddress initiator;
public final MerkleTree tree;
- private transient final List<MerkleTree.RowHash> rows;
// the minimum token sorts first, but falls into the last range
private transient List<MerkleTree.RowHash> minrows;
// null when all rows with the min token have been consumed
private transient Token mintoken;
private transient long validated;
+ private transient MerkleTree.TreeRange range;
private transient MerkleTree.TreeRangeIterator ranges;
public final static Predicate<DecoratedKey> DKPRED =
Predicates.alwaysTrue();
+ public final static MerkleTree.RowHash EMPTY_ROW = new
MerkleTree.RowHash(null, new byte[0]);
Validator(CFTuple cf, InetAddress initiator)
{
@@ -336,10 +337,10 @@
this.cf = cf;
this.initiator = initiator;
this.tree = tree;
- rows = new ArrayList<MerkleTree.RowHash>();
minrows = new ArrayList<MerkleTree.RowHash>();
mintoken = null;
validated = 0;
+ range = null;
ranges = null;
}
@@ -380,10 +381,14 @@
* Called (in order) for every row present in the CF.
* Hashes the row, and adds it to the tree being built.
*
- * There are three possible cases:
+ * There are four possible cases:
* 1. Token is greater than range.right (we haven't generated a range
for it yet),
* 2. Token is less than/equal to range.left (the range was valid),
- * 3. Token is contained in the range (the range is in progress).
+ * 3. Token is contained in the range (the range is in progress),
+ * 4. No more invalid ranges exist.
+ *
+ * TODO: Because we only validate completely empty trees at the
moment, we
+ * do not both dealing with case 2 and case 4 should result in an
error.
*
* Additionally, there is a special case for the minimum token, because
* although it sorts first, it is contained in the last possible range.
@@ -401,42 +406,31 @@
{
// and store it to be appended when we complete
minrows.add(rowHash(row));
- validated++;
return;
}
mintoken = null;
}
- if (!ranges.hasNext())
- return;
+ if (range == null)
+ range = ranges.next();
- MerkleTree.TreeRange range = ranges.peek();
// generate new ranges as long as case 1 is true
- while (range.right().compareTo(row.key.token) < 0)
+ while (!range.contains(row.key.token))
{
- // token is past the current range: finalize
- range.validate(rows);
- rows.clear();
-
- // and generate a new range
- ranges.next();
- if (!ranges.hasNext())
- return;
- range = ranges.peek();
+ // add the empty hash, and move to the next range
+ range.addHash(EMPTY_ROW);
+ range = ranges.next();
}
- // if case 2 is true, ignore the token
- if (row.key.token.compareTo(range.left()) <= 0)
- return;
-
- // case 3 must be true: buffer the hashed row
- rows.add(rowHash(row));
- validated++;
+ // case 3 must be true: mix in the hashed row
+ range.addHash(rowHash(row));
}
private MerkleTree.RowHash rowHash(CompactedRow row)
{
- byte[] rowhash = FBUtilities.hash("MD5", row.key.key.getBytes(),
row.buffer.getData());
+ validated++;
+ // MerkleTree uses XOR internally, so we want lots of output bits
here
+ byte[] rowhash = FBUtilities.hash("SHA-256",
row.key.key.getBytes(), row.buffer.getData());
return new MerkleTree.RowHash(row.key.token, rowhash);
}
@@ -448,20 +442,17 @@
{
assert ranges != null : "Validator was not prepared()";
- // finish validating remaining rows
+ if (range != null)
+ range.addHash(EMPTY_ROW);
while (ranges.hasNext())
{
- MerkleTree.TreeRange range = ranges.next();
- if (!ranges.hasNext() && !minrows.isEmpty() &&
range.contains(tree.partitioner().getMinimumToken()))
- {
- // append rows with the minimum token into the last range
- rows.addAll(minrows);
- minrows.clear();
- }
- range.validate(rows);
- rows.clear();
+ range = ranges.next();
+ range.addHash(EMPTY_ROW);
}
- assert rows.isEmpty() && minrows.isEmpty();
+ // add rows with the minimum token to the final range
+ if (!minrows.isEmpty())
+ for (MerkleTree.RowHash minrow : minrows)
+ range.addHash(minrow);
StageManager.getStage(AE_SERVICE_STAGE).execute(this);
logger.debug("Validated " + validated + " rows into AEService tree
for " + cf);
Modified:
incubator/cassandra/trunk/src/java/org/apache/cassandra/utils/MerkleTree.java
URL:
http://svn.apache.org/viewvc/incubator/cassandra/trunk/src/java/org/apache/cassandra/utils/MerkleTree.java?rev=891040&r1=891039&r2=891040&view=diff
==============================================================================
---
incubator/cassandra/trunk/src/java/org/apache/cassandra/utils/MerkleTree.java
(original)
+++
incubator/cassandra/trunk/src/java/org/apache/cassandra/utils/MerkleTree.java
Tue Dec 15 22:30:06 2009
@@ -38,39 +38,28 @@
* which contain the computed values of the nodes that would be below them if
* the tree were perfect.
*
- * All nodes of the perfect tree are calculated using a MD5 hash: leaves are
- * sequential hashes of the rows that fall into the range they represent, and
- * inner nodes are a binary hash of their children.
+ * Inputs passed to TreeRange.validate should be calculated using a very
secure hash,
+ * because all hashing internal to the tree is accomplished using XOR.
*
* If two MerkleTrees have the same hashdepth, they represent a perfect tree
* of the same depth, and can always be compared, regardless of size or splits.
*/
public class MerkleTree implements Serializable
{
- private static final long serialVersionUID = 1L;
+ private static final long serialVersionUID = 2L;
- public static final byte RECOMMENDED_DEPTH = (byte)64;
+ public static final byte RECOMMENDED_DEPTH = Byte.MAX_VALUE;
public static final int CONSISTENT = 0;
public static final int FULLY_INCONSISTENT = 1;
public static final int PARTIALLY_INCONSISTENT = 2;
- // cache of empty hash trees up to the maximum depth (0 to 127)
- public static final byte[][] EMPTIES = new byte[Byte.MAX_VALUE][];
- static {
- EMPTIES[0] = new byte[0];
- for (int i = 1; i < EMPTIES.length; i++)
- {
- EMPTIES[i] = Hashable.binaryHash(EMPTIES[i-1], EMPTIES[i-1]);
- }
- }
-
public final byte hashdepth;
private transient IPartitioner partitioner;
- private int maxsize;
- private int size;
+ private long maxsize;
+ private long size;
private Hashable root;
/**
@@ -79,7 +68,7 @@
* of the key space covered by each subrange of a fully populated
tree.
* @param maxsize The maximum number of subranges in the tree.
*/
- public MerkleTree(IPartitioner partitioner, byte hashdepth, int maxsize)
+ public MerkleTree(IPartitioner partitioner, byte hashdepth, long maxsize)
{
this.partitioner = partitioner;
this.hashdepth = hashdepth;
@@ -96,32 +85,31 @@
}
/**
- * Initializes this tree by splitting it into maxsize ranges, or
- * until hashdepth is reached.
- *
- * TODO: could be optimized as breadth first generation of nodes.
+ * Initializes this tree by splitting it until hashdepth is reached,
+ * or until an additional level of splits would violate maxsize.
*
- * NB: asserts that the tree is of size 1.
+ * NB: Replaces all nodes in the tree.
*/
public void init()
{
- assert size() == 1;
+ // determine the depth to which we can safely split the tree
+ byte sizedepth = (byte)(Math.log10(maxsize) / Math.log10(2));
+ byte depth = (byte)Math.min(sizedepth, hashdepth);
- Queue<Range> ranges = new ArrayDeque<Range>();
- ranges.add(new Range(partitioner.getMinimumToken(),
- partitioner.getMinimumToken()));
- while (true)
- {
- Range range = ranges.remove();
- Token mid = partitioner.midpoint(range.left(),
- range.right());
- if (!split(mid))
- // we've reached maxsize or hashdepth
- return;
+ Token mintoken = partitioner.getMinimumToken();
+ root = initHelper(mintoken, mintoken, (byte)0, depth);
+ size = (long)Math.pow(2, depth);
+ }
- ranges.add(new Range(range.left(), mid));
- ranges.add(new Range(mid, range.right()));
- }
+ private Hashable initHelper(Token left, Token right, byte depth, byte max)
+ {
+ if (depth == max)
+ // we've reached the leaves
+ return new Leaf();
+ Token midpoint = partitioner.midpoint(left, right);
+ Hashable lchild = initHelper(left, midpoint, inc(depth), max);
+ Hashable rchild = initHelper(midpoint, right, inc(depth), max);
+ return new Inner(midpoint, lchild, rchild);
}
Hashable root()
@@ -138,17 +126,17 @@
* The number of distinct ranges contained in this tree. This is a
reasonable
* measure of the memory usage of the tree (assuming 'this.order' is
significant).
*/
- public int size()
+ public long size()
{
return size;
}
- public int maxsize()
+ public long maxsize()
{
return maxsize;
}
- public void maxsize(int maxsize)
+ public void maxsize(long maxsize)
{
this.maxsize = maxsize;
}
@@ -475,7 +463,7 @@
public static final long serialVersionUID = 1L;
private final MerkleTree tree;
public final byte depth;
- public final Hashable hashable;
+ private final Hashable hashable;
TreeRange(MerkleTree tree, Token left, Token right, byte depth,
Hashable hashable)
{
@@ -497,89 +485,20 @@
}
/**
- * Consumes a collection of entries within this range.
- */
- public void validate(Collection<RowHash> entries)
- {
- PeekingIterator<RowHash> iter =
Iterators.peekingIterator(entries.iterator());
- validate(iter);
- }
-
- /**
- * Consumes an iterator over entries within this range, setting the
- * value of this range's Leaf to the computed value.
+ * @param entry Row to mix into the hash for this range.
*/
- public void validate(PeekingIterator<RowHash> entries)
+ public void addHash(RowHash entry)
{
assert tree != null : "Not intended for modification!";
assert hashable instanceof Leaf;
- byte[] roothash;
- try
- {
- roothash = validateHelper(left(), right(), depth, entries);
- }
- catch (StopRecursion e)
- {
- throw new RuntimeException("Iterator contained invalid
entry!");
- }
-
- // check that all values were consumed from the iterator, and that
- // a valid hash could be generated
- if (entries.hasNext() || roothash == null)
- throw new RuntimeException("Bad iterator for " + this + "!");
- hashable.hash(roothash);
- }
- /**
- * Collects values from the given iterator that fall into the
- * range represented by left and right. Recurses until we reach
- * hashdepth, where hashes are added sequentially, and then binary
- * hashes the results back to the root.
- *
- * @param left The left token of the active range.
- * @param right The right token of the active range.
- * @param depth The depth of the active range.
- * @param entries A peek()able iterator.
- */
- private byte[] validateHelper(Token left, Token right, byte depth,
PeekingIterator<RowHash> entries) throws StopRecursion.InvalidHash
- {
- if (entries.hasNext() && Range.contains(left, right,
entries.peek().token))
- {
- // see if we can recurse deeper
- if (depth < tree.hashdepth)
- {
- Token midpoint = tree.partitioner().midpoint(left, right);
- if (left.compareTo(midpoint) < 0 &&
midpoint.compareTo(right) < 0)
- {
- // we can recurse deeper
- byte[] lhash = validateHelper(left, midpoint,
inc(depth), entries);
- byte[] rhash = validateHelper(midpoint, right,
inc(depth), entries);
- return Hashable.binaryHash(lhash, rhash);
- }
- // else: the Token impl. cannot provide more resolution
for this range
- }
-
- // hash relevant values from the iterator, and add to the
context
- return consume(left, right, entries);
- }
- else
- {
- // this range is empty: return static hash value:
- // the hash is the one generated by a binary tree of depth
(tree.hashdepth-depth)
- return EMPTIES[tree.hashdepth-depth];
- }
+ hashable.addHash(entry.hash);
}
- /**
- * Consumes and sequentially hashes values from the iterator that fall
into the active
- * range. Should be called with an iterator that contains at least one
matching entry.
- */
- private byte[] consume(Token left, Token right,
PeekingIterator<RowHash> entries)
+ public void addAll(Iterator<RowHash> entries)
{
- byte[] sequentialHash = entries.next().hash;
- while (entries.hasNext() && Range.contains(left, right,
entries.peek().token))
- sequentialHash = Hashable.binaryHash(sequentialHash,
entries.next().hash);
- return sequentialHash;
+ while (entries.hasNext())
+ addHash(entries.next());
}
@Override
@@ -777,7 +696,8 @@
/**
* Hash value representing a row, to be used to pass hashes to the
MerkleTree.
- * The byte[] hash value should contain a digest of the key and value of
the row.
+ * The byte[] hash value should contain a digest of the key and value of
the row
+ * created using a very strong hash function.
*/
public static class RowHash
{
@@ -831,15 +751,24 @@
}
/**
+ * Mixes the given value into our hash. If our hash is null,
+ * our hash will become the given value.
+ */
+ void addHash(byte[] righthash)
+ {
+ if (hash == null)
+ hash = righthash;
+ else
+ hash = binaryHash(hash, righthash);
+ }
+
+ /**
* The primitive with which all hashing should be accomplished: hashes
* a left and right value together.
*/
static byte[] binaryHash(final byte[] left, final byte[] right)
{
- if (left == null || right == null)
- return null;
- else
- return FBUtilities.hash("MD5", left, right);
+ return FBUtilities.xor(left, right);
}
public abstract void toString(StringBuilder buff, int maxdepth);
Modified:
incubator/cassandra/trunk/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
URL:
http://svn.apache.org/viewvc/incubator/cassandra/trunk/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java?rev=891040&r1=891039&r2=891040&view=diff
==============================================================================
---
incubator/cassandra/trunk/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
(original)
+++
incubator/cassandra/trunk/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
Tue Dec 15 22:30:06 2009
@@ -405,7 +405,7 @@
// validate the tree
TreeRangeIterator ranges = mt.invalids(new Range(tok(0), tok(0)));
for (TreeRange range : ranges)
- range.validate(new HIterator(/*empty*/ new int[0]));
+ range.addHash(new RowHash(range.right(), new byte[0]));
assert null != mt.hash(new Range(tok(0), tok(0))) :
"Could not hash tree " + mt;
@@ -433,12 +433,12 @@
mt.split(tok(10));
ranges = mt.invalids(full);
- ranges.next().validate(new HIterator(2, 4)); // (0,4]: depth 2
- ranges.next().validate(new HIterator(6)); // (4,6]
- ranges.next().validate(new HIterator(8)); // (6,8]
- ranges.next().validate(new HIterator(/*empty*/ new int[0])); // (8,10]
- ranges.next().validate(new HIterator(12)); // (10,12]
- ranges.next().validate(new HIterator(14, 0)); // (12,0]: depth 2
+ ranges.next().addAll(new HIterator(2, 4)); // (0,4]: depth 2
+ ranges.next().addAll(new HIterator(6)); // (4,6]
+ ranges.next().addAll(new HIterator(8)); // (6,8]
+ ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (8,10]
+ ranges.next().addAll(new HIterator(12)); // (10,12]
+ ranges.next().addAll(new HIterator(14, 0)); // (12,0]: depth 2
mt2.split(tok(8));
@@ -450,14 +450,14 @@
mt2.split(tok(11));
ranges = mt2.invalids(full);
- ranges.next().validate(new HIterator(2)); // (0,2]
- ranges.next().validate(new HIterator(4)); // (2,4]
- ranges.next().validate(new HIterator(6, 8)); // (4,8]: depth 2
- ranges.next().validate(new HIterator(/*empty*/ new int[0])); // (8,9]
- ranges.next().validate(new HIterator(/*empty*/ new int[0])); // (9,10]
- ranges.next().validate(new HIterator(/*empty*/ new int[0])); //
(10,11]: depth 4
- ranges.next().validate(new HIterator(12)); // (11,12]: depth 4
- ranges.next().validate(new HIterator(14, 0)); // (12,0]: depth 2
+ ranges.next().addAll(new HIterator(2)); // (0,2]
+ ranges.next().addAll(new HIterator(4)); // (2,4]
+ ranges.next().addAll(new HIterator(6, 8)); // (4,8]: depth 2
+ ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (8,9]
+ ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (9,10]
+ ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (10,11]:
depth 4
+ ranges.next().addAll(new HIterator(12)); // (11,12]: depth 4
+ ranges.next().addAll(new HIterator(14, 0)); // (12,0]: depth 2
byte[] mthash = mt.hash(full);
byte[] mt2hash = mt2.hash(full);
@@ -475,7 +475,7 @@
mt.maxsize(256);
mt.init();
for (TreeRange range : mt.invalids(full))
- range.validate(new HIterator(range.right()));
+ range.addAll(new HIterator(range.right()));
byte[] initialhash = mt.hash(full);
oout.writeObject(mt);
@@ -522,9 +522,9 @@
// add dummy hashes to the rest of both trees
for (TreeRange range : mt.invalids(full))
- range.validate(new HIterator(range.right()));
+ range.addAll(new HIterator(range.right()));
for (TreeRange range : mt2.invalids(full))
- range.validate(new HIterator(range.right()));
+ range.addAll(new HIterator(range.right()));
// trees should disagree for leftmost, (middle.left, rightmost.right]
List<TreeRange> diffs = MerkleTree.difference(mt, mt2);
@@ -564,7 +564,7 @@
return hstack.pop();
}
- static class HIterator extends AbstractIterator<RowHash> implements
PeekingIterator<RowHash>
+ static class HIterator extends AbstractIterator<RowHash>
{
private Iterator<Token> tokens;