Author: jimk Date: Mon Jan 14 00:09:11 2008 New Revision: 611734 URL: http://svn.apache.org/viewvc?rev=611734&view=rev Log: HADOOP-2558 org.onelab.filter.BloomFilter class uses 8X the memory it should be using
Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/BloomFilter.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/CountingBloomFilter.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/DynamicBloomFilter.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/Filter.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/RetouchedBloomFilter.java Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt?rev=611734&r1=611733&r2=611734&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt (original) +++ lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt Mon Jan 14 00:09:11 2008 @@ -177,6 +177,8 @@ HADOOP-2548 Make TableMap and TableReduce generic (Frederik Hedberg via Stack) HADOOP-2557 Shell count function (Edward Yoon via Stack) + HADOOP-2558 org.onelab.filter.BloomFilter class uses 8X the memory it should + be using Release 0.15.1 Branch 0.15 Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/BloomFilter.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/BloomFilter.java?rev=611734&r1=611733&r2=611734&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/BloomFilter.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/BloomFilter.java Mon Jan 14 00:09:11 2008 @@ -51,6 +51,8 @@ import java.io.DataOutput; import java.io.IOException; +import java.util.BitSet; + /** * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970. * <p> @@ -72,11 +74,24 @@ * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a> */ public class BloomFilter extends Filter { + private static final byte[] bitvalues = new byte[] { + (byte)0x01, + (byte)0x02, + (byte)0x04, + (byte)0x08, + (byte)0x10, + (byte)0x20, + (byte)0x40, + (byte)0x80 + }; + /** The bit vector. */ - boolean[] vector; + BitSet bits; /** Default constructor - use with readFields */ - public BloomFilter() {} + public BloomFilter() { + super(); + } /** * Constructor @@ -86,7 +101,7 @@ public BloomFilter(int vectorSize, int nbHash){ super(vectorSize, nbHash); - vector = new boolean[this.vectorSize]; + bits = new BitSet(this.vectorSize); }//end constructor /** [EMAIL PROTECTED] */ @@ -100,7 +115,7 @@ hash.clear(); for(int i = 0; i < nbHash; i++) { - vector[h[i]] = true; + bits.set(h[i]); } }//end add() @@ -114,11 +129,7 @@ throw new IllegalArgumentException("filters cannot be and-ed"); } - BloomFilter bf = (BloomFilter)filter; - - for(int i = 0; i < vectorSize; i++) { - this.vector[i] &= bf.vector[i]; - } + this.bits.and(((BloomFilter) filter).bits); }//end and() /** [EMAIL PROTECTED] */ @@ -131,7 +142,7 @@ int[] h = hash.hash(key); hash.clear(); for(int i = 0; i < nbHash; i++) { - if(!vector[h[i]]) { + if(!bits.get(h[i])) { return false; } } @@ -141,9 +152,7 @@ /** [EMAIL PROTECTED] */ @Override public void not(){ - for(int i = 0; i < vectorSize; i++) { - vector[i] = !vector[i]; - } + bits.flip(0, vectorSize - 1); }//end not() /** [EMAIL PROTECTED] */ @@ -155,12 +164,7 @@ || filter.nbHash != this.nbHash) { throw new IllegalArgumentException("filters cannot be or-ed"); } - - BloomFilter bf = (BloomFilter)filter; - - for(int i = 0; i < vectorSize; i++) { - this.vector[i] |= bf.vector[i]; - } + bits.or(((BloomFilter) filter).bits); }//end or() /** [EMAIL PROTECTED] */ @@ -172,24 +176,13 @@ || filter.nbHash != this.nbHash) { throw new IllegalArgumentException("filters cannot be xor-ed"); } - - BloomFilter bf = (BloomFilter)filter; - - for(int i = 0; i < vectorSize; i++) { - this.vector[i] = (this.vector[i] && !bf.vector[i]) - || (!this.vector[i] && bf.vector[i]); - } + bits.xor(((BloomFilter) filter).bits); }//and xor() /** [EMAIL PROTECTED] */ @Override public String toString(){ - StringBuilder res = new StringBuilder(); - - for(int i = 0; i < vectorSize; i++) { - res.append(vector[i] ? "1" : "0"); - } - return res.toString(); + return bits.toString(); }//end toString() /** [EMAIL PROTECTED] */ @@ -200,56 +193,50 @@ return bf; }//end clone() - /** [EMAIL PROTECTED] */ - @Override - public boolean equals(Object o) { - return this.compareTo(o) == 0; - } - - /** [EMAIL PROTECTED] */ - @Override - public int hashCode() { - int result = super.hashCode(); - for(int i = 0; i < vector.length; i++) { - result ^= Boolean.valueOf(vector[i]).hashCode(); - } - return result; - } - // Writable /** [EMAIL PROTECTED] */ @Override public void write(DataOutput out) throws IOException { super.write(out); - for(int i = 0; i < vector.length; i++) { - out.writeBoolean(vector[i]); + byte[] bytes = new byte[getNBytes()]; + for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { + if (bitIndex == 8) { + bitIndex = 0; + byteIndex++; + } + if (bitIndex == 0) { + bytes[byteIndex] = 0; + } + if (bits.get(i)) { + bytes[byteIndex] |= bitvalues[bitIndex]; + } } + out.write(bytes); } /** [EMAIL PROTECTED] */ @Override public void readFields(DataInput in) throws IOException { super.readFields(in); - vector = new boolean[vectorSize]; - for(int i = 0; i < vector.length; i++) { - vector[i] = in.readBoolean(); + byte[] bytes = new byte[getNBytes()]; + in.readFully(bytes); + for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { + if (bitIndex == 8) { + bitIndex = 0; + byteIndex++; + } + if (bitIndex == 0) { + bytes[byteIndex] = 0; + } + if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) { + bits.set(i); + } } } - - // Comparable - /** [EMAIL PROTECTED] */ - @Override - public int compareTo(Object o) { - int result = super.compareTo(o); - - BloomFilter other = (BloomFilter)o; - - for(int i = 0; result == 0 && i < vector.length; i++) { - result = (vector[i] == other.vector[i] ? 0 - : (vector[i] ? 1 : -1)); - } - return result; - }// end compareTo + /* @return number of bytes needed to hold bit vector */ + private int getNBytes() { + return (vectorSize + 7) / 8; + } }//end class Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/CountingBloomFilter.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/CountingBloomFilter.java?rev=611734&r1=611733&r2=611734&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/CountingBloomFilter.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/CountingBloomFilter.java Mon Jan 14 00:09:11 2008 @@ -213,22 +213,6 @@ return cbf; }//end clone() - /** [EMAIL PROTECTED] */ - @Override - public boolean equals(Object o) { - return this.compareTo(o) == 0; - } - - /** [EMAIL PROTECTED] */ - @Override - public int hashCode() { - int result = super.hashCode(); - for(int i = 0; i < vector.length; i++) { - result ^= Byte.valueOf(vector[i]).hashCode(); - } - return result; - } - // Writable /** [EMAIL PROTECTED] */ @@ -249,25 +233,4 @@ vector[i] = in.readByte(); } } - - // Comparable - - /** [EMAIL PROTECTED] */ - @Override - public int compareTo(Object o) { - int result = super.compareTo(o); - - if(result == 0) { - CountingBloomFilter other = (CountingBloomFilter)o; - - for(int i = 0; i < vector.length; i++) { - result = vector[i] - other.vector[i]; - - if(result != 0) { - break; - } - } - } - return result; - }// end compareTo }//end class Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/DynamicBloomFilter.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/DynamicBloomFilter.java?rev=611734&r1=611733&r2=611734&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/DynamicBloomFilter.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/DynamicBloomFilter.java Mon Jan 14 00:09:11 2008 @@ -247,22 +247,6 @@ return dbf; }//end clone() - /** [EMAIL PROTECTED] */ - @Override - public boolean equals(Object o) { - return this.compareTo(o) == 0; - } - - /** [EMAIL PROTECTED] */ - @Override - public int hashCode() { - int result = super.hashCode(); - for(int i = 0; i < matrix.length; i++) { - result ^= matrix[i].hashCode(); - } - return result; - } - // Writable /** [EMAIL PROTECTED] */ @@ -283,35 +267,6 @@ matrix[i].readFields(in); } } - - // Comparable - - /** [EMAIL PROTECTED] */ - @Override - public int compareTo(Object o) { - int result = super.compareTo(o); - - if(result == 0) { - DynamicBloomFilter other = (DynamicBloomFilter)o; - - result = this.nr - other.nr; - - if(result == 0) { - result = this.currentNbRecord - other.currentNbRecord; - - if(result == 0) { - for(int i = 0; i < matrix.length; i++) { - result = matrix[i].compareTo(other.matrix[i]) ; - - if(result != 0) { - break; - } - } - } - } - } - return result; - }// end compareTo /** * Adds a new row to <i>this</i> dynamic Bloom filter. Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/Filter.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/Filter.java?rev=611734&r1=611733&r2=611734&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/Filter.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/Filter.java Mon Jan 14 00:09:11 2008 @@ -54,7 +54,7 @@ import java.io.IOException; import java.util.Collection; import java.util.List; -import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.Writable; /** * Defines the general behavior of a filter. @@ -74,7 +74,7 @@ * @see org.onelab.filter.Key The general behavior of a key * @see org.onelab.filter.HashFunction A hash function */ -public abstract class Filter implements WritableComparable { +public abstract class Filter implements Writable { /** The vector size of <i>this</i> filter. */ int vectorSize; @@ -182,14 +182,6 @@ } }//end add() - /** [EMAIL PROTECTED] */ - @Override - public int hashCode() { - int result = Integer.valueOf(this.nbHash).hashCode(); - result ^= Integer.valueOf(this.vectorSize); - return result; - } - // Writable interface /** [EMAIL PROTECTED] */ @@ -204,19 +196,4 @@ this.vectorSize = in.readInt(); this.hash = new HashFunction(this.vectorSize, this.nbHash); } - - // Comparable interface - - /** [EMAIL PROTECTED] */ - public int compareTo(Object o) { - Filter other = (Filter)o; - int result = this.vectorSize - other.vectorSize; - if(result == 0) { - result = this.nbHash - other.nbHash; - } - - return result; - } - - }//end class Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/RetouchedBloomFilter.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/RetouchedBloomFilter.java?rev=611734&r1=611733&r2=611734&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/RetouchedBloomFilter.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/onelab/filter/RetouchedBloomFilter.java Mon Jan 14 00:09:11 2008 @@ -118,7 +118,7 @@ hash.clear(); for(int i = 0; i < nbHash; i++) { - vector[h[i]] = true; + bits.set(h[i]); keyVector[h[i]].add(key); }//end for - i }//end add() @@ -333,7 +333,7 @@ ratio[index] = 0.0; //update bit vector - vector[index] = false; + bits.clear(index); }//end clearBit() /** @@ -395,28 +395,6 @@ }//end for -i }//end createVector() - /** [EMAIL PROTECTED] */ - @Override - public boolean equals(Object o) { - return this.compareTo(o) == 0; - } - - /** [EMAIL PROTECTED] */ - @Override - public int hashCode() { - int result = super.hashCode(); - for(int i = 0; i < fpVector.length; i++) { - result ^= fpVector[i].hashCode(); - } - for(int i = 0; i < keyVector.length; i++) { - result ^= keyVector[i].hashCode(); - } - for(int i = 0; i < ratio.length; i++) { - result ^= Double.valueOf(ratio[i]).hashCode(); - } - return result; - } - // Writable /** [EMAIL PROTECTED] */ @@ -469,38 +447,4 @@ ratio[i] = in.readDouble(); } } - - // Comparable - - /** [EMAIL PROTECTED] */ - @Override - public int compareTo(Object o) { - int result = super.compareTo(o); - - RetouchedBloomFilter other = (RetouchedBloomFilter)o; - - for(int i = 0; result == 0 && i < fpVector.length; i++) { - List<Key> mylist = fpVector[i]; - List<Key> otherlist = other.fpVector[i]; - - for(int j = 0; result == 0 && j < mylist.size(); j++) { - result = mylist.get(j).compareTo(otherlist.get(j)); - } - } - - for(int i = 0; result == 0 && i < keyVector.length; i++) { - List<Key> mylist = keyVector[i]; - List<Key> otherlist = other.keyVector[i]; - - for(int j = 0; result == 0 && j < mylist.size(); j++) { - result = mylist.get(j).compareTo(otherlist.get(j)); - } - } - - for(int i = 0; result == 0 && i < ratio.length; i++) { - result = Double.valueOf(this.ratio[i] - other.ratio[i]).intValue(); - } - - return result; - }// end compareTo }//end class