Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java?rev=1339121&view=auto ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java (added) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java Wed May 16 11:29:08 2012 @@ -0,0 +1,213 @@ +/* +Copyright � 1999 CERN - European Organization for Nuclear Research. +Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose +is hereby granted without fee, provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear in supporting documentation. +CERN makes no representations about the suitability of this software for any purpose. +It is provided "as is" without expressed or implied warranty. +*/ +package org.apache.mahout.math.map; + +/** + * Status: Experimental; Do not use for production yet. Hash map holding (key,value) associations of type + * <tt>(int-->int)</tt>; Automatically grows and shrinks as needed; Implemented using open addressing with double + * hashing. First see the <a href="package-summary.html">package summary</a> and javadoc <a + * href="package-tree.html">tree view</a> to get the broad picture. + * + * Implements open addressing with double hashing, using "Brent's variation". Brent's variation slows insertions a bit + * down (not much) but reduces probes (collisions) for successful searches, in particular for large load factors. (It + * does not improve unsuccessful searches.) See D. Knuth, Searching and Sorting, 3rd ed., p.533-545 + * + * @author [email protected] + * @version 1.0, 09/24/99 + * @see java.util.HashMap + */ +class QuickOpenIntIntHashMap extends OpenIntIntHashMap { + //public int totalProbesSaved = 0; // benchmark only + + /** Constructs an empty map with default capacity and default load factors. */ + QuickOpenIntIntHashMap() { + this(defaultCapacity); + } + + /** + * Constructs an empty map with the specified initial capacity and default load factors. + * + * @param initialCapacity the initial capacity of the map. + * @throws IllegalArgumentException if the initial capacity is less than zero. + */ + QuickOpenIntIntHashMap(int initialCapacity) { + this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor); + } + + /** + * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor. + * + * @param initialCapacity the initial capacity. + * @param minLoadFactor the minimum load factor. + * @param maxLoadFactor the maximum load factor. + * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) || + * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >= + * maxLoadFactor)</tt>. + */ + QuickOpenIntIntHashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) { + setUp(initialCapacity, minLoadFactor, maxLoadFactor); + } + + /** + * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if + * existing. + * + * @param key the key the value shall be associated with. + * @param value the value to be associated. + * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did + * already contain such a key - the new value has now replaced the formerly associated value. + */ + @Override + public boolean put(int key, int value) { + /* + This is open addressing with double hashing, using "Brent's variation". + Brent's variation slows insertions a bit down (not much) but reduces probes (collisions) for successful searches, in particular for large load factors. + (It does not improve unsuccessful searches.) + See D. Knuth, Searching and Sorting, 3rd ed., p.533-545 + + h1(key) = hash % M + h2(key) = decrement = Max(1, hash/M % M) + M is prime = capacity = table.length + probing positions are table[(h1-j*h2) % M] for j=0,1,... + (M and h2 could also be chosen differently, but h2 is required to be relative prime to M.) + */ + + int[] tab = table; + byte[] stat = state; + int length = tab.length; + + int hash = HashFunctions.hash(key) & 0x7FFFFFFF; + int i = hash % length; + int decrement = (hash / length) % length; + if (decrement == 0) { + decrement = 1; + } + + // stop if we find a removed or free slot, or if we find the key itself + // do NOT skip over removed slots (yes, open addressing is like that...) + //int comp = comparisons; + int t = 0; // the number of probes + int p0 = i; // the first position to probe + while (stat[i] == FULL && tab[i] != key) { + t++; + i -= decrement; + //hashCollisions++; + if (i < 0) { + i += length; + } + } + if (stat[i] == FULL) { + // key already contained at slot i. + this.values[i] = value; + return false; + } + // not already contained, should be inserted at slot i. + + if (this.distinct > this.highWaterMark) { + int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor); + rehash(newCapacity); + return put(key, value); + } + + /* + Brent's variation does a local reorganization to reduce probes. It essentially means: + We test whether it is possible to move the association we probed first (table[p0]) out of the way. + If this is possible, it will reduce probes for the key to be inserted, since it takes its place; it gets hit earlier. + However, future probes for the key that we move out of the way will increase. + Thus we only move it out of the way, if we have a net gain, that is, if we save more probes than we loose. + For the first probe we safe more than we loose if the number of probes we needed was >=2 (t>=2). + If the first probe cannot be moved out of the way, we try the next probe (p1). + Now we safe more than we loose if t>=3. + We repeat this until we find that we cannot gain or that we can indeed move p(x) out of the way. + + Note: Under the great majority of insertions t<=1, so the loop is entered very infrequently. + */ + while (t > 1) { + int key0 = tab[p0]; + hash = HashFunctions.hash(key0) & 0x7FFFFFFF; + decrement = (hash / length) % length; + if (decrement == 0) { + decrement = 1; + } + int pc = p0 - decrement; // pc = (p0-j*decrement) % M, j=1,2,.. + if (pc < 0) { + pc += length; + } + + if (stat[pc] != FREE) { // not a free slot, continue searching for free slot to move to, or break. + p0 = pc; + t--; + } else { // free or removed slot found, now move... + tab[pc] = key0; + stat[pc] = FULL; + values[pc] = values[p0]; + i = p0; // prepare to insert: table[p0]=key + t = 0; // break loop + } + } + + this.table[i] = key; + this.values[i] = value; + if (this.state[i] == FREE) { + this.freeEntries--; + } + this.state[i] = FULL; + this.distinct++; + + if (this.freeEntries < 1) { //delta + int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor); + rehash(newCapacity); + } + + return true; + } + + /** + * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called + * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water + * mark. + */ + @Override + protected void rehash(int newCapacity) { + int oldCapacity = table.length; + //if (oldCapacity == newCapacity) return; + + int[] oldTable = table; + int[] oldValues = values; + byte[] oldState = state; + + int[] newTable = new int[newCapacity]; + int[] newValues = new int[newCapacity]; + byte[] newState = new byte[newCapacity]; + + this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor); + this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor); + + this.table = newTable; + this.values = newValues; + this.state = newState; + this.freeEntries = newCapacity - this.distinct; // delta + + int tmp = this.distinct; + this.distinct = Integer.MIN_VALUE; // switch of watermarks + for (int i = oldCapacity; i-- > 0;) { + if (oldState[i] == FULL) { + put(oldTable[i], oldValues[i]); + /* + int element = oldTable[i]; + int index = indexOfInsertion(element); + newTable[index]=element; + newValues[index]=oldValues[i]; + newState[index]=FULL; + */ + } + } + this.distinct = tmp; + } +}
Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html?rev=1339121&view=auto ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html (added) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html Wed May 16 11:29:08 2012 @@ -0,0 +1,290 @@ +<HTML> +<BODY> +Automatically growing and shrinking maps holding objects or primitive +data types such as <tt>int</tt>, <tt>double</tt>, etc. Currently all maps are +based upon hashing. +<h2><a name="Overview"></a>1. Overview</h2> + +<p>The map package offers flexible object oriented abstractions modelling automatically + resizing maps. It is designed to be scalable in terms of performance and memory + requirements.</p> + +<p>Features include: </p> + +<p></p> +<ul> + <li>Maps operating on objects as well as all primitive data types such as <code>int</code>, + <code>double</code>, etc. + </li> + <li>Compact representations</li> + <li>Support for quick access to associations</li> + <li>A number of general purpose map operations</li> +</ul> +<p>File-based I/O can be achieved through the standard Java built-in serialization + mechanism. All classes implement the {@link java.io.Serializable} interface. + However, the toolkit is entirely decoupled from advanced I/O. It provides data + structures and algorithms only. + +<p> This toolkit borrows some terminology from the Javasoft <a + href="http://www.javasoft.com/products/jdk/1.2/docs/guide/collections/index.html"> + Collections framework</a> written by Josh Bloch and introduced in JDK 1.2. + +<h2>2. Introduction</h2> + +<p>A map is an associative container that manages a set of (key,value) pairs. + It is useful for implementing a collection of one-to-one mappings. A (key,value) + pair is called an <i>association</i>. A value can be looked up up via its key. + Associations can quickly be set, removed and retrieved. They are stored in a + hashing structure based on the hash code of their keys, which is obtained by + using a hash function. </p> + +<p> A map can, for example, contain <tt>Name-->Location</tt> associations like + <tt>{("Pete", "Geneva"), ("Steve", "Paris"), ("Robert", "New York")}</tt> used + in address books or <tt>Index-->Value</tt> mappings like <tt>{(0, 100), (3, + 1000), (100000, 70)}</tt> representing sparse lists or matrices. For example + this could mean at index 0 we have a value of 100, at index 3 we have a value + of 1000, at index 1000000 we have a value of 70, and at all other indexes we + have a value of, say, zero. Another example is a map of IP addresses to domain + names (DNS). Maps can also be useful to represent<i> multi sets</i>, that is, + sets where elements can occur more than once. For multi sets one would have + <tt>Value-->Frequency</tt> mappings like <tt>{(100, 1), (50, 1000), (101, 3))}</tt> + meaning element 100 occurs 1 time, element 50 occurs 1000 times, element 101 + occurs 3 times. Further, maps can also manage <tt>ObjectIdentifier-->Object</tt> + mappings like <tt>{(12, obj1), (7, obj2), (10000, obj3), (9, obj4)}</tt> used + in Object Databases. + +<p> A map cannot contain two or more <i>equal</i> keys; a key can map to at most + one value. However, more than one key can map to identical values. For primitive + data types "equality" of keys is defined as identity (operator <tt>==</tt>). + For maps using <tt>Object</tt> keys, the meaning of "equality" can be specified + by the user upon instance construction. It can either be defined to be identity + (operator <tt>==</tt>) or to be given by the method {@link java.lang.Object#equals(Object)}. + Associations of kind <tt>(AnyType,Object)</tt> can be of the form <tt>(AnyKey,null) + </tt>, i.e. values can be <tt>null</tt>. + +<p> The classes of this package make no guarantees as to the order of the elements + returned by iterators; in particular, they do not guarantee that the order will + remain constant over time. + +<h2></h2> +<h4>Copying</h4> + +<p> + +<p>Any map can be copied. A copy is <i>equal</i> to the original but entirely + independent of the original. So changes in the copy are not reflected in the + original, and vice-versa. + +<h2>3. Package organization</h2> + +<p>For most primitive data types and for objects there exists a separate map version. + All versions are just the same, except that they operate on different data types. + Colt includes two kinds of implementations for maps: The two different implementations + are tagged <b>Chained</b> and <b>Open</b>. + Note: Chained is no more included. Wherever it is mentioned it is of historic interest only.</p> +<ul> + <li><b>Chained</b> uses extendible separate chaining with chains holding unsorted + dynamically linked collision lists. + <li><b>Open</b> uses extendible open addressing with double hashing. +</ul> +<p>Class naming follows the schema <tt><Implementation><KeyType><ValueType>HashMap</tt>. + For example, a {@link org.apache.mahout.math.map.OpenIntDoubleHashMap} holds <tt>(int-->double)</tt> + associations and is implemented with open addressing. A {@link org.apache.mahout.math.map.OpenIntObjectHashMap} + holds <tt>(int-->Object)</tt> associations and is implemented with open addressing. +</p> + +<p>The classes for maps of a given (key,value) type are derived from a common + abstract base class tagged <tt>Abstract<KeyType><ValueType></tt><tt>Map</tt>. + For example, all maps operating on <tt>(int-->double)</tt> associations are + derived from {@link org.apache.mahout.math.map.AbstractIntDoubleMap}, which in turn is derived + from an abstract base class tying together all maps regardless of assocation + type, {@link org.apache.mahout.math.set.AbstractSet}. The abstract base classes provide skeleton + implementations for all but few methods. Experimental layouts (such as chaining, + open addressing, extensible hashing, red-black-trees, etc.) can easily be implemented + and inherit a rich set of functionality. Have a look at the javadoc <a href="package-tree.html">tree + view</a> to get the broad picture.</p> + +<h2>4. Example usage</h2> +<TABLE> + <TD CLASS="PRE"> + <PRE> +int[] keys = {0 , 3 , 100000, 9 }; +double[] values = {100.0, 1000.0, 70.0 , 71.0}; + +AbstractIntDoubleMap map = new OpenIntDoubleHashMap(); + +// add several associations +for (int i=0; i < keys.length; i++) map.put(keys[i], values[i]); + +log.info("map="+map); +log.info("size="+map.size()); +log.info(map.containsKey(3)); +log.info("get(3)="+map.get(3)); +log.info(map.containsKey(4)); +log.info("get(4)="+map.get(4)); +log.info(map.containsValue(71.0)); +log.info("keyOf(71.0)="+map.keyOf(71.0)); + +// remove one association +map.removeKey(3); +log.info("\nmap="+map); +log.info(map.containsKey(3)); +log.info("get(3)="+map.get(3)); +log.info(map.containsValue(1000.0)); +log.info("keyOf(1000.0)="+map.keyOf(1000.0)); + +// clear +map.clear(); +log.info("\nmap="+map); +log.info("size="+map.size()); +</PRE> + </TD> +</TABLE> + +yields the following output + +<TABLE> + <TD CLASS="PRE"> +<PRE> +map=[0->100.0, 3->1000.0, 9->71.0, 100000->70.0] +size=4 +true +get(3)=1000.0 +false +get(4)=0.0 +true +keyOf(71.0)=9 + +map=[0->100.0, 9->71.0, 100000->70.0] +false +get(3)=0.0 +false +keyOf(1000.0)=-2147483648 + +map=[] +size=0 +</PRE> + </TD> +</TABLE> +<h2> 5. Notes </h2> + +<p> + Note that implementations are not synchronized. + +<p> + Choosing efficient parameters for hash maps is not always easy. + However, since parameters determine efficiency and memory requirements, here is a quick guide how to choose them. + If your use case does not heavily operate on hash maps but uses them just because they provide + convenient functionality, you can safely skip this section. + For those of you who care, read on. + +<p> + + There are three parameters that can be customized upon map construction: <tt>initialCapacity</tt>, + <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>. + The more memory one can afford, the faster a hash map. + The hash map's capacity is the maximum number of associations that can be added without needing to allocate new + internal memory. + A larger capacity means faster adding, searching and removing. + The <tt>initialCapacity</tt> corresponds to the capacity used upon instance construction. + +<p> + The <tt>loadFactor</tt> of a hash map measures the degree of "fullness". + It is given by the number of assocations (<tt>size()</tt>) + divided by the hash map capacity <tt>(0.0 <= loadFactor <= 1.0)</tt>. + The more associations are added, the larger the loadFactor and the more hash map performance degrades. + Therefore, when the loadFactor exceeds a customizable threshold (<tt>maxLoadFactor</tt>), the hash map is + automatically grown. + In such a way performance degradation can be avoided. + Similarly, when the loadFactor falls below a customizable threshold (<tt>minLoadFactor</tt>), the hash map is + automatically shrinked. + In such a way excessive memory consumption can be avoided. + Automatic resizing (both growing and shrinking) obeys the following invariant: + +<p> + <tt>capacity * minLoadFactor <= size() <= capacity * maxLoadFactor</tt> + +<p> The term <tt>capacity * minLoadFactor</tt> is called the <i>low water mark</i>, + <tt>capacity * maxLoadFactor</tt> is called the <i>high water mark</i>. In other + words, the number of associations may vary within the water mark constraints. + When it goes out of range, the map is automatically resized and memory consumption + changes proportionally. +<ul> + <li>To tune for memory at the expense of performance, both increase <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>. + <li>To tune for performance at the expense of memory, both decrease <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>. + As as special case set <tt>minLoadFactor=0</tt> to avoid any automatic shrinking. +</ul> +Resizing large hash maps can be time consuming, <tt>O(size())</tt>, and should be avoided if possible (maintaining +primes is not the reason). +Unnecessary growing operations can be avoided if the number of associations is known before they are added, or can be +estimated.<p> + In such a case good parameters are as follows: + +<p> + <i>For chaining:</i> + <br>Set the <tt>initialCapacity = 1.4*expectedSize</tt> or greater. + <br>Set the <tt>maxLoadFactor = 0.8</tt> or greater. + +<p> + <i>For open addressing:</i> + <br>Set the <tt>initialCapacity = 2*expectedSize</tt> or greater. Alternatively call <tt>ensureCapacity(...)</tt>. + <br>Set the <tt>maxLoadFactor = 0.5</tt>. + <br>Never set <tt>maxLoadFactor > 0.55</tt>; open addressing exponentially slows down beyond that point. + +<p> + In this way the hash map will never need to grow and still stay fast. + It is never a good idea to set <tt>maxLoadFactor < 0.1</tt>, + because the hash map would grow too often. + If it is entirelly unknown how many associations the application will use, + the default constructor should be used. The map will grow and shrink as needed. + +<p> + + <b>Comparision of chaining and open addressing</b> + +<p> Chaining is faster than open addressing, when assuming unconstrained memory + consumption. Open addressing is more space efficient than chaining, because + it does not create entry objects but uses primitive arrays which are considerably + smaller. Entry objects consume significant amounts of memory compared to the + information they actually hold. Open addressing also poses no problems to the + garbage collector. In contrast, chaining can create millions of entry objects + which are linked; a nightmare for any garbage collector. In addition, entry + object creation is a bit slow. <br> + Therefore, with the same amount of memory, or even less memory, hash maps with + larger capacity can be maintained under open addressing, which yields smaller + loadFactors, which in turn keeps performance competitive with chaining. In our + benchmarks, using significantly less memory, open addressing usually is not + more than 1.2-1.5 times slower than chaining. + +<p><b>Further readings</b>: + <br>Knuth D., The Art of Computer Programming: Searching and Sorting, 3rd ed. + <br>Griswold W., Townsend G., The Design and Implementation of Dynamic Hashing for Sets and Tables in Icon, Software - + Practice and Experience, Vol. 23(4), 351-367 (April 1993). + <br>Larson P., Dynamic hash tables, Comm. of the ACM, 31, (4), 1988. + +<p> + <b>Performance:</b> + +<p> + Time complexity: + <br>The classes offer <i>expected</i> time complexity <tt>O(1)</tt> (i.e. constant time) for the basic operations + <tt>put</tt>, <tt>get</tt>, <tt>removeKey</tt>, <tt>containsKey</tt> and <tt>size</tt>, + assuming the hash function disperses the elements properly among the buckets. + Otherwise, pathological cases, although highly improbable, can occur, degrading performance to <tt>O(N)</tt> in the + worst case. + Operations <tt>containsValue</tt> and <tt>keyOf</tt> are <tt>O(N)</tt>. + +<p> + Memory requirements for <i>open addressing</i>: + <br>worst case: <tt>memory [bytes] = (1/minLoadFactor) * size() * (1 + sizeOf(key) + sizeOf(value))</tt>. + <br>best case: <tt>memory [bytes] = (1/maxLoadFactor) * size() * (1 + sizeOf(key) + sizeOf(value))</tt>. + Where <tt>sizeOf(int) = 4</tt>, <tt>sizeOf(double) = 8</tt>, <tt>sizeOf(Object) = 4</tt>, etc. + Thus, an <tt>OpenIntIntHashMap</tt> with minLoadFactor=0.25 and maxLoadFactor=0.5 and 1000000 associations uses + between 17 MB and 34 MB. + The same map with 1000 associations uses between 17 and 34 KB. + +<p> + +</BODY> +</HTML> + Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html?rev=1339121&view=auto ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html (added) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html Wed May 16 11:29:08 2012 @@ -0,0 +1,5 @@ +<HTML> +<BODY> +Core base classes; Operations on primitive arrays such as sorting, partitioning and permuting. +</BODY> +</HTML> Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java?rev=1339121&view=auto ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java (added) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java Wed May 16 11:29:08 2012 @@ -0,0 +1,188 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* +Copyright 1999 CERN - European Organization for Nuclear Research. +Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose +is hereby granted without fee, provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear in supporting documentation. +CERN makes no representations about the suitability of this software for any purpose. +It is provided "as is" without expressed or implied warranty. +*/ +package org.apache.mahout.math.set; + +import org.apache.mahout.math.PersistentObject; +import org.apache.mahout.math.map.PrimeFinder; + +public abstract class AbstractSet extends PersistentObject { + //public static boolean debug = false; // debug only + + /** The number of distinct associations in the map; its "size()". */ + protected int distinct; + + /** + * The table capacity c=table.length always satisfies the invariant <tt>c * minLoadFactor <= s <= c * + * maxLoadFactor</tt>, where s=size() is the number of associations currently contained. The term "c * minLoadFactor" + * is called the "lowWaterMark", "c * maxLoadFactor" is called the "highWaterMark". In other words, the table capacity + * (and proportionally the memory used by this class) oscillates within these constraints. The terms are precomputed + * and cached to avoid recalculating them each time put(..) or removeKey(...) is called. + */ + protected int lowWaterMark; + protected int highWaterMark; + + /** The minimum load factor for the hashtable. */ + protected double minLoadFactor; + + /** The maximum load factor for the hashtable. */ + protected double maxLoadFactor; + + // these are public access for unit tests. + public static final int defaultCapacity = 277; + public static final double defaultMinLoadFactor = 0.2; + public static final double defaultMaxLoadFactor = 0.5; + + /** + * Chooses a new prime table capacity optimized for growing that (approximately) satisfies the invariant <tt>c * + * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given size. + */ + protected int chooseGrowCapacity(int size, double minLoad, double maxLoad) { + return nextPrime(Math.max(size + 1, (int) ((4 * size / (3 * minLoad + maxLoad))))); + } + + /** + * Returns new high water mark threshold based on current capacity and maxLoadFactor. + * + * @return int the new threshold. + */ + protected int chooseHighWaterMark(int capacity, double maxLoad) { + return Math.min(capacity - 2, (int) (capacity * maxLoad)); //makes sure there is always at least one FREE slot + } + + /** + * Returns new low water mark threshold based on current capacity and minLoadFactor. + * + * @return int the new threshold. + */ + protected int chooseLowWaterMark(int capacity, double minLoad) { + return (int) (capacity * minLoad); + } + + /** + * Chooses a new prime table capacity neither favoring shrinking nor growing, that (approximately) satisfies the + * invariant <tt>c * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given + * size. + */ + protected int chooseMeanCapacity(int size, double minLoad, double maxLoad) { + return nextPrime(Math.max(size + 1, (int) ((2 * size / (minLoad + maxLoad))))); + } + + /** + * Chooses a new prime table capacity optimized for shrinking that (approximately) satisfies the invariant <tt>c * + * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given size. + */ + protected int chooseShrinkCapacity(int size, double minLoad, double maxLoad) { + return nextPrime(Math.max(size + 1, (int) ((4 * size / (minLoad + 3 * maxLoad))))); + } + + /** Removes all (key,value) associations from the receiver. */ + public abstract void clear(); + + /** + * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new + * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This + * method never need be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a + * large number of associations boosts performance, because the receiver will grow only once instead of potentially + * many times. <p> <b>This default implementation does nothing.</b> Override this method if necessary. + * + * @param minCapacity the desired minimum capacity. + */ + public void ensureCapacity(int minCapacity) { + } + + /** + * Returns <tt>true</tt> if the receiver contains no (key,value) associations. + * + * @return <tt>true</tt> if the receiver contains no (key,value) associations. + */ + public boolean isEmpty() { + return distinct == 0; + } + + /** + * Returns a prime number which is <code>>= desiredCapacity</code> and very close to <code>desiredCapacity</code> + * (within 11% if <code>desiredCapacity >= 1000</code>). + * + * @param desiredCapacity the capacity desired by the user. + * @return the capacity which should be used for a hashtable. + */ + protected int nextPrime(int desiredCapacity) { + return PrimeFinder.nextPrime(desiredCapacity); + } + + /** + * Initializes the receiver. You will almost certainly need to override this method in subclasses to initialize the + * hash table. + * + * @param initialCapacity the initial capacity of the receiver. + * @param minLoadFactor the minLoadFactor of the receiver. + * @param maxLoadFactor the maxLoadFactor of the receiver. + * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) || + * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >= + * maxLoadFactor)</tt>. + */ + protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) { + if (initialCapacity < 0) { + throw new IllegalArgumentException("Initial Capacity must not be less than zero: " + initialCapacity); + } + if (minLoadFactor < 0.0 || minLoadFactor >= 1.0) { + throw new IllegalArgumentException("Illegal minLoadFactor: " + minLoadFactor); + } + if (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) { + throw new IllegalArgumentException("Illegal maxLoadFactor: " + maxLoadFactor); + } + if (minLoadFactor >= maxLoadFactor) { + throw new IllegalArgumentException( + "Illegal minLoadFactor: " + minLoadFactor + " and maxLoadFactor: " + maxLoadFactor); + } + } + + /** + * Returns the number of (key,value) associations currently contained. + * + * @return the number of (key,value) associations currently contained. + */ + public int size() { + return distinct; + } + + /** + * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An + * application can use this operation to minimize the storage of the receiver. <p> This default implementation does + * nothing. Override this method if necessary. + */ + public void trimToSize() { + } + + protected static boolean equalsMindTheNull(Object a, Object b) { + if (a == null && b == null) { + return true; + } + if (a == null || b == null) { + return false; + } + return a.equals(b); + } +} Copied: mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java (from r1339109, mahout/trunk/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t) URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java?p2=mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java&p1=mahout/trunk/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t&r1=1339109&r2=1339121&rev=1339121&view=diff ============================================================================== --- mahout/trunk/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java Wed May 16 11:29:08 2012 @@ -19,40 +19,37 @@ package org.apache.mahout.math.set; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Set; -import org.apache.mahout.math.function.${keyTypeCap}Procedure; -import org.apache.mahout.math.list.${keyTypeCap}ArrayList; -import org.apache.mahout.math.map.HashFunctions; +import org.apache.mahout.math.function.ObjectProcedure; import org.apache.mahout.math.map.PrimeFinder; /** - * Open hash set of ${keyType} items; + * Open hashing alternative to java.util.HashSet. **/ -public class Open${keyTypeCap}HashSet extends Abstract${keyTypeCap}Set { +public class OpenHashSet<T> extends AbstractSet implements Set<T> { protected static final byte FREE = 0; protected static final byte FULL = 1; protected static final byte REMOVED = 2; -#if (${keyTypeFloating} == 'true') -#set ($noKeyComment = "${keyTypeCap}.NaN") - protected static final ${keyType} NO_KEY_VALUE = ${keyTypeCap}.NaN; -#else -#set ($noKeyComment = "0") - protected static final ${keyType} NO_KEY_VALUE = 0; -#end + protected static final char NO_KEY_VALUE = 0; /** The hash table keys. */ - protected ${keyType}[] table; + private Object[] table; /** The state of each hash table entry (FREE, FULL, REMOVED). */ - protected byte[] state; + private byte[] state; /** The number of table entries in state==FREE. */ - protected int freeEntries; + private int freeEntries; /** Constructs an empty map with default capacity and default load factors. */ - public Open${keyTypeCap}HashSet() { + public OpenHashSet() { this(defaultCapacity); } @@ -62,7 +59,7 @@ public class Open${keyTypeCap}HashSet ex * @param initialCapacity the initial capacity of the map. * @throws IllegalArgumentException if the initial capacity is less than zero. */ - public Open${keyTypeCap}HashSet(int initialCapacity) { + public OpenHashSet(int initialCapacity) { this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor); } @@ -76,7 +73,7 @@ public class Open${keyTypeCap}HashSet ex * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >= * maxLoadFactor)</tt>. */ - public Open${keyTypeCap}HashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) { + public OpenHashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) { setUp(initialCapacity, minLoadFactor, maxLoadFactor); } @@ -94,9 +91,10 @@ public class Open${keyTypeCap}HashSet ex * * @return a deep copy of the receiver. */ + @SuppressWarnings("unchecked") @Override public Object clone() { - Open${keyTypeCap}HashSet copy = (Open${keyTypeCap}HashSet) super.clone(); + OpenHashSet<T> copy = (OpenHashSet<T>) super.clone(); copy.table = copy.table.clone(); copy.state = copy.state.clone(); return copy; @@ -108,8 +106,9 @@ public class Open${keyTypeCap}HashSet ex * @return <tt>true</tt> if the receiver contains the specified key. */ @Override - public boolean contains(${keyType} key) { - return indexOfKey(key) >= 0; + @SuppressWarnings("unchecked") + public boolean contains(Object key) { + return indexOfKey((T)key) >= 0; } /** @@ -140,11 +139,11 @@ public class Open${keyTypeCap}HashSet ex * continues. * @return <tt>false</tt> if the procedure stopped before all keys where iterated over, <tt>true</tt> otherwise. */ - @Override - public boolean forEachKey(${keyTypeCap}Procedure procedure) { + @SuppressWarnings("unchecked") + public boolean forEachKey(ObjectProcedure<T> procedure) { for (int i = table.length; i-- > 0;) { if (state[i] == FULL) { - if (!procedure.apply(table[i])) { + if (!procedure.apply((T)table[i])) { return false; } } @@ -159,10 +158,12 @@ public class Open${keyTypeCap}HashSet ex * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at * slot index. */ - protected int indexOfInsertion(${keyType} key) { - final int length = table.length; + protected int indexOfInsertion(T key) { + Object[] tab = table; + byte[] stat = state; + int length = tab.length; - final int hash = HashFunctions.hash(key) & 0x7FFFFFFF; + int hash = key.hashCode() & 0x7FFFFFFF; int i = hash % length; int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html //int decrement = (hash / length) % length; @@ -172,7 +173,7 @@ public class Open${keyTypeCap}HashSet ex // stop if we find a removed or free slot, or if we find the key itself // do NOT skip over removed slots (yes, open addressing is like that...) - while (state[i] == FULL && table[i] != key) { + while (stat[i] == FULL && tab[i] != key) { i -= decrement; //hashCollisions++; if (i < 0) { @@ -180,25 +181,25 @@ public class Open${keyTypeCap}HashSet ex } } - if (state[i] == REMOVED) { + if (stat[i] == REMOVED) { // stop if we find a free slot, or if we find the key itself. // do skip over removed slots (yes, open addressing is like that...) // assertion: there is at least one FREE slot. - final int j = i; - while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) { + int j = i; + while (stat[i] != FREE && (stat[i] == REMOVED || tab[i] != key)) { i -= decrement; //hashCollisions++; if (i < 0) { i += length; } } - if (state[i] == FREE) { + if (stat[i] == FREE) { i = j; } } - if (state[i] == FULL) { + if (stat[i] == FULL) { // key already contained at slot i. // return a negative number identifying the slot. return -i - 1; @@ -212,10 +213,12 @@ public class Open${keyTypeCap}HashSet ex * @param key the key to be searched in the receiver. * @return the index where the key is contained in the receiver, returns -1 if the key was not found. */ - protected int indexOfKey(${keyType} key) { - final int length = table.length; + protected int indexOfKey(T key) { + Object[] tab = table; + byte[] stat = state; + int length = tab.length; - final int hash = HashFunctions.hash(key) & 0x7FFFFFFF; + int hash = key.hashCode() & 0x7FFFFFFF; int i = hash % length; int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html //int decrement = (hash / length) % length; @@ -225,7 +228,7 @@ public class Open${keyTypeCap}HashSet ex // stop if we find a free slot, or if we find the key itself. // do skip over removed slots (yes, open addressing is like that...) - while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) { + while (stat[i] != FREE && (stat[i] == REMOVED || (!key.equals(tab[i])))) { i -= decrement; //hashCollisions++; if (i < 0) { @@ -233,7 +236,7 @@ public class Open${keyTypeCap}HashSet ex } } - if (state[i] == FREE) { + if (stat[i] == FREE) { return -1; } // not found return i; //found, return index where key is contained @@ -241,39 +244,32 @@ public class Open${keyTypeCap}HashSet ex /** * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this - * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to - * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}. - * <p> This method can be used + * call returns the specified list has a new size that equals <tt>this.size()</tt>. + * This method can be used * to iterate over the keys of the receiver. * * @param list the list to be filled, can have any size. */ - @Override - public void keys(${keyTypeCap}ArrayList list) { - list.setSize(distinct); - ${keyType} [] elements = list.elements(); - - int j = 0; - for (int i = table.length; i-- > 0;) { - if (state[i] == FULL) { - elements[j++] = table[i]; + @SuppressWarnings("unchecked") + public void keys(List<T> list) { + list.clear(); + + + Object [] tab = table; + byte[] stat = state; + + for (int i = tab.length; i-- > 0;) { + if (stat[i] == FULL) { + list.add((T)tab[i]); } } } - /** - * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if - * existing. - * - * @param key the key the value shall be associated with. - * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did - * already contain such a key - the new value has now replaced the formerly associated value. - */ + @SuppressWarnings("unchecked") @Override - public boolean add(${keyType} key) { - int i = indexOfInsertion(key); + public boolean add(Object key) { + int i = indexOfInsertion((T)key); if (i < 0) { //already contained - //i = -i - 1; return false; } @@ -293,8 +289,9 @@ public class Open${keyTypeCap}HashSet ex if (this.freeEntries < 1) { //delta int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor); rehash(newCapacity); + return add(key); } - + return true; } @@ -303,27 +300,30 @@ public class Open${keyTypeCap}HashSet ex * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water * mark. */ + @SuppressWarnings("unchecked") protected void rehash(int newCapacity) { int oldCapacity = table.length; //if (oldCapacity == newCapacity) return; - ${keyType}[] oldTable = table; + Object[] oldTable = table; byte[] oldState = state; - this.table = new ${keyType}[newCapacity]; - this.state = new byte[newCapacity]; + Object[] newTable = new Object[newCapacity]; + byte[] newState = new byte[newCapacity]; this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor); this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor); + this.table = newTable; + this.state = newState; this.freeEntries = newCapacity - this.distinct; // delta for (int i = oldCapacity; i-- > 0;) { if (oldState[i] == FULL) { - ${keyType} element = oldTable[i]; - int index = indexOfInsertion(element); - this.table[index] = element; - this.state[index] = FULL; + Object element = oldTable[i]; + int index = indexOfInsertion((T)element); + newTable[index] = element; + newState[index] = FULL; } } } @@ -334,9 +334,10 @@ public class Open${keyTypeCap}HashSet ex * @param key the key to be removed from the receiver. * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise. */ + @SuppressWarnings("unchecked") @Override - public boolean remove(${keyType} key) { - int i = indexOfKey(key); + public boolean remove(Object key) { + int i = indexOfKey((T)key); if (i < 0) { return false; } // key not contained @@ -371,7 +372,7 @@ public class Open${keyTypeCap}HashSet ex capacity = 1; } // open addressing needs at least one FREE slot at any time. - this.table = new ${keyType}[capacity]; + this.table = new Object[capacity]; this.state = new byte[capacity]; // memory will be exhausted long before this pathological case happens, anyway. @@ -413,11 +414,125 @@ public class Open${keyTypeCap}HashSet ex * @param minLoadFactor * @param maxLoadFactor */ - protected void getInternalFactors(int[] capacity, + void getInternalFactors(int[] capacity, double[] minLoadFactor, double[] maxLoadFactor) { capacity[0] = table.length; minLoadFactor[0] = this.minLoadFactor; maxLoadFactor[0] = this.maxLoadFactor; } + + @Override + public boolean isEmpty() { + return size() == 0; + } + + /** + * OpenHashSet instances are only equal to other OpenHashSet instances, not to + * any other collection. Hypothetically, we should check for and permit + * equals on other Sets. + */ + @SuppressWarnings("unchecked") + public boolean equals(Object obj) { + if (obj == this) { + return true; + } + + if (!(obj instanceof OpenHashSet)) { + return false; + } + final OpenHashSet<T> other = (OpenHashSet<T>) obj; + if (other.size() != size()) { + return false; + } + + return + forEachKey( + new ObjectProcedure<T>() { + @Override + public boolean apply(T key) { + return other.contains(key); + } + } + ); + } + + /** + * Implement the standard Java Collections iterator. Note that 'remove' is silently + * ineffectual here. This method is provided for convenience, only. + */ + @Override + public Iterator<T> iterator() { + List<T> keyList = new ArrayList<T>(); + keys(keyList); + return keyList.iterator(); + } + + @Override + public Object[] toArray() { + List<T> keyList = new ArrayList<T>(); + keys(keyList); + return keyList.toArray(); + } + + @Override + public boolean addAll(Collection<? extends T> c) { + boolean anyAdded = false; + for(T o : c) { + boolean added = add(o); + anyAdded |= added; + } + return anyAdded; + } + + @Override + public boolean containsAll(Collection<?> c) { + for (Object o : c) { + if (!contains(o)) { + return false; + } + } + return true; + } + + @Override + public boolean removeAll(Collection<?> c) { + boolean anyRemoved = false; + for(Object o : c) { + boolean removed = remove(o); + anyRemoved |= removed; + } + return anyRemoved; + } + + @Override + public boolean retainAll(Collection<?> c) { + final Collection<?> finalCollection = c; + final boolean[] modified = new boolean[1]; + modified[0] = false; + forEachKey(new ObjectProcedure<T>() { + + @Override + public boolean apply(T element) { + if (!finalCollection.contains(element)) { + remove(element); + modified[0] = true; + } + return true; + }}); + return modified[0]; + } + + @Override + public <T2> T2[] toArray(T2[] a) { + List<T> keys = new ArrayList<T>(); + keys(keys); + return keys.toArray(a); + } + + public List<T> keys() { + List<T> keys = new ArrayList<T>(); + keys(keys); + return keys; + } }
