mrglavas    2004/07/30 15:10:55

  Modified:    java/src/org/apache/xerces/util SymbolTable.java
  Log:
  JIRA Issue #998:

  http://nagoya.apache.org/jira/browse/XERCESJ-998

  

  Improve the performance of the symbol table by

  dynamically resizing it and rehashing it once it

  reaches some threshold. Previously the SymbolTable

  did not scale well, since the size of the table was

  fixed.  Neeraj had started looking at this a couple

  years ago.  Thanks to John Kim for contributing

  this performance improvement.
  
  Revision  Changes    Path
  1.9       +140 -16   xml-xerces/java/src/org/apache/xerces/util/SymbolTable.java
  
  Index: SymbolTable.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/util/SymbolTable.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- SymbolTable.java  24 Feb 2004 23:15:53 -0000      1.8
  +++ SymbolTable.java  30 Jul 2004 22:10:54 -0000      1.9
  @@ -38,10 +38,40 @@
    *   characters are especially prone to this poor hashing behavior.
    *  </li>
    * </ul>
  + * 
  + * An instance of <code>SymbolTable</code> has two parameters that affect its
  + * performance: <i>initial capacity</i> and <i>load factor</i>.  The
  + * <i>capacity</i> is the number of <i>buckets</i> in the SymbolTable, and the
  + * <i>initial capacity</i> is simply the capacity at the time the SymbolTable
  + * is created.  Note that the SymbolTable is <i>open</i>: in the case of a "hash
  + * collision", a single bucket stores multiple entries, which must be searched
  + * sequentially.  The <i>load factor</i> is a measure of how full the SymbolTable
  + * is allowed to get before its capacity is automatically increased.
  + * When the number of entries in the SymbolTable exceeds the product of the load
  + * factor and the current capacity, the capacity is increased by calling the
  + * <code>rehash</code> method.<p>
    *
  + * Generally, the default load factor (.75) offers a good tradeoff between
  + * time and space costs.  Higher values decrease the space overhead but
  + * increase the time cost to look up an entry (which is reflected in most
  + * <tt>SymbolTable</tt> operations, including <tt>addSymbol</tt> and 
<tt>containsSymbol</tt>).<p>
  + *
  + * The initial capacity controls a tradeoff between wasted space and the
  + * need for <code>rehash</code> operations, which are time-consuming.
  + * No <code>rehash</code> operations will <i>ever</i> occur if the initial
  + * capacity is greater than the maximum number of entries the
  + * <tt>Hashtable</tt> will contain divided by its load factor.  However,
  + * setting the initial capacity too high can waste space.<p>
  + *
  + * If many entries are to be made into a <code>SymbolTable</code>, 
  + * creating it with a sufficiently large capacity may allow the 
  + * entries to be inserted more efficiently than letting it perform 
  + * automatic rehashing as needed to grow the table. <p>
  +
    * @see SymbolHash
    *
    * @author Andy Clark
  + * @author John Kim, IBM
    *
    * @version $Id$
    */
  @@ -61,22 +91,71 @@
       /** Buckets. */
       protected Entry[] fBuckets = null;
   
  -    // actual table size
  +    /** actual table size **/
       protected int fTableSize;
   
  +    /** The total number of entries in the hash table. */
  +    protected transient int fCount;
  +
  +    /** The table is rehashed when its size exceeds this threshold.  (The
  +     * value of this field is (int)(capacity * loadFactor).) */
  +    protected int fThreshold;
  +                                                      
  +    /** The load factor for the SymbolTable. */
  +    protected float fLoadFactor;
  +
       //
       // Constructors
       //
  -
  -    /** Constructs a symbol table with a default number of buckets. */
  -    public SymbolTable() {
  -        this(TABLE_SIZE);
  +    
  +    /**
  +     * Constructs a new, empty SymbolTable with the specified initial 
  +     * capacity and the specified load factor.
  +     *
  +     * @param      initialCapacity   the initial capacity of the SymbolTable.
  +     * @param      loadFactor        the load factor of the SymbolTable.
  +     * @throws     IllegalArgumentException  if the initial capacity is less
  +     *             than zero, or if the load factor is nonpositive.
  +     */
  +    public SymbolTable(int initialCapacity, float loadFactor) {
  +        
  +        if (initialCapacity < 0) {
  +            throw new IllegalArgumentException("Illegal Capacity: " + 
initialCapacity);
  +        }
  +        
  +        if (loadFactor <= 0 || Float.isNaN(loadFactor)) {
  +            throw new IllegalArgumentException("Illegal Load: " + loadFactor);
  +        }
  +        
  +        if (initialCapacity == 0) {
  +            initialCapacity = 1;
  +        }
  +        
  +        fLoadFactor = loadFactor;
  +        fTableSize = initialCapacity;
  +        fBuckets = new Entry[fTableSize];
  +        fThreshold = (int)(fTableSize * loadFactor);
  +        fCount = 0;
       }
   
  -    /** Constructs a symbol table with a specified number of buckets. */
  -    public SymbolTable(int tableSize) {
  -        fTableSize = tableSize;
  -        fBuckets = new Entry[fTableSize];
  +    /**
  +     * Constructs a new, empty SymbolTable with the specified initial capacity
  +     * and default load factor, which is <tt>0.75</tt>.
  +     *
  +     * @param     initialCapacity   the initial capacity of the hashtable.
  +     * @throws    IllegalArgumentException if the initial capacity is less
  +     *            than zero.
  +     */
  +    public SymbolTable(int initialCapacity) {
  +        this(initialCapacity, 0.75f);
  +    }
  +    
  +    /**
  +     * Constructs a new, empty SymbolTable with a default initial capacity (101)
  +     * and load factor, which is <tt>0.75</tt>. 
  +     */
  +    public SymbolTable() {
  +        this(TABLE_SIZE, 0.75f);
       }
   
       //
  @@ -92,7 +171,7 @@
        * @param symbol The new symbol.
        */
       public String addSymbol(String symbol) {
  -
  +        
           // search for identical symbol
           int bucket = hash(symbol) % fTableSize;
           int length = symbol.length();
  @@ -106,12 +185,19 @@
                   return entry.symbol;
               }
           }
  -
  +        
  +        if (fCount >= fThreshold) {
  +            // Rehash the table if the threshold is exceeded
  +            rehash();
  +            bucket = hash(symbol) % fTableSize;
  +        } 
  +        
           // create new entry
           Entry entry = new Entry(symbol, fBuckets[bucket]);
           fBuckets[bucket] = entry;
  +        ++fCount;
           return entry.symbol;
  -
  +        
       } // addSymbol(String):String
   
       /**
  @@ -125,7 +211,7 @@
        * @param length The length of the new symbol in the buffer.
        */
       public String addSymbol(char[] buffer, int offset, int length) {
  -
  +        
           // search for identical symbol
           int bucket = hash(buffer, offset, length) % fTableSize;
           OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = 
entry.next) {
  @@ -138,12 +224,19 @@
                   return entry.symbol;
               }
           }
  -
  +        
  +        if (fCount >= fThreshold) {
  +            // Rehash the table if the threshold is exceeded
  +            rehash();
  +            bucket = hash(buffer, offset, length) % fTableSize;
  +        } 
  +        
           // add new entry
           Entry entry = new Entry(buffer, offset, length, fBuckets[bucket]);
           fBuckets[bucket] = entry;
  +        ++fCount;
           return entry.symbol;
  -
  +        
       } // addSymbol(char[],int,int):String
   
       /**
  @@ -185,6 +278,37 @@
           return code & 0x7FFFFFF;
   
       } // hash(char[],int,int):int
  +
  +    /**
  +     * Increases the capacity of and internally reorganizes this 
  +     * SymbolTable, in order to accommodate and access its entries more 
  +     * efficiently.  This method is called automatically when the 
  +     * number of keys in the SymbolTable exceeds this hashtable's capacity 
  +     * and load factor. 
  +     */
  +    protected void rehash() {
  +
  +        int oldCapacity = fBuckets.length;
  +        Entry[] oldTable = fBuckets;
  +
  +        int newCapacity = oldCapacity * 2 + 1;
  +        Entry[] newTable = new Entry[newCapacity];
  +
  +        fThreshold = (int)(newCapacity * fLoadFactor);
  +        fBuckets = newTable;
  +        fTableSize = fBuckets.length;
  +
  +        for (int i = oldCapacity ; i-- > 0 ;) {
  +            for (Entry old = oldTable[i] ; old != null ; ) {
  +                Entry e = old;
  +                old = old.next;
  +
  +                int index = hash(e.characters, 0, e.characters.length) % 
newCapacity;
  +                e.next = newTable[index];
  +                newTable[index] = e;
  +            }
  +        }
  +    }
   
       /**
        * Returns true if the symbol table already contains the specified
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to