minchau 2003/06/08 23:26:57 Modified: java/src/org/apache/xml/serializer CharInfo.java Log: Speed up CharInfo.isSpecial() by replacing its internal use of java.util.BitSet with a more tuned bitset. Also sped up the common case of this method for ASCII values (0-126) by saving booleans in a cached array rather than looking them up via their bits in the bitset. Submitted by: Brian Minchau Revision Changes Path 1.4 +179 -11 xml-xalan/java/src/org/apache/xml/serializer/CharInfo.java Index: CharInfo.java =================================================================== RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/CharInfo.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- CharInfo.java 30 Apr 2003 02:45:54 -0000 1.3 +++ CharInfo.java 9 Jun 2003 06:26:57 -0000 1.4 @@ -82,8 +82,6 @@ public class CharInfo { - /** Bit map that tells if a given character should have special treatment. */ - BitSet m_specialsMap = new BitSet(65535); /** Lookup table for characters to entity references. */ private Hashtable m_charToEntityRef = new Hashtable(); @@ -105,10 +103,63 @@ /** The carriage return character, which the parser should always normalize. */ public static char S_CARRIAGERETURN = 0x0D; + + /** Copy the first 0,1 ... ASCII_MAX values into an array */ + private static final int ASCII_MAX = 128; + + /** Array of values is faster access than a set of bits */ + private boolean[] quickASCII = new boolean[ASCII_MAX]; + + private boolean[] isCleanASCII = new boolean[ASCII_MAX]; + + /** An array of bits to record if the character is in the set. + * Although information in this array is complete, the + * quickASCII array is used first because access to its values + * is common and faster. + */ + private int array_of_bits[] = createEmptySetOfIntegers(65535); + + + // 5 for 32 bit words, 6 for 64 bit words ... + /* + * This constant is used to shift an integer to quickly + * calculate which element its bit is stored in. + * 5 for 32 bit words (int) , 6 for 64 bit words (long) + */ + private static final int SHIFT_PER_WORD = 5; + + /* + * A mask to get the low order bits which are used to + * calculate the value of the bit within a given word, + * that will represent the presence of the integer in the + * set. + * + * 0x1F for 32 bit words (int), + * or 0x3F for 64 bit words (long) + */ + private static final int LOW_ORDER_BITMASK = 0x1f; + + /* + * This is used for optimizing the lookup of bits representing + * the integers in the set. It is the index of the first element + * in the array array_of_bits[] that is not used. + */ + private int firstWordNotUsed; + + /** + * This constructor is private, just to force the use + * of the getCharInfo(entitiesResource) factory + * + */ + private CharInfo() + { + } /** * Constructor that reads in a resource file that describes the mapping of * characters to entity references. + * This constructor is private, just to force the use + * of the getCharInfo(entitiesResource) factory * * Resource files must be encoded in UTF-8 and can either be properties * files with a .properties extension assumed. Alternatively, they can @@ -120,12 +171,12 @@ * quot 34 * amp 38 * </pre> - * + * * @param entitiesResource Name of properties or resource file that should * be loaded, which describes that mapping of characters to entity * references. */ - public CharInfo(String entitiesResource) + private CharInfo(String entitiesResource) { PropertyResourceBundle entities; InputStream is = null; @@ -152,8 +203,8 @@ code = Integer.parseInt(value); defineEntity(name, (char) code); } - m_specialsMap.set(S_LINEFEED); - m_specialsMap.set(S_CARRIAGERETURN); + set(S_LINEFEED); + set(S_CARRIAGERETURN); } else { // Load user specified resource file by using URL loading, it // requires a valid URI as parameter; @@ -249,8 +300,8 @@ } is.close(); - m_specialsMap.set(S_LINEFEED); - m_specialsMap.set(S_CARRIAGERETURN); + set(S_LINEFEED); + set(S_CARRIAGERETURN); } catch (Exception except) { throw new RuntimeException( XMLMessages.createXMLMessage( @@ -267,6 +318,22 @@ } } } + + // initialize the array with a cache of the BitSet values + for (int i=0; i<ASCII_MAX; i++) + quickASCII[i] = get(i); + + // initialize the array with a cache of values + // for use by ToStream.character(char[], int , int) + for (int ch = 0; ch <ASCII_MAX; ch++) + if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch))) + && (!get(ch))) + || ('"' == ch)) + { + isCleanASCII[ch] = true; + } + else + isCleanASCII[ch] = false; } /** @@ -285,7 +352,7 @@ CharKey character = new CharKey(value); m_charToEntityRef.put(character, name); - m_specialsMap.set(value); + set(value); } private CharKey m_charKey = new CharKey(); @@ -320,10 +387,36 @@ * @return true if the character should have any special treatment, such as * when writing out attribute values, or entity references. */ - public boolean isSpecial(char value) + public final boolean isSpecial(int value) { - return m_specialsMap.get(value); + // for performance try the values in the boolean array first, + // this is faster access than the BitSet for common ASCII values + + if (value < ASCII_MAX) + return quickASCII[value]; + + // rather than java.util.BitSet, our private + // implementation is faster (and less general). + return get(value); } + + /** + * This method is used to determine if an ASCII character is "clean" + * @param value the character to check (0 to 127). + * @return true if the character can go to the writer as-is + */ + public final boolean isASCIIClean(int value) + { + return isCleanASCII[value]; + } + +// In the future one might want to use the array directly and avoid +// the method call, but I think the JIT alreay inlines this well enough +// so don't do it (for now) - bjm +// public final boolean[] getASCIIClean() +// { +// return isCleanASCII; +// } /** @@ -423,4 +516,79 @@ } + + + + /** + * Returns the array element holding the bit value for the + * given integer + * @param i the integer that might be in the set of integers + * + */ + private static int arrayIndex(int i) { + return (i >> SHIFT_PER_WORD); + } + + /** + * For a given integer in the set it returns the single bit + * value used within a given word that represents whether + * the integer is in the set or not. + */ + private static int bit(int i) { + int ret = (1 << (i & LOW_ORDER_BITMASK)); + return ret; + } + + /** + * Creates a new empty set of integers (characters) + * @param max the maximum integer to be in the set. + */ + private int[] createEmptySetOfIntegers(int max) { + firstWordNotUsed = 0; // an optimization + + int[] arr = new int[arrayIndex(max - 1) + 1]; + return arr; + + } + + /** + * Adds the integer (character) to the set of integers. + * @param i the integer to add to the set, valid values are + * 0, 1, 2 ... up to the maximum that was specified at + * the creation of the set. + */ + private final void set(int i) { + int j = (i >> SHIFT_PER_WORD); // this word is used + int k = j + 1; + + if(firstWordNotUsed < k) // for optimization purposes. + firstWordNotUsed = k; + + array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK)); + } + + + /** + * Return true if the integer (character)is in the set of integers. + * + * This implementation uses an array of integers with 32 bits per + * integer. If a bit is set to 1 the corresponding integer is + * in the set of integers. + * + * @param i an integer that is tested to see if it is the + * set of integers, or not. + */ + private final boolean get(int i) { + + boolean in_the_set = false; + int j = (i >> SHIFT_PER_WORD); // wordIndex(i) + // an optimization here, ... a quick test to see + // if this integer is beyond any of the words in use + if(j < firstWordNotUsed) + in_the_set = (array_of_bits[j] & + (1 << (i & LOW_ORDER_BITMASK)) + ) != 0; // 0L for 64 bit words + return in_the_set; + } + }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]