minchau 2003/06/08 23:26:57
Modified: java/src/org/apache/xml/serializer CharInfo.java
Log:
Speed up CharInfo.isSpecial() by replacing its internal use of
java.util.BitSet with a more tuned bitset.
Also sped up the common case of this method for ASCII values (0-126) by
saving booleans in a cached array rather than looking them up via their
bits in the bitset.
Submitted by: Brian Minchau
Revision Changes Path
1.4 +179 -11
xml-xalan/java/src/org/apache/xml/serializer/CharInfo.java
Index: CharInfo.java
===================================================================
RCS file:
/home/cvs/xml-xalan/java/src/org/apache/xml/serializer/CharInfo.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- CharInfo.java 30 Apr 2003 02:45:54 -0000 1.3
+++ CharInfo.java 9 Jun 2003 06:26:57 -0000 1.4
@@ -82,8 +82,6 @@
public class CharInfo
{
- /** Bit map that tells if a given character should have special
treatment. */
- BitSet m_specialsMap = new BitSet(65535);
/** Lookup table for characters to entity references. */
private Hashtable m_charToEntityRef = new Hashtable();
@@ -105,10 +103,63 @@
/** The carriage return character, which the parser should always
normalize. */
public static char S_CARRIAGERETURN = 0x0D;
+
+ /** Copy the first 0,1 ... ASCII_MAX values into an array */
+ private static final int ASCII_MAX = 128;
+
+ /** Array of values is faster access than a set of bits */
+ private boolean[] quickASCII = new boolean[ASCII_MAX];
+
+ private boolean[] isCleanASCII = new boolean[ASCII_MAX];
+
+ /** An array of bits to record if the character is in the set.
+ * Although information in this array is complete, the
+ * quickASCII array is used first because access to its values
+ * is common and faster.
+ */
+ private int array_of_bits[] = createEmptySetOfIntegers(65535);
+
+
+ // 5 for 32 bit words, 6 for 64 bit words ...
+ /*
+ * This constant is used to shift an integer to quickly
+ * calculate which element its bit is stored in.
+ * 5 for 32 bit words (int) , 6 for 64 bit words (long)
+ */
+ private static final int SHIFT_PER_WORD = 5;
+
+ /*
+ * A mask to get the low order bits which are used to
+ * calculate the value of the bit within a given word,
+ * that will represent the presence of the integer in the
+ * set.
+ *
+ * 0x1F for 32 bit words (int),
+ * or 0x3F for 64 bit words (long)
+ */
+ private static final int LOW_ORDER_BITMASK = 0x1f;
+
+ /*
+ * This is used for optimizing the lookup of bits representing
+ * the integers in the set. It is the index of the first element
+ * in the array array_of_bits[] that is not used.
+ */
+ private int firstWordNotUsed;
+
+ /**
+ * This constructor is private, just to force the use
+ * of the getCharInfo(entitiesResource) factory
+ *
+ */
+ private CharInfo()
+ {
+ }
/**
* Constructor that reads in a resource file that describes the mapping
of
* characters to entity references.
+ * This constructor is private, just to force the use
+ * of the getCharInfo(entitiesResource) factory
*
* Resource files must be encoded in UTF-8 and can either be properties
* files with a .properties extension assumed. Alternatively, they can
@@ -120,12 +171,12 @@
* quot 34
* amp 38
* </pre>
- *
+ *
* @param entitiesResource Name of properties or resource file that
should
* be loaded, which describes that mapping of characters to entity
* references.
*/
- public CharInfo(String entitiesResource)
+ private CharInfo(String entitiesResource)
{
PropertyResourceBundle entities;
InputStream is = null;
@@ -152,8 +203,8 @@
code = Integer.parseInt(value);
defineEntity(name, (char) code);
}
- m_specialsMap.set(S_LINEFEED);
- m_specialsMap.set(S_CARRIAGERETURN);
+ set(S_LINEFEED);
+ set(S_CARRIAGERETURN);
} else {
// Load user specified resource file by using URL loading, it
// requires a valid URI as parameter;
@@ -249,8 +300,8 @@
}
is.close();
- m_specialsMap.set(S_LINEFEED);
- m_specialsMap.set(S_CARRIAGERETURN);
+ set(S_LINEFEED);
+ set(S_CARRIAGERETURN);
} catch (Exception except) {
throw new RuntimeException(
XMLMessages.createXMLMessage(
@@ -267,6 +318,22 @@
}
}
}
+
+ // initialize the array with a cache of the BitSet values
+ for (int i=0; i<ASCII_MAX; i++)
+ quickASCII[i] = get(i);
+
+ // initialize the array with a cache of values
+ // for use by ToStream.character(char[], int , int)
+ for (int ch = 0; ch <ASCII_MAX; ch++)
+ if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch)))
+ && (!get(ch)))
+ || ('"' == ch))
+ {
+ isCleanASCII[ch] = true;
+ }
+ else
+ isCleanASCII[ch] = false;
}
/**
@@ -285,7 +352,7 @@
CharKey character = new CharKey(value);
m_charToEntityRef.put(character, name);
- m_specialsMap.set(value);
+ set(value);
}
private CharKey m_charKey = new CharKey();
@@ -320,10 +387,36 @@
* @return true if the character should have any special treatment, such
as
* when writing out attribute values, or entity references.
*/
- public boolean isSpecial(char value)
+ public final boolean isSpecial(int value)
{
- return m_specialsMap.get(value);
+ // for performance try the values in the boolean array first,
+ // this is faster access than the BitSet for common ASCII values
+
+ if (value < ASCII_MAX)
+ return quickASCII[value];
+
+ // rather than java.util.BitSet, our private
+ // implementation is faster (and less general).
+ return get(value);
}
+
+ /**
+ * This method is used to determine if an ASCII character is "clean"
+ * @param value the character to check (0 to 127).
+ * @return true if the character can go to the writer as-is
+ */
+ public final boolean isASCIIClean(int value)
+ {
+ return isCleanASCII[value];
+ }
+
+// In the future one might want to use the array directly and avoid
+// the method call, but I think the JIT alreay inlines this well enough
+// so don't do it (for now) - bjm
+// public final boolean[] getASCIIClean()
+// {
+// return isCleanASCII;
+// }
/**
@@ -423,4 +516,79 @@
}
+
+
+
+ /**
+ * Returns the array element holding the bit value for the
+ * given integer
+ * @param i the integer that might be in the set of integers
+ *
+ */
+ private static int arrayIndex(int i) {
+ return (i >> SHIFT_PER_WORD);
+ }
+
+ /**
+ * For a given integer in the set it returns the single bit
+ * value used within a given word that represents whether
+ * the integer is in the set or not.
+ */
+ private static int bit(int i) {
+ int ret = (1 << (i & LOW_ORDER_BITMASK));
+ return ret;
+ }
+
+ /**
+ * Creates a new empty set of integers (characters)
+ * @param max the maximum integer to be in the set.
+ */
+ private int[] createEmptySetOfIntegers(int max) {
+ firstWordNotUsed = 0; // an optimization
+
+ int[] arr = new int[arrayIndex(max - 1) + 1];
+ return arr;
+
+ }
+
+ /**
+ * Adds the integer (character) to the set of integers.
+ * @param i the integer to add to the set, valid values are
+ * 0, 1, 2 ... up to the maximum that was specified at
+ * the creation of the set.
+ */
+ private final void set(int i) {
+ int j = (i >> SHIFT_PER_WORD); // this word is used
+ int k = j + 1;
+
+ if(firstWordNotUsed < k) // for optimization purposes.
+ firstWordNotUsed = k;
+
+ array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK));
+ }
+
+
+ /**
+ * Return true if the integer (character)is in the set of integers.
+ *
+ * This implementation uses an array of integers with 32 bits per
+ * integer. If a bit is set to 1 the corresponding integer is
+ * in the set of integers.
+ *
+ * @param i an integer that is tested to see if it is the
+ * set of integers, or not.
+ */
+ private final boolean get(int i) {
+
+ boolean in_the_set = false;
+ int j = (i >> SHIFT_PER_WORD); // wordIndex(i)
+ // an optimization here, ... a quick test to see
+ // if this integer is beyond any of the words in use
+ if(j < firstWordNotUsed)
+ in_the_set = (array_of_bits[j] &
+ (1 << (i & LOW_ORDER_BITMASK))
+ ) != 0; // 0L for 64 bit words
+ return in_the_set;
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]