scolebourne 2004/10/06 15:29:25
Modified: lang/src/java/org/apache/commons/lang/text package.html
lang/src/test/org/apache/commons/lang/text
TextTestSuite.java
Added: lang/src/java/org/apache/commons/lang/text StrTokenizer.java
lang/src/test/org/apache/commons/lang/text
StrTokenizerTest.java
Removed: lang/src/test/org/apache/commons/lang TokenizerTest.java
lang/src/java/org/apache/commons/lang Tokenizer.java
Log:
Rename Tokenizer to StrTokenizer and move to text subpackage
Revision Changes Path
1.2 +3 -1
jakarta-commons/lang/src/java/org/apache/commons/lang/text/package.html
Index: package.html
===================================================================
RCS file:
/home/cvs/jakarta-commons/lang/src/java/org/apache/commons/lang/text/package.html,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- package.html 30 Sep 2004 16:57:05 -0000 1.1
+++ package.html 6 Oct 2004 22:29:24 -0000 1.2
@@ -16,7 +16,9 @@
<html>
<body>
<p>
-Provides classes for handling text in conjunction with {@link java.text}.
+Provides classes for handling and manipulating text, partly as an extension to {@link java.text}.
+The classes in this package are, for the most part, intended to be instantiated.
+(ie. they are not utility classes with lots of static methods)
</p>
@since 2.1
</body>
1.1
jakarta-commons/lang/src/java/org/apache/commons/lang/text/StrTokenizer.java
Index: StrTokenizer.java
===================================================================
/*
* Copyright 2003-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.lang.text;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
/**
* Tokenizes a string based on delimiters (separators)
* and supporting quoting and ignored character concepts.
* <p>
* This class can split a String into many smaller strings.
* It aims to do a similar job to java util StringTokenizer, however it offers
* much more control and flexibility. By default, it is setup like StringTokenizer.
* <p>
* The input String is split into a number of <i>tokens</i>.
* Each token is separated from the next String by a <i>delimiter</i>.
* One or more delimiter characters must be specified.
* <p>
* The processing then strips all the <i>ignored</i> characters from each side of the token.
* The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
* Empty tokens may be removed or returned as null.
* This example is based on the CSV tokenizer.
* <pre>
* "a,b,c" - Three tokens "a","b","c" (comma delimiter)
* "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped)
* "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
* </pre>
* <p>
*
* This tokenizer has the following properties and options:
*
* <table>
* <tr>
* <th>Property</th><th>Type</th><th>Default</th>
* </tr>
* <tr>
* <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
* </tr>
* <tr>
* <td>quote</td><td>NoneMatcher</td><td>{}</td>
* </tr>
* <tr>
* <td>ignore</td><td>NoneMatcher</td><td>{}</td>
* </tr>
* <tr>
* <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
* </tr>
* <tr>
* <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
* </tr>
* </table>
*
* @author Matthew Inger
* @author Stephen Colebourne
* @author Gary D. Gregory
* @since 2.1
* @version $Id: StrTokenizer.java,v 1.1 2004/10/06 22:29:24 scolebourne Exp $
*/
public class StrTokenizer implements ListIterator, Cloneable {
/**
 * A Matcher which matches the comma character.
 * Best used for <code>delimiter</code>.
 */
public static final Matcher COMMA_MATCHER = new CharMatcher(',');
/**
 * A Matcher which matches the tab character.
 * Best used for <code>delimiter</code>.
 */
public static final Matcher TAB_MATCHER = new CharMatcher('\t');
/**
 * A Matcher which matches the space character.
 * Best used for <code>delimiter</code>.
 */
public static final Matcher SPACE_MATCHER = new CharMatcher(' ');
/**
 * A Matcher which matches any one of the characters space, tab, newline,
 * carriage return and formfeed.
 * Best used for <code>delimiter</code>.
 */
public static final Matcher SPLIT_MATCHER = createCharSetMatcher(" \t\n\r\f");
/**
 * A Matcher which matches the single quote character.
 * Best used for <code>quote</code>.
 */
public static final Matcher SINGLE_QUOTE_MATCHER = new CharMatcher('\'');
/**
 * A Matcher which matches the double quote character.
 * Best used for <code>quote</code>.
 */
public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"');
/**
 * A Matcher which matches the String trim() whitespace characters,
 * that is every character whose code is 32 (space) or less.
 * Best used for <code>ignored</code>.
 */
public static final Matcher TRIM_MATCHER = new TrimMatcher();
/**
 * A Matcher that matches no characters. Don't use this for delimiters!
 * Best used for <code>ignored</code>.
 */
public static final Matcher NONE_MATCHER = new NoMatcher();
private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
static {
CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
}
/** The characters being tokenized; replaced wholesale whenever new input is supplied */
private char chars[];
/** The input text as a String; null if the input was a char[] and no String has been built yet */
private String text;
/** The parsed tokens, or null if the input has not been tokenized since the last reset */
private String tokens[];
/** The current iteration position: the index of the token the next call to nextToken() returns */
private int tokenPos;
/** The delimiter matcher, initially SPLIT_MATCHER */
private Matcher delim = SPLIT_MATCHER;
/** The quote matcher, initially NONE_MATCHER (no quoting) */
private Matcher quote = NONE_MATCHER;
/** The ignored matcher, initially NONE_MATCHER (nothing stripped) */
private Matcher ignored = NONE_MATCHER;
/** Whether to return empty tokens as null, initially false */
private boolean emptyAsNull = false;
/** Whether to ignore (drop) empty tokens, initially true */
private boolean ignoreEmptyTokens = true;
//-----------------------------------------------------------------------
/**
* Constructor that creates a matcher from a set of characters.
*
* @param chars the characters to match, must not be null
* @throws IllegalArgumentException if the character set is null or empty
*/
public static Matcher createCharSetMatcher(char[] chars) {
if (chars == null || chars.length == 0) {
throw new IllegalArgumentException("Characters must not be null or
empty");
}
if (chars.length == 1) {
return new CharMatcher(chars[0]);
}
return new CharSetMatcher(chars);
}
/**
* Constructor that creates a matcher from a string representing a set of
characters.
*
* @param chars the characters to match, must not be null
* @throws IllegalArgumentException if the character set is null or empty
*/
public static Matcher createCharSetMatcher(String chars) {
if (chars == null || chars.length() == 0) {
throw new IllegalArgumentException("Characters must not be null or
empty");
}
if (chars.length() == 1) {
return new CharMatcher(chars.charAt(0));
}
return new CharSetMatcher(chars.toCharArray());
}
/**
 * Factory method that creates a matcher for one specific character.
 *
 * @param ch  the character to match
 * @return a matcher that matches only <code>ch</code>
 */
public static Matcher createCharMatcher(char ch) {
    final Matcher matcher = new CharMatcher(ch);
    return matcher;
}
/**
 * Factory method that creates a matcher for an exact character sequence.
 *
 * @param str  the string to match, must not be null or empty
 * @return a matcher that matches the whole of <code>str</code>
 * @throws IllegalArgumentException if the string is null or empty
 */
public static Matcher createStringMatcher(String str) {
    final boolean usable = (str != null && str.length() != 0);
    if (!usable) {
        throw new IllegalArgumentException("String must not be null or empty");
    }
    return new StringMatcher(str);
}
//-----------------------------------------------------------------------
/**
 * Gets a new tokenizer instance which parses Comma Separated Value strings.
 * You must call a "reset" method to set the string which you want to parse.
 *
 * @return a fresh clone of the CSV prototype tokenizer
 */
public static StrTokenizer getCSVInstance() {
    final Object copy = CSV_TOKENIZER_PROTOTYPE.clone();
    return (StrTokenizer) copy;
}
/**
 * Gets a new tokenizer instance which parses Comma Separated Value strings,
 * initializing it with the given input.
 *
 * @param input  the string to parse
 * @return a fresh clone of the CSV prototype tokenizer, reset to <code>input</code>
 */
public static StrTokenizer getCSVInstance(String input) {
    final StrTokenizer tokenizer = (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    tokenizer.reset(input);
    return tokenizer;
}
/**
 * Gets a new tokenizer instance which parses Comma Separated Value text,
 * initializing it with the given character array.
 *
 * @param input  the text to parse
 * @return a fresh clone of the CSV prototype tokenizer, reset to <code>input</code>
 */
public static StrTokenizer getCSVInstance(char[] input) {
    final StrTokenizer tokenizer = (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    tokenizer.reset(input);
    return tokenizer;
}
/**
 * Gets a new tokenizer instance which parses Tab Separated Value strings.
 * You must call a "reset" method to set the string which you want to parse.
 *
 * @return a fresh clone of the TSV prototype tokenizer
 */
public static StrTokenizer getTSVInstance() {
    final Object copy = TSV_TOKENIZER_PROTOTYPE.clone();
    return (StrTokenizer) copy;
}
/**
 * Gets a new tokenizer instance which parses Tab Separated Value strings,
 * initializing it with the given input.
 *
 * @param input  the string to parse
 * @return a fresh clone of the TSV prototype tokenizer, reset to <code>input</code>
 */
public static StrTokenizer getTSVInstance(String input) {
    final StrTokenizer tokenizer = (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    tokenizer.reset(input);
    return tokenizer;
}
/**
 * Gets a new tokenizer instance which parses Tab Separated Value text,
 * initializing it with the given character array.
 *
 * @param input  the text to parse
 * @return a fresh clone of the TSV prototype tokenizer, reset to <code>input</code>
 */
public static StrTokenizer getTSVInstance(char[] input) {
    final StrTokenizer tokenizer = (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    tokenizer.reset(input);
    return tokenizer;
}
//-----------------------------------------------------------------------
/**
 * Constructs a tokenizer splitting on space, tab, newline and formfeed
 * as per StringTokenizer, but with no text to tokenize.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 */
public StrTokenizer() {
    super();
    // start with empty input in both representations
    this.chars = new char[0];
    this.text = "";
}
/**
 * Constructs a tokenizer splitting on space, tab, newline and formfeed
 * as per StringTokenizer.
 *
 * @param input  the string which is to be parsed; null is treated as
 *  the empty string
 */
public StrTokenizer(String input) {
    super();
    if (input == null) {
        // tolerate null rather than failing with a bare NullPointerException
        this.text = "";
        this.chars = new char[0];
    } else {
        this.text = input;
        this.chars = input.toCharArray(); // no clone as toCharArray() clones
    }
}
/**
 * Constructs a tokenizer that splits the given string on a single
 * delimiter character.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter character
 */
public StrTokenizer(String input, char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
/**
 * Constructs a tokenizer that splits the given string on a delimiter
 * string.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter string
 */
public StrTokenizer(String input, String delim) {
    this(input);
    this.setDelimiterString(delim);
}
/**
 * Constructs a tokenizer that splits the given string using a delimiter
 * matcher.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter matcher
 */
public StrTokenizer(String input, Matcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
/**
 * Constructs a tokenizer that splits the given string on a delimiter
 * character and honours quoting with the given quote character.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter character
 * @param quote  the field quoted string character
 */
public StrTokenizer(String input, char delim, char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
/**
 * Constructs a tokenizer that splits the given string using a delimiter
 * matcher and honours quoting using a quote matcher.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter matcher
 * @param quote  the field quoted string matcher
 */
public StrTokenizer(String input, Matcher delim, Matcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
/**
 * Constructs a tokenizer splitting on space, tab, newline and formfeed
 * as per StringTokenizer.
 *
 * @param input  the characters which are to be parsed, cloned; null is
 *  treated as an empty array
 */
public StrTokenizer(char[] input) {
    super();
    this.text = null;
    // tolerate null rather than failing with a bare NullPointerException
    this.chars = (input == null ? new char[0] : (char[]) input.clone());
}
/**
 * Constructs a tokenizer that splits the given characters on a single
 * delimiter character.
 *
 * @param input  the characters which are to be parsed, cloned
 * @param delim  the field delimiter character
 */
public StrTokenizer(char[] input, char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
/**
 * Constructs a tokenizer that splits the given characters on a delimiter
 * string.
 *
 * @param input  the characters which are to be parsed, cloned
 * @param delim  the field delimiter string
 */
public StrTokenizer(char[] input, String delim) {
    this(input);
    this.setDelimiterString(delim);
}
/**
 * Constructs a tokenizer that splits the given characters using a
 * delimiter matcher.
 *
 * @param input  the characters which are to be parsed, cloned
 * @param delim  the field delimiter matcher
 */
public StrTokenizer(char[] input, Matcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
/**
 * Constructs a tokenizer that splits the given characters on a delimiter
 * character and honours quoting with the given quote character.
 *
 * @param input  the characters which are to be parsed, cloned
 * @param delim  the field delimiter character
 * @param quote  the field quoted string character
 */
public StrTokenizer(char[] input, char delim, char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
/**
 * Constructs a tokenizer that splits the given characters using a
 * delimiter matcher and honours quoting using a quote matcher.
 *
 * @param input  the characters which are to be parsed, cloned
 * @param delim  the field delimiter matcher
 * @param quote  the field quoted string matcher
 */
public StrTokenizer(char[] input, Matcher delim, Matcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
// API
//-----------------------------------------------------------------------
/**
 * Gets the number of tokens found in the String, tokenizing the input
 * first if that has not been done yet.
 *
 * @return the number of matched tokens
 */
public int size() {
    tokenize();
    final int count = tokens.length;
    return count;
}
/**
 * Gets the next token from the String and advances the iteration position.
 *
 * @return the next sequential token, or null when no more tokens are found
 */
public String nextToken() {
    if (!hasNext()) {
        return null;
    }
    return tokens[tokenPos++];
}
/**
 * Gets the previous token from the String and moves the iteration
 * position back by one.
 *
 * @return the previous sequential token, or null when no more tokens are found
 */
public String previousToken() {
    if (!hasPrevious()) {
        return null;
    }
    return tokens[--tokenPos];
}
/**
 * Gets a copy of the full token list, tokenizing the input first if that
 * has not been done yet.
 *
 * @return the tokens as a new String array
 */
public String[] getAllTokens() {
    tokenize();
    final Object copy = tokens.clone();
    return (String[]) copy;
}
/**
 * Resets this tokenizer, forgetting all parsing and iteration already completed.
 * <p>
 * The input text is kept, so the same tokenizer can be reused for the
 * same String; the tokens are re-read on the next access.
 */
public void reset() {
    this.tokens = null;
    this.tokenPos = 0;
}
/**
 * Reset this tokenizer, giving it a new input string to parse.
 * In this manner you can re-use a tokenizer with the same settings
 * on multiple input lines.
 *
 * @param input  the new string to tokenize; null is treated as the
 *  empty string
 */
public void reset(String input) {
    reset();
    if (input == null) {
        // tolerate null rather than failing with a bare NullPointerException
        this.text = "";
        this.chars = new char[0];
    } else {
        this.text = input;
        this.chars = input.toCharArray(); // no clone as toCharArray() clones
    }
}
/**
 * Reset this tokenizer, giving it a new character array to parse.
 * In this manner you can re-use a tokenizer with the same settings
 * on multiple input lines.
 *
 * @param input  the new character array to tokenize, cloned; null is
 *  treated as an empty array
 */
public void reset(char [] input) {
    reset();
    this.text = null;
    // tolerate null rather than failing with a bare NullPointerException
    this.chars = (input == null ? new char[0] : (char[]) input.clone());
}
// ListIterator
//-----------------------------------------------------------------------
/**
 * Checks whether there are any more tokens, tokenizing the input first
 * if that has not been done yet.
 *
 * @return true if there are more tokens
 */
public boolean hasNext() {
    tokenize();
    final int remaining = tokens.length - tokenPos;
    return remaining > 0;
}
/**
 * Gets the next token.
 * <p>
 * Unlike {@link #nextToken()}, which returns null at the end of the
 * tokens, this method follows the ListIterator contract and throws an
 * exception when the iteration has no more elements.
 *
 * @return the next String token
 * @throws NoSuchElementException if there are no more tokens
 */
public Object next() {
    if (!hasNext()) {
        throw new NoSuchElementException();
    }
    return nextToken();
}
/**
 * Gets the index of the token that the next call to next() would return.
 *
 * @return the next token index
 */
public int nextIndex() {
    return this.tokenPos;
}
/**
 * Checks whether there are any previous tokens that can be iterated to,
 * tokenizing the input first if that has not been done yet.
 *
 * @return true if there are previous tokens
 */
public boolean hasPrevious() {
    tokenize();
    return 0 < tokenPos;
}
/**
 * Gets the token previous to the last returned token.
 * <p>
 * Unlike {@link #previousToken()}, which returns null at the start of
 * the tokens, this method follows the ListIterator contract and throws
 * an exception when there is no previous element.
 *
 * @return the previous token
 * @throws NoSuchElementException if there is no previous token
 */
public Object previous() {
    if (!hasPrevious()) {
        throw new NoSuchElementException();
    }
    return previousToken();
}
/**
 * Gets the index of the token that the next call to previous() would
 * return; this is -1 when positioned at the start.
 *
 * @return the previous token index
 */
public int previousIndex() {
    final int index = tokenPos - 1;
    return index;
}
/**
 * Unsupported ListIterator operation; the token list cannot be modified.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
    final String message = "remove() is unsupported";
    throw new UnsupportedOperationException(message);
}
/**
 * Unsupported ListIterator operation; the token list cannot be modified.
 *
 * @param obj  ignored
 * @throws UnsupportedOperationException always
 */
public void set(Object obj) {
    final String message = "set() is unsupported";
    throw new UnsupportedOperationException(message);
}
/**
 * Unsupported ListIterator operation; the token list cannot be modified.
 *
 * @param obj  ignored
 * @throws UnsupportedOperationException always
 */
public void add(Object obj) {
    final String message = "add() is unsupported";
    throw new UnsupportedOperationException(message);
}
// Implementation
//-----------------------------------------------------------------------
/**
 * Performs the tokenization if it hasn't already been done.
 * <p>
 * The token array doubles as the "already parsed" flag: it is null until
 * the first parse and is set back to null by reset().
 */
private void tokenize() {
    if (tokens != null) {
        return;
    }
    tokens = readTokens();
}
/**
 * Reads all the tokens from the input characters.
 * <p>
 * Each token is passed through addToken, so the empty-token settings
 * decide whether it actually appears in the result.
 *
 * @return the tokens found, never null
 */
private String[] readTokens() {
    int len = chars.length;
    char cbuf[] = new char[len];
    StringBuffer token = new StringBuffer();
    List tokens = new ArrayList();
    int start = 0;
    // Keep going until we run out of characters
    while (start < len) {
        // read the next token and add it, following the rules in this object
        start = readNextToken(start, cbuf, token);
        addToken(tokens, token.toString());
        // Reset the string buffer to zero length
        token.setLength(0);
        // Handle the special case where the input ends with a delimiter,
        // in which case one more (empty) token follows it
        if (start == len && endsWithDelimiter(len)) {
            addToken(tokens, "");
        }
    }
    return (String[]) tokens.toArray(new String[tokens.size()]);
}

/**
 * Checks whether the input ends with a delimiter match of any width.
 * <p>
 * Looking only at the final character would miss delimiters that are
 * more than one character wide, so every candidate width is tried and
 * accepted only if the match it yields ends exactly at the end of input.
 *
 * @param len  the length of the input, at least 1
 * @return true if a delimiter match ends exactly at <code>len</code>
 */
private boolean endsWithDelimiter(int len) {
    for (int width = 1; width <= len; width++) {
        if (delim.isMatch(chars, len, len - width) == width) {
            return true;
        }
    }
    return false;
}
/**
 * Adds a token to a list, paying attention to the parameters we've set.
 * <p>
 * A null or zero-length token is dropped when empty tokens are ignored;
 * otherwise it is stored as null when empty-as-null is set, and as-is
 * when it is not.
 *
 * @param list  the list to add to
 * @param tok  the token to add
 */
private void addToken(List list, String tok) {
    final boolean empty = (tok == null || tok.length() == 0);
    if (empty && ignoreEmptyTokens) {
        return;
    }
    list.add(empty && emptyAsNull ? null : tok);
}
/**
 * Reads character by character through the input to get the next token.
 * <p>
 * Leading ignored characters are skipped first, unless the same position
 * also matches the delimiter or the quote. What follows decides how the
 * token is read: a delimiter yields an empty token, a quote starts a
 * quoted token, and anything else starts an unquoted token.
 *
 * @param start  the first character of field
 * @param cbuf  a character buffer for temporary computations (so we
 *  don't have to keep recreating one)
 * @param token  a StringBuffer where the output token will go
 * @return the starting position of the next field (the character
 *  immediately after the delimiter), or if end of string found,
 *  then the length of string
 */
private int readNextToken(int start, char cbuf[], StringBuffer token) {
    token.setLength(0);
    final int len = chars.length;
    int pos = start;

    // skip leading ignored characters that are neither delimiter nor quote
    while (pos < len) {
        int ignoreLen = ignored.isMatch(chars, len, pos);
        if (ignoreLen < 1
                || delim.isMatch(chars, len, pos) >= 1
                || quote.isMatch(chars, len, pos) >= 1) {
            break;
        }
        pos += ignoreLen;
    }
    if (pos >= len) {
        // nothing but ignored characters remained
        return pos;
    }

    int delimLen = delim.isMatch(chars, len, pos);
    if (delimLen >= 1) {
        // empty field: step over the delimiter, leaving the token empty
        return pos + delimLen;
    }
    int quoteLen = quote.isMatch(chars, len, pos);
    if (quoteLen >= 1) {
        return readQuoted(pos + quoteLen, cbuf, token);
    }
    return readUnquoted(pos, token);
}
/**
 * Reads a quoted string token.
 * <p>
 * Quoting can be switched on and off several times within one token; the
 * token ends at the first delimiter found outside quotes, or at the end
 * of the input. Inside quotes, a quote immediately followed by an
 * identical quote is an escape that puts one literal quote in the token.
 * Ignored characters that trail the token outside quotes are stripped;
 * everything inside quotes is kept untouched.
 *
 * @param start  the first character of field, immediately after any quote
 * @param cbuf  a character buffer for temporary computations (so we
 *  don't have to keep recreating one)
 * @param token  a StringBuffer where the output token will go
 * @return the starting position of the next field (the character
 *  immediately after the delimiter), or if end of string found,
 *  then the length of string
 */
private int readQuoted(int start, char cbuf[], StringBuffer token) {
    int len = chars.length;
    int cbufcnt = 0;
    // number of buffered characters to emit; lags behind cbufcnt while
    // only ignored characters have been buffered outside quotes
    int keep = 0;
    // exclusive end of the most recent ignored match outside quotes
    int ignoredUntil = start;
    int pos = start;
    boolean quoting = true;
    int delimLen = 0;

    while (pos < len) {
        if (quoting) {
            int quoteLen = quote.isMatch(chars, len, pos);
            if (quoteLen >= 1) {
                if (isQuoteRepeated(pos, quoteLen)) {
                    // escaped quote: emit one copy, skip both
                    for (int i = 0; i < quoteLen; i++) {
                        cbuf[cbufcnt++] = chars[pos + i];
                    }
                    pos += 2 * quoteLen;
                } else {
                    // closing quote
                    quoting = false;
                    pos += quoteLen;
                }
            } else {
                // Otherwise, just put the character into the token
                cbuf[cbufcnt++] = chars[pos];
                pos++;
            }
            // everything buffered up to here is quoted, so it is kept
            keep = cbufcnt;
        } else {
            delimLen = delim.isMatch(chars, len, pos);
            if (delimLen >= 1) {
                // unquoted delimiter ends the token
                break;
            }
            int quoteLen = quote.isMatch(chars, len, pos);
            if (quoteLen >= 1) {
                // re-entering quotes makes what was buffered interior text
                quoting = true;
                keep = cbufcnt;
                pos += quoteLen;
            } else {
                int ignoreLen = ignored.isMatch(chars, len, pos);
                if (ignoreLen >= 1) {
                    ignoredUntil = Math.max(ignoredUntil, pos + ignoreLen);
                }
                cbuf[cbufcnt++] = chars[pos];
                if (pos >= ignoredUntil) {
                    keep = cbufcnt;
                }
                pos++;
            }
        }
    }
    token.append(cbuf, 0, keep);
    return pos + delimLen;
}

/**
 * Checks whether the quote found at a position is immediately followed
 * by an identical copy of itself.
 *
 * @param pos  the position of the quote
 * @param quoteLen  the number of characters in the quote, at least 1
 * @return true if the next <code>quoteLen</code> characters repeat the quote
 */
private boolean isQuoteRepeated(int pos, int quoteLen) {
    if (pos + 2 * quoteLen > chars.length) {
        return false;
    }
    for (int i = 0; i < quoteLen; i++) {
        if (chars[pos + i] != chars[pos + quoteLen + i]) {
            return false;
        }
    }
    return true;
}
/**
 * Reads an unquoted string until a delimiter is found, stripping any
 * ignored characters that trail the token.
 * <p>
 * Leading ignored characters have already been skipped by the caller, so
 * only the trailing side needs handling here. Ignored characters in the
 * middle of the token are kept.
 *
 * @param start  the first character of field
 * @param token  a StringBuffer where the output token will go
 * @return the starting position of the next field (the character
 *  immediately after the delimiter), or if end of string found,
 *  then the length of string
 */
private int readUnquoted(int start, StringBuffer token) {
    char[] chars = this.chars;
    int len = chars.length;
    int pos = start;
    int delimLen = 0;
    // exclusive end of the token once trailing ignored characters are dropped
    int tokenEnd = start;
    // exclusive end of the most recent ignored match
    int ignoredUntil = start;
    while (pos < len && (delimLen = delim.isMatch(chars, len, pos)) < 1) {
        int ignoreLen = ignored.isMatch(chars, len, pos);
        if (ignoreLen >= 1) {
            ignoredUntil = Math.max(ignoredUntil, pos + ignoreLen);
        }
        if (pos >= ignoredUntil) {
            // a character that is not ignored extends the token up to here
            tokenEnd = pos + 1;
        }
        pos++;
    }
    token.append(chars, start, tokenEnd - start);
    return pos + delimLen;
}
// Delimiter
//-----------------------------------------------------------------------
/**
 * Gets the matcher used to find field delimiters.
 *
 * @return the delimiter matcher in use
 */
public Matcher getDelimiterMatcher() {
    return this.delim;
}
/**
 * Sets the field delimiter matcher.
 * <p>
 * The delimiter is used to separate one token from another.
 *
 * @param delim  the delimiter matcher to use, null means no delimiter
 *  is ever matched
 */
public void setDelimiterMatcher(Matcher delim) {
    this.delim = (delim == null ? NONE_MATCHER : delim);
}
/**
 * Sets the field delimiter to a single character.
 *
 * @param delim  the delimiter character to use
 */
public void setDelimiterChar(char delim) {
    final Matcher matcher = new CharMatcher(delim);
    setDelimiterMatcher(matcher);
}
/**
 * Sets the field delimiter to a string, which must be matched in full.
 * <p>
 * A null or empty string means no delimiter is ever matched.
 *
 * @param delim  the delimiter string to use
 */
public void setDelimiterString(String delim) {
    final Matcher matcher;
    if (delim == null || delim.length() == 0) {
        matcher = NONE_MATCHER;
    } else if (delim.length() == 1) {
        matcher = new CharMatcher(delim.charAt(0));
    } else {
        matcher = new StringMatcher(delim);
    }
    setDelimiterMatcher(matcher);
}
// Quote
//-----------------------------------------------------------------------
/**
 * Gets the quote matcher currently in use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 * Unless changed, the matcher is <code>NONE_MATCHER</code>, so nothing
 * is treated as a quote.
 *
 * @return the quote matcher in use
 */
public Matcher getQuoteMatcher() {
    return this.quote;
}
/**
 * Sets the quote matcher to use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 *
 * @param quote  the quote matcher to use, null leaves the current
 *  matcher unchanged
 */
public void setQuoteMatcher(Matcher quote) {
    if (quote == null) {
        return;
    }
    this.quote = quote;
}
/**
 * Sets the quote to a single character.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 *
 * @param quote  the quote character to use
 */
public void setQuoteChar(char quote) {
    final Matcher matcher = new CharMatcher(quote);
    setQuoteMatcher(matcher);
}
// Ignored
//-----------------------------------------------------------------------
/**
 * Gets the ignored character matcher.
 * <p>
 * These characters are ignored when parsing the String, unless they are
 * within a quoted region.
 * Unless changed, the matcher is <code>NONE_MATCHER</code>, so no
 * characters are ignored.
 *
 * @return the ignored matcher in use
 */
public Matcher getIgnoredMatcher() {
    return this.ignored;
}
/**
 * Sets the matcher for characters to ignore.
 * <p>
 * These characters are ignored when parsing the String, unless they are
 * within a quoted region.
 *
 * @param ignored  the ignored matcher to use, null leaves the current
 *  matcher unchanged
 */
public void setIgnoredMatcher(Matcher ignored) {
    if (ignored == null) {
        return;
    }
    this.ignored = ignored;
}
/**
 * Sets a single character to ignore.
 * <p>
 * This character is ignored when parsing the String, unless it is
 * within a quoted region.
 *
 * @param ignored  the ignored character to use
 */
public void setIgnoredChar(char ignored) {
    final Matcher matcher = new CharMatcher(ignored);
    setIgnoredMatcher(matcher);
}
//-----------------------------------------------------------------------
/**
 * Gets whether the tokenizer currently returns empty tokens as null.
 * The default for this property is false.
 *
 * @return true if empty tokens are returned as null
 */
public boolean isEmptyTokenAsNull() {
    return this.emptyAsNull;
}
/**
 * Sets whether the tokenizer should return empty tokens as null.
 * The default for this property is false.
 * <p>
 * The new value is applied when tokens are next read from the input.
 *
 * @param emptyAsNull  whether empty tokens are returned as null
 */
public void setEmptyTokenAsNull(boolean emptyAsNull) {
    final boolean asNull = emptyAsNull;
    this.emptyAsNull = asNull;
}
//-----------------------------------------------------------------------
/**
 * Gets whether the tokenizer currently ignores empty tokens.
 * The default for this property is true.
 *
 * @return true if empty tokens are not returned
 */
public boolean isIgnoreEmptyTokens() {
    return this.ignoreEmptyTokens;
}
/**
 * Sets whether the tokenizer should ignore and not return empty tokens.
 * The default for this property is true.
 * <p>
 * The new value is applied when tokens are next read from the input.
 *
 * @param ignoreEmptyTokens  whether empty tokens are not returned
 */
public void setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
    final boolean ignore = ignoreEmptyTokens;
    this.ignoreEmptyTokens = ignore;
}
//-----------------------------------------------------------------------
/**
 * Gets the String content that the tokenizer is parsing.
 * <p>
 * When the input was supplied as a character array, the String is built
 * on first request and then remembered.
 *
 * @return the string content being parsed
 */
public String getContent() {
    if (text != null) {
        return text;
    }
    text = new String(chars);
    return text;
}
//-----------------------------------------------------------------------
/**
 * Creates a new instance of this Tokenizer.
 * The new instance is reset so that it will be at the start of the token list.
 *
 * @return the reset copy, or null if cloning is reported as unsupported
 */
public Object clone() {
    final Object copy;
    try {
        copy = super.clone();
    } catch (CloneNotSupportedException ex) {
        return null;
    }
    // chars[] is shared with the copy: it is treated as immutable
    ((StrTokenizer) copy).reset();
    return copy;
}
//-----------------------------------------------------------------------
/**
 * Defines the interface used to match a set of characters during tokenization.
 * Standard implementations of this interface are provided in the library.
 * These are accessed via the create*() factory methods on StrTokenizer.
 * If your application needs more unusual matching, implement this interface
 * directly.
 */
public static interface Matcher {
/**
 * Returns the number of characters that match at the specified position.
 * <p>
 * This method is called to check for a match.
 * The parameter <code>pos</code> represents the current position to be
 * checked in the string <code>text</code> (a character array which must
 * not be changed).
 * The text length is also provided for efficiency.
 * The API guarantees that <code>pos</code> is a valid index for
 * <code>text</code>.
 * <p>
 * The matching code may check one character or many.
 * It must return zero for no match, or a positive number if a match was
 * found.
 * The number indicates the number of characters that matched.
 *
 * @param text the text content to match against, do not change
 * @param textLen the length of the text
 * @param pos the starting position for the match, valid for text
 * @return the number of matching characters, zero for no match
 */
int isMatch(char[] text, int textLen, int pos);
}
//-----------------------------------------------------------------------
/**
 * Matcher that accepts any single character drawn from a fixed set.
 */
static final class CharSetMatcher implements Matcher {
    /** The accepted characters, kept sorted so they can be binary searched */
    private char[] chars;

    /**
     * Constructor that creates a matcher from a character array.
     * The array is copied, so later changes to it have no effect here.
     *
     * @param chars  the characters to match, must not be null
     */
    CharSetMatcher(char chars[]) {
        super();
        char[] sorted = (char[]) chars.clone();
        Arrays.sort(sorted);
        this.chars = sorted;
    }

    /**
     * Returns whether or not the character at the given position is in the set.
     *
     * @param text  the text content to match against
     * @param textLen  the length of the text
     * @param pos  the starting position
     * @return 1 if the character is in the set, zero for no match
     */
    public int isMatch(char[] text, int textLen, int pos) {
        if (Arrays.binarySearch(chars, text[pos]) < 0) {
            return 0;
        }
        return 1;
    }
}
//-----------------------------------------------------------------------
/**
 * Matcher that accepts exactly one specific character.
 */
static final class CharMatcher implements Matcher {
    /** The only character this matcher accepts */
    private char ch;

    /**
     * Constructor that creates a matcher that matches a single character.
     *
     * @param ch  the character to match
     */
    CharMatcher(char ch) {
        super();
        this.ch = ch;
    }

    /**
     * Returns whether or not the character at the given position matches.
     *
     * @param text  the text content to match against
     * @param textLen  the length of the text
     * @param pos  the starting position
     * @return 1 if the character matches, zero for no match
     */
    public int isMatch(char[] text, int textLen, int pos) {
        if (text[pos] == ch) {
            return 1;
        }
        return 0;
    }
}
//-----------------------------------------------------------------------
/**
 * Matcher that accepts one exact sequence of characters.
 */
static final class StringMatcher implements Matcher {
    /** The characters that must all be present, in order, for a match */
    private char[] chars;

    /**
     * Constructor that creates a matcher from a String.
     *
     * @param str  the string to match, must not be null
     */
    StringMatcher(String str) {
        super();
        chars = str.toCharArray();
    }

    /**
     * Returns whether or not the given text matches the stored string,
     * starting at the given position.
     *
     * @param text  the text content to match against
     * @param textLen  the length of the text
     * @param pos  the starting position
     * @return the length of the stored string if it matches, zero for no match
     */
    public int isMatch(char[] text, int textLen, int pos) {
        int len = chars.length;
        // reject only when the candidate would run past the end of the
        // text; a match that ends exactly at the end is still valid
        if (pos + len > textLen) {
            return 0;
        }
        for (int i = 0; i < len; i++, pos++) {
            if (chars[i] != text[pos]) {
                return 0;
            }
        }
        return len;
    }
}
//-----------------------------------------------------------------------
/**
 * Matcher that never matches anything.
 */
static final class NoMatcher implements Matcher {

    /**
     * Creates the matcher; it holds no state.
     */
    NoMatcher() {
    }

    /**
     * Always reports no match, whatever the arguments.
     *
     * @param text  the text content to match against (ignored)
     * @param textLen  the length of the text (ignored)
     * @param pos  the starting position (ignored)
     * @return always zero, meaning no match
     */
    public int isMatch(char[] text, int textLen, int pos) {
        final int noMatch = 0;
        return noMatch;
    }
}
//-----------------------------------------------------------------------
/**
 * Matcher for the characters that <code>String.trim()</code> treats as
 * whitespace, i.e. every character whose code is 32 (space) or below.
 */
static final class TrimMatcher implements Matcher {

    /**
     * Creates the matcher; it holds no state.
     */
    TrimMatcher() {
    }

    /**
     * Tests whether the character at the given position is a space or any
     * character with a lower code.
     *
     * @param text  the text content to match against
     * @param textLen  the length of the text (not consulted by this matcher)
     * @param pos  the index of the character to test
     * @return 1 if the character matches, zero for no match
     */
    public int isMatch(char[] text, int textLen, int pos) {
        final char candidate = text[pos];
        // ' ' is character code 32, the upper bound of the trim range.
        if (candidate <= ' ') {
            return 1;
        }
        return 0;
    }
}
}
1.2 +3 -1
jakarta-commons/lang/src/test/org/apache/commons/lang/text/TextTestSuite.java
Index: TextTestSuite.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/lang/src/test/org/apache/commons/lang/text/TextTestSuite.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- TextTestSuite.java 5 Sep 2004 00:56:31 -0000 1.1
+++ TextTestSuite.java 6 Oct 2004 22:29:24 -0000 1.2
@@ -49,6 +49,8 @@
TestSuite suite = new TestSuite();
suite.setName("Commons-Lang-Text Tests");
suite.addTest(InterpolationTest.suite());
+ suite.addTest(StrTokenizerTest.suite());
return suite;
}
+
}
1.1
jakarta-commons/lang/src/test/org/apache/commons/lang/text/StrTokenizerTest.java
Index: StrTokenizerTest.java
===================================================================
/*
* Copyright 2003-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.lang.text;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import org.apache.commons.lang.ObjectUtils;
/**
 * Unit test for {@link StrTokenizer}.
 *
 * @author Matthew Inger
 */
public class StrTokenizerTest extends TestCase {

    /**
     * JUnit constructor.
     *
     * @param name  the name of the test to run
     */
    public StrTokenizerTest(String name) {
        super(name);
    }

    /**
     * Builds a suite containing every test method of this class.
     *
     * @return the test suite
     */
    public static Test suite() {
        TestSuite suite = new TestSuite(StrTokenizerTest.class);
        suite.setName("StrTokenizerTest Tests");
        return suite;
    }

    /**
     * Runs the suite with the JUnit text runner.
     *
     * @param args  ignored
     */
    public static void main(String[] args) {
        TestRunner.run(suite());
    }

    /**
     * Asserts that two token arrays have the same length and pairwise equal
     * elements, treating two nulls as equal.
     *
     * @param expected  the expected tokens
     * @param tokens  the tokens actually produced
     */
    private static void assertTokens(String[] expected, String[] tokens) {
        assertEquals("token count", expected.length, tokens.length);
        for (int i = 0; i < expected.length; i++) {
            assertTrue("token[" + i + "] was '" + tokens[i]
                    + "' but was expected to be '" + expected[i]
                    + "'",
                    ObjectUtils.equals(expected[i], tokens[i]));
        }
    }

    //-----------------------------------------------------------------------
    /** Semicolon delimiter, quoting, trim-ignored, empty tokens kept. */
    public void test1() {
        String input = "a;b;c;\"d;\"\"e\";f; ; ;";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterChar(';');
        tok.setQuoteChar('"');
        tok.setIgnoredMatcher(StrTokenizer.TRIM_MATCHER);
        tok.setIgnoreEmptyTokens(false);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "b", "c", "d;\"e", "f", "", "", "",
        };
        assertTokens(expected, tokens);
    }

    /** Nothing ignored: trailing whitespace inside a token is preserved. */
    public void test2() {
        String input = "a;b;c ;\"d;\"\"e\";f; ; ;";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterChar(';');
        tok.setQuoteChar('"');
        tok.setIgnoredMatcher(StrTokenizer.NONE_MATCHER);
        tok.setIgnoreEmptyTokens(false);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "b", "c ", "d;\"e", "f", " ", " ", "",
        };
        assertTokens(expected, tokens);
    }

    /** Nothing ignored: leading whitespace inside a token is preserved. */
    public void test3() {
        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterChar(';');
        tok.setQuoteChar('"');
        tok.setIgnoredMatcher(StrTokenizer.NONE_MATCHER);
        tok.setIgnoreEmptyTokens(false);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "b", " c", "d;\"e", "f", " ", " ", "",
        };
        assertTokens(expected, tokens);
    }

    /** Trim-ignored with empty tokens dropped. */
    public void test4() {
        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterChar(';');
        tok.setQuoteChar('"');
        tok.setIgnoredMatcher(StrTokenizer.TRIM_MATCHER);
        tok.setIgnoreEmptyTokens(true);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "b", "c", "d;\"e", "f",
        };
        assertTokens(expected, tokens);
    }

    /** Trim-ignored with empty tokens reported as null. */
    public void test5() {
        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterChar(';');
        tok.setQuoteChar('"');
        tok.setIgnoredMatcher(StrTokenizer.TRIM_MATCHER);
        tok.setIgnoreEmptyTokens(false);
        tok.setEmptyTokenAsNull(true);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "b", "c", "d;\"e", "f", null, null, null,
        };
        assertTokens(expected, tokens);
    }

    /**
     * Checks that the whole token list can be walked forwards and then
     * backwards. Only token counts are verified here, not token contents;
     * setEmptyTokenAsNull is not called in this test.
     */
    public void test6() {
        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterChar(';');
        tok.setQuoteChar('"');
        tok.setIgnoredMatcher(StrTokenizer.TRIM_MATCHER);
        tok.setIgnoreEmptyTokens(false);
        String[] tokens = tok.getAllTokens();
        // Five non-empty tokens followed by three empty ones.
        final int expectedCount = 8;

        int nextCount = 0;
        while (tok.hasNext()) {
            tok.next();
            nextCount++;
        }
        int prevCount = 0;
        while (tok.hasPrevious()) {
            tok.previous();
            prevCount++;
        }

        assertEquals("token count", expectedCount, tokens.length);
        assertEquals("could not cycle through entire token list"
                + " using the 'hasNext' and 'next' methods",
                expectedCount, nextCount);
        assertEquals("could not cycle through entire token list"
                + " using the 'hasPrevious' and 'previous' methods",
                expectedCount, prevCount);
    }

    /** Space delimiter with empty tokens kept. */
    public void test7() {
        // Three spaces between 'a' and 'b' are required to yield the two
        // empty tokens expected below.
        String input = "a   b c \"d e\" f ";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterMatcher(StrTokenizer.SPACE_MATCHER);
        tok.setQuoteMatcher(StrTokenizer.DOUBLE_QUOTE_MATCHER);
        tok.setIgnoredMatcher(StrTokenizer.NONE_MATCHER);
        tok.setIgnoreEmptyTokens(false);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "", "", "b", "c", "d e", "f", "",
        };
        assertTokens(expected, tokens);
    }

    /** Space delimiter with empty tokens dropped. */
    public void test8() {
        String input = "a b c \"d e\" f ";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setDelimiterMatcher(StrTokenizer.SPACE_MATCHER);
        tok.setQuoteMatcher(StrTokenizer.DOUBLE_QUOTE_MATCHER);
        tok.setIgnoredMatcher(StrTokenizer.NONE_MATCHER);
        tok.setIgnoreEmptyTokens(true);
        String[] tokens = tok.getAllTokens();
        String[] expected = new String[] {
            "a", "b", "c", "d e", "f",
        };
        assertTokens(expected, tokens);
    }

    //-----------------------------------------------------------------------
    public void testBasic1() {
        String input = "a b c";
        StrTokenizer tok = new StrTokenizer(input);
        assertEquals("a", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasic2() {
        String input = "a \nb\fc";
        StrTokenizer tok = new StrTokenizer(input);
        assertEquals("a", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasic3() {
        String input = "a \nb\u0001\fc";
        StrTokenizer tok = new StrTokenizer(input);
        assertEquals("a", tok.next());
        assertEquals("b\u0001", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasic4() {
        String input = "a \"b\" c";
        StrTokenizer tok = new StrTokenizer(input);
        assertEquals("a", tok.next());
        assertEquals("\"b\"", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasicQuoted1() {
        String input = "a \"b\" c";
        StrTokenizer tok = new StrTokenizer(input, ' ', '"');
        assertEquals("a", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasicDelim1() {
        String input = "a:b:c";
        StrTokenizer tok = new StrTokenizer(input, ':');
        assertEquals("a", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasicDelim2() {
        String input = "a:b:c";
        StrTokenizer tok = new StrTokenizer(input, ',');
        assertEquals("a:b:c", tok.next());
    }

    public void testBasicEmpty1() {
        // Two spaces between 'a' and 'b' produce the empty token expected below.
        String input = "a  b c";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setIgnoreEmptyTokens(false);
        assertEquals("a", tok.next());
        assertEquals("", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());
    }

    public void testBasicEmpty2() {
        // Two spaces between 'a' and 'b' produce the empty (null) token expected below.
        String input = "a  b c";
        StrTokenizer tok = new StrTokenizer(input);
        tok.setIgnoreEmptyTokens(false);
        tok.setEmptyTokenAsNull(true);
        assertEquals("a", tok.next());
        assertEquals(null, tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());
    }

    //-----------------------------------------------------------------------
    public void testGetContent() {
        String input = "a b c \"d e\" f ";
        StrTokenizer tok = new StrTokenizer(input);
        assertSame(input, tok.getContent());

        tok = new StrTokenizer(input.toCharArray());
        assertEquals(input, tok.getContent());
    }

    public void testReset() {
        String input = "a b c";
        StrTokenizer tok = new StrTokenizer(input);
        assertEquals("a", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());

        tok.reset();
        assertEquals("a", tok.next());
        assertEquals("b", tok.next());
        assertEquals("c", tok.next());

        tok.reset("d e");
        assertEquals("d", tok.next());
        assertEquals("e", tok.next());

        tok.reset("f g".toCharArray());
        assertEquals("f", tok.next());
        assertEquals("g", tok.next());
    }

    //-----------------------------------------------------------------------
    public void testMatcher() {
        assertEquals(1, StrTokenizer.SPACE_MATCHER.isMatch(new char[] {' '}, 1, 0));
        assertEquals(0, StrTokenizer.SPACE_MATCHER.isMatch(new char[] {'\n'}, 1, 0));
        assertEquals(0, StrTokenizer.SPACE_MATCHER.isMatch(new char[] {'\u0001'}, 1, 0));

        assertEquals(1, StrTokenizer.TRIM_MATCHER.isMatch(new char[] {' '}, 1, 0));
        assertEquals(1, StrTokenizer.TRIM_MATCHER.isMatch(new char[] {'\n'}, 1, 0));
        assertEquals(1, StrTokenizer.TRIM_MATCHER.isMatch(new char[] {'\u0001'}, 1, 0));

        assertEquals(1, StrTokenizer.SPLIT_MATCHER.isMatch(new char[] {' '}, 1, 0));
        assertEquals(1, StrTokenizer.SPLIT_MATCHER.isMatch(new char[] {'\n'}, 1, 0));
        assertEquals(0, StrTokenizer.SPLIT_MATCHER.isMatch(new char[] {'\u0001'}, 1, 0));
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]