It looks like all of the CSVLexer instance variables (ivars) that are still package-private can now be made private. Gary
On Tue, Aug 6, 2013 at 11:44 AM, <s...@apache.org> wrote: > Author: sebb > Date: Tue Aug 6 15:44:41 2013 > New Revision: 1511006 > > URL: http://svn.apache.org/r1511006 > Log: > Merge Lexer with CSVLexer > > Removed: > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java > Modified: > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java > > Modified: > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java?rev=1511006&r1=1511005&r2=1511006&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > (original) > +++ > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > Tue Aug 6 15:44:41 2013 > @@ -17,6 +17,13 @@ > > package org.apache.commons.csv; > > +import static org.apache.commons.csv.Constants.BACKSPACE; > +import static org.apache.commons.csv.Constants.CR; > +import static org.apache.commons.csv.Constants.END_OF_STREAM; > +import static org.apache.commons.csv.Constants.FF; > +import static org.apache.commons.csv.Constants.LF; > +import static org.apache.commons.csv.Constants.TAB; > +import static org.apache.commons.csv.Constants.UNDEFINED; > import static org.apache.commons.csv.Token.Type.COMMENT; > import static org.apache.commons.csv.Token.Type.EOF; > import static org.apache.commons.csv.Token.Type.EORECORD; > @@ -30,11 +37,38 @@ import java.io.IOException; > * > * @version $Id$ > */ > -final class CSVLexer extends Lexer { > +final class CSVLexer { > + > + /** > + * Constant char to use 
for disabling comments, escapes and > encapsulation. The value -2 is used because it > + * won't be confused with an EOF signal (-1), and because the Unicode > value {@code FFFE} would be encoded as two > + * chars (using surrogates) and thus there should never be a > collision with a real text char. > + */ > + private static final char DISABLED = '\ufffe'; > + > + private final char delimiter; > + private final char escape; > + private final char quoteChar; > + private final char commmentStart; > + > + final boolean ignoreSurroundingSpaces; > + final boolean ignoreEmptyLines; > + > + final CSVFormat format; > + > + /** The input stream */ > + final ExtendedBufferedReader in; > > /** INTERNAL API. ctor needs to be public so can be called > dynamically by PerformanceTest class */ > CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) { > - super(format, in); > + this.format = format; > + this.in = in; > + this.delimiter = format.getDelimiter(); > + this.escape = mapNullToDisabled(format.getEscape()); > + this.quoteChar = mapNullToDisabled(format.getQuoteChar()); > + this.commmentStart = mapNullToDisabled(format.getCommentStart()); > + this.ignoreSurroundingSpaces = > format.getIgnoreSurroundingSpaces(); > + this.ignoreEmptyLines = format.getIgnoreEmptyLines(); > } > > /** > @@ -48,7 +82,6 @@ final class CSVLexer extends Lexer { > * @throws java.io.IOException > * on stream access error > */ > - @Override > Token nextToken(final Token token) throws IOException { > > // get the last read char (required for empty line detection) > @@ -257,4 +290,144 @@ final class CSVLexer extends Lexer { > } > } > > + private final char mapNullToDisabled(final Character c) { > + return c == null ? 
DISABLED : c.charValue(); > + } > + > + /** > + * Returns the current line number > + * > + * @return the current line number > + */ > + long getCurrentLineNumber() { > + return in.getCurrentLineNumber(); > + } > + > + // TODO escape handling needs more work > + /** > + * Handle an escape sequence. > + * The current character must be the escape character. > + * On return, the next character is available by calling {@link > ExtendedBufferedReader#getLastChar()} > + * on the input stream. > + * > + * @return the unescaped character (as an int) or {@link > END_OF_STREAM} if char following the escape is invalid. > + * @throws IOException if there is a problem reading the stream or > the end of stream is detected: > + * the escape character is not allowed at end of strem > + */ > + int readEscape() throws IOException { > + // the escape char has just been read (normally a backslash) > + final int ch = in.read(); > + switch (ch) { > + case 'r': > + return CR; > + case 'n': > + return LF; > + case 't': > + return TAB; > + case 'b': > + return BACKSPACE; > + case 'f': > + return FF; > + case CR: > + case LF: > + case FF: // TODO is this correct? > + case TAB: // TODO is this correct? Do tabs need to be escaped? > + case BACKSPACE: // TODO is this correct? > + return ch; > + case END_OF_STREAM: > + throw new IOException("EOF whilst processing escape > sequence"); > + default: > + // Now check for meta-characters > + if (isMetaChar(ch)) { > + return ch; > + } > + // indicate unexpected char - available from in.getLastChar() > + return END_OF_STREAM; > + } > + } > + > + void trimTrailingSpaces(final StringBuilder buffer) { > + int length = buffer.length(); > + while (length > 0 && Character.isWhitespace(buffer.charAt(length > - 1))) { > + length = length - 1; > + } > + if (length != buffer.length()) { > + buffer.setLength(length); > + } > + } > + > + /** > + * Greedily accepts \n, \r and \r\n This checker consumes silently > the second control-character... 
> + * > + * @return true if the given or next character is a line-terminator > + */ > + boolean readEndOfLine(int ch) throws IOException { > + // check if we have \r\n... > + if (ch == CR && in.lookAhead() == LF) { > + // note: does not change ch outside of this method! > + ch = in.read(); > + } > + return ch == LF || ch == CR; > + } > + > + boolean isClosed() { > + return in.isClosed(); > + } > + > + /** > + * @return true if the given char is a whitespace character > + */ > + boolean isWhitespace(final int ch) { > + return !isDelimiter(ch) && Character.isWhitespace((char) ch); > + } > + > + /** > + * Checks if the current character represents the start of a line: a > CR, LF or is at the start of the file. > + * > + * @param ch the character to check > + * @return true if the character is at the start of a line. > + */ > + boolean isStartOfLine(final int ch) { > + return ch == LF || ch == CR || ch == UNDEFINED; > + } > + > + /** > + * @return true if the given character indicates end of file > + */ > + boolean isEndOfFile(final int ch) { > + return ch == END_OF_STREAM; > + } > + > + boolean isDelimiter(final int ch) { > + return ch == delimiter; > + } > + > + boolean isEscape(final int ch) { > + return ch == escape; > + } > + > + boolean isQuoteChar(final int ch) { > + return ch == quoteChar; > + } > + > + boolean isCommentStart(final int ch) { > + return ch == commmentStart; > + } > + > + private boolean isMetaChar(final int ch) { > + return ch == delimiter || > + ch == escape || > + ch == quoteChar || > + ch == commmentStart; > + } > + > + /** > + * Closes resources. 
> + * > + * @throws IOException > + * If an I/O error occurs > + */ > + void close() throws IOException { > + in.close(); > + } > } > > Modified: > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java?rev=1511006&r1=1511005&r2=1511006&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > (original) > +++ > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > Tue Aug 6 15:44:41 2013 > @@ -217,7 +217,7 @@ public final class CSVParser implements > private final CSVFormat format; > private final Map<String, Integer> headerMap; > > - private final Lexer lexer; > + private final CSVLexer lexer; > > /** A record buffer for getRecord(). Grows as necessary and is > reused. */ > private final List<String> record = new ArrayList<String>(); > > Modified: > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java?rev=1511006&r1=1511005&r2=1511006&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > (original) > +++ > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > Tue Aug 6 15:44:41 2013 > @@ -52,14 +52,14 @@ public class CSVLexerTest { > formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\'); > } > > - private Lexer getLexer(final String input, final CSVFormat format) { > + private CSVLexer getLexer(final String input, final CSVFormat format) > { > return new CSVLexer(format, new ExtendedBufferedReader(new > StringReader(input))); > } > > @Test > public void testSurroundingSpacesAreDeleted() 
throws IOException { > final String code = "noSpaces, leadingSpaces,trailingSpaces , > surroundingSpaces , ,,"; > - final Lexer parser = getLexer(code, > CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true)); > + final CSVLexer parser = getLexer(code, > CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true)); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "noSpaces")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "leadingSpaces")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "trailingSpaces")); > @@ -72,7 +72,7 @@ public class CSVLexerTest { > @Test > public void testSurroundingTabsAreDeleted() throws IOException { > final String code = > "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,"; > - final Lexer parser = getLexer(code, > CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true)); > + final CSVLexer parser = getLexer(code, > CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true)); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "noTabs")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "leadingTab")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "trailingTab")); > @@ -99,7 +99,7 @@ public class CSVLexerTest { > "\n"+ > "\n"; > final CSVFormat format = > CSVFormat.DEFAULT.withIgnoreEmptyLines(true); > - final Lexer parser = getLexer(code, format); > + final CSVLexer parser = getLexer(code, format); > > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "first")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "line")); > @@ -123,7 +123,7 @@ public class CSVLexerTest { > "# penultimate comment\n"+ > "# Final comment\n"; > final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#'); > - final Lexer parser = getLexer(code, format); > + final CSVLexer parser = getLexer(code, format); > > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "first")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "line")); > @@ -161,7 +161,7 @@ 
public class CSVLexerTest { > final CSVFormat format = > CSVFormat.DEFAULT.withCommentStart('#').withIgnoreEmptyLines(false); > assertFalse("Should not ignore empty lines", > format.getIgnoreEmptyLines()); > > - final Lexer parser = getLexer(code, format); > + final CSVLexer parser = getLexer(code, format); > > > assertThat(parser.nextToken(new Token()), matches(TOKEN, "1")); > @@ -199,7 +199,7 @@ public class CSVLexerTest { > final String code = "a,\\,,b\\\n\\,,"; > final CSVFormat format = CSVFormat.DEFAULT; > assertFalse(format.isEscaping()); > - final Lexer parser = getLexer(code, format); > + final CSVLexer parser = getLexer(code, format); > > assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); > // an unquoted single backslash is not an escape char > @@ -221,7 +221,7 @@ public class CSVLexerTest { > final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne"; > final CSVFormat format = > formatWithEscaping.withIgnoreEmptyLines(false); > assertTrue(format.isEscaping()); > - final Lexer parser = getLexer(code, format); > + final CSVLexer parser = getLexer(code, format); > > assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, ",")); > @@ -241,7 +241,7 @@ public class CSVLexerTest { > * a, " foo " ,b > */ > final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" > ,b\na, \" foo \" ,b"; > - final Lexer parser = getLexer(code, > CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true)); > + final CSVLexer parser = getLexer(code, > CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true)); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo")); > assertThat(parser.nextToken(new Token()), matches(EORECORD, "b")); > @@ -261,7 +261,7 @@ public class CSVLexerTest { > @Test > public void testNextToken5() throws IOException { > final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t > \n\""; > - final Lexer parser = 
getLexer(code, CSVFormat.DEFAULT); > + final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, > "foo\n")); > assertThat(parser.nextToken(new Token()), matches(EORECORD, "b")); > @@ -280,7 +280,7 @@ public class CSVLexerTest { > */ > final String code = "a;'b and '' more\n'\n!comment;;;;\n;;"; > final CSVFormat format = > CSVFormat.DEFAULT.withQuoteChar('\'').withCommentStart('!').withDelimiter(';'); > - final Lexer parser = getLexer(code, format); > + final CSVLexer parser = getLexer(code, format); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); > assertThat(parser.nextToken(new Token()), matches(EORECORD, "b > and ' more\n")); > } > @@ -289,7 +289,7 @@ public class CSVLexerTest { > @Test > public void testDelimiterIsWhitespace() throws IOException { > final String code = "one\ttwo\t\tfour \t five\t six"; > - final Lexer parser = getLexer(code, CSVFormat.TDF); > + final CSVLexer parser = getLexer(code, CSVFormat.TDF); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "one")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "two")); > assertThat(parser.nextToken(new Token()), matches(TOKEN, "")); > @@ -300,96 +300,96 @@ public class CSVLexerTest { > > @Test > public void testEscapedCR() throws Exception { > - final Lexer lexer = getLexer("character\\" + CR + "Escaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\" + CR + "Escaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > CR + "Escaped")); > } > > @Test > public void testCR() throws Exception { > - final Lexer lexer = getLexer("character" + CR + "NotEscaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character" + CR + "NotEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character")); > assertThat(lexer.nextToken(new 
Token()), > hasContent("NotEscaped")); > } > > @Test > public void testEscapedLF() throws Exception { > - final Lexer lexer = getLexer("character\\" + LF + "Escaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\" + LF + "Escaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > LF + "Escaped")); > } > > @Test > public void testLF() throws Exception { > - final Lexer lexer = getLexer("character" + LF + "NotEscaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character" + LF + "NotEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character")); > assertThat(lexer.nextToken(new Token()), > hasContent("NotEscaped")); > } > > @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped? > public void testEscapedTab() throws Exception { > - final Lexer lexer = getLexer("character\\" + TAB + "Escaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\" + TAB + "Escaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > TAB + "Escaped")); > } > > @Test > public void testTab() throws Exception { > - final Lexer lexer = getLexer("character" + TAB + "NotEscaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character" + TAB + "NotEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > TAB + "NotEscaped")); > } > > @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be > unescaped? 
> public void testEscapedBackspace() throws Exception { > - final Lexer lexer = getLexer("character\\" + BACKSPACE + > "Escaped", formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\" + BACKSPACE + > "Escaped", formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > BACKSPACE + "Escaped")); > } > > @Test > public void testBackspace() throws Exception { > - final Lexer lexer = getLexer("character" + BACKSPACE + > "NotEscaped", formatWithEscaping); > + final CSVLexer lexer = getLexer("character" + BACKSPACE + > "NotEscaped", formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > BACKSPACE + "NotEscaped")); > } > > @Test // TODO is this correct? Do we expect <esc>FF to be unescaped? > public void testEscapedFF() throws Exception { > - final Lexer lexer = getLexer("character\\" + FF + "Escaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\" + FF + "Escaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > FF + "Escaped")); > } > > @Test > public void testFF() throws Exception { > - final Lexer lexer = getLexer("character" + FF + "NotEscaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character" + FF + "NotEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > FF + "NotEscaped")); > } > > @Test > public void testEscapedMySqlNullValue() throws Exception { > // MySQL uses \N to symbolize null values. 
We have to restore this > - final Lexer lexer = getLexer("character\\NEscaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\NEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), > hasContent("character\\NEscaped")); > } > > @Test > public void testEscapedCharacter() throws Exception { > - final Lexer lexer = getLexer("character\\aEscaped", > formatWithEscaping); > + final CSVLexer lexer = getLexer("character\\aEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), > hasContent("character\\aEscaped")); > } > > @Test > public void testEscapedControlCharacter() throws Exception { > // we are explicitly using an escape different from \ here > - final Lexer lexer = getLexer("character!rEscaped", > CSVFormat.DEFAULT.withEscape('!')); > + final CSVLexer lexer = getLexer("character!rEscaped", > CSVFormat.DEFAULT.withEscape('!')); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > CR + "Escaped")); > } > > @Test > public void testEscapedControlCharacter2() throws Exception { > - final Lexer lexer = getLexer("character\\rEscaped", > CSVFormat.DEFAULT.withEscape('\\')); > + final CSVLexer lexer = getLexer("character\\rEscaped", > CSVFormat.DEFAULT.withEscape('\\')); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > CR + "Escaped")); > } > > @Test(expected = IOException.class) > public void testEscapingAtEOF() throws Exception { > final String code = "escaping at EOF is evil\\"; > - final Lexer lexer = getLexer(code, formatWithEscaping); > + final CSVLexer lexer = getLexer(code, formatWithEscaping); > > lexer.nextToken(new Token()); > } > > Modified: > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java?rev=1511006&r1=1511005&r2=1511006&view=diff > > 
============================================================================== > --- > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java > (original) > +++ > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java > Tue Aug 6 15:44:41 2013 > @@ -224,9 +224,9 @@ public class PerformanceTest { > } > > > - private static Constructor<Lexer> getLexerCtor(final String clazz) > throws Exception { > + private static Constructor<CSVLexer> getLexerCtor(final String clazz) > throws Exception { > @SuppressWarnings("unchecked") > - final Class<Lexer> lexer = (Class<Lexer>) > Class.forName("org.apache.commons.csv." + clazz); > + final Class<CSVLexer> lexer = (Class<CSVLexer>) > Class.forName("org.apache.commons.csv." + clazz); > return lexer.getConstructor(new Class<?>[]{CSVFormat.class, > ExtendedBufferedReader.class}); > } > > @@ -235,7 +235,7 @@ public class PerformanceTest { > String dynamic = ""; > for (int i = 0; i < max; i++) { > final ExtendedBufferedReader input = new > ExtendedBufferedReader(getReader()); > - Lexer lexer = null; > + CSVLexer lexer = null; > if (test.startsWith("CSVLexer")) { > dynamic="!"; > lexer = getLexerCtor(test).newInstance(new > Object[]{format, input}); > > > -- E-Mail: garydgreg...@gmail.com | ggreg...@apache.org Java Persistence with Hibernate, Second Edition<http://www.manning.com/bauer3/> JUnit in Action, Second Edition <http://www.manning.com/tahchiev/> Spring Batch in Action <http://www.manning.com/templier/> Blog: http://garygregory.wordpress.com Home: http://garygregory.com/ Tweet! http://twitter.com/GaryGregory