Benedikt
2014-10-29 6:44 GMT+01:00 <ggreg...@apache.org>:
Author: ggregory
Date: Wed Oct 29 05:44:40 2014
New Revision: 1635052
URL: http://svn.apache.org/r1635052
Log:
[CSV-131] Save positions of records to enable random access. The floor
is
open for code review and further discussion based on the comments in the
Jira.
Modified:
commons/proper/csv/trunk/src/changes/changes.xml
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVParser.java
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVRecord.java
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVParserTest.java
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVRecordTest.java
Modified: commons/proper/csv/trunk/src/changes/changes.xml
URL:
http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/
changes/changes.xml?rev=1635052&r1=1635051&r2=1635052&view=diff
============================================================
==================
--- commons/proper/csv/trunk/src/changes/changes.xml (original)
+++ commons/proper/csv/trunk/src/changes/changes.xml Wed Oct 29
05:44:40
2014
@@ -39,6 +39,7 @@
</properties>
<body>
<release version="1.1" date="2014-mm-dd" description="Feature and
bug
fix release">
+ <action issue="CSV-131" type="add" dev="ggregory" due-to="Holger
Stratmann">Save positions of records to enable random access</action>
<action issue="CSV-130" type="fix" dev="ggregory" due-to="Sergei
Lebedev">CSVFormat#withHeader doesn't work well with #printComment, add
withHeaderComments(String...)</action>
<action issue="CSV-128" type="fix"
dev="ggregory">CSVFormat.EXCEL
should ignore empty header names</action>
<action issue="CSV-129" type="add" dev="ggregory">Add
CSVFormat#with 0-arg methods matching boolean arg methods</action>
Modified:
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVParser.java
URL:
http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/
main/java/org/apache/commons/csv/CSVParser.java?rev=
1635052&r1=1635051&r2=1635052&view=diff
============================================================
==================
---
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVParser.java
(original)
+++
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVParser.java
Wed Oct 29 05:44:40 2014
@@ -219,6 +219,12 @@ public final class CSVParser implements
private final List<String> record = new ArrayList<String>();
private long recordNumber;
+
+ /**
+ * Lexer offset if the parser does not start parsing at the
beginning
of the source. Usually used in combination
+ * with {@link #setNextRecordNumber(long)}
+ */
+ private long characterOffset;
private final Token reusableToken = new Token();
@@ -296,6 +302,43 @@ public final class CSVParser implements
}
/**
+ * Sets the record number to be assigned to the next record read.
+ * <p>
+ * Use this if the reader is not positioned at the first record
when
you create the parser. For example, the first
+ * record read might be the 51st record in the source file.
+ * </p>
+ * <p>
+ * If you want the records to also have the correct character
position referring to the underlying source, call
+ * {@link #setNextCharacterPosition(long)}.
+ * </p>
+ *
+ * @param nextRecordNumber
+ * the next record number
+ * @since 1.1
+ */
+ public void setNextRecordNumber(long nextRecordNumber) {
+ this.recordNumber = nextRecordNumber - 1;
+ }
+
+ /**
+ * Sets the current position in the source stream regardless of
where
the parser and lexer start reading.
+ * <p>
+ * For example: We open a file and seek to position 5434 in order
to
start reading at record 42. In order to have
+ * the parser assign the correct characterPosition to records, we
call this method.
+ * </p>
+ * <p>
+ * If you want the records to also have the correct record numbers,
call {@link #setNextRecordNumber(long)}
+ * </p>
+ *
+ * @param position
+ * the new character position
+ * @since 1.1
+ */
+ public void setNextCharacterPosition(long position) {
+ this.characterOffset = position - lexer.getCharacterPosition();
+ }
+
+ /**
* Returns the current record number in the input stream.
*
* <p>
@@ -445,6 +488,7 @@ public final class CSVParser implements
CSVRecord result = null;
this.record.clear();
StringBuilder sb = null;
+ final long startCharPosition = lexer.getCharacterPosition() +
this.characterOffset;
do {
this.reusableToken.reset();
this.lexer.nextToken(this.reusableToken);
@@ -480,7 +524,7 @@ public final class CSVParser implements
this.recordNumber++;
final String comment = sb == null ? null : sb.toString();
result = new CSVRecord(this.record.toArray(new
String[this.record.size()]), this.headerMap, comment,
- this.recordNumber);
+ this.recordNumber, startCharPosition);
}
return result;
}
Modified:
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVRecord.java
URL:
http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/
main/java/org/apache/commons/csv/CSVRecord.java?rev=
1635052&r1=1635051&r2=1635052&view=diff
============================================================
==================
---
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVRecord.java
(original)
+++
commons/proper/csv/trunk/src/main/java/org/apache/commons/
csv/CSVRecord.java
Wed Oct 29 05:44:40 2014
@@ -36,6 +36,8 @@ public final class CSVRecord implements
private static final long serialVersionUID = 1L;
+ private final long characterPosition;
+
/** The accumulated comments (if any) */
private final String comment;
@@ -44,15 +46,16 @@ public final class CSVRecord implements
/** The record number. */
private final long recordNumber;
-
+
/** The values of the record */
private final String[] values;
- CSVRecord(final String[] values, final Map<String, Integer>
mapping,
final String comment, final long recordNumber) {
+ CSVRecord(final String[] values, final Map<String, Integer>
mapping,
final String comment, final long recordNumber, long characterPosition) {
this.recordNumber = recordNumber;
this.values = values != null ? values : EMPTY_STRING_ARRAY;
this.mapping = mapping;
this.comment = comment;
+ this.characterPosition = characterPosition;
}
/**
@@ -110,6 +113,16 @@ public final class CSVRecord implements
}
/**
+ * Returns the start position of this record as a character
position
in the source stream. This may or may not
+ * correspond to the byte position depending on the character set.
+ *
+ * @return the position of this record in the source stream.
+ */
+ public long getCharacterPosition() {
+ return characterPosition;
+ }
+
+ /**
* Returns the comment for this record, if any.
*
* @return the comment for this record, or null if no comment for
this record is available.
Modified:
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVParserTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/
test/java/org/apache/commons/csv/CSVParserTest.java?rev=
1635052&r1=1635051&r2=1635052&view=diff
============================================================
==================
---
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVParserTest.java
(original)
+++
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVParserTest.java
Wed Oct 29 05:44:40 2014
@@ -299,22 +299,23 @@ public class CSVParserTest {
}
}
-// @Test
-// public void testStartWithEmptyLinesThenHeaders() throws
Exception
{
-// final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n",
"hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n", "hello,\"\"\n\n\n" };
-// final String[][] res = { { "hello", "" }, { "" }, // Excel
format does not ignore empty lines
-// { "" } };
-// for (final String code : codes) {
-// final CSVParser parser = CSVParser.parse(code,
CSVFormat.EXCEL);
-// final List<CSVRecord> records = parser.getRecords();
-// assertEquals(res.length, records.size());
-// assertTrue(records.size() > 0);
-// for (int i = 0; i < res.length; i++) {
-// assertArrayEquals(res[i], records.get(i).values());
-// }
-// parser.close();
-// }
-// }
+ // @Test
+ // public void testStartWithEmptyLinesThenHeaders() throws
Exception {
+ // final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n",
"hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n",
+ // "hello,\"\"\n\n\n" };
+ // final String[][] res = { { "hello", "" }, { "" }, // Excel
format
does not ignore empty lines
+ // { "" } };
+ // for (final String code : codes) {
+ // final CSVParser parser = CSVParser.parse(code, CSVFormat.EXCEL);
+ // final List<CSVRecord> records = parser.getRecords();
+ // assertEquals(res.length, records.size());
+ // assertTrue(records.size() > 0);
+ // for (int i = 0; i < res.length; i++) {
+ // assertArrayEquals(res[i], records.get(i).values());
+ // }
+ // parser.close();
+ // }
+ // }
@Test
public void testEndOfFileBehaviorCSV() throws Exception {
@@ -475,6 +476,16 @@ public class CSVParserTest {
}
@Test
+ public void testGetRecordPositionWithCRLF() throws Exception {
+ this.validateRecordPosition(CRLF);
+ }
+
+ @Test
+ public void testGetRecordPositionWithLF() throws Exception {
+ this.validateRecordPosition(String.valueOf(LF));
+ }
+
+ @Test
public void testGetOneLine() throws IOException {
final CSVParser parser = CSVParser.parse(CSV_INPUT_1,
CSVFormat.DEFAULT);
final CSVRecord record = parser.getRecords().get(0);
@@ -902,4 +913,65 @@ public class CSVParserTest {
parser.close();
}
+ private void validateRecordPosition(final String lineSeparator)
throws IOException {
+ final String nl = lineSeparator; // used as linebreak in values
for better distinction
+
+ String code = "a,b,c" + lineSeparator + "1,2,3" + lineSeparator
+
+ // to see if recordPosition correctly points to the enclosing
quote
+ "'A" + nl + "A','B" + nl + "B',CC" + lineSeparator +
+ // unicode test... not very relevant while operating on
strings instead of bytes, but for
+ // completeness...
+ "\u00c4,\u00d6,\u00dc" + lineSeparator + "EOF,EOF,EOF";
+
+ final CSVFormat format =
CSVFormat.newFormat(',').withQuote('\'').withRecordSeparator(
lineSeparator);
+ CSVParser parser = CSVParser.parse(code, format);
+
+ CSVRecord record;
+ assertEquals(0, parser.getRecordNumber());
+
+ assertNotNull(record = parser.nextRecord());
+ assertEquals(1, record.getRecordNumber());
+ assertEquals(code.indexOf('a'), record.getCharacterPosition())
;
+
+ assertNotNull(record = parser.nextRecord());
+ assertEquals(2, record.getRecordNumber());
+ assertEquals(code.indexOf('1'), record.getCharacterPosition())
;
+
+ assertNotNull(record = parser.nextRecord());
+ final long positionRecord3 = record.getCharacterPosition();
+ assertEquals(3, record.getRecordNumber());
+ assertEquals(code.indexOf("'A"),
record.getCharacterPosition());
+ assertEquals("A" + lineSeparator + "A", record.get(0));
+ assertEquals("B" + lineSeparator + "B", record.get(1));
+ assertEquals("CC", record.get(2));
+
+ assertNotNull(record = parser.nextRecord());
+ assertEquals(4, record.getRecordNumber());
+ assertEquals(code.indexOf('\u00c4'),
record.getCharacterPosition());
+
+ assertNotNull(record = parser.nextRecord());
+ assertEquals(5, record.getRecordNumber());
+ assertEquals(code.indexOf("EOF"),
record.getCharacterPosition());
+
+ parser.close();
+
+ // now try to read starting at record 3
+ parser = CSVParser.parse(code.substring((int)
positionRecord3),
format);
+ parser.setNextRecordNumber(3);
+ parser.setNextCharacterPosition(positionRecord3);
+
+ assertNotNull(record = parser.nextRecord());
+ assertEquals(3, record.getRecordNumber());
+ assertEquals(code.indexOf("'A"),
record.getCharacterPosition());
+ assertEquals("A" + lineSeparator + "A", record.get(0));
+ assertEquals("B" + lineSeparator + "B", record.get(1));
+ assertEquals("CC", record.get(2));
+
+ assertNotNull(record = parser.nextRecord());
+ assertEquals(4, record.getRecordNumber());
+ assertEquals(code.indexOf('\u00c4'),
record.getCharacterPosition());
+ assertEquals("\u00c4", record.get(0));
+
+ parser.close();
+ }
}
Modified:
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVRecordTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/
test/java/org/apache/commons/csv/CSVRecordTest.java?rev=
1635052&r1=1635051&r2=1635052&view=diff
============================================================
==================
---
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVRecordTest.java
(original)
+++
commons/proper/csv/trunk/src/test/java/org/apache/commons/
csv/CSVRecordTest.java
Wed Oct 29 05:44:40 2014
@@ -45,12 +45,12 @@ public class CSVRecordTest {
@Before
public void setUp() throws Exception {
values = new String[] { "A", "B", "C" };
- record = new CSVRecord(values, null, null, 0);
+ record = new CSVRecord(values, null, null, 0, -1);
header = new HashMap<String, Integer>();
header.put("first", Integer.valueOf(0));
header.put("second", Integer.valueOf(1));
header.put("third", Integer.valueOf(2));
- recordWithHeader = new CSVRecord(values, header, null, 0);
+ recordWithHeader = new CSVRecord(values, header, null, 0, -1);
}
@Test
--
http://people.apache.org/~britter/
http://www.systemoutprintln.de/
http://twitter.com/BenediktRitter
http://github.com/britter