This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-csv.git
The following commit(s) were added to refs/heads/master by this push: new ac280e7 [CSV-93] Allow the handling of NULL values. ac280e7 is described below commit ac280e705d99cc1168f93f7d9a5f92618bebbb97 Author: Gary Gregory <garydgreg...@gmail.com> AuthorDate: Thu Jul 8 12:36:23 2021 -0400 [CSV-93] Allow the handling of NULL values. - [CSV-253] Handle absent values in input (null). - Cleaned up version of PR 77 from dota17 where: - Don't duplicate two state items from the format. - Use try-with-resources. - Remove useless parens. - Update Javaodc. - Sort members in the new tests. - Use builder. --- .../java/org/apache/commons/csv/CSVParser.java | 37 ++++- src/main/java/org/apache/commons/csv/Lexer.java | 1 + .../java/org/apache/commons/csv/QuoteMode.java | 2 +- src/main/java/org/apache/commons/csv/Token.java | 3 + .../org/apache/commons/csv/CSVPrinterTest.java | 2 +- .../apache/commons/csv/issues/JiraCsv253Test.java | 54 ++++++++ .../apache/commons/csv/issues/JiraCsv93Test.java | 150 +++++++++++++++++++++ 7 files changed, 243 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 3224013..94bb361 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -446,8 +446,7 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable { if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { return; } - final String nullString = this.format.getNullString(); - this.recordList.add(inputClean.equals(nullString) ? null : inputClean); + this.recordList.add(handleNull(inputClean)); } /** @@ -636,7 +635,26 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable { } /** - * Gets whether this parser is closed. + * Handle whether input is parsed as null + * + * @param input + * the cell data to further processed + * @return null if input is parsed as null, or input itself if input isn't parsed as null + */ + private String handleNull(final String input) { + final boolean isQuoted = this.reusableToken.isQuoted; + final String nullString = format.getNullString(); + final boolean strictQuoteMode = isStrictQuoteMode(); + if (input.equals(nullString)) { + // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode + return strictQuoteMode && isQuoted ? input : null; + } + // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode + return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input; + } + + /** + * Tests whether this parser is closed. * * @return whether this parser is closed. */ @@ -645,7 +663,18 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable { } /** - * Returns an iterator on the records. + * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}. + * + * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or + * {@link QuoteMode#NON_NUMERIC}. + */ + private boolean isStrictQuoteMode() { + return this.format.getQuoteMode() == QuoteMode.ALL_NON_NULL || + this.format.getQuoteMode() == QuoteMode.NON_NUMERIC; + } + + /** + * Returns the record iterator. * * <p> * An {@link IOException} caught during the iteration are re-thrown as an diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index a0c75aa..1f14543 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -324,6 +324,7 @@ final class Lexer implements Closeable { * on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL */ private Token parseEncapsulatedToken(final Token token) throws IOException { + token.isQuoted = true; // save current line number in case needed for IOE final long startLineNumber = getCurrentLineNumber(); int c; diff --git a/src/main/java/org/apache/commons/csv/QuoteMode.java b/src/main/java/org/apache/commons/csv/QuoteMode.java index 272deb7..a9b33a1 100644 --- a/src/main/java/org/apache/commons/csv/QuoteMode.java +++ b/src/main/java/org/apache/commons/csv/QuoteMode.java @@ -17,7 +17,7 @@ package org.apache.commons.csv; /** - * Defines quoting behavior when printing. + * Defines quoting behavior. */ public enum QuoteMode { diff --git a/src/main/java/org/apache/commons/csv/Token.java b/src/main/java/org/apache/commons/csv/Token.java index dff7d01..2dedc58 100644 --- a/src/main/java/org/apache/commons/csv/Token.java +++ b/src/main/java/org/apache/commons/csv/Token.java @@ -55,10 +55,13 @@ final class Token { /** Token ready flag: indicates a valid token with content (ready for the parser). */ boolean isReady; + boolean isQuoted; + void reset() { content.setLength(0); type = INVALID; isReady = false; + isQuoted = false; } /** diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java index 780a502..6890bc2 100644 --- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java +++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java @@ -873,7 +873,7 @@ public class CSVPrinterTest { String expected = "\"NULL\"\tNULL\n"; assertEquals(expected, writer.toString()); String[] record0 = toFirstRecordValues(expected, format); - assertArrayEquals(new Object[2], record0); + assertArrayEquals(s, record0); s = new String[] { "\\N", null }; format = CSVFormat.MYSQL.withNullString("\\N"); diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv253Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv253Test.java new file mode 100644 index 0000000..9050731 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv253Test.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.csv.issues; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Iterator; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.csv.QuoteMode; +import org.junit.jupiter.api.Test; + +/** + * Setting QuoteMode:ALL_NON_NULL or NON_NUMERIC can distinguish between empty string columns and absent value columns. + */ +public class JiraCsv253Test { + + private void assertArrayEqual(final String[] expected, final CSVRecord actual) { + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], actual.get(i)); + } + } + + @Test + public void testHandleAbsentValues() throws IOException { + final String source = "\"John\",,\"Doe\"\n" + ",\"AA\",123\n" + "\"John\",90,\n" + "\"\",,90"; + final CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.NON_NUMERIC).build(); + try (final CSVParser parser = csvFormat.parse(new StringReader(source))) { + final Iterator<CSVRecord> csvRecords = parser.iterator(); + assertArrayEqual(new String[] {"John", null, "Doe"}, csvRecords.next()); + assertArrayEqual(new String[] {null, "AA", "123"}, csvRecords.next()); + assertArrayEqual(new String[] {"John", "90", null}, csvRecords.next()); + assertArrayEqual(new String[] {"", null, "90"}, csvRecords.next()); + } + } +} diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv93Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv93Test.java new file mode 100644 index 0000000..e122348 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv93Test.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.csv.issues; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.csv.QuoteMode; +import org.junit.jupiter.api.Test; + +/** + * Add more tests about null value. + * <p> + * QuoteMode:ALL_NON_NULL (Quotes all non-null fields, null will not be quoted but not null will be quoted). when + * withNullString("NULL"), NULL String value ("NULL") and null value (null) will be formatted as '"NULL",NULL'. So it + * also should be parsed as NULL String value and null value (["NULL", null]), It should be distinguish in parsing. And + * when don't set nullString in CSVFormat, String '"",' should be parsed as "" and null (["", null]) according to null + * will not be quoted but not null will be quoted. QuoteMode:NON_NUMERIC, same as ALL_NON_NULL. + * </p> + * <p> + * This can solve the problem of distinguishing between empty string columns and absent value columns which just like + * Jira CSV-253 to a certain extent. + * </p> + */ +public class JiraCsv93Test { + private static Object[] objects1 = {"abc", "", null, "a,b,c", 123}; + + private static Object[] objects2 = {"abc", "NULL", null, "a,b,c", 123}; + + private void every(final CSVFormat csvFormat, final Object[] objects, final String format, final String[] data) + throws IOException { + final String source = csvFormat.format(objects); + assertEquals(format, csvFormat.format(objects)); + try (final CSVParser csvParser = csvFormat.parse(new StringReader(source))) { + final CSVRecord csvRecord = csvParser.iterator().next(); + for (int i = 0; i < data.length; i++) { + assertEquals(csvRecord.get(i), data[i]); + } + } + } + + @Test + public void testWithNotSetNullString() throws IOException { + // @formatter:off + every(CSVFormat.DEFAULT, + objects1, + "abc,,,\"a,b,c\",123", + new String[]{"abc", "", "", "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.ALL).build(), + objects1, + "\"abc\",\"\",,\"a,b,c\",\"123\"", + new String[]{"abc", "", "", "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.ALL_NON_NULL).build(), + objects1, + "\"abc\",\"\",,\"a,b,c\",\"123\"", + new String[]{"abc", "", null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.MINIMAL).build(), + objects1, + "abc,,,\"a,b,c\",123", + new String[]{"abc", "", "", "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setEscape('?').setQuoteMode(QuoteMode.NONE).build(), + objects1, + "abc,,,a?,b?,c,123", + new String[]{"abc", "", "", "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.NON_NUMERIC).build(), + objects1, + "\"abc\",\"\",,\"a,b,c\",123", + new String[]{"abc", "", null, "a,b,c", "123"}); + // @formatter:on + } + + @Test + public void testWithSetNullStringEmptyString() throws IOException { + // @formatter:off + every(CSVFormat.DEFAULT.builder().setNullString("").build(), + objects1, + "abc,,,\"a,b,c\",123", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.ALL).build(), + objects1, + "\"abc\",\"\",\"\",\"a,b,c\",\"123\"", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.ALL_NON_NULL).build(), + objects1, + "\"abc\",\"\",,\"a,b,c\",\"123\"", + new String[]{"abc", "", null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.MINIMAL).build(), + objects1, + "abc,,,\"a,b,c\",123", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("").setEscape('?').setQuoteMode(QuoteMode.NONE).build(), + objects1, + "abc,,,a?,b?,c,123", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.NON_NUMERIC).build(), + objects1, + "\"abc\",\"\",,\"a,b,c\",123", + new String[]{"abc", "", null, "a,b,c", "123"}); + // @formatter:on + } + + @Test + public void testWithSetNullStringNULL() throws IOException { + // @formatter:off + every(CSVFormat.DEFAULT.builder().setNullString("NULL").build(), + objects2, + "abc,NULL,NULL,\"a,b,c\",123", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.ALL).build(), + objects2, + "\"abc\",\"NULL\",\"NULL\",\"a,b,c\",\"123\"", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.ALL_NON_NULL).build(), + objects2, + "\"abc\",\"NULL\",NULL,\"a,b,c\",\"123\"", + new String[]{"abc", "NULL", null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.MINIMAL).build(), + objects2, + "abc,NULL,NULL,\"a,b,c\",123", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("NULL").setEscape('?').setQuoteMode(QuoteMode.NONE).build(), + objects2, + "abc,NULL,NULL,a?,b?,c,123", + new String[]{"abc", null, null, "a,b,c", "123"}); + every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.NON_NUMERIC).build(), + objects2, + "\"abc\",\"NULL\",NULL,\"a,b,c\",123", + new String[]{"abc", "NULL", null, "a,b,c", "123"}); + // @formatter:on + } +} \ No newline at end of file