METAMODEL-1141: Fixed. Closes #145
(cherry picked from commit 018c9d1) Project: http://git-wip-us.apache.org/repos/asf/metamodel/repo Commit: http://git-wip-us.apache.org/repos/asf/metamodel/commit/fd6eef36 Tree: http://git-wip-us.apache.org/repos/asf/metamodel/tree/fd6eef36 Diff: http://git-wip-us.apache.org/repos/asf/metamodel/diff/fd6eef36 Branch: refs/heads/4.6.x Commit: fd6eef36da168574edad0bf6236006a7a7017bc0 Parents: 6b91fa6 Author: Dennis Du Krøger <d...@hp23c.dk> Authored: Thu Jun 8 05:33:48 2017 +0200 Committer: Dennis Du Krøger <l...@apache.org> Committed: Thu Aug 24 21:56:16 2017 +0200 ---------------------------------------------------------------------- csv/pom.xml | 4 +- .../apache/metamodel/csv/CsvDataContext.java | 31 +++++---- .../org/apache/metamodel/csv/CsvDataSet.java | 2 +- .../java/org/apache/metamodel/csv/CsvTable.java | 2 +- .../metamodel/csv/SingleLineCsvDataSet.java | 8 +-- .../apache/metamodel/csv/SingleLineCsvRow.java | 9 +-- .../metamodel/csv/DoubleQuoteEscapeTest.java | 69 ++++++++++++++++++++ .../metamodel/csv/SingleLineCsvDataSetTest.java | 6 +- .../metamodel/csv/SingleLineCsvRowTest.java | 2 +- .../test/resources/csv_doublequoteescape.csv | 10 +++ csv/src/test/resources/csv_weirdquotes.csv | 10 +++ csv/src/test/resources/tickets.csv | 2 +- 12 files changed, 125 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/pom.xml ---------------------------------------------------------------------- diff --git a/csv/pom.xml b/csv/pom.xml index 99945b1..31493a0 100644 --- a/csv/pom.xml +++ b/csv/pom.xml @@ -38,9 +38,9 @@ under the License. 
<scope>test</scope> </dependency> <dependency> - <groupId>net.sf.opencsv</groupId> + <groupId>com.opencsv</groupId> <artifactId>opencsv</artifactId> - <version>2.1</version> + <version>3.9</version> </dependency> <dependency> <groupId>junit</groupId> http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java ---------------------------------------------------------------------- diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java index ee456f0..c8df5a7 100644 --- a/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java +++ b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java @@ -48,8 +48,10 @@ import org.apache.metamodel.util.UrlResource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import au.com.bytecode.opencsv.CSVParser; -import au.com.bytecode.opencsv.CSVReader; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.ICSVParser; +import com.opencsv.RFC4180ParserBuilder; /** * DataContext implementation for reading CSV files. 
@@ -378,23 +380,30 @@ public final class CsvDataContext extends QueryPostprocessDataContext implements return new CsvDataSet(csvReader, columns, maxRowsOrNull, columnCount, failOnInconsistentRowLength); } - final CSVParser csvParser = new CSVParser(_configuration.getSeparatorChar(), _configuration.getQuoteChar(), - _configuration.getEscapeChar()); - return new SingleLineCsvDataSet(reader, csvParser, columns, maxRowsOrNull, columnCount, + return new SingleLineCsvDataSet(reader, createParser(), columns, maxRowsOrNull, columnCount, failOnInconsistentRowLength); } + private ICSVParser createParser() { + final ICSVParser parser; + if (_configuration.getEscapeChar() == _configuration.getQuoteChar()) { + parser = new RFC4180ParserBuilder().withSeparator(_configuration.getSeparatorChar()) + .withQuoteChar(_configuration.getQuoteChar()).build(); + } else { + parser = new CSVParserBuilder().withSeparator(_configuration.getSeparatorChar()) + .withQuoteChar(_configuration.getQuoteChar()).withEscapeChar(_configuration.getEscapeChar()) + .build(); + } + return parser; + } + protected CSVReader createCsvReader(int skipLines) { final Reader reader = FileHelper.getReader(_resource.read(), _configuration.getEncoding()); - final CSVReader csvReader = new CSVReader(reader, _configuration.getSeparatorChar(), - _configuration.getQuoteChar(), _configuration.getEscapeChar(), skipLines); - return csvReader; + return new CSVReader(reader, skipLines, createParser()); } protected CSVReader createCsvReader(BufferedReader reader) { - final CSVReader csvReader = new CSVReader(reader, _configuration.getSeparatorChar(), - _configuration.getQuoteChar(), _configuration.getEscapeChar()); - return csvReader; + return new CSVReader(reader, CSVReader.DEFAULT_SKIP_LINES, createParser()); } @Override http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java ---------------------------------------------------------------------- diff --git 
a/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java index 2edacb1..b93db3e 100644 --- a/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java +++ b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java @@ -27,7 +27,7 @@ import org.apache.metamodel.data.Row; import org.apache.metamodel.schema.Column; import org.apache.metamodel.util.FileHelper; -import au.com.bytecode.opencsv.CSVReader; +import com.opencsv.CSVReader; /** * Streaming DataSet implementation for CSV support http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/main/java/org/apache/metamodel/csv/CsvTable.java ---------------------------------------------------------------------- diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvTable.java b/csv/src/main/java/org/apache/metamodel/csv/CsvTable.java index 334af7e..0887eee 100644 --- a/csv/src/main/java/org/apache/metamodel/csv/CsvTable.java +++ b/csv/src/main/java/org/apache/metamodel/csv/CsvTable.java @@ -32,7 +32,7 @@ import org.apache.metamodel.schema.naming.ColumnNamingSession; import org.apache.metamodel.schema.naming.ColumnNamingStrategy; import org.apache.metamodel.util.FileHelper; -import au.com.bytecode.opencsv.CSVReader; +import com.opencsv.CSVReader; final class CsvTable extends AbstractTable { http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java ---------------------------------------------------------------------- diff --git a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java index de6e7eb..250c249 100644 --- a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java +++ b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java @@ -28,7 +28,7 @@ import org.apache.metamodel.data.Row; import org.apache.metamodel.schema.Column; import 
org.apache.metamodel.util.FileHelper; -import au.com.bytecode.opencsv.CSVParser; +import com.opencsv.ICSVParser; /** * A specialized DataSet implementation for the CSV module under circumstances @@ -38,7 +38,7 @@ import au.com.bytecode.opencsv.CSVParser; final class SingleLineCsvDataSet extends AbstractDataSet { private final BufferedReader _reader; - private final CSVParser _csvParser; + private final ICSVParser _csvParser; private final int _columnsInTable; private final boolean _failOnInconsistentRowLength; @@ -46,7 +46,7 @@ final class SingleLineCsvDataSet extends AbstractDataSet { private volatile Integer _rowsRemaining; private volatile Row _row; - public SingleLineCsvDataSet(BufferedReader reader, CSVParser csvParser, Column[] columns, Integer maxRows, + public SingleLineCsvDataSet(BufferedReader reader, ICSVParser csvParser, Column[] columns, Integer maxRows, int columnsInTable, boolean failOnInconsistentRowLength) { super(columns); _reader = reader; @@ -91,7 +91,7 @@ final class SingleLineCsvDataSet extends AbstractDataSet { return _columnsInTable; } - protected CSVParser getCsvParser() { + protected ICSVParser getCsvParser() { return _csvParser; } http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java ---------------------------------------------------------------------- diff --git a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java index 439dc91..fe9fda5 100644 --- a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java +++ b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -29,8 +29,6 @@ import org.apache.metamodel.schema.Column; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import au.com.bytecode.opencsv.CSVParser; - /** * Specialized row implementation for single-line CSV values */ @@ -41,7 +39,7 @@ final class SingleLineCsvRow extends AbstractRow { private static final Logger logger = LoggerFactory.getLogger(SingleLineCsvRow.class); private final transient SingleLineCsvDataSet _dataSet; - + private final String _line; private final int _columnsInTable; private final boolean _failOnInconsistentRowLength; @@ -101,8 +99,7 @@ final class SingleLineCsvRow extends AbstractRow { private String[] parseLine() { try { - final CSVParser parser = _dataSet.getCsvParser(); - return parser.parseLine(_line); + return _dataSet.getCsvParser().parseLine(_line); } catch (IOException e) { if (_failOnInconsistentRowLength) { throw new MetaModelException("Failed to parse CSV line no. " + _rowNumber + ": " + _line, e); http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/test/java/org/apache/metamodel/csv/DoubleQuoteEscapeTest.java ---------------------------------------------------------------------- diff --git a/csv/src/test/java/org/apache/metamodel/csv/DoubleQuoteEscapeTest.java b/csv/src/test/java/org/apache/metamodel/csv/DoubleQuoteEscapeTest.java new file mode 100644 index 0000000..72194e3 --- /dev/null +++ b/csv/src/test/java/org/apache/metamodel/csv/DoubleQuoteEscapeTest.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE + * file distributed with this work for additional information regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package org.apache.metamodel.csv; + +import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.assertEquals; + +import java.io.File; + +import org.apache.metamodel.data.DataSet; +import org.junit.Test; + +public class DoubleQuoteEscapeTest { + + @Test + public void testDoubleQuoteEscape() throws Exception { + CsvConfiguration configuration = new CsvConfiguration(1, "UTF-8", ',', '"', '"'); + CsvDataContext dc = new CsvDataContext(new File("src/test/resources/csv_doublequoteescape.csv"), configuration); + + DataSet dataSet = dc.query().from("csv_doublequoteescape.csv").select("age", "name").execute(); + + assertTrue(dataSet.next()); + assertEquals("Row[values=[18, mi\"ke]]", dataSet.getRow().toString()); + + assertEquals("mi\"ke", dataSet.getRow().getValue(dc.getColumnByQualifiedLabel("name"))); + + assertTrue(dataSet.next()); + assertEquals("Row[values=[19, mic\"hael]]", dataSet.getRow().toString()); + assertTrue(dataSet.next()); + assertEquals("Row[values=[18, pet\"er]]", dataSet.getRow().toString()); + assertTrue(dataSet.next()); + assertEquals("Row[values=[18, barbar\"a, \"\"barb]]", dataSet.getRow().toString()); + + dataSet.close(); + } + + @Test + public void testWeirdQuotes() throws Exception { + CsvConfiguration configuration = new CsvConfiguration(1, "UTF-8", ',', '\\', '\\'); + CsvDataContext dc = new CsvDataContext(new 
File("src/test/resources/csv_weirdquotes.csv"), configuration); + + DataSet dataSet = dc.query().from("csv_weirdquotes.csv").select("age", "name").execute(); + + assertTrue(dataSet.next()); + assertEquals("Row[values=[18, mi\\ke]]", dataSet.getRow().toString()); + + assertEquals("mi\\ke", dataSet.getRow().getValue(dc.getColumnByQualifiedLabel("name"))); + + assertTrue(dataSet.next()); + assertEquals("Row[values=[19, mic\\hael]]", dataSet.getRow().toString()); + assertTrue(dataSet.next()); + assertEquals("Row[values=[18, pet\\er]]", dataSet.getRow().toString()); + assertTrue(dataSet.next()); + assertEquals("Row[values=[18, barbar\\a, \\\\barb]]", dataSet.getRow().toString()); + + dataSet.close(); + } + +} http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvDataSetTest.java ---------------------------------------------------------------------- diff --git a/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvDataSetTest.java b/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvDataSetTest.java index 1489016..2de30d4 100644 --- a/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvDataSetTest.java +++ b/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvDataSetTest.java @@ -21,11 +21,11 @@ package org.apache.metamodel.csv; import java.io.File; import java.util.Arrays; -import junit.framework.TestCase; - import org.apache.metamodel.data.DataSet; import org.apache.metamodel.schema.Table; +import junit.framework.TestCase; + public class SingleLineCsvDataSetTest extends TestCase { public void testGetValueInNonPhysicalOrder() throws Exception { @@ -64,4 +64,4 @@ public class SingleLineCsvDataSetTest extends TestCase { assertFalse(ds.next()); ds.close(); } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvRowTest.java ---------------------------------------------------------------------- 
diff --git a/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvRowTest.java b/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvRowTest.java index bed494e..a01821c 100644 --- a/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvRowTest.java +++ b/csv/src/test/java/org/apache/metamodel/csv/SingleLineCsvRowTest.java @@ -29,7 +29,7 @@ import org.apache.metamodel.schema.MutableColumn; import org.junit.Assert; import org.junit.Test; -import au.com.bytecode.opencsv.CSVParser; +import com.opencsv.CSVParser; public class SingleLineCsvRowTest { http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/test/resources/csv_doublequoteescape.csv ---------------------------------------------------------------------- diff --git a/csv/src/test/resources/csv_doublequoteescape.csv b/csv/src/test/resources/csv_doublequoteescape.csv new file mode 100644 index 0000000..b55e870 --- /dev/null +++ b/csv/src/test/resources/csv_doublequoteescape.csv @@ -0,0 +1,10 @@ +id,name,gender,age +1,"mi""ke",male,18 +2,"mic""hael",male,19 +3,"pet""er",male,18 +5,"barbar""a, """"barb",female,18 +4,"bob",male,17 +6,"cha""rlotte",female,18 +7,"hill""ary",female,20 +8,"ver""a",female,17 +9,"car""rie",female,17 http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/test/resources/csv_weirdquotes.csv ---------------------------------------------------------------------- diff --git a/csv/src/test/resources/csv_weirdquotes.csv b/csv/src/test/resources/csv_weirdquotes.csv new file mode 100644 index 0000000..8853379 --- /dev/null +++ b/csv/src/test/resources/csv_weirdquotes.csv @@ -0,0 +1,10 @@ +id,name,gender,age +1,\mi\\ke\,male,18 +2,\mic\\hael\,male,19 +3,\pet\\er\,male,18 +5,\barbar\\a, \\\\barb\,female,18 +4,\bob\,male,17 +6,\cha\\rlotte\,female,18 +7,\hill\\ary\,female,20 +8,\ver\\a\,female,17 +9,\car\\rie\,female,17 http://git-wip-us.apache.org/repos/asf/metamodel/blob/fd6eef36/csv/src/test/resources/tickets.csv 
---------------------------------------------------------------------- diff --git a/csv/src/test/resources/tickets.csv b/csv/src/test/resources/tickets.csv index 3ca2398..b61dcd1 100644 --- a/csv/src/test/resources/tickets.csv +++ b/csv/src/test/resources/tickets.csv @@ -10,7 +10,7 @@ One way of improving this could be through caching. Another way could be through 3,DataCleaner 1.5 Release,202,Pattern Finder improvements,DataCleaner-core,None,enhancement,darrenH,assigned,2008-08-19T04:27:12Z+0200,2008-09-16T09:21:56Z+0200,"__Pattern Finder suggestions__: * have an option to ignore repeating spaces (so {{{\"aaa aaaaa\" and \"aaa aaaaa\"}}} are counted as one pattern. * have an option to ignore case, and a different option to preserve case. - * have an option to treat all 'special' characters as one pattern (so \"aaa*\", \"aaa/\", \"aaa\\" etc' are counted as one pattern, maybe denoted \"aaaS\"). + * have an option to treat all 'special' characters as one pattern (so \"aaa*\", \"aaa/\", \"aaa\" etc' are counted as one pattern, maybe denoted \"aaaS\"). * have an option to treat all groups of characters as a single sub-pattern. The idea is to be able to distinguish easily between names (say) that have two words and names that have more. This option should have a single pattern for \"Kasper Sorensen\" and \"George Bush\" (the pattern should ideally be \"An An\", denoting any numbers of alpha, space, any number of alpha). But \"Gorge W Bush\" will have the pattern \"An A An\". * it should be possible to combine the above options",BenBor 3,DataCleaner 1.5 Release,217,Commandline execution of .dcv and .dcp files,DataCleaner-core,None,enhancement,,new,2008-09-04T08:46:56Z+0200,2008-10-19T23:36:38Z+0200,"We should make a command line version of DataCleaner which could take in a .dcp or .dcv file, execute it and save the results in a file or a database (#117). 
This should ideally be done in DataCleaner core as it would be straight forward to reuse it in DataCleaner-webmonitor in the future then.",kasper