Repository: any23 Updated Branches: refs/heads/master 5a3ba9d1f -> 7a7db2006
ANY23-264 Upgrade to use public commons-csv instead of custom SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/7a7db200 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/7a7db200 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/7a7db200 Branch: refs/heads/master Commit: 7a7db20069da63bb1312dd12820c2f2ca301856f Parents: 5a3ba9d Author: Hans <[email protected]> Authored: Thu Feb 8 23:27:30 2018 -0600 Committer: Hans <[email protected]> Committed: Thu Feb 8 23:27:30 2018 -0600 ---------------------------------------------------------------------- .../any23/extractor/csv/CSVExtractor.java | 25 ++++++--- .../any23/extractor/csv/CSVReaderBuilder.java | 59 +++++++++----------- pom.xml | 2 +- 3 files changed, 43 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java b/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java index e72162b..298d930 100644 --- a/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java @@ -28,6 +28,7 @@ import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.CSV; import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; @@ -38,6 +39,7 @@ import org.eclipse.rdf4j.model.vocabulary.XMLSchema; import java.io.IOException; import java.io.InputStream; import java.util.StringTokenizer; +import java.util.Iterator; /** * This extractor produces <i>RDF</i> from a <i>CSV file</i> . @@ -77,17 +79,18 @@ public class CSVExtractor implements Extractor.ContentExtractor { // build the parser csvParser = CSVReaderBuilder.build(in); + Iterator<CSVRecord> rows = csvParser.iterator(); // get the header and generate the IRIs for column names - String[] header = csvParser.getLine(); + CSVRecord header = rows.hasNext() ? rows.next() : null; headerIRIs = processHeader(header, documentIRI); // write triples to describe properties writeHeaderPropertiesMetadata(header, out); - String[] nextLine; int index = 0; - while ((nextLine = csvParser.getLine()) != null) { + while (rows.hasNext()) { + CSVRecord nextLine = rows.next(); IRI rowSubject = RDFUtils.iri( documentIRI.toString(), "row/" + index @@ -151,17 +154,18 @@ public class CSVExtractor implements Extractor.ContentExtractor { * @param header * @param out */ - private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) { + private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) { int index = 0; for (IRI singleHeader : headerIRIs) { if (index > headerIRIs.length) { break; } - if (!RDFUtils.isAbsoluteIRI(header[index])) { + String headerString = header.get(index); + if (!RDFUtils.isAbsoluteIRI(headerString)) { out.writeTriple( singleHeader, RDFS.LABEL, - SimpleValueFactory.getInstance().createLiteral(header[index]) + SimpleValueFactory.getInstance().createLiteral(headerString) ); } out.writeTriple( @@ -181,8 +185,11 @@ public class CSVExtractor implements Extractor.ContentExtractor { * @param header * @return an array of {@link IRI}s identifying the column names. */ - private IRI[] processHeader(String[] header, IRI documentIRI) { - IRI[] result = new IRI[header.length]; + private IRI[] processHeader(CSVRecord header, IRI documentIRI) { + if (header == null) + return new IRI[0]; + + IRI[] result = new IRI[header.size()]; int index = 0; for (String h : header) { String candidate = h.trim(); @@ -222,7 +229,7 @@ public class CSVExtractor implements Extractor.ContentExtractor { */ private void produceRowStatements( IRI rowSubject, - String[] values, + CSVRecord values, ExtractionResult out ) { int index = 0; http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java ---------------------------------------------------------------------- diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java index 75bb583..87d764d 100644 --- a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java +++ b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java @@ -19,11 +19,13 @@ package org.apache.any23.extractor.csv; import org.apache.any23.configuration.DefaultConfiguration; import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVStrategy; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVRecord; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.util.Iterator; /** * This class is responsible to build a reader first guessing the configuration @@ -38,21 +40,19 @@ public class CSVReaderBuilder { private static final String DEFAULT_COMMENT_DELIMITER = "#"; - public static final char NULL_CHAR = ' '; - private static final char[] popularDelimiters = {'\t', '|', ',', ';'}; private static DefaultConfiguration defaultConfiguration = DefaultConfiguration.singleton(); - private static final CSVStrategy[] strategies; + private static final CSVFormat[] strategies; static { - strategies = new CSVStrategy[ popularDelimiters.length + 1 ]; - strategies[0] = CSVStrategy.DEFAULT_STRATEGY; + strategies = new CSVFormat[popularDelimiters.length + 1]; + strategies[0] = CSVFormat.DEFAULT; int index = 1; - for(char dlmt : popularDelimiters) { - strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR); + for (char dlmt : popularDelimiters) { + strategies[index++] = CSVFormat.DEFAULT.withDelimiter(dlmt); } } @@ -65,9 +65,10 @@ public class CSVReaderBuilder { * @throws java.io.IOException */ public static CSVParser build(InputStream is) throws IOException { - CSVStrategy bestStrategy = getBestStrategy(is); - if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration(); - return new CSVParser( new InputStreamReader(is), bestStrategy ); + CSVFormat bestStrategy = getBestStrategy(is); + if (bestStrategy == null) + bestStrategy = getCSVStrategyFromConfiguration(); + return new CSVParser(new InputStreamReader(is), bestStrategy); } /** @@ -82,20 +83,16 @@ public class CSVReaderBuilder { return getBestStrategy(is) != null; } - private static CSVStrategy getBestStrategy(InputStream is) throws IOException { - for( CSVStrategy strategy : strategies ) { - if( testStrategy(is, strategy) ) { + private static CSVFormat getBestStrategy(InputStream is) throws IOException { + for (CSVFormat strategy : strategies) { + if (testStrategy(is, strategy)) { return strategy; } } return null; } - private static CSVStrategy getCsvStrategy(char delimiter, char comment) { - return new CSVStrategy(delimiter, '\'', comment); - } - - private static CSVStrategy getCSVStrategyFromConfiguration() { + private static CSVFormat getCSVStrategyFromConfiguration() { char fieldDelimiter = getCharValueFromConfiguration( "any23.extraction.csv.field", DEFAULT_FIELD_DELIMITER @@ -104,7 +101,7 @@ public class CSVReaderBuilder { "any23.extraction.csv.comment", DEFAULT_COMMENT_DELIMITER ); - return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter); + return CSVFormat.DEFAULT.withDelimiter(fieldDelimiter).withCommentMarker(commentDelimiter); } private static char getCharValueFromConfiguration(String property, String defaultValue) { @@ -112,7 +109,7 @@ public class CSVReaderBuilder { property, defaultValue ); - if (delimiter.length() != 1 || delimiter.equals("")) { + if (delimiter.length() != 1) { throw new RuntimeException(property + " value must be a single character"); } return delimiter.charAt(0); @@ -128,29 +125,25 @@ public class CSVReaderBuilder { * @throws IOException * @param is */ - private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException { + private static boolean testStrategy(InputStream is, CSVFormat strategy) throws IOException { final int MIN_COLUMNS = 2; is.mark(Integer.MAX_VALUE); try { - final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy); + final Iterator<CSVRecord> rows = new CSVParser(new InputStreamReader(is), strategy).iterator(); int linesToCheck = 5; int headerColumnCount = -1; - while (linesToCheck > 0) { - String[] row; - row = parser.getLine(); - if (row == null) { - break; - } - if (row.length < MIN_COLUMNS) { + while (linesToCheck > 0 && rows.hasNext()) { + int rowLength = rows.next().size(); + if (rowLength < MIN_COLUMNS) { return false; } if (headerColumnCount == -1) { // first row - headerColumnCount = row.length; + headerColumnCount = rowLength; } else { // make sure rows have the same number of columns or one more than the header - if (row.length < headerColumnCount) { + if (rowLength < headerColumnCount) { return false; - } else if (row.length - 1 > headerColumnCount) { + } else if (rowLength - 1 > headerColumnCount) { return false; } } http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 4455cd1..14f5ee2 100644 --- a/pom.xml +++ b/pom.xml @@ -516,7 +516,7 @@ <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-csv</artifactId> - <version>1.0-SNAPSHOT-rev1148315</version> + <version>1.5</version> </dependency> <dependency> <groupId>commons-io</groupId>
