Repository: any23 Updated Branches: refs/heads/master 5bc7e46a8 -> b0baa9407
Fix ANY23-308 - validate yaml file - rename csvutils -> utils - bring all utility class into util module - update README Signed-off-by: Jacek Grzebyta <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ae036a7a Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ae036a7a Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ae036a7a Branch: refs/heads/master Commit: ae036a7af2a8c5a5572b6e17832f69bd8f4b4ba4 Parents: bd69aef Author: Jacek Grzebyta <[email protected]> Authored: Tue Jul 11 11:57:16 2017 +0100 Committer: Jacek Grzebyta <[email protected]> Committed: Tue Jul 11 11:57:16 2017 +0100 ---------------------------------------------------------------------- README.md | 2 +- cli/pom.xml | 2 +- .../org/apache/any23/cli/YAMLRoverTest.java | 76 +++++++++ core/pom.xml | 7 +- .../any23/extractor/yaml/YAMLExtractor.java | 7 +- .../any23/extractor/yaml/YAMLExtractorTest.java | 14 +- .../extractor/yaml/YAMLTikaParserTest.java | 48 ++++++ csvutils/pom.xml | 106 ------------ .../any23/extractor/csv/CSVReaderBuilder.java | 166 ------------------- csvutils/src/test/resources/log4j.properties | 34 ---- mime/pom.xml | 2 +- .../apache/any23/mime/TikaMIMETypeDetector.java | 17 +- pom.xml | 7 +- utils/pom.xml | 123 ++++++++++++++ .../any23/extractor/csv/CSVReaderBuilder.java | 166 +++++++++++++++++++ .../any23/extractor/yaml/YAMLValidator.java | 105 ++++++++++++ .../any23/yaml/utils/YAMLValidatorTest.java | 66 ++++++++ utils/src/test/resources/log4j.properties | 35 ++++ 18 files changed, 659 insertions(+), 324 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/README.md ---------------------------------------------------------------------- diff --git a/README.md b/README.md index 9db7126..6c52061 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Any23 documentation can be found on the 
[website](http://any23.apache.org) * [api](https://github.com/lewismc/any23/tree/master/api): Any23 library external API. * [core](https://github.com/lewismc/any23/tree/master/core): The library core codebase. - * [csvutils](https://github.com/lewismc/any23/tree/master/csvutils): A CSV specific package + * [utils](https://github.com/lewismc/any23/tree/master/utils): An utilities package * [encoding](https://github.com/lewismc/any23/tree/master/encoding): Encoding detection library. * [mime](https://github.com/lewismc/any23/tree/master/mime): MIME Type detection library. * [nquads](https://github.com/lewismc/any23/tree/master/nquads): NQuads parsing and serialization library. http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/cli/pom.xml ---------------------------------------------------------------------- diff --git a/cli/pom.xml b/cli/pom.xml index 5acedfb..47b9c06 100644 --- a/cli/pom.xml +++ b/cli/pom.xml @@ -50,7 +50,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>apache-any23-csvutils</artifactId> + <artifactId>apache-any23-utils</artifactId> <version>${project.version}</version> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java new file mode 100644 index 0000000..17e8916 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java @@ -0,0 +1,76 @@ +/* + * Copyright 2017 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.cli; + +import com.google.common.io.Files; +import java.io.File; +import java.io.IOException; +import org.apache.pdfbox.util.Charsets; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Unit test for issue ANY23-308 + * + * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com) + */ +public class YAMLRoverTest extends ToolTestBase { + + private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml"; + + private static final String baseUri = "urn:test"; + + private final Logger log = LoggerFactory.getLogger(getClass()); + + public YAMLRoverTest() { + super(Rover.class); + } + + @Test + public void simpleTest() + throws Exception { + File outputFile = File.createTempFile("rover-test", ".ttl", tempDirectory); + File logfile = File.createTempFile("test-log", ".txt", tempDirectory); + + int exitCode = runTool(String.format("-l %s -o %s -f turtle -e yaml,csv -d %s %s", + logfile.getAbsolutePath(), + outputFile.getAbsolutePath(), + baseUri, + copyResourceToTempFile(file1).getAbsolutePath())); + + Assert.assertTrue(logfile.exists()); + log.debug("Log file location: {}", logfile.getAbsolutePath()); + log.info("Log file content: \n{}\n", Files.toString(logfile, Charsets.UTF_8)); + + Assert.assertEquals("Unexpected exit code.", 0, exitCode); + assertFileContainsString(outputFile, baseUri); + } + + /** + * + * @param f + * @param s Expected string in the file + * @return + */ + public void assertFileContainsString(File f, String s) throws 
IOException { + String fileContent = Files.toString(f, Charsets.UTF_8); + log.trace("File content: \n{}\n", fileContent); + Assert.assertTrue(fileContent.contains(s)); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index f03c672..c410799 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -38,7 +38,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>apache-any23-csvutils</artifactId> + <artifactId>apache-any23-utils</artifactId> <version>${project.version}</version> </dependency> <dependency> @@ -78,11 +78,6 @@ <groupId>com.beust</groupId> <artifactId>jcommander</artifactId> </dependency> - <dependency> - <groupId>org.yaml</groupId> - <artifactId>snakeyaml</artifactId> - <version>1.17</version> - </dependency> <!-- BEGIN: Tika --> <dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java index 64548f1..5c73082 100644 --- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java @@ -17,8 +17,6 @@ package org.apache.any23.extractor.yaml; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -31,7 +29,6 @@ import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.util.StringUtils; import org.apache.any23.vocab.YAML; -import org.apache.commons.lang.WordUtils; import org.eclipse.rdf4j.model.Resource; import 
org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; @@ -54,7 +51,7 @@ public class YAMLExtractor implements Extractor.ContentExtractor { private int nodeId = 0; - private IRI documentRoot; + private Resource documentRoot; @Override public void setStopAtFirstError(boolean f) { @@ -65,7 +62,7 @@ public class YAMLExtractor implements Extractor.ContentExtractor { ExtractionResult out) throws IOException, ExtractionException { IRI documentURI = context.getDocumentIRI(); - documentRoot = RDFUtils.uri(documentURI.toString() + "root"); + documentRoot = makeUri("root", documentURI, false); log.debug("process: {}", documentURI.toString()); out.writeNamespace(vocab.PREFIX, vocab.NS); http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java index 0cf8d14..b265c5f 100644 --- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java @@ -27,7 +27,6 @@ import org.eclipse.rdf4j.model.Statement; import org.eclipse.rdf4j.model.vocabulary.RDF; import org.eclipse.rdf4j.model.vocabulary.RDFS; import org.eclipse.rdf4j.repository.RepositoryResult; -import org.semarglproject.vocab.XSD; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,4 +92,17 @@ public class YAMLExtractorTest extends AbstractExtractorTestCase { RepositoryResult<Statement> docs = getStatements(null, null, RDF.NIL); Assert.assertTrue(Iterations.asList(docs).size() == 2); } + + /** + * Comma separated values are parsed as well. 
+ * + * @throws Exception + */ + @Test + public void csvTest() + throws Exception { + assertExtract("/org/apache/any23/extractor/csv/test-comma.csv"); + log.debug(dumpModelToTurtle()); + assertModelNotEmpty(); + } } http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java new file mode 100644 index 0000000..4727c84 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java @@ -0,0 +1,48 @@ +/* + * Copyright 2017 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.extractor.yaml; + +import java.io.InputStream; +import org.apache.any23.mime.MIMEType; +import org.apache.any23.mime.TikaMIMETypeDetector; +import org.apache.any23.mime.purifier.WhiteSpacesPurifier; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author jacek + */ +public class YAMLTikaParserTest { + + private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml"; + + private final Logger log = LoggerFactory.getLogger(getClass()); + + @Test + public void tikaDetect() + throws Exception { + InputStream is = YAMLTikaParserTest.class.getResourceAsStream(file1); + TikaMIMETypeDetector detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier()); + MIMEType type = detector.guessMIMEType(null, is, null); + + log.info("Type: {}", type.toString()); + + Assert.assertEquals("text/x-yaml", type.toString()); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/pom.xml ---------------------------------------------------------------------- diff --git a/csvutils/pom.xml b/csvutils/pom.xml deleted file mode 100644 index 8f5b18d..0000000 --- a/csvutils/pom.xml +++ /dev/null @@ -1,106 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. ---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <artifactId>apache-any23</artifactId> - <groupId>org.apache.any23</groupId> - <version>2.1-SNAPSHOT</version> - <relativePath>..</relativePath> - </parent> - - <artifactId>apache-any23-csvutils</artifactId> - - <name>Apache Any23 :: CSV Utilities</name> - <description>CSV specific library.</description> - - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>apache-any23-api</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-csv</artifactId> - </dependency> - <!-- Logging --> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> - <version>${slf4j.logger.version}</version> - <scope>test</scope> - </dependency> - </dependencies> - - <build> - <resources> - <resource> - <directory>${basedir}/../</directory> - <targetPath>META-INF</targetPath> - <includes> - <include>LICENSE.txt</include> - <include>NOTICE.txt</include> - </includes> - </resource> - </resources> - <pluginManagement> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-assembly-plugin</artifactId> - <version>${maven-assembly-plugin.version}</version> - <executions> - <execution> - <id>assembly</id> - <phase>package</phase> - <goals> - <goal>single</goal> - </goals> - </execution> - </executions> - <configuration> - <attach>true</attach> - <skipAssembly>true</skipAssembly> - <tarLongFileMode>gnu</tarLongFileMode> - </configuration> - </plugin> - </plugins> - </pluginManagement> - </build> - - <profiles> - <profile> - <id>release</id> - <build> - 
<resources> - <resource> - <directory>${basedir}/../</directory> - <targetPath>${project.build.directory}/apidocs/META-INF</targetPath> - <includes> - <include>LICENSE.txt</include> - <include>NOTICE.txt</include> - </includes> - </resource> - </resources> - </build> - </profile> - </profiles> - -</project> http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java ---------------------------------------------------------------------- diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java deleted file mode 100644 index 75bb583..0000000 --- a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.any23.extractor.csv; - -import org.apache.any23.configuration.DefaultConfiguration; -import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVStrategy; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; - -/** - * This class is responsible to build a reader first guessing the configuration - * from the file it self and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}. - * - * @author Davide Palmisano ( [email protected] ) - * @author Michele Mostarda ( [email protected] ) - */ -public class CSVReaderBuilder { - - private static final String DEFAULT_FIELD_DELIMITER = ","; - - private static final String DEFAULT_COMMENT_DELIMITER = "#"; - - public static final char NULL_CHAR = ' '; - - private static final char[] popularDelimiters = {'\t', '|', ',', ';'}; - - private static DefaultConfiguration defaultConfiguration = - DefaultConfiguration.singleton(); - - private static final CSVStrategy[] strategies; - - static { - strategies = new CSVStrategy[ popularDelimiters.length + 1 ]; - strategies[0] = CSVStrategy.DEFAULT_STRATEGY; - int index = 1; - for(char dlmt : popularDelimiters) { - strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR); - } - } - - /** - * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing - * from the provided <i>CSV</i> file. - * - * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration. - * @return a {@link CSVParser} - * @throws java.io.IOException - */ - public static CSVParser build(InputStream is) throws IOException { - CSVStrategy bestStrategy = getBestStrategy(is); - if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration(); - return new CSVParser( new InputStreamReader(is), bestStrategy ); - } - - /** - * Checks whether the given input stream is a CSV or not. - * - * @param is input stream to be verified. 
- * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content. - * <code>false</code> otherwise. - * @throws IOException - */ - public static boolean isCSV(InputStream is) throws IOException { - return getBestStrategy(is) != null; - } - - private static CSVStrategy getBestStrategy(InputStream is) throws IOException { - for( CSVStrategy strategy : strategies ) { - if( testStrategy(is, strategy) ) { - return strategy; - } - } - return null; - } - - private static CSVStrategy getCsvStrategy(char delimiter, char comment) { - return new CSVStrategy(delimiter, '\'', comment); - } - - private static CSVStrategy getCSVStrategyFromConfiguration() { - char fieldDelimiter = getCharValueFromConfiguration( - "any23.extraction.csv.field", - DEFAULT_FIELD_DELIMITER - ); - char commentDelimiter = getCharValueFromConfiguration( - "any23.extraction.csv.comment", - DEFAULT_COMMENT_DELIMITER - ); - return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter); - } - - private static char getCharValueFromConfiguration(String property, String defaultValue) { - String delimiter = defaultConfiguration.getProperty( - property, - defaultValue - ); - if (delimiter.length() != 1 || delimiter.equals("")) { - throw new RuntimeException(property + " value must be a single character"); - } - return delimiter.charAt(0); - } - - /** - * make sure the reader has correct delimiter and quotation set. - * Check first lines and make sure they have the same amount of columns and at least 2 - * - * @param is input stream to be checked - * @param strategy strategy to be verified. 
- * @return - * @throws IOException - * @param is - */ - private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException { - final int MIN_COLUMNS = 2; - - is.mark(Integer.MAX_VALUE); - try { - final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy); - int linesToCheck = 5; - int headerColumnCount = -1; - while (linesToCheck > 0) { - String[] row; - row = parser.getLine(); - if (row == null) { - break; - } - if (row.length < MIN_COLUMNS) { - return false; - } - if (headerColumnCount == -1) { // first row - headerColumnCount = row.length; - } else { // make sure rows have the same number of columns or one more than the header - if (row.length < headerColumnCount) { - return false; - } else if (row.length - 1 > headerColumnCount) { - return false; - } - } - linesToCheck--; - } - return true; - } finally { - is.reset(); - } - } - - -} http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/src/test/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/csvutils/src/test/resources/log4j.properties b/csvutils/src/test/resources/log4j.properties deleted file mode 100644 index a7ad0af..0000000 --- a/csvutils/src/test/resources/log4j.properties +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -log4j.rootCategory=INFO, R, O - -# Stdout -log4j.appender.O=org.apache.log4j.ConsoleAppender - -# File -#log4j.appender.R=org.apache.log4j.RollingFileAppender -#log4j.appender.R.File=log4j.log - -# Control the maximum log file size -#log4j.appender.R.MaxFileSize=100KB - -# Archive log files (one backup file here) -log4j.appender.R.MaxBackupIndex=1 - -log4j.appender.R.layout=org.apache.log4j.PatternLayout -log4j.appender.O.layout=org.apache.log4j.PatternLayout - -log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n -log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/mime/pom.xml ---------------------------------------------------------------------- diff --git a/mime/pom.xml b/mime/pom.xml index 9db7d3b..2014758 100644 --- a/mime/pom.xml +++ b/mime/pom.xml @@ -38,7 +38,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>apache-any23-csvutils</artifactId> + <artifactId>apache-any23-utils</artifactId> <version>${project.version}</version> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java ---------------------------------------------------------------------- diff --git a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java index e0584a1..77955cb 100644 --- a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java +++ b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java @@ -36,6 +36,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.regex.Pattern; +import org.apache.any23.extractor.yaml.YAMLValidator; /** * Implementation of 
{@link MIMETypeDetector} based on @@ -134,6 +135,17 @@ public class TikaMIMETypeDetector implements MIMETypeDetector { } /** + * Checks if the stream contains a valid <i>YAML</i> content. + * + * @param is + * @return + * @throws IOException + */ + public static boolean checkYAMLFormat(InputStream is) throws IOException { + return YAMLValidator.isYAML(is); + } + + /** * Tries to apply one of the given patterns on a sample of the input stream. * * @param patterns the patterns to apply. @@ -263,8 +275,9 @@ public class TikaMIMETypeDetector implements MIMETypeDetector { type = RDFFormat.TURTLE.getDefaultMIMEType(); } else if( checkCSVFormat(input) ) { type = CSV_MIMETYPE; - } - else { + } else if (checkYAMLFormat(input)) { // YAML detection must be at the end + type = "text/x-yaml"; + } else { type = MimeTypes.OCTET_STREAM; } } http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 23ab57f..ac2a9bd 100644 --- a/pom.xml +++ b/pom.xml @@ -199,7 +199,7 @@ <modules> <module>api</module> <module>test-resources</module> - <module>csvutils</module> + <module>utils</module> <module>mime</module> <module>encoding</module> <module>core</module> @@ -527,6 +527,11 @@ <artifactId>metainf-services</artifactId> <version>1.5</version> </dependency> + <dependency> + <groupId>org.yaml</groupId> + <artifactId>snakeyaml</artifactId> + <version>1.17</version> + </dependency> <!-- END: plugins --> <!-- BEGIN: Test Dependencies --> http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/pom.xml ---------------------------------------------------------------------- diff --git a/utils/pom.xml b/utils/pom.xml new file mode 100644 index 0000000..a6f34ec --- /dev/null +++ b/utils/pom.xml @@ -0,0 +1,123 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <artifactId>apache-any23</artifactId> + <groupId>org.apache.any23</groupId> + <version>2.1-SNAPSHOT</version> + <relativePath>..</relativePath> + </parent> + + <artifactId>apache-any23-utils</artifactId> + + <name>Apache Any23 :: Utilities</name> + <description>Utilities library</description> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-api</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + </dependency> + <dependency> + <groupId>org.yaml</groupId> + <artifactId>snakeyaml</artifactId> + </dependency> + <!-- Logging --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <version>${slf4j.logger.version}</version> + <scope>test</scope> + </dependency> + <!-- Testing --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + 
<artifactId>apache-any23-test-resources</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <resources> + <resource> + <directory>${basedir}/../</directory> + <targetPath>META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <version>${maven-assembly-plugin.version}</version> + <executions> + <execution> + <id>assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + <configuration> + <attach>true</attach> + <skipAssembly>true</skipAssembly> + <tarLongFileMode>gnu</tarLongFileMode> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> + + <profiles> + <profile> + <id>release</id> + <build> + <resources> + <resource> + <directory>${basedir}/../</directory> + <targetPath>${project.build.directory}/apidocs/META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + </build> + </profile> + </profiles> + +</project> http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java ---------------------------------------------------------------------- diff --git a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java new file mode 100644 index 0000000..75bb583 --- /dev/null +++ b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.csv; + +import org.apache.any23.configuration.DefaultConfiguration; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVStrategy; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +/** + * This class is responsible for building a reader, first guessing the configuration + * from the file itself and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}. 
+ * + * @author Davide Palmisano ( [email protected] ) + * @author Michele Mostarda ( [email protected] ) + */ +public class CSVReaderBuilder { + + private static final String DEFAULT_FIELD_DELIMITER = ","; + + private static final String DEFAULT_COMMENT_DELIMITER = "#"; + + public static final char NULL_CHAR = ' '; + + private static final char[] popularDelimiters = {'\t', '|', ',', ';'}; + + private static DefaultConfiguration defaultConfiguration = + DefaultConfiguration.singleton(); + + private static final CSVStrategy[] strategies; + + static { + strategies = new CSVStrategy[ popularDelimiters.length + 1 ]; + strategies[0] = CSVStrategy.DEFAULT_STRATEGY; + int index = 1; + for(char dlmt : popularDelimiters) { + strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR); + } + } + + /** + * Builds a non-<code>null</code> {@link org.apache.commons.csv.CSVParser}, guessing the strategy + * from the provided <i>CSV</i> file and falling back to the configured delimiters when no known strategy fits. + * + * @param is {@link InputStream} of the <i>CSV</i> file from which to guess the configuration. + * @return a {@link CSVParser} reading from <code>is</code> with the chosen strategy. + * @throws java.io.IOException if the stream cannot be read or reset while guessing. + */ + public static CSVParser build(InputStream is) throws IOException { + CSVStrategy bestStrategy = getBestStrategy(is); + if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration(); + return new CSVParser( new InputStreamReader(is), bestStrategy ); + } + + /** + * Checks whether the given input stream looks like a CSV, i.e. whether one of the known strategies fits it. + * + * @param is input stream to be verified. + * @return <code>true</code> if the given <code>is</code> input stream contains <i>CSV</i> content, + * <code>false</code> otherwise. 
+ * @throws IOException if the stream cannot be read or reset. + */ + public static boolean isCSV(InputStream is) throws IOException { + return getBestStrategy(is) != null; + } + + private static CSVStrategy getBestStrategy(InputStream is) throws IOException { + for( CSVStrategy strategy : strategies ) { + if( testStrategy(is, strategy) ) { + return strategy; + } + } + return null; + } + + private static CSVStrategy getCsvStrategy(char delimiter, char comment) { + return new CSVStrategy(delimiter, '\'', comment); + } + + private static CSVStrategy getCSVStrategyFromConfiguration() { + char fieldDelimiter = getCharValueFromConfiguration( + "any23.extraction.csv.field", + DEFAULT_FIELD_DELIMITER + ); + char commentDelimiter = getCharValueFromConfiguration( + "any23.extraction.csv.comment", + DEFAULT_COMMENT_DELIMITER + ); + return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter); + } + + private static char getCharValueFromConfiguration(String property, String defaultValue) { + String delimiter = defaultConfiguration.getProperty( + property, + defaultValue + ); + if (delimiter.length() != 1 || delimiter.equals("")) { + throw new RuntimeException(property + " value must be a single character"); + } + return delimiter.charAt(0); + } + + /** + * Makes sure the reader has the correct delimiter and quotation set. + * Checks the first lines (at most 5): each must have at least 2 columns and either as many columns as the first row or one more. The stream is marked before the check and reset afterwards. + * + * @param is input stream to be checked + * @param strategy strategy to be verified. 
+ * @return <code>true</code> if all checked rows satisfy these rules, <code>false</code> otherwise. + * @throws IOException if the stream cannot be read or reset. + * + */ + private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException { + final int MIN_COLUMNS = 2; + + is.mark(Integer.MAX_VALUE); + try { + final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy); + int linesToCheck = 5; + int headerColumnCount = -1; + while (linesToCheck > 0) { + String[] row; + row = parser.getLine(); + if (row == null) { + break; + } + if (row.length < MIN_COLUMNS) { + return false; + } + if (headerColumnCount == -1) { // first row + headerColumnCount = row.length; + } else { // make sure rows have the same number of columns or one more than the header + if (row.length < headerColumnCount) { + return false; + } else if (row.length - 1 > headerColumnCount) { + return false; + } + } + linesToCheck--; + } + return true; + } finally { + is.reset(); + } + } + + +} http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java ---------------------------------------------------------------------- diff --git a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java new file mode 100644 index 0000000..5a5f63d --- /dev/null +++ b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java @@ -0,0 +1,105 @@ +/* + * Copyright 2017 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.extractor.yaml; + +import com.google.common.collect.Iterables; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Scanner; +import java.util.regex.Pattern; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; + +/** + * Utility class providing static methods for YAML validation. + * + * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com) + */ +public class YAMLValidator { + + private static final Logger log = LoggerFactory.getLogger(YAMLValidator.class); + + private static final Pattern YAML_PATTERN = Pattern.compile("^%YAML.*", Pattern.CASE_INSENSITIVE); + + /** + * Detects whether <code>is</code> contains valid YAML content. + * <p> + * First it checks whether there is a "%YAML" head. If not, it falls back + * to the brute force method of parsing the input stream with the YAML parser. + * </p> + * <p> + * NB. Only "false" results can be trusted. Even if the result is "true" you cannot + * be sure that the InputStream was intended as YAML, because + * comma-separated values are parsable by the YAML parser as well. + * </p> + * + * @param is {@link InputStream} to inspect; <code>null</code> yields <code>false</code>. NOTE(review): a stream without mark support is wrapped locally in a {@link BufferedInputStream}, so the final reset presumably cannot rewind the caller's own stream - confirm. + * @return <code>true</code> if a "%YAML" head is found or the YAML parser yields at least one document, <code>false</code> otherwise. + * @throws IOException if the stream cannot be read or reset. + */ + public static boolean isYAML(InputStream is) throws IOException { + if (is == null) { + return false; + } + + if (!is.markSupported()) { + is = new BufferedInputStream(is); + } + + boolean result = false; + + // mark the reading frame position. MUST BE FIRST + is.mark(Integer.MAX_VALUE); + + while (true) { + // if the stream reports no available bytes then return false + if (is.available() <= 0) { + break; + } + + Scanner sc = new Scanner(is); + String out = sc.findWithinHorizon(YAML_PATTERN, 0); + + if (out != null && !out.isEmpty()) { + log.debug("Head: {}", out); + result = true; + break; + } + log.debug("Still not found. 
output is: {}", out); + is.reset(); + + try { + Yaml yml = new Yaml(); + Iterable<Object> parsedOut = yml.loadAll(is); + + if (Iterables.size(parsedOut) > 0) { + result = true; + break; + } + } catch (Exception ex) { + //do nothing + } + + // final break + break; + } + + is.reset(); // MUST BE AT THE END + return result; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java ---------------------------------------------------------------------- diff --git a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java new file mode 100644 index 0000000..fddf2fb --- /dev/null +++ b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java @@ -0,0 +1,66 @@ +/* + * Copyright 2017 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.yaml.utils; + +import org.apache.any23.extractor.yaml.YAMLValidator; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collection; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Parameterized test asserting that {@link YAMLValidator#isYAML} returns the expected flag for each listed resource; every listed resource, including a CSV file, is expected to be accepted. + * @author Jacek Grzebyta ( grzebyta.dev [at] gmail.com) + */ +@RunWith(Parameterized.class) +public class YAMLValidatorTest { + + private String path; + + private Boolean expected; + + private Logger log = LoggerFactory.getLogger(getClass()); + + public YAMLValidatorTest(String path, Boolean expected) { + this.path = path; + this.expected = expected; + } + + @Parameterized.Parameters + public static Collection<Object[]> getFiles() { + return Arrays.asList(new Object[][]{ + {"/org/apache/any23/extractor/yaml/simple-load.yml", Boolean.TRUE}, + {"/org/apache/any23/extractor/yaml/simple-load_no_head.yml", Boolean.TRUE}, + {"/org/apache/any23/extractor/yaml/different-integers.yml", Boolean.TRUE}, + {"/org/apache/any23/extractor/yaml/different-float.yml", Boolean.TRUE}, + {"/org/apache/any23/extractor/csv/test-comma.csv", Boolean.TRUE}}); + } + + @Test + public void runTest() + throws Exception { + log.info("Try path: {}", path); + InputStream is = YAMLValidatorTest.class.getResourceAsStream(path); + boolean result = YAMLValidator.isYAML(is); + log.debug("Test resutl: {}", result); + Assert.assertSame(expected, result); + + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/test/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/utils/src/test/resources/log4j.properties b/utils/src/test/resources/log4j.properties new file mode 100644 index 0000000..3860396 --- /dev/null +++ b/utils/src/test/resources/log4j.properties @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor 
license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# log4j.rootCategory=INFO, R, O +log4j.rootCategory=INFO, O + +# Stdout +log4j.appender.O=org.apache.log4j.ConsoleAppender + +# File +#log4j.appender.R=org.apache.log4j.RollingFileAppender +#log4j.appender.R.File=log4j.log + +# Control the maximum log file size +#log4j.appender.R.MaxFileSize=100KB + +# Archive log files (one backup file here). NOTE(review): appender R is commented out above and not attached to the root category, so the remaining R.* settings presumably have no effect - confirm. +log4j.appender.R.MaxBackupIndex=1 + +log4j.appender.R.layout=org.apache.log4j.PatternLayout +log4j.appender.O.layout=org.apache.log4j.PatternLayout + +log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n +log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
