ANY23-80 : Split out CLI into its own module Signed-off-by: Peter Ansell <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/242b130b Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/242b130b Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/242b130b Branch: refs/heads/master Commit: 242b130b4670507e240bf9fec1fb8f9aad647870 Parents: 82e5645 Author: Peter Ansell <[email protected]> Authored: Thu Jan 12 10:35:17 2017 +1100 Committer: Peter Ansell <[email protected]> Committed: Thu Jan 12 10:35:17 2017 +1100 ---------------------------------------------------------------------- cli/pom.xml | 253 ++++++++++++++++++ .../any23/cli/ExtractorDocumentation.java | 186 +++++++++++++ .../org/apache/any23/cli/MicrodataParser.java | 99 +++++++ .../java/org/apache/any23/cli/MimeDetector.java | 101 +++++++ .../org/apache/any23/cli/PluginVerifier.java | 86 ++++++ .../main/java/org/apache/any23/cli/Rover.java | 265 +++++++++++++++++++ .../java/org/apache/any23/cli/ToolRunner.java | 263 ++++++++++++++++++ .../java/org/apache/any23/cli/VocabPrinter.java | 54 ++++ .../java/org/apache/any23/cli/package-info.java | 22 ++ .../any23/cli/ExtractorDocumentationTest.java | 57 ++++ .../apache/any23/cli/MicrodataParserTest.java | 46 ++++ .../org/apache/any23/cli/MimeDetectorTest.java | 51 ++++ .../apache/any23/cli/PluginVerifierTest.java | 38 +++ .../java/org/apache/any23/cli/RoverTest.java | 139 ++++++++++ .../org/apache/any23/cli/ToolRunnerTest.java | 65 +++++ .../java/org/apache/any23/cli/ToolTestBase.java | 91 +++++++ .../org/apache/any23/cli/VocabPrinterTest.java | 38 +++ .../any23/cli/ExtractorDocumentation.java | 186 ------------- .../org/apache/any23/cli/MicrodataParser.java | 99 ------- .../java/org/apache/any23/cli/MimeDetector.java | 101 ------- .../org/apache/any23/cli/PluginVerifier.java | 86 ------ .../main/java/org/apache/any23/cli/Rover.java | 265 ------------------- .../java/org/apache/any23/cli/ToolRunner.java | 263 ------------------ 
.../java/org/apache/any23/cli/VocabPrinter.java | 54 ---- .../java/org/apache/any23/cli/package-info.java | 22 -- .../any23/cli/ExtractorDocumentationTest.java | 57 ---- .../apache/any23/cli/MicrodataParserTest.java | 46 ---- .../org/apache/any23/cli/MimeDetectorTest.java | 51 ---- .../apache/any23/cli/PluginVerifierTest.java | 38 --- .../java/org/apache/any23/cli/RoverTest.java | 139 ---------- .../org/apache/any23/cli/ToolRunnerTest.java | 65 ----- .../java/org/apache/any23/cli/ToolTestBase.java | 91 ------- .../org/apache/any23/cli/VocabPrinterTest.java | 38 --- plugins/basic-crawler/pom.xml | 16 +- plugins/html-scraper/pom.xml | 1 - plugins/office-scraper/pom.xml | 1 - pom.xml | 1 + 37 files changed, 1870 insertions(+), 1604 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/pom.xml ---------------------------------------------------------------------- diff --git a/cli/pom.xml b/cli/pom.xml new file mode 100644 index 0000000..c01f3b7 --- /dev/null +++ b/cli/pom.xml @@ -0,0 +1,253 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.any23</groupId> + <artifactId>apache-any23</artifactId> + <version>2.0-SNAPSHOT</version> + <relativePath>../</relativePath> + </parent> + + <artifactId>apache-any23-cli</artifactId> + + <name>Apache Any23 :: CLI</name> + <description>Command line interface.</description> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-api</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-csvutils</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-mime</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-encoding</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-test-resources</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + </dependency> + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + </dependency> + 
<dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + </dependency> + <dependency> + <groupId>net.sourceforge.nekohtml</groupId> + <artifactId>nekohtml</artifactId> + </dependency> + <dependency> + <groupId>com.beust</groupId> + <artifactId>jcommander</artifactId> + </dependency> + + <!-- BEGIN: Tika --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + </dependency> + <!-- END: Tika --> + + <!-- BEGIN: Sesame --> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-model</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-rio-api</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-rio-jsonld</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-rio-turtle</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-rio-rdfxml</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-rio-ntriples</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-rio-trix</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-repository-sail</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-sail-memory</artifactId> + </dependency> + <dependency> + <groupId>org.eclipse.rdf4j</groupId> + <artifactId>rdf4j-repository-api</artifactId> + </dependency> + <dependency> + <groupId>org.semarglproject</groupId> + <artifactId>semargl-rdf4j</artifactId> + </dependency> + <!-- END: Sesame --> + + <!-- BEGIN: Apache Commons, this version is hosted in the + any23-repository-external repository --> + 
<dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + </dependency> + <!-- END: Apache Commons CSV --> + + <!-- BEGIN: Test Dependencies --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </dependency> + <!-- END: Test Dependencies --> + </dependencies> + + <build> + <resources> + <resource> + <directory>${basedir}/src/main/resources</directory> + <filtering>true</filtering> + </resource> + + <resource> + <directory>${basedir}/../</directory> + <targetPath>META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + + <plugins> + <!-- generates the bin launchers --> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>appassembler-maven-plugin</artifactId> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>assemble</goal> + </goals> + </execution> + </executions> + <configuration> + <programs> + <program> + <mainClass>org.apache.any23.cli.ToolRunner</mainClass> + <name>any23</name> + </program> + </programs> + <configurationDirectory>conf</configurationDirectory> + <configurationSourceDirectory>${basedir}/src/test/resources</configurationSourceDirectory> + <copyConfigurationDirectory>true</copyConfigurationDirectory> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + + <profiles> + <profile> + <id>release</id> + <build> + <resources> + <resource> + <directory>${basedir}/../</directory> + 
<targetPath>${project.build.directory}/apidocs/META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + </build> + </profile> + </profiles> + +</project> http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java ---------------------------------------------------------------------- diff --git a/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java new file mode 100644 index 0000000..9a0410b --- /dev/null +++ b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import org.apache.any23.extractor.ExampleInputOutput;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.Extractor;
+import org.apache.any23.extractor.ExtractorRegistryImpl;
+import org.apache.any23.extractor.Extractor.BlindExtractor;
+import org.apache.any23.extractor.Extractor.ContentExtractor;
+import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.ExtractorRegistry;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * This class provides some command-line documentation
+ * about available extractors and their usage.
+ */
+@Parameters( commandNames = { "extractor" }, commandDescription= "Utility for obtaining documentation about metadata extractors.")
+public class ExtractorDocumentation implements Tool {
+
+    @Parameter( names = { "-l", "--list" }, description = "shows the names of all available extractors" )
+    private boolean showList;
+
+    @Parameter( names = { "-i", "--input" }, description = "shows example input for the given extractor" )
+    private boolean showInput;
+
+    // "--outut" was the original (misspelled) long name. It stays registered as an alias so
+    // that existing invocations keep working; "--output" is the intended spelling.
+    @Parameter( names = { "-o", "--output", "--outut" }, description = "shows example output for the given extractor" )
+    private boolean showOutput;
+
+    @Parameter( names = { "-a", "--all" }, description = "shows a report about all available extractors" )
+    private boolean showAll;
+
+    @Parameter( arity = 1, description = "Extractor name" )
+    private List<String> extractor = new LinkedList<String>();
+
+    /**
+     * Runs the first action selected on the command line. The flags are checked in
+     * the order list, input, output, all; only the first one that is set is honoured.
+     * Nothing is printed when none of them is set.
+     *
+     * @throws IllegalArgumentException if -i or -o is used without an extractor name
+     * @throws Exception whatever the selected print method raises
+     */
+    public void run() throws Exception {
+        if (showList) {
+            printExtractorList(ExtractorRegistryImpl.getInstance());
+        } else if (showInput) {
+            printExampleInput(requireExtractorName("-i"), ExtractorRegistryImpl.getInstance());
+        } else if (showOutput) {
+            printExampleOutput(requireExtractorName("-o"), ExtractorRegistryImpl.getInstance());
+        } else if (showAll) {
+            printReport(ExtractorRegistryImpl.getInstance());
+        }
+    }
+
+    /**
+     * Returns the extractor name given on the command line.
+     *
+     * @param option the option that needs the name, used in the error message
+     * @return the first positional argument
+     * @throws IllegalArgumentException if no extractor name was supplied
+     */
+    private String requireExtractorName(String option) {
+        if (extractor.isEmpty()) {
+            throw new IllegalArgumentException("Required argument for " + option + ": extractor name");
+        }
+        return extractor.get(0);
+    }
+
+    /**
+     * Print an error message.
+     *
+     * @param msg the error message to be printed
+     */
+    public void printError(String msg) {
+        System.err.println(msg);
+    }
+
+    /**
+     * Prints the list of all the available extractors, one per line,
+     * as extractor name followed by its label in square brackets.
+     *
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
+     *                 containing all extractors
+     */
+    public void printExtractorList(ExtractorRegistry registry) {
+        for (ExtractorFactory factory : registry.getExtractorGroup()) {
+            System.out.println( String.format("%25s [%15s]", factory.getExtractorName(), factory.getExtractorLabel()));
+        }
+    }
+
+    /**
+     * Prints an example of input for the provided extractor.
+     *
+     * @param extractorName the name of the extractor
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
+     *                 containing all extractors
+     * @throws IllegalArgumentException if no extractor is registered under that name,
+     *                                  or if the extractor provides no example input
+     * @throws IOException propagated from {@link ExampleInputOutput} while obtaining the example
+     */
+    public void printExampleInput(String extractorName, ExtractorRegistry registry) throws IOException {
+        ExtractorFactory<?> factory = getFactory(registry, extractorName);
+        ExampleInputOutput example = new ExampleInputOutput(factory);
+        String input = example.getExampleInput();
+        if (input == null) {
+            throw new IllegalArgumentException("Extractor " + extractorName + " provides no example input");
+        }
+        System.out.println(input);
+    }
+
+    /**
+     * Prints an output example for the given extractor.
+     *
+     * @param extractorName the extractor name
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
+     *                 containing all extractors
+     * @throws IllegalArgumentException if no extractor is registered under that name,
+     *                                  or if the extractor provides no example output
+     * @throws IOException propagated from {@link ExampleInputOutput} while obtaining the example
+     * @throws ExtractionException if there is an error during extraction
+     */
+    public void printExampleOutput(String extractorName, ExtractorRegistry registry) throws IOException, ExtractionException {
+        ExtractorFactory<?> factory = getFactory(registry, extractorName);
+        ExampleInputOutput example = new ExampleInputOutput(factory);
+        String output = example.getExampleOutput();
+        if (output == null) {
+            throw new IllegalArgumentException("Extractor " + extractorName + " provides no example output");
+        }
+        System.out.println(output);
+    }
+
+    /**
+     * Prints a complete report on all the available extractors: for each name in the
+     * registry, the extractor type and, when an example input exists, the example
+     * input followed by the output produced for it.
+     *
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
+     *                 containing all extractors
+     * @throws IOException propagated from {@link ExampleInputOutput} while obtaining an example
+     * @throws ExtractionException if there is an error during extraction
+     */
+    public void printReport(ExtractorRegistry registry) throws IOException, ExtractionException {
+        for (String extractorName : registry.getAllNames()) {
+            ExtractorFactory<?> factory = registry.getFactory(extractorName);
+            ExampleInputOutput example = new ExampleInputOutput(factory);
+            System.out.println("Extractor: " + extractorName);
+            System.out.println("\ttype: " + getType(factory));
+            System.out.println();
+            final String exampleInput = example.getExampleInput();
+            if(exampleInput == null) {
+                System.out.println("(No Example Available)");
+            } else {
+                System.out.println("-------- Example Input --------");
+                System.out.println(exampleInput);
+                System.out.println("-------- Example Output --------");
+                String output = example.getExampleOutput();
+                // A null or blank output is reported explicitly instead of printing an empty section.
+                System.out.println(output == null || output.trim().length() == 0 ? "(No Output Generated)" : output);
+            }
+            System.out.println("================================");
+            System.out.println();
+        }
+    }
+
+    /**
+     * Looks up the factory registered under the given name.
+     *
+     * @throws IllegalArgumentException if the registry does not know the name
+     */
+    private ExtractorFactory<?> getFactory(ExtractorRegistry registry, String name) {
+        if (!registry.isRegisteredName(name)) {
+            throw new IllegalArgumentException("Unknown extractor name: " + name);
+        }
+        return registry.getFactory(name);
+    }
+
+    /**
+     * Returns the simple name of the first extractor kind (blind, tag-soup DOM, content)
+     * that a freshly created extractor is an instance of, or "?" if it matches none.
+     */
+    private String getType(ExtractorFactory<?> factory) {
+        Extractor<?> extractor = factory.createExtractor();
+        if (extractor instanceof BlindExtractor) {
+            return BlindExtractor.class.getSimpleName();
+        }
+        if (extractor instanceof TagSoupDOMExtractor) {
+            return TagSoupDOMExtractor.class.getSimpleName();
+        }
+        if (extractor instanceof ContentExtractor) {
+            return ContentExtractor.class.getSimpleName();
+        }
+        return "?";
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java
new file mode 100644
index 0000000..19c59bf
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.Parameters;
+import org.apache.any23.extractor.html.TagSoupParser;
+import org.apache.any23.http.DefaultHTTPClient;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.FileDocumentSource;
+import org.apache.any23.source.HTTPDocumentSource;
+import org.apache.any23.util.StreamUtils;
+
+import java.io.File;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Command line <i>Microdata</i> parser, accepting both files and URLs and
+ * returning a <i>JSON</i> representation of the extracted metadata as described at
+ * <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
+ *
+ * @author Michele Mostarda ([email protected])
+ */
+@Parameters( commandNames = { "microdata" }, commandDescription = "Commandline Tool for extracting Microdata from file/HTTP source.")
+public class MicrodataParser implements Tool {
+
+    private static final Pattern HTTP_DOCUMENT_PATTERN = Pattern.compile("^https?://.*");
+
+    // Group 1 captures everything after the "file:" prefix and is used as the local path.
+    private static final Pattern FILE_DOCUMENT_PATTERN = Pattern.compile("^file:(.*)$");
+
+    @Parameter(
+        arity = 1,
+        description = "Input document URL, {http://path/to/resource.html|file:/path/to/localFile.html}",
+        converter = MicrodataParserDocumentSourceConverter.class
+    )
+    private List<DocumentSource> document = new LinkedList<DocumentSource>();
+
+    /**
+     * Opens the first input document, parses it with {@link TagSoupParser} and passes the
+     * resulting DOM to the Microdata parser together with {@code System.out}.
+     * The input stream is closed whether or not parsing succeeds.
+     *
+     * @throws IllegalArgumentException if no input document was specified
+     * @throws Exception whatever opening or parsing the document raises
+     */
+    public void run() throws Exception {
+        if (document.isEmpty()) {
+            throw new IllegalArgumentException("No input document URL specified");
+        }
+        InputStream documentInputInputStream = null;
+        try {
+            final DocumentSource documentSource = document.get(0);
+            documentInputInputStream = documentSource.openInputStream();
+            final TagSoupParser tagSoupParser = new TagSoupParser(
+                    documentInputInputStream,
+                    documentSource.getDocumentIRI()
+            );
+            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
+        } finally {
+            if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
+        }
+    }
+
+    /**
+     * Converts the command-line argument into a {@link DocumentSource}:
+     * values starting with {@code http://} or {@code https://} become an
+     * {@link HTTPDocumentSource}, values starting with {@code file:} become a
+     * {@link FileDocumentSource}; anything else is rejected.
+     */
+    public static final class MicrodataParserDocumentSourceConverter implements IStringConverter<DocumentSource> {
+
+        /**
+         * @param value the raw command-line argument
+         * @return the document source matching the argument's protocol
+         * @throws ParameterException if the protocol is not supported or the HTTP IRI is malformed
+         */
+        @Override
+        public DocumentSource convert( String value ) {
+            final Matcher httpMatcher = HTTP_DOCUMENT_PATTERN.matcher(value);
+            if (httpMatcher.find()) {
+                try {
+                    return new HTTPDocumentSource(DefaultHTTPClient.createInitializedHTTPClient(), value);
+                } catch ( URISyntaxException e ) {
+                    // Attach the parsing failure as the cause so the reason the IRI was
+                    // rejected is not lost; initCause is used because only the
+                    // single-String constructor of ParameterException is relied upon here.
+                    final ParameterException invalidIri = new ParameterException("Invalid source IRI: '" + value + "'");
+                    invalidIri.initCause(e);
+                    throw invalidIri;
+                }
+            }
+            final Matcher fileMatcher = FILE_DOCUMENT_PATTERN.matcher(value);
+            if (fileMatcher.find()) {
+                return new FileDocumentSource( new File( fileMatcher.group(1) ) );
+            }
+            throw new ParameterException("Invalid source protocol: '" + value + "'");
+        }
+
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/MimeDetector.java b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java
new file mode 100644
index 0000000..c9072cb
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import org.apache.any23.http.DefaultHTTPClient;
+import org.apache.any23.http.DefaultHTTPClientConfiguration;
+import org.apache.any23.http.HTTPClient;
+import org.apache.any23.mime.MIMEType;
+import org.apache.any23.mime.MIMETypeDetector;
+import org.apache.any23.mime.TikaMIMETypeDetector;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.FileDocumentSource;
+import org.apache.any23.source.HTTPDocumentSource;
+import org.apache.any23.source.StringDocumentSource;
+
+import java.io.File;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Commandline tool to detect <b>MIME Type</b>s from
+ * file, HTTP and direct input sources.
+ * The implementation of this tool is based on {@link org.apache.any23.mime.TikaMIMETypeDetector}.
+ *
+ * @author Michele Mostarda ([email protected])
+ */
+@Parameters(commandNames = { "mimes" }, commandDescription = "MIME Type Detector Tool.")
+public class MimeDetector implements Tool {
+
+    public static final String FILE_DOCUMENT_PREFIX = "file://";
+
+    public static final String INLINE_DOCUMENT_PREFIX = "inline://";
+
+    public static final String URL_DOCUMENT_RE = "^https?://.*";
+
+    @Parameter(
+        arity = 1,
+        description = "Input document URL, {http://path/to/resource.html|file:///path/to/local.file|inline:// some inline content}",
+        converter = MimeDetectorDocumentSourceConverter.class
+    )
+    private List<DocumentSource> document = new LinkedList<DocumentSource>();
+
+    /**
+     * Guesses the MIME type of the first input document with a
+     * {@link TikaMIMETypeDetector} and prints it to standard output.
+     * The document's input stream is closed once detection has finished,
+     * whether it succeeded or failed.
+     *
+     * @throws IllegalArgumentException if no input document was specified
+     * @throws Exception whatever opening, reading or closing the document raises
+     */
+    public void run() throws Exception {
+        if (document.isEmpty()) {
+            throw new IllegalArgumentException("No input document URL specified");
+        }
+
+        final DocumentSource documentSource = document.get(0);
+        final MIMETypeDetector detector = new TikaMIMETypeDetector();
+        final InputStream input = documentSource.openInputStream();
+        final MIMEType mimeType;
+        try {
+            mimeType = detector.guessMIMEType(
+                    documentSource.getDocumentIRI(),
+                    input,
+                    MIMEType.parse(documentSource.getContentType())
+            );
+        } finally {
+            // The stream was previously left open; release it as soon as detection is done.
+            input.close();
+        }
+        System.out.println(mimeType);
+    }
+
+    /**
+     * Converts the command-line argument into a {@link DocumentSource}, choosing the
+     * implementation from the argument's prefix: {@link #FILE_DOCUMENT_PREFIX},
+     * {@link #INLINE_DOCUMENT_PREFIX}, or an http/https URL matching {@link #URL_DOCUMENT_RE}.
+     */
+    public static final class MimeDetectorDocumentSourceConverter implements IStringConverter<DocumentSource> {
+
+        /**
+         * @param document the raw command-line argument
+         * @return the document source matching the argument's prefix
+         * @throws IllegalArgumentException if the prefix is not supported or the HTTP IRI is malformed
+         */
+        @Override
+        public DocumentSource convert( String document ) {
+            if (document.startsWith(FILE_DOCUMENT_PREFIX)) {
+                return new FileDocumentSource( new File( document.substring(FILE_DOCUMENT_PREFIX.length()) ) );
+            }
+            if (document.startsWith(INLINE_DOCUMENT_PREFIX)) {
+                return new StringDocumentSource( document.substring(INLINE_DOCUMENT_PREFIX.length()), "" );
+            }
+            if (document.matches(URL_DOCUMENT_RE)) {
+                final HTTPClient client = new DefaultHTTPClient();
+                client.init( DefaultHTTPClientConfiguration.singleton() );
+                try {
+                    return new HTTPDocumentSource(client, document);
+                } catch ( URISyntaxException e ) {
+                    // Keep the syntax error as the cause so the reason for rejection is not lost.
+                    throw new IllegalArgumentException("Invalid source IRI: '" + document + "'", e);
+                }
+            }
+            throw new IllegalArgumentException("Unsupported protocol for document " + document);
+        }
+
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java
new file mode 100644
index 0000000..a747b49
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.beust.jcommander.converters.FileConverter;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.mime.MIMEType;
+import org.apache.any23.plugin.Any23PluginManager;
+import org.apache.any23.plugin.Author;
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Command-line tool that loads the <b>Any23</b> plugins found in a directory
+ * and prints basic information about each extractor factory it finds.
+ *
+ * @author Michele Mostarda ([email protected])
+ */
+@Parameters(commandNames = { "verify" }, commandDescription = "Utility for plugin management verification.")
+public class PluginVerifier implements Tool {
+
+    private Any23PluginManager pluginManager = Any23PluginManager.getInstance();
+
+    @Parameter(
+        description = "plugins-dir",
+        converter = FileConverter.class
+    )
+    private List<File> pluginsDirs = new LinkedList<File>();
+
+    /**
+     * Validates the first positional argument as a directory, hands it to the plugin
+     * manager via {@code loadJARDir}, then prints one section per extractor factory
+     * returned by the manager, each followed by a separator line.
+     *
+     * @throws IllegalArgumentException if no directory was given or the path is not a directory
+     * @throws Exception whatever the plugin manager raises
+     */
+    public void run() throws Exception {
+        if (pluginsDirs.isEmpty()) {
+            throw new IllegalArgumentException("No plugin directory specified.");
+        }
+
+        final File targetDir = pluginsDirs.get(0);
+        if (!targetDir.isDirectory()) {
+            throw new IllegalArgumentException("<plugins-dir> must be a valid dir.");
+        }
+
+        pluginManager.loadJARDir(targetDir);
+
+        for (Iterator<ExtractorFactory> cursor = pluginManager.getExtractors(); cursor.hasNext(); ) {
+            final ExtractorFactory current = cursor.next();
+            printPluginData(current, System.out);
+            System.out.println("------------------------------------------------------------------------");
+        }
+    }
+
+    /**
+     * Concatenates the given MIME types, appending a single space after each one
+     * (so a non-empty result ends with a space).
+     */
+    private String getMimeTypesStr(Collection<MIMEType> mimeTypes) {
+        final StringBuilder joined = new StringBuilder();
+        final Iterator<MIMEType> cursor = mimeTypes.iterator();
+        while (cursor.hasNext()) {
+            joined.append(cursor.next());
+            joined.append(' ');
+        }
+        return joined.toString();
+    }
+
+    /**
+     * Writes author, factory class and supported MIME types of one factory to the stream.
+     * The author comes from the {@link Author} annotation on the factory class and is
+     * reported as {@code <unknown>} when the annotation is absent.
+     */
+    private void printPluginData(ExtractorFactory extractorFactory, PrintStream ps) {
+        final Author declaredAuthor = extractorFactory.getClass().getAnnotation(Author.class);
+        final String authorName;
+        if (declaredAuthor == null) {
+            authorName = "<unknown>";
+        } else {
+            authorName = declaredAuthor.name();
+        }
+        ps.printf("Plugin author : %s\n", authorName);
+        ps.printf("Plugin factory : %s\n", extractorFactory.getClass());
+        ps.printf("Plugin mime-types: %s\n", getMimeTypesStr( extractorFactory.getSupportedMIMETypes() ));
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/Rover.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/Rover.java b/cli/src/main/java/org/apache/any23/cli/Rover.java
new file mode 100644
index 0000000..26a8663
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/Rover.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.any23.cli; + +import com.beust.jcommander.IStringConverter; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.ParameterException; +import com.beust.jcommander.Parameters; +import com.beust.jcommander.converters.FileConverter; +import org.apache.any23.Any23; +import org.apache.any23.configuration.Configuration; +import org.apache.any23.configuration.DefaultConfiguration; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionParameters.ValidationMode; +import org.apache.any23.filter.IgnoreAccidentalRDFa; +import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments; +import org.apache.any23.source.DocumentSource; +import org.apache.any23.writer.BenchmarkTripleHandler; +import org.apache.any23.writer.LoggingTripleHandler; +import org.apache.any23.writer.ReportingTripleHandler; +import org.apache.any23.writer.TripleHandler; +import org.apache.any23.writer.TripleHandlerException; +import org.apache.any23.writer.WriterFactoryRegistry; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.LinkedList; +import java.util.List; + +import static java.lang.String.format; + +/** + * A default rover implementation. Goes and fetches a URL using an hint + * as to what format should require, then tries to convert it to RDF. 
+ * + * @author Michele Mostarda ([email protected]) + * @author Richard Cyganiak ([email protected]) + * @author Gabriele Renzi + */ +@Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.") +public class Rover implements Tool { + + private static final List<String> FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers(); + + private static final int DEFAULT_FORMAT_INDEX = 0; + + private static final Logger logger = LoggerFactory.getLogger(Rover.class); + + @Parameter( + names = { "-o", "--output" }, + description = "Specify Output file (defaults to standard output)", + converter = PrintStreamConverter.class + ) + private PrintStream outputStream = System.out; + + @Parameter(description = "input IRIs {<url>|<file>}+", converter = ArgumentToIRIConverter.class) + protected List<String> inputIRIs = new LinkedList<String>(); + + @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle") + private List<String> extractors = new LinkedList<String>(); + + @Parameter(names = { "-f", "--format" }, description = "the output format") + private String format = FORMATS.get(DEFAULT_FORMAT_INDEX); + + @Parameter( + names = { "-l", "--log" }, + description = "Produce log within a file.", + converter = FileConverter.class + ) + private File logFile = null; + + @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.") + private boolean statistics; + + @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. 
CSS related ones).") + private boolean noTrivial; + + @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.") + private boolean pedantic; + + @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.") + private boolean nestingDisabled; + + @Parameter(names = { "-d", "--defaultns" }, description = "Override the default namespace used to produce statements.") + private String defaultns; + + // non parameters + + private TripleHandler tripleHandler; + + private ReportingTripleHandler reportingTripleHandler; + + private BenchmarkTripleHandler benchmarkTripleHandler; + + private Any23 any23; + + private ExtractionParameters extractionParameters; + + protected void configure() { + try { + tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream); + } catch (Exception e) { + throw new NullPointerException( + format("Invalid output format '%s', admitted values: %s", + format, + FORMATS + ) + ); + } + + if (logFile != null) { + try { + tripleHandler = new LoggingTripleHandler(tripleHandler, new PrintWriter(logFile)); + } catch (FileNotFoundException fnfe) { + throw new IllegalArgumentException( format("Can not write to log file [%s]", logFile), fnfe ); + } + } + + if (statistics) { + benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler); + tripleHandler = benchmarkTripleHandler; + } + + if (noTrivial) { + tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler), + true // suppress stylesheet triples. + ); + } + + reportingTripleHandler = new ReportingTripleHandler(tripleHandler); + + final Configuration configuration = DefaultConfiguration.singleton(); + extractionParameters = + pedantic + ? 
+ new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled) + : + new ExtractionParameters(configuration, ValidationMode.None , nestingDisabled); + if (defaultns != null) { + extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY, + defaultns); + } + + any23 = (extractors.isEmpty()) ? new Any23() + : new Any23(extractors.toArray(new String[extractors.size()])); + any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION); + } + + protected String printReports() { + final StringBuilder sb = new StringBuilder(); + if (benchmarkTripleHandler != null) sb.append( benchmarkTripleHandler.report() ).append('\n'); + if (reportingTripleHandler != null) sb.append( reportingTripleHandler.printReport() ).append('\n'); + return sb.toString(); + } + + protected void performExtraction(DocumentSource documentSource) throws Exception { + if (!any23.extract(extractionParameters, documentSource, tripleHandler).hasMatchingExtractors()) { + throw new IllegalStateException(format("No suitable extractors found for source %s", documentSource)); + } + } + + protected void close() { + if (tripleHandler != null) { + try { + tripleHandler.close(); + } catch (TripleHandlerException the) { + throw new RuntimeException("Error while closing TripleHandler", the); + } + } + + if (outputStream != null && outputStream != System.out) { // TODO: low - find better solution to avoid closing system out. 
+ outputStream.close(); + } + } + + public void run() throws Exception { + if (inputIRIs.isEmpty()) { + throw new IllegalArgumentException("Expected at least 1 argument."); + } + + configure(); + + // perform conversions + + try { + final long start = System.currentTimeMillis(); + for (String inputIRI : inputIRIs) { + DocumentSource source = any23.createDocumentSource(inputIRI); + + performExtraction( source ); + } + final long elapsed = System.currentTimeMillis() - start; + + if (benchmarkTripleHandler != null) { + System.err.println(benchmarkTripleHandler.report()); + } + + logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames()); + logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms"); + } finally { + close(); + } + } + + public static final class ArgumentToIRIConverter implements IStringConverter<String> { + + @Override + public String convert(String uri) { + uri = uri.trim(); + if (uri.toLowerCase().startsWith("http:") || uri.toLowerCase().startsWith("https:")) { + try { + return new URL(uri).toString(); + } catch (MalformedURLException murle) { + throw new ParameterException(format("Invalid IRI: '%s': %s", uri, murle.getMessage())); + } + } + + final File f = new File(uri); + if (!f.exists()) { + throw new ParameterException(format("No such file: [%s]", f.getAbsolutePath())); + } + if (f.isDirectory()) { + throw new ParameterException(format("Found a directory: [%s]", f.getAbsolutePath())); + } + return f.toURI().toString(); + } + + } + + public static final class PrintStreamConverter implements IStringConverter<PrintStream> { + + @Override + public PrintStream convert( String value ) { + final File file = new File(value); + try { + return new PrintStream(file); + } catch (FileNotFoundException fnfe) { + throw new ParameterException(format("Cannot open file '%s': %s", file, fnfe.getMessage())); + } + } + + } + +} 
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java ---------------------------------------------------------------------- diff --git a/cli/src/main/java/org/apache/any23/cli/ToolRunner.java b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java new file mode 100644 index 0000000..90daeb3 --- /dev/null +++ b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.cli; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.converters.FileConverter; +import org.apache.any23.Any23; +import org.apache.any23.plugin.Any23PluginManager; +import org.apache.any23.util.LogUtils; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Date; +import java.util.Iterator; +import java.util.Map; +import java.util.Properties; + +import static java.lang.System.currentTimeMillis; +import static java.lang.System.exit; + +/** + * This class is the main class responsible to provide a uniform command-line + * access points to all the others tools like {@link Rover}. + * + * @see ExtractorDocumentation + * @see Rover + */ +public final class ToolRunner { + + public static final File DEFAULT_PLUGIN_DIR = new File(new File(System.getProperty("user.home")), ".any23/plugins"); + + private static final PrintStream infoStream = System.err; + + @Parameter( names = { "-h", "--help" }, description = "Display help information." ) + private boolean printHelp; + + @Parameter( names = { "-v", "--version" }, description = "Display version information." ) + private boolean showVersion; + + @Parameter( names = { "-X", "--verbose" }, description = "Produce execution verbose output." ) + private boolean verbose; + + @Parameter( + names = { "--plugins-dir" }, + description = "The Any23 plugins directory.", + converter = FileConverter.class + ) + private File pluginsDir = DEFAULT_PLUGIN_DIR; + + public static void main( String[] args ) throws Exception { + exit( new ToolRunner().execute( args ) ); + } + + public int execute(String...args) throws Exception { + JCommander commander = new JCommander(this); + commander.setProgramName(System.getProperty("app.name")); + + // TODO (low) : this dirty solution has been introduced because it is not possible to + // parse arguments ( commander.parse() ) twice. 
+ final File pluginsDirOption; + try { + pluginsDirOption = parsePluginDirOption(args); + } catch (Exception e) { + System.err.println(e.getMessage()); + return 1; + } + if(pluginsDirOption != null) { + pluginsDir = pluginsDirOption; + } + + // add all plugins first + final Iterator<Tool> tools = getToolsInClasspath(); + while (tools.hasNext()) { + Tool tool = tools.next(); + commander.addCommand(tool); + } + + commander.parse(args); + + Map<String, JCommander> commands = commander.getCommands(); + String parsedCommand = commander.getParsedCommand(); + + if (printHelp) { + commander.usage(); + return 0; + } + + if (showVersion) { + printVersionInfo(); + return 0; + } + + if(parsedCommand == null) { + infoStream.println("A command must be specified."); + commander.usage(); + return 1; + } + + if (verbose) { + LogUtils.setVerboseLogging(); + } else { + LogUtils.setDefaultLogging(); + } + + long start = currentTimeMillis(); + int exit = 0; + + Throwable error = null; + + // execute the parsed command + infoStream.println(); + infoStream.println( "------------------------------------------------------------------------" ); + infoStream.printf( "Apache Any23 :: %s%n", parsedCommand ); + infoStream.println( "------------------------------------------------------------------------" ); + infoStream.println(); + + try { + Tool.class.cast( commands.get( parsedCommand ).getObjects().get( 0 ) ).run(); + } catch (Throwable t) { + exit = 1; + error = t; + } finally { + infoStream.println(); + infoStream.println( "------------------------------------------------------------------------" ); + infoStream.printf( "Apache Any23 %s%n", ( exit != 0 ) ? 
"FAILURE" : "SUCCESS" ); + + if (exit != 0) { + infoStream.println(); + + if (verbose) { + System.err.println( "Execution terminated with errors:" ); + error.printStackTrace(infoStream); + } else { + infoStream.printf( "Execution terminated with errors: %s%n", error.getMessage() ); + } + + infoStream.println(); + } + + infoStream.printf( "Total time: %ss%n", ( ( currentTimeMillis() - start ) / 1000 ) ); + infoStream.printf( "Finished at: %s%n", new Date() ); + + final Runtime runtime = Runtime.getRuntime(); + final int megaUnit = 1024 * 1024; + infoStream.printf( "Final Memory: %sM/%sM%n", ( runtime.totalMemory() - runtime.freeMemory() ) / megaUnit, + runtime.totalMemory() / megaUnit ); + + infoStream.println( "------------------------------------------------------------------------" ); + } + + return exit; + } + + Iterator<Tool> getToolsInClasspath() throws IOException { + final Any23PluginManager pluginManager = Any23PluginManager.getInstance(); + if (pluginsDir.exists() && pluginsDir.isDirectory()) { + pluginManager.loadJARDir(pluginsDir); + } + return pluginManager.getTools(); + } + + private static void printVersionInfo() { + Properties properties = new Properties(); + InputStream input = ToolRunner.class.getClassLoader().getResourceAsStream( "META-INF/maven/org.apache.any23/any23-core/pom.properties" ); + + if ( input != null ) { + try { + properties.load( input ); + } catch ( IOException e ) { + // ignore, just don't load the properties + } finally { + try { + input.close(); + } catch (IOException e) { + // close quietly + } + } + } + + infoStream.printf( "Apache Any23 %s%n", Any23.VERSION ); + infoStream.printf( "Java version: %s, vendor: %s%n", + System.getProperty( "java.version" ), + System.getProperty( "java.vendor" ) ); + infoStream.printf( "Java home: %s%n", System.getProperty( "java.home" ) ); + infoStream.printf( "Default locale: %s_%s, platform encoding: %s%n", + System.getProperty( "user.language" ), + System.getProperty( "user.country" ), + 
System.getProperty( "sun.jnu.encoding" ) ); + infoStream.printf( "OS name: \"%s\", version: \"%s\", arch: \"%s\", family: \"%s\"%n", + System.getProperty( "os.name" ), + System.getProperty( "os.version" ), + System.getProperty( "os.arch" ), + getOsFamily() ); + } + + private static final String getOsFamily() { + String osName = System.getProperty( "os.name" ).toLowerCase(); + String pathSep = System.getProperty( "path.separator" ); + + if (osName.contains("windows")) { + return "windows"; + } else if (osName.contains("os/2")) { + return "os/2"; + } else if (osName.contains("z/os") || osName.contains("os/390")) { + return "z/os"; + } else if (osName.contains("os/400")) { + return "os/400"; + } else if (pathSep.equals( ";" )) { + return "dos"; + } else if (osName.contains("mac")) { + if (osName.endsWith("x")) { + return "mac"; // MACOSX + } + return "unix"; + } else if (osName.contains("nonstop_kernel")) { + return "tandem"; + } else if (osName.contains("openvms")) { + return "openvms"; + } else if (pathSep.equals(":")) { + return "unix"; + } + + return "undefined"; + } + + private static File parsePluginDirOption(String[] args) { + int optionIndex = -1; + for(int i = 0; i < args.length; i++) { + if("--plugins-dir".equals(args[i])) { + optionIndex = i; + } + } + if(optionIndex == -1) return null; + + if(optionIndex == args.length - 1) { + throw new IllegalArgumentException("Missing argument for --plugins-dir option."); + } + final File pluginsDir = new File( args[optionIndex + 1] ); + if( ! 
pluginsDir.isDirectory() ) { + throw new IllegalArgumentException("Expected a directory for --plugins-dir option value."); + } + return pluginsDir; + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java ---------------------------------------------------------------------- diff --git a/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java new file mode 100644 index 0000000..7fde887 --- /dev/null +++ b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.apache.any23.vocab.RDFSchemaUtils; +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFWriterRegistry; +import org.eclipse.rdf4j.rio.Rio; + +import com.beust.jcommander.IStringConverter; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; + +/** + * Prints out the vocabulary <i>RDFSchema</i> as <i>NQuads</i>. 
+ * + * @author Michele Mostarda ([email protected]) + */ +@Parameters(commandNames = { "vocab" }, commandDescription = "Prints out the RDF Schema of the vocabularies used by Any23.") +public class VocabPrinter implements Tool { + + @Parameter(names = { "-f", "--format" }, description = "Vocabulary output format", converter = RDFFormatConverter.class) + private RDFFormat format = RDFFormat.NQUADS; + + public void run() throws Exception { + RDFSchemaUtils.serializeVocabularies(format, System.out); + } + + public static final class RDFFormatConverter implements + IStringConverter<RDFFormat> { + + @Override + public RDFFormat convert(String value) { + return RDFWriterRegistry.getInstance().getFileFormatForMIMEType(value).orElseThrow(Rio.unsupportedFormat(value)); + } + + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/package-info.java ---------------------------------------------------------------------- diff --git a/cli/src/main/java/org/apache/any23/cli/package-info.java b/cli/src/main/java/org/apache/any23/cli/package-info.java new file mode 100644 index 0000000..40ae928 --- /dev/null +++ b/cli/src/main/java/org/apache/any23/cli/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains some command-line utilities which allow users + * to use the main <i>Any23</i> features via <i>commandline</i> shell. + */ +package org.apache.any23.cli; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java b/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java new file mode 100644 index 0000000..98616ba --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.junit.Test; + +/** + * Test case for {@link ExtractorDocumentation} CLI. 
+ * + * @author Michele Mostarda ([email protected]) + */ +public class ExtractorDocumentationTest extends ToolTestBase { + + private static final String TARGET_EXTRACTOR = "html-microdata"; + + public ExtractorDocumentationTest() { + super(ExtractorDocumentation.class); + } + + @Test + public void testList() throws Exception { + runToolCheckExit0("--list"); + } + + @Test + public void testAll() throws Exception { + runToolCheckExit0("--all"); + } + + //@Ignore("no available example") + @Test + public void testExampleInput() throws Exception { + runToolCheckExit0("-i", TARGET_EXTRACTOR); + } + + //@Ignore("no available example") + @Test + public void testExampleOutput() throws Exception { + runToolCheckExit0("-o", TARGET_EXTRACTOR); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java new file mode 100644 index 0000000..a80e729 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.junit.Ignore; +import org.junit.Test; + +/** + * Test case for {@link MicrodataParser} CLI. + * + * @author Michele Mostarda ([email protected]) + */ +public class MicrodataParserTest extends ToolTestBase { + + public MicrodataParserTest() { + super(MicrodataParser.class); + } + + @Test + public void testRunOnFile() throws Exception { + runToolCheckExit0("file:"+copyResourceToTempFile("/microdata/microdata-nested.html").getAbsolutePath()); + } + + @Ignore("ANY23-140 - Revise Any23 tests to remove fetching of web content") + @Test + public void testRunOnHTTPResource() throws Exception { + runToolCheckExit0("http://www.imdb.com/title/tt1375666/"); + } + + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java new file mode 100644 index 0000000..3894d32 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.junit.Test; + +/** + * Test case for {@link MimeDetector} CLI. + * + * @author Michele Mostarda ([email protected]) + */ +public class MimeDetectorTest extends ToolTestBase { + + public MimeDetectorTest() { + super(MimeDetector.class); + } + + @Test + public void testDetectURL() throws Exception { + assumeOnlineAllowed(); + runToolCheckExit0("http://twitter.com#micmos"); + } + + @Test + public void testDetectFile() throws Exception { + assumeOnlineAllowed(); + runToolCheckExit0("file://"+copyResourceToTempFile("/application/trix/test1.trx").getAbsolutePath()); + } + + @Test + public void testDetectInline() throws Exception { + assumeOnlineAllowed(); + runToolCheckExit0( new String[] {"inline://<http://s> <http://p> <http://o> ."} ); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java b/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java new file mode 100644 index 0000000..bdee9ae --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.junit.Test; + +/** + * Test case for {@link PluginVerifier} CLI. + * + * @author Michele Mostarda ([email protected]) + */ +public class PluginVerifierTest extends ToolTestBase { + + public PluginVerifierTest() { + super(PluginVerifier.class); + } + + @Test + public void testRun() throws Exception { + runToolCheckExit0("."); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/RoverTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/RoverTest.java b/cli/src/test/java/org/apache/any23/cli/RoverTest.java new file mode 100644 index 0000000..893220a --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/RoverTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.util.FileUtils; +import org.apache.any23.util.StringUtils; +import org.apache.any23.util.URLUtils; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Ignore; +import org.junit.Test; +import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.rio.RDFFormat; + +import java.io.File; +import java.util.Arrays; + +/** + * Test case for {@link Rover}. + * + * @author Michele Mostarda ([email protected]) + */ +@Ignore("Twitter microdata not parsing correctly right now") +public class RoverTest extends ToolTestBase { + + private static final String[] TARGET_FILES = { + "/microdata/microdata-nested.html", + "/org/apache/any23/extractor/csv/test-semicolon.csv" + }; + + private static final String[] TARGET_URLS = { + "http://twitter.com/micmos", + "http://twitter.com/dpalmisano" + }; + + public RoverTest() { + super(Rover.class); + } + + @Test + public void testRunMultiFiles() throws Exception { + + String[] copiedTargets = new String[TARGET_FILES.length]; + for(int i = 0; i < TARGET_FILES.length; i++) + { + File tempFile = copyResourceToTempFile(TARGET_FILES[i]); + + copiedTargets[i] = tempFile.getAbsolutePath(); + } + + runWithMultiSourcesAndVerify(copiedTargets, 0); + } + + @Test + public void testRunWithDefaultNS() throws Exception { + final String DEFAULT_GRAPH = "http://test/default/ns"; + final File outFile = File.createTempFile("rover-test", "out", tempDirectory); + final int exitCode = runTool( + String.format( + "-o %s -f nquads -p -n %s -d %s", + outFile.getAbsolutePath(), + copyResourceToTempFile("/cli/rover-test1.nq").getAbsolutePath(), + DEFAULT_GRAPH + ) + ); + + Assert.assertEquals("Unexpected exit code.", 0, exitCode); + Assert.assertTrue(outFile.exists()); + final String fileContent = FileUtils.readFileContent(outFile); + 
final String[] lines = fileContent.split("\\n"); + int graphCounter = 0; + for(String line : lines) { + if(line.contains(DEFAULT_GRAPH)) { + graphCounter++; + } + } + Assert.assertEquals(0, graphCounter); + } + + /* BEGIN: online tests. */ + + @Test + public void testRunMultiURLs() throws Exception { + // Assuming first accessibility to remote resources. + assumeOnlineAllowed(); + for(String targetURL : TARGET_URLS) { + Assume.assumeTrue( URLUtils.isOnline(targetURL) ); + } + + runWithMultiSourcesAndVerify(TARGET_URLS, 0); + } + + private void runWithMultiSourcesAndVerify(String[] targets, int expectedExit) throws Exception { + final File outFile = File.createTempFile("rover-test", "out", tempDirectory); + final File logFile = File.createTempFile("rover-test", "log", tempDirectory); + + final int exitCode = runTool( + String.format( + "-o %s -f nquads -l %s -p -n %s", + outFile.getAbsolutePath(), + logFile.getAbsolutePath(), + StringUtils.join(" ", targets) + ) + ); + Assert.assertEquals("Unexpected exit code.", expectedExit, exitCode); + + Assert.assertTrue(outFile.exists()); + Assert.assertTrue(logFile.exists()); + + final String logFileContent = FileUtils.readFileContent(logFile); + Assert.assertEquals( + "Unexpected number of log lines.", + targets.length + 1, // Header line. 
+ StringUtils.countNL(logFileContent) + ); + + final String outNQuads = FileUtils.readFileContent(outFile); + final Statement[] statements = RDFUtils.parseRDF(RDFFormat.NQUADS, outNQuads); + System.out.println(Arrays.toString(statements)); + Assert.assertTrue("Unexpected number of statements.", statements.length > 9); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java b/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java new file mode 100644 index 0000000..881a782 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import junit.framework.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import static org.junit.Assert.assertTrue; + +/** + * Test case for {@link ToolRunner}. 
+ * + * @author Michele Mostarda ([email protected]) + */ +public class ToolRunnerTest { + + private final Set<Class<? extends Tool>> coreTools = new HashSet<Class<? extends Tool>>(){{ + add(ExtractorDocumentation.class); + add(MicrodataParser.class); + add(MimeDetector.class); + add(PluginVerifier.class); + add(Rover.class); + add(VocabPrinter.class); + }}; + + @Test + public void testGetToolsInClasspath() throws IOException { + Iterator<Tool> tools = new ToolRunner().getToolsInClasspath(); + assertTrue("No core tools have been detected", tools.hasNext()); + while (tools.hasNext()) { + assertTrue("Some core tools have not been detected.", coreTools.contains(tools.next().getClass())); + } + } + + @Test + public void testGetVersion() throws Exception { + Assert.assertEquals(0, new ToolRunner().execute("-v") ); + } + + @Test + public void testGetHelp() throws Exception { + Assert.assertEquals(0, new ToolRunner().execute("-h") ); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java new file mode 100644 index 0000000..fef49cd --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import com.beust.jcommander.Parameters; +import org.apache.any23.Any23OnlineTestBase; + +import java.util.Arrays; + +import static java.lang.String.format; +import static org.junit.Assert.assertEquals; + +/** + * Base class for <i>CLI</i> related tests. + * + * @author Michele Mostarda ([email protected]) + */ +// TODO: improve support for Tool testing, intercept i/o streams. +public abstract class ToolTestBase extends Any23OnlineTestBase { + + public static final String TOOL_RUN_METHOD = "run"; + + private final Class<? extends Tool> toolClazz; + + protected ToolTestBase(Class<? extends Tool> tool) { + if (tool == null) throw new NullPointerException(); + toolClazz = tool; + } + + /** + * Runs the underlying tool. + * + * @param args tool arguments. + * @return the tool exit code. + * @throws Exception + */ + protected int runTool(String... args) throws Exception { + final String commandName = toolClazz.getAnnotation( Parameters.class ).commandNames()[0]; + + final String[] enhancedArgs = new String[args.length + 1]; + enhancedArgs[0] = commandName; + System.arraycopy( args, 0, enhancedArgs, 1, args.length ); + + return new ToolRunner().execute( enhancedArgs ); + } + + /** + * Runs the underlying tool. + * + * @param args args tool arguments. + * @return the tool exit code. + * @throws Exception + */ + protected int runTool(String args) throws Exception { + return runTool(args.split(" ")); + } + + /** + * Runs the underlying tool and verify the exit code to <code>0</code>. + * + * @param args tool arguments. 
+ * @throws Exception + */ + protected void runToolCheckExit0(String... args) throws Exception { + assertEquals( + format( + "Unexpected exit code for tool [%s] invoked with %s", + toolClazz.getSimpleName(), + Arrays.asList(args) + ), + 0, + runTool(args) + ); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java b/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java new file mode 100644 index 0000000..1c841dc --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.cli; + +import org.junit.Test; + +/** + * Test case for {@link VocabPrinter} CLI. 
+ * + * @author Michele Mostarda ([email protected]) + */ +public class VocabPrinterTest extends ToolTestBase { + + public VocabPrinterTest() { + super(VocabPrinter.class); + } + + @Test + public void testRun() throws Exception { + runToolCheckExit0(); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java b/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java deleted file mode 100644 index 9a0410b..0000000 --- a/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.any23.cli; - -import com.beust.jcommander.Parameter; -import com.beust.jcommander.Parameters; -import org.apache.any23.extractor.ExampleInputOutput; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.extractor.Extractor; -import org.apache.any23.extractor.ExtractorRegistryImpl; -import org.apache.any23.extractor.Extractor.BlindExtractor; -import org.apache.any23.extractor.Extractor.ContentExtractor; -import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor; -import org.apache.any23.extractor.ExtractorFactory; -import org.apache.any23.extractor.ExtractorRegistry; - -import java.io.IOException; -import java.util.LinkedList; -import java.util.List; - -/** - * This class provides some command-line documentation - * about available extractors and their usage. - */ -@Parameters( commandNames = { "extractor" }, commandDescription= "Utility for obtaining documentation about metadata extractors.") -public class ExtractorDocumentation implements Tool { - - @Parameter( names = { "-l", "--list" }, description = "shows the names of all available extractors" ) - private boolean showList; - - @Parameter( names = { "-i", "--input" }, description = "shows example input for the given extractor" ) - private boolean showInput; - - @Parameter( names = { "-o", "--outut" }, description = "shows example output for the given extractor" ) - private boolean showOutput; - - @Parameter( names = { "-a", "--all" }, description = "shows a report about all available extractors" ) - private boolean showAll; - - @Parameter( arity = 1, description = "Extractor name" ) - private List<String> extractor = new LinkedList<String>(); - - public void run() throws Exception { - if (showList) { - printExtractorList(ExtractorRegistryImpl.getInstance()); - } else if (showInput) { - if (extractor.isEmpty()) { - throw new IllegalArgumentException("Required argument for -i: extractor name"); - } - - printExampleInput(extractor.get(0), 
ExtractorRegistryImpl.getInstance()); - } else if (showOutput) { - if (extractor.isEmpty()) { - throw new IllegalArgumentException("Required argument for -o: extractor name"); - } - - printExampleOutput(extractor.get(0), ExtractorRegistryImpl.getInstance()); - } else if (showAll) { - printReport(ExtractorRegistryImpl.getInstance()); - } - } - - /** - * Print an error message. - * - * @param msg the error message to be printed - */ - public void printError(String msg) { - System.err.println(msg); - } - - /** - * Prints the list of all the available extractors. - * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} - * containing all extractors - */ - public void printExtractorList(ExtractorRegistry registry) { - for (ExtractorFactory factory : registry.getExtractorGroup()) { - System.out.println( String.format("%25s [%15s]", factory.getExtractorName(), factory.getExtractorLabel())); - } - } - - /** - * Prints an example of input for the provided extractor. - * - * @param extractorName the name of the extractor - * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} - * containing all extractors - * @throws IOException raised if no extractor is found with that name - */ - public void printExampleInput(String extractorName, ExtractorRegistry registry) throws IOException { - ExtractorFactory<?> factory = getFactory(registry, extractorName); - ExampleInputOutput example = new ExampleInputOutput(factory); - String input = example.getExampleInput(); - if (input == null) { - throw new IllegalArgumentException("Extractor " + extractorName + " provides no example input"); - } - System.out.println(input); - } - - /** - * Prints an output example for the given extractor. 
- * - * @param extractorName the extractor name - * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} - * containing all extractors - * @throws IOException raised if no extractor is found with that name - * @throws ExtractionException if there is an error duing extraction - */ - public void printExampleOutput(String extractorName, ExtractorRegistry registry) throws IOException, ExtractionException { - ExtractorFactory<?> factory = getFactory(registry, extractorName); - ExampleInputOutput example = new ExampleInputOutput(factory); - String output = example.getExampleOutput(); - if (output == null) { - throw new IllegalArgumentException("Extractor " + extractorName + " provides no example output"); - } - System.out.println(output); - } - - /** - * Prints a complete report on all the available extractors. - * - * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} - * containing all extractors - * @throws IOException raised if no extractor is found with that name - * @throws ExtractionException if there is an error duing extraction - */ - public void printReport(ExtractorRegistry registry) throws IOException, ExtractionException { - for (String extractorName : registry.getAllNames()) { - ExtractorFactory<?> factory = registry.getFactory(extractorName); - ExampleInputOutput example = new ExampleInputOutput(factory); - System.out.println("Extractor: " + extractorName); - System.out.println("\ttype: " + getType(factory)); - System.out.println(); - final String exampleInput = example.getExampleInput(); - if(exampleInput == null) { - System.out.println("(No Example Available)"); - } else { - System.out.println("-------- Example Input --------"); - System.out.println(exampleInput); - System.out.println("-------- Example Output --------"); - String output = example.getExampleOutput(); - System.out.println(output == null || output.trim().length() == 0 ? 
"(No Output Generated)" : output); - } - System.out.println("================================"); - System.out.println(); - } - } - - private ExtractorFactory<?> getFactory(ExtractorRegistry registry, String name) { - if (!registry.isRegisteredName(name)) { - throw new IllegalArgumentException("Unknown extractor name: " + name); - } - return registry.getFactory(name); - } - - private String getType(ExtractorFactory<?> factory) { - Extractor<?> extractor = factory.createExtractor(); - if (extractor instanceof BlindExtractor) { - return BlindExtractor.class.getSimpleName(); - } - if (extractor instanceof TagSoupDOMExtractor) { - return TagSoupDOMExtractor.class.getSimpleName(); - } - if (extractor instanceof ContentExtractor) { - return ContentExtractor.class.getSimpleName(); - } - return "?"; - } - -}
