This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3226 by this push:
new 1d12e57 TIKA-3226 -- add emitter interface and stub for SolrEmitter;
rename packages to be consistent with tika-parsers submodules
1d12e57 is described below
commit 1d12e57a93f23c4845a46b97e3ffb2ffe9382b7f
Author: tballison <[email protected]>
AuthorDate: Fri Jan 15 14:30:27 2021 -0500
TIKA-3226 -- add emitter interface and stub for SolrEmitter;
rename packages to be consistent with tika-parsers submodules
---
pom.xml | 1 +
.../main/java/org/apache/tika/emitter/Emitter.java | 19 +++++
.../apache/tika/emitter/TikaEmitterException.java | 9 +++
.../org/apache/tika/fetcher/FileSystemFetcher.java | 2 +-
.../apache/tika/metadata/TikaCoreProperties.java | 14 +++-
{tika-fetchers => tika-emitters}/pom.xml | 9 ++-
.../tika-emitter-fs}/pom.xml | 27 ++++---
.../apache/tika/emitter/fs/FileSystemEmitter.java | 79 ++++++++++++++++++++
.../tika-emitter-solr}/pom.xml | 20 ++---
.../org/apache/tika/emitter/solr/SolrEmitter.java | 85 ++++++++++++++++++++++
tika-fetchers/pom.xml | 4 +-
.../{jdbc-fetcher => tika-fetcher-jdbc}/pom.xml | 0
.../org/apache/tika/fetcher/jdbc/JDBCFetcher.java | 0
.../{s3-fetcher => tika-fetcher-s3}/pom.xml | 15 ----
.../java/org/apache/tika/fetcher/s3/S3Fetcher.java | 0
.../org/apache/tika/fetcher/s3/TestS3Fetcher.java | 0
.../src/test/resources/tika-config-s3.xml | 0
17 files changed, 240 insertions(+), 44 deletions(-)
diff --git a/pom.xml b/pom.xml
index aa1e6b9..0547122 100644
--- a/pom.xml
+++ b/pom.xml
@@ -51,6 +51,7 @@
<module>tika-example</module>
<module>tika-java7</module>
<module>tika-eval</module>
+ <module>tika-emitters</module>
</modules>
<profiles>
diff --git a/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java b/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
new file mode 100644
index 0000000..aa8ee55
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
@@ -0,0 +1,19 @@
+package org.apache.tika.emitter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.util.List;
+
+public interface Emitter {
+
+ String getName();
+
+ //TODO: do we need a key or can we pass that in metadatalist?
+ //If we do need it, how do we populate it?
+ void emit(List<Metadata> metadataList) throws IOException, TikaException;
+ //TODO we can add this later?
+ //void emit(String txt, Metadata metadata) throws IOException, TikaException;
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/emitter/TikaEmitterException.java b/tika-core/src/main/java/org/apache/tika/emitter/TikaEmitterException.java
new file mode 100644
index 0000000..6926b78
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/emitter/TikaEmitterException.java
@@ -0,0 +1,9 @@
+package org.apache.tika.emitter;
+
+import org.apache.tika.exception.TikaException;
+
+public class TikaEmitterException extends TikaException {
+ public TikaEmitterException(String msg) {
+ super(msg);
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
index 41074db..83a6677 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -45,7 +45,7 @@ public class FileSystemFetcher implements Fetcher {
public InputStream fetch(String fetchString, Metadata metadata)
throws IOException, TikaException {
FetchPrefixKeyPair fetchPrefixKeyPair = FetchPrefixKeyPair.create(fetchString);
- metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fetchPrefixKeyPair.getKey());
+ metadata.set(TikaCoreProperties.SOURCE_PATH, fetchPrefixKeyPair.getKey());
Path p = null;
if (basePath != null) {
p = basePath.resolve(fetchPrefixKeyPair.getKey());
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 29e80fd..a8e804a 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -113,12 +113,22 @@ public interface TikaCoreProperties {
/**
* Some file formats can store information about their original
- * file name/location or about their attachment's original file name/location.
+ * file name/location or about their attachment's original file name/location
+ * within the file.
*/
- public static final Property ORIGINAL_RESOURCE_NAME =
+ Property ORIGINAL_RESOURCE_NAME =
Property.internalTextBag(TIKA_META_PREFIX+"origResourceName");
/**
+ * This should be used to store the path (relative or full)
+ * of the source file, including the file name,
+ * e.g. doc/path/to/my_pdf.pdf
+ *
+ * This can also be used for a primary key within a database.
+ */
+ Property SOURCE_PATH =
+ Property.internalText(TIKA_META_PREFIX+"sourcePath");
+ /**
* This is currently used to identify Content-Type that may be
* included within a document, such as in html documents
* (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)
diff --git a/tika-fetchers/pom.xml b/tika-emitters/pom.xml
similarity index 88%
copy from tika-fetchers/pom.xml
copy to tika-emitters/pom.xml
index 113caae..21eed2b 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-emitters/pom.xml
@@ -28,14 +28,15 @@
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-fetchers</artifactId>
+ <artifactId>tika-emitters</artifactId>
<packaging>pom</packaging>
- <name>Apache Tika fetchers</name>
+ <name>Apache Tika emitters</name>
<url>http://tika.apache.org/</url>
<modules>
- <module>s3-fetcher</module>
- <module>jdbc-fetcher</module>
+ <module>tika-emitter-fs</module>
+ <module>tika-emitter-solr</module>
</modules>
+
</project>
\ No newline at end of file
diff --git a/tika-fetchers/pom.xml b/tika-emitters/tika-emitter-fs/pom.xml
similarity index 64%
copy from tika-fetchers/pom.xml
copy to tika-emitters/tika-emitter-fs/pom.xml
index 113caae..6c5c2cf 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-emitters/tika-emitter-fs/pom.xml
@@ -21,21 +21,28 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
+ <artifactId>tika-emitters</artifactId>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
<version>2.0.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-fetchers</artifactId>
- <packaging>pom</packaging>
- <name>Apache Tika fetchers</name>
- <url>http://tika.apache.org/</url>
+ <artifactId>tika-emitter-fs</artifactId>
- <modules>
- <module>s3-fetcher</module>
- <module>jdbc-fetcher</module>
- </modules>
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <!-- should serialization be provided or bundled? -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-serialization</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
</project>
\ No newline at end of file
diff --git a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
new file mode 100644
index 0000000..5c5016b
--- /dev/null
+++ b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
@@ -0,0 +1,79 @@
+package org.apache.tika.emitter.fs;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.emitter.Emitter;
+import org.apache.tika.emitter.TikaEmitterException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+public class FileSystemEmitter implements Emitter {
+
+ private String name = "fs";
+ private Path basePath = null;
+ private String fileExtension = "json";
+
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public void emit(List<Metadata> metadataList) throws IOException, TikaException {
+ Path output;
+ if (metadataList == null || metadataList.size() == 0) {
+ throw new TikaEmitterException("metadata list must not be null or of size 0");
+ }
+
+ String relPath = metadataList.get(0)
+ .get(TikaCoreProperties.SOURCE_PATH);
+
+ if (basePath != null) {
+ output = basePath.resolve(relPath);
+ } else {
+ output = Paths.get(relPath);
+ }
+
+ if (!Files.isDirectory(output.getParent())) {
+ Files.createDirectories(output.getParent());
+ }
+ try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
+ JsonMetadataList.toJson(metadataList, writer);
+ }
+ }
+
+ @Field
+ public void setBasePath(Path basePath) {
+ this.basePath = basePath;
+ }
+
+ /**
+ * If you want to customize the output file's file extension.
+ * Do not include the "."
+ * @param fileExtension
+ */
+ @Field
+ public void setFileExtension(String fileExtension) {
+ this.fileExtension = fileExtension;
+ }
+
+ /**
+ * Set this so to uniquely identify this emitter if
+ * there might be others available. The default is "fs"
+ * @param name
+ */
+ @Field
+ public void setName(String name) {
+ this.name = name;
+ }
+}
diff --git a/tika-fetchers/pom.xml b/tika-emitters/tika-emitter-solr/pom.xml
similarity index 76%
copy from tika-fetchers/pom.xml
copy to tika-emitters/tika-emitter-solr/pom.xml
index 113caae..8ee76af 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-emitters/tika-emitter-solr/pom.xml
@@ -21,21 +21,21 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
+ <artifactId>tika-emitters</artifactId>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
<version>2.0.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-fetchers</artifactId>
- <packaging>pom</packaging>
- <name>Apache Tika fetchers</name>
- <url>http://tika.apache.org/</url>
+ <artifactId>tika-emitter-solr</artifactId>
- <modules>
- <module>s3-fetcher</module>
- <module>jdbc-fetcher</module>
- </modules>
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
</project>
\ No newline at end of file
diff --git a/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java b/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
new file mode 100644
index 0000000..9f73cc8
--- /dev/null
+++ b/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
@@ -0,0 +1,85 @@
+package org.apache.tika.emitter.solr;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.emitter.Emitter;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+public class SolrEmitter implements Emitter, Initializable {
+
+ private String name = "solr";
+ boolean collapseEmbeddedFiles = false;
+ private String url;
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public void emit(List<Metadata> metadataList) throws IOException,
+ TikaException {
+
+ }
+
+ /**
+ * If set to true, this concatenates text from all embedded files
+ * with the primary document's text but throws out the metadata
+ * from the embedded files.
+ *
+ * If set to false (default), the SolrEmitter will emit attachments
+ * as "children" of the parent.
+ *
+ * @param collapseEmbeddedFiles
+ */
+ @Field
+ public void setCollapseEmbeddedFiles(boolean collapseEmbeddedFiles) {
+ this.collapseEmbeddedFiles = collapseEmbeddedFiles;
+ }
+
+ @Field
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ /**
+ * Specify the url for Solr
+ * @param url
+ */
+ @Field
+ public void setSolrUrl(String url) {
+ this.url = url;
+ }
+
+ //TODO: add username/password for authentication?
+
+ /**
+ * Specify the field in the first Metadata that should be
+ * used as the id field for the document.
+ *
+ * @param idField
+ */
+ @Field
+ public void setIdField(String idField) {
+
+ }
+
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {
+ //TODO: build the client here
+
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
+
+ }
+}
diff --git a/tika-fetchers/pom.xml b/tika-fetchers/pom.xml
index 113caae..833a56f 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-fetchers/pom.xml
@@ -34,8 +34,8 @@
<url>http://tika.apache.org/</url>
<modules>
- <module>s3-fetcher</module>
- <module>jdbc-fetcher</module>
+ <module>tika-fetcher-jdbc</module>
+ <module>tika-fetcher-s3</module>
</modules>
</project>
\ No newline at end of file
diff --git a/tika-fetchers/jdbc-fetcher/pom.xml b/tika-fetchers/tika-fetcher-jdbc/pom.xml
similarity index 100%
rename from tika-fetchers/jdbc-fetcher/pom.xml
rename to tika-fetchers/tika-fetcher-jdbc/pom.xml
diff --git a/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java b/tika-fetchers/tika-fetcher-jdbc/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
similarity index 100%
rename from tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
rename to tika-fetchers/tika-fetcher-jdbc/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
diff --git a/tika-fetchers/s3-fetcher/pom.xml b/tika-fetchers/tika-fetcher-s3/pom.xml
similarity index 91%
rename from tika-fetchers/s3-fetcher/pom.xml
rename to tika-fetchers/tika-fetcher-s3/pom.xml
index 0247262..2ac83e1 100644
--- a/tika-fetchers/s3-fetcher/pom.xml
+++ b/tika-fetchers/tika-fetcher-s3/pom.xml
@@ -40,10 +40,6 @@
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
- <groupId>com.amazonaws</groupId>
- <artifactId>aws-java-sdk-simpleworkflow</artifactId>
- </exclusion>
- <exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
@@ -60,17 +56,6 @@
<version>${jackson.version}</version>
</dependency>
<dependency>
- <groupId>com.amazonaws</groupId>
- <artifactId>aws-java-sdk-simpleworkflow</artifactId>
- <version>${aws.version}</version>
- <exclusions>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons.logging.version}</version>
diff --git a/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java b/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
similarity index 100%
rename from tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
rename to tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
diff --git a/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java b/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
similarity index 100%
rename from tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
rename to tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
diff --git a/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml b/tika-fetchers/tika-fetcher-s3/src/test/resources/tika-config-s3.xml
similarity index 100%
rename from tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml
rename to tika-fetchers/tika-fetcher-s3/src/test/resources/tika-config-s3.xml