This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3226 by this push:
     new 1d12e57  TIKA-3226 -- add emitter interface and stub for SolrEmitter; rename packages to be consistent with tika-parsers submodules
1d12e57 is described below

commit 1d12e57a93f23c4845a46b97e3ffb2ffe9382b7f
Author: tballison <[email protected]>
AuthorDate: Fri Jan 15 14:30:27 2021 -0500

    TIKA-3226 -- add emitter interface and stub for SolrEmitter;
    rename packages to be consistent with tika-parsers submodules
---
 pom.xml                                            |  1 +
 .../main/java/org/apache/tika/emitter/Emitter.java | 19 +++++
 .../apache/tika/emitter/TikaEmitterException.java  |  9 +++
 .../org/apache/tika/fetcher/FileSystemFetcher.java |  2 +-
 .../apache/tika/metadata/TikaCoreProperties.java   | 14 +++-
 {tika-fetchers => tika-emitters}/pom.xml           |  9 ++-
 .../tika-emitter-fs}/pom.xml                       | 27 ++++---
 .../apache/tika/emitter/fs/FileSystemEmitter.java  | 79 ++++++++++++++++++++
 .../tika-emitter-solr}/pom.xml                     | 20 ++---
 .../org/apache/tika/emitter/solr/SolrEmitter.java  | 85 ++++++++++++++++++++++
 tika-fetchers/pom.xml                              |  4 +-
 .../{jdbc-fetcher => tika-fetcher-jdbc}/pom.xml    |  0
 .../org/apache/tika/fetcher/jdbc/JDBCFetcher.java  |  0
 .../{s3-fetcher => tika-fetcher-s3}/pom.xml        | 15 ----
 .../java/org/apache/tika/fetcher/s3/S3Fetcher.java |  0
 .../org/apache/tika/fetcher/s3/TestS3Fetcher.java  |  0
 .../src/test/resources/tika-config-s3.xml          |  0
 17 files changed, 240 insertions(+), 44 deletions(-)

diff --git a/pom.xml b/pom.xml
index aa1e6b9..0547122 100644
--- a/pom.xml
+++ b/pom.xml
@@ -51,6 +51,7 @@
     <module>tika-example</module>
     <module>tika-java7</module>
     <module>tika-eval</module>
+      <module>tika-emitters</module>
   </modules>
 
   <profiles>
diff --git a/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java b/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
new file mode 100644
index 0000000..aa8ee55
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
@@ -0,0 +1,19 @@
+package org.apache.tika.emitter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.util.List;
+
+public interface Emitter {
+
+    String getName();
+
+    //TODO: do we need a key or can we pass that in metadatalist?
+    //If we do need it, how do we populate it?
+    void emit(List<Metadata> metadataList) throws IOException, TikaException;
+    //TODO we can add this later?
+    //void emit(String txt, Metadata metadata) throws IOException, TikaException;
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/emitter/TikaEmitterException.java b/tika-core/src/main/java/org/apache/tika/emitter/TikaEmitterException.java
new file mode 100644
index 0000000..6926b78
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/emitter/TikaEmitterException.java
@@ -0,0 +1,9 @@
+package org.apache.tika.emitter;
+
+import org.apache.tika.exception.TikaException;
+
+public class TikaEmitterException extends TikaException {
+    public TikaEmitterException(String msg) {
+        super(msg);
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
index 41074db..83a6677 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -45,7 +45,7 @@ public class FileSystemFetcher implements Fetcher {
     public InputStream fetch(String fetchString, Metadata metadata)
             throws IOException, TikaException {
         FetchPrefixKeyPair fetchPrefixKeyPair = FetchPrefixKeyPair.create(fetchString);
-        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fetchPrefixKeyPair.getKey());
+        metadata.set(TikaCoreProperties.SOURCE_PATH, fetchPrefixKeyPair.getKey());
         Path p = null;
         if (basePath != null) {
             p = basePath.resolve(fetchPrefixKeyPair.getKey());
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 29e80fd..a8e804a 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -113,12 +113,22 @@ public interface TikaCoreProperties {
 
     /**
      * Some file formats can store information about their original
-     * file name/location or about their attachment's original file name/location.
+     * file name/location or about their attachment's original file name/location
+     * within the file.
      */
-    public static final Property ORIGINAL_RESOURCE_NAME =
+    Property ORIGINAL_RESOURCE_NAME =
             Property.internalTextBag(TIKA_META_PREFIX+"origResourceName");
 
     /**
+     * This should be used to store the path (relative or full)
+     * of the source file, including the file name,
+     * e.g. doc/path/to/my_pdf.pdf
+     *
+     * This can also be used for a primary key within a database.
+     */
+    Property SOURCE_PATH =
+            Property.internalText(TIKA_META_PREFIX+"sourcePath");
+    /**
      * This is currently used to identify Content-Type that may be
      * included within a document, such as in html documents
      * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)
diff --git a/tika-fetchers/pom.xml b/tika-emitters/pom.xml
similarity index 88%
copy from tika-fetchers/pom.xml
copy to tika-emitters/pom.xml
index 113caae..21eed2b 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-emitters/pom.xml
@@ -28,14 +28,15 @@
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-fetchers</artifactId>
+    <artifactId>tika-emitters</artifactId>
     <packaging>pom</packaging>
-    <name>Apache Tika fetchers</name>
+    <name>Apache Tika emitters</name>
     <url>http://tika.apache.org/</url>
 
     <modules>
-        <module>s3-fetcher</module>
-        <module>jdbc-fetcher</module>
+        <module>tika-emitter-fs</module>
+        <module>tika-emitter-solr</module>
     </modules>
 
+
 </project>
\ No newline at end of file
diff --git a/tika-fetchers/pom.xml b/tika-emitters/tika-emitter-fs/pom.xml
similarity index 64%
copy from tika-fetchers/pom.xml
copy to tika-emitters/tika-emitter-fs/pom.xml
index 113caae..6c5c2cf 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-emitters/tika-emitter-fs/pom.xml
@@ -21,21 +21,28 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
+        <artifactId>tika-emitters</artifactId>
         <groupId>org.apache.tika</groupId>
-        <artifactId>tika-parent</artifactId>
         <version>2.0.0-SNAPSHOT</version>
-        <relativePath>../tika-parent/pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-fetchers</artifactId>
-    <packaging>pom</packaging>
-    <name>Apache Tika fetchers</name>
-    <url>http://tika.apache.org/</url>
+    <artifactId>tika-emitter-fs</artifactId>
 
-    <modules>
-        <module>s3-fetcher</module>
-        <module>jdbc-fetcher</module>
-    </modules>
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <!-- should serialization be provided or bundled? -->
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-serialization</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+    </dependencies>
 
 </project>
\ No newline at end of file
diff --git a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
new file mode 100644
index 0000000..5c5016b
--- /dev/null
+++ b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
@@ -0,0 +1,79 @@
+package org.apache.tika.emitter.fs;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.emitter.Emitter;
+import org.apache.tika.emitter.TikaEmitterException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+public class FileSystemEmitter implements Emitter {
+
+    private String name = "fs";
+    private Path basePath = null;
+    private String fileExtension = "json";
+
+
+    @Override
+    public String getName() {
+        return name;
+    }
+
+    @Override
+    public void emit(List<Metadata> metadataList) throws IOException, TikaException {
+        Path output;
+        if (metadataList == null || metadataList.size() == 0) {
+            throw new TikaEmitterException("metadata list must not be null or of size 0");
+        }
+
+        String relPath = metadataList.get(0)
+                .get(TikaCoreProperties.SOURCE_PATH);
+
+        if (basePath != null) {
+            output = basePath.resolve(relPath);
+        } else {
+            output = Paths.get(relPath);
+        }
+
+        if (!Files.isDirectory(output.getParent())) {
+            Files.createDirectories(output.getParent());
+        }
+        try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
+            JsonMetadataList.toJson(metadataList, writer);
+        }
+    }
+
+    @Field
+    public void setBasePath(Path basePath) {
+        this.basePath = basePath;
+    }
+
+    /**
+     * If you want to customize the output file's file extension.
+     * Do not include the "."
+     * @param fileExtension
+     */
+    @Field
+    public void setFileExtension(String fileExtension) {
+        this.fileExtension = fileExtension;
+    }
+
+    /**
+     * Set this to uniquely identify this emitter if
+     * there might be others available. The default is "fs"
+     * @param name
+     */
+    @Field
+    public void setName(String name) {
+        this.name = name;
+    }
+}
diff --git a/tika-fetchers/pom.xml b/tika-emitters/tika-emitter-solr/pom.xml
similarity index 76%
copy from tika-fetchers/pom.xml
copy to tika-emitters/tika-emitter-solr/pom.xml
index 113caae..8ee76af 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-emitters/tika-emitter-solr/pom.xml
@@ -21,21 +21,21 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
+        <artifactId>tika-emitters</artifactId>
         <groupId>org.apache.tika</groupId>
-        <artifactId>tika-parent</artifactId>
         <version>2.0.0-SNAPSHOT</version>
-        <relativePath>../tika-parent/pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-fetchers</artifactId>
-    <packaging>pom</packaging>
-    <name>Apache Tika fetchers</name>
-    <url>http://tika.apache.org/</url>
+    <artifactId>tika-emitter-solr</artifactId>
 
-    <modules>
-        <module>s3-fetcher</module>
-        <module>jdbc-fetcher</module>
-    </modules>
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+    </dependencies>
 
 </project>
\ No newline at end of file
diff --git a/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java b/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
new file mode 100644
index 0000000..9f73cc8
--- /dev/null
+++ b/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
@@ -0,0 +1,85 @@
+package org.apache.tika.emitter.solr;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.emitter.Emitter;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+public class SolrEmitter implements Emitter, Initializable {
+
+    private String name = "solr";
+    boolean collapseEmbeddedFiles = false;
+    private String url;
+
+    @Override
+    public String getName() {
+        return name;
+    }
+
+    @Override
+    public void emit(List<Metadata> metadataList) throws IOException,
+            TikaException {
+
+    }
+
+    /**
+     * If set to true, this concatenates text from all embedded files
+     * with the primary document's text but throws out the metadata
+     * from the embedded files.
+     *
+     * If set to false (default), the SolrEmitter will emit attachments
+     * as "children" of the parent.
+     *
+     * @param collapseEmbeddedFiles
+     */
+    @Field
+    public void setCollapseEmbeddedFiles(boolean collapseEmbeddedFiles) {
+        this.collapseEmbeddedFiles = collapseEmbeddedFiles;
+    }
+
+    @Field
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Specify the url for Solr
+     * @param url
+     */
+    @Field
+    public void setSolrUrl(String url) {
+        this.url = url;
+    }
+
+    //TODO: add username/password for authentication?
+
+    /**
+     * Specify the field in the first Metadata that should be
+     * used as the id field for the document.
+     *
+     * @param idField
+     */
+    @Field
+    public void setIdField(String idField) {
+
+    }
+
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //TODO: build the client here
+
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
+
+    }
+}
diff --git a/tika-fetchers/pom.xml b/tika-fetchers/pom.xml
index 113caae..833a56f 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-fetchers/pom.xml
@@ -34,8 +34,8 @@
     <url>http://tika.apache.org/</url>
 
     <modules>
-        <module>s3-fetcher</module>
-        <module>jdbc-fetcher</module>
+        <module>tika-fetcher-jdbc</module>
+        <module>tika-fetcher-s3</module>
     </modules>
 
 </project>
\ No newline at end of file
diff --git a/tika-fetchers/jdbc-fetcher/pom.xml b/tika-fetchers/tika-fetcher-jdbc/pom.xml
similarity index 100%
rename from tika-fetchers/jdbc-fetcher/pom.xml
rename to tika-fetchers/tika-fetcher-jdbc/pom.xml
diff --git a/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java b/tika-fetchers/tika-fetcher-jdbc/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
similarity index 100%
rename from tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
rename to tika-fetchers/tika-fetcher-jdbc/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
diff --git a/tika-fetchers/s3-fetcher/pom.xml b/tika-fetchers/tika-fetcher-s3/pom.xml
similarity index 91%
rename from tika-fetchers/s3-fetcher/pom.xml
rename to tika-fetchers/tika-fetcher-s3/pom.xml
index 0247262..2ac83e1 100644
--- a/tika-fetchers/s3-fetcher/pom.xml
+++ b/tika-fetchers/tika-fetcher-s3/pom.xml
@@ -40,10 +40,6 @@
                     <artifactId>commons-logging</artifactId>
                 </exclusion>
                 <exclusion>
-                    <groupId>com.amazonaws</groupId>
-                    <artifactId>aws-java-sdk-simpleworkflow</artifactId>
-                </exclusion>
-                <exclusion>
                     <groupId>com.fasterxml.jackson.core</groupId>
                     <artifactId>jackson-core</artifactId>
                 </exclusion>
@@ -60,17 +56,6 @@
             <version>${jackson.version}</version>
         </dependency>
         <dependency>
-            <groupId>com.amazonaws</groupId>
-            <artifactId>aws-java-sdk-simpleworkflow</artifactId>
-            <version>${aws.version}</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>com.fasterxml.jackson.core</groupId>
-                    <artifactId>jackson-databind</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
             <groupId>commons-logging</groupId>
             <artifactId>commons-logging</artifactId>
             <version>${commons.logging.version}</version>
diff --git a/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java b/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
similarity index 100%
rename from tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
rename to tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
diff --git a/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java b/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
similarity index 100%
rename from tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
rename to tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
diff --git a/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml b/tika-fetchers/tika-fetcher-s3/src/test/resources/tika-config-s3.xml
similarity index 100%
rename from tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml
rename to tika-fetchers/tika-fetcher-s3/src/test/resources/tika-config-s3.xml

Reply via email to