This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3226 by this push:
     new 4dc5a47  TIKA-3226 -- update FileSystemFetcher and add first steps towards a jdbc fetcher
4dc5a47 is described below

commit 4dc5a47ca1a70818d1fdbd172937a4089c55670e
Author: tballison <[email protected]>
AuthorDate: Thu Jan 14 17:07:51 2021 -0500

    TIKA-3226 -- update FileSystemFetcher and add first steps towards a jdbc fetcher
---
 .../org/apache/tika/fetcher/FileSystemFetcher.java |  36 +++++-
 tika-fetchers/jdbc-fetcher/pom.xml                 | 122 +++++++++++++++++++++
 .../org/apache/tika/fetcher/jdbc/JDBCFetcher.java  | 100 +++++++++++++++++
 tika-fetchers/pom.xml                              |   1 +
 4 files changed, 253 insertions(+), 6 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
index 5060d35..41074db 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -16,13 +16,16 @@
  */
 package org.apache.tika.fetcher;
 
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Collections;
@@ -31,9 +34,8 @@ import java.util.Set;
 public class FileSystemFetcher implements Fetcher {
 
     private static String PREFIX = "file";
-    public static Property PATH_BASE = Property.externalText(PREFIX + ":base");
     private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
-
+    private Path basePath = null;
     @Override
     public Set<String> getSupportedPrefixes() {
         return SUPPORTED;
@@ -43,13 +45,35 @@ public class FileSystemFetcher implements Fetcher {
     public InputStream fetch(String fetchString, Metadata metadata)
             throws IOException, TikaException {
         FetchPrefixKeyPair fetchPrefixKeyPair = FetchPrefixKeyPair.create(fetchString);
-        String base = metadata.get(PATH_BASE);
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fetchPrefixKeyPair.getKey());
         Path p = null;
-        if (base != null) {
-            p = Paths.get(base).resolve(fetchPrefixKeyPair.getKey());
+        if (basePath != null) {
+            p = basePath.resolve(fetchPrefixKeyPair.getKey());
+            if (!Files.isRegularFile(p)) {
+                if (!Files.isDirectory(basePath)) {
+                    throw new IOException("BasePath is not a directory: "+basePath);
+                } else {
+                    throw new FileNotFoundException(p.toAbsolutePath().toString());
+                }
+            }
         } else {
             p = Paths.get(fetchPrefixKeyPair.getKey());
+            if (!Files.isRegularFile(p)) {
+                throw new FileNotFoundException(p.toAbsolutePath().toString());
+            }
         }
         return TikaInputStream.get(p, metadata);
     }
+
+    /**
+     * If clients will send in relative paths, this
+     * must be set to allow this fetcher to fetch the
+     * full path.
+     *
+     * @param basePath
+     */
+    @Field
+    public void setBasePath(String basePath) {
+        this.basePath = Paths.get(basePath);
+    }
 }
diff --git a/tika-fetchers/jdbc-fetcher/pom.xml b/tika-fetchers/jdbc-fetcher/pom.xml
new file mode 100644
index 0000000..06f79d5
--- /dev/null
+++ b/tika-fetchers/jdbc-fetcher/pom.xml
@@ -0,0 +1,122 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>tika-fetchers</artifactId>
+        <groupId>org.apache.tika</groupId>
+        <version>2.0.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>jdbc-fetcher</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.h2database</groupId>
+            <artifactId>h2</artifactId>
+            <version>${h2.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            <Automatic-Module-Name>org.apache.tika.fetcher.jdbc</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
+                                </transformer>
+
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    <file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    <file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+
+</project>
\ No newline at end of file
diff --git a/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java b/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
new file mode 100644
index 0000000..b6dd4f0
--- /dev/null
+++ b/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher.jdbc;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.Fetcher;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class JDBCFetcher implements Fetcher, Initializable {
+
+    private int requestPoolSize = 10;
+    private String table;
+    private List<String> metadataColumns = new ArrayList<>();
+    private String binaryField;
+    private String connectionString;
+
+
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //TODO: init prepared
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<String> getSupportedPrefixes() {
+        return null;
+    }
+
+    @Override
+    public InputStream fetch(String fetchString, Metadata metadata) throws TikaException, IOException {
+        return null;
+    }
+
+    /**
+     * The prepared inserts are pooled. The default is 10.
+     * This sets how big this pool should be.
+     * @param requestPoolSize
+     */
+    @Field
+    public void setRequestPoolSize(int requestPoolSize) {
+        this.requestPoolSize = requestPoolSize;
+    }
+
+    @Field
+    public void setJDBCConnectionString(String connectionString) {
+        this.connectionString = connectionString;
+    }
+
+    @Field
+    public void setTable(String table) {
+        this.table = table;
+    }
+
+    @Field
+    public void setMetadataFields(List<String> cols) {
+
+    }
+
+    /**
+     * If there's a blob or text field that you
+     * want Tika to parse.
+     *
+     * This is optional.
+     * @param binaryField
+     */
+    @Field
+    public void setBinaryField(String binaryField) {
+
+    }
+}
diff --git a/tika-fetchers/pom.xml b/tika-fetchers/pom.xml
index 61e0963..113caae 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-fetchers/pom.xml
@@ -35,6 +35,7 @@
 
     <modules>
         <module>s3-fetcher</module>
+        <module>jdbc-fetcher</module>
     </modules>
 
 </project>
\ No newline at end of file

Reply via email to