This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3226 by this push:
new 4dc5a47 TIKA-3226 -- update FileSystemFetcher and add first steps
towards a jdbc fetcher
4dc5a47 is described below
commit 4dc5a47ca1a70818d1fdbd172937a4089c55670e
Author: tballison <[email protected]>
AuthorDate: Thu Jan 14 17:07:51 2021 -0500
TIKA-3226 -- update FileSystemFetcher and add first steps towards a jdbc
fetcher
---
.../org/apache/tika/fetcher/FileSystemFetcher.java | 36 +++++-
tika-fetchers/jdbc-fetcher/pom.xml | 122 +++++++++++++++++++++
.../org/apache/tika/fetcher/jdbc/JDBCFetcher.java | 100 +++++++++++++++++
tika-fetchers/pom.xml | 1 +
4 files changed, 253 insertions(+), 6 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
index 5060d35..41074db 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -16,13 +16,16 @@
*/
package org.apache.tika.fetcher;
+import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
@@ -31,9 +34,8 @@ import java.util.Set;
public class FileSystemFetcher implements Fetcher {
private static String PREFIX = "file";
- public static Property PATH_BASE = Property.externalText(PREFIX + ":base");
private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
-
+ private Path basePath = null;
@Override
public Set<String> getSupportedPrefixes() {
return SUPPORTED;
@@ -43,13 +45,35 @@ public class FileSystemFetcher implements Fetcher {
public InputStream fetch(String fetchString, Metadata metadata)
throws IOException, TikaException {
FetchPrefixKeyPair fetchPrefixKeyPair =
FetchPrefixKeyPair.create(fetchString);
- String base = metadata.get(PATH_BASE);
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME,
fetchPrefixKeyPair.getKey());
Path p = null;
- if (base != null) {
- p = Paths.get(base).resolve(fetchPrefixKeyPair.getKey());
+ if (basePath != null) {
+ p = basePath.resolve(fetchPrefixKeyPair.getKey());
+ if (!Files.isRegularFile(p)) {
+ if (!Files.isDirectory(basePath)) {
+ throw new IOException("BasePath is not a directory:
"+basePath);
+ } else {
+ throw new
FileNotFoundException(p.toAbsolutePath().toString());
+ }
+ }
} else {
p = Paths.get(fetchPrefixKeyPair.getKey());
+ if (!Files.isRegularFile(p)) {
+ throw new FileNotFoundException(p.toAbsolutePath().toString());
+ }
}
return TikaInputStream.get(p, metadata);
}
+
+ /**
+ * Sets the base directory against which fetch keys are
+ * resolved. Must be set if clients send relative paths;
+ * when unset, fetch keys are used as paths as-is.
+ *
+ * @param basePath base directory, converted with Paths.get
+ */
+ @Field
+ public void setBasePath(String basePath) {
+ this.basePath = Paths.get(basePath);
+ }
}
diff --git a/tika-fetchers/jdbc-fetcher/pom.xml
b/tika-fetchers/jdbc-fetcher/pom.xml
new file mode 100644
index 0000000..06f79d5
--- /dev/null
+++ b/tika-fetchers/jdbc-fetcher/pom.xml
@@ -0,0 +1,122 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>tika-fetchers</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>2.0.0-SNAPSHOT</version>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>jdbc-fetcher</artifactId>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.h2database</groupId>
+ <artifactId>h2</artifactId>
+ <version>${h2.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+
<Automatic-Module-Name>org.apache.tika.fetcher.jdbc</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+
<mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
+ </transformer>
+
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+
<file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+
<file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
diff --git
a/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
b/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
new file mode 100644
index 0000000..b6dd4f0
--- /dev/null
+++
b/tika-fetchers/jdbc-fetcher/src/main/java/org/apache/tika/fetcher/jdbc/JDBCFetcher.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher.jdbc;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.Fetcher;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class JDBCFetcher implements Fetcher, Initializable {
+ // NOTE(review): work-in-progress stub -- the setters only store configuration; nothing is fetched yet.
+ private int requestPoolSize = 10;
+ private String table;
+ private List<String> metadataColumns = new ArrayList<>();
+ private String binaryField;
+ private String connectionString;
+
+
+ @Override
+ public void initialize(Map<String, Param> params) throws
TikaConfigException {
+ //TODO: init prepared
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler
problemHandler) throws TikaConfigException {
+ //no-op
+ }
+
+ @Override
+ public Set<String> getSupportedPrefixes() {
+ return null; // NOTE(review): stub -- callers receive null rather than a set of prefixes
+ }
+
+ @Override
+ public InputStream fetch(String fetchString, Metadata metadata) throws
TikaException, IOException {
+ return null; // NOTE(review): stub -- no stream is produced for any fetchString
+ }
+
+ /**
+ * Sets the size of the request pool. The default is 10.
+ * NOTE(review): nothing in this class reads the value yet.
+ * @param requestPoolSize the pool size to store on this fetcher
+ */
+ @Field
+ public void setRequestPoolSize(int requestPoolSize) {
+ this.requestPoolSize = requestPoolSize;
+ }
+ /** Stores the JDBC connection string; this class does not yet open a connection with it. */
+ @Field
+ public void setJDBCConnectionString(String connectionString) {
+ this.connectionString = connectionString;
+ }
+ /** Stores the {@code table} setting; it is not yet read anywhere in this class. */
+ @Field
+ public void setTable(String table) {
+ this.table = table;
+ }
+
+ @Field
+ public void setMetadataFields(List<String> cols) {
+ // NOTE(review): empty body -- cols is discarded and metadataColumns stays empty.
+ }
+
+ /**
+ * Names the blob or text field, if any, that you
+ * want Tika to parse.
+ *
+ * This is optional.
+ * @param binaryField name of the field to parse
+ */
+ @Field
+ public void setBinaryField(String binaryField) {
+ // NOTE(review): empty body -- the binaryField member is never assigned.
+ }
+}
diff --git a/tika-fetchers/pom.xml b/tika-fetchers/pom.xml
index 61e0963..113caae 100644
--- a/tika-fetchers/pom.xml
+++ b/tika-fetchers/pom.xml
@@ -35,6 +35,7 @@
<modules>
<module>s3-fetcher</module>
+ <module>jdbc-fetcher</module>
</modules>
</project>
\ No newline at end of file