This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3d0692adfe0f17f698ba298cc5c7f1a01af32bf7
Author: tballison <[email protected]>
AuthorDate: Thu Jan 14 14:39:29 2021 -0500

    TIKA-3226 -- initial fetcher with s3 example
---
 pom.xml                                            |   1 +
 .../java/org/apache/tika/config/TikaConfig.java    | 104 +++++++++++++
 .../org/apache/tika/fetcher/DefaultFetcher.java    |  80 ++++++++++
 .../apache/tika/fetcher/FetchPrefixKeyPair.java    |  53 +++++++
 .../main/java/org/apache/tika/fetcher/Fetcher.java |  41 ++++++
 .../tika/fetcher/FetcherStringException.java       |  29 ++++
 .../org/apache/tika/fetcher/FileSystemFetcher.java |  55 +++++++
 tika-fetchers/pom.xml                              |  40 +++++
 tika-fetchers/s3-fetcher/pom.xml                   | 161 +++++++++++++++++++++
 .../java/org/apache/tika/fetcher/s3/S3Fetcher.java | 124 ++++++++++++++++
 .../org/apache/tika/fetcher/s3/TestS3Fetcher.java  |  63 ++++++++
 .../src/test/resources/tika-config-s3.xml          |  27 ++++
 tika-parent/pom.xml                                |   1 +
 13 files changed, 779 insertions(+)

diff --git a/pom.xml b/pom.xml
index 6a9ef15..aa1e6b9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -37,6 +37,7 @@
   <modules>
     <module>tika-parent</module>
     <module>tika-core</module>
+    <module>tika-fetchers</module>
     <module>tika-parsers</module>
     <module>tika-bundles</module>
     <module>tika-xmp</module>
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index d7fb8b3..20742d8 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -48,6 +48,8 @@ import org.apache.tika.detect.Detector;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.DefaultFetcher;
+import org.apache.tika.fetcher.Fetcher;
 import org.apache.tika.language.translate.DefaultTranslator;
 import org.apache.tika.language.translate.Translator;
 import org.apache.tika.metadata.filter.CompositeMetadataFilter;
@@ -112,6 +114,10 @@ public class TikaConfig {
         return new DefaultMetadataFilter(loader);
     }
 
+    private static Fetcher getDefaultFetcher(ServiceLoader loader) {
+        return new DefaultFetcher(loader);
+    }
+
     //use this to look for unneeded instantiations of TikaConfig
     protected static AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
 
@@ -124,6 +130,7 @@ public class TikaConfig {
     private final ExecutorService executorService;
     private final EncodingDetector encodingDetector;
     private final MetadataFilter metadataFilter;
+    private final Fetcher fetcher;
 
     public TikaConfig(String file)
             throws TikaException, IOException, SAXException {
@@ -190,6 +197,7 @@ public class TikaConfig {
         ExecutorServiceXmlLoader executorLoader = new 
ExecutorServiceXmlLoader();
         EncodingDetectorXmlLoader encodingDetectorXmlLoader = new 
EncodingDetectorXmlLoader();
         MetadataFilterXmlLoader metadataFilterXmlLoader = new 
MetadataFilterXmlLoader();
+        FetcherXmlLoader fetcherXmlLoader = new FetcherXmlLoader();
         updateXMLReaderUtils(element);
         this.mimeTypes = typesFromDomElement(element);
         this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
@@ -200,6 +208,7 @@ public class TikaConfig {
         this.translator = translatorLoader.loadOverall(element, mimeTypes, 
loader);
         this.executorService = executorLoader.loadOverall(element, mimeTypes, 
loader);
         this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, 
mimeTypes, loader);
+        this.fetcher = fetcherXmlLoader.loadOverall(element, mimeTypes, 
loader);
         this.serviceLoader = loader;
         TIMES_INSTANTIATED.incrementAndGet();
     }
@@ -226,6 +235,7 @@ public class TikaConfig {
         this.translator = getDefaultTranslator(serviceLoader);
         this.executorService = getDefaultExecutorService();
         this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
+        this.fetcher = getDefaultFetcher(serviceLoader);
         TIMES_INSTANTIATED.incrementAndGet();
     }
 
@@ -262,6 +272,7 @@ public class TikaConfig {
             this.translator = getDefaultTranslator(serviceLoader);
             this.executorService = getDefaultExecutorService();
             this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
+            this.fetcher = getDefaultFetcher(serviceLoader);
         } else {
             ServiceLoader tmpServiceLoader = new ServiceLoader();
             try (InputStream stream = getConfigInputStream(config, 
tmpServiceLoader)) {
@@ -273,6 +284,7 @@ public class TikaConfig {
                 TranslatorXmlLoader translatorLoader = new 
TranslatorXmlLoader();
                 ExecutorServiceXmlLoader executorLoader = new 
ExecutorServiceXmlLoader();
                 MetadataFilterXmlLoader metadataFilterXmlLoader = new 
MetadataFilterXmlLoader();
+                FetcherXmlLoader fetcherXmlLoader = new FetcherXmlLoader();
 
                 this.mimeTypes = typesFromDomElement(element);
                 this.encodingDetector = 
encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader);
@@ -284,6 +296,7 @@ public class TikaConfig {
                 this.translator = translatorLoader.loadOverall(element, 
mimeTypes, serviceLoader);
                 this.executorService = executorLoader.loadOverall(element, 
mimeTypes, serviceLoader);
                 this.metadataFilter = 
metadataFilterXmlLoader.loadOverall(element, mimeTypes, serviceLoader);
+                this.fetcher = fetcherXmlLoader.loadOverall(element, 
mimeTypes, serviceLoader);
             } catch (SAXException e) {
                 throw new TikaException(
                         "Specified Tika configuration has syntax errors: "
@@ -411,6 +424,11 @@ public class TikaConfig {
     public MetadataFilter getMetadataFilter() {
         return metadataFilter;
     }
+
+    public Fetcher getFetcher() {
+        return fetcher;
+    }
+
     /**
      * Provides a default configuration (TikaConfig).  Currently creates a
      * new instance each time it's called; we may be able to have it
@@ -1267,4 +1285,90 @@ public class TikaConfig {
         }
     }
 
+    private static class FetcherXmlLoader extends
+            XmlLoader<Fetcher, Fetcher> {
+
+        boolean supportsComposite() {
+            return true;
+        }
+
+        String getParentTagName() {
+            return "fetchers";
+        }
+
+        String getLoaderTagName() {
+            return "fetcher";
+        }
+
+        @Override
+        Class<? extends Fetcher> getLoaderClass() {
+            return Fetcher.class;
+        }
+
+
+        @Override
+        boolean isComposite(Fetcher loaded) {
+            return loaded instanceof DefaultFetcher;
+        }
+
+        @Override
+        boolean isComposite(Class<? extends Fetcher> loadedClass) {
+            return DefaultFetcher.class.isAssignableFrom(loadedClass);
+        }
+
+        @Override
+        Fetcher preLoadOne(Class<? extends Fetcher> loadedClass,
+                                  String classname, MimeTypes mimeTypes) 
throws TikaException {
+            // Check for classes which can't be set in config
+            // Continue with normal loading
+            return null;
+        }
+
+        @Override
+        Fetcher createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
+            return getDefaultFetcher(loader);
+        }
+
+        //this ignores the service loader
+        @Override
+        Fetcher createComposite(List<Fetcher> loaded, MimeTypes mimeTypes, 
ServiceLoader loader) {
+            return new DefaultFetcher(loaded);
+        }
+
+        @Override
+        Fetcher createComposite(Class<? extends Fetcher> fetcherClass,
+                                       List<Fetcher> childFetchers,
+                                       Set<Class<? extends Fetcher>> 
excludeFilters,
+                                       Map<String, Param> params, MimeTypes 
mimeTypes, ServiceLoader loader)
+                throws InvocationTargetException, IllegalAccessException,
+                InstantiationException {
+            Fetcher fetcher = null;
+            Constructor<? extends Fetcher> c;
+
+            // Try the possible default and composite fetcher constructors
+            if (fetcher == null) {
+                try {
+                    c = fetcherClass.getConstructor(ServiceLoader.class, 
Collection.class);
+                    fetcher = c.newInstance(loader, excludeFilters);
+                } catch (NoSuchMethodException me) {
+                    me.printStackTrace();
+                }
+            }
+            if (fetcher == null) {
+                try {
+                    c = fetcherClass.getConstructor(List.class);
+                    fetcher = c.newInstance(childFetchers);
+                } catch (NoSuchMethodException me) {
+                    me.printStackTrace();
+                }
+            }
+
+            return fetcher;
+        }
+
+        @Override
+        Fetcher decorate(Fetcher created, Element element) {
+            return created; // No decoration of Fetchers
+        }
+    }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
new file mode 100644
index 0000000..77e77d7
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Utility class that will apply the appropriate fetcher
+ * to the fetchString based on the prefix.
+ *
+ * This does not allow multiple fetchers supporting the same prefix.
+ */
+public class DefaultFetcher implements Fetcher {
+
+    private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>();
+
+    private static List<Fetcher> getDefaultFilters(
+            ServiceLoader loader) {
+        return loader.loadStaticServiceProviders(Fetcher.class);
+    }
+
+
+    public DefaultFetcher(ServiceLoader serviceLoader) {
+        this(getDefaultFilters(serviceLoader));
+    }
+
+    public DefaultFetcher(List<Fetcher> fetchers) {
+        for (Fetcher fetcher : fetchers) {
+            for (String supportedPrefix : fetcher.getSupportedPrefixes()) {
+                if (fetcherMap.containsKey(supportedPrefix)) {
+                    throw new IllegalArgumentException(
+                            "Multiple fetchers cannot support the same prefix: 
"
+                            + supportedPrefix);
+                }
+                fetcherMap.put(supportedPrefix, fetcher);
+            }
+        }
+    }
+
+    @Override
+    public Set<String> getSupportedPrefixes() {
+        return fetcherMap.keySet();
+    }
+
+    @Override
+    public InputStream fetch(String fetchString, Metadata metadata)
+            throws IOException, TikaException {
+        FetchPrefixKeyPair fetchPrefixKeyPair = 
FetchPrefixKeyPair.create(fetchString);
+
+        Fetcher fetcher = fetcherMap.get(fetchPrefixKeyPair.getPrefix());
+        if (fetcher == null) {
+            throw new IllegalArgumentException("Can't find fetcher for prefix: 
"+
+                    fetchPrefixKeyPair.getPrefix());
+        }
+        return fetcher.fetch(fetchString, metadata);
+    }
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/fetcher/FetchPrefixKeyPair.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/FetchPrefixKeyPair.java
new file mode 100644
index 0000000..9b263ae
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FetchPrefixKeyPair.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher;
+
+public class FetchPrefixKeyPair {
+    private final String prefix;
+    private final String key;
+
+    private FetchPrefixKeyPair(String prefix, String key) {
+        this.prefix = prefix;
+        this.key = key;
+    }
+
+    public static FetchPrefixKeyPair create(String fetchString) throws 
FetcherStringException {
+        int prefixIndex = fetchString.indexOf(":");
+        if (prefixIndex < 0) {
+            throw new FetcherStringException("Can't find fetcher prefix, e.g. 
the 's3' in s3:/myfile");
+        }
+        String prefix = fetchString.substring(0, prefixIndex);
+        String key = fetchString.substring(prefixIndex+1);
+        return new FetchPrefixKeyPair(prefix, key);
+    }
+
+    public String getPrefix() {
+        return prefix;
+    }
+
+    public String getKey() {
+        return key;
+    }
+
+    @Override
+    public String toString() {
+        return "FetchPrefixKeyPair{" +
+                "prefix='" + prefix + '\'' +
+                ", key='" + key + '\'' +
+                '}';
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/Fetcher.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/Fetcher.java
new file mode 100644
index 0000000..12c6a5b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/Fetcher.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+/**
+ * Interface for an object that will fetch an InputStream given
+ * a fetch string.  This will also update the metadata object
+ * based on the fetch.
+ *
+ * Implementations of Fetcher must be thread safe.
+ *
+ * The fetchString must start with a prefix that can be
+ * used to uniquely select the fetcher, e.g. file:my_file.pdf, 
s3:bucket/path/to/my_file
+ *
+ * Each fetcher must specify which prefixes it can handle.
+ */
+public interface Fetcher {
+    Set<String> getSupportedPrefixes();
+    InputStream fetch(String fetchString, Metadata metadata) throws 
TikaException, IOException;
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/fetcher/FetcherStringException.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/FetcherStringException.java
new file mode 100644
index 0000000..9f1e93b
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/fetcher/FetcherStringException.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Thrown when a fetch string cannot be parsed, e.g. when it has no prefix.
+ */
+public class FetcherStringException extends TikaException {
+
+    public FetcherStringException(String msg) {
+        super(msg);
+    }
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
new file mode 100644
index 0000000..5060d35
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Collections;
+import java.util.Set;
+
+public class FileSystemFetcher implements Fetcher {
+
+    private static String PREFIX = "file";
+    public static Property PATH_BASE = Property.externalText(PREFIX + ":base");
+    private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
+
+    @Override
+    public Set<String> getSupportedPrefixes() {
+        return SUPPORTED;
+    }
+
+    @Override
+    public InputStream fetch(String fetchString, Metadata metadata)
+            throws IOException, TikaException {
+        FetchPrefixKeyPair fetchPrefixKeyPair = 
FetchPrefixKeyPair.create(fetchString);
+        String base = metadata.get(PATH_BASE);
+        Path p = null;
+        if (base != null) {
+            p = Paths.get(base).resolve(fetchPrefixKeyPair.getKey());
+        } else {
+            p = Paths.get(fetchPrefixKeyPair.getKey());
+        }
+        return TikaInputStream.get(p, metadata);
+    }
+}
diff --git a/tika-fetchers/pom.xml b/tika-fetchers/pom.xml
new file mode 100644
index 0000000..61e0963
--- /dev/null
+++ b/tika-fetchers/pom.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-parent</artifactId>
+        <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../tika-parent/pom.xml</relativePath>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>tika-fetchers</artifactId>
+    <packaging>pom</packaging>
+    <name>Apache Tika fetchers</name>
+    <url>http://tika.apache.org/</url>
+
+    <modules>
+        <module>s3-fetcher</module>
+    </modules>
+
+</project>
\ No newline at end of file
diff --git a/tika-fetchers/s3-fetcher/pom.xml b/tika-fetchers/s3-fetcher/pom.xml
new file mode 100644
index 0000000..0247262
--- /dev/null
+++ b/tika-fetchers/s3-fetcher/pom.xml
@@ -0,0 +1,161 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>tika-fetchers</artifactId>
+        <groupId>org.apache.tika</groupId>
+        <version>2.0.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>s3-fetcher</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.amazonaws</groupId>
+            <artifactId>aws-java-sdk-s3</artifactId>
+            <version>${aws.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>commons-logging</groupId>
+                    <artifactId>commons-logging</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.amazonaws</groupId>
+                    <artifactId>aws-java-sdk-simpleworkflow</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-databind</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.amazonaws</groupId>
+            <artifactId>aws-java-sdk-simpleworkflow</artifactId>
+            <version>${aws.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-databind</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>commons-logging</groupId>
+            <artifactId>commons-logging</artifactId>
+            <version>${commons.logging.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.fetcher.s3</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    
<mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
+                                </transformer>
+
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
 />
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git 
a/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
 
b/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
new file mode 100644
index 0000000..4e592c7
--- /dev/null
+++ 
b/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher.s3;
+
+import com.amazonaws.auth.profile.ProfileCredentialsProvider;
+import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.GetObjectRequest;
+import com.amazonaws.services.s3.model.S3Object;
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.FetchPrefixKeyPair;
+import org.apache.tika.fetcher.Fetcher;
+import org.apache.tika.fetcher.FetcherStringException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Fetches objects from S3. Example fetch string: s3://my_bucket/path/to/my_file.pdf
+ * After leading slashes are dropped, text before the first "/" is the bucket; the rest is the object key.
+ */
+public class S3Fetcher implements Fetcher, Initializable {
+
+    private static final String PREFIX = "s3";
+    private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
+
+    private String region;
+    private String profile;
+    private boolean extractUserMetadata = true;
+    private AmazonS3 s3Client; //only assigned in initialize(); null until that has been called
+
+    @Override
+    public Set<String> getSupportedPrefixes() {
+        return SUPPORTED;
+    }
+
+    @Override
+    public InputStream fetch(String fetchString, Metadata metadata)
+            throws TikaException, IOException {
+        FetchPrefixKeyPair fetchPrefixKeyPair = 
FetchPrefixKeyPair.create(fetchString);
+        String bucketKey = fetchPrefixKeyPair.getKey();
+        if (bucketKey.startsWith("//")) { //drop a leading "//" (or a single "/") before the bucket name
+            bucketKey = bucketKey.substring(2);
+        } else if (bucketKey.startsWith("/")) {
+            bucketKey = bucketKey.substring(1);
+        }
+        int i = bucketKey.indexOf("/");
+        if (i < 0) { //no "/" separating the bucket from the object key
+            throw new FetcherStringException("Couldn't find bucket:" + 
fetchPrefixKeyPair.getKey());
+        }
+        String bucket = bucketKey.substring(0, i);
+        String key = bucketKey.substring(i + 1);
+        //NOTE(review): fullObject is never closed here; the returned stream wraps its content.
+        //TODO: should we cache this to a local file so that we can close the s3Object?
+        S3Object fullObject = s3Client.getObject(new GetObjectRequest(bucket, 
key));
+        if (extractUserMetadata) {
+            for (Map.Entry<String, String> e :
+                    
fullObject.getObjectMetadata().getUserMetadata().entrySet()) {
+                metadata.add(PREFIX + ":" + e.getKey(), e.getValue());
+            }
+        }
+        return TikaInputStream.get(
+                fullObject.getObjectContent());
+    }
+
+    @Field
+    public void setRegion(String region) {
+        this.region = region;
+    }
+
+    @Field
+    public void setProfile(String profile) {
+        this.profile = profile;
+    }
+
+    /**
+     * Whether or not to extract user metadata from the S3Object (defaults to true).
+     *
+     * @param extractUserMetadata if true, fetch() adds each user metadata entry under "s3:" + its key
+     */
+    @Field
+    public void setExtractUserMetadata(boolean extractUserMetadata) {
+        this.extractUserMetadata = extractUserMetadata;
+    }
+
+    @Override
+    public void initialize(Map<String, Param> params) throws 
TikaConfigException {
+        //params have already been set (presumably via the @Field setters), so the map is ignored;
+        //the client is built from the configured region and credentials profile
+        s3Client = AmazonS3ClientBuilder.standard()
+                .withRegion(region)
+                .withCredentials(new ProfileCredentialsProvider(profile))
+                .build();
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler 
problemHandler) throws TikaConfigException { //NOTE(review): empty; region/profile are not validated
+
+    }
+}
diff --git 
a/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
 
b/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
new file mode 100644
index 0000000..1e352f9
--- /dev/null
+++ 
b/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher.s3;
+import com.amazonaws.regions.Regions;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.fetcher.Fetcher;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.Collections;
+
+@Ignore("write actual unit tests")
+public class TestS3Fetcher {
+    private static final String FETCH_STRING = ""; //presumably a placeholder: the fetch string handed to fetch()
+    private Path outputFile = Paths.get(""); //presumably a placeholder: local path the fetched bytes are copied to
+    private String region = "us-east-1";
+    private String profile = ""; //presumably a placeholder: credentials profile name passed to setProfile()
+
+    @Test
+    public void testBasic() throws Exception {
+        S3Fetcher fetcher = new S3Fetcher();
+        fetcher.setProfile(profile);
+        fetcher.setRegion(region);
+        fetcher.initialize(Collections.EMPTY_MAP); //NOTE(review): raw type; Collections.emptyMap() avoids the unchecked warning
+
+        Metadata metadata = new Metadata();
+        try (InputStream is = fetcher.fetch(FETCH_STRING, metadata)) {
+            Files.copy(is, outputFile, StandardCopyOption.REPLACE_EXISTING);
+        }
+    }
+
+    @Test
+    public void testConfig() throws Exception {
+        TikaConfig config = new TikaConfig(
+                this.getClass().getResourceAsStream("/tika-config-s3.xml")
+        );
+        Fetcher fetcher = config.getFetcher(); //fetcher comes from the XML config rather than being built by hand
+        Metadata metadata = new Metadata();
+        try (InputStream is = fetcher.fetch(FETCH_STRING, metadata)) {
+            Files.copy(is, outputFile, StandardCopyOption.REPLACE_EXISTING);
+        }
+    }
+}
diff --git a/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml 
b/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml
new file mode 100644
index 0000000..39a2ce8
--- /dev/null
+++ b/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <fetchers>
+        <fetcher class="org.apache.tika.fetcher.s3.S3Fetcher">
+            <params>
+                <param name="region" type="string">us-east-1</param>
+                <param name="profile" type="string">my_profile</param>
+            </params>
+        </fetcher>
+    </fetchers>
+</properties>
\ No newline at end of file
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index b219b02..d19598f 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -274,6 +274,7 @@
     <rat.version>0.13</rat.version>
 
     <!-- dependency versions -->
+    <aws.version>1.11.937</aws.version>
     <boilerpipe.version>1.1.0</boilerpipe.version>
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
     <bouncycastle.version>1.68</bouncycastle.version>

Reply via email to