This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3226 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3d0692adfe0f17f698ba298cc5c7f1a01af32bf7 Author: tballison <[email protected]> AuthorDate: Thu Jan 14 14:39:29 2021 -0500 TIKA-3226 -- initial fetcher with s3 example --- pom.xml | 1 + .../java/org/apache/tika/config/TikaConfig.java | 104 +++++++++++++ .../org/apache/tika/fetcher/DefaultFetcher.java | 80 ++++++++++ .../apache/tika/fetcher/FetchPrefixKeyPair.java | 53 +++++++ .../main/java/org/apache/tika/fetcher/Fetcher.java | 41 ++++++ .../tika/fetcher/FetcherStringException.java | 29 ++++ .../org/apache/tika/fetcher/FileSystemFetcher.java | 55 +++++++ tika-fetchers/pom.xml | 40 +++++ tika-fetchers/s3-fetcher/pom.xml | 161 +++++++++++++++++++++ .../java/org/apache/tika/fetcher/s3/S3Fetcher.java | 124 ++++++++++++++++ .../org/apache/tika/fetcher/s3/TestS3Fetcher.java | 63 ++++++++ .../src/test/resources/tika-config-s3.xml | 27 ++++ tika-parent/pom.xml | 1 + 13 files changed, 779 insertions(+) diff --git a/pom.xml b/pom.xml index 6a9ef15..aa1e6b9 100644 --- a/pom.xml +++ b/pom.xml @@ -37,6 +37,7 @@ <modules> <module>tika-parent</module> <module>tika-core</module> + <module>tika-fetchers</module> <module>tika-parsers</module> <module>tika-bundles</module> <module>tika-xmp</module> diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index d7fb8b3..20742d8 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -48,6 +48,8 @@ import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; +import org.apache.tika.fetcher.DefaultFetcher; +import org.apache.tika.fetcher.Fetcher; import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.CompositeMetadataFilter; @@ -112,6 
+114,10 @@ public class TikaConfig { return new DefaultMetadataFilter(loader); } + private static Fetcher getDefaultFetcher(ServiceLoader loader) { + return new DefaultFetcher(loader); + } + //use this to look for unneeded instantiations of TikaConfig protected static AtomicInteger TIMES_INSTANTIATED = new AtomicInteger(); @@ -124,6 +130,7 @@ public class TikaConfig { private final ExecutorService executorService; private final EncodingDetector encodingDetector; private final MetadataFilter metadataFilter; + private final Fetcher fetcher; public TikaConfig(String file) throws TikaException, IOException, SAXException { @@ -190,6 +197,7 @@ public class TikaConfig { ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader(); EncodingDetectorXmlLoader encodingDetectorXmlLoader = new EncodingDetectorXmlLoader(); MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader(); + FetcherXmlLoader fetcherXmlLoader = new FetcherXmlLoader(); updateXMLReaderUtils(element); this.mimeTypes = typesFromDomElement(element); this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); @@ -200,6 +208,7 @@ public class TikaConfig { this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); this.executorService = executorLoader.loadOverall(element, mimeTypes, loader); this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, loader); + this.fetcher = fetcherXmlLoader.loadOverall(element, mimeTypes, loader); this.serviceLoader = loader; TIMES_INSTANTIATED.incrementAndGet(); } @@ -226,6 +235,7 @@ public class TikaConfig { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = getDefaultMetadataFilter(serviceLoader); + this.fetcher = getDefaultFetcher(serviceLoader); TIMES_INSTANTIATED.incrementAndGet(); } @@ -262,6 +272,7 @@ public class TikaConfig { this.translator = getDefaultTranslator(serviceLoader); this.executorService = 
getDefaultExecutorService(); this.metadataFilter = getDefaultMetadataFilter(serviceLoader); + this.fetcher = getDefaultFetcher(serviceLoader); } else { ServiceLoader tmpServiceLoader = new ServiceLoader(); try (InputStream stream = getConfigInputStream(config, tmpServiceLoader)) { @@ -273,6 +284,7 @@ public class TikaConfig { TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader(); ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader(); MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader(); + FetcherXmlLoader fetcherXmlLoader = new FetcherXmlLoader(); this.mimeTypes = typesFromDomElement(element); this.encodingDetector = encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader); @@ -284,6 +296,7 @@ public class TikaConfig { this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader); this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader); this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, serviceLoader); + this.fetcher = fetcherXmlLoader.loadOverall(element, mimeTypes, serviceLoader); } catch (SAXException e) { throw new TikaException( "Specified Tika configuration has syntax errors: " @@ -411,6 +424,11 @@ public class TikaConfig { public MetadataFilter getMetadataFilter() { return metadataFilter; } + + public Fetcher getFetcher() { + return fetcher; + } + /** * Provides a default configuration (TikaConfig). Currently creates a * new instance each time it's called; we may be able to have it @@ -1267,4 +1285,90 @@ public class TikaConfig { } } + private static class FetcherXmlLoader extends + XmlLoader<Fetcher, Fetcher> { + + boolean supportsComposite() { + return true; + } + + String getParentTagName() { + return "fetchers"; + } + + String getLoaderTagName() { + return "fetcher"; + } + + @Override + Class<? 
extends Fetcher> getLoaderClass() { + return Fetcher.class; + } + + + @Override + boolean isComposite(Fetcher loaded) { + return loaded instanceof DefaultFetcher; + } + + @Override + boolean isComposite(Class<? extends Fetcher> loadedClass) { + return DefaultFetcher.class.isAssignableFrom(loadedClass); + } + + @Override + Fetcher preLoadOne(Class<? extends Fetcher> loadedClass, + String classname, MimeTypes mimeTypes) throws TikaException { + // Check for classes which can't be set in config + // Continue with normal loading + return null; + } + + @Override + Fetcher createDefault(MimeTypes mimeTypes, ServiceLoader loader) { + return getDefaultFetcher(loader); + } + + //this ignores the service loader + @Override + Fetcher createComposite(List<Fetcher> loaded, MimeTypes mimeTypes, ServiceLoader loader) { + return new DefaultFetcher(loaded); + } + + @Override + Fetcher createComposite(Class<? extends Fetcher> fetcherClass, + List<Fetcher> childFetchers, + Set<Class<? extends Fetcher>> excludeFilters, + Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader) + throws InvocationTargetException, IllegalAccessException, + InstantiationException { + Fetcher fetcher = null; + Constructor<? 
extends Fetcher> c; + + // Try the possible default and composite fetcher constructors + if (fetcher == null) { + try { + c = fetcherClass.getConstructor(ServiceLoader.class, Collection.class); + fetcher = c.newInstance(loader, excludeFilters); + } catch (NoSuchMethodException me) { + me.printStackTrace(); + } + } + if (fetcher == null) { + try { + c = fetcherClass.getConstructor(List.class); + fetcher = c.newInstance(childFetchers); + } catch (NoSuchMethodException me) { + me.printStackTrace(); + } + } + + return fetcher; + } + + @Override + Fetcher decorate(Fetcher created, Element element) { + return created; // No decoration of Fetchers + } + } } diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java new file mode 100644 index 0000000..77e77d7 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.fetcher; + +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Utility class that will apply the appropriate fetcher + * to the fetchString based on the prefix. + * + * This does not allow multiple fetchers supporting the same prefix. + */ +public class DefaultFetcher implements Fetcher { + + private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>(); + + private static List<Fetcher> getDefaultFilters( + ServiceLoader loader) { + return loader.loadStaticServiceProviders(Fetcher.class); + } + + + public DefaultFetcher(ServiceLoader serviceLoader) { + this(getDefaultFilters(serviceLoader)); + } + + public DefaultFetcher(List<Fetcher> fetchers) { + for (Fetcher fetcher : fetchers) { + for (String supportedPrefix : fetcher.getSupportedPrefixes()) { + if (fetcherMap.containsKey(supportedPrefix)) { + throw new IllegalArgumentException( + "Multiple fetchers cannot support the same prefix: " + + supportedPrefix); + } + fetcherMap.put(supportedPrefix, fetcher); + } + } + } + + @Override + public Set<String> getSupportedPrefixes() { + return fetcherMap.keySet(); + } + + @Override + public InputStream fetch(String fetchString, Metadata metadata) + throws IOException, TikaException { + FetchPrefixKeyPair fetchPrefixKeyPair = FetchPrefixKeyPair.create(fetchString); + + Fetcher fetcher = fetcherMap.get(fetchPrefixKeyPair.getPrefix()); + if (fetcher == null) { + throw new IllegalArgumentException("Can't find fetcher for prefix: "+ + fetchPrefixKeyPair.getPrefix()); + } + return fetcher.fetch(fetchString, metadata); + } +} diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FetchPrefixKeyPair.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/FetchPrefixKeyPair.java new file mode 100644 index 0000000..9b263ae --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/fetcher/FetchPrefixKeyPair.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fetcher; + +public class FetchPrefixKeyPair { + private final String prefix; + private final String key; + + private FetchPrefixKeyPair(String prefix, String key) { + this.prefix = prefix; + this.key = key; + } + + public static FetchPrefixKeyPair create(String fetchString) throws FetcherStringException { + int prefixIndex = fetchString.indexOf(":"); + if (prefixIndex < 0) { + throw new FetcherStringException("Can't find fetcher prefix, e.g. 
the 's3' in s3:/myfile"); + } + String prefix = fetchString.substring(0, prefixIndex); + String key = fetchString.substring(prefixIndex+1); + return new FetchPrefixKeyPair(prefix, key); + } + + public String getPrefix() { + return prefix; + } + + public String getKey() { + return key; + } + + @Override + public String toString() { + return "FetchPrefixKeyPair{" + + "prefix='" + prefix + '\'' + + ", key='" + key + '\'' + + '}'; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/Fetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/Fetcher.java new file mode 100644 index 0000000..12c6a5b --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/fetcher/Fetcher.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fetcher; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Set; + +/** + * Interface for an object that will fetch an InputStream given + * a fetch string. This will also update the metadata object + * based on the fetch. + * + * Implementations of Fetcher must be thread safe. 
+ * + * The fetchString must start with a prefix that can be + * used to uniquely select the fetcher, e.g. file:my_file.pdf, s3:bucket/path/to/my_file + * + * Each fetcher must specify which prefixes it can handle. + */ +public interface Fetcher { + Set<String> getSupportedPrefixes(); + InputStream fetch(String fetchString, Metadata metadata) throws TikaException, IOException; +} diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FetcherStringException.java b/tika-core/src/main/java/org/apache/tika/fetcher/FetcherStringException.java new file mode 100644 index 0000000..9f1e93b --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/fetcher/FetcherStringException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fetcher; + +import org.apache.tika.exception.TikaException; + +/** + * If something goes wrong in parsing the fetcher string + */ +public class FetcherStringException extends TikaException { + + public FetcherStringException(String msg) { + super(msg); + } +} diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java new file mode 100644 index 0000000..5060d35 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fetcher; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.Set; + +public class FileSystemFetcher implements Fetcher { + + private static String PREFIX = "file"; + public static Property PATH_BASE = Property.externalText(PREFIX + ":base"); + private static final Set<String> SUPPORTED = Collections.singleton(PREFIX); + + @Override + public Set<String> getSupportedPrefixes() { + return SUPPORTED; + } + + @Override + public InputStream fetch(String fetchString, Metadata metadata) + throws IOException, TikaException { + FetchPrefixKeyPair fetchPrefixKeyPair = FetchPrefixKeyPair.create(fetchString); + String base = metadata.get(PATH_BASE); + Path p = null; + if (base != null) { + p = Paths.get(base).resolve(fetchPrefixKeyPair.getKey()); + } else { + p = Paths.get(fetchPrefixKeyPair.getKey()); + } + return TikaInputStream.get(p, metadata); + } +} diff --git a/tika-fetchers/pom.xml b/tika-fetchers/pom.xml new file mode 100644 index 0000000..61e0963 --- /dev/null +++ b/tika-fetchers/pom.xml @@ -0,0 +1,40 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> + <version>2.0.0-SNAPSHOT</version> + <relativePath>../tika-parent/pom.xml</relativePath> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>tika-fetchers</artifactId> + <packaging>pom</packaging> + <name>Apache Tika fetchers</name> + <url>http://tika.apache.org/</url> + + <modules> + <module>s3-fetcher</module> + </modules> + +</project> \ No newline at end of file diff --git a/tika-fetchers/s3-fetcher/pom.xml b/tika-fetchers/s3-fetcher/pom.xml new file mode 100644 index 0000000..0247262 --- /dev/null +++ b/tika-fetchers/s3-fetcher/pom.xml @@ -0,0 +1,161 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>tika-fetchers</artifactId> + <groupId>org.apache.tika</groupId> + <version>2.0.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>s3-fetcher</artifactId> + + <dependencies> + <dependency> + <groupId>com.amazonaws</groupId> + <artifactId>aws-java-sdk-s3</artifactId> + <version>${aws.version}</version> + <exclusions> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + <exclusion> + <groupId>com.amazonaws</groupId> + <artifactId>aws-java-sdk-simpleworkflow</artifactId> + </exclusion> + <exclusion> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </exclusion> + <exclusion> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <version>${jackson.version}</version> + </dependency> + <dependency> + <groupId>com.amazonaws</groupId> + <artifactId>aws-java-sdk-simpleworkflow</artifactId> + <version>${aws.version}</version> + <exclusions> + <exclusion> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + 
<groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>${commons.logging.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <configuration> + <archive> + <manifestEntries> + <Automatic-Module-Name>org.apache.tika.fetcher.s3</Automatic-Module-Name> + </manifestEntries> + </archive> + </configuration> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-shade-plugin</artifactId> + <version>${maven.shade.version}</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom> + false + </createDependencyReducedPom> + <!-- <filters> --> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*</exclude> + <exclude>LICENSE.txt</exclude> + <exclude>NOTICE.txt</exclude> + </excludes> + </filter> + </filters> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass> + </transformer> + + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> + <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer"> + <resource>META-INF/LICENSE</resource> + <file>target/classes/META-INF/LICENSE</file> + </transformer> + <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer"> + 
<resource>META-INF/NOTICE</resource> + <file>target/classes/META-INF/NOTICE</file> + </transformer> + <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer"> + <resource>META-INF/DEPENDENCIES</resource> + <file>target/classes/META-INF/DEPENDENCIES</file> + </transformer> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + + </plugins> + </build> +</project> \ No newline at end of file diff --git a/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java b/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java new file mode 100644 index 0000000..4e592c7 --- /dev/null +++ b/tika-fetchers/s3-fetcher/src/main/java/org/apache/tika/fetcher/s3/S3Fetcher.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fetcher.s3; + +import com.amazonaws.auth.profile.ProfileCredentialsProvider; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.S3Object; +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.fetcher.FetchPrefixKeyPair; +import org.apache.tika.fetcher.Fetcher; +import org.apache.tika.fetcher.FetcherStringException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +/** + * Fetches files from s3. Example string: s3://my_bucket/path/to/my_file.pdf + * This will parse the bucket out of that string and retrieve the path. 
+ */ +public class S3Fetcher implements Fetcher, Initializable { + + private static final String PREFIX = "s3"; + private static final Set<String> SUPPORTED = Collections.singleton(PREFIX); + + private String region; + private String profile; + private boolean extractUserMetadata = true; + private AmazonS3 s3Client; + + @Override + public Set<String> getSupportedPrefixes() { + return SUPPORTED; + } + + @Override + public InputStream fetch(String fetchString, Metadata metadata) + throws TikaException, IOException { + FetchPrefixKeyPair fetchPrefixKeyPair = FetchPrefixKeyPair.create(fetchString); + String bucketKey = fetchPrefixKeyPair.getKey(); + if (bucketKey.startsWith("//")) { + bucketKey = bucketKey.substring(2); + } else if (bucketKey.startsWith("/")) { + bucketKey = bucketKey.substring(1); + } + int i = bucketKey.indexOf("/"); + if (i < 0) { + throw new FetcherStringException("Couldn't find bucket:" + fetchPrefixKeyPair.getKey()); + } + String bucket = bucketKey.substring(0, i); + String key = bucketKey.substring(i + 1); + //should we cache this to a local file so that + //we can close the s3Object? 
+ S3Object fullObject = s3Client.getObject(new GetObjectRequest(bucket, key)); + if (extractUserMetadata) { + for (Map.Entry<String, String> e : + fullObject.getObjectMetadata().getUserMetadata().entrySet()) { + metadata.add(PREFIX + ":" + e.getKey(), e.getValue()); + } + } + return TikaInputStream.get( + fullObject.getObjectContent()); + } + + @Field + public void setRegion(String region) { + this.region = region; + } + + @Field + public void setProfile(String profile) { + this.profile = profile; + } + + /** + * Whether or not to extract user metadata from the S3Object + * + * @param extractUserMetadata if true, each user-metadata entry of the fetched S3Object is added to the Metadata under the key "s3:" followed by the entry's name + */ + @Field + public void setExtractUserMetadata(boolean extractUserMetadata) { + this.extractUserMetadata = extractUserMetadata; + } + + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + //params have already been set + //ignore them + s3Client = AmazonS3ClientBuilder.standard() + .withRegion(region) + .withCredentials(new ProfileCredentialsProvider(profile)) + .build(); + } + + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { + + } +} diff --git a/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java b/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java new file mode 100644 index 0000000..1e352f9 --- /dev/null +++ b/tika-fetchers/s3-fetcher/src/test/java/org/apache/tika/fetcher/s3/TestS3Fetcher.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fetcher.s3; +import com.amazonaws.regions.Regions; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.fetcher.Fetcher; +import org.apache.tika.metadata.Metadata; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.Collections; + +@Ignore("write actual unit tests") +public class TestS3Fetcher { + private static final String FETCH_STRING = ""; + private Path outputFile = Paths.get(""); + private String region = "us-east-1"; + private String profile = ""; + + @Test + public void testBasic() throws Exception { + S3Fetcher fetcher = new S3Fetcher(); + fetcher.setProfile(profile); + fetcher.setRegion(region); + fetcher.initialize(Collections.EMPTY_MAP); + + Metadata metadata = new Metadata(); + try (InputStream is = fetcher.fetch(FETCH_STRING, metadata)) { + Files.copy(is, outputFile, StandardCopyOption.REPLACE_EXISTING); + } + } + + @Test + public void testConfig() throws Exception { + TikaConfig config = new TikaConfig( + this.getClass().getResourceAsStream("/tika-config-s3.xml") + ); + Fetcher fetcher = config.getFetcher(); + Metadata metadata = new Metadata(); + try (InputStream is = fetcher.fetch(FETCH_STRING, metadata)) { + Files.copy(is, outputFile, StandardCopyOption.REPLACE_EXISTING); + } + } +} diff --git a/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml 
b/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml new file mode 100644 index 0000000..39a2ce8 --- /dev/null +++ b/tika-fetchers/s3-fetcher/src/test/resources/tika-config-s3.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <fetchers> + <fetcher class="org.apache.tika.fetcher.s3.S3Fetcher"> + <params> + <param name="region" type="string">us-east-1</param> + <param name="profile" type="string">my_profile</param> + </params> + </fetcher> + </fetchers> +</properties> \ No newline at end of file diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index b219b02..d19598f 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -274,6 +274,7 @@ <rat.version>0.13</rat.version> <!-- dependency versions --> + <aws.version>1.11.937</aws.version> <boilerpipe.version>1.1.0</boilerpipe.version> <!-- used by POI, PDFBox and Jackcess ...try to sync --> <bouncycastle.version>1.68</bouncycastle.version>
