This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4512 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4e08b8050ebf1ecb4d4cbd114c74af3ca8429739 Author: tallison <[email protected]> AuthorDate: Fri Oct 10 13:47:48 2025 -0400 TIKA-4512 -- first steps --- .../main/java/org/apache/tika/cli/TikaPipes.java | 10 +++ .../tika/pipes/s3/tests/PipeIntegrationTests.java | 1 - tika-parent/pom.xml | 5 ++ tika-pipes/pom.xml | 5 ++ .../tika/async/cli/TikaConfigAsyncWriter.java | 3 +- tika-pipes/tika-fetchers/pom.xml | 17 +++- .../pipes/fetcher/azblob/TestAZBlobFetcher.java | 1 - .../tika-fetcher-file-system}/pom.xml | 83 ++++++++++-------- .../src/main/assembly/assembly.xml | 30 +++++++ .../tika/pipes/fetcher/fs/FileSystemFetcher.java | 99 ++++++++-------------- .../pipes/fetcher/fs/FileSystemFetcherPlugin.java} | 37 ++++---- .../fetcher/fs/config/FileSystemFetcherConfig.java | 31 +++++++ .../src/main/resources/plugin.properties | 21 +++++ .../pipes/fetcher/fs/FileSystemFetcherTest.java | 9 +- .../fs/config/FileSystemFetcherConfigTest.java | 26 ++++++ .../tika/pipes/fetcher/s3/TestGCSFetcher.java | 1 - .../tika/pipes/fetcher/http/HttpFetcher.java | 1 - .../apache/tika/pipes/fetcher/s3/S3Fetcher.java | 1 - .../tika/pipes/fetcher/s3/TestS3Fetcher.java | 1 - .../{tika-pipes-core => tika-pipes-api}/pom.xml | 35 ++------ .../apache/tika/pipes/api}/fetcher/Fetcher.java | 9 +- .../tika/pipes/api/fetcher/FetcherConfig.java | 8 ++ .../tika/pipes/api}/fetcher/RangeFetcher.java | 2 +- tika-pipes/tika-pipes-core/pom.xml | 9 ++ .../org/apache/tika/pipes/core/PipesServer.java | 4 +- .../tika/pipes/core/fetcher/AbstractFetcher.java | 1 + .../tika/pipes/core/fetcher/EmptyFetcher.java | 7 ++ .../tika/pipes/core/fetcher/FetcherManager.java | 75 ++++++++++++---- .../apache/tika/pipes/fetcher/url/UrlFetcher.java | 53 ------------ .../apache/tika/pipes/core/PipesServerTest.java | 8 +- .../tika/pipes/core/TikaPipesConfigTest.java | 6 +- .../apache/tika/pipes/core/async/MockFetcher.java | 8 +- .../tika/pipes/core/fetcher/MockFetcher.java | 14 +-- .../tika/server/core/FetcherStreamFactory.java | 2 - 34 files changed, 364 insertions(+), 259 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaPipes.java b/tika-app/src/main/java/org/apache/tika/cli/TikaPipes.java new file mode 100644 index 000000000..3e8c57108 --- /dev/null +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaPipes.java @@ -0,0 +1,10 @@ +package org.apache.tika.cli; + +import org.apache.tika.pipes.core.fetcher.FetcherManager; + +public class TikaPipes { + public static void main(String[] args) throws Exception{ + FetcherManager fetcherManager = FetcherManager.load(); + System.out.println(fetcherManager.getFetcher().getName()); + } +} diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java index e1e6ea78c..c965ec08f 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java @@ -46,7 +46,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.core.FetchEmitTuple; import org.apache.tika.pipes.core.emitter.Emitter; import org.apache.tika.pipes.core.emitter.EmitterManager; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index ddcfbbf81..24ead4fa6 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1145,6 +1145,11 @@ <artifactId>hdf5-platform</artifactId> <version>${hdf5.version}</version> </dependency> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <version>3.13.0</version> + </dependency> <dependency> <groupId>com.nimbusds</groupId> <artifactId>nimbus-jose-jwt</artifactId> diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index 28271d11c..3c478a64c 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -30,6 +30,7 @@ <packaging>pom</packaging> <modules> + <module>tika-pipes-api</module> <module>tika-pipes-core</module> <module>tika-httpclient-commons</module> <module>tika-fetchers</module> @@ -39,6 +40,10 @@ <module>tika-async-cli</module> </modules> <dependencies> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-core</artifactId> diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java index 5ff8f5d46..d60adf07e 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java @@ -49,7 +49,7 @@ class TikaConfigAsyncWriter { private static final Logger LOG = LoggerFactory.getLogger(TikaAsyncCLI.class); - private static final String FETCHER_NAME = "fsf"; + private static final String FETCHER_NAME = "file-system-fetcher"; private static final String EMITTER_NAME = "fse"; private final SimpleAsyncConfig simpleAsyncConfig; @@ -83,7 +83,6 @@ class TikaConfigAsyncWriter { document.appendChild(properties); } writePipesIterator(document, properties); - writeFetchers(document, properties); writeEmitters(document, properties); writeAsync(document, properties); Transformer transformer = TransformerFactory diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 999d269fc..7f84ff002 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -32,15 +32,26 @@ <packaging>pom</packaging> <modules> - <module>tika-fetcher-http</module> + <module>tika-fetcher-file-system</module> +<!-- <module>tika-fetcher-http</module> <module>tika-fetcher-s3</module> <module>tika-fetcher-gcs</module> <module>tika-fetcher-az-blob</module> - <module>tika-fetcher-microsoft-graph</module> + <module>tika-fetcher-microsoft-graph</module> --> </modules> <dependencies> - + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <!-- !!! VERY IMPORTANT --> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes-api</artifactId> + <version>${project.version}</version> + </dependency> </dependencies> <scm> <tag>3.0.0-rc1</tag> diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java index bbfc63c65..b20fd983b 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java @@ -30,7 +30,6 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.serialization.JsonMetadataList; diff --git a/tika-pipes/tika-pipes-core/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-file-system/pom.xml similarity index 51% copy from tika-pipes/tika-pipes-core/pom.xml copy to tika-pipes/tika-fetchers/tika-fetcher-file-system/pom.xml index 2ac8805dc..e9abd82f6 100644 --- a/tika-pipes/tika-pipes-core/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/pom.xml @@ -19,66 +19,75 @@ --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> + <artifactId>tika-fetchers</artifactId> <groupId>org.apache.tika</groupId> - <artifactId>tika-pipes</artifactId> <version>4.0.0-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> </parent> <modelVersion>4.0.0</modelVersion> - <artifactId>tika-pipes-core</artifactId> - - <name>Apache Tika Pipes Core</name> - <url>https://tika.apache.org/</url> + <artifactId>tika-fetcher-file-system</artifactId> + <name>Apache Tika file system fetcher</name> <dependencies> <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> </dependency> <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-serialization</artifactId> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> <version>${project.version}</version> + <scope>provided</scope> </dependency> <dependency> - <groupId>com.martensigwart</groupId> - <artifactId>fakeload</artifactId> - <version>${fakeload.version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <type>test-jar</type> - <scope>test</scope> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + <scope>provided</scope> </dependency> </dependencies> + <build> <plugins> <plugin> - <groupId>org.apache.rat</groupId> - <artifactId>apache-rat-plugin</artifactId> - <configuration> - <excludes> - <exclude>src/test/resources/test-documents/file-list.txt</exclude> - <exclude>src/test/resources/test-documents/testOverlappingText.pdf</exclude> - </excludes> - </configuration> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <version>3.6.1</version> + <executions> + <execution> + <id>copy-dependencies</id> + <phase>package</phase> + <goals> + <goal>copy-dependencies</goal> + </goals> + <configuration> + <outputDirectory>${project.build.directory}/lib</outputDirectory> + <includeScope>compile</includeScope> + <excludeArtifactIds>tika-core,tika-pipes-core</excludeArtifactIds> + </configuration> + </execution> + </executions> </plugin> <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> + <artifactId>maven-assembly-plugin</artifactId> <configuration> - <archive> - <manifestEntries> - <Automatic-Module-Name>org.apache.tika.pipes.core</Automatic-Module-Name> - </manifestEntries> - </archive> + <descriptors> + <descriptor>src/main/assembly/assembly.xml</descriptor> + </descriptors> + <appendAssemblyId>false</appendAssemblyId> </configuration> + <executions> + <execution> + <id>make-assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> </plugin> </plugins> </build> + <scm> + <tag>3.0.0-rc1</tag> + </scm> </project> diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/assembly/assembly.xml new file mode 100644 index 000000000..d614dfc36 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ +<assembly xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns="http://maven.apache.org/ASSEMBLY/2.0.0" + xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 + http://maven.apache.org/xsd/assembly-2.0.0.xsd"> + <id>dependencies-zip</id> + <formats> + <format>zip</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <fileSets> + <fileSet> + <directory>${project.build.directory}/lib</directory> + <outputDirectory>/lib</outputDirectory> + </fileSet> + <fileSet> + <directory>${project.build.directory}</directory> + <outputDirectory>/lib</outputDirectory> + <includes> + <include>${project.artifactId}-${project.version}.jar</include> + </includes> + </fileSet> + <fileSet> + <directory>${project.basedir}/src/main/resources</directory> + <outputDirectory>/</outputDirectory> + <includes> + <include>plugin.properties</include> + </includes> + </fileSet> + </fileSets> +</assembly> diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java similarity index 70% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java rename to tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java index 087564564..85b2f03f7 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -25,15 +25,11 @@ import java.nio.file.Paths; import java.nio.file.attribute.BasicFileAttributes; import java.nio.file.attribute.FileTime; import java.util.Date; -import java.util.Map; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.Field; -import org.apache.tika.config.Initializable; -import org.apache.tika.config.InitializableProblemHandler; -import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -42,30 +38,37 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.AbstractFetcher; +import org.apache.tika.pipes.api.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.fs.config.FileSystemFetcherConfig; +import org.apache.tika.utils.StringUtils; -public class FileSystemFetcher extends AbstractFetcher implements Initializable { - public FileSystemFetcher() { - } - public FileSystemFetcher(FileSystemFetcherConfig fileSystemFetcherConfig) { - setBasePath(fileSystemFetcherConfig.getBasePath()); - setExtractFileSystemMetadata(fileSystemFetcherConfig.isExtractFileSystemMetadata()); +@Extension +public class FileSystemFetcher implements Fetcher { + + public FileSystemFetcher() { } private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcher.class); - //Warning! basePath can be null! - private Path basePath = null; - - private boolean extractFileSystemMetadata = false; + private FileSystemFetcherConfig defaultFileSystemFetcherConfig = new FileSystemFetcherConfig(); static boolean isDescendant(Path root, Path descendant) { return descendant.toAbsolutePath().normalize() .startsWith(root.toAbsolutePath().normalize()); } + @Override + public void loadDefaultConfig(InputStream is) throws IOException, TikaConfigException { + defaultFileSystemFetcherConfig = FileSystemFetcherConfig.load(is); + checkInitialization(defaultFileSystemFetcherConfig); + } + + @Override + public String getName() { + return "file-system-fetcher"; + } + @Override public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException, TikaException { if (fetchKey.contains("\u0000")) { @@ -73,33 +76,33 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable "Please review the life decisions that led you to requesting " + "a file name with this character in it."); } + FileSystemFetcherConfig config = parseContext.get(FileSystemFetcherConfig.class, defaultFileSystemFetcherConfig); Path p = null; - if (basePath != null) { + if (! StringUtils.isBlank(config.getBasePath())) { + Path basePath = Paths.get(config.getBasePath()); + if (!Files.isDirectory(basePath)) { + throw new IOException("BasePath is not a directory: " + basePath); + } p = basePath.resolve(fetchKey); if (!p.toRealPath().startsWith(basePath.toRealPath())) { throw new IllegalArgumentException( "fetchKey must resolve to be a descendant of the 'basePath'"); } - } else { - p = Paths.get(fetchKey); } metadata.set(TikaCoreProperties.SOURCE_PATH, fetchKey); - updateFileSystemMetadata(p, metadata); - + LOG.warn("about to read from {} with base={}", p.toAbsolutePath(), config.getBasePath()); if (!Files.isRegularFile(p)) { - if (basePath != null && !Files.isDirectory(basePath)) { - throw new IOException("BasePath is not a directory: " + basePath); - } else { - throw new FileNotFoundException(p.toAbsolutePath().toString()); - } + throw new FileNotFoundException(p.toAbsolutePath().toString()); } + updateFileSystemMetadata(p, metadata, config); + return TikaInputStream.get(p, metadata); } - private void updateFileSystemMetadata(Path p, Metadata metadata) throws IOException { - if (! extractFileSystemMetadata) { + private void updateFileSystemMetadata(Path p, Metadata metadata, FileSystemFetcherConfig config) throws IOException { + if (! config.isExtractFileSystemMetadata()) { return; } BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class); @@ -120,42 +123,10 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable * * @return the basePath or <code>null</code> if no base path was set */ - public Path getBasePath() { - return basePath; - } - - /** - * Default behavior si that clients will send in relative paths, this - * must be set to allow this fetcher to fetch the - * full path. - * - * @param basePath - */ - @Field - public void setBasePath(String basePath) { - this.basePath = Paths.get(basePath); - } - - /** - * Extract file system metadata (created, modified, accessed) when fetching file. - * The default is <code>false</code>. - * - * @param extractFileSystemMetadata - */ - @Field - public void setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { - this.extractFileSystemMetadata = extractFileSystemMetadata; - } - - @Override - public void initialize(Map<String, Param> params) throws TikaConfigException { - //no-op - } - - @Override - public void checkInitialization(InitializableProblemHandler problemHandler) + private void checkInitialization(FileSystemFetcherConfig config) throws TikaConfigException { - if (basePath == null || basePath.toString().isBlank()) { + String basePath = config.getBasePath(); + if (basePath == null || basePath.isBlank()) { LOG.warn("'basePath' has not been set. " + "This means that client code or clients can read from any file that this " + "process has permissions to read. If you are running tika-server, make " + @@ -174,7 +145,7 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable " Please use the tika-fetcher-s3 module"); } - if (basePath.toAbsolutePath().toString().contains("\u0000")) { + if (basePath.contains("\u0000")) { throw new TikaConfigException( "base path must not contain \u0000. " + "Seriously, what were you thinking?"); } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java similarity index 53% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java rename to tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java index 890148c81..f6a457e0a 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java @@ -14,29 +14,36 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher.fs.config; +package org.apache.tika.pipes.fetcher.fs; -import org.apache.tika.pipes.core.fetcher.config.AbstractConfig; +import org.pf4j.Plugin; +import org.pf4j.PluginWrapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -public class FileSystemFetcherConfig extends AbstractConfig { - private String basePath; - private boolean extractFileSystemMetadata; +public class FileSystemFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcherPlugin.class); - public String getBasePath() { - return basePath; + public FileSystemFetcherPlugin(PluginWrapper wrapper) { + super(wrapper); } - public FileSystemFetcherConfig setBasePath(String basePath) { - this.basePath = basePath; - return this; + @Override + public void start() { + LOG.info("Starting"); + super.start(); } - public boolean isExtractFileSystemMetadata() { - return extractFileSystemMetadata; + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); } - public FileSystemFetcherConfig setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { - this.extractFileSystemMetadata = extractFileSystemMetadata; - return this; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); } + } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java new file mode 100644 index 000000000..da88befc4 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java @@ -0,0 +1,31 @@ +package org.apache.tika.pipes.fetcher.fs.config; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; + +import com.fasterxml.jackson.databind.ObjectMapper; + +public class FileSystemFetcherConfig { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static FileSystemFetcherConfig load(InputStream is) throws IOException { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + return OBJECT_MAPPER.readValue(reader, FileSystemFetcherConfig.class); + } + } + + private String basePath; + private boolean extractFileSystemMetadata = false; + + public String getBasePath() { + return basePath; + } + + public boolean isExtractFileSystemMetadata() { + return extractFileSystemMetadata; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/resources/plugin.properties new file mode 100644 index 000000000..b2488f75d --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=file-system-fetcher +plugin.class=org.apache.tika.pipes.fetcher.fs.FileSystemFetcherPlugin +plugin.version=4.0.0-SNAPSHOT +plugin.provider=Local File System Fetcher +plugin.description=Capable of fetching the local file system diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java similarity index 87% rename from tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java rename to tika-pipes/tika-fetchers/tika-fetcher-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java index 5c493da59..465e9f195 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java @@ -26,9 +26,6 @@ import java.nio.file.Paths; import org.junit.jupiter.api.Test; -import org.apache.tika.config.InitializableProblemHandler; -import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher; - public class FileSystemFetcherTest { @@ -50,9 +47,9 @@ public class FileSystemFetcherTest { public void testNullByte() throws Exception { FileSystemFetcher f = new FileSystemFetcher(); assertThrows(InvalidPathException.class, () -> { - f.setBasePath("bad\u0000path"); - f.setName("fs"); - f.checkInitialization(InitializableProblemHandler.IGNORE); + //f.setBasePath("bad\u0000path"); + + //f.checkInitialization(InitializableProblemHandler.IGNORE); }); } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfigTest.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfigTest.java new file mode 100644 index 000000000..e412a9fae --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfigTest.java @@ -0,0 +1,26 @@ +package org.apache.tika.pipes.fetcher.fs.config; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; + +public class FileSystemFetcherConfigTest { + + @Test + public void testBasic() throws Exception { + String json = """ + { + "basePath":"/some/base/path", + "extractFileSystemMetadata":true + } + """; + + FileSystemFetcherConfig config = FileSystemFetcherConfig.load(new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + assertEquals("/some/base/path", config.getBasePath()); + assertTrue(config.isExtractFileSystemMetadata()); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java index 3844337f3..5efb19598 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java @@ -31,7 +31,6 @@ import org.junit.jupiter.api.io.TempDir; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; @Disabled("write actual unit tests") diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java index ee1953cb7..849d3bf6a 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java @@ -76,7 +76,6 @@ import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.core.fetcher.AbstractFetcher; -import org.apache.tika.pipes.core.fetcher.RangeFetcher; import org.apache.tika.pipes.core.fetcher.config.FetcherConfigContainer; import org.apache.tika.pipes.fetcher.http.config.HttpFetcherConfig; import org.apache.tika.pipes.fetcher.http.config.HttpHeaders; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java index 9176a7409..c6a21078e 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java @@ -57,7 +57,6 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.core.fetcher.AbstractFetcher; -import org.apache.tika.pipes.core.fetcher.RangeFetcher; import org.apache.tika.pipes.fetcher.s3.config.S3FetcherConfig; import org.apache.tika.utils.StringUtils; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java index d492f9a41..6c27357a2 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java @@ -28,7 +28,6 @@ import org.junit.jupiter.api.Test; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; @Disabled("write actual unit tests") diff --git a/tika-pipes/tika-pipes-core/pom.xml b/tika-pipes/tika-pipes-api/pom.xml similarity index 64% copy from tika-pipes/tika-pipes-core/pom.xml copy to tika-pipes/tika-pipes-api/pom.xml index 2ac8805dc..79e47faa5 100644 --- a/tika-pipes/tika-pipes-core/pom.xml +++ b/tika-pipes/tika-pipes-api/pom.xml @@ -26,55 +26,34 @@ </parent> <modelVersion>4.0.0</modelVersion> - <artifactId>tika-pipes-core</artifactId> + <artifactId>tika-pipes-api</artifactId> <name>Apache Tika Pipes Core</name> <url>https://tika.apache.org/</url> <dependencies> <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-serialization</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>com.martensigwart</groupId> - <artifactId>fakeload</artifactId> - <version>${fakeload.version}</version> - <scope>test</scope> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <scope>provided</scope> </dependency> + <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> - <type>test-jar</type> - <scope>test</scope> + <scope>provided</scope> </dependency> </dependencies> <build> <plugins> - <plugin> - <groupId>org.apache.rat</groupId> - <artifactId>apache-rat-plugin</artifactId> - <configuration> - <excludes> - <exclude>src/test/resources/test-documents/file-list.txt</exclude> - <exclude>src/test/resources/test-documents/testOverlappingText.pdf</exclude> - </excludes> - </configuration> - </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-jar-plugin</artifactId> <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.pipes.core</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.pipes.api</Automatic-Module-Name> </manifestEntries> </archive> </configuration> diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/Fetcher.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java similarity index 83% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/Fetcher.java rename to tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java index 07e9fe077..ef670ef8c 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/Fetcher.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java @@ -14,11 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.core.fetcher; +package org.apache.tika.pipes.api.fetcher; import java.io.IOException; import java.io.InputStream; +import org.pf4j.ExtensionPoint; + +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -30,7 +33,9 @@ import org.apache.tika.parser.ParseContext; * <p> * Implementations of Fetcher must be thread safe. */ -public interface Fetcher { +public interface Fetcher extends ExtensionPoint { + + void loadDefaultConfig(InputStream is) throws TikaConfigException, IOException; String getName(); diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/FetcherConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/FetcherConfig.java new file mode 100644 index 000000000..8ba3f2747 --- /dev/null +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/FetcherConfig.java @@ -0,0 +1,8 @@ +package org.apache.tika.pipes.api.fetcher; + +public abstract class FetcherConfig { + + public boolean allowRuntimeModifications() { + return false; + } +} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/RangeFetcher.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/RangeFetcher.java similarity index 97% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/RangeFetcher.java rename to tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/RangeFetcher.java index f35f43e1a..025b4cd82 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/RangeFetcher.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/RangeFetcher.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.core.fetcher; +package org.apache.tika.pipes.api.fetcher; import java.io.IOException; import java.io.InputStream; diff --git a/tika-pipes/tika-pipes-core/pom.xml b/tika-pipes/tika-pipes-core/pom.xml index 2ac8805dc..190192b8a 100644 --- a/tika-pipes/tika-pipes-core/pom.xml +++ b/tika-pipes/tika-pipes-core/pom.xml @@ -32,6 +32,15 @@ <url>https://tika.apache.org/</url> <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-api</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java index 3d7e7288a..8d8467cc7 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java @@ -66,6 +66,7 @@ import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.pipes.api.fetcher.Fetcher; import org.apache.tika.pipes.core.emitter.EmitData; import org.apache.tika.pipes.core.emitter.EmitKey; import org.apache.tika.pipes.core.emitter.Emitter; @@ -75,7 +76,6 @@ import org.apache.tika.pipes.core.emitter.TikaEmitterException; import org.apache.tika.pipes.core.extractor.BasicEmbeddedDocumentBytesHandler; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.core.extractor.EmittingEmbeddedDocumentBytesHandler; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; @@ -819,7 +819,7 @@ public class PipesServer implements Runnable { protected void initializeResources() throws TikaException, IOException, SAXException { //TODO allowed named configurations in tika config this.tikaConfig = new TikaConfig(tikaConfigPath); - this.fetcherManager = FetcherManager.load(tikaConfigPath); + this.fetcherManager = FetcherManager.load(); //skip initialization of the emitters if emitting //from the pipesserver is turned off. if (maxForEmitBatchBytes > -1) { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/AbstractFetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/AbstractFetcher.java index 872f603f0..9bd73c9d5 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/AbstractFetcher.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/AbstractFetcher.java @@ -17,6 +17,7 @@ package org.apache.tika.pipes.core.fetcher; import org.apache.tika.config.Field; +import org.apache.tika.pipes.api.fetcher.Fetcher; public abstract class AbstractFetcher implements Fetcher { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/EmptyFetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/EmptyFetcher.java index 8e604662a..cd114ee00 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/EmptyFetcher.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/EmptyFetcher.java @@ -19,12 +19,19 @@ package org.apache.tika.pipes.core.fetcher; import java.io.IOException; import java.io.InputStream; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.api.fetcher.Fetcher; public class EmptyFetcher implements Fetcher { + @Override + public void loadDefaultConfig(InputStream is) throws TikaConfigException { + //no-op + } + @Override public String getName() { return "empty"; diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/FetcherManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/FetcherManager.java index 7eff996ef..67d5eae2d 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/FetcherManager.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/FetcherManager.java @@ -20,45 +20,84 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import org.apache.tika.config.ConfigBase; +import org.pf4j.DefaultPluginManager; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; +import org.apache.tika.pipes.api.fetcher.Fetcher; /** * Utility class to hold multiple fetchers. * <p> * This forbids multiple fetchers supporting the same name. */ -public class FetcherManager extends ConfigBase { +public class FetcherManager { + + private static final Logger LOG = LoggerFactory.getLogger(FetcherManager.class); - public static FetcherManager load(Path p) throws IOException, TikaConfigException { - try (InputStream is = - Files.newInputStream(p)) { - return FetcherManager.buildComposite("fetchers", FetcherManager.class, - "fetcher", Fetcher.class, is); + public static FetcherManager load() throws IOException, TikaConfigException { + PluginManager pluginManager = new DefaultPluginManager(); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + List<Path> pluginRoots = pluginManager.getPluginsRoots(); + Map<String, Fetcher> fetcherMap = new HashMap<>(); + List<Fetcher> fetchers = pluginManager.getExtensions(Fetcher.class); + System.out.println("HERE " + fetchers.size()); + //if (LOG.isDebugEnabled()) { + loadDebug(pluginRoots, fetchers); + //} + for (Fetcher fetcher : pluginManager.getExtensions(Fetcher.class)) { + Path p = findConfig(pluginRoots, fetcher.getName()); + if (p == null) { + LOG.warn("couldn't find config for {}", fetcher.getName()); + } else { + try (InputStream is = Files.newInputStream(p)) { + fetcher.loadDefaultConfig(is); + } + } + fetcherMap.put(fetcher.getName(), fetcher); } + return new FetcherManager(fetcherMap); + } - private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>(); - public FetcherManager(List<Fetcher> fetchers) throws TikaConfigException { - for (Fetcher fetcher : fetchers) { - String name = fetcher.getName(); - if (name == null || name.isBlank()) { - throw new TikaConfigException("fetcher name must not be blank"); - } - if (fetcherMap.containsKey(fetcher.getName())) { - throw new TikaConfigException( - "Multiple fetchers cannot support the same prefix: " + fetcher.getName()); + private static void loadDebug(List<Path> pluginRoots, List<Fetcher> fetchers) { + for (Path p : pluginRoots) { + LOG.warn("plugin root: {}", p.toAbsolutePath()); + } + LOG.warn("loaded {} fetchers", fetchers.size()); + for (Fetcher f : fetchers) { + LOG.warn("fetcher name={} class={}", f.getName(), f.getClass()); + } + } + + private static Path findConfig(List<Path> pluginRoots, String name) { + String target = name + ".json"; + for (Path p : pluginRoots) { + Path candidate = p.toAbsolutePath().resolve(target); + if (Files.exists(candidate)) { + return candidate; } - fetcherMap.put(fetcher.getName(), fetcher); } + return null; + } + + private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>(); + + private FetcherManager(Map<String, Fetcher> fetcherMap) throws TikaConfigException { + this.fetcherMap.putAll(fetcherMap); } + public Fetcher getFetcher(String fetcherName) throws IOException, TikaException { Fetcher fetcher = fetcherMap.get(fetcherName); if (fetcher == null) { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java deleted file mode 100644 index c93ce297d..000000000 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.fetcher.url; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.util.Locale; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.AbstractFetcher; - -/** - * Simple fetcher for URLs. This simply calls {@link TikaInputStream#get(URL)}. - * This intentionally does not support fetching for files. - * Please use the FileSystemFetcher for that. If you need more advanced control (passwords, - * timeouts, proxies, etc), please use the tika-fetcher-http module. - */ -public class UrlFetcher extends AbstractFetcher { - - @Override - public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException, TikaException { - if (fetchKey.contains("\u0000")) { - throw new IllegalArgumentException("URL must not contain \u0000. " + - "Please review the life decisions that led you to requesting " + - "a URL with this character in it."); - } - if (fetchKey.toLowerCase(Locale.US).trim().startsWith("file:")) { - throw new IllegalArgumentException( - "The UrlFetcher does not fetch from file shares; " + - "please use the FileSystemFetcher"); - } - return TikaInputStream.get(new URL(fetchKey), metadata); - } - -} diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java index f116bb65f..424196fde 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java @@ -33,11 +33,11 @@ import org.junit.jupiter.api.io.TempDir; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.api.fetcher.Fetcher; import org.apache.tika.pipes.core.emitter.EmitKey; import org.apache.tika.pipes.core.extractor.BasicEmbeddedDocumentBytesHandler; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.core.fetcher.FetchKey; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; public class PipesServerTest extends TikaTest { @@ -71,7 +71,7 @@ public class PipesServerTest extends TikaTest { FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", new FetchKey("fs", "mock.xml"), new EmitKey("", "")); - Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + Fetcher fetcher = FetcherManager.load().getFetcher(); PipesServer.MetadataListAndEmbeddedBytes parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); assertEquals("5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd", @@ -111,7 +111,7 @@ public class PipesServerTest extends TikaTest { FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", new FetchKey("fs", "mock.xml"), new EmitKey("", ""), new Metadata(), parseContext); - Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + Fetcher fetcher = FetcherManager.load().getFetcher(); PipesServer.MetadataListAndEmbeddedBytes parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); assertEquals(2, parseData.metadataList.size()); @@ -168,7 +168,7 @@ public class PipesServerTest extends TikaTest { new FetchKey("fs", "mock.xml"), new EmitKey("", ""), new Metadata(), parseContext); - Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + Fetcher fetcher = FetcherManager.load().getFetcher(); PipesServer.MetadataListAndEmbeddedBytes parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); assertEquals(2, parseData.metadataList.size()); diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/TikaPipesConfigTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/TikaPipesConfigTest.java index 3deb66a1e..d231a18df 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/TikaPipesConfigTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/TikaPipesConfigTest.java @@ -33,14 +33,12 @@ import org.apache.tika.pipes.core.async.AsyncConfig; import org.apache.tika.pipes.core.async.MockReporter; import org.apache.tika.pipes.core.emitter.Emitter; import org.apache.tika.pipes.core.emitter.EmitterManager; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; -import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher; public class TikaPipesConfigTest extends AbstractTikaConfigTest { //this handles tests for the newer pipes type configs. - +/* @Test public void testFetchers() throws Exception { FetcherManager m = FetcherManager.load(getConfigFilePath("fetchers-config.xml")); @@ -74,7 +72,7 @@ public class TikaPipesConfigTest extends AbstractTikaConfigTest { FetcherManager fetcherManager = FetcherManager.load( getConfigFilePath("fetchers-nobasepath-config.xml")); - } + }*/ @Test public void testEmitters() throws Exception { diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/async/MockFetcher.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/async/MockFetcher.java index afcd6a33a..be05f8b73 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/async/MockFetcher.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/async/MockFetcher.java @@ -21,10 +21,11 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.Fetcher; +import org.apache.tika.pipes.api.fetcher.Fetcher; public class MockFetcher implements Fetcher { @@ -32,6 +33,11 @@ public class MockFetcher implements Fetcher { "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>" + "<write element=\"p\">main_content</write>" + "</mock>").getBytes(StandardCharsets.UTF_8); + @Override + public void loadDefaultConfig(InputStream is) throws TikaConfigException { + //no-op + } + @Override public String getName() { return "mock"; diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/fetcher/MockFetcher.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/fetcher/MockFetcher.java index a1f6ac548..71a662438 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/fetcher/MockFetcher.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/fetcher/MockFetcher.java @@ -31,7 +31,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public class MockFetcher extends AbstractFetcher implements Initializable { +public class MockFetcher extends AbstractFetcher{ private Map<String, Param> params; @@ -50,20 +50,12 @@ public class MockFetcher extends AbstractFetcher implements Initializable { this.byteString = byteString; } - @Override - public void initialize(Map<String, Param> params) throws TikaConfigException { - this.params = params; - } @Override - public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - if (throwOnCheck) { - throw new TikaConfigException("throw on check"); - } + public void loadDefaultConfig(InputStream is) throws TikaConfigException { + //no-op } - @Override public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException { return byteString == null ? new ByteArrayInputStream(new byte[0]) : diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java index 66e4b69fc..571da5b8d 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java @@ -32,9 +32,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.fetcher.Fetcher; import org.apache.tika.pipes.core.fetcher.FetcherManager; -import org.apache.tika.pipes.core.fetcher.RangeFetcher; import org.apache.tika.server.core.resource.TikaResource; /**
