This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2b9ba8612 TIKA-3885: Add a tika-async-cli module
2b9ba8612 is described below
commit 2b9ba8612b20d2779863f08b908e89cc001b483f
Author: tballison <[email protected]>
AuthorDate: Wed Oct 19 10:49:52 2022 -0400
TIKA-3885: Add a tika-async-cli module
---
CHANGES.txt | 2 +
pom.xml | 2 +-
tika-app/pom.xml | 5 +
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +-
tika-bom/pom.xml | 6 +-
.../apache/tika/pipes/async/AsyncProcessor.java | 24 ----
tika-pipes/pom.xml | 1 +
tika-pipes/tika-async-cli/pom.xml | 136 +++++++++++++++++++++
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 60 +++++++++
9 files changed, 212 insertions(+), 28 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 472ab79df..5370bc65e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.5.1 - ???
+ * Add a tika-async-cli module (TIKA-3885).
+
* Fetch keys sent via headers to tika server are now URL decoded
(TIKA-3864).
diff --git a/pom.xml b/pom.xml
index 935922d79..b75efbdaf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -44,8 +44,8 @@
<module>tika-xmp</module>
<module>tika-batch</module>
<module>tika-langdetect</module>
- <module>tika-app</module>
<module>tika-pipes</module>
+ <module>tika-app</module>
<module>tika-server</module>
<module>tika-integration-tests</module>
<module>tika-eval</module>
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 94248b041..7cc33ca2e 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -73,6 +73,11 @@
<artifactId>tika-emitter-fs</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-async-cli</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<!-- logging -->
<dependency>
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index f174b674a..ee7feacfc 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -66,6 +66,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.Tika;
+import org.apache.tika.async.cli.TikaAsyncCLI;
import org.apache.tika.batch.BatchProcessDriverCLI;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.TikaConfigSerializer;
@@ -99,7 +100,6 @@ import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.apache.tika.pipes.async.AsyncProcessor;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
@@ -282,7 +282,7 @@ public class TikaCLI {
tikaConfigPath = arg.substring(config.length());
}
}
- AsyncProcessor.main(new String[]{ tikaConfigPath});
+ TikaAsyncCLI.main(new String[]{ tikaConfigPath});
}
/**
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index 71b528a80..2f325bdf4 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -375,7 +375,11 @@
<artifactId>tika-pipes-iterator-solr</artifactId>
<version>2.5.1-SNAPSHOT</version>
</dependency>
-
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-async-cli</artifactId>
+ <version>2.5.1-SNAPSHOT</version>
+ </dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-httpclient-commons</artifactId>
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java
index 7a948045d..fee2ba37e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java
@@ -19,7 +19,6 @@ package org.apache.tika.pipes.async;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
@@ -319,27 +318,4 @@ public class AsyncProcessor implements Closeable {
}
}
}
-
- public static void main(String[] args) throws Exception {
- Path tikaConfigPath = Paths.get(args[0]);
- PipesIterator pipesIterator = PipesIterator.build(tikaConfigPath);
- long start = System.currentTimeMillis();
- try (AsyncProcessor processor = new AsyncProcessor(tikaConfigPath, pipesIterator)) {
-
- for (FetchEmitTuple t : pipesIterator) {
- processor.offer(t, 2000);
- }
- processor.finished();
- while (true) {
- if (processor.checkActive()) {
- Thread.sleep(500);
- } else {
- break;
- }
- }
- long elapsed = System.currentTimeMillis() - start;
- LOG.info("Successfully finished processing {} files in {} ms",
- processor.getTotalProcessed(), elapsed);
- }
- }
}
diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml
index b6876578a..dbfd4f6d7 100644
--- a/tika-pipes/pom.xml
+++ b/tika-pipes/pom.xml
@@ -35,6 +35,7 @@
<module>tika-emitters</module>
<module>tika-pipes-iterators</module>
<module>tika-pipes-reporters</module>
+ <module>tika-async-cli</module>
</modules>
<dependencyManagement>
diff --git a/tika-pipes/tika-async-cli/pom.xml b/tika-pipes/tika-async-cli/pom.xml
new file mode 100644
index 000000000..f5d244075
--- /dev/null
+++ b/tika-pipes/tika-async-cli/pom.xml
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-pipes</artifactId>
+ <version>2.5.1-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>tika-async-cli</artifactId>
+
+ <name>Apache Tika Async CLI</name>
+ <url>https://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- logging -->
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j2-impl</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-emitter-fs</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-serialization</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <!-- Module name follows this artifact's own Java package (org.apache.tika.async.cli); -->
+ <!-- the previous value, org.apache.tika.pipes.reporters.fs.status, names a different module. -->
+ <Automatic-Module-Name>org.apache.tika.async.cli</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+ <mainClass>org.apache.tika.async.cli.TikaAsyncCLI</mainClass>
+ <manifestEntries>
+ <Multi-Release>true</Multi-Release>
+ </manifestEntries>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <scm>
+ <tag>2.2.1-rc2</tag>
+ </scm>
+</project>
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
new file mode 100644
index 000000000..4490e33fd
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.TimeoutException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.async.AsyncProcessor;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+
+/**
+ * Command-line entry point that drives an {@link AsyncProcessor} from a single
+ * configuration file.
+ * <p>
+ * The same path is used to build the {@link PipesIterator} and to construct the
+ * {@link AsyncProcessor}. Every {@link FetchEmitTuple} produced by the iterator is
+ * offered to the processor, after which this class polls until the processor
+ * reports that it is no longer active.
+ */
+public class TikaAsyncCLI {
+
+    /** Timeout handed to {@link AsyncProcessor#offer} for each tuple (presumably milliseconds). */
+    private static final long TIMEOUT_MS = 600_000;
+    private static final Logger LOG = LoggerFactory.getLogger(TikaAsyncCLI.class);
+
+    /**
+     * Runs the async pipeline to completion.
+     *
+     * @param args exactly one element is read: {@code args[0]}, the path to the
+     *             configuration file
+     * @throws IllegalArgumentException if {@code args} is null, empty, or its first
+     *                                  element is null
+     * @throws TimeoutException         if the processor does not accept a tuple
+     *                                  within {@link #TIMEOUT_MS}
+     * @throws Exception                anything propagated from building the iterator or
+     *                                  running/closing the processor
+     */
+    public static void main(String[] args) throws Exception {
+        // Fail with a usage hint instead of a bare ArrayIndexOutOfBounds/NullPointerException.
+        if (args == null || args.length == 0 || args[0] == null) {
+            throw new IllegalArgumentException("Usage: TikaAsyncCLI <path-to-tika-config>");
+        }
+        Path tikaConfigPath = Paths.get(args[0]);
+        PipesIterator pipesIterator = PipesIterator.build(tikaConfigPath);
+        long start = System.currentTimeMillis();
+        try (AsyncProcessor processor = new AsyncProcessor(tikaConfigPath, pipesIterator)) {
+
+            for (FetchEmitTuple t : pipesIterator) {
+                boolean offered = processor.offer(t, TIMEOUT_MS);
+                if (!offered) {
+                    throw new TimeoutException("timed out waiting to add a fetch emit tuple");
+                }
+            }
+            // Signal that no more tuples are coming, then wait for in-flight work to drain.
+            processor.finished();
+            while (processor.checkActive()) {
+                Thread.sleep(500);
+            }
+            long elapsed = System.currentTimeMillis() - start;
+            LOG.info("Successfully finished processing {} files in {} ms",
+                    processor.getTotalProcessed(), elapsed);
+        }
+    }
+}