This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4520 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8b716deb1a4682277de107d9eaf885e163b11fae Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri Oct 10 10:05:20 2025 -0500 refactor: rename integration test resources and update pom.xml for tika-pipes modules --- tika-integration-tests/pom.xml | 4 - tika-parent/pom.xml | 26 +- tika-pipes/pom.xml | 1 + .../tika-pipes-integration-tests}/pom.xml | 41 +- .../tika-pipes-integration-test-base/pom.xml | 32 + .../tika-pipes-kafka-integration-tests/README.md | 0 .../tika-pipes-kafka-integration-tests/pom.xml | 2 +- .../tika/pipes/kafka/tests/TikaPipesKafkaTest.java | 0 .../src/test/resources/log4j2.xml | 0 .../resources/pipes-fork-server-custom-log4j2.xml | 0 .../src/test/resources/tika-config-kafka.xml | 0 .../pom.xml | 2 +- .../pipes/opensearch/tests/OpenSearchTest.java | 0 .../opensearch/tests/OpensearchTestClient.java | 0 .../resources/opensearch/opensearch-mappings.json | 0 .../opensearch-parent-child-mappings.json | 0 .../opensearch/tika-config-opensearch.xml | 0 .../resources/pipes-fork-server-custom-log4j2.xml | 0 .../src/test/resources/test-documents/fake_oom.xml | 0 .../src/test/resources/test-documents/npe.xml | 0 .../src/test/resources/test-documents/oom.xml | 0 .../test-documents/test_recursive_embedded.docx | Bin .../tika-pipes-s3-integration-tests/pom.xml | 3 +- .../tika/pipes/s3/tests/PipeIntegrationTests.java | 0 .../tika/pipes/s3/tests/S3PipeIntegrationTest.java | 0 .../src/test/resources/docker-compose.yml | 0 .../src/test/resources/log4j2.xml | 0 .../resources/pipes-fork-server-custom-log4j2.xml | 0 .../resources/tika-config-s3-integration-test.xml | 0 .../src/test/resources/tika-config-s3ToFs.xml | 0 .../src/test/resources/tika-config-s3Tos3.xml | 0 .../tika-pipes-solr-integration-tests/pom.xml | 4 +- .../tika/pipes/solr/tests/TikaPipesSolr8Test.java | 0 .../pipes/solr/tests/TikaPipesSolr8ZkTest.java | 0 .../tika/pipes/solr/tests/TikaPipesSolr9Test.java | 0 .../pipes/solr/tests/TikaPipesSolr9ZkTest.java | 0 
.../pipes/solr/tests/TikaPipesSolrTestBase.java | 0 .../src/test/resources/embedded/embedded.docx | Bin .../src/test/resources/log4j2.xml | 0 .../src/test/resources/logback.xml | 0 .../resources/pipes-fork-server-custom-log4j2.xml | 0 .../test/resources/tika-async-log4j2.properties | 0 .../src/test/resources/tika-config-solr-urls.xml | 0 tika-server/pom.xml | 1 + tika-server/tika-client-grpc/README.md | 313 +++++++++ tika-server/tika-client-grpc/pom.xml | 34 + .../apache/tika/grpc/client/TikaGrpcClient.java | 755 +++++++++++++++++++++ .../grpc/client/config/TikaGrpcClientConfig.java | 243 +++++++ .../client/exception/TikaGrpcClientException.java | 56 ++ .../tika/grpc/client/TikaGrpcClientTest.java | 113 +++ .../tika/server/config/ParseContextConfig.java | 56 ++ .../tika/server/config/TikaConfigLoader.java | 58 ++ .../tika/server/controller/DetectorController.java | 7 - .../tika/server/controller/MetadataController.java | 77 ++- .../RecursiveMetadataAndContentController.java | 7 - .../server/controller/ServerStatusController.java | 8 - .../tika/server/controller/TikaController.java | 7 - .../controller/TranslateResourceController.java | 8 - .../controller/UnpackResourceController.java | 8 - .../server/controller/XmpMetadataController.java | 10 +- .../TikaServerParseException.java} | 35 +- .../tika/server/service/TikaLoggingService.java | 47 ++ .../server/service/TikaParseContextService.java | 66 ++ .../tika/server/service/TikaParsingService.java | 184 +++++ 64 files changed, 2069 insertions(+), 139 deletions(-) diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml index 8e472929f..a103b1965 100644 --- a/tika-integration-tests/pom.xml +++ b/tika-integration-tests/pom.xml @@ -32,11 +32,7 @@ <packaging>pom</packaging> <modules> - <module>tika-pipes-solr-integration-tests</module> - <module>tika-pipes-opensearch-integration-tests</module> - <module>tika-pipes-s3-integration-tests</module> <module>tika-resource-loading-tests</module> - 
<module>tika-pipes-kafka-integration-tests</module> <module>tika-woodstox-tests</module> </modules> diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index dab3cc3db..0ebd3dd34 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -471,6 +471,13 @@ <dependencyManagement> <dependencies> + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-dependencies</artifactId> + <version>3.4.1</version> + <type>pom</type> + <scope>import</scope> + </dependency> <dependency> <groupId>org.codehaus.plexus</groupId> <artifactId>plexus-utils</artifactId> @@ -873,16 +880,6 @@ <artifactId>junit</artifactId> <version>${junit4.version}</version> </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-api</artifactId> - <version>${junit5.version}</version> - </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-engine</artifactId> - <version>${junit5.version}</version> - </dependency> <dependency> <groupId>net.java.dev.jna</groupId> <artifactId>jna</artifactId> @@ -1194,13 +1191,8 @@ <dependencies> <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-api</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-engine</artifactId> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-test</artifactId> <scope>test</scope> </dependency> </dependencies> diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index f8dcc35ea..058033b3a 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -70,5 +70,6 @@ <module>tika-pipes-emitters</module> <module>tika-pipes-proto</module> <module>tika-pipes-cli</module> + <module>tika-pipes-integration-tests</module> </modules> </project> diff --git a/tika-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/pom.xml similarity index 63% copy from tika-integration-tests/pom.xml copy to 
tika-pipes/tika-pipes-integration-tests/pom.xml index 8e472929f..062550bf3 100644 --- a/tika-integration-tests/pom.xml +++ b/tika-pipes/tika-pipes-integration-tests/pom.xml @@ -20,14 +20,14 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> <groupId>org.apache.tika</groupId> - <artifactId>tika-parent</artifactId> + <artifactId>tika-pipes</artifactId> <version>4.0.0-SNAPSHOT</version> - <relativePath>../tika-parent/pom.xml</relativePath> + <relativePath>../pom.xml</relativePath> </parent> <modelVersion>4.0.0</modelVersion> - <artifactId>tika-integration-tests</artifactId> - <name>Apache Tika integration tests</name> + <artifactId>tika-pipes-integration-tests</artifactId> + <name>Apache Tika Pipes integration tests</name> <packaging>pom</packaging> @@ -35,18 +35,47 @@ <module>tika-pipes-solr-integration-tests</module> <module>tika-pipes-opensearch-integration-tests</module> <module>tika-pipes-s3-integration-tests</module> - <module>tika-resource-loading-tests</module> <module>tika-pipes-kafka-integration-tests</module> - <module>tika-woodstox-tests</module> + <module>tika-pipes-integration-test-base</module> </modules> <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-integration-test-base</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-core</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-core</artifactId> + <version>${project.version}</version> + 
<scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-emitters-core</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-fetchers-core</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-serialization</artifactId> diff --git a/tika-pipes/tika-pipes-integration-tests/tika-pipes-integration-test-base/pom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-integration-test-base/pom.xml new file mode 100644 index 000000000..10b89bd21 --- /dev/null +++ b/tika-pipes/tika-pipes-integration-tests/tika-pipes-integration-test-base/pom.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes-integration-tests</artifactId> + <version>4.0.0-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>tika-pipes-integration-test-base</artifactId> + <name>Tika Pipes Integration Test Base Library</name> + <description>Tika Integration test common utilities.</description> + <properties> + <maven.compiler.release>17</maven.compiler.release> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-server-grpc</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId>org.testcontainers</groupId> + <artifactId>testcontainers</artifactId> + </dependency> + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-test</artifactId> + </dependency> 
+ </dependencies> +</project> diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/README.md b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/README.md similarity index 100% rename from tika-integration-tests/tika-pipes-kafka-integration-tests/README.md rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/README.md diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/pom.xml similarity index 97% rename from tika-integration-tests/tika-pipes-kafka-integration-tests/pom.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/pom.xml index 8857cc91d..7905ddebf 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/pom.xml +++ b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/pom.xml @@ -19,7 +19,7 @@ --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> - <artifactId>tika-integration-tests</artifactId> + <artifactId>tika-pipes-integration-tests</artifactId> <groupId>org.apache.tika</groupId> <version>4.0.0-SNAPSHOT</version> </parent> diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java similarity index 100% rename from tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java diff --git 
a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/log4j2.xml similarity index 100% rename from tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/log4j2.xml diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml similarity index 100% rename from tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml similarity index 100% rename from tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml similarity index 98% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml index 89322c85f..08d14ce1e 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml +++ 
b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml @@ -19,7 +19,7 @@ --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> - <artifactId>tika-integration-tests</artifactId> + <artifactId>tika-pipes-integration-tests</artifactId> <groupId>org.apache.tika</groupId> <version>4.0.0-SNAPSHOT</version> </parent> diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-mappings.json 
b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-mappings.json similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-mappings.json rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-mappings.json diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml similarity index 100% rename from 
tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/fake_oom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/fake_oom.xml similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/fake_oom.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/fake_oom.xml diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/npe.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/npe.xml similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/npe.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/npe.xml diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/oom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/oom.xml similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/oom.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/oom.xml diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx 
b/tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx similarity index 100% rename from tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/test-documents/test_recursive_embedded.docx diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/pom.xml similarity index 96% rename from tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/pom.xml index 8137e8154..fb53d8300 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml +++ b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/pom.xml @@ -19,9 +19,10 @@ --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> - <artifactId>tika-integration-tests</artifactId> <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes-integration-tests</artifactId> <version>4.0.0-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> </parent> <modelVersion>4.0.0</modelVersion> diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java rename to 
tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/docker-compose.yml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/docker-compose.yml similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/docker-compose.yml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/docker-compose.yml diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/log4j2.xml similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/log4j2.xml diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml similarity index 100% rename from 
tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml similarity index 100% rename from tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/pom.xml similarity index 97% rename from 
tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/pom.xml index a27132262..ce3e91bb9 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml +++ b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/pom.xml @@ -19,7 +19,7 @@ --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> - <artifactId>tika-integration-tests</artifactId> + <artifactId>tika-pipes-integration-tests</artifactId> <groupId>org.apache.tika</groupId> <version>4.0.0-SNAPSHOT</version> </parent> @@ -70,4 +70,4 @@ <scm> <tag>3.0.0-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java rename to 
tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java diff --git 
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/embedded/embedded.docx diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/logback.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/logback.xml similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/logback.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/logback.xml diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/pipes-fork-server-custom-log4j2.xml diff --git 
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-async-log4j2.properties b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-async-log4j2.properties similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-async-log4j2.properties rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-async-log4j2.properties diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml b/tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml similarity index 100% rename from tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml rename to tika-pipes/tika-pipes-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml diff --git a/tika-server/pom.xml b/tika-server/pom.xml index 68fcac963..ee0d394ac 100644 --- a/tika-server/pom.xml +++ b/tika-server/pom.xml @@ -39,6 +39,7 @@ <modules> <module>tika-server-spring</module> <module>tika-server-grpc</module> + <module>tika-client-grpc</module> </modules> <parent> diff --git a/tika-server/tika-client-grpc/README.md b/tika-server/tika-client-grpc/README.md new file mode 100644 index 000000000..671c58c47 --- /dev/null +++ b/tika-server/tika-client-grpc/README.md @@ -0,0 +1,313 @@ +# Tika gRPC Client + +A Java client library for connecting to Apache Tika gRPC servers. This library provides a simple, easy-to-use interface for interacting with Tika gRPC services, abstracting away the complexity of gRPC communication while providing both synchronous and asynchronous operations. 
+ +## Features + +- **Easy-to-use API**: Simple method calls for all Tika gRPC operations +- **Synchronous and Asynchronous support**: Choose the right approach for your use case +- **Connection management**: Automatic connection handling with configurable timeouts and keep-alive +- **Type-safe**: Uses generated protocol buffer classes directly +- **Resource management**: Proper cleanup with try-with-resources support +- **Comprehensive coverage**: Supports all Tika gRPC operations (fetchers, emitters, pipe iterators, pipe jobs) + +## Installation + +Add the dependency to your Maven project: + +```xml +<dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-client-grpc</artifactId> + <version>4.0.0-SNAPSHOT</version> +</dependency> +``` + +For Gradle: + +```gradle +implementation 'org.apache.tika:tika-client-grpc:4.0.0-SNAPSHOT' +``` + +## Quick Start + +### Basic Usage + +```java +import org.apache.tika.grpc.client.TikaGrpcClient; +import org.apache.tika.grpc.client.config.TikaGrpcClientConfig; +import org.apache.tika.FetchAndParseReply; + +// Create client with default configuration (localhost:9090) +try (TikaGrpcClient client = TikaGrpcClient.createDefault()) { + + // Save a fetcher configuration + client.saveFetcher("my-fetcher", "file-system-fetcher", + "{\"basePath\": \"/tmp\"}"); + + // Fetch and parse a document + FetchAndParseReply result = client.fetchAndParse("my-fetcher", "document.pdf"); + + // Access the results + System.out.println("Status: " + result.getStatus()); + System.out.println("Metadata count: " + result.getMetadataCount()); + + // Access metadata fields + if (result.getMetadataCount() > 0) { + result.getMetadata(0).getFieldsMap().forEach((key, valueList) -> { + System.out.println(key + ": " + valueList.getValuesList()); + }); + } +} +``` + +### Custom Configuration + +```java +import org.apache.tika.grpc.client.TikaGrpcClient; +import org.apache.tika.grpc.client.config.TikaGrpcClientConfig; + +TikaGrpcClientConfig config = 
TikaGrpcClientConfig.builder() + .host("tika-server.example.com") + .port(9090) + .tlsEnabled(true) + .maxInboundMessageSize(16 * 1024 * 1024) // 16MB + .connectionTimeoutSeconds(30) + .keepAliveTimeSeconds(60) + .build(); + +try (TikaGrpcClient client = new TikaGrpcClient(config)) { + // Use the client... +} +``` + +## API Reference + +### Fetcher Operations + +```java +// Save a fetcher +String fetcherId = client.saveFetcher("my-fetcher", "file-system-fetcher", + "{\"basePath\": \"/documents\"}"); + +// Get fetcher info +GetFetcherReply fetcherInfo = client.getFetcher("my-fetcher"); +System.out.println("Plugin ID: " + fetcherInfo.getPluginId()); + +// List all fetchers +ListFetchersReply fetchers = client.listFetchers(1, 10); // page 1, 10 per page +for (GetFetcherReply fetcher : fetchers.getGetFetcherRepliesList()) { + System.out.println("Fetcher: " + fetcher.getFetcherId()); +} + +// Delete a fetcher +boolean deleted = client.deleteFetcher("my-fetcher"); + +// Get fetcher configuration schema +String schema = client.getFetcherConfigJsonSchema("file-system-fetcher"); +``` + +### Parse Operations + +```java +// Simple fetch and parse +FetchAndParseReply result = client.fetchAndParse("my-fetcher", "document.pdf"); + +// With additional configuration +FetchAndParseReply detailedResult = client.fetchAndParse( + "my-fetcher", + "document.pdf", + "{\"headers\": {\"Authorization\": \"Bearer token\"}}", // fetch metadata + "{\"source\": \"api\"}", // added metadata + "{\"maxStringLength\": 10000}" // parse context +); + +// Asynchronous parsing +CompletableFuture<FetchAndParseReply> future = + client.fetchAndParseAsync("my-fetcher", "document.pdf"); + +future.thenAccept(reply -> { + System.out.println("Async result: " + reply.getStatus()); +}).exceptionally(throwable -> { + System.err.println("Error: " + throwable.getMessage()); + return null; +}); +``` + +### Emitter Operations + +```java +// Save an emitter +String emitterId = client.saveEmitter("my-emitter", 
"file-system-emitter", + "{\"basePath\": \"/output\"}"); + +// Get emitter info +GetEmitterReply emitterInfo = client.getEmitter("my-emitter"); + +// List all emitters +ListEmittersReply emitters = client.listEmitters(1, 10); + +// Delete an emitter +boolean deleted = client.deleteEmitter("my-emitter"); + +// Get emitter configuration schema +String schema = client.getEmitterConfigJsonSchema("file-system-emitter"); +``` + +### Pipe Iterator Operations + +```java +// Save a pipe iterator +String iteratorId = client.savePipeIterator("my-iterator", "csv-pipe-iterator", + "{\"csvFile\": \"/path/to/files.csv\"}"); + +// Get pipe iterator info +GetPipeIteratorReply iteratorInfo = client.getPipeIterator("my-iterator"); + +// List all pipe iterators +ListPipeIteratorsReply iterators = client.listPipeIterators(1, 10); + +// Delete a pipe iterator +boolean deleted = client.deletePipeIterator("my-iterator"); + +// Get pipe iterator configuration schema +String schema = client.getPipeIteratorConfigJsonSchema("csv-pipe-iterator"); +``` + +### Pipe Job Operations + +```java +// Run a pipe job +String jobId = client.runPipeJob( + "my-iterator", // pipe iterator ID + "my-fetcher", // fetcher ID + "my-emitter", // emitter ID + 3600 // timeout in seconds +); + +// Check job status +GetPipeJobReply jobStatus = client.getPipeJob(jobId); +System.out.println("Job running: " + jobStatus.getIsRunning()); +System.out.println("Job completed: " + jobStatus.getIsCompleted()); +System.out.println("Job has error: " + jobStatus.getHasError()); +``` + +## Configuration Options + +| Option | Default | Description | +|--------|---------|-------------| +| `host` | `"localhost"` | Server hostname or IP address | +| `port` | `9090` | Server port | +| `tlsEnabled` | `false` | Enable TLS encryption | +| `maxInboundMessageSize` | `4MB` | Maximum message size | +| `connectionTimeoutSeconds` | `30` | Connection timeout | +| `keepAliveTimeSeconds` | `30` | Keep-alive interval | +| `keepAliveTimeoutSeconds` 
| `5` | Keep-alive timeout | + +## Error Handling + +All client operations throw `TikaGrpcClientException` for any errors: + +```java +try { + FetchAndParseReply result = client.fetchAndParse("my-fetcher", "document.pdf"); +} catch (TikaGrpcClientException e) { + System.err.println("gRPC operation failed: " + e.getMessage()); + + // Access the underlying cause if needed + Throwable cause = e.getCause(); + if (cause instanceof StatusRuntimeException) { + StatusRuntimeException grpcError = (StatusRuntimeException) cause; + System.err.println("gRPC status: " + grpcError.getStatus()); + } +} +``` + +## Connection Health + +Check if the connection is healthy: + +```java +if (client.isConnected()) { + System.out.println("Client is connected to the server"); +} else { + System.out.println("Client is not connected"); +} +``` + +## Working with Metadata + +The `FetchAndParseReply` contains rich metadata information: + +```java +FetchAndParseReply result = client.fetchAndParse("my-fetcher", "document.pdf"); + +// Check status +if ("SUCCESS".equals(result.getStatus())) { + // Process metadata + for (int i = 0; i < result.getMetadataCount(); i++) { + Metadata metadata = result.getMetadata(i); + + metadata.getFieldsMap().forEach((fieldName, valueList) -> { + System.out.print(fieldName + ": "); + + // Handle multiple values per field + for (Value value : valueList.getValuesList()) { + switch (value.getValueCase()) { + case STRING_VALUE: + System.out.print(value.getStringValue() + " "); + break; + case INT_VALUE: + System.out.print(value.getIntValue() + " "); + break; + case BOOL_VALUE: + System.out.print(value.getBoolValue() + " "); + break; + case DOUBLE_VALUE: + System.out.print(value.getDoubleValue() + " "); + break; + default: + System.out.print("null "); + } + } + System.out.println(); + }); + } +} else { + System.err.println("Parse failed with status: " + result.getStatus()); + if (!result.getErrorMessage().isEmpty()) { + System.err.println("Error: " + 
result.getErrorMessage()); + } +} +``` + +## Best Practices + +1. **Use try-with-resources**: Always use try-with-resources or manually call `close()` to properly clean up connections. + +2. **Configure timeouts**: Set appropriate timeouts based on your document sizes and network conditions. + +3. **Handle errors gracefully**: Wrap operations in try-catch blocks and handle `TikaGrpcClientException`. + +4. **Reuse client instances**: Create one client instance and reuse it for multiple operations rather than creating new instances. + +5. **Check connection health**: Use `isConnected()` to verify connectivity before critical operations; note that it issues a real `listFetchers` request to the server. + +6. **Configure message sizes**: Increase `maxInboundMessageSize` if you're processing large documents. + +## Thread Safety + +The `TikaGrpcClient` is thread-safe and can be used concurrently from multiple threads. The underlying gRPC channel handles concurrent requests efficiently. + +## Examples + +See the `src/test/java` directory for comprehensive examples and integration tests. + +## Requirements + +- Java 17 or later +- A running Tika gRPC server + +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. 
\ No newline at end of file diff --git a/tika-server/tika-client-grpc/pom.xml b/tika-server/tika-client-grpc/pom.xml new file mode 100644 index 000000000..e47e570e1 --- /dev/null +++ b/tika-server/tika-client-grpc/pom.xml @@ -0,0 +1,34 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> + <version>4.0.0-SNAPSHOT</version> + <relativePath>../../tika-parent/pom.xml</relativePath> + </parent> + <artifactId>tika-client-grpc</artifactId> + <name>Tika Grpc Server Client</name> + <description>Tika grpc server client that is capable of speaking with a Tika Grpc Server instance.</description> + <properties> + <java.version>17</java.version> + </properties> + <dependencies> + <dependency> + <groupId>org.projectlombok</groupId> + <artifactId>lombok</artifactId> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes-proto</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>${project.parent.version}</version> + </dependency> + </dependencies> +</project> diff --git a/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/TikaGrpcClient.java b/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/TikaGrpcClient.java new file mode 100644 index 000000000..caa3473fb --- /dev/null +++ b/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/TikaGrpcClient.java @@ -0,0 +1,755 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.grpc.client; + +import java.io.Closeable; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; + +import io.grpc.ManagedChannel; +import io.grpc.ManagedChannelBuilder; +import io.grpc.stub.StreamObserver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.DeleteEmitterReply; +import org.apache.tika.DeleteEmitterRequest; +import org.apache.tika.DeleteFetcherReply; +import org.apache.tika.DeleteFetcherRequest; +import org.apache.tika.DeletePipeIteratorReply; +import org.apache.tika.DeletePipeIteratorRequest; +import org.apache.tika.FetchAndParseReply; +import org.apache.tika.FetchAndParseRequest; +import org.apache.tika.GetEmitterConfigJsonSchemaReply; +import org.apache.tika.GetEmitterConfigJsonSchemaRequest; +import org.apache.tika.GetEmitterReply; +import org.apache.tika.GetEmitterRequest; +import org.apache.tika.GetFetcherConfigJsonSchemaReply; +import org.apache.tika.GetFetcherConfigJsonSchemaRequest; +import org.apache.tika.GetFetcherReply; +import org.apache.tika.GetFetcherRequest; +import org.apache.tika.GetPipeIteratorConfigJsonSchemaReply; +import org.apache.tika.GetPipeIteratorConfigJsonSchemaRequest; +import org.apache.tika.GetPipeIteratorReply; +import 
org.apache.tika.GetPipeIteratorRequest; +import org.apache.tika.GetPipeJobReply; +import org.apache.tika.GetPipeJobRequest; +import org.apache.tika.ListEmittersReply; +import org.apache.tika.ListEmittersRequest; +import org.apache.tika.ListFetchersReply; +import org.apache.tika.ListFetchersRequest; +import org.apache.tika.ListPipeIteratorsReply; +import org.apache.tika.ListPipeIteratorsRequest; +import org.apache.tika.RunPipeJobReply; +import org.apache.tika.RunPipeJobRequest; +import org.apache.tika.SaveEmitterReply; +import org.apache.tika.SaveEmitterRequest; +import org.apache.tika.SaveFetcherReply; +import org.apache.tika.SaveFetcherRequest; +import org.apache.tika.SavePipeIteratorReply; +import org.apache.tika.SavePipeIteratorRequest; +import org.apache.tika.TikaGrpc; +import org.apache.tika.grpc.client.config.TikaGrpcClientConfig; +import org.apache.tika.grpc.client.exception.TikaGrpcClientException; + +/** + * A high-level client for connecting to Apache Tika gRPC servers. + * + * This client provides an easy-to-use interface for interacting with Tika gRPC services, + * abstracting away the complexity of gRPC communication while providing both synchronous + * and asynchronous operations. 
+ * + * Usage example: + * <pre> + * TikaGrpcClientConfig config = TikaGrpcClientConfig.builder() + * .host("localhost") + * .port(9090) + * .build(); + * + * try (TikaGrpcClient client = new TikaGrpcClient(config)) { + * // Save a fetcher configuration + * client.saveFetcher("my-fetcher", "file-system-fetcher", "{\"basePath\": \"/tmp\"}"); + * + * // Fetch and parse a document + * FetchAndParseReply result = client.fetchAndParse("my-fetcher", "document.pdf"); + * System.out.println("Status: " + result.getStatus()); + * } + * </pre> + */ +public class TikaGrpcClient implements Closeable { + + private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcClient.class); + + private final TikaGrpcClientConfig config; + private final ManagedChannel channel; + private final TikaGrpc.TikaBlockingStub blockingStub; + private final TikaGrpc.TikaStub asyncStub; + + private volatile boolean closed = false; + + /** + * Creates a new TikaGrpcClient with the specified configuration. + * + * @param config the client configuration + * @throws TikaGrpcClientException if the client cannot be initialized + */ + public TikaGrpcClient(TikaGrpcClientConfig config) throws TikaGrpcClientException { + this.config = config; + + try { + ManagedChannelBuilder<?> channelBuilder = ManagedChannelBuilder + .forAddress(config.getHost(), config.getPort()); + + if (!config.isTlsEnabled()) { + channelBuilder.usePlaintext(); + } + + // Apply additional channel configurations + if (config.getMaxInboundMessageSize() > 0) { + channelBuilder.maxInboundMessageSize(config.getMaxInboundMessageSize()); + } + if (config.getKeepAliveTimeSeconds() > 0) { + channelBuilder.keepAliveTime(config.getKeepAliveTimeSeconds(), TimeUnit.SECONDS); + } + if (config.getKeepAliveTimeoutSeconds() > 0) { + channelBuilder.keepAliveTimeout(config.getKeepAliveTimeoutSeconds(), TimeUnit.SECONDS); + } + + this.channel = channelBuilder.build(); + this.blockingStub = TikaGrpc.newBlockingStub(channel); + this.asyncStub = 
TikaGrpc.newStub(channel); + + LOG.info("TikaGrpcClient initialized for {}:{}", config.getHost(), config.getPort()); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to initialize TikaGrpcClient", e); + } + } + + /** + * Creates a TikaGrpcClient with default configuration (localhost:9090). + * + * @return a new TikaGrpcClient instance + * @throws TikaGrpcClientException if the client cannot be initialized + */ + public static TikaGrpcClient createDefault() throws TikaGrpcClientException { + return new TikaGrpcClient(TikaGrpcClientConfig.createDefault()); + } + + // Fetcher Operations + + /** + * Saves a fetcher configuration to the server. + * + * @param fetcherId unique identifier for the fetcher + * @param pluginId the plugin ID of the fetcher class + * @param fetcherConfigJson JSON configuration for the fetcher + * @return the fetcher ID that was saved + * @throws TikaGrpcClientException if the operation fails + */ + public String saveFetcher(String fetcherId, String pluginId, String fetcherConfigJson) + throws TikaGrpcClientException { + checkNotClosed(); + + try { + SaveFetcherRequest request = SaveFetcherRequest.newBuilder() + .setFetcherId(fetcherId) + .setPluginId(pluginId) + .setFetcherConfigJson(fetcherConfigJson) + .build(); + + SaveFetcherReply reply = blockingStub.saveFetcher(request); + return reply.getFetcherId(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to save fetcher: " + fetcherId, e); + } + } + + /** + * Retrieves fetcher information from the server. 
+ * + * @param fetcherId the ID of the fetcher to retrieve + * @return fetcher information + * @throws TikaGrpcClientException if the operation fails + */ + public GetFetcherReply getFetcher(String fetcherId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetFetcherRequest request = GetFetcherRequest.newBuilder() + .setFetcherId(fetcherId) + .build(); + + return blockingStub.getFetcher(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get fetcher: " + fetcherId, e); + } + } + + /** + * Lists all fetchers stored on the server. + * + * @param pageNumber the page number (starting from 1) + * @param pageSize the number of fetchers per page + * @return list of fetcher information + * @throws TikaGrpcClientException if the operation fails + */ + public ListFetchersReply listFetchers(int pageNumber, int pageSize) throws TikaGrpcClientException { + checkNotClosed(); + + try { + ListFetchersRequest request = ListFetchersRequest.newBuilder() + .setPageNumber(pageNumber) + .setNumFetchersPerPage(pageSize) + .build(); + + return blockingStub.listFetchers(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to list fetchers", e); + } + } + + /** + * Deletes a fetcher from the server. + * + * @param fetcherId the ID of the fetcher to delete + * @return true if the deletion was successful + * @throws TikaGrpcClientException if the operation fails + */ + public boolean deleteFetcher(String fetcherId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + DeleteFetcherRequest request = DeleteFetcherRequest.newBuilder() + .setFetcherId(fetcherId) + .build(); + + DeleteFetcherReply reply = blockingStub.deleteFetcher(request); + return reply.getSuccess(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to delete fetcher: " + fetcherId, e); + } + } + + /** + * Gets the JSON schema for a fetcher configuration. 
+ * + * @param pluginId the plugin ID of the fetcher + * @return the JSON schema as a string + * @throws TikaGrpcClientException if the operation fails + */ + public String getFetcherConfigJsonSchema(String pluginId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetFetcherConfigJsonSchemaRequest request = GetFetcherConfigJsonSchemaRequest.newBuilder() + .setPluginId(pluginId) + .build(); + + GetFetcherConfigJsonSchemaReply reply = blockingStub.getFetcherConfigJsonSchema(request); + return reply.getFetcherConfigJsonSchema(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get fetcher config schema for: " + pluginId, e); + } + } + + // Parse Operations + + /** + * Fetches and parses a document using the specified fetcher. + * + * @param fetcherId the ID of the fetcher to use + * @param fetchKey the key/path of the document to fetch + * @return the parse result containing metadata and status + * @throws TikaGrpcClientException if the operation fails + */ + public FetchAndParseReply fetchAndParse(String fetcherId, String fetchKey) + throws TikaGrpcClientException { + return fetchAndParse(fetcherId, fetchKey, null, null, null); + } + + /** + * Fetches and parses a document with additional configuration. 
+ * + * @param fetcherId the ID of the fetcher to use + * @param fetchKey the key/path of the document to fetch + * @param fetchMetadataJson additional fetch metadata (optional) + * @param addedMetadataJson additional metadata to add to the result (optional) + * @param parseContextJson custom parse context configuration (optional) + * @return the parse result containing metadata and status + * @throws TikaGrpcClientException if the operation fails + */ + public FetchAndParseReply fetchAndParse(String fetcherId, String fetchKey, + String fetchMetadataJson, String addedMetadataJson, + String parseContextJson) throws TikaGrpcClientException { + checkNotClosed(); + + try { + FetchAndParseRequest.Builder requestBuilder = FetchAndParseRequest.newBuilder() + .setFetcherId(fetcherId) + .setFetchKey(fetchKey); + + if (fetchMetadataJson != null) { + requestBuilder.setFetchMetadataJson(fetchMetadataJson); + } + if (addedMetadataJson != null) { + requestBuilder.setAddedMetadataJson(addedMetadataJson); + } + if (parseContextJson != null) { + requestBuilder.setParseContextJson(parseContextJson); + } + + return blockingStub.fetchAndParse(requestBuilder.build()); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to fetch and parse: " + fetchKey, e); + } + } + + /** + * Fetches and parses a document asynchronously. + * + * @param fetcherId the ID of the fetcher to use + * @param fetchKey the key/path of the document to fetch + * @return a CompletableFuture containing the parse result + */ + public CompletableFuture<FetchAndParseReply> fetchAndParseAsync(String fetcherId, String fetchKey) { + return fetchAndParseAsync(fetcherId, fetchKey, null, null, null); + } + + /** + * Fetches and parses a document asynchronously with additional configuration. 
+ * + * @param fetcherId the ID of the fetcher to use + * @param fetchKey the key/path of the document to fetch + * @param fetchMetadataJson additional fetch metadata (optional) + * @param addedMetadataJson additional metadata to add to the result (optional) + * @param parseContextJson custom parse context configuration (optional) + * @return a CompletableFuture containing the parse result, or completed exceptionally with a TikaGrpcClientException on failure + */ + public CompletableFuture<FetchAndParseReply> fetchAndParseAsync(String fetcherId, String fetchKey, + String fetchMetadataJson, String addedMetadataJson, + String parseContextJson) { + CompletableFuture<FetchAndParseReply> future = new CompletableFuture<>(); + + try { + checkNotClosed(); + } catch (TikaGrpcClientException e) { + future.completeExceptionally(e); + return future; + } + + try { + FetchAndParseRequest.Builder requestBuilder = FetchAndParseRequest.newBuilder() + .setFetcherId(fetcherId) + .setFetchKey(fetchKey); + + if (fetchMetadataJson != null) { + requestBuilder.setFetchMetadataJson(fetchMetadataJson); + } + if (addedMetadataJson != null) { + requestBuilder.setAddedMetadataJson(addedMetadataJson); + } + if (parseContextJson != null) { + requestBuilder.setParseContextJson(parseContextJson); + } + + asyncStub.fetchAndParse(requestBuilder.build(), new StreamObserver<>() { + @Override + public void onNext(FetchAndParseReply reply) { + future.complete(reply); + } + + @Override + public void onError(Throwable t) { + future.completeExceptionally(new TikaGrpcClientException("Async fetch and parse failed", t)); + } + + @Override + public void onCompleted() { + future.completeExceptionally(new TikaGrpcClientException("Async fetch and parse completed without a reply")); // no-op if onNext already completed the future + } + }); + + } catch (Exception e) { + future.completeExceptionally(new TikaGrpcClientException("Failed to start async fetch and parse", e)); + } + + return future; + } + + // Emitter Operations + + /** + * Saves an emitter configuration to the server. 
+ * + * @param emitterId unique identifier for the emitter + * @param pluginId the plugin ID of the emitter class + * @param emitterConfigJson JSON configuration for the emitter + * @return the emitter ID that was saved + * @throws TikaGrpcClientException if the operation fails + */ + public String saveEmitter(String emitterId, String pluginId, String emitterConfigJson) + throws TikaGrpcClientException { + checkNotClosed(); + + try { + SaveEmitterRequest request = SaveEmitterRequest.newBuilder() + .setEmitterId(emitterId) + .setPluginId(pluginId) + .setEmitterConfigJson(emitterConfigJson) + .build(); + + SaveEmitterReply reply = blockingStub.saveEmitter(request); + return reply.getEmitterId(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to save emitter: " + emitterId, e); + } + } + + /** + * Retrieves emitter information from the server. + * + * @param emitterId the ID of the emitter to retrieve + * @return emitter information + * @throws TikaGrpcClientException if the operation fails + */ + public GetEmitterReply getEmitter(String emitterId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetEmitterRequest request = GetEmitterRequest.newBuilder() + .setEmitterId(emitterId) + .build(); + + return blockingStub.getEmitter(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get emitter: " + emitterId, e); + } + } + + /** + * Lists all emitters stored on the server. 
+ * + * @param pageNumber the page number (starting from 1) + * @param pageSize the number of emitters per page + * @return list of emitter information + * @throws TikaGrpcClientException if the operation fails + */ + public ListEmittersReply listEmitters(int pageNumber, int pageSize) throws TikaGrpcClientException { + checkNotClosed(); + + try { + ListEmittersRequest request = ListEmittersRequest.newBuilder() + .setPageNumber(pageNumber) + .setNumEmittersPerPage(pageSize) + .build(); + + return blockingStub.listEmitters(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to list emitters", e); + } + } + + /** + * Deletes an emitter from the server. + * + * @param emitterId the ID of the emitter to delete + * @return true if the deletion was successful + * @throws TikaGrpcClientException if the operation fails + */ + public boolean deleteEmitter(String emitterId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + DeleteEmitterRequest request = DeleteEmitterRequest.newBuilder() + .setEmitterId(emitterId) + .build(); + + DeleteEmitterReply reply = blockingStub.deleteEmitter(request); + return reply.getSuccess(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to delete emitter: " + emitterId, e); + } + } + + /** + * Gets the JSON schema for an emitter configuration. 
+ * + * @param pluginId the plugin ID of the emitter + * @return the JSON schema as a string + * @throws TikaGrpcClientException if the operation fails + */ + public String getEmitterConfigJsonSchema(String pluginId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetEmitterConfigJsonSchemaRequest request = GetEmitterConfigJsonSchemaRequest.newBuilder() + .setPluginId(pluginId) + .build(); + + GetEmitterConfigJsonSchemaReply reply = blockingStub.getEmitterConfigJsonSchema(request); + return reply.getEmitterConfigJsonSchema(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get emitter config schema for: " + pluginId, e); + } + } + + // Pipe Iterator Operations + + /** + * Saves a pipe iterator configuration to the server. + * + * @param pipeIteratorId unique identifier for the pipe iterator + * @param pluginId the plugin ID of the pipe iterator class + * @param pipeIteratorConfigJson JSON configuration for the pipe iterator + * @return the pipe iterator ID that was saved + * @throws TikaGrpcClientException if the operation fails + */ + public String savePipeIterator(String pipeIteratorId, String pluginId, String pipeIteratorConfigJson) + throws TikaGrpcClientException { + checkNotClosed(); + + try { + SavePipeIteratorRequest request = SavePipeIteratorRequest.newBuilder() + .setPipeIteratorId(pipeIteratorId) + .setPluginId(pluginId) + .setPipeIteratorConfigJson(pipeIteratorConfigJson) + .build(); + + SavePipeIteratorReply reply = blockingStub.savePipeIterator(request); + return reply.getPipeIteratorId(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to save pipe iterator: " + pipeIteratorId, e); + } + } + + /** + * Retrieves pipe iterator information from the server. 
+ * + * @param pipeIteratorId the ID of the pipe iterator to retrieve + * @return pipe iterator information + * @throws TikaGrpcClientException if the operation fails + */ + public GetPipeIteratorReply getPipeIterator(String pipeIteratorId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetPipeIteratorRequest request = GetPipeIteratorRequest.newBuilder() + .setPipeIteratorId(pipeIteratorId) + .build(); + + return blockingStub.getPipeIterator(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get pipe iterator: " + pipeIteratorId, e); + } + } + + /** + * Lists all pipe iterators stored on the server. + * + * @param pageNumber the page number (starting from 1) + * @param pageSize the number of pipe iterators per page + * @return list of pipe iterator information + * @throws TikaGrpcClientException if the operation fails + */ + public ListPipeIteratorsReply listPipeIterators(int pageNumber, int pageSize) throws TikaGrpcClientException { + checkNotClosed(); + + try { + ListPipeIteratorsRequest request = ListPipeIteratorsRequest.newBuilder() + .setPageNumber(pageNumber) + .setNumPipeIteratorsPerPage(pageSize) + .build(); + + return blockingStub.listPipeIterators(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to list pipe iterators", e); + } + } + + /** + * Deletes a pipe iterator from the server. 
+ * + * @param pipeIteratorId the ID of the pipe iterator to delete + * @return true if the deletion was successful + * @throws TikaGrpcClientException if the operation fails + */ + public boolean deletePipeIterator(String pipeIteratorId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + DeletePipeIteratorRequest request = DeletePipeIteratorRequest.newBuilder() + .setPipeIteratorId(pipeIteratorId) + .build(); + + DeletePipeIteratorReply reply = blockingStub.deletePipeIterator(request); + return reply.getSuccess(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to delete pipe iterator: " + pipeIteratorId, e); + } + } + + /** + * Gets the JSON schema for a pipe iterator configuration. + * + * @param pluginId the plugin ID of the pipe iterator + * @return the JSON schema as a string + * @throws TikaGrpcClientException if the operation fails + */ + public String getPipeIteratorConfigJsonSchema(String pluginId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetPipeIteratorConfigJsonSchemaRequest request = GetPipeIteratorConfigJsonSchemaRequest.newBuilder() + .setPluginId(pluginId) + .build(); + + GetPipeIteratorConfigJsonSchemaReply reply = blockingStub.getPipeIteratorConfigJsonSchema(request); + return reply.getPipeIteratorConfigJsonSchema(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get pipe iterator config schema for: " + pluginId, e); + } + } + + // Pipe Job Operations + + /** + * Runs a pipe job and returns the job ID. 
+ * + * @param pipeIteratorId the ID of the pipe iterator to use + * @param fetcherId the ID of the fetcher to use + * @param emitterId the ID of the emitter to use + * @param timeoutSeconds hard timeout for job completion + * @return the job ID + * @throws TikaGrpcClientException if the operation fails + */ + public String runPipeJob(String pipeIteratorId, String fetcherId, String emitterId, int timeoutSeconds) + throws TikaGrpcClientException { + checkNotClosed(); + + try { + RunPipeJobRequest request = RunPipeJobRequest.newBuilder() + .setPipeIteratorId(pipeIteratorId) + .setFetcherId(fetcherId) + .setEmitterId(emitterId) + .setJobCompletionTimeoutSeconds(timeoutSeconds) + .build(); + + RunPipeJobReply reply = blockingStub.runPipeJob(request); + return reply.getPipeJobId(); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to run pipe job", e); + } + } + + /** + * Gets the status of a pipe job. + * + * @param pipeJobId the ID of the pipe job + * @return the pipe job status + * @throws TikaGrpcClientException if the operation fails + */ + public GetPipeJobReply getPipeJob(String pipeJobId) throws TikaGrpcClientException { + checkNotClosed(); + + try { + GetPipeJobRequest request = GetPipeJobRequest.newBuilder() + .setPipeJobId(pipeJobId) + .build(); + + return blockingStub.getPipeJob(request); + + } catch (Exception e) { + throw new TikaGrpcClientException("Failed to get pipe job: " + pipeJobId, e); + } + } + + // Utility Methods + + /** + * Checks if the connection to the server is healthy. + * + * @return true if the connection is healthy + */ + public boolean isConnected() { + if (closed) { + return false; + } + + try { + // Try a simple operation to test connectivity + listFetchers(1, 1); + return true; + } catch (Exception e) { + LOG.debug("Connection check failed", e); + return false; + } + } + + /** + * Gets the current client configuration. 
+ * + * @return the client configuration + */ + public TikaGrpcClientConfig getConfig() { + return config; + } + + private void checkNotClosed() throws TikaGrpcClientException { + if (closed) { + throw new TikaGrpcClientException("Client has been closed"); + } + } + + @Override + public void close() { + if (!closed) { + closed = true; + if (channel != null && !channel.isShutdown()) { + channel.shutdown(); + try { + if (!channel.awaitTermination(5, TimeUnit.SECONDS)) { + LOG.warn("Channel did not terminate within 5 seconds, forcing shutdown"); + channel.shutdownNow(); + } + } catch (InterruptedException e) { + LOG.warn("Interrupted while waiting for channel termination", e); + channel.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + LOG.info("TikaGrpcClient closed"); + } + } +} diff --git a/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/config/TikaGrpcClientConfig.java b/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/config/TikaGrpcClientConfig.java new file mode 100644 index 000000000..95e196650 --- /dev/null +++ b/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/config/TikaGrpcClientConfig.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.grpc.client.config; + +/** + * Configuration class for TikaGrpcClient. + * + * This class holds all the configuration parameters needed to connect to a Tika gRPC server. + * Use the builder pattern to create instances with custom settings. + * + * Example usage: + * <pre> + * TikaGrpcClientConfig config = TikaGrpcClientConfig.builder() + * .host("tika-server.example.com") + * .port(9090) + * .tlsEnabled(true) + * .maxInboundMessageSize(16 * 1024 * 1024) // 16MB + * .connectionTimeoutSeconds(30) + * .build(); + * </pre> + */ +public class TikaGrpcClientConfig { + + private static final String DEFAULT_HOST = "localhost"; + private static final int DEFAULT_PORT = 9090; + private static final boolean DEFAULT_TLS_ENABLED = false; + private static final int DEFAULT_MAX_INBOUND_MESSAGE_SIZE = 4 * 1024 * 1024; // 4MB + private static final int DEFAULT_CONNECTION_TIMEOUT_SECONDS = 30; + private static final int DEFAULT_KEEP_ALIVE_TIME_SECONDS = 30; + private static final int DEFAULT_KEEP_ALIVE_TIMEOUT_SECONDS = 5; + + private final String host; + private final int port; + private final boolean tlsEnabled; + private final int maxInboundMessageSize; + private final int connectionTimeoutSeconds; + private final int keepAliveTimeSeconds; + private final int keepAliveTimeoutSeconds; + + private TikaGrpcClientConfig(Builder builder) { + this.host = builder.host; + this.port = builder.port; + this.tlsEnabled = builder.tlsEnabled; + this.maxInboundMessageSize = builder.maxInboundMessageSize; + this.connectionTimeoutSeconds = builder.connectionTimeoutSeconds; + this.keepAliveTimeSeconds = builder.keepAliveTimeSeconds; + this.keepAliveTimeoutSeconds = builder.keepAliveTimeoutSeconds; + } + + /** + * Creates a default configuration (localhost:9090, no TLS). 
+ * + * @return default configuration + */ + public static TikaGrpcClientConfig createDefault() { + return new Builder().build(); + } + + /** + * Creates a new builder for TikaGrpcClientConfig. + * + * @return a new builder instance + */ + public static Builder builder() { + return new Builder(); + } + + // Getters + + public String getHost() { + return host; + } + + public int getPort() { + return port; + } + + public boolean isTlsEnabled() { + return tlsEnabled; + } + + public int getMaxInboundMessageSize() { + return maxInboundMessageSize; + } + + public int getConnectionTimeoutSeconds() { + return connectionTimeoutSeconds; + } + + public int getKeepAliveTimeSeconds() { + return keepAliveTimeSeconds; + } + + public int getKeepAliveTimeoutSeconds() { + return keepAliveTimeoutSeconds; + } + + @Override + public String toString() { + return "TikaGrpcClientConfig{" + + "host='" + host + '\'' + + ", port=" + port + + ", tlsEnabled=" + tlsEnabled + + ", maxInboundMessageSize=" + maxInboundMessageSize + + ", connectionTimeoutSeconds=" + connectionTimeoutSeconds + + ", keepAliveTimeSeconds=" + keepAliveTimeSeconds + + ", keepAliveTimeoutSeconds=" + keepAliveTimeoutSeconds + + '}'; + } + + /** + * Builder for TikaGrpcClientConfig. + */ + public static class Builder { + private String host = DEFAULT_HOST; + private int port = DEFAULT_PORT; + private boolean tlsEnabled = DEFAULT_TLS_ENABLED; + private int maxInboundMessageSize = DEFAULT_MAX_INBOUND_MESSAGE_SIZE; + private int connectionTimeoutSeconds = DEFAULT_CONNECTION_TIMEOUT_SECONDS; + private int keepAliveTimeSeconds = DEFAULT_KEEP_ALIVE_TIME_SECONDS; + private int keepAliveTimeoutSeconds = DEFAULT_KEEP_ALIVE_TIMEOUT_SECONDS; + + private Builder() {} + + /** + * Sets the host name or IP address of the Tika gRPC server. 
+ * + * @param host the server host (default: "localhost") + * @return this builder + */ + public Builder host(String host) { + if (host == null || host.trim().isEmpty()) { + throw new IllegalArgumentException("Host cannot be null or empty"); + } + this.host = host.trim(); + return this; + } + + /** + * Sets the port of the Tika gRPC server. + * + * @param port the server port (default: 9090) + * @return this builder + */ + public Builder port(int port) { + if (port <= 0 || port > 65535) { + throw new IllegalArgumentException("Port must be between 1 and 65535"); + } + this.port = port; + return this; + } + + /** + * Enables or disables TLS encryption. + * + * @param tlsEnabled whether to use TLS (default: false) + * @return this builder + */ + public Builder tlsEnabled(boolean tlsEnabled) { + this.tlsEnabled = tlsEnabled; + return this; + } + + /** + * Sets the maximum inbound message size. + * + * @param maxInboundMessageSize the maximum message size in bytes (default: 4MB) + * @return this builder + */ + public Builder maxInboundMessageSize(int maxInboundMessageSize) { + if (maxInboundMessageSize <= 0) { + throw new IllegalArgumentException("Max inbound message size must be positive"); + } + this.maxInboundMessageSize = maxInboundMessageSize; + return this; + } + + /** + * Sets the connection timeout. + * + * @param connectionTimeoutSeconds the connection timeout in seconds (default: 30) + * @return this builder + */ + public Builder connectionTimeoutSeconds(int connectionTimeoutSeconds) { + if (connectionTimeoutSeconds <= 0) { + throw new IllegalArgumentException("Connection timeout must be positive"); + } + this.connectionTimeoutSeconds = connectionTimeoutSeconds; + return this; + } + + /** + * Sets the keep-alive time. 
+ * + * @param keepAliveTimeSeconds the keep-alive time in seconds (default: 30) + * @return this builder + */ + public Builder keepAliveTimeSeconds(int keepAliveTimeSeconds) { + if (keepAliveTimeSeconds <= 0) { + throw new IllegalArgumentException("Keep-alive time must be positive"); + } + this.keepAliveTimeSeconds = keepAliveTimeSeconds; + return this; + } + + /** + * Sets the keep-alive timeout. + * + * @param keepAliveTimeoutSeconds the keep-alive timeout in seconds (default: 5) + * @return this builder + */ + public Builder keepAliveTimeoutSeconds(int keepAliveTimeoutSeconds) { + if (keepAliveTimeoutSeconds <= 0) { + throw new IllegalArgumentException("Keep-alive timeout must be positive"); + } + this.keepAliveTimeoutSeconds = keepAliveTimeoutSeconds; + return this; + } + + /** + * Builds the configuration object. + * + * @return a new TikaGrpcClientConfig instance + */ + public TikaGrpcClientConfig build() { + return new TikaGrpcClientConfig(this); + } + } +} diff --git a/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/exception/TikaGrpcClientException.java b/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/exception/TikaGrpcClientException.java new file mode 100644 index 000000000..f0de276f9 --- /dev/null +++ b/tika-server/tika-client-grpc/src/main/java/org/apache/tika/grpc/client/exception/TikaGrpcClientException.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.grpc.client.exception; + +/** + * Exception thrown by TikaGrpcClient operations. + * + * This exception wraps all errors that can occur during gRPC communication with the Tika server, + * including network errors, server errors, and client configuration issues. + */ +public class TikaGrpcClientException extends Exception { + + private static final long serialVersionUID = 1L; + + /** + * Constructs a new TikaGrpcClientException with the specified message. + * + * @param message the error message + */ + public TikaGrpcClientException(String message) { + super(message); + } + + /** + * Constructs a new TikaGrpcClientException with the specified message and cause. + * + * @param message the error message + * @param cause the underlying cause + */ + public TikaGrpcClientException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Constructs a new TikaGrpcClientException with the specified cause. 
+ * + * @param cause the underlying cause + */ + public TikaGrpcClientException(Throwable cause) { + super(cause); + } +} \ No newline at end of file diff --git a/tika-server/tika-client-grpc/src/test/java/org/apache/tika/grpc/client/TikaGrpcClientTest.java b/tika-server/tika-client-grpc/src/test/java/org/apache/tika/grpc/client/TikaGrpcClientTest.java new file mode 100644 index 000000000..c776b1ea7 --- /dev/null +++ b/tika-server/tika-client-grpc/src/test/java/org/apache/tika/grpc/client/TikaGrpcClientTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.grpc.client; + +import org.apache.tika.grpc.client.config.TikaGrpcClientConfig; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for TikaGrpcClient configuration and basic functionality. 
+ */ +public class TikaGrpcClientTest { + + @Test + public void testDefaultConfig() { + TikaGrpcClientConfig config = TikaGrpcClientConfig.createDefault(); + + assertEquals("localhost", config.getHost()); + assertEquals(9090, config.getPort()); + assertFalse(config.isTlsEnabled()); + assertEquals(4 * 1024 * 1024, config.getMaxInboundMessageSize()); + assertEquals(30, config.getConnectionTimeoutSeconds()); + } + + @Test + public void testCustomConfig() { + TikaGrpcClientConfig config = TikaGrpcClientConfig.builder() + .host("example.com") + .port(8080) + .tlsEnabled(true) + .maxInboundMessageSize(8 * 1024 * 1024) + .connectionTimeoutSeconds(60) + .build(); + + assertEquals("example.com", config.getHost()); + assertEquals(8080, config.getPort()); + assertTrue(config.isTlsEnabled()); + assertEquals(8 * 1024 * 1024, config.getMaxInboundMessageSize()); + assertEquals(60, config.getConnectionTimeoutSeconds()); + } + + @Test + public void testConfigValidation() { + // Test invalid host + assertThrows(IllegalArgumentException.class, () -> { + TikaGrpcClientConfig.builder().host(null); + }); + + assertThrows(IllegalArgumentException.class, () -> { + TikaGrpcClientConfig.builder().host(""); + }); + + // Test invalid port + assertThrows(IllegalArgumentException.class, () -> { + TikaGrpcClientConfig.builder().port(0); + }); + + assertThrows(IllegalArgumentException.class, () -> { + TikaGrpcClientConfig.builder().port(65536); + }); + + // Test invalid message size + assertThrows(IllegalArgumentException.class, () -> { + TikaGrpcClientConfig.builder().maxInboundMessageSize(-1); + }); + + // Test invalid timeout + assertThrows(IllegalArgumentException.class, () -> { + TikaGrpcClientConfig.builder().connectionTimeoutSeconds(-1); + }); + } + + @Test + public void testClientCreation() { + // Test that client can be created (though it won't connect without a server) + TikaGrpcClientConfig config = TikaGrpcClientConfig.createDefault(); + + assertDoesNotThrow(() -> { + try 
(TikaGrpcClient client = new TikaGrpcClient(config)) { + assertNotNull(client); + assertNotNull(client.getConfig()); + assertEquals("localhost", client.getConfig().getHost()); + assertEquals(9090, client.getConfig().getPort()); + } + }); + } + + @Test + public void testCreateDefaultClient() { + assertDoesNotThrow(() -> { + try (TikaGrpcClient client = TikaGrpcClient.createDefault()) { + assertNotNull(client); + assertEquals("localhost", client.getConfig().getHost()); + assertEquals(9090, client.getConfig().getPort()); + } + }); + } +} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/ParseContextConfig.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/ParseContextConfig.java new file mode 100644 index 000000000..cb5d75ba1 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/ParseContextConfig.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.server.config; + +import jakarta.ws.rs.core.MultivaluedMap; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +/** + * Implementations must be thread-safe! + * <p> + * This class translates http headers into objects/configurations set + * via the ParseContext + */ +public interface ParseContextConfig { + + /** + * Configures the parseContext with present headers. + * + * @param headers the headers. + * @param metadata the metadata. + * @param context the parse context to configure. 
+ */ + void configure(MultivaluedMap<String, String> headers, Metadata metadata, ParseContext context); +} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java index d388855ff..bc70fc0f4 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java @@ -21,6 +21,8 @@ package org.apache.tika.server.config; import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -28,9 +30,17 @@ import org.springframework.core.env.Environment; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DigestingParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.digestutils.BouncyCastleDigester; +import org.apache.tika.parser.digestutils.CommonsDigester; @Configuration public class TikaConfigLoader { + private static final Logger LOG = LoggerFactory.getLogger(TikaConfigLoader.class); + private static final int DEFAULT_DIGEST_MARK_LIMIT = 20 * 1024 * 1024; + private final Environment environment; @Autowired @@ -50,4 +60,52 @@ public class TikaConfigLoader { } return TikaConfig.getDefaultConfig(); } + + @Bean + public DigestingParser.Digester digester() { + String digestConfig = environment.getProperty("tika.server.digest", ""); + + if (StringUtils.isBlank(digestConfig)) { + LOG.info("No digest configuration found, digester will not be enabled"); + return null; + } + + int digestMarkLimit = 
environment.getProperty("tika.server.digestMarkLimit", + Integer.class, DEFAULT_DIGEST_MARK_LIMIT); + + try { + // Try CommonsDigester first + return new CommonsDigester(digestMarkLimit, digestConfig); + } catch (IllegalArgumentException commonsException) { + try { + // Fall back to BouncyCastleDigester + return new BouncyCastleDigester(digestMarkLimit, digestConfig); + } catch (IllegalArgumentException bcException) { + throw new IllegalArgumentException( + "Tried both CommonsDigester (" + commonsException.getMessage() + + ") and BouncyCastleDigester (" + bcException.getMessage() + ")", + bcException); + } + } + } + + @Bean + public Parser parser() throws TikaException { + TikaConfig tikaConfig = tikaConfig(); + Parser parser = new AutoDetectParser(tikaConfig); + + DigestingParser.Digester digester = digester(); + if (digester != null) { + boolean skipContainer = false; + if (tikaConfig.getAutoDetectParserConfig().getDigesterFactory() != null && + tikaConfig.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument()) { + skipContainer = true; + } + LOG.info("Wrapping parser with DigestingParser, skipContainer: {}", skipContainer); + return new DigestingParser(parser, digester, skipContainer); + } + + LOG.info("Using AutoDetectParser without digester"); + return parser; + } } diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java index b4f62d083..e03758dd7 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java @@ -18,7 +18,6 @@ package org.apache.tika.server.controller; import java.io.IOException; import java.io.InputStream; -import java.util.Optional; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,7 
+27,6 @@ import org.springframework.core.io.Resource; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; @@ -58,11 +56,6 @@ public class DetectorController implements DetectorResourceApi { this.tikaConfig = tikaConfig; } - @Override - public Optional<NativeWebRequest> getRequest() { - return DetectorResourceApi.super.getRequest(); - } - @Override public ResponseEntity<String> putStream(Resource body, String contentDisposition) { if (body == null) { diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java index 3b931040c..acae7c536 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.Map; -import java.util.Optional; import jakarta.servlet.http.HttpServletRequest; import org.slf4j.Logger; @@ -31,17 +30,25 @@ import org.springframework.http.HttpHeaders; import org.springframework.http.HttpStatus; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; +import org.springframework.util.MultiValueMap; import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.PutMapping; import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RequestHeader; import org.springframework.web.bind.annotation.RestController; -import 
org.springframework.web.context.request.NativeWebRequest; import org.springframework.web.multipart.MultipartFile; +import org.apache.tika.extractor.DocumentSelector; +import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.DigestingParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.server.api.MetadataResourceApi; +import org.apache.tika.server.service.TikaLoggingService; import org.apache.tika.server.service.TikaMetadataFillerService; +import org.apache.tika.server.service.TikaParseContextService; +import org.apache.tika.server.service.TikaParsingService; /** * Controller for metadata extraction services. @@ -55,10 +62,20 @@ public class MetadataController implements MetadataResourceApi { @Autowired private TikaMetadataFillerService tikaMetadataFillerService; - @Override - public Optional<NativeWebRequest> getRequest() { - return MetadataResourceApi.super.getRequest(); - } + @Autowired + private TikaLoggingService tikaLoggingService; + + @Autowired + private TikaParseContextService tikaParseContextService; + + @Autowired + private TikaParsingService tikaParsingService; + + @Autowired + private Parser parser; + + @Autowired(required = false) + private DigestingParser.Digester digester; /* optional: stays null when tika.server.digest is not configured */ @Override public ResponseEntity<Map<String, String>> postDocumentMetaForm(MultipartFile file) { @@ -199,31 +216,29 @@ public class MetadataController implements MetadataResourceApi { * Core metadata parsing method ported from legacy implementation */ protected Metadata parseMetadata(InputStream is, Metadata metadata, Map<String, String> headers) throws IOException { -// final ParseContext context = new ParseContext(); -// Parser parser = TikaResource.createParser(); -// -// MultiValueMap<String, String> multiValueHeaders = new org.springframework.util.LinkedMultiValueMap<>(); -// for (Map.Entry<String, String> entry : headers.entrySet()) { -// 
multiValueHeaders.add(entry.getKey(), entry.getValue()); -// } -// -// tikaMetadataFillerService.fillMetadata(parser, metadata, multiValueHeaders); -// fillParseContext(multiValueHeaders, metadata, context); -// -// // No need to parse embedded docs -// context.set(DocumentSelector.class, metadata1 -> false); -// -// TikaResource.logRequest(LOG, "/meta", metadata); -// TikaResource.parse(parser, LOG, "/meta", is, new LanguageHandler() { -// public void endDocument() { -// String language = getLanguage().getLanguage(); -// if (language != null) { -// metadata.set("language", language); -// } -// } -// }, metadata, context); -// return metadata; - return null; + final ParseContext context = new ParseContext(); + + MultiValueMap<String, String> multiValueHeaders = new org.springframework.util.LinkedMultiValueMap<>(); + for (Map.Entry<String, String> entry : headers.entrySet()) { + multiValueHeaders.add(entry.getKey(), entry.getValue()); + } + + tikaMetadataFillerService.fillMetadata(parser, metadata, multiValueHeaders); + tikaParseContextService.fillParseContext(multiValueHeaders, metadata, context); + + // No need to parse embedded docs + context.set(DocumentSelector.class, metadata1 -> false); + + tikaLoggingService.logRequest(LOG, "/meta", metadata); + tikaParsingService.parse(parser, LOG, "/meta", is, new LanguageHandler() { + public void endDocument() { + String language = getLanguage().getLanguage(); + if (language != null) { + metadata.set("language", language); + } + } + }, metadata, context); + return metadata; } /** diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/RecursiveMetadataAndContentController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/RecursiveMetadataAndContentController.java index e56b2b207..fcbd2c424 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/RecursiveMetadataAndContentController.java +++ 
b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/RecursiveMetadataAndContentController.java @@ -18,12 +18,10 @@ package org.apache.tika.server.controller; import java.util.List; import java.util.Map; -import java.util.Optional; import org.springframework.core.io.Resource; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; import org.springframework.web.multipart.MultipartFile; import org.apache.tika.server.api.RecursiveMetadataAndContentApi; @@ -34,11 +32,6 @@ import org.apache.tika.server.api.RecursiveMetadataAndContentApi; */ @RestController public class RecursiveMetadataAndContentController implements RecursiveMetadataAndContentApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return RecursiveMetadataAndContentApi.super.getRequest(); - } - @Override public ResponseEntity<List<Map<String, String>>> postRmetaForm(MultipartFile file) { return RecursiveMetadataAndContentApi.super.postRmetaForm(file); diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/ServerStatusController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/ServerStatusController.java index d4a657bee..1e2e67189 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/ServerStatusController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/ServerStatusController.java @@ -16,11 +16,8 @@ */ package org.apache.tika.server.controller; -import java.util.Optional; - import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; import org.apache.tika.server.api.ServerStatusApi; import org.apache.tika.server.model.GetStatus200Response; @@ -31,11 +28,6 @@ import 
org.apache.tika.server.model.GetStatus200Response; */ @RestController public class ServerStatusController implements ServerStatusApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return ServerStatusApi.super.getRequest(); - } - @Override public ResponseEntity<GetStatus200Response> getStatus() { return ServerStatusApi.super.getStatus(); diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TikaController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TikaController.java index 94520e131..482dd51c4 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TikaController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TikaController.java @@ -17,12 +17,10 @@ package org.apache.tika.server.controller; import java.util.Map; -import java.util.Optional; import org.springframework.core.io.Resource; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; import org.springframework.web.multipart.MultipartFile; import org.apache.tika.server.api.TikaResourceApi; @@ -33,11 +31,6 @@ import org.apache.tika.server.api.TikaResourceApi; */ @RestController public class TikaController implements TikaResourceApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return TikaResourceApi.super.getRequest(); - } - @Override public ResponseEntity<String> getTika() { return TikaResourceApi.super.getTika(); diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TranslateResourceController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TranslateResourceController.java index 2d9b12c9b..6266d4ba4 100644 --- 
a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TranslateResourceController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/TranslateResourceController.java @@ -16,12 +16,9 @@ */ package org.apache.tika.server.controller; -import java.util.Optional; - import org.springframework.core.io.Resource; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; import org.apache.tika.server.api.TranslateResourceApi; @@ -31,11 +28,6 @@ import org.apache.tika.server.api.TranslateResourceApi; */ @RestController public class TranslateResourceController implements TranslateResourceApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return TranslateResourceApi.super.getRequest(); - } - @Override public ResponseEntity<String> getVersion() { return TranslateResourceApi.super.getVersion(); diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/UnpackResourceController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/UnpackResourceController.java index d927dbd4c..c15cc6934 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/UnpackResourceController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/UnpackResourceController.java @@ -16,12 +16,9 @@ */ package org.apache.tika.server.controller; -import java.util.Optional; - import org.springframework.core.io.Resource; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; import org.apache.tika.server.api.UnpackResourceApi; @@ -32,11 +29,6 @@ import org.apache.tika.server.api.UnpackResourceApi; */ @RestController public class UnpackResourceController 
implements UnpackResourceApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return UnpackResourceApi.super.getRequest(); - } - @Override public ResponseEntity<Resource> putUnpack(Resource body) { return UnpackResourceApi.super.putUnpack(body); diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/XmpMetadataController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/XmpMetadataController.java index d0dfd71ca..a3b4d3fd0 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/XmpMetadataController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/XmpMetadataController.java @@ -20,21 +20,15 @@ package org.apache.tika.server.controller; -import java.util.Optional; - import org.springframework.core.io.Resource; import org.springframework.http.ResponseEntity; -import org.springframework.web.context.request.NativeWebRequest; +import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; import org.apache.tika.server.api.XmpMetadataResourceApi; +@RestController public class XmpMetadataController implements XmpMetadataResourceApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return XmpMetadataResourceApi.super.getRequest(); - } - @Override public ResponseEntity<String> postDocumentXmpmetaForm(MultipartFile file) { return XmpMetadataResourceApi.super.postDocumentXmpmetaForm(file); diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/ParserService.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/exception/TikaServerParseException.java similarity index 50% rename from tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/ParserService.java rename to 
tika-server/tika-server-spring/src/main/java/org/apache/tika/server/exception/TikaServerParseException.java index 01e0d5575..8628d6908 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/ParserService.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/exception/TikaServerParseException.java @@ -17,32 +17,21 @@ * * */ +package org.apache.tika.server.exception; -package org.apache.tika.server.service; - -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DigestingParser; -import org.apache.tika.parser.Parser; - -public class ParserService { +import jakarta.ws.rs.WebApplicationException; +/** + * Simple wrapper exception to be thrown for consistent handling + * of exceptions that can happen during a parse. + */ +public class TikaServerParseException extends WebApplicationException { - @SuppressWarnings("serial") - public static Parser createParser() { - final Parser parser = new AutoDetectParser(TIKA_CONFIG); + public TikaServerParseException(String msg) { + super(msg); + } - if (DIGESTER != null) { - boolean skipContainer = false; - if (TIKA_CONFIG - .getAutoDetectParserConfig() - .getDigesterFactory() != null && TIKA_CONFIG - .getAutoDetectParserConfig() - .getDigesterFactory() - .isSkipContainerDocument()) { - skipContainer = true; - } - return new DigestingParser(parser, DIGESTER, skipContainer); - } - return parser; + public TikaServerParseException(Exception e) { + super(e); } } diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaLoggingService.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaLoggingService.java new file mode 100644 index 000000000..ace663730 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaLoggingService.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.server.service; + +import org.slf4j.Logger; +import org.springframework.stereotype.Service; + +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.Metadata; + +/** + * Service for logging Tika server requests. + * Replaces the static logRequest method from TikaResource. + */ +@Service +public class TikaLoggingService { + + /** + * Logs information about a Tika server request. + * This is equivalent to the static logRequest method from TikaResource. 
+ * + * @param logger the logger to use for logging + * @param endpoint the endpoint being accessed + * @param metadata the metadata object containing request information + */ + public void logRequest(Logger logger, String endpoint, Metadata metadata) { + if (metadata.get(HttpHeaders.CONTENT_TYPE) == null) { + logger.info("{} (autodetecting type)", endpoint); + } else { + logger.info("{} ({})", endpoint, metadata.get(HttpHeaders.CONTENT_TYPE)); + } + } +} \ No newline at end of file diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaParseContextService.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaParseContextService.java new file mode 100644 index 000000000..27b43d5c2 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaParseContextService.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.server.service; + +import java.util.List; + +import jakarta.ws.rs.core.MultivaluedHashMap; +import org.springframework.stereotype.Service; +import org.springframework.util.MultiValueMap; + +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.server.config.ParseContextConfig; + +/** + * Service for configuring ParseContext from HTTP headers. + * Replaces the static fillParseContext method from TikaResource. + */ +@Service +public class TikaParseContextService { + private final List<ParseContextConfig> parseContextConfigs; + + public TikaParseContextService() { + // Load all available ParseContextConfig implementations using Tika's ServiceLoader + this.parseContextConfigs = new ServiceLoader(getClass().getClassLoader()) + .loadServiceProviders(ParseContextConfig.class); + } + + /** + * Fills the ParseContext based on HTTP headers and metadata. + * This is equivalent to the static fillParseContext method from TikaResource. 
+ * + * @param httpHeaders the HTTP headers from the request + * @param metadata the metadata object + * @param parseContext the parse context to configure + */ + public void fillParseContext(MultiValueMap<String, String> httpHeaders, Metadata metadata, ParseContext parseContext) { + jakarta.ws.rs.core.MultivaluedMap<String, String> jakartaHeaders = new MultivaluedHashMap<>(); + for (String key : httpHeaders.keySet()) { + List<String> values = httpHeaders.get(key); + if (values != null) { + for (String value : values) { + jakartaHeaders.add(key, value); + } + } + } + for (ParseContextConfig config : parseContextConfigs) { + config.configure(jakartaHeaders, metadata, parseContext); + } + } +} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaParsingService.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaParsingService.java new file mode 100644 index 000000000..00180b524 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaParsingService.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.server.service; + +import java.io.IOException; +import java.io.InputStream; + +import org.slf4j.Logger; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.env.Environment; +import org.springframework.http.HttpStatus; +import org.springframework.stereotype.Service; +import org.springframework.web.server.ResponseStatusException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.TikaTaskTimeout; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.WriteLimitReachedException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.pipes.core.exception.TikaServerParseException; + +/** + * Service for parsing documents using Tika. + * Replaces the static parse method from TikaResource with Spring Boot adaptation. + * + * Note: This simplified version doesn't include the full ServerStatus task management + * that was present in the original TikaResource.parse method, as that was designed + * for the fork-mode server architecture. In Spring Boot, we rely on the container's + * request handling and timeout mechanisms. + */ +@Service +public class TikaParsingService { + + private static final long DEFAULT_TASK_TIMEOUT_MILLIS = 300000; // 5 minutes + private static final long DEFAULT_MINIMUM_TIMEOUT_MILLIS = 30000; // 30 seconds + + private final Environment environment; + private final boolean isOperating; + + @Autowired + public TikaParsingService(Environment environment) { + this.environment = environment; + // In Spring Boot mode, we're always operating (no fork mode complexity) + this.isOperating = true; + } + + /** + * Parses a document using the specified parser and handler. 
+ * This is equivalent to the static parse method from TikaResource, adapted for Spring Boot. + * + * @param parser the parser to use + * @param logger the logger to use for logging errors + * @param path the file path (for logging purposes) + * @param inputStream the input stream to parse (will be closed by this method) + * @param handler the content handler to receive parsing events + * @param metadata the metadata object + * @param parseContext the parse context + * @throws IOException if an error occurs during parsing + */ + public void parse(Parser parser, Logger logger, String path, InputStream inputStream, + ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException { + + checkIsOperating(); + + String fileName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + long timeoutMillis = getTaskTimeout(parseContext); + + // Note: In the original TikaResource, there was complex task tracking with ServerStatus. + // In Spring Boot, we rely on the container's request handling and don't need the same + // level of task management since we're not in fork mode. 
+ + try { + // Validate timeout before parsing + validateTimeout(timeoutMillis); + + parser.parse(inputStream, handler, metadata, parseContext); + + } catch (SAXException e) { + throw new TikaServerParseException(e); + } catch (EncryptedDocumentException e) { + logger.warn("{}: Encrypted document ({})", path, fileName, e); + throw new TikaServerParseException(e); + } catch (Exception e) { + if (!WriteLimitReachedException.isWriteLimitReached(e)) { + logger.warn("{}: Text extraction failed ({})", path, fileName, e); + } + throw new TikaServerParseException(e); + } catch (OutOfMemoryError e) { + logger.error("{}: Out of memory error ({})", path, fileName, e); + // In the original, this would set SERVER_STATUS to ERROR and potentially restart the fork + // In Spring Boot, we just log and rethrow - the container will handle the error + throw e; + } finally { + // Always close the input stream + if (inputStream != null) { + try { + inputStream.close(); + } catch (IOException e) { + logger.warn("Error closing input stream", e); + } + } + } + } + + /** + * Checks if the server is operating. In Spring Boot mode, this is always true. + * In the original TikaResource, this checked ServerStatus for fork mode. + */ + private void checkIsOperating() { + if (!isOperating) { + throw new ResponseStatusException(HttpStatus.SERVICE_UNAVAILABLE, "Server is not operating"); + } + } + + /** + * Gets the task timeout from the parse context or configuration. + * Adapted from TikaResource.getTaskTimeout(). 
+ * + * @param parseContext the parse context + * @return timeout in milliseconds + */ + private long getTaskTimeout(ParseContext parseContext) { + TikaTaskTimeout tikaTaskTimeout = parseContext.get(TikaTaskTimeout.class); + long defaultTimeout = environment.getProperty("tika.server.taskTimeoutMillis", + Long.class, DEFAULT_TASK_TIMEOUT_MILLIS); + + if (tikaTaskTimeout != null) { + long requestedTimeout = tikaTaskTimeout.getTimeoutMillis(); + + if (requestedTimeout > defaultTimeout) { + throw new IllegalArgumentException( + "Can't request a timeout (" + requestedTimeout + "ms) greater than the " + + "taskTimeoutMillis set in the server config (" + defaultTimeout + "ms)"); + } + + long minimumTimeout = environment.getProperty("tika.server.minimumTimeoutMillis", + Long.class, DEFAULT_MINIMUM_TIMEOUT_MILLIS); + + if (requestedTimeout < minimumTimeout) { + throw new ResponseStatusException(HttpStatus.BAD_REQUEST, + "taskTimeoutMillis must be > minimumTimeoutMillis, currently set to (" + + minimumTimeout + "ms)"); + } + + return requestedTimeout; + } + + return defaultTimeout; + } + + /** + * Validates that the timeout is within acceptable bounds. + * + * @param timeoutMillis the timeout to validate + */ + private void validateTimeout(long timeoutMillis) { + long minimumTimeout = environment.getProperty("tika.server.minimumTimeoutMillis", + Long.class, DEFAULT_MINIMUM_TIMEOUT_MILLIS); + + if (timeoutMillis < minimumTimeout) { + throw new ResponseStatusException(HttpStatus.BAD_REQUEST, + "Timeout (" + timeoutMillis + "ms) is less than minimum allowed (" + + minimumTimeout + "ms)"); + } + } +}
