This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4334 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7c1efa31f344b737be4a7f602b3a7941357e5a49 Author: Nicholas DiPiazza <[email protected]> AuthorDate: Wed Oct 1 07:53:58 2025 -0500 TIKA-4334: move grpc out of pipes and into tika-server, implement more controller --- tika-pipes/pom.xml | 1 - tika-server/README.md | 89 --------- tika-server/pom.xml | 1 + .../tika-server-grpc}/README.md | 0 .../tika-server-grpc}/corpa-files/014760.docx | Bin .../tika-server-grpc}/corpa-files/017091.docx | Bin .../tika-server-grpc}/corpa-files/017097.docx | Bin .../tika-server-grpc}/corpa-files/018367.docx | Bin .../tika-server-grpc}/corpa-files/018370.docx | Bin .../tika-server-grpc}/corpa-files/018371.docx | Bin .../tika-server-grpc}/corpa-files/021524.docx | Bin .../corpa-files/file-that-confuses-tika.log | 0 .../tika-server-grpc}/docker-build/Dockerfile | 0 .../tika-server-grpc}/docker-build/docker-build.sh | 0 .../docker-build/start-tika-grpc.sh | 0 .../tika-server-grpc}/pom.xml | 4 +- .../apache/tika/pipes/TikaPipesApplication.java | 0 .../tika/pipes/grpc/TikaGrpcServerCustomizer.java | 0 .../apache/tika/pipes/grpc/TikaGrpcService.java | 0 .../java/org/apache/tika/pipes/job/JobStatus.java | 0 .../tika/pipes/model/FetchAndParseStatus.java | 0 .../parser/TikaPipesApplicationConfiguration.java | 0 .../plugin/ClasspathPluginPropertiesFinder.java | 0 .../tika/pipes/plugin/GrpcPluginManager.java | 7 +- .../org/apache/tika/pipes/plugin/PluginConfig.java | 0 .../apache/tika/pipes/repo/EmitterRepository.java | 0 .../apache/tika/pipes/repo/FetcherRepository.java | 0 .../pipes/repo/IgniteRepositoryConfiguration.java | 0 .../tika/pipes/repo/JobStatusRepository.java | 0 .../tika/pipes/repo/PipeIteratorRepository.java | 0 .../src/main/resources/application.yaml | 0 .../tika/pipes/TikaPipesIntegrationTestBase.java | 0 .../pipes/grpc/TikaGrpcServerEmittersCrudTest.java | 0 .../grpc/TikaGrpcServerFetchAndParseTest.java | 0 .../pipes/grpc/TikaGrpcServerFetchersCrudTest.java | 0 .../pipes/grpc/TikaGrpcServerJsonSchemaTest.java | 0 .../grpc/TikaGrpcServerPipeIteratorsCrudTest.java | 0 .../tika/pipes/grpc/TikaGrpcServerPipeJobTest.java | 0 tika-server/tika-server-spring/pom.xml | 21 +- .../controller/InformationServicesController.java | 73 ------- .../controller/LanguageResourceController.java | 37 +++- .../tika/server/controller/MetadataController.java | 222 ++++++++++++++++++++- .../exception/TikaServerRuntimeException.java | 35 ++++ .../apache/tika/server/service/ParserService.java | 48 +++++ .../server/service/TikaMetadataFillerService.java | 124 ++++++++++++ ...formationServicesControllerIntegrationTest.java | 39 ---- .../LanguageResourceControllerIntegrationTest.java | 105 +++++++++- .../MetadataControllerIntegrationTest.java | 221 +++++++++++++++++++- .../src/test/resources/test-documents/english.txt | 2 + .../src/test/resources/test-documents/french.txt | 2 + .../test-documents/password-protected.doc | Bin 0 -> 22016 bytes .../src/test/resources/test-documents/test.doc | Bin 0 -> 9216 bytes 52 files changed, 798 insertions(+), 233 deletions(-) diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index feb6c2537..f8dcc35ea 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -69,7 +69,6 @@ <module>tika-pipes-fetchers</module> <module>tika-pipes-emitters</module> <module>tika-pipes-proto</module> - <module>tika-pipes-grpc</module> <module>tika-pipes-cli</module> </modules> </project> diff --git a/tika-server/README.md b/tika-server/README.md deleted file mode 100644 index 7ecb77312..000000000 --- a/tika-server/README.md +++ /dev/null @@ -1,89 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -# Apache Tika Server - -https://cwiki.apache.org/confluence/display/TIKA/TikaJAXRS - -Running -------- -``` -$ java -jar tika-server/target/tika-server.jar --help - usage: tikaserver - -?,--help this help message - -h,--host <arg> host name (default = localhost) - -l,--log <arg> request URI log level ('debug' or 'info') - -p,--port <arg> listen port (default = 9998) - -s,--includeStack whether or not to return a stack trace - if there is an exception during 'parse' -``` - -Running via Docker ------------------- -Assuming you have Docker installed, you can use a prebuilt image: - -`docker run -d -p 9998:9998 apache/tika` - -This will load Apache Tika Server and expose its interface on: - -`http://localhost:9998` - -You may also be interested in the https://github.com/apache/tika-docker project -which provides prebuilt Docker images. - -Installing as a Service on Linux ------------------------ -To run as a service on Linux you need to run the `install_tika_service.sh` script. - -Assuming you have the binary distribution like `tika-server-2.0.0-bin.tgz`, -then you can extract the install script via: - -`tar xzf tika-server-2.0.0-bin.tgz --strip-components=2 tika-server-2.0.0-bin/bin/install_tika_service.sh` - -and then run the installation process via: - -`./install_tika_service.sh ./tika-server-2.0.0-bin.tgz` - - -Usage ------ -Usage examples from command line with `curl` utility: - -* Extract plain text: -`curl -T price.xls http://localhost:9998/tika` - -* Extract text with mime-type hint: -`curl -v -H "Content-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document" -T document.docx http://localhost:9998/tika` - -* Get all document attachments as ZIP-file: -`curl -v -T Doc1_ole.doc http://localhost:9998/unpacker > /var/tmp/x.zip` - -* Extract metadata to CSV format: -`curl -T price.xls http://localhost:9998/meta` - -* Detect media type from CSV format using file extension hint: -`curl -X PUT -H "Content-Disposition: attachment; filename=foo.csv" --upload-file foo.csv http://localhost:9998/detect/stream` - - -HTTP Return Codes ------------------ -`200` - Ok -`204` - No content (for example when we are unpacking file without attachments) -`415` - Unknown file type -`422` - Unparsable document of known type (password protected documents and unsupported versions like Biff5 Excel) -`500` - Internal error diff --git a/tika-server/pom.xml b/tika-server/pom.xml index 32593068f..68fcac963 100644 --- a/tika-server/pom.xml +++ b/tika-server/pom.xml @@ -38,6 +38,7 @@ <modelVersion>4.0.0</modelVersion> <modules> <module>tika-server-spring</module> + <module>tika-server-grpc</module> </modules> <parent> diff --git a/tika-pipes/tika-pipes-grpc/README.md b/tika-server/tika-server-grpc/README.md similarity index 100% rename from tika-pipes/tika-pipes-grpc/README.md rename to tika-server/tika-server-grpc/README.md diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/014760.docx b/tika-server/tika-server-grpc/corpa-files/014760.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/014760.docx rename to tika-server/tika-server-grpc/corpa-files/014760.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/017091.docx b/tika-server/tika-server-grpc/corpa-files/017091.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/017091.docx rename to tika-server/tika-server-grpc/corpa-files/017091.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/017097.docx b/tika-server/tika-server-grpc/corpa-files/017097.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/017097.docx rename to tika-server/tika-server-grpc/corpa-files/017097.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/018367.docx b/tika-server/tika-server-grpc/corpa-files/018367.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/018367.docx rename to tika-server/tika-server-grpc/corpa-files/018367.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/018370.docx b/tika-server/tika-server-grpc/corpa-files/018370.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/018370.docx rename to tika-server/tika-server-grpc/corpa-files/018370.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/018371.docx b/tika-server/tika-server-grpc/corpa-files/018371.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/018371.docx rename to tika-server/tika-server-grpc/corpa-files/018371.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/021524.docx b/tika-server/tika-server-grpc/corpa-files/021524.docx similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/021524.docx rename to tika-server/tika-server-grpc/corpa-files/021524.docx diff --git a/tika-pipes/tika-pipes-grpc/corpa-files/file-that-confuses-tika.log b/tika-server/tika-server-grpc/corpa-files/file-that-confuses-tika.log similarity index 100% rename from tika-pipes/tika-pipes-grpc/corpa-files/file-that-confuses-tika.log rename to tika-server/tika-server-grpc/corpa-files/file-that-confuses-tika.log diff --git a/tika-pipes/tika-pipes-grpc/docker-build/Dockerfile b/tika-server/tika-server-grpc/docker-build/Dockerfile similarity index 100% rename from tika-pipes/tika-pipes-grpc/docker-build/Dockerfile rename to tika-server/tika-server-grpc/docker-build/Dockerfile diff --git a/tika-pipes/tika-pipes-grpc/docker-build/docker-build.sh b/tika-server/tika-server-grpc/docker-build/docker-build.sh similarity index 100% rename from tika-pipes/tika-pipes-grpc/docker-build/docker-build.sh rename to tika-server/tika-server-grpc/docker-build/docker-build.sh diff --git a/tika-pipes/tika-pipes-grpc/docker-build/start-tika-grpc.sh b/tika-server/tika-server-grpc/docker-build/start-tika-grpc.sh similarity index 100% rename from tika-pipes/tika-pipes-grpc/docker-build/start-tika-grpc.sh rename to tika-server/tika-server-grpc/docker-build/start-tika-grpc.sh diff --git a/tika-pipes/tika-pipes-grpc/pom.xml b/tika-server/tika-server-grpc/pom.xml similarity index 99% rename from tika-pipes/tika-pipes-grpc/pom.xml rename to tika-server/tika-server-grpc/pom.xml index d2d561af5..1f24b5e59 100644 --- a/tika-pipes/tika-pipes-grpc/pom.xml +++ b/tika-server/tika-server-grpc/pom.xml @@ -4,11 +4,11 @@ <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.apache.tika</groupId> - <artifactId>tika-pipes</artifactId> + <artifactId>tika-server</artifactId> <version>4.0.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> - <artifactId>tika-pipes-grpc</artifactId> + <artifactId>tika-server-grpc</artifactId> <name>Tika Pipes GRPC Server</name> <description>Tika Pipes GRPC Server provides bidirectional streams to and from the Tika parsers for running parsers on files in a platform independent and efficient way using HTTP2. diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/TikaPipesApplication.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/TikaPipesApplication.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/TikaPipesApplication.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/TikaPipesApplication.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerCustomizer.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerCustomizer.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerCustomizer.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerCustomizer.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcService.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcService.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcService.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcService.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/job/JobStatus.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/job/JobStatus.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/job/JobStatus.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/job/JobStatus.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/model/FetchAndParseStatus.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/model/FetchAndParseStatus.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/model/FetchAndParseStatus.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/model/FetchAndParseStatus.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/parser/TikaPipesApplicationConfiguration.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/parser/TikaPipesApplicationConfiguration.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/parser/TikaPipesApplicationConfiguration.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/parser/TikaPipesApplicationConfiguration.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/GrpcPluginManager.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/GrpcPluginManager.java similarity index 93% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/GrpcPluginManager.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/GrpcPluginManager.java index 818c48268..999b1f9f7 100644 --- a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/GrpcPluginManager.java +++ b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/GrpcPluginManager.java @@ -6,7 +6,12 @@ import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang.StringUtils; -import org.pf4j.*; +import org.pf4j.DefaultPluginManager; +import org.pf4j.PluginDescriptor; +import org.pf4j.PluginDescriptorFinder; +import org.pf4j.PluginLoader; +import org.pf4j.PluginRepository; +import org.pf4j.PluginWrapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Component; diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/PluginConfig.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/PluginConfig.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/plugin/PluginConfig.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/plugin/PluginConfig.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/EmitterRepository.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/EmitterRepository.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/EmitterRepository.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/EmitterRepository.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/FetcherRepository.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/FetcherRepository.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/FetcherRepository.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/FetcherRepository.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/IgniteRepositoryConfiguration.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/IgniteRepositoryConfiguration.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/IgniteRepositoryConfiguration.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/IgniteRepositoryConfiguration.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/JobStatusRepository.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/JobStatusRepository.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/JobStatusRepository.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/JobStatusRepository.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/PipeIteratorRepository.java b/tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/PipeIteratorRepository.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/java/org/apache/tika/pipes/repo/PipeIteratorRepository.java rename to tika-server/tika-server-grpc/src/main/java/org/apache/tika/pipes/repo/PipeIteratorRepository.java diff --git a/tika-pipes/tika-pipes-grpc/src/main/resources/application.yaml b/tika-server/tika-server-grpc/src/main/resources/application.yaml similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/main/resources/application.yaml rename to tika-server/tika-server-grpc/src/main/resources/application.yaml diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/TikaPipesIntegrationTestBase.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/TikaPipesIntegrationTestBase.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/TikaPipesIntegrationTestBase.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/TikaPipesIntegrationTestBase.java diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerEmittersCrudTest.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerEmittersCrudTest.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerEmittersCrudTest.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerEmittersCrudTest.java diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchAndParseTest.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchAndParseTest.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchAndParseTest.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchAndParseTest.java diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchersCrudTest.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchersCrudTest.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchersCrudTest.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerFetchersCrudTest.java diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerJsonSchemaTest.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerJsonSchemaTest.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerJsonSchemaTest.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerJsonSchemaTest.java diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeIteratorsCrudTest.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeIteratorsCrudTest.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeIteratorsCrudTest.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeIteratorsCrudTest.java diff --git a/tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeJobTest.java b/tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeJobTest.java similarity index 100% rename from tika-pipes/tika-pipes-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeJobTest.java rename to tika-server/tika-server-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerPipeJobTest.java diff --git a/tika-server/tika-server-spring/pom.xml b/tika-server/tika-server-spring/pom.xml index 5fbd2c67b..2e4a7bea1 100644 --- a/tika-server/tika-server-spring/pom.xml +++ b/tika-server/tika-server-spring/pom.xml @@ -23,7 +23,7 @@ <parent> <groupId>org.apache.tika</groupId> - <artifactId>tika-parent</artifactId> + <artifactId>tika-server</artifactId> <version>4.0.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> @@ -110,6 +110,12 @@ <groupId>org.apache.tika</groupId> <artifactId>tika-pipes-core</artifactId> <version>${project.parent.version}</version> + <exclusions> + <exclusion> + <groupId>org.jetbrains</groupId> + <artifactId>annotations</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> @@ -125,6 +131,12 @@ <version>${springdoc.version}</version> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-langdetect-optimaize</artifactId> + <version>${project.version}</version> + </dependency> + <!-- WebJars Locator for resolving WebJars resources --> <dependency> <groupId>org.webjars</groupId> @@ -149,6 +161,13 @@ <version>0.2.6</version> </dependency> + <!-- Jakarta JAX-RS API for MultivaluedMap --> + <dependency> + <groupId>jakarta.ws.rs</groupId> + <artifactId>jakarta.ws.rs-api</artifactId> + <version>3.1.0</version> + </dependency> + <!-- Test Dependencies --> <dependency> <groupId>org.springframework.boot</groupId> diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/InformationServicesController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/InformationServicesController.java deleted file mode 100644 index 2816c1f11..000000000 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/InformationServicesController.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.server.controller; - -import java.util.Map; - -import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.RestController; - -import org.apache.tika.server.api.InformationServicesApi; -import org.apache.tika.server.model.DefaultDetector; -import org.apache.tika.server.model.DetailedParsers; -import org.apache.tika.server.model.GetMimetypeDetails200Response; -import org.apache.tika.server.model.Parsers; - -/** - * Controller for utility information services. - * Handles endpoints for available parsers, detectors, mime types, etc. - * Covers Information Services tag. - */ -@RestController -public class InformationServicesController implements InformationServicesApi { - - @Override - public ResponseEntity<String> getEndpoints() { - // TODO: Implement endpoint listing - return InformationServicesApi.super.getEndpoints(); - } - - @Override - public ResponseEntity<DefaultDetector> getDetectors() { - // TODO: Implement detector information - return InformationServicesApi.super.getDetectors(); - } - - @Override - public ResponseEntity<Map<String, Object>> getMimetypes() { - // TODO: Implement mime types listing - return InformationServicesApi.super.getMimetypes(); - } - - @Override - public ResponseEntity<GetMimetypeDetails200Response> getMimetypeDetails(String type, String subtype) { - // TODO: Implement specific mime type details - return InformationServicesApi.super.getMimetypeDetails(type, subtype); - } - - @Override - public ResponseEntity<Parsers> getParsers() { - // TODO: Implement parsers listing - return InformationServicesApi.super.getParsers(); - } - - @Override - public ResponseEntity<DetailedParsers> getParsersDetails() { - // TODO: Implement detailed parsers information - return InformationServicesApi.super.getParsersDetails(); - } -} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/LanguageResourceController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/LanguageResourceController.java index ed0bc10da..09699fadf 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/LanguageResourceController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/LanguageResourceController.java @@ -16,14 +16,20 @@ */ package org.apache.tika.server.controller; -import java.util.Optional; +import static java.nio.charset.StandardCharsets.UTF_8; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; import org.springframework.core.io.Resource; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.context.request.NativeWebRequest; +import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.server.api.LanguageResourceApi; +import org.apache.tika.server.exception.TikaServerRuntimeException; /** * Controller for language identification services. @@ -31,28 +37,37 @@ import org.apache.tika.server.api.LanguageResourceApi; */ @RestController public class LanguageResourceController implements LanguageResourceApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return LanguageResourceApi.super.getRequest(); - } - @Override public ResponseEntity<String> postLanguageStream(Resource body) { - return LanguageResourceApi.super.postLanguageStream(body); + try { + InputStream is = body.getInputStream(); + String fileTxt = IOUtils.toString(is, UTF_8); + LanguageResult language = new OptimaizeLangDetector() + .loadModels() + .detect(fileTxt); + String detectedLang = language.getLanguage(); + return ResponseEntity.ok(detectedLang); + } catch (IOException e) { + throw new TikaServerRuntimeException(e); + } } @Override public ResponseEntity<String> postLanguageString(String body) { - return LanguageResourceApi.super.postLanguageString(body); + LanguageResult language = new OptimaizeLangDetector() + .loadModels() + .detect(body); + String detectedLang = language.getLanguage(); + return ResponseEntity.ok(detectedLang); } @Override public ResponseEntity<String> putLanguageStream(Resource body) { - return LanguageResourceApi.super.putLanguageStream(body); + return postLanguageStream(body); } @Override public ResponseEntity<String> putLanguageString(String body) { - return LanguageResourceApi.super.putLanguageString(body); + return postLanguageString(body); } } diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java index ad0d5fbab..3b931040c 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/MetadataController.java @@ -16,23 +16,45 @@ */ package org.apache.tika.server.controller; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; import java.util.Map; import java.util.Optional; +import jakarta.servlet.http.HttpServletRequest; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.Resource; +import org.springframework.http.HttpHeaders; +import org.springframework.http.HttpStatus; +import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PathVariable; +import org.springframework.web.bind.annotation.PutMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestHeader; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.context.request.NativeWebRequest; import org.springframework.web.multipart.MultipartFile; +import org.apache.tika.metadata.Metadata; import org.apache.tika.server.api.MetadataResourceApi; +import org.apache.tika.server.service.TikaMetadataFillerService; /** * Controller for metadata extraction services. * Handles the /meta and /xmpmeta endpoints for Metadata Resource and XMP Metadata Resource tags. + * Ported from the legacy JAX-RS MetadataResource implementation. */ @RestController public class MetadataController implements MetadataResourceApi { + private static final Logger LOG = LoggerFactory.getLogger(MetadataController.class); + + @Autowired + private TikaMetadataFillerService tikaMetadataFillerService; + @Override public Optional<NativeWebRequest> getRequest() { return MetadataResourceApi.super.getRequest(); @@ -40,16 +62,210 @@ public class MetadataController implements MetadataResourceApi { @Override public ResponseEntity<Map<String, String>> postDocumentMetaForm(MultipartFile file) { - return MetadataResourceApi.super.postDocumentMetaForm(file); + try { + Metadata metadata = new Metadata(); + Map<String, String> headers = new HashMap<>(); + // Convert MultipartFile headers if available + if (file.getOriginalFilename() != null) { + headers.put("Content-Disposition", "filename=" + file.getOriginalFilename()); + } + if (file.getContentType() != null) { + headers.put("Content-Type", file.getContentType()); + } + + Metadata result = parseMetadata(file.getInputStream(), metadata, headers); + return ResponseEntity.ok(metadataToMap(result)); + } catch (Exception e) { + LOG.error("Failed to process multipart form metadata extraction", e); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); + } } @Override public ResponseEntity<Map<String, String>> putDocumentGetMetaValue(String metadataKey, Resource body) { - return MetadataResourceApi.super.putDocumentGetMetaValue(metadataKey, body); + try { + Metadata metadata = new Metadata(); + Map<String, String> headers = new HashMap<>(); + + boolean success = false; + try { + parseMetadata(body.getInputStream(), metadata, headers); + success = true; + } catch (Exception e) { + LOG.info("Failed to process field {}", metadataKey, e); + } + + if (!success || metadata.get(metadataKey) == null) { + return ResponseEntity.status(success ? HttpStatus.NOT_FOUND : HttpStatus.BAD_REQUEST).build(); + } + + // Remove fields we don't care about for the response + Metadata filteredMetadata = new Metadata(); + String[] values = metadata.getValues(metadataKey); + for (String value : values) { + filteredMetadata.add(metadataKey, value); + } + + return ResponseEntity.ok(metadataToMap(filteredMetadata)); + } catch (Exception e) { + LOG.error("Failed to extract metadata field: " + metadataKey, e); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); + } } @Override public ResponseEntity<Map<String, String>> putDocumentMeta(Resource body) { - return MetadataResourceApi.super.putDocumentMeta(body); + try { + Metadata metadata = new Metadata(); + Map<String, String> headers = new HashMap<>(); + + Metadata result = parseMetadata(body.getInputStream(), metadata, headers); + return ResponseEntity.ok(metadataToMap(result)); + } catch (Exception e) { + LOG.error("Failed to extract metadata", e); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); + } + } + + /** + * Additional Spring Boot endpoints to handle the legacy API patterns + */ + @PutMapping(value = "/meta", produces = {MediaType.APPLICATION_JSON_VALUE, MediaType.TEXT_PLAIN_VALUE}) + public ResponseEntity<?> getMetadata(@RequestBody Resource body, + @RequestHeader HttpHeaders httpHeaders, + @RequestHeader(value = "Accept", defaultValue = MediaType.APPLICATION_JSON_VALUE) String accept, + HttpServletRequest request) throws Exception { + Metadata metadata = new Metadata(); + Map<String, String> headerMap = convertHeaders(httpHeaders); + + Metadata result = parseMetadata(body.getInputStream(), metadata, headerMap); + + if (accept.contains(MediaType.TEXT_PLAIN_VALUE)) { + return ResponseEntity.ok() + .contentType(MediaType.TEXT_PLAIN) + .body(metadataToText(result)); + } else { + return ResponseEntity.ok() + .contentType(MediaType.APPLICATION_JSON) + .body(metadataToMap(result)); + } + } + + @PutMapping(value = "/meta/{field}", produces = {MediaType.APPLICATION_JSON_VALUE, MediaType.TEXT_PLAIN_VALUE}) + public ResponseEntity<?> getMetadataField(@RequestBody Resource body, + @PathVariable String field, + @RequestHeader HttpHeaders httpHeaders, + @RequestHeader(value = "Accept", defaultValue = MediaType.APPLICATION_JSON_VALUE) String accept, + HttpServletRequest request) throws Exception { + + HttpStatus defaultErrorResponse = HttpStatus.BAD_REQUEST; + Metadata metadata = new Metadata(); + Map<String, String> headerMap = convertHeaders(httpHeaders); + + boolean success = false; + try { + parseMetadata(body.getInputStream(), metadata, headerMap); + defaultErrorResponse = HttpStatus.NOT_FOUND; + success = true; + } catch (Exception e) { + LOG.info("Failed to process field {}", field, e); + } + + if (!success || metadata.get(field) == null) { + return ResponseEntity.status(defaultErrorResponse) + .body("Failed to get metadata field " + field); + } + + // Remove fields we don't care about for the response + Metadata filteredMetadata = new Metadata(); + String[] values = metadata.getValues(field); + for (String value : values) { + filteredMetadata.add(field, value); + } + + if (accept.contains(MediaType.TEXT_PLAIN_VALUE)) { + String value = filteredMetadata.get(field); + return ResponseEntity.ok() + .contentType(MediaType.TEXT_PLAIN) + .body(value != null ? value : ""); + } else { + return ResponseEntity.ok() + .contentType(MediaType.APPLICATION_JSON) + .body(metadataToMap(filteredMetadata)); + } + } + + /** + * Core metadata parsing method ported from legacy implementation + */ + protected Metadata parseMetadata(InputStream is, Metadata metadata, Map<String, String> headers) throws IOException { +// final ParseContext context = new ParseContext(); +// Parser parser = TikaResource.createParser(); +// +// MultiValueMap<String, String> multiValueHeaders = new org.springframework.util.LinkedMultiValueMap<>(); +// for (Map.Entry<String, String> entry : headers.entrySet()) { +// multiValueHeaders.add(entry.getKey(), entry.getValue()); +// } +// +// tikaMetadataFillerService.fillMetadata(parser, metadata, multiValueHeaders); +// fillParseContext(multiValueHeaders, metadata, context); +// +// // No need to parse embedded docs +// context.set(DocumentSelector.class, metadata1 -> false); +// +// TikaResource.logRequest(LOG, "/meta", metadata); +// TikaResource.parse(parser, LOG, "/meta", is, new LanguageHandler() { +// public void endDocument() { +// String language = getLanguage().getLanguage(); +// if (language != null) { +// metadata.set("language", language); +// } +// } +// }, metadata, context); +// return metadata; + return null; + } + + /** + * Convert Metadata to Map for JSON responses + */ + private Map<String, String> metadataToMap(Metadata metadata) { + Map<String, String> map = new HashMap<>(); + for (String name : metadata.names()) { + String[] values = metadata.getValues(name); + if (values.length == 1) { + map.put(name, values[0]); + } else if (values.length > 1) { + map.put(name, String.join(", ", values)); + } + } + return map; + } + + /** + * Convert Metadata to text for plain text responses + */ + private String metadataToText(Metadata metadata) { + StringBuilder sb = new StringBuilder(); + for (String name : metadata.names()) { + String[] values = metadata.getValues(name); + for (String value : values) { + sb.append(name).append(": ").append(value).append("\n"); + } + } + return sb.toString(); + } + + /** + * Convert Spring HttpHeaders to Map + */ + private Map<String, String> convertHeaders(HttpHeaders httpHeaders) { + Map<String, String> headerMap = new HashMap<>(); + for (Map.Entry<String, java.util.List<String>> entry : httpHeaders.entrySet()) { + if (!entry.getValue().isEmpty()) { + headerMap.put(entry.getKey(), entry.getValue().get(0)); + } + } + return headerMap; } } diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/exception/TikaServerRuntimeException.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/exception/TikaServerRuntimeException.java new file mode 100644 index 000000000..b0335f556 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/exception/TikaServerRuntimeException.java @@ -0,0 +1,35 @@ +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one or more + * * contributor license agreements. See the NOTICE file distributed with + * * this work for additional information regarding copyright ownership. + * * The ASF licenses this file to You under the Apache License, Version 2.0 + * * (the "License"); you may not use this file except in compliance with + * * the License. You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. + * + * + */ + +package org.apache.tika.server.exception; + +public class TikaServerRuntimeException extends RuntimeException { + public TikaServerRuntimeException(String message) { + super(message); + } + + public TikaServerRuntimeException(String message, Throwable cause) { + super(message, cause); + } + + public TikaServerRuntimeException(Throwable cause) { + super(cause); + } +} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/ParserService.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/ParserService.java new file mode 100644 index 000000000..01e0d5575 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/ParserService.java @@ -0,0 +1,48 @@ +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one or more + * * contributor license agreements. See the NOTICE file distributed with + * * this work for additional information regarding copyright ownership. + * * The ASF licenses this file to You under the Apache License, Version 2.0 + * * (the "License"); you may not use this file except in compliance with + * * the License. You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. + * + * + */ + +package org.apache.tika.server.service; + +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DigestingParser; +import org.apache.tika.parser.Parser; + +public class ParserService { + + + @SuppressWarnings("serial") + public static Parser createParser() { + final Parser parser = new AutoDetectParser(TIKA_CONFIG); + + if (DIGESTER != null) { + boolean skipContainer = false; + if (TIKA_CONFIG + .getAutoDetectParserConfig() + .getDigesterFactory() != null && TIKA_CONFIG + .getAutoDetectParserConfig() + .getDigesterFactory() + .isSkipContainerDocument()) { + skipContainer = true; + } + return new DigestingParser(parser, DIGESTER, skipContainer); + } + return parser; + } +} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaMetadataFillerService.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaMetadataFillerService.java new file mode 100644 index 000000000..5f543d732 --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/service/TikaMetadataFillerService.java @@ -0,0 +1,124 @@ +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one or more + * * contributor license agreements. See the NOTICE file distributed with + * * this work for additional information regarding copyright ownership. + * * The ASF licenses this file to You under the Apache License, Version 2.0 + * * (the "License"); you may not use this file except in compliance with + * * the License. You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. + * + * + */ + +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one or more + * * contributor license agreements. See the NOTICE file distributed with + * * this work for additional information regarding copyright ownership. + * * The ASF licenses this file to You under the Apache License, Version 2.0 + * * (the "License"); you may not use this file except in compliance with + * * the License. You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. + * + * + */ + +package org.apache.tika.server.service; + +import java.util.List; +import java.util.Map; + +import org.springframework.http.HttpHeaders; +import org.springframework.stereotype.Component; +import org.springframework.util.MultiValueMap; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.Parser; + +@Component +public class TikaMetadataFillerService { + private static final String META_PREFIX = "X-Tika-Meta-"; + + @SuppressWarnings("serial") + public void fillMetadata(Parser parser, Metadata metadata, MultiValueMap<String, String> httpHeaders) { + String fileName = detectFilename(httpHeaders); + if (fileName != null) { + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName); + } + + String contentTypeHeader = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE); + jakarta.ws.rs.core.MediaType mediaType = (contentTypeHeader == null || "*/*".equals(contentTypeHeader)) ? null : jakarta.ws.rs.core.MediaType.valueOf(contentTypeHeader); + if (mediaType != null && "xml".equals(mediaType.getSubtype())) { + mediaType = null; + } + + if (mediaType != null && mediaType.equals(jakarta.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) { + mediaType = null; + } + + if (mediaType != null) { + metadata.add(Metadata.CONTENT_TYPE, mediaType.toString()); + metadata.add(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE, mediaType.toString()); + } + + if (httpHeaders.containsKey("Content-Length")) { + metadata.set(Metadata.CONTENT_LENGTH, httpHeaders.getFirst("Content-Length")); + } + + for (Map.Entry<String, List<String>> e : httpHeaders.entrySet()) { + if (e + .getKey() + .startsWith(META_PREFIX)) { + String tikaKey = e + .getKey() + .substring(META_PREFIX.length()); + for (String value : e.getValue()) { + metadata.add(tikaKey, value); + } + } + } + } + + private String detectFilename(MultiValueMap<String, String> httpHeaders) { + String disposition = httpHeaders.getFirst("Content-Disposition"); + if (disposition != null) { + // Parse filename from Content-Disposition header + String[] parts = disposition.split(";"); + for (String part : parts) { + part = part.trim(); + if (part.startsWith("filename=")) { + String filename = part.substring("filename=".length()); + // Remove surrounding quotes if present + if (filename.startsWith("\"") && filename.endsWith("\"")) { + filename = filename.substring(1, filename.length() - 1); + } + return filename; + } + } + } + + // Check for X-Tika-OCRTesseractPath or similar headers that might contain filename + String resourceName = httpHeaders.getFirst(TikaCoreProperties.RESOURCE_NAME_KEY.getName()); + if (resourceName != null) { + return resourceName; + } + + return null; + } +} diff --git a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/InformationServicesControllerIntegrationTest.java b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/InformationServicesControllerIntegrationTest.java deleted file mode 100644 index af5b0f94f..000000000 --- a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/InformationServicesControllerIntegrationTest.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.server.controller; - -import org.junit.jupiter.api.Test; - -import org.apache.tika.server.IntegrationTestBase; - -/** - * Integration tests for InformationServicesController. - * Tests utility information service endpoints. - */ -public class InformationServicesControllerIntegrationTest extends IntegrationTestBase { - - @Test - public void testPlaceholder() { - // TODO: Implement integration tests for InformationServicesController - // - Test GET / for endpoint listing - // - Test GET /detectors for detector information - // - Test GET /mime-types for mime type listing - // - Test GET /mime-types/{type}/{subtype} for specific mime type details - // - Test GET /parsers for parser listing - // - Test GET /parsers/details for detailed parser information - } -} diff --git a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/LanguageResourceControllerIntegrationTest.java b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/LanguageResourceControllerIntegrationTest.java index ae4e68975..5b6bfa807 100644 --- a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/LanguageResourceControllerIntegrationTest.java +++ b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/LanguageResourceControllerIntegrationTest.java @@ -16,21 +16,116 @@ */ package org.apache.tika.server.controller; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.put; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.content; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; + +import java.io.InputStream; + import org.junit.jupiter.api.Test; +import org.springframework.http.MediaType; import org.apache.tika.server.IntegrationTestBase; /** * Integration tests for LanguageResourceController. * Tests language identification service endpoints. + * Ported from the legacy JAX-RS implementation. */ public class LanguageResourceControllerIntegrationTest extends IntegrationTestBase { + private static final String LANG_PATH = "/language"; + private static final String LANG_STREAM_PATH = LANG_PATH + "/stream"; + private static final String LANG_STRING_PATH = LANG_PATH + "/string"; + private static final String ENGLISH_STRING = "This is English!"; + private static final String FRENCH_STRING = "c'est comme ci comme ça"; + + @Test + public void testDetectEnglishString() throws Exception { + mockMvc.perform(put(LANG_STRING_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(ENGLISH_STRING)) + .andExpect(status().isOk()) + .andExpect(content().string("en")); + } + + @Test + public void testDetectFrenchString() throws Exception { + mockMvc.perform(put(LANG_STRING_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(FRENCH_STRING)) + .andExpect(status().isOk()) + .andExpect(content().string("fr")); + } + + @Test + public void testDetectEnglishFile() throws Exception { + InputStream englishStream = getClass().getResourceAsStream("/test-documents/english.txt"); + + mockMvc.perform(put(LANG_STREAM_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(englishStream.readAllBytes())) + .andExpect(status().isOk()) + .andExpect(content().string("en")); + } + + @Test + public void testDetectFrenchFile() throws Exception { + InputStream frenchStream = getClass().getResourceAsStream("/test-documents/french.txt"); + + mockMvc.perform(put(LANG_STREAM_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(frenchStream.readAllBytes())) + .andExpect(status().isOk()) + .andExpect(content().string("fr")); + } + + @Test + public void testDetectEnglishStringPost() throws Exception { + mockMvc.perform(post(LANG_STRING_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(ENGLISH_STRING)) + .andExpect(status().isOk()) + .andExpect(content().string("en")); + } + + @Test + public void testDetectFrenchStringPost() throws Exception { + mockMvc.perform(post(LANG_STRING_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(FRENCH_STRING)) + .andExpect(status().isOk()) + .andExpect(content().string("fr")); + } + @Test - public void testPlaceholder() { - // TODO: Implement integration tests for LanguageResourceController - // - Test POST/PUT /language/stream for UTF-8 text file language identification - // - Test POST/PUT /language/string for text string language identification - // - Test language detection accuracy with various languages + public void testDetectEnglishFilePost() throws Exception { + InputStream englishStream = getClass().getResourceAsStream("/test-documents/english.txt"); + + mockMvc.perform(post(LANG_STREAM_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(englishStream.readAllBytes())) + .andExpect(status().isOk()) + .andExpect(content().string("en")); + } + + @Test + public void testDetectFrenchFilePost() throws Exception { + InputStream frenchStream = getClass().getResourceAsStream("/test-documents/french.txt"); + + mockMvc.perform(post(LANG_STREAM_PATH) + .contentType(MediaType.TEXT_PLAIN) + .accept(MediaType.TEXT_PLAIN) + .content(frenchStream.readAllBytes())) + .andExpect(status().isOk()) + .andExpect(content().string("fr")); } } diff --git a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/MetadataControllerIntegrationTest.java b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/MetadataControllerIntegrationTest.java index bf2354bc5..c1854c046 100644 --- a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/MetadataControllerIntegrationTest.java +++ b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/MetadataControllerIntegrationTest.java @@ -16,24 +16,229 @@ */ package org.apache.tika.server.controller; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.put; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; + import org.junit.jupiter.api.Test; +import org.springframework.http.MediaType; +import org.springframework.test.web.servlet.MvcResult; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.serialization.JsonMetadata; import org.apache.tika.server.IntegrationTestBase; /** * Integration tests for MetadataController. * Tests metadata extraction service endpoints. + * Ported from the legacy JAX-RS implementation and adapted for Spring Boot supported content types. */ public class MetadataControllerIntegrationTest extends IntegrationTestBase { + private static final String META_PATH = "/meta"; + private static final String TEST_DOC = "/test-documents/test.doc"; + private static final String TEST_PASSWORD_PROTECTED = "/test-documents/password-protected.doc"; + + @Test + public void testSimpleWordJSON() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + + MvcResult result = mockMvc.perform(put(META_PATH) + .contentType("application/msword") + .accept("application/json") + .content(docStream.readAllBytes())) + .andExpect(status().isOk()) + .andReturn(); + + String jsonContent = result.getResponse().getContentAsString(); + Reader reader = new InputStreamReader(new ByteArrayInputStream(jsonContent.getBytes(UTF_8)), UTF_8); + + Metadata metadata = JsonMetadata.fromJson(reader); + + // Basic validation - check that we have some metadata + assertNotNull(metadata); + assertTrue(metadata.names().length > 0); + + // Check for common metadata fields that should be present + String contentType = metadata.get("Content-Type"); + assertNotNull(contentType, "Content-Type should be present"); + assertTrue(contentType.contains("application") || contentType.contains("msword"), + "Content type should indicate Word document"); + } + + @Test + public void testSimpleWordText() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + + MvcResult result = mockMvc.perform(put(META_PATH) + .contentType("application/msword") + .accept("text/plain") + .content(docStream.readAllBytes())) + .andExpect(status().isOk()) + .andReturn(); + + String textContent = result.getResponse().getContentAsString(); + assertNotNull(textContent); + assertTrue(textContent.length() > 0, "Should return some text content"); + } + + @Test + public void testPasswordProtected() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_PASSWORD_PROTECTED); + + // Won't work, no password given + mockMvc.perform(put(META_PATH) + .contentType("application/msword") + .accept("application/json") + .content(docStream.readAllBytes())) + .andExpect(status().is5xxServerError()); + + // Try again, this time with the wrong password + docStream = getClass().getResourceAsStream(TEST_PASSWORD_PROTECTED); + mockMvc.perform(put(META_PATH) + .contentType("application/msword") + .accept("application/json") + .header("Password", "wrong password") + .content(docStream.readAllBytes())) + .andExpect(status().is5xxServerError()); + + // Try again, this time with the correct password + docStream = getClass().getResourceAsStream(TEST_PASSWORD_PROTECTED); + MvcResult result = mockMvc.perform(put(META_PATH) + .contentType("application/msword") + .accept("application/json") + .header("Password", "tika") + .content(docStream.readAllBytes())) + .andExpect(status().isOk()) + .andReturn(); + + // Check results + String jsonContent = result.getResponse().getContentAsString(); + Reader reader = new InputStreamReader(new ByteArrayInputStream(jsonContent.getBytes(UTF_8)), UTF_8); + + Metadata metadata = JsonMetadata.fromJson(reader); + assertNotNull(metadata); + assertTrue(metadata.names().length > 0, "Should have extracted metadata with correct password"); + } + + @Test + public void testJSON() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + + MvcResult result = mockMvc.perform(put(META_PATH) + .contentType("application/msword") + .accept("application/json") + .content(docStream.readAllBytes())) + .andExpect(status().isOk()) + .andReturn(); + + String jsonContent = result.getResponse().getContentAsString(); + Reader reader = new InputStreamReader(new ByteArrayInputStream(jsonContent.getBytes(UTF_8)), UTF_8); + + Metadata metadata = JsonMetadata.fromJson(reader); + assertNotNull(metadata); + assertTrue(metadata.names().length > 0, "Should have extracted metadata"); + } + + @Test + public void testGetField_XXX_NotFound() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + + mockMvc.perform(put(META_PATH + "/xxx") + .contentType("application/msword") + .accept(MediaType.APPLICATION_JSON_VALUE) + .content(docStream.readAllBytes())) + .andExpect(status().isNotFound()); + } + @Test - public void testPlaceholder() { - // TODO: Implement integration tests for MetadataController - // - Test PUT /meta for metadata extraction - // - Test PUT /meta/{metadata_key} for specific metadata key extraction - // - Test POST /meta/form for multipart form metadata extraction - // - Test PUT /xmpmeta for XMP metadata extraction in RDF/XML format - // - Test PUT /xmpmeta/{metadata_key} for specific XMP metadata key extraction - // - Test POST /xmpmeta/form for multipart form XMP metadata extraction + public void testGetField_ContentType_JSON_Found() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + + MvcResult result = mockMvc.perform(put(META_PATH + "/Content-Type") + .contentType("application/msword") + .accept(MediaType.APPLICATION_JSON_VALUE) + .content(docStream.readAllBytes())) + .andExpect(status().isOk()) + .andReturn(); + + String jsonContent = result.getResponse().getContentAsString(); + Reader reader = new InputStreamReader(new ByteArrayInputStream(jsonContent.getBytes(UTF_8)), UTF_8); + + Metadata metadata = JsonMetadata.fromJson(reader); + assertNotNull(metadata.get("Content-Type")); + assertEquals(1, metadata.names().length, "Should only return the requested field"); + } + + @Test + public void testGetField_Author_TEXT_Partial_BAD_REQUEST() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + byte[] partialContent = copyBytes(docStream, 8000); + + mockMvc.perform(put(META_PATH + "/Author") + .contentType("application/msword") + .accept(MediaType.TEXT_PLAIN_VALUE) + .content(partialContent)) + .andExpect(status().isBadRequest()); + } + + @Test + public void testGetField_ContentType_TEXT_Partial_Found() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + byte[] partialContent = copyBytes(docStream, 12000); + + MvcResult result = mockMvc.perform(put(META_PATH + "/Content-Type") + .contentType("application/msword") + .accept(MediaType.TEXT_PLAIN_VALUE) + .content(partialContent)) + .andExpect(status().isOk()) + .andReturn(); + + String content = result.getResponse().getContentAsString(); + assertNotNull(content); + assertTrue(content.length() > 0, "Should return content type value"); + } + + @Test + public void testGetField_ContentType_JSON_Partial_Found() throws Exception { + InputStream docStream = getClass().getResourceAsStream(TEST_DOC); + byte[] partialContent = copyBytes(docStream, 12000); + + MvcResult result = mockMvc.perform(put(META_PATH + "/Content-Type") + .contentType("application/msword") + .accept(MediaType.APPLICATION_JSON_VALUE) + .content(partialContent)) + .andExpect(status().isOk()) + .andReturn(); + + String jsonContent = result.getResponse().getContentAsString(); + Reader reader = new InputStreamReader(new ByteArrayInputStream(jsonContent.getBytes(UTF_8)), UTF_8); + + Metadata metadata = JsonMetadata.fromJson(reader); + assertNotNull(metadata.get("Content-Type")); + assertEquals(1, metadata.names().length, "Should only return the requested field"); + } + + /** + * Helper method to copy a specified number of bytes from an InputStream. + * This simulates partial document uploads for testing partial parsing scenarios. + */ + private byte[] copyBytes(InputStream stream, int maxBytes) throws Exception { + byte[] buffer = new byte[maxBytes]; + int bytesRead = stream.read(buffer); + if (bytesRead < maxBytes) { + byte[] result = new byte[bytesRead]; + System.arraycopy(buffer, 0, result, 0, bytesRead); + return result; + } + return buffer; } } diff --git a/tika-server/tika-server-spring/src/test/resources/test-documents/english.txt b/tika-server/tika-server-spring/src/test/resources/test-documents/english.txt new file mode 100644 index 000000000..dca6086c9 --- /dev/null +++ b/tika-server/tika-server-spring/src/test/resources/test-documents/english.txt @@ -0,0 +1,2 @@ +This is a sample English text for language detection testing. The quick brown fox jumps over the lazy dog. Language detection algorithms should be able to identify this text as English based on common words and patterns. This text contains enough content to provide a reliable language detection result. + diff --git a/tika-server/tika-server-spring/src/test/resources/test-documents/french.txt b/tika-server/tika-server-spring/src/test/resources/test-documents/french.txt new file mode 100644 index 000000000..ae68dae23 --- /dev/null +++ b/tika-server/tika-server-spring/src/test/resources/test-documents/french.txt @@ -0,0 +1,2 @@ +Ceci est un exemple de texte français pour tester la détection de langue. Le renard brun et rapide saute par-dessus le chien paresseux. Les algorithmes de détection de langue devraient être capables d'identifier ce texte comme étant du français basé sur les mots communs et les modèles. Ce texte contient suffisamment de contenu pour fournir un résultat fiable de détection de langue. + diff --git a/tika-server/tika-server-spring/src/test/resources/test-documents/password-protected.doc b/tika-server/tika-server-spring/src/test/resources/test-documents/password-protected.doc new file mode 100644 index 000000000..b407783d2 Binary files /dev/null and b/tika-server/tika-server-spring/src/test/resources/test-documents/password-protected.doc differ diff --git a/tika-server/tika-server-spring/src/test/resources/test-documents/test.doc b/tika-server/tika-server-spring/src/test/resources/test-documents/test.doc new file mode 100644 index 000000000..93198c87c Binary files /dev/null and b/tika-server/tika-server-spring/src/test/resources/test-documents/test.doc differ
